jfr.im git - yt-dlp.git/blame_incremental - youtube

Commit	Line	Data
	1	# coding: utf-8
	2
	3	from __future__ import unicode_literals
	4
	5
	6	import itertools
	7	import json
	8	import os.path
	9	import random
	10	import re
	11	import time
	12	import traceback
	13
	14	from .common import InfoExtractor, SearchInfoExtractor
	15	from ..jsinterp import JSInterpreter
	16	from ..swfinterp import SWFInterpreter
	17	from ..compat import (
	18	compat_chr,
	19	compat_kwargs,
	20	compat_parse_qs,
	21	compat_urllib_parse_unquote,
	22	compat_urllib_parse_unquote_plus,
	23	compat_urllib_parse_urlencode,
	24	compat_urllib_parse_urlparse,
	25	compat_urlparse,
	26	compat_str,
	27	)
	28	from ..utils import (
	29	clean_html,
	30	error_to_compat_str,
	31	ExtractorError,
	32	float_or_none,
	33	get_element_by_attribute,
	34	get_element_by_id,
	35	int_or_none,
	36	mimetype2ext,
	37	orderedSet,
	38	parse_codecs,
	39	parse_duration,
	40	qualities,
	41	remove_quotes,
	42	remove_start,
	43	smuggle_url,
	44	str_or_none,
	45	str_to_int,
	46	try_get,
	47	unescapeHTML,
	48	unified_strdate,
	49	unsmuggle_url,
	50	uppercase_escape,
	51	url_or_none,
	52	urlencode_postdata,
	53	)
	54
	55
	56	class YoutubeBaseInfoExtractor(InfoExtractor):
	57	"""Provide base functions for Youtube extractors"""
	58	_LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
	59	_TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'
	60
	61	_LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
	62	_CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
	63	_TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'
	64
	65	_NETRC_MACHINE = 'youtube'
	66	# If True it will raise an error if no login info is provided
	67	_LOGIN_REQUIRED = False
	68
	69	_PLAYLIST_ID_RE = r'(?:PL\|LL\|EC\|UU\|FL\|RD\|UL\|TL\|OLAK5uy_)[0-9A-Za-z-_]{10,}'
	70
	71	def _set_language(self):
	72	self._set_cookie(
	73	'.youtube.com', 'PREF', 'f1=50000000&hl=en',
	74	# YouTube sets the expire time to about two months
	75	expire_time=time.time() + 2 * 30 * 24 * 3600)
	76
	77	def _ids_to_results(self, ids):
	78	return [
	79	self.url_result(vid_id, 'Youtube', video_id=vid_id)
	80	for vid_id in ids]
	81
	82	def _login(self):
	83	"""
	84	Attempt to log in to YouTube.
	85	True is returned if successful or skipped.
	86	False is returned if login failed.
	87
	88	If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
	89	"""
	90	username, password = self._get_login_info()
	91	# No authentication to be performed
	92	if username is None:
	93	if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
	94	raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
	95	return True
	96
	97	login_page = self._download_webpage(
	98	self._LOGIN_URL, None,
	99	note='Downloading login page',
	100	errnote='unable to fetch login page', fatal=False)
	101	if login_page is False:
	102	return
	103
	104	login_form = self._hidden_inputs(login_page)
	105
	106	def req(url, f_req, note, errnote):
	107	data = login_form.copy()
	108	data.update({
	109	'pstMsg': 1,
	110	'checkConnection': 'youtube',
	111	'checkedDomains': 'youtube',
	112	'hl': 'en',
	113	'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
	114	'f.req': json.dumps(f_req),
	115	'flowName': 'GlifWebSignIn',
	116	'flowEntry': 'ServiceLogin',
	117	})
	118	return self._download_json(
	119	url, None, note=note, errnote=errnote,
	120	transform_source=lambda s: re.sub(r'^[^[]*', '', s),
	121	fatal=False,
	122	data=urlencode_postdata(data), headers={
	123	'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
	124	'Google-Accounts-XSRF': 1,
	125	})
	126
	127	def warn(message):
	128	self._downloader.report_warning(message)
	129
	130	lookup_req = [
	131	username,
	132	None, [], None, 'US', None, None, 2, False, True,
	133	[
	134	None, None,
	135	[2, 1, None, 1,
	136	'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
	137	None, [], 4],
	138	1, [None, None, []], None, None, None, True
	139	],
	140	username,
	141	]
	142
	143	lookup_results = req(
	144	self._LOOKUP_URL, lookup_req,
	145	'Looking up account info', 'Unable to look up account info')
	146
	147	if lookup_results is False:
	148	return False
	149
	150	user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
	151	if not user_hash:
	152	warn('Unable to extract user hash')
	153	return False
	154
	155	challenge_req = [
	156	user_hash,
	157	None, 1, None, [1, None, None, None, [password, None, True]],
	158	[
	159	None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
	160	1, [None, None, []], None, None, None, True
	161	]]
	162
	163	challenge_results = req(
	164	self._CHALLENGE_URL, challenge_req,
	165	'Logging in', 'Unable to log in')
	166
	167	if challenge_results is False:
	168	return
	169
	170	login_res = try_get(challenge_results, lambda x: x[0][5], list)
	171	if login_res:
	172	login_msg = try_get(login_res, lambda x: x[5], compat_str)
	173	warn(
	174	'Unable to login: %s' % 'Invalid password'
	175	if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg)
	176	return False
	177
	178	res = try_get(challenge_results, lambda x: x[0][-1], list)
	179	if not res:
	180	warn('Unable to extract result entry')
	181	return False
	182
	183	login_challenge = try_get(res, lambda x: x[0][0], list)
	184	if login_challenge:
	185	challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
	186	if challenge_str == 'TWO_STEP_VERIFICATION':
	187	# SEND_SUCCESS - TFA code has been successfully sent to phone
	188	# QUOTA_EXCEEDED - reached the limit of TFA codes
	189	status = try_get(login_challenge, lambda x: x[5], compat_str)
	190	if status == 'QUOTA_EXCEEDED':
	191	warn('Exceeded the limit of TFA codes, try later')
	192	return False
	193
	194	tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
	195	if not tl:
	196	warn('Unable to extract TL')
	197	return False
	198
	199	tfa_code = self._get_tfa_info('2-step verification code')
	200
	201	if not tfa_code:
	202	warn(
	203	'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
	204	'(Note that only TOTP (Google Authenticator App) codes work at this time.)')
	205	return False
	206
	207	tfa_code = remove_start(tfa_code, 'G-')
	208
	209	tfa_req = [
	210	user_hash, None, 2, None,
	211	[
	212	9, None, None, None, None, None, None, None,
	213	[None, tfa_code, True, 2]
	214	]]
	215
	216	tfa_results = req(
	217	self._TFA_URL.format(tl), tfa_req,
	218	'Submitting TFA code', 'Unable to submit TFA code')
	219
	220	if tfa_results is False:
	221	return False
	222
	223	tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
	224	if tfa_res:
	225	tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
	226	warn(
	227	'Unable to finish TFA: %s' % 'Invalid TFA code'
	228	if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg)
	229	return False
	230
	231	check_cookie_url = try_get(
	232	tfa_results, lambda x: x[0][-1][2], compat_str)
	233	else:
	234	CHALLENGES = {
	235	'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
	236	'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
	237	'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
	238	}
	239	challenge = CHALLENGES.get(
	240	challenge_str,
	241	'%s returned error %s.' % (self.IE_NAME, challenge_str))
	242	warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
	243	return False
	244	else:
	245	check_cookie_url = try_get(res, lambda x: x[2], compat_str)
	246
	247	if not check_cookie_url:
	248	warn('Unable to extract CheckCookie URL')
	249	return False
	250
	251	check_cookie_results = self._download_webpage(
	252	check_cookie_url, None, 'Checking cookie', fatal=False)
	253
	254	if check_cookie_results is False:
	255	return False
	256
	257	if 'https://myaccount.google.com/' not in check_cookie_results:
	258	warn('Unable to log in')
	259	return False
	260
	261	return True
	262
	263	def _download_webpage_handle(self, args, *kwargs):
	264	query = kwargs.get('query', {}).copy()
	265	query['disable_polymer'] = 'true'
	266	kwargs['query'] = query
	267	return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
	268	args, *compat_kwargs(kwargs))
	269
	270	def _real_initialize(self):
	271	if self._downloader is None:
	272	return
	273	self._set_language()
	274	if not self._login():
	275	return
	276
	277
	278	class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
	279	# Extract entries from page with "Load more" button
	280	def _entries(self, page, playlist_id):
	281	more_widget_html = content_html = page
	282	for page_num in itertools.count(1):
	283	for entry in self._process_page(content_html):
	284	yield entry
	285
	286	mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
	287	if not mobj:
	288	break
	289
	290	more = self._download_json(
	291	'https://youtube.com/%s' % mobj.group('more'), playlist_id,
	292	'Downloading page #%s' % page_num,
	293	transform_source=uppercase_escape)
	294	content_html = more['content_html']
	295	if not content_html.strip():
	296	# Some webpages show a "Load more" button but they don't
	297	# have more videos
	298	break
	299	more_widget_html = more['load_more_widget_html']
	300
	301
	302	class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
	303	def _process_page(self, content):
	304	for video_id, video_title in self.extract_videos_from_page(content):
	305	yield self.url_result(video_id, 'Youtube', video_id, video_title)
	306
	307	def extract_videos_from_page(self, page):
	308	ids_in_page = []
	309	titles_in_page = []
	310	for mobj in re.finditer(self._VIDEO_RE, page):
	311	# The link with index 0 is not the first video of the playlist (not sure if still actual)
	312	if 'index' in mobj.groupdict() and mobj.group('id') == '0':
	313	continue
	314	video_id = mobj.group('id')
	315	video_title = unescapeHTML(mobj.group('title'))
	316	if video_title:
	317	video_title = video_title.strip()
	318	try:
	319	idx = ids_in_page.index(video_id)
	320	if video_title and not titles_in_page[idx]:
	321	titles_in_page[idx] = video_title
	322	except ValueError:
	323	ids_in_page.append(video_id)
	324	titles_in_page.append(video_title)
	325	return zip(ids_in_page, titles_in_page)
	326
	327
	328	class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
	329	def _process_page(self, content):
	330	for playlist_id in orderedSet(re.findall(
	331	r'<h3[^>]+class="[^"]yt-lockup-title[^"]"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"',
	332	content)):
	333	yield self.url_result(
	334	'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist')
	335
	336	def _real_extract(self, url):
	337	playlist_id = self._match_id(url)
	338	webpage = self._download_webpage(url, playlist_id)
	339	title = self._og_search_title(webpage, fatal=False)
	340	return self.playlist_result(self._entries(webpage, playlist_id), playlist_id, title)
	341
	342
	343	class YoutubeIE(YoutubeBaseInfoExtractor):
	344	IE_DESC = 'YouTube.com'
	345	_VALID_URL = r"""(?x)^
	346	(
	347	(?:https?://\|//) # http(s):// or protocol-independent URL
	348	(?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/\|
	349	(?:www\.)?deturl\.com/www\.youtube\.com/\|
	350	(?:www\.)?pwnyoutube\.com/\|
	351	(?:www\.)?hooktube\.com/\|
	352	(?:www\.)?yourepeat\.com/\|
	353	tube\.majestyc\.net/\|
	354	(?:(?:www\|dev)\.)?invidio\.us/\|
	355	(?:www\.)?invidiou\.sh/\|
	356	(?:www\.)?invidious\.snopyta\.org/\|
	357	(?:www\.)?invidious\.kabi\.tk/\|
	358	(?:www\.)?vid\.wxzm\.sx/\|
	359	youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
	360	(?:.*?\#/)? # handle anchor (#/) redirect urls
	361	(?: # the various things that can precede the ID:
	362	(?:(?:v\|embed\|e)/(?!videoseries)) # v/ or embed/ or e/
	363	\|(?: # or the v= param in all its forms
	364	(?:(?:watch\|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup\|.php) or nothing (like /?v=xxxx)
	365	(?:\?\|\#!?) # the params delimiter ? or # or #!
	366	(?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&v=V36LpHqtcDY)
	367	v=
	368	)
	369	))
	370	\|(?:
	371	youtu\.be\| # just youtu.be/xxxx
	372	vid\.plus\| # or vid.plus/xxxx
	373	zwearz\.com/watch\| # or zwearz.com/watch/xxxx
	374	)/
	375	\|(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
	376	)
	377	)? # all until now is optional -> you can pass the naked ID
	378	([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
	379	(?!.*?\blist=
	380	(?:
	381	%(playlist_id)s\| # combined list/video URLs are handled by the playlist IE
	382	WL # WL are handled by the watch later IE
	383	)
	384	)
	385	(?(1).+)? # if we found the ID, everything can follow
	386	$""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
	387	_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
	388	_formats = {
	389	'5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
	390	'6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
	391	'13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
	392	'17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
	393	'18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
	394	'22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
	395	'34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
	396	'35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
	397	# itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
	398	'36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
	399	'37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
	400	'38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
	401	'43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
	402	'44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
	403	'45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
	404	'46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
	405	'59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
	406	'78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
	407
	408
	409	# 3D videos
	410	'82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
	411	'83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
	412	'84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
	413	'85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
	414	'100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
	415	'101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
	416	'102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
	417
	418	# Apple HTTP Live Streaming
	419	'91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
	420	'92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
	421	'93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
	422	'94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
	423	'95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
	424	'96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
	425	'132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
	426	'151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
	427
	428	# DASH mp4 video
	429	'133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
	430	'134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
	431	'135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
	432	'136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
	433	'137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
	434	'138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
	435	'160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
	436	'212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
	437	'264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
	438	'298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
	439	'299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
	440	'266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
	441
	442	# Dash mp4 audio
	443	'139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
	444	'140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
	445	'141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
	446	'256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
	447	'258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
	448	'325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
	449	'328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
	450
	451	# Dash webm
	452	'167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
	453	'168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
	454	'169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
	455	'170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
	456	'218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
	457	'219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
	458	'278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
	459	'242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
	460	'243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
	461	'244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
	462	'245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
	463	'246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
	464	'247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
	465	'248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
	466	'271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
	467	# itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
	468	'272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
	469	'302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
	470	'303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
	471	'308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
	472	'313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
	473	'315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
	474
	475	# Dash webm audio
	476	'171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
	477	'172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
	478
	479	# Dash webm audio with opus inside
	480	'249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
	481	'250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
	482	'251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
	483
	484	# RTMP (unnamed)
	485	'_rtmp': {'protocol': 'rtmp'},
	486	}
	487	_SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt')
	488
	489	_GEO_BYPASS = False
	490
	491	IE_NAME = 'youtube'
	492	_TESTS = [
	493	{
	494	'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
	495	'info_dict': {
	496	'id': 'BaW_jenozKc',
	497	'ext': 'mp4',
	498	'title': 'youtube-dl test video "\'/\\ä↭𝕐',
	499	'uploader': 'Philipp Hagemeister',
	500	'uploader_id': 'phihag',

1

# coding: utf-8

2

3

from __future__ import unicode_literals

import itertools

import json

import os.path

import random

import re

import time

import traceback

from .common import InfoExtractor, SearchInfoExtractor

15

from ..jsinterp import JSInterpreter

16

from ..swfinterp import SWFInterpreter

17

from ..compat import (

compat_chr,

compat_kwargs,

compat_parse_qs,

compat_urllib_parse_unquote,

22

compat_urllib_parse_unquote_plus,

23

compat_urllib_parse_urlencode,

24

compat_urllib_parse_urlparse,

compat_urlparse,

compat_str,

)

from ..utils import (

clean_html,

error_to_compat_str,

ExtractorError,

float_or_none,

get_element_by_attribute,

get_element_by_id,

int_or_none,

mimetype2ext,

orderedSet,

parse_codecs,

parse_duration,

qualities,

remove_quotes,

remove_start,

smuggle_url,

str_or_none,

str_to_int,

try_get,

unescapeHTML,

unified_strdate,

unsmuggle_url,

uppercase_escape,

url_or_none,

urlencode_postdata,

)

class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    # Known playlist ID prefixes followed by at least 10 ID characters
    _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)[0-9A-Za-z-_]{10,}'

    def _set_language(self):
        """Set the PREF cookie on .youtube.com so that pages are requested with hl=en."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Wrap each video ID in a url_result entry handled by the 'Youtube' extractor."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Explicit False (not None) to honour the documented contract
            return False

        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            """POST the hidden login form fields plus the JSON-encoded f_req to url.

            Returns the parsed JSON response, or False when the non-fatal
            download fails.
            """
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                # Drop everything before the first '[' so the payload parses as JSON
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            self._downloader.report_warning(message)

        # Same ServiceLogin URL is embedded in both the lookup and the challenge request
        service_login_url = 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn'

        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 service_login_url,
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, service_login_url, None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            # Explicit False (not None) to honour the documented contract
            return False

        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            # Parenthesised so the prefix is kept for every message, not
            # only for the INCORRECT_ANSWER_ENTERED case
            warn('Unable to login: %s' % (
                'Invalid password'
                if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg))
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code> '
                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                tfa_code = remove_start(tfa_code, 'G-')

                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    # Parenthesised so the prefix is kept for every message
                    warn('Unable to finish TFA: %s' % (
                        'Invalid TFA code'
                        if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg))
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _download_webpage_handle(self, *args, **kwargs):
        """Delegate to the parent implementation with disable_polymer=true added to the query."""
        # Copy so the caller's query dict is never mutated
        query = kwargs.get('query', {}).copy()
        query['disable_polymer'] = 'true'
        kwargs['query'] = query
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _real_initialize(self):
        """Set the language cookie and attempt to log in, unless no downloader is attached."""
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return

class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
    # Extract entries from page with "Load more" button
    def _entries(self, page, playlist_id):
        """Yield entries from page and from every follow-up "Load more" page.

        Each chunk of HTML is handed to self._process_page(); the next chunk
        is fetched from the data-uix-load-more-href link found in the most
        recent load-more widget markup.
        """
        widget_markup = page
        current_html = page
        page_num = 1
        while True:
            for item in self._process_page(current_html):
                yield item

            load_more = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', widget_markup)
            if load_more is None:
                return

            payload = self._download_json(
                'https://youtube.com/%s' % load_more.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            current_html = payload['content_html']
            if not current_html.strip():
                # Some webpages show a "Load more" button but they don't
                # have more videos
                return
            widget_markup = payload['load_more_widget_html']
            page_num += 1

300

301

302

class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    # Turns the video links found on a playlist-style page into url_result
    # entries. Subclasses are expected to provide _VIDEO_RE, a pattern with
    # at least the named groups 'id' and 'title'.

    def _process_page(self, content):
        """Yield one ``url_result`` per distinct video found in ``content``."""
        for video_id, video_title in self.extract_videos_from_page(content):
            yield self.url_result(video_id, 'Youtube', video_id, video_title)

    def extract_videos_from_page(self, page):
        """Return ``(video_id, title)`` pairs for the videos linked in ``page``.

        Video IDs keep the order of their first occurrence. When an ID is
        matched more than once, the first non-empty title seen for it wins.
        Titles are HTML-unescaped and stripped; a missing title stays falsy.
        """
        ids_in_page = []
        titles_in_page = []
        # video ID -> position in the two lists above, so that repeated links
        # are resolved in constant time instead of rescanning ids_in_page.
        position_by_id = {}
        for mobj in re.finditer(self._VIDEO_RE, page):
            # The link with index 0 is not the first video of the playlist (not sure if still actual)
            # NOTE(review): this compares the 'id' group (not 'index') to '0';
            # kept exactly as-is to preserve behaviour -- confirm the intent.
            if 'index' in mobj.groupdict() and mobj.group('id') == '0':
                continue
            video_id = mobj.group('id')
            video_title = unescapeHTML(mobj.group('title'))
            if video_title:
                video_title = video_title.strip()
            idx = position_by_id.get(video_id)
            if idx is None:
                # First time this video is seen: remember where it lives.
                position_by_id[video_id] = len(ids_in_page)
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)
            elif video_title and not titles_in_page[idx]:
                # A later link carries the title the first one lacked.
                titles_in_page[idx] = video_title
        return zip(ids_in_page, titles_in_page)

326

327

328

class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    # Base for pages that list playlists (rather than videos); every
    # playlist link found becomes a YoutubePlaylist url_result.

    def _process_page(self, content):
        """Yield a ``url_result`` for each distinct playlist linked in ``content``."""
        found_ids = re.findall(
            r'<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"',
            content)
        # orderedSet drops duplicates while keeping first-seen order.
        for playlist_id in orderedSet(found_ids):
            playlist_url = 'https://www.youtube.com/playlist?list=%s' % playlist_id
            yield self.url_result(playlist_url, 'YoutubePlaylist')

    def _real_extract(self, url):
        """Download ``url`` and return a playlist result of the playlists on it.

        The playlist title comes from the page's Open Graph title and may be
        missing, since the lookup is non-fatal.
        """
        list_id = self._match_id(url)
        html = self._download_webpage(url, list_id)
        og_title = self._og_search_title(html, fatal=False)
        entries = self._entries(html, list_id)
        return self.playlist_result(entries, list_id, og_title)

341

342

343

class YoutubeIE(YoutubeBaseInfoExtractor):

344

IE_DESC = 'YouTube.com'

345

_VALID_URL = r"""(?x)^

346

(

347

(?:https?://|//) # http(s):// or protocol-independent URL

348

(?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|

349

(?:www\.)?deturl\.com/www\.youtube\.com/|

350

(?:www\.)?pwnyoutube\.com/|

351

(?:www\.)?hooktube\.com/|

352

(?:www\.)?yourepeat\.com/|

353

tube\.majestyc\.net/|

354

(?:(?:www|dev)\.)?invidio\.us/|

355

(?:www\.)?invidiou\.sh/|

356

(?:www\.)?invidious\.snopyta\.org/|

357

(?:www\.)?invidious\.kabi\.tk/|

358

(?:www\.)?vid\.wxzm\.sx/|

359

youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains

360

(?:.*?\#/)? # handle anchor (#/) redirect urls

361

(?: # the various things that can precede the ID:

362

(?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/

363

|(?: # or the v= param in all its forms

364

(?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)

365

(?:\?|\#!?) # the params delimiter ? or # or #!

366

(?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&v=V36LpHqtcDY)

v=

)

))

|(?:

youtu\.be| # just youtu.be/xxxx

372

vid\.plus| # or vid.plus/xxxx

373

zwearz\.com/watch| # or zwearz.com/watch/xxxx

374

)/

375

|(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=

376

)

377

)? # all until now is optional -> you can pass the naked ID

378

([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID

379

(?!.*?\blist=

380

(?:

381

%(playlist_id)s| # combined list/video URLs are handled by the playlist IE

382

WL # WL are handled by the watch later IE

383

)

384

)

385

(?(1).+)? # if we found the ID, everything can follow

386

$""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}

387

_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'

388

_formats = {

389

'5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},

390

'6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},

391

'13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},

392

'17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},

393

'18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},

394

'22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},

395

'34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},

396

'35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},

397

# itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well

398

'36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},

399

'37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},

400

'38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},

401

'43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},

402

'44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},

403

'45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},

404

'46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},

405

'59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},

406

'78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},

# 3D videos

'82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},

411

'83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},

412

'84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},

413

'85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},

414

'100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},

415

'101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},

416

'102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},

417

418

# Apple HTTP Live Streaming

419

'91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},

420

'92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},

421

'93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},

422

'94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},

423

'95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},

424

'96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},

425

'132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},

426

'151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},

427

428

# DASH mp4 video

429

'133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},

430

'134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},

431

'135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},

432

'136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},

433

'137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},

434

'138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)

435

'160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},

436

'212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},

437

'264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},

438

'298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},

439

'299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},

440

'266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},

441

442

# Dash mp4 audio

443

'139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},

444

'140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},

445

'141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},

446

'256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},

447

'258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},

448

'325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},

449

'328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},

450

451

# Dash webm

452

'167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},

453

'168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},

454

'169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},

455

'170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},

456

'218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},

457

'219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},

458

'278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},

459

'242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},

460

'243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},

461

'244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},

462

'245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},

463

'246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},

464

'247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},

465

'248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},

466

'271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},

467

# itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)

468

'272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},

469

'302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},

470

'303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},

471

'308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},

472

'313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},

473

'315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},

474

475

# Dash webm audio

476

'171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},

477

'172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},

478

479

# Dash webm audio with opus inside

480

'249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},

481

'250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},

482

'251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},

483

484

# RTMP (unnamed)

485

'_rtmp': {'protocol': 'rtmp'},

486

}

487

_SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt')

_GEO_BYPASS = False

IE_NAME = 'youtube'

_TESTS = [

{

'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',

'info_dict': {

'id': 'BaW_jenozKc',

'ext': 'mp4',

'title': 'youtube-dl test video "\'/\\ä↭𝕐',

499

'uploader': 'Philipp Hagemeister',

500

'uploader_id': 'phihag',

501

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',

502

'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',

503

'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',

504

'upload_date': '20121002',

505

'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',

506

'categories': ['Science & Technology'],

507

'tags': ['youtube-dl'],

'duration': 10,

'view_count': int,

'like_count': int,

'dislike_count': int,

'start_time': 1,

'end_time': 9,

}

},

{

'url': 'https://www.youtube.com/watch?v=UxxajLWwzqY',

518

'note': 'Test generic use_cipher_signature video (#897)',

'info_dict': {

'id': 'UxxajLWwzqY',

'ext': 'mp4',

'upload_date': '20120506',

523

'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',

524

'alt_title': 'I Love It (feat. Charli XCX)',

525

'description': 'md5:f3ceb5ef83a08d95b9d146f973157cc8',

526

'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',

527

'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',

528

'iconic ep', 'iconic', 'love', 'it'],

529

'duration': 180,

530

'uploader': 'Icona Pop',

531

'uploader_id': 'IconaPop',

532

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop',

533

'creator': 'Icona Pop',

534

'track': 'I Love It (feat. Charli XCX)',

535

'artist': 'Icona Pop',

}

},

{

'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',

540

'note': 'Test VEVO video with age protection (#956)',

'info_dict': {

'id': '07FYdnEawAQ',

'ext': 'mp4',

'upload_date': '20130703',

545

'title': 'Justin Timberlake - Tunnel Vision (Official Music Video) (Explicit)',

546

'alt_title': 'Tunnel Vision',

547

'description': 'md5:07dab3356cde4199048e4c7cd93471e1',

548

'duration': 419,

549

'uploader': 'justintimberlakeVEVO',

550

'uploader_id': 'justintimberlakeVEVO',

551

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO',

552

'creator': 'Justin Timberlake',

553

'track': 'Tunnel Vision',

554

'artist': 'Justin Timberlake',

'age_limit': 18,

}

},

{

'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',

560

'note': 'Embed-only video (#1746)',

'info_dict': {

'id': 'yZIXLfi8CZQ',

'ext': 'mp4',

'upload_date': '20120608',

565

'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',

566

'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',

567

'uploader': 'SET India',

568

'uploader_id': 'setindia',

569

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',

'age_limit': 18,

}

},

{

'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY',

575

'note': 'Use the first video ID in the URL',

'info_dict': {

'id': 'BaW_jenozKc',

'ext': 'mp4',

'title': 'youtube-dl test video "\'/\\ä↭𝕐',

580

'uploader': 'Philipp Hagemeister',

581

'uploader_id': 'phihag',

582

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',

583

'upload_date': '20121002',

584

'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',

585

'categories': ['Science & Technology'],

586

'tags': ['youtube-dl'],

'duration': 10,

'view_count': int,

'like_count': int,

'dislike_count': int,

591

},

592

'params': {

593

'skip_download': True,

},

},

{

'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',

598

'note': '256k DASH audio (format 141) via DASH manifest',

'info_dict': {

'id': 'a9LDPn-MO4I',

'ext': 'm4a',

'upload_date': '20121002',

603

'uploader_id': '8KVIDEO',

604

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',

605

'description': '',

606

'uploader': '8KVIDEO',

607

'title': 'UHDTV TEST 8K VIDEO.mp4'

608

},

609

'params': {

610

'youtube_include_dash_manifest': True,

611

'format': '141',

612

},

613

'skip': 'format 141 not served anymore',

614

},

615

# DASH manifest with encrypted signature

616

{

617

'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',

'info_dict': {

'id': 'IB3lcPjvWLA',

'ext': 'm4a',

'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson',

622

'description': 'md5:8f5e2b82460520b619ccac1f509d43bf',

623

'duration': 244,

624

'uploader': 'AfrojackVEVO',

625

'uploader_id': 'AfrojackVEVO',

626

'upload_date': '20131011',

627

},

628

'params': {

629

'youtube_include_dash_manifest': True,

630

'format': '141/bestaudio[ext=m4a]',

631

},

632

},

633

# JS player signature function name containing $

634

{

635

'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',

'info_dict': {

'id': 'nfWlot6h_JM',

'ext': 'm4a',

'title': 'Taylor Swift - Shake It Off',

640

'description': 'md5:bec2185232c05479482cb5a9b82719bf',

641

'duration': 242,

642

'uploader': 'TaylorSwiftVEVO',

643

'uploader_id': 'TaylorSwiftVEVO',

644

'upload_date': '20140818',

645

'creator': 'Taylor Swift',

646

},

647

'params': {

648

'youtube_include_dash_manifest': True,

649

'format': '141/bestaudio[ext=m4a]',

},

},

# Controversy video

{

'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',

'info_dict': {

'id': 'T4XJQO3qol8',

'ext': 'mp4',

'duration': 219,

'upload_date': '20100909',

660

'uploader': 'Amazing Atheist',

661

'uploader_id': 'TheAmazingAtheist',

662

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',

663

'title': 'Burning Everyone\'s Koran',

664

'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',

665

}

666

},

667

# Normal age-gate video (No vevo, embed allowed)

668

{

669

'url': 'https://youtube.com/watch?v=HtVdAasjOgU',

'info_dict': {

'id': 'HtVdAasjOgU',

'ext': 'mp4',

'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',

674

'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',

675

'duration': 142,

676

'uploader': 'The Witcher',

677

'uploader_id': 'WitcherGame',

678

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',

679

'upload_date': '20140605',

'age_limit': 18,

},

},

# Age-gate video with encrypted signature

684

{

685

'url': 'https://www.youtube.com/watch?v=6kLq3WMV1nU',

'info_dict': {

'id': '6kLq3WMV1nU',

'ext': 'mp4',

'title': 'Dedication To My Ex (Miss That) (Lyric Video)',

690

'description': 'md5:33765bb339e1b47e7e72b5490139bb41',

691

'duration': 246,

692

'uploader': 'LloydVEVO',

693

'uploader_id': 'LloydVEVO',

694

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/LloydVEVO',

695

'upload_date': '20110629',

'age_limit': 18,

},

},

# video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421)

700

# YouTube Red ad is not captured for creator

701

{

702

'url': '__2ABJjxzNo',

'info_dict': {

'id': '__2ABJjxzNo',

'ext': 'mp4',

'duration': 266,

'upload_date': '20100430',

708

'uploader_id': 'deadmau5',

709

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',

710

'creator': 'deadmau5',

711

'description': 'md5:12c56784b8032162bb936a5f76d55360',

712

'uploader': 'deadmau5',

713

'title': 'Deadmau5 - Some Chords (HD)',

714

'alt_title': 'Some Chords',

715

},

716

'expected_warnings': [

717

'DASH manifest missing',

718

]

719

},

720

# Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)

721

{

722

'url': 'lqQg6PlCWgI',

'info_dict': {

'id': 'lqQg6PlCWgI',

'ext': 'mp4',

'duration': 6085,

'upload_date': '20150827',

728

'uploader_id': 'olympic',

729

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',

730

'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',

731

'uploader': 'Olympic',

732

'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',

733

},

734

'params': {

735

'skip_download': 'requires avconv',

}

},

# Non-square pixels

{

'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',

'info_dict': {

'id': '_b-2C3KPAM0',

'ext': 'mp4',

'stretched_ratio': 16 / 9.,

745

'duration': 85,

746

'upload_date': '20110310',

747

'uploader_id': 'AllenMeow',

748

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',

749

'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',

750

'uploader': '孫ᄋᄅ',

751

'title': '[A-made] 變態妍字幕版太妍我就是這樣的人',

752

},

753

},

754

# url_encoded_fmt_stream_map is empty string

755

{

756

'url': 'qEJwOuvDf7I',

'info_dict': {

'id': 'qEJwOuvDf7I',

'ext': 'webm',

'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',

761

'description': '',

762

'upload_date': '20150404',

763

'uploader_id': 'spbelect',

764

'uploader': 'Наблюдатели Петербурга',

765

},

766

'params': {

767

'skip_download': 'requires avconv',

768

},

769

'skip': 'This live event has ended.',

770

},

771

# Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)

772

{

773

'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',

'info_dict': {

'id': 'FIl7x6_3R5Y',

'ext': 'webm',

'title': 'md5:7b81415841e02ecd4313668cde88737a',

778

'description': 'md5:116377fd2963b81ec4ce64b542173306',

779

'duration': 220,

780

'upload_date': '20150625',

781

'uploader_id': 'dorappi2000',

782

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',

783

'uploader': 'dorappi2000',

784

'formats': 'mincount:31',

785

},

786

'skip': 'not actual anymore',

787

},

788

# DASH manifest with segment_list

789

{

790

'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',

791

'md5': '8ce563a1d667b599d21064e982ab9e31',

'info_dict': {

'id': 'CsmdDsKjzN8',

'ext': 'mp4',

'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510

796

'uploader': 'Airtek',

797

'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',

798

'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',

799

'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',

800

},

801

'params': {

802

'youtube_include_dash_manifest': True,

803

'format': '135', # bestvideo

804

},

805

'skip': 'This live event has ended.',

806

},

807

{

808

# Multifeed videos (multiple cameras), URL is for Main Camera

809

'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',

810

'info_dict': {

811

'id': 'jqWvoWXjCVs',

812

'title': 'teamPGP: Rocket League Noob Stream',

813

'description': 'md5:dc7872fb300e143831327f1bae3af010',

},

'playlist': [{

'info_dict': {

'id': 'jqWvoWXjCVs',

'ext': 'mp4',

'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',

820

'description': 'md5:dc7872fb300e143831327f1bae3af010',

821

'duration': 7335,

822

'upload_date': '20150721',

823

'uploader': 'Beer Games Beer',

824

'uploader_id': 'beergamesbeer',

825

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',

826

'license': 'Standard YouTube License',

},

}, {

'info_dict': {

'id': '6h8e8xoXJzg',

'ext': 'mp4',

'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',

833

'description': 'md5:dc7872fb300e143831327f1bae3af010',

834

'duration': 7337,

835

'upload_date': '20150721',

836

'uploader': 'Beer Games Beer',

837

'uploader_id': 'beergamesbeer',

838

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',

839

'license': 'Standard YouTube License',

},

}, {

'info_dict': {

'id': 'PUOgX5z9xZw',

'ext': 'mp4',

'title': 'teamPGP: Rocket League Noob Stream (grizzle)',

846

'description': 'md5:dc7872fb300e143831327f1bae3af010',

847

'duration': 7337,

848

'upload_date': '20150721',

849

'uploader': 'Beer Games Beer',

850

'uploader_id': 'beergamesbeer',

851

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',

852

'license': 'Standard YouTube License',

},

}, {

'info_dict': {

'id': 'teuwxikvS5k',

'ext': 'mp4',

'title': 'teamPGP: Rocket League Noob Stream (zim)',

859

'description': 'md5:dc7872fb300e143831327f1bae3af010',

860

'duration': 7334,

861

'upload_date': '20150721',

862

'uploader': 'Beer Games Beer',

863

'uploader_id': 'beergamesbeer',

864

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',

865

'license': 'Standard YouTube License',

},

}],

'params': {

'skip_download': True,

870

},

871

'skip': 'This video is not available.',

872

},

873

{

874

# Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)

875

'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',

876

'info_dict': {

877

'id': 'gVfLd0zydlo',

878

'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',

879

},

880

'playlist_count': 2,

881

'skip': 'Not multifeed anymore',

882

},

883

{

884

'url': 'https://vid.plus/FlRa-iH7PGw',

885

'only_matching': True,

886

},

887

{

888

'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',

889

'only_matching': True,

890

},

891

{

892

# Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)

893

# Also tests cut-off URL expansion in video description (see

894

# https://github.com/ytdl-org/youtube-dl/issues/1892,

895

# https://github.com/ytdl-org/youtube-dl/issues/8164)

896

'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',

'info_dict': {

'id': 'lsguqyKfVQg',

'ext': 'mp4',

'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',

901

'alt_title': 'Dark Walk - Position Music',

902

'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',

903

'duration': 133,

904

'upload_date': '20151119',

905

'uploader_id': 'IronSoulElf',

906

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',

907

'uploader': 'IronSoulElf',

908

'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',

909

'track': 'Dark Walk - Position Music',

910

'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',

911

},

912

'params': {

913

'skip_download': True,

},

},

{

# Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)

918

'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',

919

'only_matching': True,

920

},

921

{

922

# Video with yt:stretch=17:0

923

'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',

'info_dict': {

'id': 'Q39EVAstoRM',

'ext': 'mp4',

'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',

928

'description': 'md5:ee18a25c350637c8faff806845bddee9',

929

'upload_date': '20151107',

930

'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',

931

'uploader': 'CH GAMER DROID',

932

},

933

'params': {

934

'skip_download': True,

935

},

936

'skip': 'This video does not exist.',

937

},

938

{

939

# Video licensed under Creative Commons

940

'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',

'info_dict': {

'id': 'M4gD1WSo5mA',

'ext': 'mp4',

'title': 'md5:e41008789470fc2533a3252216f1c1d1',

945

'description': 'md5:a677553cf0840649b731a3024aeff4cc',

946

'duration': 721,

947

'upload_date': '20150127',

948

'uploader_id': 'BerkmanCenter',

949

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',

950

'uploader': 'The Berkman Klein Center for Internet & Society',

951

'license': 'Creative Commons Attribution license (reuse allowed)',

952

},

953

'params': {

954

'skip_download': True,

},

},

{

# Channel-like uploader_url

959

'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',

'info_dict': {

'id': 'eQcmzGIKrzg',

'ext': 'mp4',

'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',

964

'description': 'md5:dda0d780d5a6e120758d1711d062a867',

965

'duration': 4060,

966

'upload_date': '20151119',

967

'uploader': 'Bernie Sanders',

968

'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',

969

'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',

970

'license': 'Creative Commons Attribution license (reuse allowed)',

971

},

972

'params': {

973

'skip_download': True,

},

},

{

'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;v=V36LpHqtcDY',

978

'only_matching': True,

979

},

980

{

981

# YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)

982

'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',

983

'only_matching': True,

984

},

985

{

986

# Rental video preview

987

'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',

'info_dict': {

'id': 'uGpuVWrhIzE',

'ext': 'mp4',

'title': 'Piku - Trailer',

992

'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',

993

'upload_date': '20150811',

994

'uploader': 'FlixMatrix',

995

'uploader_id': 'FlixMatrixKaravan',

996

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',

997

'license': 'Standard YouTube License',

998

},

999

'params': {

1000

'skip_download': True,

1001

},

1002

'skip': 'This video is not available.',

1003

},

1004

{

1005

# YouTube Red video with episode data

1006

'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',

'info_dict': {

'id': 'iqKdEhx-dD4',

'ext': 'mp4',

'title': 'Isolation - Mind Field (Ep 1)',

1011

'description': 'md5:46a29be4ceffa65b92d277b93f463c0f',

1012

'duration': 2085,

1013

'upload_date': '20170118',

1014

'uploader': 'Vsauce',

1015

'uploader_id': 'Vsauce',

1016

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',

1017

'series': 'Mind Field',

'season_number': 1,

'episode_number': 1,

},

'params': {

'skip_download': True,

1023

},

1024

'expected_warnings': [

1025

'Skipping DASH manifest',

],

},

{

# The following content has been identified by the YouTube community

1030

# as inappropriate or offensive to some audiences.

1031

'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',

'info_dict': {

'id': '6SJNVb0GnPI',

'ext': 'mp4',

'title': 'Race Differences in Intelligence',

1036

'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',

1037

'duration': 965,

1038

'upload_date': '20140124',

1039

'uploader': 'New Century Foundation',

1040

'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',

1041

'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',

1042

},

1043

'params': {

1044

'skip_download': True,

},

},

{

# itag 212

'url': '1t24XAntNCY',

1050

'only_matching': True,

1051

},

1052

{

1053

# geo restricted to JP

1054

'url': 'sJL6WA-aGkQ',

1055

'only_matching': True,

1056

},

1057

{

1058

'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',

1059

'only_matching': True,

1060

},

1061

{

1062

'url': 'https://invidio.us/watch?v=BaW_jenozKc',

1063

'only_matching': True,

},

{

# DRM protected

'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',

1068

'only_matching': True,

1069

},

1070

{

1071

# Video with unsupported adaptive stream type formats

1072

'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',

'info_dict': {

'id': 'Z4Vy8R84T1U',

'ext': 'mp4',

'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',

1077

'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',

1078

'duration': 433,

1079

'upload_date': '20130923',

1080

'uploader': 'Amelia Putri Harwita',

1081

'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',

1082

'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',

1083

'formats': 'maxcount:10',

1084

},

1085

'params': {

1086

'skip_download': True,

1087

'youtube_include_dash_manifest': False,

},

},

{

# Youtube Music Auto-generated description

1092

'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',

'info_dict': {

'id': 'MgNrAu2pzNs',

'ext': 'mp4',

'title': 'Voyeur Girl',

1097

'description': 'md5:7ae382a65843d6df2685993e90a8628f',

1098

'upload_date': '20190312',

1099

'uploader': 'Various Artists - Topic',

1100

'uploader_id': 'UCVWKBi1ELZn0QX2CBLSkiyw',

1101

'artist': 'Stephen',

1102

'track': 'Voyeur Girl',

1103

'album': 'it\'s too much love to know my dear',

1104

'release_date': '20190313',

1105

'release_year': 2019,

1106

},

1107

'params': {

1108

'skip_download': True,

},

},

{

# Youtube Music Auto-generated description

1113

# Retrieve 'artist' field from 'Artist:' in video description

1114

# when it is present on youtube music video

1115

'url': 'https://www.youtube.com/watch?v=k0jLE7tTwjY',

'info_dict': {

'id': 'k0jLE7tTwjY',

'ext': 'mp4',

'title': 'Latch Feat. Sam Smith',

1120

'description': 'md5:3cb1e8101a7c85fcba9b4fb41b951335',

1121

'upload_date': '20150110',

1122

'uploader': 'Various Artists - Topic',

1123

'uploader_id': 'UCNkEcmYdjrH4RqtNgh7BZ9w',

1124

'artist': 'Disclosure',

1125

'track': 'Latch Feat. Sam Smith',

1126

'album': 'Latch Featuring Sam Smith',

1127

'release_date': '20121008',

1128

'release_year': 2012,

1129

},

1130

'params': {

1131

'skip_download': True,

},

},

{

# Youtube Music Auto-generated description

1136

# handle multiple artists on youtube music video

1137

'url': 'https://www.youtube.com/watch?v=74qn0eJSjpA',

'info_dict': {

'id': '74qn0eJSjpA',

'ext': 'mp4',

'title': 'Eastside',

'description': 'md5:290516bb73dcbfab0dcc4efe6c3de5f2',

1143

'upload_date': '20180710',

1144

'uploader': 'Benny Blanco - Topic',

1145

'uploader_id': 'UCzqz_ksRu_WkIzmivMdIS7A',

1146

'artist': 'benny blanco, Halsey, Khalid',

1147

'track': 'Eastside',

1148

'album': 'Eastside',

1149

'release_date': '20180713',

1150

'release_year': 2018,

1151

},

1152

'params': {

1153

'skip_download': True,

},

},

{

# Youtube Music Auto-generated description

1158

# handle youtube music video with release_year and no release_date

1159

'url': 'https://www.youtube.com/watch?v=-hcAI0g-f5M',

'info_dict': {

'id': '-hcAI0g-f5M',

'ext': 'mp4',

'title': 'Put It On Me',

1164

'description': 'md5:93c55acc682ae7b0c668f2e34e1c069e',

1165

'upload_date': '20180426',

1166

'uploader': 'Matt Maeson - Topic',

1167

'uploader_id': 'UCnEkIGqtGcQMLk73Kp-Q5LQ',

1168

'artist': 'Matt Maeson',

1169

'track': 'Put It On Me',

1170

'album': 'The Hearse',

1171

'release_date': None,

1172

'release_year': 2018,

1173

},

1174

'params': {

1175

'skip_download': True,

},

},

]

def __init__(self, *args, **kwargs):
    """Initialise the extractor and start with an empty player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Filled lazily by _decrypt_signature; keyed by
    # (player URL, signature cache id).
    self._player_cache = dict()

1183

1184

def report_video_info_webpage_download(self, video_id):
    """Tell the user that the video info webpage is being downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)

1187

1188

def report_information_extraction(self, video_id):
    """Tell the user that video information is being extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)

1191

1192

def report_unavailable_format(self, video_id, format):
    """Tell the user that the requested format is not available."""
    message = '%s: Format %s not available' % (video_id, format)
    self.to_screen(message)

1195

1196

def report_rtmp_download(self):
    """Tell the user that the download will go through RTMP."""
    message = 'RTMP download detected'
    self.to_screen(message)

1199

1200

def _signature_cache_id(self, example_sig):
    """Return the dot-joined lengths of the dot-separated signature parts.

    For example a signature split by '.' into parts of 3 and 5 characters
    gives '3.5'.
    """
    part_lengths = [
        compat_str(len(segment)) for segment in example_sig.split('.')]
    return '.'.join(part_lengths)

1203

1204

def _extract_signature_function(self, video_id, player_url, example_sig):
    """Return a callable that deciphers signatures shaped like example_sig.

    The player id and type (file extension) are parsed out of player_url.
    The filesystem cache ('youtube-sigfuncs') is consulted first; on a miss
    the player is downloaded, parsed with the JS or SWF parser, and the
    resulting character permutation is stored back in the cache.

    Raises ExtractorError if the player URL cannot be identified or its
    type is neither 'js' nor 'swf'.
    """
    id_m = re.match(
        r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2,3}_[A-Z]{2})?/base)?\.(?P<ext>[a-z]+)$',
        player_url)
    if not id_m:
        raise ExtractorError('Cannot identify player %r' % player_url)
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%s' % (
        player_type, player_id, self._signature_cache_id(example_sig))
    # func_id is passed to the cache as a key; it must not contain any
    # path component.
    assert os.path.basename(func_id) == func_id

    cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
    if cache_spec is not None:
        # The cached spec is a list of source indices: output character k
        # is input character cache_spec[k].
        return lambda s: ''.join(s[i] for i in cache_spec)

    download_note = (
        'Downloading player %s' % player_url
        if self._downloader.params.get('verbose') else
        'Downloading %s player %s' % (player_type, player_id)
    )
    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note=download_note,
            errnote='Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note=download_note,
            errnote='Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        # An explicit raise instead of `assert False`: assertions are
        # stripped under `python -O`, which would leave `res` unbound.
        raise ExtractorError('Invalid player type %r' % player_type)

    # Run the function on a string whose k-th character has code point k,
    # so the output directly reveals the index permutation.
    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = res(test_string)
    cache_spec = [ord(c) for c in cache_res]

    self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
    return res

1249

1250

def _print_sig_code(self, func, example_sig):
    """Print Python code equivalent to the signature function ``func``.

    ``func`` is applied to a probe string whose k-th character has code
    point k; the resulting index sequence is rendered as a sum of index and
    slice expressions on ``s`` and written out with ``self.to_screen``.
    """
    def gen_sig_code(idxs):
        def _genslice(start, end, step):
            first = str(start) if start != 0 else ''
            stop = end + step
            last = ':' if stop < 0 else ':%d' % stop
            stride = ':%d' % step if step != 1 else ''
            return 's[%s%s%s]' % (first, last, stride)

        step = None
        # Squelch pyflakes warnings - start will be set when step is set
        start = '(Never used)'
        for i, prev in zip(idxs[1:], idxs[:-1]):
            delta = i - prev
            if step is None:
                if delta in (-1, 1):
                    # A run of consecutive indices begins at prev.
                    step, start = delta, prev
                else:
                    yield 's[%d]' % prev
            elif delta != step:
                # The current run ends at prev.
                yield _genslice(start, prev, step)
                step = None
        if step is None:
            yield 's[%d]' % i
        else:
            yield _genslice(start, i, step)

    probe = ''.join(map(compat_chr, range(len(example_sig))))
    index_spec = [ord(ch) for ch in func(probe)]
    expr_code = ' + '.join(gen_sig_code(index_spec))
    part_lengths = ', '.join(
        compat_str(len(p)) for p in example_sig.split('.'))
    signature_id_tuple = '(%s)' % part_lengths
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            ' return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)

1288

1289

def _parse_sig_js(self, jscode):
    """Build a signature-deciphering callable from JS player source.

    The name of the initial signature function is located with a series of
    fallback regular expressions (tried in order), the function is then
    extracted with JSInterpreter and wrapped so that it takes the encrypted
    signature string as its single argument.
    """
    funcname = self._search_regex(
        (r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
         r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
         r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*c\s*&&\s*d\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?(?P<sig>[a-zA-Z0-9$]+)\(',
         r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
         # The parentheses must be escaped literals here; a bare `$` would
         # anchor at end of input and the pattern could never match.
         r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
        jscode, 'Initial JS player signature function name', group='sig')

    jsi = JSInterpreter(jscode)
    initial_function = jsi.extract_function(funcname)
    # The extracted function takes its arguments as a list.
    return lambda s: initial_function([s])

1301

1302

def _parse_sig_swf(self, file_contents):
    """Build a signature-deciphering callable from SWF player contents.

    Extracts the 'decipher' function of the 'SignatureDecipher' class with
    SWFInterpreter and wraps it to accept the signature string directly.
    """
    TARGET_CLASSNAME = 'SignatureDecipher'
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class(TARGET_CLASSNAME)
    decipher = interpreter.extract_function(decipher_class, 'decipher')

    def run(s):
        # The extracted function takes its arguments as a list.
        return decipher([s])

    return run

1308

1309

def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
    """Turn the encrypted s field into a working signature.

    player_url is normalised to an absolute URL, then the deciphering
    function for that player and signature shape is fetched from (or added
    to) self._player_cache and applied to s. ``age_gate`` is accepted but
    not used in this method.

    Raises ExtractorError when player_url is None or when anything goes
    wrong while obtaining or running the function; in the latter case the
    formatted traceback is included in the message.
    """
    if player_url is None:
        raise ExtractorError('Cannot decrypt signature without player_url')

    if player_url.startswith('//'):
        # Protocol-relative URL
        player_url = 'https:' + player_url
    elif not re.match(r'https?://', player_url):
        # Site-relative URL
        player_url = compat_urlparse.urljoin(
            'https://www.youtube.com', player_url)

    try:
        cache_key = (player_url, self._signature_cache_id(s))
        if cache_key in self._player_cache:
            decipher = self._player_cache[cache_key]
        else:
            decipher = self._extract_signature_function(
                video_id, player_url, s)
            self._player_cache[cache_key] = decipher
        if self._downloader.params.get('youtube_print_sig_code'):
            self._print_sig_code(decipher, s)
        return decipher(s)
    except Exception as e:
        tb = traceback.format_exc()
        raise ExtractorError(
            'Signature extraction failed: ' + tb, cause=e)

1335

1336

def _get_subtitles(self, video_id, webpage):
    """Return a dict mapping language code to a list of subtitle formats.

    The track list is downloaded from the timedtext listing endpoint; for
    the first track of every language one entry per format in
    self._SUBTITLE_FORMATS is produced. An empty dict is returned (after a
    warning) when the list cannot be downloaded or contains no tracks.
    ``webpage`` is not used in this method.
    """
    list_url = (
        'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
    try:
        subs_doc = self._download_xml(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning(
            'unable to download video subtitles: %s' % error_to_compat_str(err))
        return {}

    subtitles = {}
    for track in subs_doc.findall('track'):
        lang = track.attrib['lang_code']
        if lang in subtitles:
            # Only the first track of each language is kept.
            continue
        subtitles[lang] = [{
            'url': 'https://www.youtube.com/api/timedtext?' + compat_urllib_parse_urlencode({
                'lang': lang,
                'v': video_id,
                'fmt': ext,
                'name': track.attrib['name'].encode('utf-8'),
            }),
            'ext': ext,
        } for ext in self._SUBTITLE_FORMATS]

    if not subtitles:
        self._downloader.report_warning('video doesn\'t have subtitles')
        return {}
    return subtitles

def _get_ytplayer_config(self, video_id, webpage):
    """Return the parsed ``ytplayer.config`` JSON found in webpage.

    Returns None when no config is found in the page.
    """
    # User data may contain arbitrary character sequences that may affect
    # JSON extraction with regex, e.g. when '};' is contained the second
    # regex won't capture the whole JSON. Working around this by trying the
    # more specific regex first; proper quoted string handling is to be
    # implemented in the future to replace this workaround (see
    # https://github.com/ytdl-org/youtube-dl/issues/7468,
    # https://github.com/ytdl-org/youtube-dl/pull/7599)
    strict_pattern = r';ytplayer\.config\s*=\s*({.+?});ytplayer'
    loose_pattern = r';ytplayer\.config\s*=\s*({.+?});'
    raw_config = self._search_regex(
        (strict_pattern, loose_pattern), webpage, 'ytplayer.config',
        default=None)
    if not raw_config:
        return None
    return self._parse_json(
        uppercase_escape(raw_config), video_id, fatal=False)

1385

1386

def _get_automatic_captions(self, video_id, webpage):
    """Return a dict mapping language code to automatic caption formats.

    The webpage is needed to find the captions URL; it is passed in as an
    argument so that it does not have to be fetched again.

    Three sources are tried in order: the 'ttsurl' player argument, the
    'player_response' JSON, and the legacy 'caption_tracks' /
    'caption_translation_languages' arguments. An empty dict is returned
    (after a warning) when none of them yields captions.
    """
    self.to_screen('%s: Looking for automatic captions' % video_id)
    player_config = self._get_ytplayer_config(video_id, webpage)
    err_msg = 'Couldn\'t find automatic captions for %s' % video_id
    if not player_config:
        self._downloader.report_warning(err_msg)
        return {}
    try:
        args = player_config['args']
        caption_url = args.get('ttsurl')
        if caption_url:
            timestamp = args['timestamp']
            # We get the available subtitles
            list_query = compat_urllib_parse_urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            caption_list = self._download_xml(
                caption_url + '&' + list_query, video_id)
            original_lang_node = caption_list.find('track')
            if original_lang_node is None:
                self._downloader.report_warning('Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']
            caption_kind = original_lang_node.attrib.get('kind', '')

            sub_lang_list = {}
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                sub_lang_list[sub_lang] = [{
                    'url': caption_url + '&' + compat_urllib_parse_urlencode({
                        'lang': original_lang,
                        'tlang': sub_lang,
                        'fmt': ext,
                        'ts': timestamp,
                        'kind': caption_kind,
                    }),
                    'ext': ext,
                } for ext in self._SUBTITLE_FORMATS]
            return sub_lang_list

        def make_captions(sub_url, sub_langs):
            # Rewrite the query of sub_url once per (language, format)
            # pair, overriding only 'tlang' and 'fmt'.
            parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
            caption_qs = compat_parse_qs(parsed_sub_url.query)
            captions = {}
            for sub_lang in sub_langs:
                sub_formats = []
                for ext in self._SUBTITLE_FORMATS:
                    caption_qs.update({
                        'tlang': [sub_lang],
                        'fmt': [ext],
                    })
                    new_query = compat_urllib_parse_urlencode(caption_qs, True)
                    sub_url = compat_urlparse.urlunparse(
                        parsed_sub_url._replace(query=new_query))
                    sub_formats.append({
                        'url': sub_url,
                        'ext': ext,
                    })
                captions[sub_lang] = sub_formats
            return captions

        # New captions format as of 22.06.2017
        player_response = args.get('player_response')
        if player_response and isinstance(player_response, compat_str):
            player_response = self._parse_json(
                player_response, video_id, fatal=False)
            if player_response:
                renderer = player_response['captions']['playerCaptionsTracklistRenderer']
                base_url = renderer['captionTracks'][0]['baseUrl']
                lang_codes = (
                    lang.get('languageCode')
                    for lang in renderer['translationLanguages'])
                sub_lang_list = [code for code in lang_codes if code]
                return make_captions(base_url, sub_lang_list)

        # Some videos don't provide ttsurl but rather caption_tracks and
        # caption_translation_languages (e.g. 20LmZk1hakA)
        # Not used anymore as of 22.06.2017
        caption_tracks = args['caption_tracks']
        caption_translation_languages = args['caption_translation_languages']
        caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
        sub_lang_list = []
        for lang in caption_translation_languages.split(','):
            lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
            sub_lang = lang_qs.get('lc', [None])[0]
            if sub_lang:
                sub_lang_list.append(sub_lang)
        return make_captions(caption_url, sub_lang_list)
    # An extractor error can be raised by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, IndexError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}

1487

1488

def _mark_watched(self, video_id, video_info, player_response):
    """Request the playback-tracking URL so the video counts as watched.

    The URL is taken from player_response
    (playbackTracking.videostatsPlaybackUrl.baseUrl) or, failing that, from
    video_info['videostats_playback_base_url']. Nothing happens when no
    valid URL is found. The request itself is non-fatal.
    """
    playback_url = url_or_none(try_get(
        player_response,
        lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']) or try_get(
        video_info, lambda x: x['videostats_playback_base_url'][0]))
    if not playback_url:
        return
    parsed_playback_url = compat_urlparse.urlparse(playback_url)
    qs = compat_urlparse.parse_qs(parsed_playback_url.query)

    # cpn generation algorithm is reverse engineered from base.js.
    # In fact it works even with dummy cpn.
    CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
    # random.randint includes both endpoints, so the upper bound must be
    # 255 for the 256 values to map evenly onto the 64 alphabet indices.
    cpn = ''.join(
        CPN_ALPHABET[random.randint(0, 255) & 63] for _ in range(16))

    qs.update({
        'ver': ['2'],
        'cpn': [cpn],
    })
    playback_url = compat_urlparse.urlunparse(
        parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))

    self._download_webpage(
        playback_url, video_id, 'Marking watched',
        'Unable to mark watched', fatal=False)

1513

1514

@staticmethod
def _extract_urls(webpage):
    """Return the YouTube embeds found in webpage as a list of strings.

    Three kinds of embed are collected, in this order: player URLs in
    iframe/embed/object/SWFObject style markup, ids from lazyYT elements,
    and ids from the Wordpress "YouTube Video Importer" plugin markup.
    """
    # Embedded YouTube player
    # NOTE(review): the `embedSWF\(?:\s*` alternative looks like a mistyped
    # group; kept as is - confirm the intended pattern.
    player_re = r'''(?x)
        (?:
            <iframe[^>]+?src=|
            data-video-url=|
            <embed[^>]+?src=|
            embedSWF\(?:\s*|
            <object[^>]+data=|
            new\s+SWFObject\(
        )
        (["\'])
            (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
            (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
        \1'''
    entries = [
        unescapeHTML(found.group('url'))
        for found in re.finditer(player_re, webpage)]

    # lazyYT YouTube embed
    lazy_ids = re.findall(
        r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
    entries.extend(unescapeHTML(lazy_id) for lazy_id in lazy_ids)

    # Wordpress "YouTube Video Importer" plugin
    importer_re = r'''(?x)<div[^>]+
        class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
        data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)'''
    # Each match is a tuple of groups; the video id is the last one.
    entries.extend(groups[-1] for groups in re.findall(importer_re, webpage))

    return entries

@staticmethod
def _extract_url(webpage):
    """Return the first embed found by _extract_urls, or None if none."""
    found = YoutubeIE._extract_urls(webpage)
    if not found:
        return None
    return found[0]

1550

1551

@classmethod
def extract_id(cls, url):
    """Return the video id captured by group 2 of cls._VALID_URL.

    Raises ExtractorError when url does not match the pattern.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)

1558

1559

def _extract_annotations(self, video_id):
    """Download and return the annotations document for video_id."""
    annotations_url = (
        'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s'
        % video_id)
    return self._download_webpage(
        annotations_url, video_id,
        note='Searching for annotations.',
        errnote='Unable to download video annotations.')

1562

1563

@staticmethod
def _extract_chapters(description, duration):
    """Extract chapters from seek links in an HTML video description.

    Every description line containing a ``yt.www.watch.player.seekTo``
    link whose text is a timestamp becomes one chapter. A chapter ends
    where the next one starts; the last one ends at ``duration``.

    Returns None when description is empty or contains no chapter lines,
    an empty list when duration is unknown (None), and otherwise a list of
    dicts with 'start_time', 'end_time' and 'title'.
    """
    if not description:
        return None
    chapter_lines = re.findall(
        r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
        description)
    if not chapter_lines:
        return None
    if duration is None:
        # Chapter bounds cannot be validated without a duration. Comparing
        # a number with None raises TypeError on Python 3, while Python 2
        # ended up with no chapters; keep the latter outcome.
        return []
    chapters = []
    for next_num, (chapter_line, time_point) in enumerate(
            chapter_lines, start=1):
        start_time = parse_duration(time_point)
        if start_time is None:
            continue
        if start_time > duration:
            break
        # next_num is the index of the following line in chapter_lines.
        end_time = (duration if next_num == len(chapter_lines)
                    else parse_duration(chapter_lines[next_num][1]))
        if end_time is None:
            continue
        if end_time > duration:
            end_time = duration
        if start_time > end_time:
            break
        # The title is the line without the timestamp link.
        chapter_title = re.sub(
            r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
        chapter_title = re.sub(r'\s+', ' ', chapter_title)
        chapters.append({
            'start_time': start_time,
            'end_time': end_time,
            'title': chapter_title,
        })
    return chapters

def _real_extract(self, url):

1599

url, smuggled_data = unsmuggle_url(url, {})

1600

1601

proto = (

1602

'http' if self._downloader.params.get('prefer_insecure', False)

else 'https')

start_time = None

end_time = None

parsed_url = compat_urllib_parse_urlparse(url)

1608

for component in [parsed_url.fragment, parsed_url.query]:

1609

query = compat_parse_qs(component)

1610

if start_time is None and 't' in query:

1611

start_time = parse_duration(query['t'][0])

1612

if start_time is None and 'start' in query:

1613

start_time = parse_duration(query['start'][0])

1614

if end_time is None and 'end' in query:

1615

end_time = parse_duration(query['end'][0])

1616

1617

# Extract original video URL from URL with redirection, like age verification, using next_url parameter

1618

mobj = re.search(self._NEXT_URL_RE, url)

1619

if mobj:

1620

url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')

1621

video_id = self.extract_id(url)

1622

1623

# Get video webpage

1624

url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id

1625

video_webpage = self._download_webpage(url, video_id)

1626

1627

# Attempt to extract SWF player URL

1628

mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)

1629

if mobj is not None:

1630

player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

else:

player_url = None

dash_mpds = []

def add_dash_mpd(video_info):

1637

dash_mpd = video_info.get('dashmpd')

1638

if dash_mpd and dash_mpd[0] not in dash_mpds:

1639

dash_mpds.append(dash_mpd[0])

1640

1641

def add_dash_mpd_pr(pl_response):

1642

dash_mpd = url_or_none(try_get(

1643

pl_response, lambda x: x['streamingData']['dashManifestUrl'],

1644

compat_str))

1645

if dash_mpd and dash_mpd not in dash_mpds:

1646

dash_mpds.append(dash_mpd)

is_live = None

view_count = None

def extract_view_count(v_info):

1652

return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))

player_response = {}

# Get video info

embed_webpage = None

if re.search(r'player-age-gate-content">', video_webpage) is not None:

1659

age_gate = True

1660

# We simulate the access to the video from www.youtube.com/v/{video_id}

1661

# this can be viewed without login into Youtube

1662

url = proto + '://www.youtube.com/embed/%s' % video_id

1663

embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')

1664

data = compat_urllib_parse_urlencode({

1665

'video_id': video_id,

1666

'eurl': 'https://youtube.googleapis.com/v/' + video_id,

1667

'sts': self._search_regex(

1668

r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),

1669

})

1670

video_info_url = proto + '://www.youtube.com/get_video_info?' + data

1671

video_info_webpage = self._download_webpage(

1672

video_info_url, video_id,

1673

note='Refetching age-gated info webpage',

1674

errnote='unable to download video info webpage')

1675

video_info = compat_parse_qs(video_info_webpage)

1676

add_dash_mpd(video_info)

else:

age_gate = False

video_info = None

sts = None

# Try looking directly into the video webpage

1682

ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)

1683

if ytplayer_config:

1684

args = ytplayer_config['args']

1685

if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):

1686

# Convert to the same format returned by compat_parse_qs

1687

video_info = dict((k, [v]) for k, v in args.items())

1688

add_dash_mpd(video_info)

1689

# Rental video is not rented but preview is available (e.g.

1690

# https://www.youtube.com/watch?v=yYr8q0y5Jfg,

1691

# https://github.com/ytdl-org/youtube-dl/issues/10532)

1692

if not video_info and args.get('ypc_vid'):

1693

return self.url_result(

1694

args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])

1695

if args.get('livestream') == '1' or args.get('live_playback') == 1:

1696

is_live = True

1697

sts = ytplayer_config.get('sts')

1698

if not player_response:

1699

pl_response = str_or_none(args.get('player_response'))

1700

if pl_response:

1701

pl_response = self._parse_json(pl_response, video_id, fatal=False)

1702

if isinstance(pl_response, dict):

1703

player_response = pl_response

1704

if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):

1705

add_dash_mpd_pr(player_response)

1706

# We also try looking in get_video_info since it may contain different dashmpd

1707

# URL that points to a DASH manifest with possibly different itag set (some itags

1708

# are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH

1709

# manifest pointed by get_video_info's dashmpd).

1710

# The general idea is to take a union of itags of both DASH manifests (for example

1711

# video with such 'manifest behavior' see https://github.com/ytdl-org/youtube-dl/issues/6093)

1712

self.report_video_info_webpage_download(video_id)

1713

for el in ('info', 'embedded', 'detailpage', 'vevo', ''):

1714

query = {

1715

'video_id': video_id,

'ps': 'default',

'eurl': '',

'gl': 'US',

'hl': 'en',

}

if el:

query['el'] = el

if sts:

query['sts'] = sts

video_info_webpage = self._download_webpage(

1726

'%s://www.youtube.com/get_video_info' % proto,

1727

video_id, note=False,

1728

errnote='unable to download video info webpage',

1729

fatal=False, query=query)

1730

if not video_info_webpage:

1731

continue

1732

get_video_info = compat_parse_qs(video_info_webpage)

1733

if not player_response:

1734

pl_response = get_video_info.get('player_response', [None])[0]

1735

if isinstance(pl_response, dict):

1736

player_response = pl_response

1737

add_dash_mpd_pr(player_response)

1738

add_dash_mpd(get_video_info)

1739

if view_count is None:

1740

view_count = extract_view_count(get_video_info)

1741

if not video_info:

1742

video_info = get_video_info

1743

get_token = get_video_info.get('token') or get_video_info.get('account_playback_token')

1744

if get_token:

1745

# Different get_video_info requests may report different results, e.g.

1746

# some may report video unavailability, but some may serve it without

1747

# any complaint (see https://github.com/ytdl-org/youtube-dl/issues/7362,

1748

# the original webpage as well as el=info and el=embedded get_video_info

1749

# requests report video unavailability due to geo restriction while

1750

# el=detailpage succeeds and returns valid data). This is probably

1751

# due to YouTube measures against IP ranges of hosting providers.

1752

# Working around by preferring the first succeeded video_info containing

1753

# the token if no such video_info yet was found.

1754

token = video_info.get('token') or video_info.get('account_playback_token')

1755

if not token:

1756

video_info = get_video_info

1757

break

1758

1759

def extract_unavailable_message():

1760

return self._html_search_regex(

1761

r'(?s)<h1[^>]+id="unavailable-message"[^>]*>(.+?)</h1>',

1762

video_webpage, 'unavailable message', default=None)

1763

1764

if not video_info:

1765

unavailable_message = extract_unavailable_message()

1766

if not unavailable_message:

1767

unavailable_message = 'Unable to extract video data'

1768

raise ExtractorError(

1769

'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id)

1770

1771

token = video_info.get('token') or video_info.get('account_playback_token')

1772

if not token:

1773

if 'reason' in video_info:

1774

if 'The uploader has not made this video available in your country.' in video_info['reason']:

1775

regions_allowed = self._html_search_meta(

1776

'regionsAllowed', video_webpage, default=None)

1777

countries = regions_allowed.split(',') if regions_allowed else None

1778

self.raise_geo_restricted(

1779

msg=video_info['reason'][0], countries=countries)

1780

reason = video_info['reason'][0]

1781

if 'Invalid parameters' in reason:

1782

unavailable_message = extract_unavailable_message()

1783

if unavailable_message:

1784

reason = unavailable_message

1785

raise ExtractorError(

1786

'YouTube said: %s' % reason,

1787

expected=True, video_id=video_id)

1788

else:

1789

raise ExtractorError(

1790

'"token" parameter not in video info for unknown reason',

1791

video_id=video_id)

1792

1793

if video_info.get('license_info'):

1794

raise ExtractorError('This video is DRM protected.', expected=True)

1795

1796

video_details = try_get(

1797

player_response, lambda x: x['videoDetails'], dict) or {}

1798

1799

# title

1800

if 'title' in video_info:

1801

video_title = video_info['title'][0]

1802

elif 'title' in player_response:

1803

video_title = video_details['title']

1804

else:

1805

self._downloader.report_warning('Unable to extract video title')

video_title = '_'

# description

description_original = video_description = get_element_by_id("eow-description", video_webpage)

1810

if video_description:

1811

1812

def replace_url(m):

1813

redir_url = compat_urlparse.urljoin(url, m.group(1))

1814

parsed_redir_url = compat_urllib_parse_urlparse(redir_url)

1815

if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':

1816

qs = compat_parse_qs(parsed_redir_url.query)

q = qs.get('q')

if q and q[0]:

return q[0]

return redir_url

description_original = video_description = re.sub(r'''(?x)

1823

<a\s+

1824

(?:[a-zA-Z-]+="[^"]*"\s+)*?

1825

(?:title|href)="([^"]+)"\s+

1826

(?:[a-zA-Z-]+="[^"]*"\s+)*?

class="[^"]*"[^>]*>

[^<]+\.{3}\s*

</a>

''', replace_url, video_description)

1831

video_description = clean_html(video_description)

1832

else:

1833

fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)

1834

if fd_mobj:

1835

video_description = unescapeHTML(fd_mobj.group(1))

1836

else:

1837

video_description = ''

1838

1839

if not smuggled_data.get('force_singlefeed', False):

1840

if not self._downloader.params.get('noplaylist'):

1841

multifeed_metadata_list = try_get(

1842

player_response,

1843

lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],

1844

compat_str) or try_get(

1845

video_info, lambda x: x['multifeed_metadata_list'][0], compat_str)

1846

if multifeed_metadata_list:

1847

entries = []

1848

feed_ids = []

1849

for feed in multifeed_metadata_list.split(','):

1850

# Unquote should take place before split on comma (,) since textual

1851

# fields may contain comma as well (see

1852

# https://github.com/ytdl-org/youtube-dl/issues/8536)

1853

feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))

1854

entries.append({

1855

'_type': 'url_transparent',

1856

'ie_key': 'Youtube',

1857

'url': smuggle_url(

1858

'%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),

1859

{'force_singlefeed': True}),

1860

'title': '%s (%s)' % (video_title, feed_data['title'][0]),

1861

})

1862

feed_ids.append(feed_data['id'][0])

1863

self.to_screen(

1864

'Downloading multifeed video (%s) - add --no-playlist to just download video %s'

1865

% (', '.join(feed_ids), video_id))

1866

return self.playlist_result(entries, video_id, video_title, video_description)

1867

else:

1868

self.to_screen('Downloading just video %s because of --no-playlist' % video_id)

1869

1870

if view_count is None:

1871

view_count = extract_view_count(video_info)

1872

if view_count is None and video_details:

1873

view_count = int_or_none(video_details.get('viewCount'))

1874

1875

# Check for "rental" videos

1876

if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:

1877

raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True)

1878

1879

def _extract_filesize(media_url):
    """Look up the ``clen`` value embedded in *media_url*.

    The pattern accepts both the ``clen=<digits>`` and the
    ``clen/<digits>`` spelling. The captured text (or the ``None``
    default when the pattern does not match) is handed to
    ``int_or_none`` and that result is returned.
    """
    clen_text = self._search_regex(
        r'\bclen[=/](\d+)', media_url, 'filesize', default=None)
    return int_or_none(clen_text)

1882

1883

if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):

1884

self.report_rtmp_download()

1885

formats = [{

1886

'format_id': '_rtmp',

1887

'protocol': 'rtmp',

1888

'url': video_info['conn'][0],

1889

'player_url': player_url,

1890

}]

1891

elif not is_live and (len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):

1892

encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]

1893

if 'rtmpe%3Dyes' in encoded_url_map:

1894

raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True)

1895

formats_spec = {}

1896

fmt_list = video_info.get('fmt_list', [''])[0]

1897

if fmt_list:

1898

for fmt in fmt_list.split(','):

1899

spec = fmt.split('/')

1900

if len(spec) > 1:

1901

width_height = spec[1].split('x')

1902

if len(width_height) == 2:

1903

formats_spec[spec[0]] = {

1904

'resolution': spec[1],

1905

'width': int_or_none(width_height[0]),

1906

'height': int_or_none(width_height[1]),

1907

}

1908

q = qualities(['small', 'medium', 'hd720'])

1909

streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list)

1910

if streaming_formats:

1911

for fmt in streaming_formats:

1912

itag = str_or_none(fmt.get('itag'))

1913

if not itag:

1914

continue

1915

quality = fmt.get('quality')

1916

quality_label = fmt.get('qualityLabel') or quality

1917

formats_spec[itag] = {

1918

'asr': int_or_none(fmt.get('audioSampleRate')),

1919

'filesize': int_or_none(fmt.get('contentLength')),

1920

'format_note': quality_label,

1921

'fps': int_or_none(fmt.get('fps')),

1922

'height': int_or_none(fmt.get('height')),

1923

'quality': q(quality),

1924

# bitrate for itag 43 is always 2147483647

1925

'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,

1926

'width': int_or_none(fmt.get('width')),

1927

}

1928

formats = []

1929

for url_data_str in encoded_url_map.split(','):

1930

url_data = compat_parse_qs(url_data_str)

1931

if 'itag' not in url_data or 'url' not in url_data:

1932

continue

1933

stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))

1934

# Unsupported FORMAT_STREAM_TYPE_OTF

1935

if stream_type == 3:

1936

continue

1937

format_id = url_data['itag'][0]

1938

url = url_data['url'][0]

1939

1940

if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):

1941

ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'

1942

jsplayer_url_json = self._search_regex(

1943

ASSETS_RE,

1944

embed_webpage if age_gate else video_webpage,

1945

'JS player URL (1)', default=None)

1946

if not jsplayer_url_json and not age_gate:

1947

# We need the embed website after all

1948

if embed_webpage is None:

1949

embed_url = proto + '://www.youtube.com/embed/%s' % video_id

1950

embed_webpage = self._download_webpage(

1951

embed_url, video_id, 'Downloading embed webpage')

1952

jsplayer_url_json = self._search_regex(

1953

ASSETS_RE, embed_webpage, 'JS player URL')

1954

1955

player_url = json.loads(jsplayer_url_json)

1956

if player_url is None:

1957

player_url_json = self._search_regex(

1958

r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',

1959

video_webpage, 'age gate player URL')

1960

player_url = json.loads(player_url_json)

1961

1962

if 'sig' in url_data:

1963

url += '&signature=' + url_data['sig'][0]

1964

elif 's' in url_data:

1965

encrypted_sig = url_data['s'][0]

1966

1967

if self._downloader.params.get('verbose'):

1968

if player_url is None:

1969

player_version = 'unknown'

1970

player_desc = 'unknown'

1971

else:

1972

if player_url.endswith('swf'):

1973

player_version = self._search_regex(

1974

r'-(.+?)(?:/watch_as3)?\.swf$', player_url,

1975

'flash player', fatal=False)

1976

player_desc = 'flash player %s' % player_version

1977

else:

1978

player_version = self._search_regex(

1979

[r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js',

1980

r'(?:www|player(?:_ias)?)-([^/]+)(?:/[a-z]{2,3}_[A-Z]{2})?/base\.js'],

1981

player_url,

1982

'html5 player', fatal=False)

1983

player_desc = 'html5 player %s' % player_version

1984

1985

parts_sizes = self._signature_cache_id(encrypted_sig)

1986

self.to_screen('{%s} signature length %s, %s' %

1987

(format_id, parts_sizes, player_desc))

1988

1989

signature = self._decrypt_signature(

1990

encrypted_sig, video_id, player_url, age_gate)

1991

url += '&signature=' + signature

1992

if 'ratebypass' not in url:

1993

url += '&ratebypass=yes'

1994

1995

dct = {

1996

'format_id': format_id,

1997

'url': url,

1998

'player_url': player_url,

1999

}

2000

if format_id in self._formats:

2001

dct.update(self._formats[format_id])

2002

if format_id in formats_spec:

2003

dct.update(formats_spec[format_id])

2004

2005

# Some itags are not included in DASH manifest thus corresponding formats will

2006

# lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993).

2007

# Trying to extract metadata from url_encoded_fmt_stream_map entry.

2008

mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])

2009

width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)

2010

2011

filesize = int_or_none(url_data.get(

2012

'clen', [None])[0]) or _extract_filesize(url)

2013

2014

quality = url_data.get('quality', [None])[0]

2015

2016

more_fields = {

2017

'filesize': filesize,

2018

'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000),

2019

'width': width,

2020

'height': height,

2021

'fps': int_or_none(url_data.get('fps', [None])[0]),

2022

'format_note': url_data.get('quality_label', [None])[0] or quality,

2023

'quality': q(quality),

2024

}

2025

for key, value in more_fields.items():

2026

if value:

2027

dct[key] = value

2028

type_ = url_data.get('type', [None])[0]

2029

if type_:

2030

type_split = type_.split(';')

2031

kind_ext = type_split[0].split('/')

2032

if len(kind_ext) == 2:

2033

kind, _ = kind_ext

2034

dct['ext'] = mimetype2ext(type_split[0])

2035

if kind in ('audio', 'video'):

2036

codecs = None

2037

for mobj in re.finditer(

2038

r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):

2039

if mobj.group('key') == 'codecs':

2040

codecs = mobj.group('val')

2041

break

2042

if codecs:

2043

dct.update(parse_codecs(codecs))

2044

if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':

2045

dct['downloader_options'] = {

2046

# Youtube throttles chunks >~10M

2047

'http_chunk_size': 10485760,

}

formats.append(dct)

else:

manifest_url = (

url_or_none(try_get(

player_response,

lambda x: x['streamingData']['hlsManifestUrl'],

2055

compat_str)) or

2056

url_or_none(try_get(

2057

video_info, lambda x: x['hlsvp'][0], compat_str)))

2058

if manifest_url:

2059

formats = []

2060

m3u8_formats = self._extract_m3u8_formats(

2061

manifest_url, video_id, 'mp4', fatal=False)

2062

for a_format in m3u8_formats:

2063

itag = self._search_regex(

2064

r'/itag/(\d+)/', a_format['url'], 'itag', default=None)

2065

if itag:

2066

a_format['format_id'] = itag

2067

if itag in self._formats:

2068

dct = self._formats[itag].copy()

2069

dct.update(a_format)

2070

a_format = dct

2071

a_format['player_url'] = player_url

2072

# Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming

2073

a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'

2074

formats.append(a_format)

2075

else:

2076

error_message = clean_html(video_info.get('reason', [None])[0])

2077

if not error_message:

2078

error_message = extract_unavailable_message()

2079

if error_message:

2080

raise ExtractorError(error_message, expected=True)

2081

raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info')

2082

2083

# uploader

2084

video_uploader = try_get(

2085

video_info, lambda x: x['author'][0],

2086

compat_str) or str_or_none(video_details.get('author'))

2087

if video_uploader:

2088

video_uploader = compat_urllib_parse_unquote_plus(video_uploader)

2089

else:

2090

self._downloader.report_warning('unable to extract uploader name')

2091

2092

# uploader_id

2093

video_uploader_id = None

2094

video_uploader_url = None

2095

mobj = re.search(

2096

r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',

2097

video_webpage)

2098

if mobj is not None:

2099

video_uploader_id = mobj.group('uploader_id')

2100

video_uploader_url = mobj.group('uploader_url')

2101

else:

2102

self._downloader.report_warning('unable to extract uploader nickname')

2103

2104

channel_id = self._html_search_meta(

2105

'channelId', video_webpage, 'channel id')

2106

channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None

2107

2108

# thumbnail image

2109

# We try first to get a high quality image:

2110

m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',

2111

video_webpage, re.DOTALL)

2112

if m_thumb is not None:

2113

video_thumbnail = m_thumb.group(1)

2114

elif 'thumbnail_url' not in video_info:

2115

self._downloader.report_warning('unable to extract video thumbnail')

2116

video_thumbnail = None

2117

else: # don't panic if we can't find it

2118

video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])

2119

2120

# upload date

2121

upload_date = self._html_search_meta(

2122

'datePublished', video_webpage, 'upload date', default=None)

2123

if not upload_date:

2124

upload_date = self._search_regex(

2125

[r'(?s)id="eow-date.*?>(.*?)</span>',

2126

r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],

2127

video_webpage, 'upload date', default=None)

2128

upload_date = unified_strdate(upload_date)

2129

2130

video_license = self._html_search_regex(

2131

r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',

2132

video_webpage, 'license', default=None)

m_music = re.search(

r'''(?x)

<h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*

<ul[^>]*>\s*

<li>(?P<title>.+?)

by (?P<creator>.+?)

(?:

$.+?$|

<a[^>]*

(?:

\bhref=["\']/red[^>]*>| # drop possible

2145

>\s*Listen ad-free with YouTube Red # YouTube Red ad

)

.*?

)?</li

''',

video_webpage)

if m_music:

video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))

2153

video_creator = clean_html(m_music.group('creator'))

2154

else:

2155

video_alt_title = video_creator = None

2156

2157

def extract_meta(field):
    """Search the watch page for the first ``<li>`` following the
    ``<h4 class="title">`` heading whose text is *field*.

    *field* is interpolated into the pattern as-is and is also used as
    the search name; ``None`` is the default when nothing matches.
    """
    heading_re = (
        r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*'
        % field)
    return self._html_search_regex(
        heading_re, video_webpage, field, default=None)

2161

2162

track = extract_meta('Song')

2163

artist = extract_meta('Artist')

2164

2165

# Youtube Music Auto-generated description

2166

album = release_date = release_year = None

2167

if video_description:

2168

mobj = re.search(r'(?s)Provided to YouTube by [^\n]+\n+(?P<track>[^·]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?', video_description)

2169

if mobj:

2170

if not track:

2171

track = mobj.group('track').strip()

2172

if not artist:

2173

artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·'))

2174

album = mobj.group('album'.strip())

2175

release_year = mobj.group('release_year')

2176

release_date = mobj.group('release_date')

2177

if release_date:

2178

release_date = release_date.replace('-', '')

2179

if not release_year:

2180

release_year = int(release_date[:4])

2181

if release_year:

2182

release_year = int(release_year)

2183

2184

m_episode = re.search(

2185

r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',

2186

video_webpage)

2187

if m_episode:

2188

series = unescapeHTML(m_episode.group('series'))

2189

season_number = int(m_episode.group('season'))

2190

episode_number = int(m_episode.group('episode'))

2191

else:

2192

series = season_number = episode_number = None

2193

2194

m_cat_container = self._search_regex(

2195

r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',

2196

video_webpage, 'categories', default=None)

2197

if m_cat_container:

2198

category = self._html_search_regex(

2199

r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',

2200

default=None)

2201

video_categories = None if category is None else [category]

2202

else:

2203

video_categories = None

2204

2205

video_tags = [

2206

unescapeHTML(m.group('content'))

2207

for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]

2208

2209

def _extract_count(count_name):
    """Search the watch page for the number shown inside the
    ``-<count_name>-button`` element.

    The captured digits-and-commas text (or the ``None`` default when
    the button is not found) is passed through ``str_to_int``.
    """
    button_re = (
        r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
        % re.escape(count_name))
    raw_count = self._search_regex(
        button_re, video_webpage, count_name, default=None)
    return str_to_int(raw_count)

2214

2215

like_count = _extract_count('like')

2216

dislike_count = _extract_count('dislike')

2217

2218

if view_count is None:

2219

view_count = str_to_int(self._search_regex(

2220

r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage,

2221

'view count', default=None))

2222

2223

# subtitles

2224

video_subtitles = self.extract_subtitles(video_id, video_webpage)

2225

automatic_captions = self.extract_automatic_captions(video_id, video_webpage)

2226

2227

video_duration = try_get(

2228

video_info, lambda x: int_or_none(x['length_seconds'][0]))

2229

if not video_duration:

2230

video_duration = int_or_none(video_details.get('lengthSeconds'))

2231

if not video_duration:

2232

video_duration = parse_duration(self._html_search_meta(

2233

'duration', video_webpage, 'video duration'))

2234

2235

# annotations

2236

video_annotations = None

2237

if self._downloader.params.get('writeannotations', False):

2238

video_annotations = self._extract_annotations(video_id)

2239

2240

chapters = self._extract_chapters(description_original, video_duration)

2241

2242

# Look for the DASH manifest

2243

if self._downloader.params.get('youtube_include_dash_manifest', True):

2244

dash_mpd_fatal = True

2245

for mpd_url in dash_mpds:

2246

dash_formats = {}

2247

try:

2248

def decrypt_sig(mobj):
    # Replacement callback for re.sub() on the manifest URL: group 1
    # holds the encrypted signature taken from the ``/s/<sig>`` path
    # segment; it is swapped for a ``/signature/<decrypted>`` segment.
    return '/signature/%s' % self._decrypt_signature(
        mobj.group(1), video_id, player_url, age_gate)

2252

2253

mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)

2254

2255

for df in self._extract_mpd_formats(

2256

mpd_url, video_id, fatal=dash_mpd_fatal,

2257

formats_dict=self._formats):

2258

if not df.get('filesize'):

2259

df['filesize'] = _extract_filesize(df['url'])

2260

# Do not overwrite DASH format found in some previous DASH manifest

2261

if df['format_id'] not in dash_formats:

2262

dash_formats[df['format_id']] = df

2263

# Additional DASH manifests may end up in HTTP Error 403 therefore

2264

# allow them to fail without bug report message if we already have

2265

# some DASH manifest succeeded. This is temporary workaround to reduce

2266

# burst of bug reports until we figure out the reason and whether it

2267

# can be fixed at all.

2268

dash_mpd_fatal = False

2269

except (ExtractorError, KeyError) as e:

2270

self.report_warning(

2271

'Skipping DASH manifest: %r' % e, video_id)

2272

if dash_formats:

2273

# Remove the formats we found through non-DASH, they

2274

# contain less info and it can be wrong, because we use

2275

# fixed values (for example the resolution). See

2276

# https://github.com/ytdl-org/youtube-dl/issues/5774 for an

2277

# example.

2278

formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]

2279

formats.extend(dash_formats.values())

2280

2281

# Check for malformed aspect ratio

2282

stretched_m = re.search(

2283

r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',

2284

video_webpage)

2285

if stretched_m:

2286

w = float(stretched_m.group('w'))

2287

h = float(stretched_m.group('h'))

2288

# yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).

2289

# We will only process correct ratios.

if w > 0 and h > 0:

ratio = w / h

for f in formats:

if f.get('vcodec') != 'none':

2294

f['stretched_ratio'] = ratio

2295

2296

self._sort_formats(formats)

2297

2298

self.mark_watched(video_id, video_info, player_response)

return {

'id': video_id,

'uploader': video_uploader,

2303

'uploader_id': video_uploader_id,

2304

'uploader_url': video_uploader_url,

2305

'channel_id': channel_id,

2306

'channel_url': channel_url,

2307

'upload_date': upload_date,

2308

'license': video_license,

2309

'creator': video_creator or artist,

2310

'title': video_title,

2311

'alt_title': video_alt_title or track,

2312

'thumbnail': video_thumbnail,

2313

'description': video_description,

2314

'categories': video_categories,

2315

'tags': video_tags,

2316

'subtitles': video_subtitles,

2317

'automatic_captions': automatic_captions,

2318

'duration': video_duration,

2319

'age_limit': 18 if age_gate else 0,

2320

'annotations': video_annotations,

2321

'chapters': chapters,

2322

'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,

2323

'view_count': view_count,

2324

'like_count': like_count,

2325

'dislike_count': dislike_count,

2326

'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),

2327

'formats': formats,

2328

'is_live': is_live,

2329

'start_time': start_time,

2330

'end_time': end_time,

2331

'series': series,

2332

'season_number': season_number,

2333

'episode_number': episode_number,

'track': track,

'artist': artist,

'album': album,

'release_date': release_date,

2338

'release_year': release_year,

}

class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):

2343

IE_DESC = 'YouTube.com playlists'

2344

_VALID_URL = r"""(?x)(?:

(?:https?://)?

(?:\w+\.)?

(?:

(?:

youtube\.com|

invidio\.us

)

/

(?:

(?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11}))

2355

\? (?:.*?[&;])*? (?:p|a|list)=

2356

| p/

2357

)|

2358

youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist=

2359

)

2360

(

2361

(?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)?[0-9A-Za-z-_]{10,}

2362

# Top tracks, they can also include dots

|(?:MC)[\w\.]*

)

.*

|

(%(playlist_id)s)

)""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}

2369

_TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'

2370

_VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?'

2371

IE_NAME = 'youtube:playlist'

2372

_TESTS = [{

2373

'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',

2374

'info_dict': {

2375

'title': 'ytdl test PL',

2376

'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',

},

'playlist_count': 3,

}, {

'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',

2381

'info_dict': {

2382

'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',

2383

'title': 'YDL_Empty_List',

2384

},

2385

'playlist_count': 0,

2386

'skip': 'This playlist is private',

2387

}, {

2388

'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',

2389

'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',

2390

'info_dict': {

2391

'title': '29C3: Not my department',

2392

'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',

2393

},

2394

'playlist_count': 95,

2395

}, {

2396

'note': 'issue #673',

2397

'url': 'PLBB231211A4F62143',

2398

'info_dict': {

2399

'title': '[OLD]Team Fortress 2 (Class-based LP)',

2400

'id': 'PLBB231211A4F62143',

2401

},

2402

'playlist_mincount': 26,

2403

}, {

2404

'note': 'Large playlist',

2405

'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',

2406

'info_dict': {

2407

'title': 'Uploads from Cauchemar',

2408

'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',

2409

},

2410

'playlist_mincount': 799,

2411

}, {

2412

'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',

2413

'info_dict': {

2414

'title': 'YDL_safe_search',

2415

'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',

2416

},

2417

'playlist_count': 2,

2418

'skip': 'This playlist is private',

2419

}, {

2420

'note': 'embedded',

2421

'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',

'playlist_count': 4,

'info_dict': {

'title': 'JODA15',

'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',

2426

}

2427

}, {

2428

'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',

2429

'playlist_mincount': 485,

2430

'info_dict': {

2431

'title': '2017 華語最新單曲 (2/24更新)',

2432

'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',

2433

}

2434

}, {

2435

'note': 'Embedded SWF player',

2436

'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',

'playlist_count': 4,

'info_dict': {

'title': 'JODA7',

'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',

2441

}

2442

}, {

2443

'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',

2444

'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',

2445

'info_dict': {

2446

'title': 'Uploads from Interstellar Movie',

2447

'id': 'UUXw-G3eDE9trcvY2sBMM_aA',

2448

},

2449

'playlist_mincount': 21,

2450

}, {

2451

# Playlist URL that does not actually serve a playlist

2452

'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',

'info_dict': {

'id': 'FqZTN594JQw',

'ext': 'webm',

'title': "Smiley's People 01 detective, Adventure Series, Action",

2457

'uploader': 'STREEM',

2458

'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',

2459

'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',

2460

'upload_date': '20150526',

2461

'license': 'Standard YouTube License',

2462

'description': 'md5:507cdcb5a49ac0da37a920ece610be80',

2463

'categories': ['People & Blogs'],

'tags': list,

'view_count': int,

'like_count': int,

'dislike_count': int,

2468

},

2469

'params': {

2470

'skip_download': True,

2471

},

2472

'add_ie': [YoutubeIE.ie_key()],

2473

}, {

2474

'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',

'info_dict': {

'id': 'yeWKywCrFtk',

'ext': 'mp4',

'title': 'Small Scale Baler and Braiding Rugs',

2479

'uploader': 'Backus-Page House Museum',

2480

'uploader_id': 'backuspagemuseum',

2481

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',

2482

'upload_date': '20161008',

2483

'license': 'Standard YouTube License',

2484

'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',

2485

'categories': ['Nonprofits & Activism'],

2486

'tags': list,

2487

'like_count': int,

2488

'dislike_count': int,

},

'params': {

'noplaylist': True,

'skip_download': True,

2493

},

2494

}, {

2495

'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',

2496

'only_matching': True,

2497

}, {

2498

'url': 'TLGGrESM50VT6acwMjAyMjAxNw',

2499

'only_matching': True,

2500

}, {

2501

# music album playlist

2502

'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',

2503

'only_matching': True,

2504

}, {

2505

'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU',

2506

'only_matching': True,

2507

}]

2508

2509

def _real_initialize(self):
    """Initialization hook: delegates to ``self._login()``."""
    self._login()

2511

2512

def _extract_mix(self, playlist_id):
    """Collect the videos of a mix and return them as a playlist result.

    The watch page of the most recently collected video is downloaded
    over and over; every page contributes the video ids that have not
    been seen yet, and the loop stops as soon as a page adds nothing new.
    The title is taken from the last downloaded page.
    """
    # The mixes are generated from a single video
    # the id of the playlist is just 'RD' + video_id
    seen_ids = []
    seed_id = playlist_id[-11:]
    # Verbose-mode pattern, so the line break and indentation inside the
    # literal are not part of what is matched.
    entry_re = r'''(?xs)data-video-username=".*?".*?
               href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id)

    page_num = 0
    while True:
        page_num += 1
        url = 'https://youtube.com/watch?v=%s&list=%s' % (seed_id, playlist_id)
        webpage = self._download_webpage(
            url, playlist_id,
            'Downloading page {0} of Youtube mix'.format(page_num))
        # Fetch new pages until all the videos are repeated, it seems that
        # there are always 51 unique videos.
        fresh_ids = [
            candidate
            for candidate in orderedSet(re.findall(entry_re, webpage))
            if candidate not in seen_ids]
        if not fresh_ids:
            break
        seen_ids.extend(fresh_ids)
        seed_id = seen_ids[-1]

    url_results = self._ids_to_results(seen_ids)

    # Try the known title containers in order of preference; the first
    # one yielding a non-empty value wins.
    title_span = None
    for class_name in ('playlist-title', 'title long-title', 'title'):
        title_span = get_element_by_attribute('class', class_name, webpage)
        if title_span:
            break
    title = clean_html(title_span)

    return self.playlist_result(url_results, playlist_id, title)

2543

2544

def _extract_playlist(self, playlist_id):
    """Download the playlist page and build a playlist result from it.

    Returns a ``(has_videos, playlist)`` tuple. ``has_videos`` is False
    only when the page has no playlist title and ``self._entries()``
    yields nothing. ``playlist`` is the value of
    ``self.playlist_result()`` extended with uploader fields.

    Raises ExtractorError (expected) when an alert on the page says the
    playlist does not exist, is private, or the parameters are invalid.
    """
    url = self._TEMPLATE_URL % playlist_id
    page = self._download_webpage(url, playlist_id)

    # the yt-alert-message now has tabindex attribute (see https://github.com/ytdl-org/youtube-dl/issues/11604)
    alerts = re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page)
    for alert in alerts:
        alert = alert.strip()
        # Check if the playlist exists or is private
        unavailable = re.match(
            r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*',
            alert)
        if unavailable:
            reason = unavailable.group('reason')
            message = 'This playlist %s' % reason
            if 'private' in reason:
                message += ', use --username or --netrc to access it'
            message += '.'
            raise ExtractorError(message, expected=True)
        if re.match(r'[^<]*Invalid parameters[^<]*', alert):
            raise ExtractorError(
                'Invalid parameters. Maybe URL is incorrect.',
                expected=True)
        if re.match(r'[^<]*Choose your language[^<]*', alert):
            # Language picker banner, not an error
            continue
        self.report_warning('Youtube gives an alert message: ' + alert)

    playlist_title = self._html_search_regex(
        r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',
        page, 'title', default=None)

    _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref='
    uploader = self._search_regex(
        r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE,
        page, 'uploader', default=None)

    uploader_id = uploader_url = None
    uploader_link = re.search(
        r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % _UPLOADER_BASE,
        page)
    if uploader_link:
        uploader_id = uploader_link.group('uploader_id')
        uploader_url = compat_urlparse.urljoin(url, uploader_link.group('path'))

    has_videos = True
    if not playlist_title:
        # Some playlist URLs don't actually serve a playlist (e.g.
        # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4)
        # so probe for at least one entry before trusting the page.
        try:
            next(self._entries(page, playlist_id))
        except StopIteration:
            has_videos = False

    playlist = self.playlist_result(
        self._entries(page, playlist_id), playlist_id, playlist_title)
    uploader_fields = {
        'uploader': uploader,
        'uploader_id': uploader_id,
        'uploader_url': uploader_url,
    }
    playlist.update(uploader_fields)

    return has_videos, playlist

2605

2606

def _check_download_just_video(self, url, playlist_id):
    """Detect a video id in *url* and decide whether only that video
    should be downloaded.

    Returns a ``(video_id, result)`` tuple:
      * ``(None, None)`` when the URL carries no video id;
      * ``(video_id, url_result)`` when the ``noplaylist`` param is set;
      * ``(video_id, None)`` otherwise (the playlist will be extracted).
    """
    # Check if it's a video-specific URL
    parsed_url = compat_urlparse.urlparse(url)
    query = compat_urlparse.parse_qs(parsed_url.query)
    video_id = query.get('v', [None])[0]
    if not video_id:
        # No usable ``v`` query parameter: look for a youtu.be/ or /embed/ id
        video_id = self._search_regex(
            r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url,
            'video id', default=None)

    if not video_id:
        return None, None

    if not self._downloader.params.get('noplaylist'):
        self.to_screen(
            'Downloading playlist %s - add --no-playlist to just download video %s'
            % (playlist_id, video_id))
        return video_id, None

    self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
    return video_id, self.url_result(video_id, 'Youtube', video_id=video_id)

2620

2621

def _real_extract(self, url):
    """Extract a playlist, a mix, or a single video from *url*.

    Order of decisions:
      1. a single-video result from ``_check_download_just_video`` wins;
      2. ids starting with RD/UL/PU go through ``_extract_mix``;
      3. otherwise the regular playlist is extracted, falling back to
         the plain video when the page served no videos and the URL
         also carried a video id.

    Raises ExtractorError when *url* does not match ``_VALID_URL``.
    """
    # Extract playlist id
    url_match = re.match(self._VALID_URL, url)
    if url_match is None:
        raise ExtractorError('Invalid URL: %s' % url)
    playlist_id = url_match.group(1) or url_match.group(2)

    video_id, single_video = self._check_download_just_video(url, playlist_id)
    if single_video:
        return single_video

    is_mix = playlist_id.startswith(('RD', 'UL', 'PU'))
    if is_mix:
        # Mixes require a custom extraction process
        return self._extract_mix(playlist_id)

    has_videos, playlist = self._extract_playlist(playlist_id)
    if video_id and not has_videos:
        # Some playlist URLs don't actually serve a playlist (see
        # https://github.com/ytdl-org/youtube-dl/issues/10537).
        # Fallback to plain video extraction if there is a video id
        # along with playlist id.
        return self.url_result(video_id, 'Youtube', video_id=video_id)

    return playlist

2645

2646

2647

class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):

2648

IE_DESC = 'YouTube.com channels'

2649

_VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)'

2650

_TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'

2651

_VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'

2652

IE_NAME = 'youtube:channel'

2653

_TESTS = [{

2654

'note': 'paginated channel',

2655

'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',

2656

'playlist_mincount': 91,

2657

'info_dict': {

2658

'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',

2659

'title': 'Uploads from lex will',

2660

}

2661

}, {

2662

'note': 'Age restricted channel',

2663

# from https://www.youtube.com/user/DeusExOfficial

2664

'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w',

2665

'playlist_mincount': 64,

2666

'info_dict': {

2667

'id': 'UUs0ifCMCm1icqRbqhUINa0w',

2668

'title': 'Uploads from Deus Ex',

2669

},

2670

}, {

2671

'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA',

2672

'only_matching': True,

}]

@classmethod
def suitable(cls, url):
    """Return False for URLs that YoutubePlaylistsIE or YoutubeLiveIE
    report as suitable; otherwise defer to the parent implementation.
    """
    if YoutubePlaylistsIE.suitable(url):
        return False
    if YoutubeLiveIE.suitable(url):
        return False
    return super(YoutubeChannelIE, cls).suitable(url)

2679

2680

def _build_template_url(self, url, channel_id):
    """Fill ``_TEMPLATE_URL`` with *channel_id*.

    *url* is accepted but not used by this implementation.
    """
    template = self._TEMPLATE_URL
    return template % channel_id

2682

2683

def _real_extract(self, url):

2684

channel_id = self._match_id(url)

2685

2686

url = self._build_template_url(url, channel_id)

2687

2688

# Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)

2689

# Workaround by extracting as a playlist if managed to obtain channel playlist URL

2690

# otherwise fallback on channel by page extraction

2691

channel_page = self._download_webpage(

2692

url + '?view=57', channel_id,

2693

'Downloading channel page', fatal=False)

2694

if channel_page is False:

2695

channel_playlist_id = False

2696

else:

2697

channel_playlist_id = self._html_search_meta(

2698

'channelId', channel_page, 'channel id', default=None)

2699

if not channel_playlist_id:

2700

channel_url = self._html_search_meta(

2701

('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'),

2702

channel_page, 'channel url', default=None)

2703

if channel_url:

2704

channel_playlist_id = self._search_regex(

2705

r'vnd\.youtube://user/([0-9A-Za-z_-]+)',

2706

channel_url, 'channel id', default=None)

2707

if channel_playlist_id and channel_playlist_id.startswith('UC'):

2708

playlist_id = 'UU' + channel_playlist_id[2:]

2709

return self.url_result(

2710

compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')

2711

2712

channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')

2713

autogenerated = re.search(r'''(?x)

2714

class="[^"]*?(?:

2715

channel-header-autogenerated-label|

2716

yt-channel-title-autogenerated

2717

)[^"]*"''', channel_page) is not None

2718

2719

if autogenerated:

2720

# The videos are contained in a single page

2721

# the ajax pages can't be used, they are empty

2722

entries = [

2723

self.url_result(

2724

video_id, 'Youtube', video_id=video_id,

2725

video_title=video_title)

2726

for video_id, video_title in self.extract_videos_from_page(channel_page)]

2727

return self.playlist_result(entries, channel_id)

2728

2729

try:

2730

next(self._entries(channel_page, channel_id))

2731

except StopIteration:

2732

alert_message = self._html_search_regex(

2733

r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>',

2734

channel_page, 'alert', default=None, group='alert')

2735

if alert_message:

2736

raise ExtractorError('Youtube said: %s' % alert_message, expected=True)

2737

2738

return self.playlist_result(self._entries(channel_page, channel_id), channel_id)

2739

2740

2741

class YoutubeUserIE(YoutubeChannelIE):
    """Extractor for user / custom-URL pages, reusing YoutubeChannelIE's extraction."""
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    # NOTE(review): _build_template_url() below reads a `user` named group from
    # _VALID_URL, but no _VALID_URL defining that group is visible for this class
    # in this view of the file - confirm the class-level _VALID_URL is present.
    # Filled with (path kind, name), see _build_template_url()
    _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ',
            'title': 'Uploads from The Linux Foundation',
        }
    }, {
        # Only available via https://www.youtube.com/c/12minuteathlete/videos
        # but not https://www.youtube.com/user/12minuteathlete/videos
        'url': 'https://www.youtube.com/c/12minuteathlete/videos',
        'playlist_mincount': 249,
        'info_dict': {
            'id': 'UUVjM-zV6_opMDx7WYxnjZiQ',
            'title': 'Uploads from 12 Minute Athlete',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/gametrailers',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/gametrailers',
        'only_matching': True,
    }, {
        # This channel is not available, geo restricted to JP
        'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Return False when any other Youtube*IE class in this module accepts ``url``."""
        # Don't return True if the url can be extracted with another youtube
        # extractor: the regex is too permissive and it would match.
        # Candidates are discovered by name from the module globals, excluding this class.
        other_yt_ies = iter(klass for (name, klass) in globals().items() if name.startswith('Youtube') and name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_yt_ies):
            return False
        else:
            return super(YoutubeUserIE, cls).suitable(url)

    def _build_template_url(self, url, channel_id):
        """Build the videos-page URL from the groups matched in ``url``.

        ``channel_id`` is not used; the id is re-read from the match.
        """
        mobj = re.match(self._VALID_URL, url)
        # Falls back to the 'user' path segment when the `user` group is empty/unmatched
        return self._TEMPLATE_URL % (mobj.group('user') or 'user', mobj.group('id'))

2791

2792

2793

class YoutubeLiveIE(YoutubeBaseInfoExtractor):
    """Extractor for ``.../live`` URLs of a user, channel or custom page."""
    IE_DESC = 'YouTube.com live streams'
    _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live'
    IE_NAME = 'youtube:live'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheYoungTurks/live',
        'info_dict': {
            'id': 'a48o2S1cPoo',
            'ext': 'mp4',
            'title': 'The Young Turks - Live Main Show',
            'uploader': 'The Young Turks',
            'uploader_id': 'TheYoungTurks',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
            'upload_date': '20150715',
            'license': 'Standard YouTube License',
            'description': 'md5:438179573adcdff3c97ebb1ee632b891',
            'categories': ['News & Politics'],
            'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/TheYoungTurks/live',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Delegate to the video named by the live page, else to the URL without ``/live``."""
        match = re.match(self._VALID_URL, url)
        channel_id, base_url = match.group('id', 'base_url')

        # The download is non-fatal: without a page we simply fall back to base_url
        webpage = self._download_webpage(url, channel_id, fatal=False)
        if not webpage:
            return self.url_result(base_url)

        page_type = self._og_search_property(
            'type', webpage, 'page type', default='')
        video_id = self._html_search_meta(
            'videoId', webpage, 'video id', default=None)
        # Only trust the page when it is a video page carrying an 11-character id
        points_to_video = (
            page_type.startswith('video') and video_id
            and re.match(r'^[0-9A-Za-z_-]{11}$', video_id))
        if points_to_video:
            return self.url_result(video_id, YoutubeIE.ie_key())
        return self.url_result(base_url)

2843

2844

2845

class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
    """Extractor for ``/user/<id>/playlists`` and ``/channel/<id>/playlists`` URLs.

    Only configuration is defined here; no methods are overridden in this class.
    """
    IE_DESC = 'YouTube.com user/channel playlists'
    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+)/playlists'
    IE_NAME = 'youtube:playlists'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
        'playlist_mincount': 4,
        'info_dict': {
            'id': 'ThirstForScience',
            'title': 'Thirst for Science',
        },
    }, {
        # with "Load more" button
        'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
        'playlist_mincount': 70,
        'info_dict': {
            'id': 'igorkle1',
            'title': 'Игорь Клейнер',
        },
    }, {
        'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists',
        'playlist_mincount': 17,
        'info_dict': {
            'id': 'UCiU1dHvZObB2iP6xkJ__Icw',
            'title': 'Chem Player',
        },
    }]

class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor):
    """Shared base of the search extractors; it only overrides ``_VIDEO_RE``."""
    # Captures the 11-character video id from a /watch link and, when present
    # later in the same tag, the value of its title attribute.
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?'

2877

2878

2879

class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor):
    """Extractor for the ``ytsearch`` search keyword."""
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # Extra parameters merged into the results URL query; overridden by subclasses
    _EXTRA_QUERY_ARGS = {}
    _TESTS = []

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query

        Result pages are fetched one after another, following the "Next" link,
        until ``n`` videos are collected, a page yields no videos, or no next
        link is found.

        Returns a playlist result holding at most ``n`` entries, with ``query``
        passed as the playlist id.
        Raises ExtractorError when a page contains the "search-message" marker.
        """

        videos = []

        url_query = {
            'search_query': query.encode('utf-8'),
        }
        url_query.update(self._EXTRA_QUERY_ARGS)
        result_url = 'https://www.youtube.com/results?' + compat_urllib_parse_urlencode(url_query)

        for pagenum in itertools.count(1):
            data = self._download_json(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % pagenum,
                errnote='Unable to download API page',
                query={'spf': 'navigate'})
            html_content = data[1]['body']['content']

            if 'class="search-message' in html_content:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            new_videos = list(self._process_page(html_content))
            videos += new_videos
            # Stop as soon as n results are available; '>' here would fetch
            # (and possibly fail on) one more page when exactly n were found.
            if not new_videos or len(videos) >= n:
                break
            next_link = self._html_search_regex(
                r'href="(/results\?[^"]*\bsp=[^"]+)"[^>]*>\s*<span[^>]+class="[^"]*\byt-uix-button-content\b[^"]*"[^>]*>Next',
                html_content, 'next link', default=None)
            if next_link is None:
                break
            result_url = compat_urlparse.urljoin('https://www.youtube.com/', next_link)

        # Keep the guard: n may be float('inf'), which is not a valid slice bound
        if len(videos) > n:
            videos = videos[:n]
        return self.playlist_result(videos, query)

2927

2928

2929

class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE for the ``ytsearchdate`` keyword.

    It differs from the parent only in the constants below.
    """
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = 'YouTube.com searches, newest videos first'
    # Merged into the results URL query by the inherited _get_n_results()
    _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}

2934

2935

2936

class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor):
    """Extractor for ``/results?...`` search page URLs."""
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the videos found on the search page as a playlist titled with the query."""
        raw_query = re.match(self._VALID_URL, url).group('query')
        # Decode percent-escapes and '+' so the title is the human-readable query
        query = compat_urllib_parse_unquote_plus(raw_query)
        results_page = self._download_webpage(url, query)
        entries = self._process_page(results_page)
        return self.playlist_result(entries, playlist_title=query)

2956

2957

2958

class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
    """Extractor for ``/show/<id>`` URLs."""
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
    IE_NAME = 'youtube:show'
    _TESTS = [{
        'url': 'https://www.youtube.com/show/airdisasters',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Run the inherited extraction on the show's ``/playlists`` page."""
        show_id = self._match_id(url)
        playlists_url = 'https://www.youtube.com/show/%s/playlists' % show_id
        return super(YoutubeShowIE, self)._real_extract(playlists_url)

2975

2976

2977

class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        """Extractor name derived from the subclass's ``_FEED_NAME``."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        """Log in before extraction."""
        self._login()

    def _entries(self, page):
        """Yield results for every new video id found in ``page`` and its "load more" pages."""
        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        seen_ids = []
        widget_html = videos_html = page
        for page_num in itertools.count(1):
            found = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', videos_html)

            # 'recommended' feed has infinite 'load more' and each new portion spins
            # the same videos in (sometimes) slightly different order, so we'll check
            # for unicity and break when portion has no new videos
            fresh_ids = [
                candidate for candidate in orderedSet(found)
                if candidate not in seen_ids]
            if not fresh_ids:
                return

            seen_ids.extend(fresh_ids)

            for entry in self._ids_to_results(fresh_ids):
                yield entry

            more_match = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', widget_html)
            if not more_match:
                return

            more_url = 'https://youtube.com/%s' % more_match.group('more')
            more = self._download_json(
                more_url, self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            # The next round scans the new portion and looks for the next widget
            videos_html = more['content_html']
            widget_html = more['load_more_widget_html']

    def _real_extract(self, url):
        """Download the feed page and return its videos as a playlist."""
        feed_url = 'https://www.youtube.com/feed/%s' % self._FEED_NAME
        page = self._download_webpage(feed_url, self._PLAYLIST_TITLE)
        return self.playlist_result(
            self._entries(page), playlist_title=self._PLAYLIST_TITLE)

3028

3029

3030

class YoutubeWatchLaterIE(YoutubePlaylistIE):
    """Extractor for the watch later list (playlist id ``WL``)."""
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater'

    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=WL',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the single video when one is selected, otherwise the ``WL`` playlist."""
        _, single_video = self._check_download_just_video(url, 'WL')
        if not single_video:
            _, watch_later = self._extract_playlist('WL')
            return watch_later
        return single_video

class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor for the favourites list (``/my_favorites`` or the ``:ytfav`` keyword)."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Find the favourites playlist id on the page and delegate to the playlist extractor."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The id is taken from the first list=... parameter found in the page
        favourites_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(favourites_id, 'YoutubePlaylist')

3061

3062

3063

class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the ``recommended`` feed; only configuration is defined here."""
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    # Feed path segment and playlist title consumed by YoutubeFeedsInfoExtractor
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'

3068

3069

3070

class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the ``subscriptions`` feed; only configuration is defined here."""
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    # Feed path segment and playlist title consumed by YoutubeFeedsInfoExtractor
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'

3075

3076

3077

class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the ``history`` feed; only configuration is defined here."""
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory'
    # Feed path segment and playlist title consumed by YoutubeFeedsInfoExtractor
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'

3082

3083

3084

class YoutubeTruncatedURLIE(InfoExtractor):
    """Matches watch/attribution URLs that carry no video id and reports them as truncated."""
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError hinting that the URL was not quoted."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(hint, expected=True)

class YoutubeTruncatedIDIE(InfoExtractor):
    """Matches watch URLs whose ``v`` id has 1 to 10 characters and reports them as truncated."""
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError naming the incomplete id and the URL."""
        short_id = self._match_id(url)
        complaint = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (short_id, url)
        raise ExtractorError(complaint, expected=True)