jfr.im git - yt-dlp.git/blame_incremental - youtube

Commit	Line	Data
	1	# coding: utf-8
	2
	3	from __future__ import unicode_literals
	4
	5
	6	import itertools
	7	import json
	8	import os.path
	9	import random
	10	import re
	11	import time
	12	import traceback
	13
	14	from .common import InfoExtractor, SearchInfoExtractor
	15	from ..jsinterp import JSInterpreter
	16	from ..swfinterp import SWFInterpreter
	17	from ..compat import (
	18	compat_chr,
	19	compat_kwargs,
	20	compat_parse_qs,
	21	compat_urllib_parse_unquote,
	22	compat_urllib_parse_unquote_plus,
	23	compat_urllib_parse_urlencode,
	24	compat_urllib_parse_urlparse,
	25	compat_urlparse,
	26	compat_str,
	27	)
	28	from ..utils import (
	29	clean_html,
	30	error_to_compat_str,
	31	ExtractorError,
	32	float_or_none,
	33	get_element_by_attribute,
	34	get_element_by_id,
	35	int_or_none,
	36	mimetype2ext,
	37	orderedSet,
	38	parse_codecs,
	39	parse_duration,
	40	qualities,
	41	remove_quotes,
	42	remove_start,
	43	smuggle_url,
	44	str_or_none,
	45	str_to_int,
	46	try_get,
	47	unescapeHTML,
	48	unified_strdate,
	49	unsmuggle_url,
	50	uppercase_escape,
	51	url_or_none,
	52	urlencode_postdata,
	53	)
	54
	55
	56	class YoutubeBaseInfoExtractor(InfoExtractor):
	57	"""Provide base functions for Youtube extractors"""
	58	_LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
	59	_TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'
	60
	61	_LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
	62	_CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
	63	_TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'
	64
	65	_NETRC_MACHINE = 'youtube'
	66	# If True it will raise an error if no login info is provided
	67	_LOGIN_REQUIRED = False
	68
	69	_PLAYLIST_ID_RE = r'(?:PL\|LL\|EC\|UU\|FL\|RD\|UL\|TL\|OLAK5uy_)[0-9A-Za-z-_]{10,}'
	70
	71	def _set_language(self):
	72	self._set_cookie(
	73	'.youtube.com', 'PREF', 'f1=50000000&hl=en',
	74	# YouTube sets the expire time to about two months
	75	expire_time=time.time() + 2 * 30 * 24 * 3600)
	76
	77	def _ids_to_results(self, ids):
	78	return [
	79	self.url_result(vid_id, 'Youtube', video_id=vid_id)
	80	for vid_id in ids]
	81
	82	def _login(self):
	83	"""
	84	Attempt to log in to YouTube.
	85	True is returned if successful or skipped.
	86	False is returned if login failed.
	87
	88	If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
	89	"""
	90	username, password = self._get_login_info()
	91	# No authentication to be performed
	92	if username is None:
	93	if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
	94	raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
	95	return True
	96
	97	login_page = self._download_webpage(
	98	self._LOGIN_URL, None,
	99	note='Downloading login page',
	100	errnote='unable to fetch login page', fatal=False)
	101	if login_page is False:
	102	return
	103
	104	login_form = self._hidden_inputs(login_page)
	105
	106	def req(url, f_req, note, errnote):
	107	data = login_form.copy()
	108	data.update({
	109	'pstMsg': 1,
	110	'checkConnection': 'youtube',
	111	'checkedDomains': 'youtube',
	112	'hl': 'en',
	113	'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
	114	'f.req': json.dumps(f_req),
	115	'flowName': 'GlifWebSignIn',
	116	'flowEntry': 'ServiceLogin',
	117	})
	118	return self._download_json(
	119	url, None, note=note, errnote=errnote,
	120	transform_source=lambda s: re.sub(r'^[^[]*', '', s),
	121	fatal=False,
	122	data=urlencode_postdata(data), headers={
	123	'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
	124	'Google-Accounts-XSRF': 1,
	125	})
	126
	127	def warn(message):
	128	self._downloader.report_warning(message)
	129
	130	lookup_req = [
	131	username,
	132	None, [], None, 'US', None, None, 2, False, True,
	133	[
	134	None, None,
	135	[2, 1, None, 1,
	136	'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
	137	None, [], 4],
	138	1, [None, None, []], None, None, None, True
	139	],
	140	username,
	141	]
	142
	143	lookup_results = req(
	144	self._LOOKUP_URL, lookup_req,
	145	'Looking up account info', 'Unable to look up account info')
	146
	147	if lookup_results is False:
	148	return False
	149
	150	user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
	151	if not user_hash:
	152	warn('Unable to extract user hash')
	153	return False
	154
	155	challenge_req = [
	156	user_hash,
	157	None, 1, None, [1, None, None, None, [password, None, True]],
	158	[
	159	None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
	160	1, [None, None, []], None, None, None, True
	161	]]
	162
	163	challenge_results = req(
	164	self._CHALLENGE_URL, challenge_req,
	165	'Logging in', 'Unable to log in')
	166
	167	if challenge_results is False:
	168	return
	169
	170	login_res = try_get(challenge_results, lambda x: x[0][5], list)
	171	if login_res:
	172	login_msg = try_get(login_res, lambda x: x[5], compat_str)
	173	warn(
	174	'Unable to login: %s' % 'Invalid password'
	175	if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg)
	176	return False
	177
	178	res = try_get(challenge_results, lambda x: x[0][-1], list)
	179	if not res:
	180	warn('Unable to extract result entry')
	181	return False
	182
	183	login_challenge = try_get(res, lambda x: x[0][0], list)
	184	if login_challenge:
	185	challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
	186	if challenge_str == 'TWO_STEP_VERIFICATION':
	187	# SEND_SUCCESS - TFA code has been successfully sent to phone
	188	# QUOTA_EXCEEDED - reached the limit of TFA codes
	189	status = try_get(login_challenge, lambda x: x[5], compat_str)
	190	if status == 'QUOTA_EXCEEDED':
	191	warn('Exceeded the limit of TFA codes, try later')
	192	return False
	193
	194	tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
	195	if not tl:
	196	warn('Unable to extract TL')
	197	return False
	198
	199	tfa_code = self._get_tfa_info('2-step verification code')
	200
	201	if not tfa_code:
	202	warn(
	203	'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
	204	'(Note that only TOTP (Google Authenticator App) codes work at this time.)')
	205	return False
	206
	207	tfa_code = remove_start(tfa_code, 'G-')
	208
	209	tfa_req = [
	210	user_hash, None, 2, None,
	211	[
	212	9, None, None, None, None, None, None, None,
	213	[None, tfa_code, True, 2]
	214	]]
	215
	216	tfa_results = req(
	217	self._TFA_URL.format(tl), tfa_req,
	218	'Submitting TFA code', 'Unable to submit TFA code')
	219
	220	if tfa_results is False:
	221	return False
	222
	223	tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
	224	if tfa_res:
	225	tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
	226	warn(
	227	'Unable to finish TFA: %s' % 'Invalid TFA code'
	228	if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg)
	229	return False
	230
	231	check_cookie_url = try_get(
	232	tfa_results, lambda x: x[0][-1][2], compat_str)
	233	else:
	234	CHALLENGES = {
	235	'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
	236	'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
	237	'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
	238	}
	239	challenge = CHALLENGES.get(
	240	challenge_str,
	241	'%s returned error %s.' % (self.IE_NAME, challenge_str))
	242	warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
	243	return False
	244	else:
	245	check_cookie_url = try_get(res, lambda x: x[2], compat_str)
	246
	247	if not check_cookie_url:
	248	warn('Unable to extract CheckCookie URL')
	249	return False
	250
	251	check_cookie_results = self._download_webpage(
	252	check_cookie_url, None, 'Checking cookie', fatal=False)
	253
	254	if check_cookie_results is False:
	255	return False
	256
	257	if 'https://myaccount.google.com/' not in check_cookie_results:
	258	warn('Unable to log in')
	259	return False
	260
	261	return True
	262
	263	def _download_webpage_handle(self, args, *kwargs):
	264	query = kwargs.get('query', {}).copy()
	265	query['disable_polymer'] = 'true'
	266	kwargs['query'] = query
	267	return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
	268	args, *compat_kwargs(kwargs))
	269
	270	def _real_initialize(self):
	271	if self._downloader is None:
	272	return
	273	self._set_language()
	274	if not self._login():
	275	return
	276
	277
	278	class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
	279	# Extract entries from page with "Load more" button
	280	def _entries(self, page, playlist_id):
	281	more_widget_html = content_html = page
	282	for page_num in itertools.count(1):
	283	for entry in self._process_page(content_html):
	284	yield entry
	285
	286	mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
	287	if not mobj:
	288	break
	289
	290	more = self._download_json(
	291	'https://youtube.com/%s' % mobj.group('more'), playlist_id,
	292	'Downloading page #%s' % page_num,
	293	transform_source=uppercase_escape)
	294	content_html = more['content_html']
	295	if not content_html.strip():
	296	# Some webpages show a "Load more" button but they don't
	297	# have more videos
	298	break
	299	more_widget_html = more['load_more_widget_html']
	300
	301
	302	class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
	303	def _process_page(self, content):
	304	for video_id, video_title in self.extract_videos_from_page(content):
	305	yield self.url_result(video_id, 'Youtube', video_id, video_title)
	306
	307	def extract_videos_from_page(self, page):
	308	ids_in_page = []
	309	titles_in_page = []
	310	for mobj in re.finditer(self._VIDEO_RE, page):
	311	# The link with index 0 is not the first video of the playlist (not sure if still actual)
	312	if 'index' in mobj.groupdict() and mobj.group('id') == '0':
	313	continue
	314	video_id = mobj.group('id')
	315	video_title = unescapeHTML(mobj.group('title'))
	316	if video_title:
	317	video_title = video_title.strip()
	318	try:
	319	idx = ids_in_page.index(video_id)
	320	if video_title and not titles_in_page[idx]:
	321	titles_in_page[idx] = video_title
	322	except ValueError:
	323	ids_in_page.append(video_id)
	324	titles_in_page.append(video_title)
	325	return zip(ids_in_page, titles_in_page)
	326
	327
	328	class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
	329	def _process_page(self, content):
	330	for playlist_id in orderedSet(re.findall(
	331	r'<h3[^>]+class="[^"]yt-lockup-title[^"]"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"',
	332	content)):
	333	yield self.url_result(
	334	'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist')
	335
	336	def _real_extract(self, url):
	337	playlist_id = self._match_id(url)
	338	webpage = self._download_webpage(url, playlist_id)
	339	title = self._og_search_title(webpage, fatal=False)
	340	return self.playlist_result(self._entries(webpage, playlist_id), playlist_id, title)
	341
	342
	343	class YoutubeIE(YoutubeBaseInfoExtractor):
	344	IE_DESC = 'YouTube.com'
	345	_VALID_URL = r"""(?x)^
	346	(
	347	(?:https?://\|//) # http(s):// or protocol-independent URL
	348	(?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/\|
	349	(?:www\.)?deturl\.com/www\.youtube\.com/\|
	350	(?:www\.)?pwnyoutube\.com/\|
	351	(?:www\.)?hooktube\.com/\|
	352	(?:www\.)?yourepeat\.com/\|
	353	tube\.majestyc\.net/\|
	354	(?:(?:www\|dev)\.)?invidio\.us/\|
	355	(?:www\.)?invidiou\.sh/\|
	356	(?:www\.)?invidious\.snopyta\.org/\|
	357	(?:www\.)?invidious\.kabi\.tk/\|
	358	(?:www\.)?vid\.wxzm\.sx/\|
	359	youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
	360	(?:.*?\#/)? # handle anchor (#/) redirect urls
	361	(?: # the various things that can precede the ID:
	362	(?:(?:v\|embed\|e)/(?!videoseries)) # v/ or embed/ or e/
	363	\|(?: # or the v= param in all its forms
	364	(?:(?:watch\|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup\|.php) or nothing (like /?v=xxxx)
	365	(?:\?\|\#!?) # the params delimiter ? or # or #!
	366	(?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&v=V36LpHqtcDY)
	367	v=
	368	)
	369	))
	370	\|(?:
	371	youtu\.be\| # just youtu.be/xxxx
	372	vid\.plus\| # or vid.plus/xxxx
	373	zwearz\.com/watch\| # or zwearz.com/watch/xxxx
	374	)/
	375	\|(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
	376	)
	377	)? # all until now is optional -> you can pass the naked ID
	378	([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
	379	(?!.*?\blist=
	380	(?:
	381	%(playlist_id)s\| # combined list/video URLs are handled by the playlist IE
	382	WL # WL are handled by the watch later IE
	383	)
	384	)
	385	(?(1).+)? # if we found the ID, everything can follow
	386	$""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
	387	_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
	388	_formats = {
	389	'5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
	390	'6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
	391	'13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
	392	'17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
	393	'18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
	394	'22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
	395	'34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
	396	'35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
	397	# itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
	398	'36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
	399	'37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
	400	'38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
	401	'43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
	402	'44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
	403	'45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
	404	'46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
	405	'59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
	406	'78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
	407
	408
	409	# 3D videos
	410	'82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
	411	'83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
	412	'84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
	413	'85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
	414	'100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
	415	'101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
	416	'102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
	417
	418	# Apple HTTP Live Streaming
	419	'91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
	420	'92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
	421	'93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
	422	'94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
	423	'95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
	424	'96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
	425	'132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
	426	'151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
	427
	428	# DASH mp4 video
	429	'133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
	430	'134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
	431	'135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
	432	'136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
	433	'137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
	434	'138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
	435	'160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
	436	'212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
	437	'264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
	438	'298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
	439	'299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
	440	'266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
	441
	442	# Dash mp4 audio
	443	'139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
	444	'140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
	445	'141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
	446	'256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
	447	'258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
	448	'325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
	449	'328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
	450
	451	# Dash webm
	452	'167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
	453	'168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
	454	'169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
	455	'170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
	456	'218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
	457	'219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
	458	'278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
	459	'242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
	460	'243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
	461	'244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
	462	'245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
	463	'246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
	464	'247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
	465	'248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
	466	'271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
	467	# itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
	468	'272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
	469	'302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
	470	'303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
	471	'308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
	472	'313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
	473	'315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
	474
	475	# Dash webm audio
	476	'171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
	477	'172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
	478
	479	# Dash webm audio with opus inside
	480	'249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
	481	'250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
	482	'251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
	483
	484	# RTMP (unnamed)
	485	'_rtmp': {'protocol': 'rtmp'},
	486	}
	487	_SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt')
	488
	489	_GEO_BYPASS = False
	490
	491	IE_NAME = 'youtube'
	492	_TESTS = [
	493	{
	494	'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
	495	'info_dict': {
	496	'id': 'BaW_jenozKc',
	497	'ext': 'mp4',
	498	'title': 'youtube-dl test video "\'/\\ä↭𝕐',
	499	'uploader': 'Philipp Hagemeister',
	500	'uploader_id': 'phihag',

1

# coding: utf-8

2

3

from __future__ import unicode_literals

import itertools

import json

import os.path

import random

import re

import time

import traceback

from .common import InfoExtractor, SearchInfoExtractor

15

from ..jsinterp import JSInterpreter

16

from ..swfinterp import SWFInterpreter

17

from ..compat import (

compat_chr,

compat_kwargs,

compat_parse_qs,

compat_urllib_parse_unquote,

22

compat_urllib_parse_unquote_plus,

23

compat_urllib_parse_urlencode,

24

compat_urllib_parse_urlparse,

compat_urlparse,

compat_str,

)

from ..utils import (

clean_html,

error_to_compat_str,

ExtractorError,

float_or_none,

get_element_by_attribute,

get_element_by_id,

int_or_none,

mimetype2ext,

orderedSet,

parse_codecs,

parse_duration,

qualities,

remove_quotes,

remove_start,

smuggle_url,

str_or_none,

str_to_int,

try_get,

unescapeHTML,

unified_strdate,

unsmuggle_url,

uppercase_escape,

url_or_none,

urlencode_postdata,

)

class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    # Known playlist ID prefixes followed by at least 10 ID characters
    _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)[0-9A-Za-z-_]{10,}'

    def _set_language(self):
        """Set the PREF cookie on .youtube.com with hl=en."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Return one 'Youtube' url_result per video id in ids."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Explicit False (not None) so the result matches the docstring
            return False

        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            """Send the hidden login form fields plus the JSON-encoded f_req to url.

            Returns the parsed JSON response, or False on failure (fatal=False).
            """
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                # Drop everything that precedes the first '[' before JSON parsing
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            self._downloader.report_warning(message)

        # The same ServiceLogin URL is embedded in both the lookup and the challenge payloads
        service_login_url = 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn'

        # Positional payload; the meaning of the individual slots is not visible here
        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 service_login_url,
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, service_login_url, None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            # Explicit False (not None) so the result matches the docstring
            return False

        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            # Pick the reason first: '%' binds tighter than the conditional
            # expression, so formatting inline would drop the prefix for
            # every message other than INCORRECT_ANSWER_ENTERED.
            login_reason = (
                'Invalid password'
                if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg)
            warn('Unable to login: %s' % login_reason)
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    # Trailing space keeps the two adjacent literals from running together
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code> '
                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                tfa_code = remove_start(tfa_code, 'G-')

                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    # Same precedence issue as the password warning above
                    tfa_reason = (
                        'Invalid TFA code'
                        if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg)
                    warn('Unable to finish TFA: %s' % tfa_reason)
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _download_webpage_handle(self, *args, **kwargs):
        """Call the parent implementation with disable_polymer=true added to the query."""
        # Copy so that a caller-supplied query dict is not mutated
        query = kwargs.get('query', {}).copy()
        query['disable_polymer'] = 'true'
        kwargs['query'] = query
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _real_initialize(self):
        """Set the language cookie and attempt to log in, if a downloader is attached."""
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return

class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
    # Extract entries from page with "Load more" button
    def _entries(self, page, playlist_id):
        """Yield the entries of page and of every follow-up "Load more" page.

        The first page is used both as the content to process and as the
        place to look for the "Load more" link. Iteration stops when no
        link is found or a follow-up page has blank content.
        """
        current_html = page
        widget_html = page
        page_num = 1
        while True:
            for entry in self._process_page(current_html):
                yield entry

            load_more = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', widget_html)
            if load_more is None:
                return

            response = self._download_json(
                'https://youtube.com/%s' % load_more.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            current_html = response['content_html']
            # Some webpages show a "Load more" button but they don't
            # have more videos
            if not current_html.strip():
                return
            widget_html = response['load_more_widget_html']
            page_num += 1

300

301

302

class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    """Base for playlist-like extractors that find videos with ``self._VIDEO_RE``."""

    def _process_page(self, content):
        """Yield one ``url_result`` per distinct video found in ``content``."""
        for video_id, video_title in self.extract_videos_from_page(content):
            yield self.url_result(video_id, 'Youtube', video_id, video_title)

    def extract_videos_from_page(self, page):
        """Return ``(video_id, title)`` pairs for the videos matched in ``page``.

        Every id is reported once, in order of first appearance. When an id
        is matched several times, the first non-empty title seen is kept.
        ``self._VIDEO_RE`` must define the named groups ``id`` and ``title``.
        """
        ids_in_page = []
        titles_in_page = []
        # Maps a video id to its position in the two lists above, so that
        # duplicates are detected without a linear list.index() scan.
        position_by_id = {}
        for mobj in re.finditer(self._VIDEO_RE, page):
            # The link with index 0 is not the first video of the playlist (not sure if still actual)
            # NOTE(review): the test compares the 'id' group, not 'index' -- confirm this is intended.
            if 'index' in mobj.groupdict() and mobj.group('id') == '0':
                continue
            video_id = mobj.group('id')
            video_title = unescapeHTML(mobj.group('title'))
            if video_title:
                video_title = video_title.strip()
            idx = position_by_id.get(video_id)
            if idx is None:
                position_by_id[video_id] = len(ids_in_page)
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)
            elif video_title and not titles_in_page[idx]:
                # Fill in a title that earlier matches of this id lacked
                titles_in_page[idx] = video_title
        return zip(ids_in_page, titles_in_page)

326

327

328

class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    """Base for extractors of pages that list several playlists."""

    def _process_page(self, content):
        """Yield a ``YoutubePlaylist`` url_result for each playlist link in ``content``."""
        playlist_ids = re.findall(
            r'<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"',
            content)
        # orderedSet presumably drops duplicates while keeping order -- confirm
        for playlist_id in orderedSet(playlist_ids):
            playlist_url = 'https://www.youtube.com/playlist?list=%s' % playlist_id
            yield self.url_result(playlist_url, 'YoutubePlaylist')

    def _real_extract(self, url):
        """Download ``url`` and return a playlist result built from its entries.

        The playlist title is taken from the page's OpenGraph title, looked
        up non-fatally.
        """
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)
        title = self._og_search_title(webpage, fatal=False)
        entries = self._entries(webpage, playlist_id)
        return self.playlist_result(entries, playlist_id, title)

341

342

343

class YoutubeIE(YoutubeBaseInfoExtractor):

344

IE_DESC = 'YouTube.com'

345

_VALID_URL = r"""(?x)^

346

(

347

(?:https?://|//) # http(s):// or protocol-independent URL

348

(?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|

349

(?:www\.)?deturl\.com/www\.youtube\.com/|

350

(?:www\.)?pwnyoutube\.com/|

351

(?:www\.)?hooktube\.com/|

352

(?:www\.)?yourepeat\.com/|

353

tube\.majestyc\.net/|

354

(?:(?:www|dev)\.)?invidio\.us/|

355

(?:www\.)?invidiou\.sh/|

356

(?:www\.)?invidious\.snopyta\.org/|

357

(?:www\.)?invidious\.kabi\.tk/|

358

(?:www\.)?vid\.wxzm\.sx/|

359

youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains

360

(?:.*?\#/)? # handle anchor (#/) redirect urls

361

(?: # the various things that can precede the ID:

362

(?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/

363

|(?: # or the v= param in all its forms

364

(?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)

365

(?:\?|\#!?) # the params delimiter ? or # or #!

366

(?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&v=V36LpHqtcDY)

v=

)

))

|(?:

youtu\.be| # just youtu.be/xxxx

372

vid\.plus| # or vid.plus/xxxx

373

zwearz\.com/watch| # or zwearz.com/watch/xxxx

374

)/

375

|(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=

376

)

377

)? # all until now is optional -> you can pass the naked ID

378

([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID

379

(?!.*?\blist=

380

(?:

381

%(playlist_id)s| # combined list/video URLs are handled by the playlist IE

382

WL # WL are handled by the watch later IE

383

)

384

)

385

(?(1).+)? # if we found the ID, everything can follow

386

$""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}

387

_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'

388

_formats = {

389

'5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},

390

'6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},

391

'13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},

392

'17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},

393

'18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},

394

'22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},

395

'34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},

396

'35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},

397

# itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well

398

'36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},

399

'37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},

400

'38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},

401

'43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},

402

'44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},

403

'45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},

404

'46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},

405

'59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},

406

'78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},

# 3D videos

'82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},

411

'83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},

412

'84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},

413

'85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},

414

'100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},

415

'101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},

416

'102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},

417

418

# Apple HTTP Live Streaming

419

'91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},

420

'92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},

421

'93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},

422

'94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},

423

'95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},

424

'96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},

425

'132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},

426

'151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},

427

428

# DASH mp4 video

429

'133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},

430

'134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},

431

'135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},

432

'136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},

433

'137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},

434

'138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)

435

'160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},

436

'212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},

437

'264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},

438

'298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},

439

'299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},

440

'266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},

441

442

# Dash mp4 audio

443

'139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},

444

'140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},

445

'141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},

446

'256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},

447

'258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},

448

'325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},

449

'328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},

450

451

# Dash webm

452

'167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},

453

'168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},

454

'169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},

455

'170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},

456

'218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},

457

'219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},

458

'278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},

459

'242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},

460

'243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},

461

'244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},

462

'245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},

463

'246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},

464

'247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},

465

'248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},

466

'271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},

467

# itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)

468

'272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},

469

'302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},

470

'303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},

471

'308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},

472

'313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},

473

'315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},

474

475

# Dash webm audio

476

'171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},

477

'172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},

478

479

# Dash webm audio with opus inside

480

'249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},

481

'250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},

482

'251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},

483

484

# RTMP (unnamed)

485

'_rtmp': {'protocol': 'rtmp'},

486

}

487

_SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt')

_GEO_BYPASS = False

IE_NAME = 'youtube'

_TESTS = [

{

'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',

'info_dict': {

'id': 'BaW_jenozKc',

'ext': 'mp4',

'title': 'youtube-dl test video "\'/\\ä↭𝕐',

499

'uploader': 'Philipp Hagemeister',

500

'uploader_id': 'phihag',

501

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',

502

'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',

503

'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',

504

'upload_date': '20121002',

505

'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',

506

'categories': ['Science & Technology'],

507

'tags': ['youtube-dl'],

'duration': 10,

'view_count': int,

'like_count': int,

'dislike_count': int,

'start_time': 1,

'end_time': 9,

}

},

{

'url': 'https://www.youtube.com/watch?v=UxxajLWwzqY',

518

'note': 'Test generic use_cipher_signature video (#897)',

'info_dict': {

'id': 'UxxajLWwzqY',

'ext': 'mp4',

'upload_date': '20120506',

523

'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',

524

'alt_title': 'I Love It (feat. Charli XCX)',

525

'description': 'md5:f3ceb5ef83a08d95b9d146f973157cc8',

526

'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',

527

'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',

528

'iconic ep', 'iconic', 'love', 'it'],

529

'duration': 180,

530

'uploader': 'Icona Pop',

531

'uploader_id': 'IconaPop',

532

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop',

533

'creator': 'Icona Pop',

534

'track': 'I Love It (feat. Charli XCX)',

535

'artist': 'Icona Pop',

}

},

{

'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',

540

'note': 'Test VEVO video with age protection (#956)',

'info_dict': {

'id': '07FYdnEawAQ',

'ext': 'mp4',

'upload_date': '20130703',

545

'title': 'Justin Timberlake - Tunnel Vision (Official Music Video) (Explicit)',

546

'alt_title': 'Tunnel Vision',

547

'description': 'md5:07dab3356cde4199048e4c7cd93471e1',

548

'duration': 419,

549

'uploader': 'justintimberlakeVEVO',

550

'uploader_id': 'justintimberlakeVEVO',

551

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO',

552

'creator': 'Justin Timberlake',

553

'track': 'Tunnel Vision',

554

'artist': 'Justin Timberlake',

'age_limit': 18,

}

},

{

'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',

560

'note': 'Embed-only video (#1746)',

'info_dict': {

'id': 'yZIXLfi8CZQ',

'ext': 'mp4',

'upload_date': '20120608',

565

'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',

566

'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',

567

'uploader': 'SET India',

568

'uploader_id': 'setindia',

569

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',

'age_limit': 18,

}

},

{

'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY',

575

'note': 'Use the first video ID in the URL',

'info_dict': {

'id': 'BaW_jenozKc',

'ext': 'mp4',

'title': 'youtube-dl test video "\'/\\ä↭𝕐',

580

'uploader': 'Philipp Hagemeister',

581

'uploader_id': 'phihag',

582

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',

583

'upload_date': '20121002',

584

'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',

585

'categories': ['Science & Technology'],

586

'tags': ['youtube-dl'],

'duration': 10,

'view_count': int,

'like_count': int,

'dislike_count': int,

591

},

592

'params': {

593

'skip_download': True,

},

},

{

'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',

598

'note': '256k DASH audio (format 141) via DASH manifest',

'info_dict': {

'id': 'a9LDPn-MO4I',

'ext': 'm4a',

'upload_date': '20121002',

603

'uploader_id': '8KVIDEO',

604

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',

605

'description': '',

606

'uploader': '8KVIDEO',

607

'title': 'UHDTV TEST 8K VIDEO.mp4'

608

},

609

'params': {

610

'youtube_include_dash_manifest': True,

611

'format': '141',

612

},

613

'skip': 'format 141 not served anymore',

614

},

615

# DASH manifest with encrypted signature

616

{

617

'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',

'info_dict': {

'id': 'IB3lcPjvWLA',

'ext': 'm4a',

'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson',

622

'description': 'md5:8f5e2b82460520b619ccac1f509d43bf',

623

'duration': 244,

624

'uploader': 'AfrojackVEVO',

625

'uploader_id': 'AfrojackVEVO',

626

'upload_date': '20131011',

627

},

628

'params': {

629

'youtube_include_dash_manifest': True,

630

'format': '141/bestaudio[ext=m4a]',

631

},

632

},

633

# JS player signature function name containing $

634

{

635

'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',

'info_dict': {

'id': 'nfWlot6h_JM',

'ext': 'm4a',

'title': 'Taylor Swift - Shake It Off',

640

'description': 'md5:bec2185232c05479482cb5a9b82719bf',

641

'duration': 242,

642

'uploader': 'TaylorSwiftVEVO',

643

'uploader_id': 'TaylorSwiftVEVO',

644

'upload_date': '20140818',

645

'creator': 'Taylor Swift',

646

},

647

'params': {

648

'youtube_include_dash_manifest': True,

649

'format': '141/bestaudio[ext=m4a]',

},

},

# Controversy video

{

'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',

'info_dict': {

'id': 'T4XJQO3qol8',

'ext': 'mp4',

'duration': 219,

'upload_date': '20100909',

660

'uploader': 'Amazing Atheist',

661

'uploader_id': 'TheAmazingAtheist',

662

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',

663

'title': 'Burning Everyone\'s Koran',

664

'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',

665

}

666

},

667

# Normal age-gate video (No vevo, embed allowed)

668

{

669

'url': 'https://youtube.com/watch?v=HtVdAasjOgU',

'info_dict': {

'id': 'HtVdAasjOgU',

'ext': 'mp4',

'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',

674

'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',

675

'duration': 142,

676

'uploader': 'The Witcher',

677

'uploader_id': 'WitcherGame',

678

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',

679

'upload_date': '20140605',

'age_limit': 18,

},

},

# Age-gate video with encrypted signature

684

{

685

'url': 'https://www.youtube.com/watch?v=6kLq3WMV1nU',

'info_dict': {

'id': '6kLq3WMV1nU',

'ext': 'mp4',

'title': 'Dedication To My Ex (Miss That) (Lyric Video)',

690

'description': 'md5:33765bb339e1b47e7e72b5490139bb41',

691

'duration': 246,

692

'uploader': 'LloydVEVO',

693

'uploader_id': 'LloydVEVO',

694

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/LloydVEVO',

695

'upload_date': '20110629',

'age_limit': 18,

},

},

# video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421)

700

# YouTube Red ad is not captured for creator

701

{

702

'url': '__2ABJjxzNo',

'info_dict': {

'id': '__2ABJjxzNo',

'ext': 'mp4',

'duration': 266,

'upload_date': '20100430',

708

'uploader_id': 'deadmau5',

709

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',

710

'creator': 'deadmau5',

711

'description': 'md5:12c56784b8032162bb936a5f76d55360',

712

'uploader': 'deadmau5',

713

'title': 'Deadmau5 - Some Chords (HD)',

714

'alt_title': 'Some Chords',

715

},

716

'expected_warnings': [

717

'DASH manifest missing',

718

]

719

},

720

# Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)

721

{

722

'url': 'lqQg6PlCWgI',

'info_dict': {

'id': 'lqQg6PlCWgI',

'ext': 'mp4',

'duration': 6085,

'upload_date': '20150827',

728

'uploader_id': 'olympic',

729

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',

730

'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',

731

'uploader': 'Olympic',

732

'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',

733

},

734

'params': {

735

'skip_download': 'requires avconv',

}

},

# Non-square pixels

{

'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',

'info_dict': {

'id': '_b-2C3KPAM0',

'ext': 'mp4',

'stretched_ratio': 16 / 9.,

745

'duration': 85,

746

'upload_date': '20110310',

747

'uploader_id': 'AllenMeow',

748

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',

749

'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',

750

'uploader': '孫ᄋᄅ',

751

'title': '[A-made] 變態妍字幕版太妍我就是這樣的人',

752

},

753

},

754

# url_encoded_fmt_stream_map is empty string

755

{

756

'url': 'qEJwOuvDf7I',

'info_dict': {

'id': 'qEJwOuvDf7I',

'ext': 'webm',

'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',

761

'description': '',

762

'upload_date': '20150404',

763

'uploader_id': 'spbelect',

764

'uploader': 'Наблюдатели Петербурга',

765

},

766

'params': {

767

'skip_download': 'requires avconv',

768

},

769

'skip': 'This live event has ended.',

770

},

771

# Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)

772

{

773

'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',

'info_dict': {

'id': 'FIl7x6_3R5Y',

'ext': 'webm',

'title': 'md5:7b81415841e02ecd4313668cde88737a',

778

'description': 'md5:116377fd2963b81ec4ce64b542173306',

779

'duration': 220,

780

'upload_date': '20150625',

781

'uploader_id': 'dorappi2000',

782

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',

783

'uploader': 'dorappi2000',

784

'formats': 'mincount:31',

785

},

786

'skip': 'not actual anymore',

787

},

788

# DASH manifest with segment_list

789

{

790

'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',

791

'md5': '8ce563a1d667b599d21064e982ab9e31',

'info_dict': {

'id': 'CsmdDsKjzN8',

'ext': 'mp4',

'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510

796

'uploader': 'Airtek',

797

'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',

798

'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',

799

'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',

800

},

801

'params': {

802

'youtube_include_dash_manifest': True,

803

'format': '135', # bestvideo

804

},

805

'skip': 'This live event has ended.',

806

},

807

{

808

# Multifeed videos (multiple cameras), URL is for Main Camera

809

'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',

810

'info_dict': {

811

'id': 'jqWvoWXjCVs',

812

'title': 'teamPGP: Rocket League Noob Stream',

813

'description': 'md5:dc7872fb300e143831327f1bae3af010',

},

'playlist': [{

'info_dict': {

'id': 'jqWvoWXjCVs',

'ext': 'mp4',

'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',

820

'description': 'md5:dc7872fb300e143831327f1bae3af010',

821

'duration': 7335,

822

'upload_date': '20150721',

823

'uploader': 'Beer Games Beer',

824

'uploader_id': 'beergamesbeer',

825

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',

826

'license': 'Standard YouTube License',

},

}, {

'info_dict': {

'id': '6h8e8xoXJzg',

'ext': 'mp4',

'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',

833

'description': 'md5:dc7872fb300e143831327f1bae3af010',

834

'duration': 7337,

835

'upload_date': '20150721',

836

'uploader': 'Beer Games Beer',

837

'uploader_id': 'beergamesbeer',

838

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',

839

'license': 'Standard YouTube License',

},

}, {

'info_dict': {

'id': 'PUOgX5z9xZw',

'ext': 'mp4',

'title': 'teamPGP: Rocket League Noob Stream (grizzle)',

846

'description': 'md5:dc7872fb300e143831327f1bae3af010',

847

'duration': 7337,

848

'upload_date': '20150721',

849

'uploader': 'Beer Games Beer',

850

'uploader_id': 'beergamesbeer',

851

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',

852

'license': 'Standard YouTube License',

},

}, {

'info_dict': {

'id': 'teuwxikvS5k',

'ext': 'mp4',

'title': 'teamPGP: Rocket League Noob Stream (zim)',

859

'description': 'md5:dc7872fb300e143831327f1bae3af010',

860

'duration': 7334,

861

'upload_date': '20150721',

862

'uploader': 'Beer Games Beer',

863

'uploader_id': 'beergamesbeer',

864

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',

865

'license': 'Standard YouTube License',

},

}],

'params': {

'skip_download': True,

870

},

871

'skip': 'This video is not available.',

872

},

873

{

874

# Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)

875

'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',

876

'info_dict': {

877

'id': 'gVfLd0zydlo',

878

'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',

879

},

880

'playlist_count': 2,

881

'skip': 'Not multifeed anymore',

882

},

883

{

884

'url': 'https://vid.plus/FlRa-iH7PGw',

885

'only_matching': True,

886

},

887

{

888

'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',

889

'only_matching': True,

890

},

891

{

892

# Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)

893

# Also tests cut-off URL expansion in video description (see

894

# https://github.com/ytdl-org/youtube-dl/issues/1892,

895

# https://github.com/ytdl-org/youtube-dl/issues/8164)

896

'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',

'info_dict': {

'id': 'lsguqyKfVQg',

'ext': 'mp4',

'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',

901

'alt_title': 'Dark Walk - Position Music',

902

'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',

903

'duration': 133,

904

'upload_date': '20151119',

905

'uploader_id': 'IronSoulElf',

906

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',

907

'uploader': 'IronSoulElf',

908

'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',

909

'track': 'Dark Walk - Position Music',

910

'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',

911

},

912

'params': {

913

'skip_download': True,

},

},

{

# Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)

918

'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',

919

'only_matching': True,

920

},

921

{

922

# Video with yt:stretch=17:0

923

'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',

'info_dict': {

'id': 'Q39EVAstoRM',

'ext': 'mp4',

'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',

928

'description': 'md5:ee18a25c350637c8faff806845bddee9',

929

'upload_date': '20151107',

930

'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',

931

'uploader': 'CH GAMER DROID',

932

},

933

'params': {

934

'skip_download': True,

935

},

936

'skip': 'This video does not exist.',

937

},

938

{

939

# Video licensed under Creative Commons

940

'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',

'info_dict': {

'id': 'M4gD1WSo5mA',

'ext': 'mp4',

'title': 'md5:e41008789470fc2533a3252216f1c1d1',

945

'description': 'md5:a677553cf0840649b731a3024aeff4cc',

946

'duration': 721,

947

'upload_date': '20150127',

948

'uploader_id': 'BerkmanCenter',

949

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',

950

'uploader': 'The Berkman Klein Center for Internet & Society',

951

'license': 'Creative Commons Attribution license (reuse allowed)',

952

},

953

'params': {

954

'skip_download': True,

},

},

{

# Channel-like uploader_url

959

'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',

'info_dict': {

'id': 'eQcmzGIKrzg',

'ext': 'mp4',

'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',

964

'description': 'md5:dda0d780d5a6e120758d1711d062a867',

965

'duration': 4060,

966

'upload_date': '20151119',

967

'uploader': 'Bernie Sanders',

968

'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',

969

'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',

970

'license': 'Creative Commons Attribution license (reuse allowed)',

971

},

972

'params': {

973

'skip_download': True,

},

},

{

'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;v=V36LpHqtcDY',

978

'only_matching': True,

979

},

980

{

981

# YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)

982

'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',

983

'only_matching': True,

984

},

985

{

986

# Rental video preview

987

'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',

'info_dict': {

'id': 'uGpuVWrhIzE',

'ext': 'mp4',

'title': 'Piku - Trailer',

992

'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',

993

'upload_date': '20150811',

994

'uploader': 'FlixMatrix',

995

'uploader_id': 'FlixMatrixKaravan',

996

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',

997

'license': 'Standard YouTube License',

998

},

999

'params': {

1000

'skip_download': True,

1001

},

1002

'skip': 'This video is not available.',

1003

},

1004

{

1005

# YouTube Red video with episode data

1006

'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',

'info_dict': {

'id': 'iqKdEhx-dD4',

'ext': 'mp4',

'title': 'Isolation - Mind Field (Ep 1)',

1011

'description': 'md5:46a29be4ceffa65b92d277b93f463c0f',

1012

'duration': 2085,

1013

'upload_date': '20170118',

1014

'uploader': 'Vsauce',

1015

'uploader_id': 'Vsauce',

1016

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',

1017

'series': 'Mind Field',

'season_number': 1,

'episode_number': 1,

},

'params': {

'skip_download': True,

1023

},

1024

'expected_warnings': [

1025

'Skipping DASH manifest',

],

},

{

# The following content has been identified by the YouTube community

1030

# as inappropriate or offensive to some audiences.

1031

'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',

'info_dict': {

'id': '6SJNVb0GnPI',

'ext': 'mp4',

'title': 'Race Differences in Intelligence',

1036

'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',

1037

'duration': 965,

1038

'upload_date': '20140124',

1039

'uploader': 'New Century Foundation',

1040

'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',

1041

'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',

1042

},

1043

'params': {

1044

'skip_download': True,

},

},

{

# itag 212

'url': '1t24XAntNCY',

1050

'only_matching': True,

1051

},

1052

{

1053

# geo restricted to JP

1054

'url': 'sJL6WA-aGkQ',

1055

'only_matching': True,

1056

},

1057

{

1058

'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',

1059

'only_matching': True,

1060

},

1061

{

1062

'url': 'https://invidio.us/watch?v=BaW_jenozKc',

1063

'only_matching': True,

},

{

# DRM protected

'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',

1068

'only_matching': True,

1069

},

1070

{

1071

# Video with unsupported adaptive stream type formats

1072

'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',

'info_dict': {

'id': 'Z4Vy8R84T1U',

'ext': 'mp4',

'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',

1077

'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',

1078

'duration': 433,

1079

'upload_date': '20130923',

1080

'uploader': 'Amelia Putri Harwita',

1081

'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',

1082

'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',

1083

'formats': 'maxcount:10',

1084

},

1085

'params': {

1086

'skip_download': True,

1087

'youtube_include_dash_manifest': False,

},

}

]

def __init__(self, *args, **kwargs):
    """Initialise the extractor and create an empty per-instance player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Filled lazily by _decrypt_signature; starts out empty.
    self._player_cache = {}

1095

1096

def report_video_info_webpage_download(self, video_id):
    """Print a notice that the video info webpage of video_id is being downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)

1099

1100

def report_information_extraction(self, video_id):
    """Print a notice that information for video_id is being extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)

1103

1104

def report_unavailable_format(self, video_id, format):
    """Print a notice that the given format is not available for video_id."""
    message = '%s: Format %s not available' % (video_id, format)
    self.to_screen(message)

1107

1108

def report_rtmp_download(self):
    """Print a notice that the download will go through the RTMP protocol."""
    message = 'RTMP download detected'
    self.to_screen(message)

1111

1112

def _signature_cache_id(self, example_sig):

1113

""" Return a string representation of a signature """

1114

return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))

1115

1116

def _extract_signature_function(self, video_id, player_url, example_sig):

1117

id_m = re.match(

1118

r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2,3}_[A-Z]{2})?/base)?\.(?P<ext>[a-z]+)$',

1119

player_url)

1120

if not id_m:

1121

raise ExtractorError('Cannot identify player %r' % player_url)

1122

player_type = id_m.group('ext')

1123

player_id = id_m.group('id')

1124

1125

# Read from filesystem cache

1126

func_id = '%s_%s_%s' % (

1127

player_type, player_id, self._signature_cache_id(example_sig))

1128

assert os.path.basename(func_id) == func_id

1129

1130

cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)

1131

if cache_spec is not None:

1132

return lambda s: ''.join(s[i] for i in cache_spec)

1133

1134

download_note = (

1135

'Downloading player %s' % player_url

1136

if self._downloader.params.get('verbose') else

1137

'Downloading %s player %s' % (player_type, player_id)

1138

)

1139

if player_type == 'js':

1140

code = self._download_webpage(

1141

player_url, video_id,

1142

note=download_note,

1143

errnote='Download of %s failed' % player_url)

1144

res = self._parse_sig_js(code)

1145

elif player_type == 'swf':

1146

urlh = self._request_webpage(

1147

player_url, video_id,

1148

note=download_note,

1149

errnote='Download of %s failed' % player_url)

1150

code = urlh.read()

1151

res = self._parse_sig_swf(code)

1152

else:

1153

assert False, 'Invalid player type %r' % player_type

1154

1155

test_string = ''.join(map(compat_chr, range(len(example_sig))))

1156

cache_res = res(test_string)

1157

cache_spec = [ord(c) for c in cache_res]

1158

1159

self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)

1160

return res

1161

1162

def _print_sig_code(self, func, example_sig):
    """Print Python code that reproduces func for signatures shaped like example_sig.

    func is probed with a string whose i-th character has code point i; the
    resulting permutation of indices is rendered as a sum of index and slice
    expressions on ``s`` and written out with self.to_screen.
    """
    def gen_sig_code(idxs):
        def _genslice(start, end, step):
            starts = '' if start == 0 else str(start)
            ends = (':%d' % (end + step)) if end + step >= 0 else ':'
            steps = '' if step == 1 else (':%d' % step)
            return 's[%s%s%s]' % (starts, ends, steps)

        if not idxs:
            # Nothing to index; emit an empty string literal so that the
            # generated expression stays valid.
            yield "''"
            return

        step = None
        # Placeholder only - start is always assigned together with step
        start = '(Never used)'
        # Pre-bind the loop variable so a single-element list (where the
        # loop body never runs) is still handled after the loop.
        i = idxs[0]
        for i, prev in zip(idxs[1:], idxs[:-1]):
            if step is not None:
                if i - prev == step:
                    # Still inside the current run
                    continue
                # The run ended at prev; emit it and start over
                yield _genslice(start, prev, step)
                step = None
                continue
            if i - prev in [-1, 1]:
                # A new ascending or descending run starts at prev
                step = i - prev
                start = prev
                continue
            else:
                yield 's[%d]' % prev
        if step is None:
            yield 's[%d]' % i
        else:
            yield _genslice(start, i, step)

    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = func(test_string)
    cache_spec = [ord(c) for c in cache_res]
    expr_code = ' + '.join(gen_sig_code(cache_spec))
    signature_id_tuple = '(%s)' % (
        ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            ' return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)

1200

1201

def _parse_sig_js(self, jscode):
    """Return a callable running the signature function found in jscode.

    The name of the signature function is located with a list of regular
    expressions (tried via self._search_regex), the function is extracted
    with JSInterpreter, and a wrapper taking the signature string as its
    only argument is returned.
    """
    funcname = self._search_regex(
        (r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
         r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
         r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*c\s*&&\s*d\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?(?P<sig>[a-zA-Z0-9$]+)\(',
         r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
         # The parentheses after "d.set" and around the wrapper call must
         # be escaped literals; a bare "$" here would anchor at the end of
         # the input and make the pattern unmatchable.
         r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
        jscode, 'Initial JS player signature function name', group='sig')

    jsi = JSInterpreter(jscode)
    initial_function = jsi.extract_function(funcname)
    # The extracted function takes a list of arguments
    return lambda s: initial_function([s])

1213

1214

def _parse_sig_swf(self, file_contents):
    """Return a callable running the 'decipher' function of the SWF player.

    The function is taken from the 'SignatureDecipher' class extracted from
    file_contents with SWFInterpreter.
    """
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class('SignatureDecipher')
    decipher = interpreter.extract_function(decipher_class, 'decipher')
    # The extracted function takes a list of arguments
    return lambda s: decipher([s])

1220

1221

def _decrypt_signature(self, s, video_id, player_url, age_gate=False):

1222

"""Turn the encrypted s field into a working signature"""

1223

1224

if player_url is None:

1225

raise ExtractorError('Cannot decrypt signature without player_url')

1226

1227

if player_url.startswith('//'):

1228

player_url = 'https:' + player_url

1229

elif not re.match(r'https?://', player_url):

1230

player_url = compat_urlparse.urljoin(

1231

'https://www.youtube.com', player_url)

1232

try:

1233

player_id = (player_url, self._signature_cache_id(s))

1234

if player_id not in self._player_cache:

1235

func = self._extract_signature_function(

1236

video_id, player_url, s

1237

)

1238

self._player_cache[player_id] = func

1239

func = self._player_cache[player_id]

1240

if self._downloader.params.get('youtube_print_sig_code'):

1241

self._print_sig_code(func, s)

1242

return func(s)

1243

except Exception as e:

1244

tb = traceback.format_exc()

1245

raise ExtractorError(

1246

'Signature extraction failed: ' + tb, cause=e)

1247

1248

def _get_subtitles(self, video_id, webpage):
    """Return a dict mapping language codes to lists of subtitle formats.

    The track list is downloaded from the timedtext listing endpoint; for
    each language (first track wins) one entry per extension in
    self._SUBTITLE_FORMATS is built.  An empty dict is returned, after a
    warning, when the listing cannot be downloaded or has no tracks.
    """
    try:
        subs_doc = self._download_xml(
            'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
            video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
        return {}

    subtitles = {}
    for track in subs_doc.findall('track'):
        lang = track.attrib['lang_code']
        if lang in subtitles:
            # Keep only the first track seen for a language
            continue
        subtitles[lang] = [{
            'url': 'https://www.youtube.com/api/timedtext?' + compat_urllib_parse_urlencode({
                'lang': lang,
                'v': video_id,
                'fmt': ext,
                'name': track.attrib['name'].encode('utf-8'),
            }),
            'ext': ext,
        } for ext in self._SUBTITLE_FORMATS]

    if subtitles:
        return subtitles
    self._downloader.report_warning('video doesn\'t have subtitles')
    return {}

def _get_ytplayer_config(self, video_id, webpage):

1281

patterns = (

1282

# User data may contain arbitrary character sequences that may affect

1283

# JSON extraction with regex, e.g. when '};' is contained the second

1284

# regex won't capture the whole JSON. Yet working around by trying more

1285

# concrete regex first keeping in mind proper quoted string handling

1286

# to be implemented in future that will replace this workaround (see

1287

# https://github.com/ytdl-org/youtube-dl/issues/7468,

1288

# https://github.com/ytdl-org/youtube-dl/pull/7599)

1289

r';ytplayer\.config\s*=\s*({.+?});ytplayer',

1290

r';ytplayer\.config\s*=\s*({.+?});',

1291

)

1292

config = self._search_regex(

1293

patterns, webpage, 'ytplayer.config', default=None)

1294

if config:

1295

return self._parse_json(

1296

uppercase_escape(config), video_id, fatal=False)

1297

1298

def _get_automatic_captions(self, video_id, webpage):
    """Return a dict mapping language codes to automatic caption formats.

    The webpage is passed in because the caption URLs come from the
    player config embedded in it.  Three sources are tried in order: the
    'ttsurl' argument, the 'player_response' JSON, and the legacy
    'caption_tracks' / 'caption_translation_languages' arguments.  An
    empty dict is returned, after a warning, when nothing can be found.
    """
    self.to_screen('%s: Looking for automatic captions' % video_id)
    player_config = self._get_ytplayer_config(video_id, webpage)
    err_msg = 'Couldn\'t find automatic captions for %s' % video_id
    if not player_config:
        self._downloader.report_warning(err_msg)
        return {}
    try:
        args = player_config['args']
        tts_url = args.get('ttsurl')
        if tts_url:
            timestamp = args['timestamp']
            # Fetch the list of available subtitles
            list_url = tts_url + '&' + compat_urllib_parse_urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            caption_list = self._download_xml(list_url, video_id)
            source_track = caption_list.find('track')
            if source_track is None:
                self._downloader.report_warning('Video doesn\'t have automatic captions')
                return {}
            original_lang = source_track.attrib['lang_code']
            caption_kind = source_track.attrib.get('kind', '')

            translated = {}
            for target in caption_list.findall('target'):
                target_lang = target.attrib['lang_code']
                translated[target_lang] = [{
                    'url': tts_url + '&' + compat_urllib_parse_urlencode({
                        'lang': original_lang,
                        'tlang': target_lang,
                        'fmt': ext,
                        'ts': timestamp,
                        'kind': caption_kind,
                    }),
                    'ext': ext,
                } for ext in self._SUBTITLE_FORMATS]
            return translated

        def make_captions(sub_url, sub_langs):
            # Build one URL per (language, format) by overriding the
            # 'tlang' and 'fmt' query parameters of sub_url.
            url_parts = compat_urllib_parse_urlparse(sub_url)
            query = compat_parse_qs(url_parts.query)
            result = {}
            for code in sub_langs:
                variants = []
                for ext in self._SUBTITLE_FORMATS:
                    query['tlang'] = [code]
                    query['fmt'] = [ext]
                    variants.append({
                        'url': compat_urlparse.urlunparse(url_parts._replace(
                            query=compat_urllib_parse_urlencode(query, True))),
                        'ext': ext,
                    })
                result[code] = variants
            return result

        # New captions format as of 22.06.2017
        player_response = args.get('player_response')
        if player_response and isinstance(player_response, compat_str):
            player_response = self._parse_json(
                player_response, video_id, fatal=False)
            if player_response:
                renderer = player_response['captions']['playerCaptionsTracklistRenderer']
                base_url = renderer['captionTracks'][0]['baseUrl']
                lang_codes = []
                for entry in renderer['translationLanguages']:
                    code = entry.get('languageCode')
                    if code:
                        lang_codes.append(code)
                return make_captions(base_url, lang_codes)

        # Some videos don't provide ttsurl but rather caption_tracks and
        # caption_translation_languages (e.g. 20LmZk1hakA)
        # Not used anymore as of 22.06.2017
        caption_tracks = args['caption_tracks']
        caption_translation_languages = args['caption_translation_languages']
        legacy_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
        lang_codes = []
        for item in caption_translation_languages.split(','):
            item_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(item))
            code = item_qs.get('lc', [None])[0]
            if code:
                lang_codes.append(code)
        return make_captions(legacy_url, lang_codes)
    # An extractor error can be raised by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, IndexError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}

1399

1400

def _mark_watched(self, video_id, video_info, player_response):
    """Request the playback tracking URL so that the video is marked watched.

    The URL is taken from player_response
    (playbackTracking.videostatsPlaybackUrl.baseUrl) or, failing that, from
    video_info['videostats_playback_base_url'].  Nothing is done when no
    valid URL is found.  The request itself is non-fatal.
    """
    playback_url = url_or_none(try_get(
        player_response,
        lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']) or try_get(
        video_info, lambda x: x['videostats_playback_base_url'][0]))
    if not playback_url:
        return
    parsed_playback_url = compat_urlparse.urlparse(playback_url)
    qs = compat_urlparse.parse_qs(parsed_playback_url.query)

    # cpn generation algorithm is reverse engineered from base.js.
    # In fact it works even with dummy cpn.
    CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
    # random.choice picks uniformly; "randint(0, 256) & 63" is inclusive of
    # 256 and therefore slightly favours the first character.
    cpn = ''.join(random.choice(CPN_ALPHABET) for _ in range(16))

    qs.update({
        'ver': ['2'],
        'cpn': [cpn],
    })
    playback_url = compat_urlparse.urlunparse(
        parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))

    self._download_webpage(
        playback_url, video_id, 'Marking watched',
        'Unable to mark watched', fatal=False)

1425

1426

@staticmethod

1427

def _extract_urls(webpage):

1428

# Embedded YouTube player

1429

entries = [

1430

unescapeHTML(mobj.group('url'))

1431

for mobj in re.finditer(r'''(?x)

(?:

<iframe[^>]+?src=|

data-video-url=|

<embed[^>]+?src=|

embedSWF\(?:\s*|

<object[^>]+data=|

new\s+SWFObject\(

)

(["\'])

(?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/

1442

(?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)

1443

\1''', webpage)]

1444

1445

# lazyYT YouTube embed

1446

entries.extend(list(map(

1447

unescapeHTML,

1448

re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))))

1449

1450

# Wordpress "YouTube Video Importer" plugin

1451

matches = re.findall(r'''(?x)<div[^>]+

1452

class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+

1453

data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)

1454

entries.extend(m[-1] for m in matches)

return entries

@staticmethod
def _extract_url(webpage):
    """Return the first entry found by YoutubeIE._extract_urls, or None."""
    found = YoutubeIE._extract_urls(webpage)
    if not found:
        return None
    return found[0]

1462

1463

@classmethod
def extract_id(cls, url):
    """Return the video id, i.e. group 2 of cls._VALID_URL matched against url.

    Raises ExtractorError when url does not match.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)

1470

1471

def _extract_annotations(self, video_id):

1472

url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id

1473

return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')

1474

1475

@staticmethod

1476

def _extract_chapters(description, duration):

1477

if not description:

1478

return None

1479

chapter_lines = re.findall(

1480

r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',

1481

description)

1482

if not chapter_lines:

1483

return None

1484

chapters = []

1485

for next_num, (chapter_line, time_point) in enumerate(

1486

chapter_lines, start=1):

1487

start_time = parse_duration(time_point)

1488

if start_time is None:

1489

continue

1490

if start_time > duration:

1491

break

1492

end_time = (duration if next_num == len(chapter_lines)

1493

else parse_duration(chapter_lines[next_num][1]))

1494

if end_time is None:

1495

continue

1496

if end_time > duration:

1497

end_time = duration

1498

if start_time > end_time:

1499

break

1500

chapter_title = re.sub(

1501

r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')

1502

chapter_title = re.sub(r'\s+', ' ', chapter_title)

1503

chapters.append({

1504

'start_time': start_time,

1505

'end_time': end_time,

1506

'title': chapter_title,

})

return chapters

def _real_extract(self, url):

1511

url, smuggled_data = unsmuggle_url(url, {})

1512

1513

proto = (

1514

'http' if self._downloader.params.get('prefer_insecure', False)

else 'https')

start_time = None

end_time = None

parsed_url = compat_urllib_parse_urlparse(url)

1520

for component in [parsed_url.fragment, parsed_url.query]:

1521

query = compat_parse_qs(component)

1522

if start_time is None and 't' in query:

1523

start_time = parse_duration(query['t'][0])

1524

if start_time is None and 'start' in query:

1525

start_time = parse_duration(query['start'][0])

1526

if end_time is None and 'end' in query:

1527

end_time = parse_duration(query['end'][0])

1528

1529

# Extract original video URL from URL with redirection, like age verification, using next_url parameter

1530

mobj = re.search(self._NEXT_URL_RE, url)

1531

if mobj:

1532

url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')

1533

video_id = self.extract_id(url)

1534

1535

# Get video webpage

1536

url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id

1537

video_webpage = self._download_webpage(url, video_id)

1538

1539

# Attempt to extract SWF player URL

1540

mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)

1541

if mobj is not None:

1542

player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

else:

player_url = None

dash_mpds = []

def add_dash_mpd(video_info):

1549

dash_mpd = video_info.get('dashmpd')

1550

if dash_mpd and dash_mpd[0] not in dash_mpds:

1551

dash_mpds.append(dash_mpd[0])

1552

1553

def add_dash_mpd_pr(pl_response):

1554

dash_mpd = url_or_none(try_get(

1555

pl_response, lambda x: x['streamingData']['dashManifestUrl'],

1556

compat_str))

1557

if dash_mpd and dash_mpd not in dash_mpds:

1558

dash_mpds.append(dash_mpd)

is_live = None

view_count = None

def extract_view_count(v_info):

1564

return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))

player_response = {}

# Get video info

embed_webpage = None

if re.search(r'player-age-gate-content">', video_webpage) is not None:

1571

age_gate = True

1572

# We simulate the access to the video from www.youtube.com/v/{video_id}

1573

# this can be viewed without login into Youtube

1574

url = proto + '://www.youtube.com/embed/%s' % video_id

1575

embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')

1576

data = compat_urllib_parse_urlencode({

1577

'video_id': video_id,

1578

'eurl': 'https://youtube.googleapis.com/v/' + video_id,

1579

'sts': self._search_regex(

1580

r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),

1581

})

1582

video_info_url = proto + '://www.youtube.com/get_video_info?' + data

1583

video_info_webpage = self._download_webpage(

1584

video_info_url, video_id,

1585

note='Refetching age-gated info webpage',

1586

errnote='unable to download video info webpage')

1587

video_info = compat_parse_qs(video_info_webpage)

1588

add_dash_mpd(video_info)

else:

age_gate = False

video_info = None

sts = None

# Try looking directly into the video webpage

1594

ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)

1595

if ytplayer_config:

1596

args = ytplayer_config['args']

1597

if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):

1598

# Convert to the same format returned by compat_parse_qs

1599

video_info = dict((k, [v]) for k, v in args.items())

1600

add_dash_mpd(video_info)

1601

# Rental video is not rented but preview is available (e.g.

1602

# https://www.youtube.com/watch?v=yYr8q0y5Jfg,

1603

# https://github.com/ytdl-org/youtube-dl/issues/10532)

1604

if not video_info and args.get('ypc_vid'):

1605

return self.url_result(

1606

args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])

1607

if args.get('livestream') == '1' or args.get('live_playback') == 1:

1608

is_live = True

1609

sts = ytplayer_config.get('sts')

1610

if not player_response:

1611

pl_response = str_or_none(args.get('player_response'))

1612

if pl_response:

1613

pl_response = self._parse_json(pl_response, video_id, fatal=False)

1614

if isinstance(pl_response, dict):

1615

player_response = pl_response

1616

if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):

1617

add_dash_mpd_pr(player_response)

1618

# We also try looking in get_video_info since it may contain different dashmpd

1619

# URL that points to a DASH manifest with possibly different itag set (some itags

1620

# are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH

1621

# manifest pointed by get_video_info's dashmpd).

1622

# The general idea is to take a union of itags of both DASH manifests (for example

1623

# video with such 'manifest behavior' see https://github.com/ytdl-org/youtube-dl/issues/6093)

1624

self.report_video_info_webpage_download(video_id)

1625

for el in ('info', 'embedded', 'detailpage', 'vevo', ''):

1626

query = {

1627

'video_id': video_id,

'ps': 'default',

'eurl': '',

'gl': 'US',

'hl': 'en',

}

if el:

query['el'] = el

if sts:

query['sts'] = sts

video_info_webpage = self._download_webpage(

1638

'%s://www.youtube.com/get_video_info' % proto,

1639

video_id, note=False,

1640

errnote='unable to download video info webpage',

1641

fatal=False, query=query)

1642

if not video_info_webpage:

1643

continue

1644

get_video_info = compat_parse_qs(video_info_webpage)

1645

if not player_response:

1646

pl_response = get_video_info.get('player_response', [None])[0]

1647

if isinstance(pl_response, dict):

1648

player_response = pl_response

1649

add_dash_mpd_pr(player_response)

1650

add_dash_mpd(get_video_info)

1651

if view_count is None:

1652

view_count = extract_view_count(get_video_info)

1653

if not video_info:

1654

video_info = get_video_info

1655

get_token = get_video_info.get('token') or get_video_info.get('account_playback_token')

1656

if get_token:

1657

# Different get_video_info requests may report different results, e.g.

1658

# some may report video unavailability, but some may serve it without

1659

# any complaint (see https://github.com/ytdl-org/youtube-dl/issues/7362,

1660

# the original webpage as well as el=info and el=embedded get_video_info

1661

# requests report video unavailability due to geo restriction while

1662

# el=detailpage succeeds and returns valid data). This is probably

1663

# due to YouTube measures against IP ranges of hosting providers.

1664

# Working around by preferring the first succeeded video_info containing

1665

# the token if no such video_info yet was found.

1666

token = video_info.get('token') or video_info.get('account_playback_token')

1667

if not token:

1668

video_info = get_video_info

1669

break

1670

1671

def extract_unavailable_message():

1672

return self._html_search_regex(

1673

r'(?s)<h1[^>]+id="unavailable-message"[^>]*>(.+?)</h1>',

1674

video_webpage, 'unavailable message', default=None)

1675

1676

if not video_info:

1677

unavailable_message = extract_unavailable_message()

1678

if not unavailable_message:

1679

unavailable_message = 'Unable to extract video data'

1680

raise ExtractorError(

1681

'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id)

1682

1683

token = video_info.get('token') or video_info.get('account_playback_token')

1684

if not token:

1685

if 'reason' in video_info:

1686

if 'The uploader has not made this video available in your country.' in video_info['reason']:

1687

regions_allowed = self._html_search_meta(

1688

'regionsAllowed', video_webpage, default=None)

1689

countries = regions_allowed.split(',') if regions_allowed else None

1690

self.raise_geo_restricted(

1691

msg=video_info['reason'][0], countries=countries)

1692

reason = video_info['reason'][0]

1693

if 'Invalid parameters' in reason:

1694

unavailable_message = extract_unavailable_message()

1695

if unavailable_message:

1696

reason = unavailable_message

1697

raise ExtractorError(

1698

'YouTube said: %s' % reason,

1699

expected=True, video_id=video_id)

1700

else:

1701

raise ExtractorError(

1702

'"token" parameter not in video info for unknown reason',

1703

video_id=video_id)

1704

1705

if video_info.get('license_info'):

1706

raise ExtractorError('This video is DRM protected.', expected=True)

1707

1708

video_details = try_get(

1709

player_response, lambda x: x['videoDetails'], dict) or {}

1710

1711

# title

1712

if 'title' in video_info:

1713

video_title = video_info['title'][0]

1714

elif 'title' in player_response:

1715

video_title = video_details['title']

1716

else:

1717

self._downloader.report_warning('Unable to extract video title')

video_title = '_'

# description

description_original = video_description = get_element_by_id("eow-description", video_webpage)

1722

if video_description:

1723

1724

def replace_url(m):

1725

redir_url = compat_urlparse.urljoin(url, m.group(1))

1726

parsed_redir_url = compat_urllib_parse_urlparse(redir_url)

1727

if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':

1728

qs = compat_parse_qs(parsed_redir_url.query)

q = qs.get('q')

if q and q[0]:

return q[0]

return redir_url

description_original = video_description = re.sub(r'''(?x)

1735

<a\s+

1736

(?:[a-zA-Z-]+="[^"]*"\s+)*?

1737

(?:title|href)="([^"]+)"\s+

1738

(?:[a-zA-Z-]+="[^"]*"\s+)*?

class="[^"]*"[^>]*>

[^<]+\.{3}\s*

</a>

''', replace_url, video_description)

1743

video_description = clean_html(video_description)

1744

else:

1745

fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)

1746

if fd_mobj:

1747

video_description = unescapeHTML(fd_mobj.group(1))

1748

else:

1749

video_description = ''

1750

1751

if not smuggled_data.get('force_singlefeed', False):

1752

if not self._downloader.params.get('noplaylist'):

1753

multifeed_metadata_list = try_get(

1754

player_response,

1755

lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],

1756

compat_str) or try_get(

1757

video_info, lambda x: x['multifeed_metadata_list'][0], compat_str)

1758

if multifeed_metadata_list:

1759

entries = []

1760

feed_ids = []

1761

for feed in multifeed_metadata_list.split(','):

1762

# Unquote should take place before split on comma (,) since textual

1763

# fields may contain comma as well (see

1764

# https://github.com/ytdl-org/youtube-dl/issues/8536)

1765

feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))

1766

entries.append({

1767

'_type': 'url_transparent',

1768

'ie_key': 'Youtube',

1769

'url': smuggle_url(

1770

'%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),

1771

{'force_singlefeed': True}),

1772

'title': '%s (%s)' % (video_title, feed_data['title'][0]),

1773

})

1774

feed_ids.append(feed_data['id'][0])

1775

self.to_screen(

1776

'Downloading multifeed video (%s) - add --no-playlist to just download video %s'

1777

% (', '.join(feed_ids), video_id))

1778

return self.playlist_result(entries, video_id, video_title, video_description)

1779

else:

1780

self.to_screen('Downloading just video %s because of --no-playlist' % video_id)

1781

1782

if view_count is None:

1783

view_count = extract_view_count(video_info)

1784

if view_count is None and video_details:

1785

view_count = int_or_none(video_details.get('viewCount'))

1786

1787

# Check for "rental" videos

1788

if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:

1789

raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True)

1790

1791

def _extract_filesize(media_url):

1792

return int_or_none(self._search_regex(

1793

r'\bclen[=/](\d+)', media_url, 'filesize', default=None))

1794

1795

if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):

1796

self.report_rtmp_download()

1797

formats = [{

1798

'format_id': '_rtmp',

1799

'protocol': 'rtmp',

1800

'url': video_info['conn'][0],

1801

'player_url': player_url,

1802

}]

1803

elif not is_live and (len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):

1804

encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]

1805

if 'rtmpe%3Dyes' in encoded_url_map:

1806

raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True)

1807

formats_spec = {}

1808

fmt_list = video_info.get('fmt_list', [''])[0]

1809

if fmt_list:

1810

for fmt in fmt_list.split(','):

1811

spec = fmt.split('/')

1812

if len(spec) > 1:

1813

width_height = spec[1].split('x')

1814

if len(width_height) == 2:

1815

formats_spec[spec[0]] = {

1816

'resolution': spec[1],

1817

'width': int_or_none(width_height[0]),

1818

'height': int_or_none(width_height[1]),

1819

}

1820

q = qualities(['small', 'medium', 'hd720'])

1821

streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list)

1822

if streaming_formats:

1823

for fmt in streaming_formats:

1824

itag = str_or_none(fmt.get('itag'))

1825

if not itag:

1826

continue

1827

quality = fmt.get('quality')

1828

quality_label = fmt.get('qualityLabel') or quality

1829

formats_spec[itag] = {

1830

'asr': int_or_none(fmt.get('audioSampleRate')),

1831

'filesize': int_or_none(fmt.get('contentLength')),

1832

'format_note': quality_label,

1833

'fps': int_or_none(fmt.get('fps')),

1834

'height': int_or_none(fmt.get('height')),

1835

'quality': q(quality),

1836

# bitrate for itag 43 is always 2147483647

1837

'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,

1838

'width': int_or_none(fmt.get('width')),

1839

}

1840

formats = []

1841

for url_data_str in encoded_url_map.split(','):

1842

url_data = compat_parse_qs(url_data_str)

1843

if 'itag' not in url_data or 'url' not in url_data:

1844

continue

1845

stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))

1846

# Unsupported FORMAT_STREAM_TYPE_OTF

1847

if stream_type == 3:

1848

continue

1849

format_id = url_data['itag'][0]

1850

url = url_data['url'][0]

1851

1852

if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):

1853

ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'

1854

jsplayer_url_json = self._search_regex(

1855

ASSETS_RE,

1856

embed_webpage if age_gate else video_webpage,

1857

'JS player URL (1)', default=None)

1858

if not jsplayer_url_json and not age_gate:

1859

# We need the embed website after all

1860

if embed_webpage is None:

1861

embed_url = proto + '://www.youtube.com/embed/%s' % video_id

1862

embed_webpage = self._download_webpage(

1863

embed_url, video_id, 'Downloading embed webpage')

1864

jsplayer_url_json = self._search_regex(

1865

ASSETS_RE, embed_webpage, 'JS player URL')

1866

1867

player_url = json.loads(jsplayer_url_json)

1868

if player_url is None:

1869

player_url_json = self._search_regex(

1870

r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',

1871

video_webpage, 'age gate player URL')

1872

player_url = json.loads(player_url_json)

1873

1874

if 'sig' in url_data:

1875

url += '&signature=' + url_data['sig'][0]

1876

elif 's' in url_data:

1877

encrypted_sig = url_data['s'][0]

1878

1879

if self._downloader.params.get('verbose'):

1880

if player_url is None:

1881

player_version = 'unknown'

1882

player_desc = 'unknown'

1883

else:

1884

if player_url.endswith('swf'):

1885

player_version = self._search_regex(

1886

r'-(.+?)(?:/watch_as3)?\.swf$', player_url,

1887

'flash player', fatal=False)

1888

player_desc = 'flash player %s' % player_version

1889

else:

1890

player_version = self._search_regex(

1891

[r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js',

1892

r'(?:www|player(?:_ias)?)-([^/]+)(?:/[a-z]{2,3}_[A-Z]{2})?/base\.js'],

1893

player_url,

1894

'html5 player', fatal=False)

1895

player_desc = 'html5 player %s' % player_version

1896

1897

parts_sizes = self._signature_cache_id(encrypted_sig)

1898

self.to_screen('{%s} signature length %s, %s' %

1899

(format_id, parts_sizes, player_desc))

1900

1901

signature = self._decrypt_signature(

1902

encrypted_sig, video_id, player_url, age_gate)

1903

url += '&signature=' + signature

1904

if 'ratebypass' not in url:

1905

url += '&ratebypass=yes'

1906

1907

dct = {

1908

'format_id': format_id,

1909

'url': url,

1910

'player_url': player_url,

1911

}

1912

if format_id in self._formats:

1913

dct.update(self._formats[format_id])

1914

if format_id in formats_spec:

1915

dct.update(formats_spec[format_id])

1916

1917

# Some itags are not included in DASH manifest thus corresponding formats will

1918

# lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993).

1919

# Trying to extract metadata from url_encoded_fmt_stream_map entry.

1920

mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])

1921

width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)

1922

1923

filesize = int_or_none(url_data.get(

1924

'clen', [None])[0]) or _extract_filesize(url)

1925

1926

quality = url_data.get('quality', [None])[0]

1927

1928

more_fields = {

1929

'filesize': filesize,

1930

'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000),

1931

'width': width,

1932

'height': height,

1933

'fps': int_or_none(url_data.get('fps', [None])[0]),

1934

'format_note': url_data.get('quality_label', [None])[0] or quality,

1935

'quality': q(quality),

1936

}

1937

for key, value in more_fields.items():

1938

if value:

1939

dct[key] = value

1940

type_ = url_data.get('type', [None])[0]

1941

if type_:

1942

type_split = type_.split(';')

1943

kind_ext = type_split[0].split('/')

1944

if len(kind_ext) == 2:

1945

kind, _ = kind_ext

1946

dct['ext'] = mimetype2ext(type_split[0])

1947

if kind in ('audio', 'video'):

1948

codecs = None

1949

for mobj in re.finditer(

1950

r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):

1951

if mobj.group('key') == 'codecs':

1952

codecs = mobj.group('val')

1953

break

1954

if codecs:

1955

dct.update(parse_codecs(codecs))

1956

if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':

1957

dct['downloader_options'] = {

1958

# Youtube throttles chunks >~10M

1959

'http_chunk_size': 10485760,

}

formats.append(dct)

else:

manifest_url = (

url_or_none(try_get(

player_response,

lambda x: x['streamingData']['hlsManifestUrl'],

1967

compat_str)) or

1968

url_or_none(try_get(

1969

video_info, lambda x: x['hlsvp'][0], compat_str)))

1970

if manifest_url:

1971

formats = []

1972

m3u8_formats = self._extract_m3u8_formats(

1973

manifest_url, video_id, 'mp4', fatal=False)

1974

for a_format in m3u8_formats:

1975

itag = self._search_regex(

1976

r'/itag/(\d+)/', a_format['url'], 'itag', default=None)

1977

if itag:

1978

a_format['format_id'] = itag

1979

if itag in self._formats:

1980

dct = self._formats[itag].copy()

1981

dct.update(a_format)

1982

a_format = dct

1983

a_format['player_url'] = player_url

1984

# Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming

1985

a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'

1986

formats.append(a_format)

1987

else:

1988

error_message = clean_html(video_info.get('reason', [None])[0])

1989

if not error_message:

1990

error_message = extract_unavailable_message()

1991

if error_message:

1992

raise ExtractorError(error_message, expected=True)

1993

raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info')

1994

1995

# uploader

1996

video_uploader = try_get(

1997

video_info, lambda x: x['author'][0],

1998

compat_str) or str_or_none(video_details.get('author'))

1999

if video_uploader:

2000

video_uploader = compat_urllib_parse_unquote_plus(video_uploader)

2001

else:

2002

self._downloader.report_warning('unable to extract uploader name')

2003

2004

# uploader_id

2005

video_uploader_id = None

2006

video_uploader_url = None

2007

mobj = re.search(

2008

r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',

2009

video_webpage)

2010

if mobj is not None:

2011

video_uploader_id = mobj.group('uploader_id')

2012

video_uploader_url = mobj.group('uploader_url')

2013

else:

2014

self._downloader.report_warning('unable to extract uploader nickname')

2015

2016

channel_id = self._html_search_meta(

2017

'channelId', video_webpage, 'channel id')

2018

channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None

2019

2020

# thumbnail image

2021

# We try first to get a high quality image:

2022

m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',

2023

video_webpage, re.DOTALL)

2024

if m_thumb is not None:

2025

video_thumbnail = m_thumb.group(1)

2026

elif 'thumbnail_url' not in video_info:

2027

self._downloader.report_warning('unable to extract video thumbnail')

2028

video_thumbnail = None

2029

else: # don't panic if we can't find it

2030

video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])

2031

2032

# upload date

2033

upload_date = self._html_search_meta(

2034

'datePublished', video_webpage, 'upload date', default=None)

2035

if not upload_date:

2036

upload_date = self._search_regex(

2037

[r'(?s)id="eow-date.*?>(.*?)</span>',

2038

r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],

2039

video_webpage, 'upload date', default=None)

2040

upload_date = unified_strdate(upload_date)

2041

2042

video_license = self._html_search_regex(

2043

r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',

2044

video_webpage, 'license', default=None)

m_music = re.search(

r'''(?x)

<h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*

<ul[^>]*>\s*

<li>(?P<title>.+?)

by (?P<creator>.+?)

(?:

$.+?$|

<a[^>]*

(?:

\bhref=["\']/red[^>]*>| # drop possible

2057

>\s*Listen ad-free with YouTube Red # YouTube Red ad

)

.*?

)?</li

''',

video_webpage)

if m_music:

video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))

2065

video_creator = clean_html(m_music.group('creator'))

2066

else:

2067

video_alt_title = video_creator = None

2068

2069

def extract_meta(field):

2070

return self._html_search_regex(

2071

r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,

2072

video_webpage, field, default=None)

2073

2074

track = extract_meta('Song')

2075

artist = extract_meta('Artist')

2076

2077

m_episode = re.search(

2078

r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',

2079

video_webpage)

2080

if m_episode:

2081

series = unescapeHTML(m_episode.group('series'))

2082

season_number = int(m_episode.group('season'))

2083

episode_number = int(m_episode.group('episode'))

2084

else:

2085

series = season_number = episode_number = None

2086

2087

m_cat_container = self._search_regex(

2088

r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',

2089

video_webpage, 'categories', default=None)

2090

if m_cat_container:

2091

category = self._html_search_regex(

2092

r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',

2093

default=None)

2094

video_categories = None if category is None else [category]

2095

else:

2096

video_categories = None

2097

2098

video_tags = [

2099

unescapeHTML(m.group('content'))

2100

for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]

2101

2102

def _extract_count(count_name):

2103

return str_to_int(self._search_regex(

2104

r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'

2105

% re.escape(count_name),

2106

video_webpage, count_name, default=None))

2107

2108

like_count = _extract_count('like')

2109

dislike_count = _extract_count('dislike')

2110

2111

if view_count is None:

2112

view_count = str_to_int(self._search_regex(

2113

r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage,

2114

'view count', default=None))

2115

2116

# subtitles

2117

video_subtitles = self.extract_subtitles(video_id, video_webpage)

2118

automatic_captions = self.extract_automatic_captions(video_id, video_webpage)

2119

2120

video_duration = try_get(

2121

video_info, lambda x: int_or_none(x['length_seconds'][0]))

2122

if not video_duration:

2123

video_duration = int_or_none(video_details.get('lengthSeconds'))

2124

if not video_duration:

2125

video_duration = parse_duration(self._html_search_meta(

2126

'duration', video_webpage, 'video duration'))

2127

2128

# annotations

2129

video_annotations = None

2130

if self._downloader.params.get('writeannotations', False):

2131

video_annotations = self._extract_annotations(video_id)

2132

2133

chapters = self._extract_chapters(description_original, video_duration)

2134

2135

# Look for the DASH manifest

2136

if self._downloader.params.get('youtube_include_dash_manifest', True):

2137

dash_mpd_fatal = True

2138

for mpd_url in dash_mpds:

2139

dash_formats = {}

2140

try:

2141

def decrypt_sig(mobj):

2142

s = mobj.group(1)

2143

dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)

2144

return '/signature/%s' % dec_s

2145

2146

mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)

2147

2148

for df in self._extract_mpd_formats(

2149

mpd_url, video_id, fatal=dash_mpd_fatal,

2150

formats_dict=self._formats):

2151

if not df.get('filesize'):

2152

df['filesize'] = _extract_filesize(df['url'])

2153

# Do not overwrite DASH format found in some previous DASH manifest

2154

if df['format_id'] not in dash_formats:

2155

dash_formats[df['format_id']] = df

2156

# Additional DASH manifests may end up in HTTP Error 403 therefore

2157

# allow them to fail without bug report message if we already have

2158

# some DASH manifest succeeded. This is temporary workaround to reduce

2159

# burst of bug reports until we figure out the reason and whether it

2160

# can be fixed at all.

2161

dash_mpd_fatal = False

2162

except (ExtractorError, KeyError) as e:

2163

self.report_warning(

2164

'Skipping DASH manifest: %r' % e, video_id)

2165

if dash_formats:

2166

# Remove the formats we found through non-DASH, they

2167

# contain less info and it can be wrong, because we use

2168

# fixed values (for example the resolution). See

2169

# https://github.com/ytdl-org/youtube-dl/issues/5774 for an

2170

# example.

2171

formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]

2172

formats.extend(dash_formats.values())

2173

2174

# Check for malformed aspect ratio

2175

stretched_m = re.search(

2176

r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',

2177

video_webpage)

2178

if stretched_m:

2179

w = float(stretched_m.group('w'))

2180

h = float(stretched_m.group('h'))

2181

# yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).

2182

# We will only process correct ratios.

if w > 0 and h > 0:

ratio = w / h

for f in formats:

if f.get('vcodec') != 'none':

2187

f['stretched_ratio'] = ratio

2188

2189

self._sort_formats(formats)

2190

2191

self.mark_watched(video_id, video_info, player_response)

return {

'id': video_id,

'uploader': video_uploader,

2196

'uploader_id': video_uploader_id,

2197

'uploader_url': video_uploader_url,

2198

'channel_id': channel_id,

2199

'channel_url': channel_url,

2200

'upload_date': upload_date,

2201

'license': video_license,

2202

'creator': video_creator or artist,

2203

'title': video_title,

2204

'alt_title': video_alt_title or track,

2205

'thumbnail': video_thumbnail,

2206

'description': video_description,

2207

'categories': video_categories,

2208

'tags': video_tags,

2209

'subtitles': video_subtitles,

2210

'automatic_captions': automatic_captions,

2211

'duration': video_duration,

2212

'age_limit': 18 if age_gate else 0,

2213

'annotations': video_annotations,

2214

'chapters': chapters,

2215

'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,

2216

'view_count': view_count,

2217

'like_count': like_count,

2218

'dislike_count': dislike_count,

2219

'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),

2220

'formats': formats,

2221

'is_live': is_live,

2222

'start_time': start_time,

2223

'end_time': end_time,

2224

'series': series,

2225

'season_number': season_number,

2226

'episode_number': episode_number,

'track': track,

'artist': artist,

}

class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):

2233

IE_DESC = 'YouTube.com playlists'

2234

_VALID_URL = r"""(?x)(?:

(?:https?://)?

(?:\w+\.)?

(?:

(?:

youtube\.com|

invidio\.us

)

/

(?:

(?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11}))

2245

\? (?:.*?[&;])*? (?:p|a|list)=

2246

| p/

2247

)|

2248

youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist=

2249

)

2250

(

2251

(?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)?[0-9A-Za-z-_]{10,}

2252

# Top tracks, they can also include dots

|(?:MC)[\w\.]*

)

.*

|

(%(playlist_id)s)

)""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}

2259

_TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'

2260

_VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?'

2261

IE_NAME = 'youtube:playlist'

2262

_TESTS = [{

2263

'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',

2264

'info_dict': {

2265

'title': 'ytdl test PL',

2266

'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',

},

'playlist_count': 3,

}, {

'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',

2271

'info_dict': {

2272

'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',

2273

'title': 'YDL_Empty_List',

2274

},

2275

'playlist_count': 0,

2276

'skip': 'This playlist is private',

2277

}, {

2278

'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',

2279

'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',

2280

'info_dict': {

2281

'title': '29C3: Not my department',

2282

'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',

2283

},

2284

'playlist_count': 95,

2285

}, {

2286

'note': 'issue #673',

2287

'url': 'PLBB231211A4F62143',

2288

'info_dict': {

2289

'title': '[OLD]Team Fortress 2 (Class-based LP)',

2290

'id': 'PLBB231211A4F62143',

2291

},

2292

'playlist_mincount': 26,

2293

}, {

2294

'note': 'Large playlist',

2295

'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',

2296

'info_dict': {

2297

'title': 'Uploads from Cauchemar',

2298

'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',

2299

},

2300

'playlist_mincount': 799,

2301

}, {

2302

'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',

2303

'info_dict': {

2304

'title': 'YDL_safe_search',

2305

'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',

2306

},

2307

'playlist_count': 2,

2308

'skip': 'This playlist is private',

2309

}, {

2310

'note': 'embedded',

2311

'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',

'playlist_count': 4,

'info_dict': {

'title': 'JODA15',

'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',

2316

}

2317

}, {

2318

'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',

2319

'playlist_mincount': 485,

2320

'info_dict': {

2321

'title': '2017 華語最新單曲 (2/24更新)',

2322

'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',

2323

}

2324

}, {

2325

'note': 'Embedded SWF player',

2326

'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',

'playlist_count': 4,

'info_dict': {

'title': 'JODA7',

'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',

2331

}

2332

}, {

2333

'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',

2334

'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',

2335

'info_dict': {

2336

'title': 'Uploads from Interstellar Movie',

2337

'id': 'UUXw-G3eDE9trcvY2sBMM_aA',

2338

},

2339

'playlist_mincount': 21,

2340

}, {

2341

# Playlist URL that does not actually serve a playlist

2342

'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',

'info_dict': {

'id': 'FqZTN594JQw',

'ext': 'webm',

'title': "Smiley's People 01 detective, Adventure Series, Action",

2347

'uploader': 'STREEM',

2348

'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',

2349

'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',

2350

'upload_date': '20150526',

2351

'license': 'Standard YouTube License',

2352

'description': 'md5:507cdcb5a49ac0da37a920ece610be80',

2353

'categories': ['People & Blogs'],

'tags': list,

'view_count': int,

'like_count': int,

'dislike_count': int,

2358

},

2359

'params': {

2360

'skip_download': True,

2361

},

2362

'add_ie': [YoutubeIE.ie_key()],

2363

}, {

2364

'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',

'info_dict': {

'id': 'yeWKywCrFtk',

'ext': 'mp4',

'title': 'Small Scale Baler and Braiding Rugs',

2369

'uploader': 'Backus-Page House Museum',

2370

'uploader_id': 'backuspagemuseum',

2371

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',

2372

'upload_date': '20161008',

2373

'license': 'Standard YouTube License',

2374

'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',

2375

'categories': ['Nonprofits & Activism'],

2376

'tags': list,

2377

'like_count': int,

2378

'dislike_count': int,

},

'params': {

'noplaylist': True,

'skip_download': True,

2383

},

2384

}, {

2385

'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',

2386

'only_matching': True,

2387

}, {

2388

'url': 'TLGGrESM50VT6acwMjAyMjAxNw',

2389

'only_matching': True,

2390

}, {

2391

# music album playlist

2392

'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',

2393

'only_matching': True,

2394

}, {

2395

'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU',

2396

'only_matching': True,

2397

}]

2398

2399

def _real_initialize(self):
    """Initialize the extractor by running its login routine.

    The whole job is delegated to ``self._login()``; nothing is returned.
    """
    self._login()

2401

2402

def _extract_mix(self, playlist_id):
    """Collect the videos of a mix and return them as a playlist result.

    Watch pages of the mix are downloaded one after another, each time
    starting from the last video id collected so far, and the video ids
    linked from each page are accumulated in order. Paging stops as soon
    as a page contributes no id that has not been seen before.

    :param playlist_id: id of the mix; its last 11 characters are used as
        the id of the first video to open.
    :returns: ``self.playlist_result(...)`` built from the collected ids
        and a title taken from the last downloaded page.
    """
    # The mixes are generated from a single video: the id of the playlist
    # is just 'RD' + video_id.
    collected_ids = []
    cursor_id = playlist_id[-11:]
    page_no = 1
    while True:
        watch_url = 'https://youtube.com/watch?v=%s&list=%s' % (cursor_id, playlist_id)
        webpage = self._download_webpage(
            watch_url, playlist_id,
            'Downloading page {0} of Youtube mix'.format(page_no))
        # NOTE(review): this listing may have been HTML-unescaped; the '&'
        # after the video id group could be '&amp;' upstream - confirm.
        linked_ids = orderedSet(re.findall(
            r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id),
            webpage))
        # Fetch new pages until all the videos are repeated, it seems that
        # there are always 51 unique videos.
        unseen_ids = [vid for vid in linked_ids if vid not in collected_ids]
        if not unseen_ids:
            break
        collected_ids += unseen_ids
        cursor_id = collected_ids[-1]
        page_no += 1

    url_results = self._ids_to_results(collected_ids)

    # Try the candidate title classes in order and keep the first lookup
    # that yields something; if none does, the last lookup's result is kept.
    title_span = None
    for class_name in ('playlist-title', 'title long-title', 'title'):
        title_span = get_element_by_attribute('class', class_name, webpage)
        if title_span:
            break
    title = clean_html(title_span)

    return self.playlist_result(url_results, playlist_id, title)

2433

2434

def _extract_playlist(self, playlist_id):
    """Download a playlist page and turn it into a playlist result.

    :param playlist_id: id substituted into ``self._TEMPLATE_URL``.
    :returns: a ``(has_videos, playlist)`` tuple. ``playlist`` is the
        ``self.playlist_result(...)`` value extended with ``uploader``,
        ``uploader_id`` and ``uploader_url``. ``has_videos`` is ``False``
        only when no playlist title was found and ``self._entries`` yields
        nothing for the page.
    :raises ExtractorError: when an alert on the page says that the
        playlist does not exist or is private, or reports invalid
        parameters.
    """
    url = self._TEMPLATE_URL % playlist_id
    page = self._download_webpage(url, playlist_id)

    # the yt-alert-message now has tabindex attribute (see https://github.com/ytdl-org/youtube-dl/issues/11604)
    for raw_alert in re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page):
        alert = raw_alert.strip()
        # Check if the playlist exists or is private
        unavailable = re.match(
            r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', alert)
        if unavailable:
            reason = unavailable.group('reason')
            message = 'This playlist %s' % reason
            if 'private' in reason:
                message += ', use --username or --netrc to access it'
            raise ExtractorError(message + '.', expected=True)
        if re.match(r'[^<]*Invalid parameters[^<]*', alert):
            raise ExtractorError(
                'Invalid parameters. Maybe URL is incorrect.',
                expected=True)
        if re.match(r'[^<]*Choose your language[^<]*', alert):
            # This alert is deliberately ignored.
            continue
        self.report_warning('Youtube gives an alert message: ' + alert)

    playlist_title = self._html_search_regex(
        r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',
        page, 'title', default=None)

    # Common prefix of both uploader patterns: the link in the first item
    # of the header details list.
    _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref='
    uploader = self._search_regex(
        r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE,
        page, 'uploader', default=None)
    uploader_link = re.search(
        r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % _UPLOADER_BASE,
        page)
    uploader_id = uploader_url = None
    if uploader_link:
        uploader_id = uploader_link.group('uploader_id')
        uploader_url = compat_urlparse.urljoin(url, uploader_link.group('path'))

    if playlist_title:
        has_videos = True
    else:
        # Some playlist URLs don't actually serve a playlist (e.g.
        # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4)
        # so probe for a first entry; a private sentinel distinguishes
        # "no entry at all" from any value an entry could have.
        nothing = object()
        has_videos = next(self._entries(page, playlist_id), nothing) is not nothing

    playlist = self.playlist_result(
        self._entries(page, playlist_id), playlist_id, playlist_title)
    uploader_fields = {
        'uploader': uploader,
        'uploader_id': uploader_id,
        'uploader_url': uploader_url,
    }
    playlist.update(uploader_fields)

    return has_videos, playlist

2495

2496

def _check_download_just_video(self, url, playlist_id):
    """Decide whether a playlist URL should be handled as a single video.

    The video id is read from the ``v`` query parameter, falling back to
    a youtu.be / embed style path in the URL.

    :param url: URL handed to the playlist extractor.
    :param playlist_id: playlist id, used only in the progress message.
    :returns: ``(video_id, result)``. ``(None, None)`` when the URL names
        no video; ``(video_id, None)`` when it names one but the
        ``noplaylist`` parameter is not set; ``(video_id, url_result)``
        when ``noplaylist`` is set.
    """
    # Check if it's a video-specific URL
    query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
    video_id = query.get('v', [None])[0]
    if not video_id:
        video_id = self._search_regex(
            r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url,
            'video id', default=None)
    if not video_id:
        return None, None

    if not self._downloader.params.get('noplaylist'):
        self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
        return video_id, None

    self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
    return video_id, self.url_result(video_id, 'Youtube', video_id=video_id)

2510

2511

def _real_extract(self, url):
    """Extract a playlist, a mix, or a single video from a playlist URL.

    :param url: URL (or bare playlist id) matching ``self._VALID_URL``.
    :returns: the single-video result when
        ``_check_download_just_video`` supplies one; the mix result for
        ids starting with ``RD``, ``UL`` or ``PU``; otherwise the regular
        playlist, or a plain video result when the page served no
        playlist but the URL carried a video id.
    :raises ExtractorError: when ``url`` does not match ``_VALID_URL``.
    """
    # Extract playlist id
    url_match = re.match(self._VALID_URL, url)
    if url_match is None:
        raise ExtractorError('Invalid URL: %s' % url)
    playlist_id = url_match.group(1) or url_match.group(2)

    video_id, single_video = self._check_download_just_video(url, playlist_id)
    if single_video:
        return single_video

    # Mixes require a custom extraction process
    if playlist_id.startswith(('RD', 'UL', 'PU')):
        return self._extract_mix(playlist_id)

    has_videos, playlist = self._extract_playlist(playlist_id)
    if video_id and not has_videos:
        # Some playlist URLs don't actually serve a playlist (see
        # https://github.com/ytdl-org/youtube-dl/issues/10537).
        # Fallback to plain video extraction if there is a video id
        # along with playlist id.
        return self.url_result(video_id, 'Youtube', video_id=video_id)
    return playlist

2535

2536

2537

class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):

2538

IE_DESC = 'YouTube.com channels'

2539

_VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)'

2540

_TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'

2541

_VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'

2542

IE_NAME = 'youtube:channel'

2543

_TESTS = [{

2544

'note': 'paginated channel',

2545

'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',

2546

'playlist_mincount': 91,

2547

'info_dict': {

2548

'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',

2549

'title': 'Uploads from lex will',

2550

}

2551

}, {

2552

'note': 'Age restricted channel',

2553

# from https://www.youtube.com/user/DeusExOfficial

2554

'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w',

2555

'playlist_mincount': 64,

2556

'info_dict': {

2557

'id': 'UUs0ifCMCm1icqRbqhUINa0w',

2558

'title': 'Uploads from Deus Ex',

2559

},

2560

}, {

2561

'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA',

2562

'only_matching': True,

}]

@classmethod
def suitable(cls, url):
    """Tell whether this channel extractor should handle ``url``.

    URLs accepted by ``YoutubePlaylistsIE`` or ``YoutubeLiveIE`` are
    rejected here; for everything else the decision is left to the
    parent class.
    """
    if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url):
        return False
    return super(YoutubeChannelIE, cls).suitable(url)

2569

2570

def _build_template_url(self, url, channel_id):
    """Return ``self._TEMPLATE_URL`` with ``channel_id`` filled in.

    ``url`` is accepted but not used by this implementation.
    """
    videos_url = self._TEMPLATE_URL % channel_id
    return videos_url

2572

2573

def _real_extract(self, url):
    """Extract the videos of a channel.

    A channel id starting with ``UC`` found on the ``?view=57`` page is
    turned into a ``UU...`` uploads playlist, and extraction is handed
    over to the playlist extractor. Otherwise the channel's videos page
    is processed directly.

    :param url: channel URL; its id is obtained with ``self._match_id``.
    :returns: a URL result pointing at the uploads playlist, or a
        playlist result built from the channel page.
    :raises ExtractorError: when the page yields no entries and carries
        an alert message.
    """
    channel_id = self._match_id(url)

    url = self._build_template_url(url, channel_id)

    # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
    # Workaround by extracting as a playlist if managed to obtain channel playlist URL
    # otherwise fallback on channel by page extraction
    listing_page = self._download_webpage(
        url + '?view=57', channel_id,
        'Downloading channel page', fatal=False)
    channel_playlist_id = False
    if listing_page is not False:
        channel_playlist_id = self._html_search_meta(
            'channelId', listing_page, 'channel id', default=None)
        if not channel_playlist_id:
            # No channelId meta tag: look for an app deep link instead.
            app_url = self._html_search_meta(
                ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'),
                listing_page, 'channel url', default=None)
            if app_url:
                channel_playlist_id = self._search_regex(
                    r'vnd\.youtube://user/([0-9A-Za-z_-]+)',
                    app_url, 'channel id', default=None)
    if channel_playlist_id and channel_playlist_id.startswith('UC'):
        uploads_id = 'UU' + channel_playlist_id[2:]
        uploads_url = compat_urlparse.urljoin(url, '/playlist?list=%s' % uploads_id)
        return self.url_result(uploads_url, 'YoutubePlaylist')

    channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
    autogenerated_label = re.search(r'''(?x)
            class="[^"]*?(?:
                channel-header-autogenerated-label|
                yt-channel-title-autogenerated
            )[^"]*"''', channel_page)

    if autogenerated_label is not None:
        # The videos are contained in a single page
        # the ajax pages can't be used, they are empty
        entries = []
        for video_id, video_title in self.extract_videos_from_page(channel_page):
            entries.append(self.url_result(
                video_id, 'Youtube', video_id=video_id,
                video_title=video_title))
        return self.playlist_result(entries, channel_id)

    # Probe for a first entry; when there is none, surface any alert the
    # page carries instead of returning an empty playlist silently.
    nothing = object()
    if next(self._entries(channel_page, channel_id), nothing) is nothing:
        alert_message = self._html_search_regex(
            r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>',
            channel_page, 'alert', default=None, group='alert')
        if alert_message:
            raise ExtractorError('Youtube said: %s' % alert_message, expected=True)

    return self.playlist_result(self._entries(channel_page, channel_id), channel_id)

2629

2630

2631

class YoutubeUserIE(YoutubeChannelIE):
    """Extractor for a YouTube user's videos.

    The test URLs cover /user/<name>, /c/<name>, bare /<name> URLs and the
    "ytuser:<name>" keyword form.
    """
    IE_NAME = 'youtube:user'
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'

    # Filled with (path section, user id) by _build_template_url().
    # NOTE(review): _build_template_url() reads the named groups 'user' and
    # 'id' from self._VALID_URL, but no _VALID_URL assignment is visible in
    # this class body -- confirm the pattern in effect defines both groups.
    _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ',
            'title': 'Uploads from The Linux Foundation',
        }
    }, {
        # Only available via https://www.youtube.com/c/12minuteathlete/videos
        # but not https://www.youtube.com/user/12minuteathlete/videos
        'url': 'https://www.youtube.com/c/12minuteathlete/videos',
        'playlist_mincount': 249,
        'info_dict': {
            'id': 'UUVjM-zV6_opMDx7WYxnjZiQ',
            'title': 'Uploads from 12 Minute Athlete',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/gametrailers',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/gametrailers',
        'only_matching': True,
    }, {
        # This channel is not available, geo restricted to JP
        'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Return True only if no other Youtube*IE in this module claims url.

        The pattern of this extractor is too permissive and would also match
        URLs that belong to the other YouTube extractors, so every other
        module-level name of the form ``Youtube...IE`` gets the first say.
        """
        for name, klass in globals().items():
            if klass is cls:
                continue
            if not (name.startswith('Youtube') and name.endswith('IE')):
                continue
            if klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _build_template_url(self, url, channel_id):
        """Fill _TEMPLATE_URL from the 'user' and 'id' groups matched in url.

        When the 'user' group is empty or did not participate in the match,
        the path section falls back to 'user'.  channel_id is not used here.
        """
        match = re.match(self._VALID_URL, url)
        section = match.group('user') or 'user'
        return self._TEMPLATE_URL % (section, match.group('id'))

2681

2682

2683

class YoutubeLiveIE(YoutubeBaseInfoExtractor):
    """Extractor for ".../live" URLs of a user, channel or custom URL."""
    IE_NAME = 'youtube:live'
    IE_DESC = 'YouTube.com live streams'
    _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheYoungTurks/live',
        'info_dict': {
            'id': 'a48o2S1cPoo',
            'ext': 'mp4',
            'title': 'The Young Turks - Live Main Show',
            'uploader': 'The Young Turks',
            'uploader_id': 'TheYoungTurks',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
            'upload_date': '20150715',
            'license': 'Standard YouTube License',
            'description': 'md5:438179573adcdff3c97ebb1ee632b891',
            'categories': ['News & Politics'],
            'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/TheYoungTurks/live',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Resolve a /live URL to a video result, else to the base URL.

        The page is fetched non-fatally.  If its og:type starts with 'video'
        and its 'videoId' meta value is an 11-character id, the result points
        at that video; in every other case (including a failed download) the
        result points at the URL with the trailing '/live' removed.
        """
        match = re.match(self._VALID_URL, url)
        channel_id, base_url = match.group('id', 'base_url')

        webpage = self._download_webpage(url, channel_id, fatal=False)
        if not webpage:
            return self.url_result(base_url)

        page_type = self._og_search_property(
            'type', webpage, 'page type', default='')
        video_id = self._html_search_meta(
            'videoId', webpage, 'video id', default=None)

        is_video_page = page_type.startswith('video')
        if is_video_page and video_id and re.match(r'^[0-9A-Za-z_-]{11}$', video_id):
            return self.url_result(video_id, YoutubeIE.ie_key())
        return self.url_result(base_url)

2733

2734

2735

class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
    """Extractor for the "/playlists" page of a user or channel.

    Only configuration lives here; the extraction logic comes from the
    base class.
    """
    IE_NAME = 'youtube:playlists'
    IE_DESC = 'YouTube.com user/channel playlists'
    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+)/playlists'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
        'playlist_mincount': 4,
        'info_dict': {
            'id': 'ThirstForScience',
            'title': 'Thirst for Science',
        },
    }, {
        # with "Load more" button
        'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
        'playlist_mincount': 70,
        'info_dict': {
            'id': 'igorkle1',
            'title': 'Игорь Клейнер',
        },
    }, {
        'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists',
        'playlist_mincount': 17,
        'info_dict': {
            'id': 'UCiU1dHvZObB2iP6xkJ__Icw',
            'title': 'Chem Player',
        },
    }]

class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor):
    """Shared base of the search extractors; overrides only _VIDEO_RE."""

    # Matches a "/watch?v=<11-char id>" href and, when a title="..."
    # attribute follows in the same tag, captures that title as well.
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?'

2767

2768

2769

class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor):
    """Extractor for "ytsearch" keyword queries."""
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # Extra query-string arguments merged into the search URL; subclasses
    # override this to alter the search (e.g. the sort order).
    _EXTRA_QUERY_ARGS = {}
    _TESTS = []

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query

        Result pages are downloaded one after another, following the "Next"
        link of each page, until n videos have been collected, a page yields
        no videos, or no "Next" link is found.  At most n entries are
        returned, wrapped in a playlist result titled with the query.

        Raises ExtractorError (expected) when a page carries a
        "search-message" element, i.e. no video results.
        """
        url_query = {
            'search_query': query.encode('utf-8'),
        }
        url_query.update(self._EXTRA_QUERY_ARGS)
        result_url = 'https://www.youtube.com/results?' + compat_urllib_parse_urlencode(url_query)

        videos = []
        for pagenum in itertools.count(1):
            data = self._download_json(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % pagenum,
                errnote='Unable to download API page',
                query={'spf': 'navigate'})
            html_content = data[1]['body']['content']

            if 'class="search-message' in html_content:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            new_videos = list(self._process_page(html_content))
            videos += new_videos
            # Stop as soon as n videos are available: ">=" rather than ">"
            # avoids downloading one more page whose entries would all be
            # thrown away by the truncation below.
            if not new_videos or len(videos) >= n:
                break
            next_link = self._html_search_regex(
                r'href="(/results\?[^"]*\bsp=[^"]+)"[^>]*>\s*<span[^>]+class="[^"]*\byt-uix-button-content\b[^"]*"[^>]*>Next',
                html_content, 'next link', default=None)
            if next_link is None:
                break
            result_url = compat_urlparse.urljoin('https://www.youtube.com/', next_link)

        # The last page may have overshot the requested amount.
        if len(videos) > n:
            videos = videos[:n]
        return self.playlist_result(videos, query)

2817

2818

2819

class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE that asks for results sorted by upload date."""
    _SEARCH_KEY = 'ytsearchdate'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    IE_DESC = 'YouTube.com searches, newest videos first'
    # Merged into the search URL's query string by the parent class.
    _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}

2824

2825

2826

class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor):
    """Extractor for youtube.com/results?... search result URLs."""
    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Download the results page and return its videos as a playlist.

        The playlist title is the URL-decoded search query taken from the
        'search_query' (or 'q') parameter of url.
        """
        raw_query = re.match(self._VALID_URL, url).group('query')
        query = compat_urllib_parse_unquote_plus(raw_query)
        webpage = self._download_webpage(url, query)
        entries = self._process_page(webpage)
        return self.playlist_result(entries, playlist_title=query)

2846

2847

2848

class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
    """Extractor for youtube.com/show/<id> pages."""
    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'https://www.youtube.com/show/airdisasters',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Hand the show's "/playlists" URL to the base-class extraction."""
        show_id = self._match_id(url)
        playlists_url = 'https://www.youtube.com/show/%s/playlists' % show_id
        return super(YoutubeShowIE, self)._real_extract(playlists_url)

2865

2866

2867

class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        """Extractor name, derived from the subclass's _FEED_NAME."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        """Log in before any extraction takes place."""
        self._login()

    def _entries(self, page):
        """Yield one result per not-yet-seen video id found in the feed.

        page is the HTML of the first feed page.  Further portions are
        fetched through the "load more" link for as long as one is present
        and the latest portion contributed at least one new video id.
        """
        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        seen_ids = set()  # set, not list: O(1) membership on long feeds
        more_widget_html = content_html = page
        for page_num in itertools.count(1):
            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)

            # 'recommended' feed has infinite 'load more' and each new portion spins
            # the same videos in (sometimes) slightly different order, so we'll check
            # for unicity and break when portion has no new videos
            new_ids = [
                video_id for video_id in orderedSet(matches)
                if video_id not in seen_ids]
            if not new_ids:
                break

            seen_ids.update(new_ids)

            for entry in self._ids_to_results(new_ids):
                yield entry

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

    def _real_extract(self, url):
        """Download the feed page and return its entries as a playlist.

        url is not used; the feed address is built from _FEED_NAME.
        """
        page = self._download_webpage(
            'https://www.youtube.com/feed/%s' % self._FEED_NAME,
            self._PLAYLIST_TITLE)
        return self.playlist_result(
            self._entries(page), playlist_title=self._PLAYLIST_TITLE)

2918

2919

2920

class YoutubeWatchLaterIE(YoutubePlaylistIE):
    """Extractor for the "Watch later" list (playlist id 'WL')."""
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    IE_NAME = 'youtube:watchlater'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater'

    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=WL',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the single video if that check yields one, else the 'WL' playlist."""
        _, single_video = self._check_download_just_video(url, 'WL')
        if not single_video:
            _, watch_later = self._extract_playlist('WL')
            return watch_later
        return single_video

class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor that redirects to the logged-in user's favourites playlist."""
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    IE_NAME = 'youtube:favorites'
    _LOGIN_REQUIRED = True
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?'

    def _real_extract(self, url):
        """Find the playlist id on the my_favorites page and delegate to it.

        url is not used; the fixed my_favorites address is always fetched.
        """
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # First "list=<id>" occurrence, terminated by a quote or an ampersand.
        favourites_list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')

2951

2952

2953

class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'recommended' feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?'

2958

2959

2960

class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'subscriptions' feed."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'

2965

2966

2967

class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'history' feed."""
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory'

2972

2973

2974

class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch/attribution URLs that carry no video id and explain why.

    Extraction always fails with a hint about quoting the URL in the shell.
    """
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    # Verbose pattern: a watch URL followed by at most one of the listed
    # non-id parameters, or an attribution_link URL, anchored at the end.
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError with a quoting hint."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(hint, expected=True)

class YoutubeTruncatedIDIE(InfoExtractor):
    """Catch watch URLs whose video id is shorter than 11 characters."""
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    # The id group accepts 1 to 10 id characters and must end the URL.
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError naming the short id."""
        short_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (short_id, url)
        raise ExtractorError(message, expected=True)