]>
Commit | Line | Data |
---|---|---|
1 | # coding: utf-8 | |
2 | ||
3 | from __future__ import unicode_literals | |
4 | ||
5 | ||
6 | import itertools | |
7 | import json | |
8 | import os.path | |
9 | import random | |
10 | import re | |
11 | import time | |
12 | import traceback | |
13 | ||
14 | from .common import InfoExtractor, SearchInfoExtractor | |
15 | from ..jsinterp import JSInterpreter | |
16 | from ..swfinterp import SWFInterpreter | |
17 | from ..compat import ( | |
18 | compat_chr, | |
19 | compat_HTTPError, | |
20 | compat_kwargs, | |
21 | compat_parse_qs, | |
22 | compat_urllib_parse_unquote, | |
23 | compat_urllib_parse_unquote_plus, | |
24 | compat_urllib_parse_urlencode, | |
25 | compat_urllib_parse_urlparse, | |
26 | compat_urlparse, | |
27 | compat_str, | |
28 | ) | |
29 | from ..utils import ( | |
30 | bool_or_none, | |
31 | clean_html, | |
32 | error_to_compat_str, | |
33 | extract_attributes, | |
34 | ExtractorError, | |
35 | float_or_none, | |
36 | get_element_by_attribute, | |
37 | get_element_by_id, | |
38 | int_or_none, | |
39 | mimetype2ext, | |
40 | orderedSet, | |
41 | parse_codecs, | |
42 | parse_duration, | |
43 | remove_quotes, | |
44 | remove_start, | |
45 | smuggle_url, | |
46 | str_or_none, | |
47 | str_to_int, | |
48 | try_get, | |
49 | unescapeHTML, | |
50 | unified_strdate, | |
51 | unsmuggle_url, | |
52 | uppercase_escape, | |
53 | url_or_none, | |
54 | urlencode_postdata, | |
55 | ) | |
56 | ||
57 | ||
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    # Endpoints of Google's (undocumented) web sign-in flow.
    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}'

    _YOUTUBE_CLIENT_HEADERS = {
        'x-youtube-client-name': '1',
        'x-youtube-client-version': '1.20200609.04.02',
    }

    def _set_language(self):
        # Force English UI so that regex-based scraping sees stable strings.
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Map a list of video IDs to url_result dicts for the Youtube IE."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Fix: was a bare `return` (None); the documented failure value is False.
            return False

        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            # POST one step of the sign-in flow; Google prefixes the JSON
            # response with garbage that transform_source strips off.
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
                # TODO: reverse actual botguard identifier generation algo
                'bgRequest': '["identifier",""]',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            self._downloader.report_warning(message)

        # Opaque positional payload expected by the lookup endpoint;
        # reverse-engineered, do not reorder.
        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            # Fix: was a bare `return` (None); the documented failure value is False.
            return False

        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            # Fix: the conditional expression must be inside the %-formatting
            # parentheses. The old code parsed as
            # ('Unable to login: Invalid password') if ... else login_msg,
            # dropping the 'Unable to login:' prefix for every other message.
            warn('Unable to login: %s' % (
                'Invalid password'
                if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg))
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                tfa_code = remove_start(tfa_code, 'G-')

                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    # Fix: same precedence bug as the login warning above.
                    warn('Unable to finish TFA: %s' % (
                        'Invalid TFA code'
                        if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg))
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _download_webpage_handle(self, *args, **kwargs):
        # Force the legacy (non-polymer) YouTube layout on every request.
        query = kwargs.get('query', {}).copy()
        query['disable_polymer'] = 'true'
        kwargs['query'] = query
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _real_initialize(self):
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return
285 | ||
286 | ||
class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
    # Extract entries from a page that paginates via a "Load more" button.
    def _entries(self, page, playlist_id):
        content_html = page
        more_widget_html = page
        for page_num in itertools.count(1):
            for entry in self._process_page(content_html):
                yield entry

            mobj = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if mobj is None:
                # No "Load more" link left - pagination is finished.
                break

            retries = 3
            attempt = 0
            while True:
                try:
                    # Downloading page may result in intermittent 5xx HTTP error
                    # that is usually worked around with a retry
                    more = self._download_json(
                        'https://www.youtube.com/%s' % mobj.group('more'), playlist_id,
                        'Downloading page #%s%s'
                        % (page_num, ' (retry #%d)' % attempt if attempt else ''),
                        transform_source=uppercase_escape,
                        headers=self._YOUTUBE_CLIENT_HEADERS)
                    break
                except ExtractorError as e:
                    if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503):
                        attempt += 1
                        if attempt <= retries:
                            continue
                    raise

            content_html = more['content_html']
            if not content_html.strip():
                # Some webpages show a "Load more" button but they don't
                # have more videos
                break
            more_widget_html = more['load_more_widget_html']
325 | ||
326 | ||
class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    def _process_page(self, content):
        # Turn every (id, title) pair scraped from the page into a url_result.
        for video_id, video_title in self.extract_videos_from_page(content):
            yield self.url_result(video_id, 'Youtube', video_id, video_title)

    def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page):
        """Scan *page* with *video_re*, appending new IDs/titles in place."""
        for match in re.finditer(video_re, page):
            groups = match.groupdict()
            # The link with index 0 is not the first video of the playlist (not sure if still actual)
            if 'index' in groups and match.group('id') == '0':
                continue
            video_id = match.group('id')
            video_title = unescapeHTML(match.group('title')) if 'title' in groups else None
            if video_title:
                video_title = video_title.strip()
            if video_title == '► Play all':
                video_title = None
            try:
                known_idx = ids_in_page.index(video_id)
            except ValueError:
                # First occurrence of this video on the page.
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)
            else:
                # Duplicate ID: keep it, but backfill a missing title.
                if video_title and not titles_in_page[known_idx]:
                    titles_in_page[known_idx] = video_title

    def extract_videos_from_page(self, page):
        """Return an iterable of (video_id, video_title) pairs from *page*."""
        ids_in_page, titles_in_page = [], []
        self.extract_videos_from_page_impl(
            self._VIDEO_RE, page, ids_in_page, titles_in_page)
        return zip(ids_in_page, titles_in_page)
358 | ||
359 | ||
class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    def _process_page(self, content):
        # Collect playlist IDs from lockup-title links, de-duplicated in order.
        playlist_ids = orderedSet(re.findall(
            r'<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"',
            content))
        for playlist_id in playlist_ids:
            yield self.url_result(
                'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist')

    def _real_extract(self, url):
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)
        title = self._og_search_title(webpage, fatal=False)
        return self.playlist_result(
            self._entries(webpage, playlist_id), playlist_id, title)
373 | ||
374 | ||
375 | class YoutubeIE(YoutubeBaseInfoExtractor): | |
376 | IE_DESC = 'YouTube.com' | |
377 | _VALID_URL = r"""(?x)^ | |
378 | ( | |
379 | (?:https?://|//) # http(s):// or protocol-independent URL | |
380 | (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/| | |
381 | (?:www\.)?deturl\.com/www\.youtube\.com/| | |
382 | (?:www\.)?pwnyoutube\.com/| | |
383 | (?:www\.)?hooktube\.com/| | |
384 | (?:www\.)?yourepeat\.com/| | |
385 | tube\.majestyc\.net/| | |
386 | # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances | |
387 | (?:(?:www|dev)\.)?invidio\.us/| | |
388 | (?:(?:www|no)\.)?invidiou\.sh/| | |
389 | (?:(?:www|fi|de)\.)?invidious\.snopyta\.org/| | |
390 | (?:www\.)?invidious\.kabi\.tk/| | |
391 | (?:www\.)?invidious\.13ad\.de/| | |
392 | (?:www\.)?invidious\.mastodon\.host/| | |
393 | (?:www\.)?invidious\.nixnet\.xyz/| | |
394 | (?:www\.)?invidious\.drycat\.fr/| | |
395 | (?:www\.)?tube\.poal\.co/| | |
396 | (?:www\.)?vid\.wxzm\.sx/| | |
397 | (?:www\.)?yewtu\.be/| | |
398 | (?:www\.)?yt\.elukerio\.org/| | |
399 | (?:www\.)?yt\.lelux\.fi/| | |
400 | (?:www\.)?invidious\.ggc-project\.de/| | |
401 | (?:www\.)?yt\.maisputain\.ovh/| | |
402 | (?:www\.)?invidious\.13ad\.de/| | |
403 | (?:www\.)?invidious\.toot\.koeln/| | |
404 | (?:www\.)?invidious\.fdn\.fr/| | |
405 | (?:www\.)?watch\.nettohikari\.com/| | |
406 | (?:www\.)?kgg2m7yk5aybusll\.onion/| | |
407 | (?:www\.)?qklhadlycap4cnod\.onion/| | |
408 | (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/| | |
409 | (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/| | |
410 | (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/| | |
411 | (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/| | |
412 | (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/| | |
413 | (?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/| | |
414 | youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains | |
415 | (?:.*?\#/)? # handle anchor (#/) redirect urls | |
416 | (?: # the various things that can precede the ID: | |
417 | (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/ | |
418 | |(?: # or the v= param in all its forms | |
419 | (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx) | |
420 | (?:\?|\#!?) # the params delimiter ? or # or #! | |
421 | (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&v=V36LpHqtcDY) | |
422 | v= | |
423 | ) | |
424 | )) | |
425 | |(?: | |
426 | youtu\.be| # just youtu.be/xxxx | |
427 | vid\.plus| # or vid.plus/xxxx | |
428 | zwearz\.com/watch| # or zwearz.com/watch/xxxx | |
429 | )/ | |
430 | |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId= | |
431 | ) | |
432 | )? # all until now is optional -> you can pass the naked ID | |
433 | ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID | |
434 | (?!.*?\blist= | |
435 | (?: | |
436 | %(playlist_id)s| # combined list/video URLs are handled by the playlist IE | |
437 | WL # WL are handled by the watch later IE | |
438 | ) | |
439 | ) | |
440 | (?(1).+)? # if we found the ID, everything can follow | |
441 | $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} | |
442 | _NEXT_URL_RE = r'[\?&]next_url=([^&]+)' | |
443 | _PLAYER_INFO_RE = ( | |
444 | r'/(?P<id>[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.(?P<ext>[a-z]+)$', | |
445 | r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.(?P<ext>[a-z]+)$', | |
446 | ) | |
447 | _formats = { | |
448 | '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, | |
449 | '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, | |
450 | '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'}, | |
451 | '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'}, | |
452 | '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'}, | |
453 | '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, | |
454 | '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, | |
455 | '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, | |
456 | # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well | |
457 | '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'}, | |
458 | '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, | |
459 | '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, | |
460 | '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, | |
461 | '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, | |
462 | '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, | |
463 | '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, | |
464 | '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, | |
465 | '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, | |
466 | ||
467 | ||
468 | # 3D videos | |
469 | '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20}, | |
470 | '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20}, | |
471 | '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20}, | |
472 | '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20}, | |
473 | '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20}, | |
474 | '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20}, | |
475 | '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20}, | |
476 | ||
477 | # Apple HTTP Live Streaming | |
478 | '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, | |
479 | '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, | |
480 | '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, | |
481 | '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, | |
482 | '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10}, | |
483 | '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10}, | |
484 | '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, | |
485 | '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10}, | |
486 | ||
487 | # DASH mp4 video | |
488 | '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'}, | |
489 | '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'}, | |
490 | '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, | |
491 | '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'}, | |
492 | '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'}, | |
493 | '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559) | |
494 | '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'}, | |
495 | '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, | |
496 | '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'}, | |
497 | '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, | |
498 | '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, | |
499 | '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'}, | |
500 | ||
501 | # Dash mp4 audio | |
502 | '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'}, | |
503 | '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'}, | |
504 | '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'}, | |
505 | '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, | |
506 | '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, | |
507 | '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'}, | |
508 | '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'}, | |
509 | ||
510 | # Dash webm | |
511 | '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, | |
512 | '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, | |
513 | '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, | |
514 | '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, | |
515 | '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, | |
516 | '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, | |
517 | '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'}, | |
518 | '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'}, | |
519 | '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'}, | |
520 | '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, | |
521 | '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, | |
522 | '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, | |
523 | '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'}, | |
524 | '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'}, | |
525 | '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'}, | |
526 | # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug) | |
527 | '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, | |
528 | '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, | |
529 | '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, | |
530 | '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, | |
531 | '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, | |
532 | '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, | |
533 | ||
534 | # Dash webm audio | |
535 | '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128}, | |
536 | '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256}, | |
537 | ||
538 | # Dash webm audio with opus inside | |
539 | '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50}, | |
540 | '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70}, | |
541 | '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160}, | |
542 | ||
543 | # RTMP (unnamed) | |
544 | '_rtmp': {'protocol': 'rtmp'}, | |
545 | ||
546 | # av01 video only formats sometimes served with "unknown" codecs | |
547 | '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, | |
548 | '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, | |
549 | '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, | |
550 | '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, | |
551 | } | |
552 | _SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt') | |
553 | ||
554 | _GEO_BYPASS = False | |
555 | ||
556 | IE_NAME = 'youtube' | |
557 | _TESTS = [ | |
558 | { | |
559 | 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9', | |
560 | 'info_dict': { | |
561 | 'id': 'BaW_jenozKc', | |
562 | 'ext': 'mp4', | |
563 | 'title': 'youtube-dl test video "\'/\\ä↭𝕐', | |
564 | 'uploader': 'Philipp Hagemeister', | |
565 | 'uploader_id': 'phihag', | |
566 | 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', | |
567 | 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q', | |
568 | 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q', | |
569 | 'upload_date': '20121002', | |
570 | 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', | |
571 | 'categories': ['Science & Technology'], | |
572 | 'tags': ['youtube-dl'], | |
573 | 'duration': 10, | |
574 | 'view_count': int, | |
575 | 'like_count': int, | |
576 | 'dislike_count': int, | |
577 | 'start_time': 1, | |
578 | 'end_time': 9, | |
579 | } | |
580 | }, | |
581 | { | |
582 | 'url': 'https://www.youtube.com/watch?v=UxxajLWwzqY', | |
583 | 'note': 'Test generic use_cipher_signature video (#897)', | |
584 | 'info_dict': { | |
585 | 'id': 'UxxajLWwzqY', | |
586 | 'ext': 'mp4', | |
587 | 'upload_date': '20120506', | |
588 | 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]', | |
589 | 'alt_title': 'I Love It (feat. Charli XCX)', | |
590 | 'description': 'md5:19a2f98d9032b9311e686ed039564f63', | |
591 | 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli', | |
592 | 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop', | |
593 | 'iconic ep', 'iconic', 'love', 'it'], | |
594 | 'duration': 180, | |
595 | 'uploader': 'Icona Pop', | |
596 | 'uploader_id': 'IconaPop', | |
597 | 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop', | |
598 | 'creator': 'Icona Pop', | |
599 | 'track': 'I Love It (feat. Charli XCX)', | |
600 | 'artist': 'Icona Pop', | |
601 | } | |
602 | }, | |
603 | { | |
604 | 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ', | |
605 | 'note': 'Test VEVO video with age protection (#956)', | |
606 | 'info_dict': { | |
607 | 'id': '07FYdnEawAQ', | |
608 | 'ext': 'mp4', | |
609 | 'upload_date': '20130703', | |
610 | 'title': 'Justin Timberlake - Tunnel Vision (Official Music Video) (Explicit)', | |
611 | 'alt_title': 'Tunnel Vision', | |
612 | 'description': 'md5:07dab3356cde4199048e4c7cd93471e1', | |
613 | 'duration': 419, | |
614 | 'uploader': 'justintimberlakeVEVO', | |
615 | 'uploader_id': 'justintimberlakeVEVO', | |
616 | 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO', | |
617 | 'creator': 'Justin Timberlake', | |
618 | 'track': 'Tunnel Vision', | |
619 | 'artist': 'Justin Timberlake', | |
620 | 'age_limit': 18, | |
621 | } | |
622 | }, | |
623 | { | |
624 | 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ', | |
625 | 'note': 'Embed-only video (#1746)', | |
626 | 'info_dict': { | |
627 | 'id': 'yZIXLfi8CZQ', | |
628 | 'ext': 'mp4', | |
629 | 'upload_date': '20120608', | |
630 | 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012', | |
631 | 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7', | |
632 | 'uploader': 'SET India', | |
633 | 'uploader_id': 'setindia', | |
634 | 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia', | |
635 | 'age_limit': 18, | |
636 | } | |
637 | }, | |
638 | { | |
639 | 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY', | |
640 | 'note': 'Use the first video ID in the URL', | |
641 | 'info_dict': { | |
642 | 'id': 'BaW_jenozKc', | |
643 | 'ext': 'mp4', | |
644 | 'title': 'youtube-dl test video "\'/\\ä↭𝕐', | |
645 | 'uploader': 'Philipp Hagemeister', | |
646 | 'uploader_id': 'phihag', | |
647 | 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', | |
648 | 'upload_date': '20121002', | |
649 | 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', | |
650 | 'categories': ['Science & Technology'], | |
651 | 'tags': ['youtube-dl'], | |
652 | 'duration': 10, | |
653 | 'view_count': int, | |
654 | 'like_count': int, | |
655 | 'dislike_count': int, | |
656 | }, | |
657 | 'params': { | |
658 | 'skip_download': True, | |
659 | }, | |
660 | }, | |
661 | { | |
662 | 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I', | |
663 | 'note': '256k DASH audio (format 141) via DASH manifest', | |
664 | 'info_dict': { | |
665 | 'id': 'a9LDPn-MO4I', | |
666 | 'ext': 'm4a', | |
667 | 'upload_date': '20121002', | |
668 | 'uploader_id': '8KVIDEO', | |
669 | 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO', | |
670 | 'description': '', | |
671 | 'uploader': '8KVIDEO', | |
672 | 'title': 'UHDTV TEST 8K VIDEO.mp4' | |
673 | }, | |
674 | 'params': { | |
675 | 'youtube_include_dash_manifest': True, | |
676 | 'format': '141', | |
677 | }, | |
678 | 'skip': 'format 141 not served anymore', | |
679 | }, | |
680 | # DASH manifest with encrypted signature | |
681 | { | |
682 | 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA', | |
683 | 'info_dict': { | |
684 | 'id': 'IB3lcPjvWLA', | |
685 | 'ext': 'm4a', | |
686 | 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson', | |
687 | 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf', | |
688 | 'duration': 244, | |
689 | 'uploader': 'AfrojackVEVO', | |
690 | 'uploader_id': 'AfrojackVEVO', | |
691 | 'upload_date': '20131011', | |
692 | }, | |
693 | 'params': { | |
694 | 'youtube_include_dash_manifest': True, | |
695 | 'format': '141/bestaudio[ext=m4a]', | |
696 | }, | |
697 | }, | |
698 | # JS player signature function name containing $ | |
699 | { | |
700 | 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM', | |
701 | 'info_dict': { | |
702 | 'id': 'nfWlot6h_JM', | |
703 | 'ext': 'm4a', | |
704 | 'title': 'Taylor Swift - Shake It Off', | |
705 | 'description': 'md5:307195cd21ff7fa352270fe884570ef0', | |
706 | 'duration': 242, | |
707 | 'uploader': 'TaylorSwiftVEVO', | |
708 | 'uploader_id': 'TaylorSwiftVEVO', | |
709 | 'upload_date': '20140818', | |
710 | }, | |
711 | 'params': { | |
712 | 'youtube_include_dash_manifest': True, | |
713 | 'format': '141/bestaudio[ext=m4a]', | |
714 | }, | |
715 | }, | |
716 | # Controversy video | |
717 | { | |
718 | 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8', | |
719 | 'info_dict': { | |
720 | 'id': 'T4XJQO3qol8', | |
721 | 'ext': 'mp4', | |
722 | 'duration': 219, | |
723 | 'upload_date': '20100909', | |
724 | 'uploader': 'Amazing Atheist', | |
725 | 'uploader_id': 'TheAmazingAtheist', | |
726 | 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist', | |
727 | 'title': 'Burning Everyone\'s Koran', | |
728 | 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html', | |
729 | } | |
730 | }, | |
731 | # Normal age-gate video (No vevo, embed allowed) | |
732 | { | |
733 | 'url': 'https://youtube.com/watch?v=HtVdAasjOgU', | |
734 | 'info_dict': { | |
735 | 'id': 'HtVdAasjOgU', | |
736 | 'ext': 'mp4', | |
737 | 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer', | |
738 | 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}', | |
739 | 'duration': 142, | |
740 | 'uploader': 'The Witcher', | |
741 | 'uploader_id': 'WitcherGame', | |
742 | 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame', | |
743 | 'upload_date': '20140605', | |
744 | 'age_limit': 18, | |
745 | }, | |
746 | }, | |
747 | # Age-gate video with encrypted signature | |
748 | { | |
749 | 'url': 'https://www.youtube.com/watch?v=6kLq3WMV1nU', | |
750 | 'info_dict': { | |
751 | 'id': '6kLq3WMV1nU', | |
752 | 'ext': 'mp4', | |
753 | 'title': 'Dedication To My Ex (Miss That) (Lyric Video)', | |
754 | 'description': 'md5:33765bb339e1b47e7e72b5490139bb41', | |
755 | 'duration': 246, | |
756 | 'uploader': 'LloydVEVO', | |
757 | 'uploader_id': 'LloydVEVO', | |
758 | 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/LloydVEVO', | |
759 | 'upload_date': '20110629', | |
760 | 'age_limit': 18, | |
761 | }, | |
762 | }, | |
763 | # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421) | |
764 | # YouTube Red ad is not captured for creator | |
765 | { | |
766 | 'url': '__2ABJjxzNo', | |
767 | 'info_dict': { | |
768 | 'id': '__2ABJjxzNo', | |
769 | 'ext': 'mp4', | |
770 | 'duration': 266, | |
771 | 'upload_date': '20100430', | |
772 | 'uploader_id': 'deadmau5', | |
773 | 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5', | |
774 | 'creator': 'Dada Life, deadmau5', | |
775 | 'description': 'md5:12c56784b8032162bb936a5f76d55360', | |
776 | 'uploader': 'deadmau5', | |
777 | 'title': 'Deadmau5 - Some Chords (HD)', | |
778 | 'alt_title': 'This Machine Kills Some Chords', | |
779 | }, | |
780 | 'expected_warnings': [ | |
781 | 'DASH manifest missing', | |
782 | ] | |
783 | }, | |
784 | # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431) | |
785 | { | |
786 | 'url': 'lqQg6PlCWgI', | |
787 | 'info_dict': { | |
788 | 'id': 'lqQg6PlCWgI', | |
789 | 'ext': 'mp4', | |
790 | 'duration': 6085, | |
791 | 'upload_date': '20150827', | |
792 | 'uploader_id': 'olympic', | |
793 | 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic', | |
794 | 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games', | |
795 | 'uploader': 'Olympic', | |
796 | 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games', | |
797 | }, | |
798 | 'params': { | |
799 | 'skip_download': 'requires avconv', | |
800 | } | |
801 | }, | |
802 | # Non-square pixels | |
803 | { | |
804 | 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0', | |
805 | 'info_dict': { | |
806 | 'id': '_b-2C3KPAM0', | |
807 | 'ext': 'mp4', | |
808 | 'stretched_ratio': 16 / 9., | |
809 | 'duration': 85, | |
810 | 'upload_date': '20110310', | |
811 | 'uploader_id': 'AllenMeow', | |
812 | 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow', | |
813 | 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯', | |
814 | 'uploader': '孫ᄋᄅ', | |
815 | 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人', | |
816 | }, | |
817 | }, | |
818 | # url_encoded_fmt_stream_map is empty string | |
819 | { | |
820 | 'url': 'qEJwOuvDf7I', | |
821 | 'info_dict': { | |
822 | 'id': 'qEJwOuvDf7I', | |
823 | 'ext': 'webm', | |
824 | 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге', | |
825 | 'description': '', | |
826 | 'upload_date': '20150404', | |
827 | 'uploader_id': 'spbelect', | |
828 | 'uploader': 'Наблюдатели Петербурга', | |
829 | }, | |
830 | 'params': { | |
831 | 'skip_download': 'requires avconv', | |
832 | }, | |
833 | 'skip': 'This live event has ended.', | |
834 | }, | |
835 | # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097) | |
836 | { | |
837 | 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y', | |
838 | 'info_dict': { | |
839 | 'id': 'FIl7x6_3R5Y', | |
840 | 'ext': 'webm', | |
841 | 'title': 'md5:7b81415841e02ecd4313668cde88737a', | |
842 | 'description': 'md5:116377fd2963b81ec4ce64b542173306', | |
843 | 'duration': 220, | |
844 | 'upload_date': '20150625', | |
845 | 'uploader_id': 'dorappi2000', | |
846 | 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000', | |
847 | 'uploader': 'dorappi2000', | |
848 | 'formats': 'mincount:31', | |
849 | }, | |
850 | 'skip': 'not actual anymore', | |
851 | }, | |
852 | # DASH manifest with segment_list | |
853 | { | |
854 | 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8', | |
855 | 'md5': '8ce563a1d667b599d21064e982ab9e31', | |
856 | 'info_dict': { | |
857 | 'id': 'CsmdDsKjzN8', | |
858 | 'ext': 'mp4', | |
859 | 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510 | |
860 | 'uploader': 'Airtek', | |
861 | 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.', | |
862 | 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ', | |
863 | 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015', | |
864 | }, | |
865 | 'params': { | |
866 | 'youtube_include_dash_manifest': True, | |
867 | 'format': '135', # bestvideo | |
868 | }, | |
869 | 'skip': 'This live event has ended.', | |
870 | }, | |
871 | { | |
872 | # Multifeed videos (multiple cameras), URL is for Main Camera | |
873 | 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs', | |
874 | 'info_dict': { | |
875 | 'id': 'jqWvoWXjCVs', | |
876 | 'title': 'teamPGP: Rocket League Noob Stream', | |
877 | 'description': 'md5:dc7872fb300e143831327f1bae3af010', | |
878 | }, | |
879 | 'playlist': [{ | |
880 | 'info_dict': { | |
881 | 'id': 'jqWvoWXjCVs', | |
882 | 'ext': 'mp4', | |
883 | 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)', | |
884 | 'description': 'md5:dc7872fb300e143831327f1bae3af010', | |
885 | 'duration': 7335, | |
886 | 'upload_date': '20150721', | |
887 | 'uploader': 'Beer Games Beer', | |
888 | 'uploader_id': 'beergamesbeer', | |
889 | 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', | |
890 | 'license': 'Standard YouTube License', | |
891 | }, | |
892 | }, { | |
893 | 'info_dict': { | |
894 | 'id': '6h8e8xoXJzg', | |
895 | 'ext': 'mp4', | |
896 | 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)', | |
897 | 'description': 'md5:dc7872fb300e143831327f1bae3af010', | |
898 | 'duration': 7337, | |
899 | 'upload_date': '20150721', | |
900 | 'uploader': 'Beer Games Beer', | |
901 | 'uploader_id': 'beergamesbeer', | |
902 | 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', | |
903 | 'license': 'Standard YouTube License', | |
904 | }, | |
905 | }, { | |
906 | 'info_dict': { | |
907 | 'id': 'PUOgX5z9xZw', | |
908 | 'ext': 'mp4', | |
909 | 'title': 'teamPGP: Rocket League Noob Stream (grizzle)', | |
910 | 'description': 'md5:dc7872fb300e143831327f1bae3af010', | |
911 | 'duration': 7337, | |
912 | 'upload_date': '20150721', | |
913 | 'uploader': 'Beer Games Beer', | |
914 | 'uploader_id': 'beergamesbeer', | |
915 | 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', | |
916 | 'license': 'Standard YouTube License', | |
917 | }, | |
918 | }, { | |
919 | 'info_dict': { | |
920 | 'id': 'teuwxikvS5k', | |
921 | 'ext': 'mp4', | |
922 | 'title': 'teamPGP: Rocket League Noob Stream (zim)', | |
923 | 'description': 'md5:dc7872fb300e143831327f1bae3af010', | |
924 | 'duration': 7334, | |
925 | 'upload_date': '20150721', | |
926 | 'uploader': 'Beer Games Beer', | |
927 | 'uploader_id': 'beergamesbeer', | |
928 | 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', | |
929 | 'license': 'Standard YouTube License', | |
930 | }, | |
931 | }], | |
932 | 'params': { | |
933 | 'skip_download': True, | |
934 | }, | |
935 | 'skip': 'This video is not available.', | |
936 | }, | |
937 | { | |
938 | # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536) | |
939 | 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo', | |
940 | 'info_dict': { | |
941 | 'id': 'gVfLd0zydlo', | |
942 | 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30', | |
943 | }, | |
944 | 'playlist_count': 2, | |
945 | 'skip': 'Not multifeed anymore', | |
946 | }, | |
947 | { | |
948 | 'url': 'https://vid.plus/FlRa-iH7PGw', | |
949 | 'only_matching': True, | |
950 | }, | |
951 | { | |
952 | 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html', | |
953 | 'only_matching': True, | |
954 | }, | |
955 | { | |
956 | # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468) | |
957 | # Also tests cut-off URL expansion in video description (see | |
958 | # https://github.com/ytdl-org/youtube-dl/issues/1892, | |
959 | # https://github.com/ytdl-org/youtube-dl/issues/8164) | |
960 | 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg', | |
961 | 'info_dict': { | |
962 | 'id': 'lsguqyKfVQg', | |
963 | 'ext': 'mp4', | |
964 | 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21', | |
965 | 'alt_title': 'Dark Walk - Position Music', | |
966 | 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a', | |
967 | 'duration': 133, | |
968 | 'upload_date': '20151119', | |
969 | 'uploader_id': 'IronSoulElf', | |
970 | 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf', | |
971 | 'uploader': 'IronSoulElf', | |
972 | 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan', | |
973 | 'track': 'Dark Walk - Position Music', | |
974 | 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan', | |
975 | 'album': 'Position Music - Production Music Vol. 143 - Dark Walk', | |
976 | }, | |
977 | 'params': { | |
978 | 'skip_download': True, | |
979 | }, | |
980 | }, | |
981 | { | |
982 | # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468) | |
983 | 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8', | |
984 | 'only_matching': True, | |
985 | }, | |
986 | { | |
987 | # Video with yt:stretch=17:0 | |
988 | 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM', | |
989 | 'info_dict': { | |
990 | 'id': 'Q39EVAstoRM', | |
991 | 'ext': 'mp4', | |
992 | 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4', | |
993 | 'description': 'md5:ee18a25c350637c8faff806845bddee9', | |
994 | 'upload_date': '20151107', | |
995 | 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA', | |
996 | 'uploader': 'CH GAMER DROID', | |
997 | }, | |
998 | 'params': { | |
999 | 'skip_download': True, | |
1000 | }, | |
1001 | 'skip': 'This video does not exist.', | |
1002 | }, | |
1003 | { | |
1004 | # Video licensed under Creative Commons | |
1005 | 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA', | |
1006 | 'info_dict': { | |
1007 | 'id': 'M4gD1WSo5mA', | |
1008 | 'ext': 'mp4', | |
1009 | 'title': 'md5:e41008789470fc2533a3252216f1c1d1', | |
1010 | 'description': 'md5:a677553cf0840649b731a3024aeff4cc', | |
1011 | 'duration': 721, | |
1012 | 'upload_date': '20150127', | |
1013 | 'uploader_id': 'BerkmanCenter', | |
1014 | 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter', | |
1015 | 'uploader': 'The Berkman Klein Center for Internet & Society', | |
1016 | 'license': 'Creative Commons Attribution license (reuse allowed)', | |
1017 | }, | |
1018 | 'params': { | |
1019 | 'skip_download': True, | |
1020 | }, | |
1021 | }, | |
1022 | { | |
1023 | # Channel-like uploader_url | |
1024 | 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg', | |
1025 | 'info_dict': { | |
1026 | 'id': 'eQcmzGIKrzg', | |
1027 | 'ext': 'mp4', | |
1028 | 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders', | |
1029 | 'description': 'md5:dda0d780d5a6e120758d1711d062a867', | |
1030 | 'duration': 4060, | |
1031 | 'upload_date': '20151119', | |
1032 | 'uploader': 'Bernie Sanders', | |
1033 | 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg', | |
1034 | 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg', | |
1035 | 'license': 'Creative Commons Attribution license (reuse allowed)', | |
1036 | }, | |
1037 | 'params': { | |
1038 | 'skip_download': True, | |
1039 | }, | |
1040 | }, | |
1041 | { | |
1042 | 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;v=V36LpHqtcDY', | |
1043 | 'only_matching': True, | |
1044 | }, | |
1045 | { | |
1046 | # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059) | |
1047 | 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo', | |
1048 | 'only_matching': True, | |
1049 | }, | |
1050 | { | |
1051 | # Rental video preview | |
1052 | 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg', | |
1053 | 'info_dict': { | |
1054 | 'id': 'uGpuVWrhIzE', | |
1055 | 'ext': 'mp4', | |
1056 | 'title': 'Piku - Trailer', | |
1057 | 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb', | |
1058 | 'upload_date': '20150811', | |
1059 | 'uploader': 'FlixMatrix', | |
1060 | 'uploader_id': 'FlixMatrixKaravan', | |
1061 | 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan', | |
1062 | 'license': 'Standard YouTube License', | |
1063 | }, | |
1064 | 'params': { | |
1065 | 'skip_download': True, | |
1066 | }, | |
1067 | 'skip': 'This video is not available.', | |
1068 | }, | |
1069 | { | |
1070 | # YouTube Red video with episode data | |
1071 | 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4', | |
1072 | 'info_dict': { | |
1073 | 'id': 'iqKdEhx-dD4', | |
1074 | 'ext': 'mp4', | |
1075 | 'title': 'Isolation - Mind Field (Ep 1)', | |
1076 | 'description': 'md5:46a29be4ceffa65b92d277b93f463c0f', | |
1077 | 'duration': 2085, | |
1078 | 'upload_date': '20170118', | |
1079 | 'uploader': 'Vsauce', | |
1080 | 'uploader_id': 'Vsauce', | |
1081 | 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce', | |
1082 | 'series': 'Mind Field', | |
1083 | 'season_number': 1, | |
1084 | 'episode_number': 1, | |
1085 | }, | |
1086 | 'params': { | |
1087 | 'skip_download': True, | |
1088 | }, | |
1089 | 'expected_warnings': [ | |
1090 | 'Skipping DASH manifest', | |
1091 | ], | |
1092 | }, | |
1093 | { | |
1094 | # The following content has been identified by the YouTube community | |
1095 | # as inappropriate or offensive to some audiences. | |
1096 | 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI', | |
1097 | 'info_dict': { | |
1098 | 'id': '6SJNVb0GnPI', | |
1099 | 'ext': 'mp4', | |
1100 | 'title': 'Race Differences in Intelligence', | |
1101 | 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1', | |
1102 | 'duration': 965, | |
1103 | 'upload_date': '20140124', | |
1104 | 'uploader': 'New Century Foundation', | |
1105 | 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg', | |
1106 | 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg', | |
1107 | }, | |
1108 | 'params': { | |
1109 | 'skip_download': True, | |
1110 | }, | |
1111 | }, | |
1112 | { | |
1113 | # itag 212 | |
1114 | 'url': '1t24XAntNCY', | |
1115 | 'only_matching': True, | |
1116 | }, | |
1117 | { | |
1118 | # geo restricted to JP | |
1119 | 'url': 'sJL6WA-aGkQ', | |
1120 | 'only_matching': True, | |
1121 | }, | |
1122 | { | |
1123 | 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM', | |
1124 | 'only_matching': True, | |
1125 | }, | |
1126 | { | |
1127 | 'url': 'https://invidio.us/watch?v=BaW_jenozKc', | |
1128 | 'only_matching': True, | |
1129 | }, | |
1130 | { | |
1131 | # DRM protected | |
1132 | 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc', | |
1133 | 'only_matching': True, | |
1134 | }, | |
1135 | { | |
1136 | # Video with unsupported adaptive stream type formats | |
1137 | 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U', | |
1138 | 'info_dict': { | |
1139 | 'id': 'Z4Vy8R84T1U', | |
1140 | 'ext': 'mp4', | |
1141 | 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta', | |
1142 | 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', | |
1143 | 'duration': 433, | |
1144 | 'upload_date': '20130923', | |
1145 | 'uploader': 'Amelia Putri Harwita', | |
1146 | 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q', | |
1147 | 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q', | |
1148 | 'formats': 'maxcount:10', | |
1149 | }, | |
1150 | 'params': { | |
1151 | 'skip_download': True, | |
1152 | 'youtube_include_dash_manifest': False, | |
1153 | }, | |
1154 | 'skip': 'not actual anymore', | |
1155 | }, | |
1156 | { | |
1157 | # Youtube Music Auto-generated description | |
1158 | 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs', | |
1159 | 'info_dict': { | |
1160 | 'id': 'MgNrAu2pzNs', | |
1161 | 'ext': 'mp4', | |
1162 | 'title': 'Voyeur Girl', | |
1163 | 'description': 'md5:7ae382a65843d6df2685993e90a8628f', | |
1164 | 'upload_date': '20190312', | |
1165 | 'uploader': 'Stephen - Topic', | |
1166 | 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA', | |
1167 | 'artist': 'Stephen', | |
1168 | 'track': 'Voyeur Girl', | |
1169 | 'album': 'it\'s too much love to know my dear', | |
1170 | 'release_date': '20190313', | |
1171 | 'release_year': 2019, | |
1172 | }, | |
1173 | 'params': { | |
1174 | 'skip_download': True, | |
1175 | }, | |
1176 | }, | |
1177 | { | |
1178 | # Youtube Music Auto-generated description | |
1179 | # Retrieve 'artist' field from 'Artist:' in video description | |
1180 | # when it is present on youtube music video | |
1181 | 'url': 'https://www.youtube.com/watch?v=k0jLE7tTwjY', | |
1182 | 'info_dict': { | |
1183 | 'id': 'k0jLE7tTwjY', | |
1184 | 'ext': 'mp4', | |
1185 | 'title': 'Latch Feat. Sam Smith', | |
1186 | 'description': 'md5:3cb1e8101a7c85fcba9b4fb41b951335', | |
1187 | 'upload_date': '20150110', | |
1188 | 'uploader': 'Various Artists - Topic', | |
1189 | 'uploader_id': 'UCNkEcmYdjrH4RqtNgh7BZ9w', | |
1190 | 'artist': 'Disclosure', | |
1191 | 'track': 'Latch Feat. Sam Smith', | |
1192 | 'album': 'Latch Featuring Sam Smith', | |
1193 | 'release_date': '20121008', | |
1194 | 'release_year': 2012, | |
1195 | }, | |
1196 | 'params': { | |
1197 | 'skip_download': True, | |
1198 | }, | |
1199 | }, | |
1200 | { | |
1201 | # Youtube Music Auto-generated description | |
1202 | # handle multiple artists on youtube music video | |
1203 | 'url': 'https://www.youtube.com/watch?v=74qn0eJSjpA', | |
1204 | 'info_dict': { | |
1205 | 'id': '74qn0eJSjpA', | |
1206 | 'ext': 'mp4', | |
1207 | 'title': 'Eastside', | |
1208 | 'description': 'md5:290516bb73dcbfab0dcc4efe6c3de5f2', | |
1209 | 'upload_date': '20180710', | |
1210 | 'uploader': 'Benny Blanco - Topic', | |
1211 | 'uploader_id': 'UCzqz_ksRu_WkIzmivMdIS7A', | |
1212 | 'artist': 'benny blanco, Halsey, Khalid', | |
1213 | 'track': 'Eastside', | |
1214 | 'album': 'Eastside', | |
1215 | 'release_date': '20180713', | |
1216 | 'release_year': 2018, | |
1217 | }, | |
1218 | 'params': { | |
1219 | 'skip_download': True, | |
1220 | }, | |
1221 | }, | |
1222 | { | |
1223 | # Youtube Music Auto-generated description | |
1224 | # handle youtube music video with release_year and no release_date | |
1225 | 'url': 'https://www.youtube.com/watch?v=-hcAI0g-f5M', | |
1226 | 'info_dict': { | |
1227 | 'id': '-hcAI0g-f5M', | |
1228 | 'ext': 'mp4', | |
1229 | 'title': 'Put It On Me', | |
1230 | 'description': 'md5:f6422397c07c4c907c6638e1fee380a5', | |
1231 | 'upload_date': '20180426', | |
1232 | 'uploader': 'Matt Maeson - Topic', | |
1233 | 'uploader_id': 'UCnEkIGqtGcQMLk73Kp-Q5LQ', | |
1234 | 'artist': 'Matt Maeson', | |
1235 | 'track': 'Put It On Me', | |
1236 | 'album': 'The Hearse', | |
1237 | 'release_date': None, | |
1238 | 'release_year': 2018, | |
1239 | }, | |
1240 | 'params': { | |
1241 | 'skip_download': True, | |
1242 | }, | |
1243 | }, | |
1244 | { | |
1245 | 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q', | |
1246 | 'only_matching': True, | |
1247 | }, | |
1248 | { | |
1249 | # invalid -> valid video id redirection | |
1250 | 'url': 'DJztXj2GPfl', | |
1251 | 'info_dict': { | |
1252 | 'id': 'DJztXj2GPfk', | |
1253 | 'ext': 'mp4', | |
1254 | 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)', | |
1255 | 'description': 'md5:bf577a41da97918e94fa9798d9228825', | |
1256 | 'upload_date': '20090125', | |
1257 | 'uploader': 'Prochorowka', | |
1258 | 'uploader_id': 'Prochorowka', | |
1259 | 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka', | |
1260 | 'artist': 'Panjabi MC', | |
1261 | 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix', | |
1262 | 'album': 'Beware of the Boys (Mundian To Bach Ke)', | |
1263 | }, | |
1264 | 'params': { | |
1265 | 'skip_download': True, | |
1266 | }, | |
1267 | } | |
1268 | ] | |
1269 | ||
    def __init__(self, *args, **kwargs):
        super(YoutubeIE, self).__init__(*args, **kwargs)
        # Per-instance cache mapping (player_url, signature-shape id) to the
        # extracted signature-decryption function; see _decrypt_signature().
        self._player_cache = {}
1273 | ||
1274 | def report_video_info_webpage_download(self, video_id): | |
1275 | """Report attempt to download video info webpage.""" | |
1276 | self.to_screen('%s: Downloading video info webpage' % video_id) | |
1277 | ||
1278 | def report_information_extraction(self, video_id): | |
1279 | """Report attempt to extract video information.""" | |
1280 | self.to_screen('%s: Extracting video information' % video_id) | |
1281 | ||
1282 | def report_unavailable_format(self, video_id, format): | |
1283 | """Report extracted video URL.""" | |
1284 | self.to_screen('%s: Format %s not available' % (video_id, format)) | |
1285 | ||
    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen('RTMP download detected')
1289 | ||
1290 | def _signature_cache_id(self, example_sig): | |
1291 | """ Return a string representation of a signature """ | |
1292 | return '.'.join(compat_str(len(part)) for part in example_sig.split('.')) | |
1293 | ||
1294 | @classmethod | |
1295 | def _extract_player_info(cls, player_url): | |
1296 | for player_re in cls._PLAYER_INFO_RE: | |
1297 | id_m = re.search(player_re, player_url) | |
1298 | if id_m: | |
1299 | break | |
1300 | else: | |
1301 | raise ExtractorError('Cannot identify player %r' % player_url) | |
1302 | return id_m.group('ext'), id_m.group('id') | |
1303 | ||
1304 | def _extract_signature_function(self, video_id, player_url, example_sig): | |
1305 | player_type, player_id = self._extract_player_info(player_url) | |
1306 | ||
1307 | # Read from filesystem cache | |
1308 | func_id = '%s_%s_%s' % ( | |
1309 | player_type, player_id, self._signature_cache_id(example_sig)) | |
1310 | assert os.path.basename(func_id) == func_id | |
1311 | ||
1312 | cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id) | |
1313 | if cache_spec is not None: | |
1314 | return lambda s: ''.join(s[i] for i in cache_spec) | |
1315 | ||
1316 | download_note = ( | |
1317 | 'Downloading player %s' % player_url | |
1318 | if self._downloader.params.get('verbose') else | |
1319 | 'Downloading %s player %s' % (player_type, player_id) | |
1320 | ) | |
1321 | if player_type == 'js': | |
1322 | code = self._download_webpage( | |
1323 | player_url, video_id, | |
1324 | note=download_note, | |
1325 | errnote='Download of %s failed' % player_url) | |
1326 | res = self._parse_sig_js(code) | |
1327 | elif player_type == 'swf': | |
1328 | urlh = self._request_webpage( | |
1329 | player_url, video_id, | |
1330 | note=download_note, | |
1331 | errnote='Download of %s failed' % player_url) | |
1332 | code = urlh.read() | |
1333 | res = self._parse_sig_swf(code) | |
1334 | else: | |
1335 | assert False, 'Invalid player type %r' % player_type | |
1336 | ||
1337 | test_string = ''.join(map(compat_chr, range(len(example_sig)))) | |
1338 | cache_res = res(test_string) | |
1339 | cache_spec = [ord(c) for c in cache_res] | |
1340 | ||
1341 | self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec) | |
1342 | return res | |
1343 | ||
    def _print_sig_code(self, func, example_sig):
        # Print Python source code equivalent to the extracted signature
        # function (used by the youtube_print_sig_code option) so the
        # permutation can be inspected or hard-coded manually.
        def gen_sig_code(idxs):
            # Compress a list of source indices into slice expressions
            # wherever consecutive indices differ by a constant step of +/-1.
            def _genslice(start, end, step):
                starts = '' if start == 0 else str(start)
                # 'end' is inclusive here; extend by one step for Python's
                # exclusive slice bound (':' when that would go negative)
                ends = (':%d' % (end + step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            # Quelch pyflakes warnings - start will be set when step is set
            start = '(Never used)'
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    # Inside a run: continue while the step is maintained,
                    # otherwise emit the finished slice and reset.
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    # Start of a new +/-1 run
                    step = i - prev
                    start = prev
                    continue
                else:
                    # Isolated index
                    yield 's[%d]' % prev
            # Flush the final element or the final open run
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        # Probe with distinct code points to recover the index permutation
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                '    return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
1382 | ||
1383 | def _parse_sig_js(self, jscode): | |
1384 | funcname = self._search_regex( | |
1385 | (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', | |
1386 | r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', | |
1387 | r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', | |
1388 | r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', | |
1389 | # Obsolete patterns | |
1390 | r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', | |
1391 | r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(', | |
1392 | r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(', | |
1393 | r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', | |
1394 | r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', | |
1395 | r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', | |
1396 | r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', | |
1397 | r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('), | |
1398 | jscode, 'Initial JS player signature function name', group='sig') | |
1399 | ||
1400 | jsi = JSInterpreter(jscode) | |
1401 | initial_function = jsi.extract_function(funcname) | |
1402 | return lambda s: initial_function([s]) | |
1403 | ||
1404 | def _parse_sig_swf(self, file_contents): | |
1405 | swfi = SWFInterpreter(file_contents) | |
1406 | TARGET_CLASSNAME = 'SignatureDecipher' | |
1407 | searched_class = swfi.extract_class(TARGET_CLASSNAME) | |
1408 | initial_function = swfi.extract_function(searched_class, 'decipher') | |
1409 | return lambda s: initial_function([s]) | |
1410 | ||
1411 | def _decrypt_signature(self, s, video_id, player_url, age_gate=False): | |
1412 | """Turn the encrypted s field into a working signature""" | |
1413 | ||
1414 | if player_url is None: | |
1415 | raise ExtractorError('Cannot decrypt signature without player_url') | |
1416 | ||
1417 | if player_url.startswith('//'): | |
1418 | player_url = 'https:' + player_url | |
1419 | elif not re.match(r'https?://', player_url): | |
1420 | player_url = compat_urlparse.urljoin( | |
1421 | 'https://www.youtube.com', player_url) | |
1422 | try: | |
1423 | player_id = (player_url, self._signature_cache_id(s)) | |
1424 | if player_id not in self._player_cache: | |
1425 | func = self._extract_signature_function( | |
1426 | video_id, player_url, s | |
1427 | ) | |
1428 | self._player_cache[player_id] = func | |
1429 | func = self._player_cache[player_id] | |
1430 | if self._downloader.params.get('youtube_print_sig_code'): | |
1431 | self._print_sig_code(func, s) | |
1432 | return func(s) | |
1433 | except Exception as e: | |
1434 | tb = traceback.format_exc() | |
1435 | raise ExtractorError( | |
1436 | 'Signature extraction failed: ' + tb, cause=e) | |
1437 | ||
1438 | def _get_subtitles(self, video_id, webpage): | |
1439 | try: | |
1440 | subs_doc = self._download_xml( | |
1441 | 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, | |
1442 | video_id, note=False) | |
1443 | except ExtractorError as err: | |
1444 | self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err)) | |
1445 | return {} | |
1446 | ||
1447 | sub_lang_list = {} | |
1448 | for track in subs_doc.findall('track'): | |
1449 | lang = track.attrib['lang_code'] | |
1450 | if lang in sub_lang_list: | |
1451 | continue | |
1452 | sub_formats = [] | |
1453 | for ext in self._SUBTITLE_FORMATS: | |
1454 | params = compat_urllib_parse_urlencode({ | |
1455 | 'lang': lang, | |
1456 | 'v': video_id, | |
1457 | 'fmt': ext, | |
1458 | 'name': track.attrib['name'].encode('utf-8'), | |
1459 | }) | |
1460 | sub_formats.append({ | |
1461 | 'url': 'https://www.youtube.com/api/timedtext?' + params, | |
1462 | 'ext': ext, | |
1463 | }) | |
1464 | sub_lang_list[lang] = sub_formats | |
1465 | # TODO check that live chat replay actually exists | |
1466 | sub_lang_list['live_chat'] = [ | |
1467 | { | |
1468 | 'video_id': video_id, | |
1469 | 'ext': 'json', | |
1470 | 'protocol': 'youtube_live_chat_replay', | |
1471 | }, | |
1472 | ] | |
1473 | if not sub_lang_list: | |
1474 | self._downloader.report_warning('video doesn\'t have subtitles') | |
1475 | return {} | |
1476 | return sub_lang_list | |
1477 | ||
1478 | def _get_ytplayer_config(self, video_id, webpage): | |
1479 | patterns = ( | |
1480 | # User data may contain arbitrary character sequences that may affect | |
1481 | # JSON extraction with regex, e.g. when '};' is contained the second | |
1482 | # regex won't capture the whole JSON. Yet working around by trying more | |
1483 | # concrete regex first keeping in mind proper quoted string handling | |
1484 | # to be implemented in future that will replace this workaround (see | |
1485 | # https://github.com/ytdl-org/youtube-dl/issues/7468, | |
1486 | # https://github.com/ytdl-org/youtube-dl/pull/7599) | |
1487 | r';ytplayer\.config\s*=\s*({.+?});ytplayer', | |
1488 | r';ytplayer\.config\s*=\s*({.+?});', | |
1489 | ) | |
1490 | config = self._search_regex( | |
1491 | patterns, webpage, 'ytplayer.config', default=None) | |
1492 | if config: | |
1493 | return self._parse_json( | |
1494 | uppercase_escape(config), video_id, fatal=False) | |
1495 | ||
    def _get_automatic_captions(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process.

        Returns a dict mapping language codes to lists of subtitle format
        dicts, or {} (with a warning) when automatic captions are unavailable.
        Three historical data layouts are tried in turn: 'ttsurl', the
        player_response JSON, and the legacy 'caption_tracks' fields.
        """
        self.to_screen('%s: Looking for automatic captions' % video_id)
        player_config = self._get_ytplayer_config(video_id, webpage)
        err_msg = 'Couldn\'t find automatic captions for %s' % video_id
        if not player_config:
            self._downloader.report_warning(err_msg)
            return {}
        try:
            args = player_config['args']
            caption_url = args.get('ttsurl')
            if caption_url:
                timestamp = args['timestamp']
                # We get the available subtitles
                list_params = compat_urllib_parse_urlencode({
                    'type': 'list',
                    'tlangs': 1,
                    'asrs': 1,
                })
                list_url = caption_url + '&' + list_params
                caption_list = self._download_xml(list_url, video_id)
                original_lang_node = caption_list.find('track')
                if original_lang_node is None:
                    self._downloader.report_warning('Video doesn\'t have automatic captions')
                    return {}
                original_lang = original_lang_node.attrib['lang_code']
                caption_kind = original_lang_node.attrib.get('kind', '')

                # One URL per (target language, subtitle format) combination.
                sub_lang_list = {}
                for lang_node in caption_list.findall('target'):
                    sub_lang = lang_node.attrib['lang_code']
                    sub_formats = []
                    for ext in self._SUBTITLE_FORMATS:
                        params = compat_urllib_parse_urlencode({
                            'lang': original_lang,
                            'tlang': sub_lang,
                            'fmt': ext,
                            'ts': timestamp,
                            'kind': caption_kind,
                        })
                        sub_formats.append({
                            'url': caption_url + '&' + params,
                            'ext': ext,
                        })
                    sub_lang_list[sub_lang] = sub_formats
                return sub_lang_list

            def make_captions(sub_url, sub_langs):
                # Rewrite sub_url's query string for each language/format pair.
                parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
                caption_qs = compat_parse_qs(parsed_sub_url.query)
                captions = {}
                for sub_lang in sub_langs:
                    sub_formats = []
                    for ext in self._SUBTITLE_FORMATS:
                        caption_qs.update({
                            'tlang': [sub_lang],
                            'fmt': [ext],
                        })
                        sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace(
                            query=compat_urllib_parse_urlencode(caption_qs, True)))
                        sub_formats.append({
                            'url': sub_url,
                            'ext': ext,
                        })
                    captions[sub_lang] = sub_formats
                return captions

            # New captions format as of 22.06.2017
            player_response = args.get('player_response')
            if player_response and isinstance(player_response, compat_str):
                player_response = self._parse_json(
                    player_response, video_id, fatal=False)
                if player_response:
                    renderer = player_response['captions']['playerCaptionsTracklistRenderer']
                    base_url = renderer['captionTracks'][0]['baseUrl']
                    sub_lang_list = []
                    for lang in renderer['translationLanguages']:
                        lang_code = lang.get('languageCode')
                        if lang_code:
                            sub_lang_list.append(lang_code)
                    return make_captions(base_url, sub_lang_list)

            # Some videos don't provide ttsurl but rather caption_tracks and
            # caption_translation_languages (e.g. 20LmZk1hakA)
            # Does not used anymore as of 22.06.2017
            caption_tracks = args['caption_tracks']
            caption_translation_languages = args['caption_translation_languages']
            caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
            sub_lang_list = []
            for lang in caption_translation_languages.split(','):
                # Unquote before parsing: each comma-separated entry is itself
                # a percent-encoded query string.
                lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
                sub_lang = lang_qs.get('lc', [None])[0]
                if sub_lang:
                    sub_lang_list.append(sub_lang)
            return make_captions(caption_url, sub_lang_list)
        # An extractor error can be raise by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, IndexError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
1597 | ||
1598 | def _mark_watched(self, video_id, video_info, player_response): | |
1599 | playback_url = url_or_none(try_get( | |
1600 | player_response, | |
1601 | lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']) or try_get( | |
1602 | video_info, lambda x: x['videostats_playback_base_url'][0])) | |
1603 | if not playback_url: | |
1604 | return | |
1605 | parsed_playback_url = compat_urlparse.urlparse(playback_url) | |
1606 | qs = compat_urlparse.parse_qs(parsed_playback_url.query) | |
1607 | ||
1608 | # cpn generation algorithm is reverse engineered from base.js. | |
1609 | # In fact it works even with dummy cpn. | |
1610 | CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_' | |
1611 | cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16))) | |
1612 | ||
1613 | qs.update({ | |
1614 | 'ver': ['2'], | |
1615 | 'cpn': [cpn], | |
1616 | }) | |
1617 | playback_url = compat_urlparse.urlunparse( | |
1618 | parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True))) | |
1619 | ||
1620 | self._download_webpage( | |
1621 | playback_url, video_id, 'Marking watched', | |
1622 | 'Unable to mark watched', fatal=False) | |
1623 | ||
1624 | @staticmethod | |
1625 | def _extract_urls(webpage): | |
1626 | # Embedded YouTube player | |
1627 | entries = [ | |
1628 | unescapeHTML(mobj.group('url')) | |
1629 | for mobj in re.finditer(r'''(?x) | |
1630 | (?: | |
1631 | <iframe[^>]+?src=| | |
1632 | data-video-url=| | |
1633 | <embed[^>]+?src=| | |
1634 | embedSWF\(?:\s*| | |
1635 | <object[^>]+data=| | |
1636 | new\s+SWFObject\( | |
1637 | ) | |
1638 | (["\']) | |
1639 | (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/ | |
1640 | (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?) | |
1641 | \1''', webpage)] | |
1642 | ||
1643 | # lazyYT YouTube embed | |
1644 | entries.extend(list(map( | |
1645 | unescapeHTML, | |
1646 | re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)))) | |
1647 | ||
1648 | # Wordpress "YouTube Video Importer" plugin | |
1649 | matches = re.findall(r'''(?x)<div[^>]+ | |
1650 | class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+ | |
1651 | data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage) | |
1652 | entries.extend(m[-1] for m in matches) | |
1653 | ||
1654 | return entries | |
1655 | ||
1656 | @staticmethod | |
1657 | def _extract_url(webpage): | |
1658 | urls = YoutubeIE._extract_urls(webpage) | |
1659 | return urls[0] if urls else None | |
1660 | ||
1661 | @classmethod | |
1662 | def extract_id(cls, url): | |
1663 | mobj = re.match(cls._VALID_URL, url, re.VERBOSE) | |
1664 | if mobj is None: | |
1665 | raise ExtractorError('Invalid URL: %s' % url) | |
1666 | video_id = mobj.group(2) | |
1667 | return video_id | |
1668 | ||
    def _extract_chapters_from_json(self, webpage, video_id, duration):
        """Extract chapters from the RELATED_PLAYER_ARGS JSON in the watch page.

        Returns a list of {'start_time', 'end_time', 'title'} dicts, or None
        when no usable chapter data is present.
        """
        if not webpage:
            return
        player = self._parse_json(
            self._search_regex(
                r'RELATED_PLAYER_ARGS["\']\s*:\s*({.+})\s*,?\s*\n', webpage,
                'player args', default='{}'),
            video_id, fatal=False)
        if not player or not isinstance(player, dict):
            return
        # watch_next_response is itself a JSON string embedded in the args.
        watch_next_response = player.get('watch_next_response')
        if not isinstance(watch_next_response, compat_str):
            return
        response = self._parse_json(watch_next_response, video_id, fatal=False)
        if not response or not isinstance(response, dict):
            return
        # Chapter markers live deep inside the player-overlay renderer tree.
        chapters_list = try_get(
            response,
            lambda x: x['playerOverlays']
            ['playerOverlayRenderer']
            ['decoratedPlayerBarRenderer']
            ['decoratedPlayerBarRenderer']
            ['playerBar']
            ['chapteredPlayerBarRenderer']
            ['chapters'],
            list)
        if not chapters_list:
            return

        def chapter_time(chapter):
            # Start time is given in milliseconds; convert to seconds.
            return float_or_none(
                try_get(
                    chapter,
                    lambda x: x['chapterRenderer']['timeRangeStartMillis'],
                    int),
                scale=1000)
        chapters = []
        for next_num, chapter in enumerate(chapters_list, start=1):
            start_time = chapter_time(chapter)
            if start_time is None:
                continue
            # A chapter ends where the next one starts; the last chapter ends
            # at the video duration.
            end_time = (chapter_time(chapters_list[next_num])
                        if next_num < len(chapters_list) else duration)
            if end_time is None:
                continue
            title = try_get(
                chapter, lambda x: x['chapterRenderer']['title']['simpleText'],
                compat_str)
            chapters.append({
                'start_time': start_time,
                'end_time': end_time,
                'title': title,
            })
        return chapters
1723 | ||
1724 | @staticmethod | |
1725 | def _extract_chapters_from_description(description, duration): | |
1726 | if not description: | |
1727 | return None | |
1728 | chapter_lines = re.findall( | |
1729 | r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)', | |
1730 | description) | |
1731 | if not chapter_lines: | |
1732 | return None | |
1733 | chapters = [] | |
1734 | for next_num, (chapter_line, time_point) in enumerate( | |
1735 | chapter_lines, start=1): | |
1736 | start_time = parse_duration(time_point) | |
1737 | if start_time is None: | |
1738 | continue | |
1739 | if start_time > duration: | |
1740 | break | |
1741 | end_time = (duration if next_num == len(chapter_lines) | |
1742 | else parse_duration(chapter_lines[next_num][1])) | |
1743 | if end_time is None: | |
1744 | continue | |
1745 | if end_time > duration: | |
1746 | end_time = duration | |
1747 | if start_time > end_time: | |
1748 | break | |
1749 | chapter_title = re.sub( | |
1750 | r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-') | |
1751 | chapter_title = re.sub(r'\s+', ' ', chapter_title) | |
1752 | chapters.append({ | |
1753 | 'start_time': start_time, | |
1754 | 'end_time': end_time, | |
1755 | 'title': chapter_title, | |
1756 | }) | |
1757 | return chapters | |
1758 | ||
1759 | def _extract_chapters(self, webpage, description, video_id, duration): | |
1760 | return (self._extract_chapters_from_json(webpage, video_id, duration) | |
1761 | or self._extract_chapters_from_description(description, duration)) | |
1762 | ||
1763 | def _real_extract(self, url): | |
1764 | url, smuggled_data = unsmuggle_url(url, {}) | |
1765 | ||
1766 | proto = ( | |
1767 | 'http' if self._downloader.params.get('prefer_insecure', False) | |
1768 | else 'https') | |
1769 | ||
1770 | start_time = None | |
1771 | end_time = None | |
1772 | parsed_url = compat_urllib_parse_urlparse(url) | |
1773 | for component in [parsed_url.fragment, parsed_url.query]: | |
1774 | query = compat_parse_qs(component) | |
1775 | if start_time is None and 't' in query: | |
1776 | start_time = parse_duration(query['t'][0]) | |
1777 | if start_time is None and 'start' in query: | |
1778 | start_time = parse_duration(query['start'][0]) | |
1779 | if end_time is None and 'end' in query: | |
1780 | end_time = parse_duration(query['end'][0]) | |
1781 | ||
1782 | # Extract original video URL from URL with redirection, like age verification, using next_url parameter | |
1783 | mobj = re.search(self._NEXT_URL_RE, url) | |
1784 | if mobj: | |
1785 | url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/') | |
1786 | video_id = self.extract_id(url) | |
1787 | ||
1788 | # Get video webpage | |
1789 | url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id | |
1790 | video_webpage, urlh = self._download_webpage_handle(url, video_id) | |
1791 | ||
1792 | qs = compat_parse_qs(compat_urllib_parse_urlparse(urlh.geturl()).query) | |
1793 | video_id = qs.get('v', [None])[0] or video_id | |
1794 | ||
1795 | # Attempt to extract SWF player URL | |
1796 | mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage) | |
1797 | if mobj is not None: | |
1798 | player_url = re.sub(r'\\(.)', r'\1', mobj.group(1)) | |
1799 | else: | |
1800 | player_url = None | |
1801 | ||
1802 | dash_mpds = [] | |
1803 | ||
1804 | def add_dash_mpd(video_info): | |
1805 | dash_mpd = video_info.get('dashmpd') | |
1806 | if dash_mpd and dash_mpd[0] not in dash_mpds: | |
1807 | dash_mpds.append(dash_mpd[0]) | |
1808 | ||
1809 | def add_dash_mpd_pr(pl_response): | |
1810 | dash_mpd = url_or_none(try_get( | |
1811 | pl_response, lambda x: x['streamingData']['dashManifestUrl'], | |
1812 | compat_str)) | |
1813 | if dash_mpd and dash_mpd not in dash_mpds: | |
1814 | dash_mpds.append(dash_mpd) | |
1815 | ||
1816 | is_live = None | |
1817 | view_count = None | |
1818 | ||
1819 | def extract_view_count(v_info): | |
1820 | return int_or_none(try_get(v_info, lambda x: x['view_count'][0])) | |
1821 | ||
1822 | def extract_player_response(player_response, video_id): | |
1823 | pl_response = str_or_none(player_response) | |
1824 | if not pl_response: | |
1825 | return | |
1826 | pl_response = self._parse_json(pl_response, video_id, fatal=False) | |
1827 | if isinstance(pl_response, dict): | |
1828 | add_dash_mpd_pr(pl_response) | |
1829 | return pl_response | |
1830 | ||
1831 | player_response = {} | |
1832 | ||
1833 | # Get video info | |
1834 | video_info = {} | |
1835 | embed_webpage = None | |
1836 | if re.search(r'player-age-gate-content">', video_webpage) is not None: | |
1837 | age_gate = True | |
1838 | # We simulate the access to the video from www.youtube.com/v/{video_id} | |
1839 | # this can be viewed without login into Youtube | |
1840 | url = proto + '://www.youtube.com/embed/%s' % video_id | |
1841 | embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage') | |
1842 | data = compat_urllib_parse_urlencode({ | |
1843 | 'video_id': video_id, | |
1844 | 'eurl': 'https://youtube.googleapis.com/v/' + video_id, | |
1845 | 'sts': self._search_regex( | |
1846 | r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''), | |
1847 | }) | |
1848 | video_info_url = proto + '://www.youtube.com/get_video_info?' + data | |
1849 | try: | |
1850 | video_info_webpage = self._download_webpage( | |
1851 | video_info_url, video_id, | |
1852 | note='Refetching age-gated info webpage', | |
1853 | errnote='unable to download video info webpage') | |
1854 | except ExtractorError: | |
1855 | video_info_webpage = None | |
1856 | if video_info_webpage: | |
1857 | video_info = compat_parse_qs(video_info_webpage) | |
1858 | pl_response = video_info.get('player_response', [None])[0] | |
1859 | player_response = extract_player_response(pl_response, video_id) | |
1860 | add_dash_mpd(video_info) | |
1861 | view_count = extract_view_count(video_info) | |
1862 | else: | |
1863 | age_gate = False | |
1864 | # Try looking directly into the video webpage | |
1865 | ytplayer_config = self._get_ytplayer_config(video_id, video_webpage) | |
1866 | if ytplayer_config: | |
1867 | args = ytplayer_config['args'] | |
1868 | if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'): | |
1869 | # Convert to the same format returned by compat_parse_qs | |
1870 | video_info = dict((k, [v]) for k, v in args.items()) | |
1871 | add_dash_mpd(video_info) | |
1872 | # Rental video is not rented but preview is available (e.g. | |
1873 | # https://www.youtube.com/watch?v=yYr8q0y5Jfg, | |
1874 | # https://github.com/ytdl-org/youtube-dl/issues/10532) | |
1875 | if not video_info and args.get('ypc_vid'): | |
1876 | return self.url_result( | |
1877 | args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid']) | |
1878 | if args.get('livestream') == '1' or args.get('live_playback') == 1: | |
1879 | is_live = True | |
1880 | if not player_response: | |
1881 | player_response = extract_player_response(args.get('player_response'), video_id) | |
1882 | if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): | |
1883 | add_dash_mpd_pr(player_response) | |
1884 | ||
1885 | def extract_unavailable_message(): | |
1886 | messages = [] | |
1887 | for tag, kind in (('h1', 'message'), ('div', 'submessage')): | |
1888 | msg = self._html_search_regex( | |
1889 | r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)</{tag}>'.format(tag=tag, kind=kind), | |
1890 | video_webpage, 'unavailable %s' % kind, default=None) | |
1891 | if msg: | |
1892 | messages.append(msg) | |
1893 | if messages: | |
1894 | return '\n'.join(messages) | |
1895 | ||
1896 | if not video_info and not player_response: | |
1897 | unavailable_message = extract_unavailable_message() | |
1898 | if not unavailable_message: | |
1899 | unavailable_message = 'Unable to extract video data' | |
1900 | raise ExtractorError( | |
1901 | 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id) | |
1902 | ||
1903 | if not isinstance(video_info, dict): | |
1904 | video_info = {} | |
1905 | ||
1906 | video_details = try_get( | |
1907 | player_response, lambda x: x['videoDetails'], dict) or {} | |
1908 | ||
1909 | microformat = try_get( | |
1910 | player_response, lambda x: x['microformat']['playerMicroformatRenderer'], dict) or {} | |
1911 | ||
1912 | video_title = video_info.get('title', [None])[0] or video_details.get('title') | |
1913 | if not video_title: | |
1914 | self._downloader.report_warning('Unable to extract video title') | |
1915 | video_title = '_' | |
1916 | ||
1917 | description_original = video_description = get_element_by_id("eow-description", video_webpage) | |
1918 | if video_description: | |
1919 | ||
1920 | def replace_url(m): | |
1921 | redir_url = compat_urlparse.urljoin(url, m.group(1)) | |
1922 | parsed_redir_url = compat_urllib_parse_urlparse(redir_url) | |
1923 | if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect': | |
1924 | qs = compat_parse_qs(parsed_redir_url.query) | |
1925 | q = qs.get('q') | |
1926 | if q and q[0]: | |
1927 | return q[0] | |
1928 | return redir_url | |
1929 | ||
1930 | description_original = video_description = re.sub(r'''(?x) | |
1931 | <a\s+ | |
1932 | (?:[a-zA-Z-]+="[^"]*"\s+)*? | |
1933 | (?:title|href)="([^"]+)"\s+ | |
1934 | (?:[a-zA-Z-]+="[^"]*"\s+)*? | |
1935 | class="[^"]*"[^>]*> | |
1936 | [^<]+\.{3}\s* | |
1937 | </a> | |
1938 | ''', replace_url, video_description) | |
1939 | video_description = clean_html(video_description) | |
1940 | else: | |
1941 | video_description = video_details.get('shortDescription') or self._html_search_meta('description', video_webpage) | |
1942 | ||
1943 | if not smuggled_data.get('force_singlefeed', False): | |
1944 | if not self._downloader.params.get('noplaylist'): | |
1945 | multifeed_metadata_list = try_get( | |
1946 | player_response, | |
1947 | lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'], | |
1948 | compat_str) or try_get( | |
1949 | video_info, lambda x: x['multifeed_metadata_list'][0], compat_str) | |
1950 | if multifeed_metadata_list: | |
1951 | entries = [] | |
1952 | feed_ids = [] | |
1953 | for feed in multifeed_metadata_list.split(','): | |
1954 | # Unquote should take place before split on comma (,) since textual | |
1955 | # fields may contain comma as well (see | |
1956 | # https://github.com/ytdl-org/youtube-dl/issues/8536) | |
1957 | feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed)) | |
1958 | ||
1959 | def feed_entry(name): | |
1960 | return try_get(feed_data, lambda x: x[name][0], compat_str) | |
1961 | ||
1962 | feed_id = feed_entry('id') | |
1963 | if not feed_id: | |
1964 | continue | |
1965 | feed_title = feed_entry('title') | |
1966 | title = video_title | |
1967 | if feed_title: | |
1968 | title += ' (%s)' % feed_title | |
1969 | entries.append({ | |
1970 | '_type': 'url_transparent', | |
1971 | 'ie_key': 'Youtube', | |
1972 | 'url': smuggle_url( | |
1973 | '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]), | |
1974 | {'force_singlefeed': True}), | |
1975 | 'title': title, | |
1976 | }) | |
1977 | feed_ids.append(feed_id) | |
1978 | self.to_screen( | |
1979 | 'Downloading multifeed video (%s) - add --no-playlist to just download video %s' | |
1980 | % (', '.join(feed_ids), video_id)) | |
1981 | return self.playlist_result(entries, video_id, video_title, video_description) | |
1982 | else: | |
1983 | self.to_screen('Downloading just video %s because of --no-playlist' % video_id) | |
1984 | ||
1985 | if view_count is None: | |
1986 | view_count = extract_view_count(video_info) | |
1987 | if view_count is None and video_details: | |
1988 | view_count = int_or_none(video_details.get('viewCount')) | |
1989 | if view_count is None and microformat: | |
1990 | view_count = int_or_none(microformat.get('viewCount')) | |
1991 | ||
1992 | if is_live is None: | |
1993 | is_live = bool_or_none(video_details.get('isLive')) | |
1994 | ||
1995 | # Check for "rental" videos | |
1996 | if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: | |
1997 | raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True) | |
1998 | ||
1999 | def _extract_filesize(media_url): | |
2000 | return int_or_none(self._search_regex( | |
2001 | r'\bclen[=/](\d+)', media_url, 'filesize', default=None)) | |
2002 | ||
2003 | streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or [] | |
2004 | streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or []) | |
2005 | ||
2006 | if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'): | |
2007 | self.report_rtmp_download() | |
2008 | formats = [{ | |
2009 | 'format_id': '_rtmp', | |
2010 | 'protocol': 'rtmp', | |
2011 | 'url': video_info['conn'][0], | |
2012 | 'player_url': player_url, | |
2013 | }] | |
2014 | elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1): | |
2015 | encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0] | |
2016 | if 'rtmpe%3Dyes' in encoded_url_map: | |
2017 | raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True) | |
2018 | formats = [] | |
2019 | formats_spec = {} | |
2020 | fmt_list = video_info.get('fmt_list', [''])[0] | |
2021 | if fmt_list: | |
2022 | for fmt in fmt_list.split(','): | |
2023 | spec = fmt.split('/') | |
2024 | if len(spec) > 1: | |
2025 | width_height = spec[1].split('x') | |
2026 | if len(width_height) == 2: | |
2027 | formats_spec[spec[0]] = { | |
2028 | 'resolution': spec[1], | |
2029 | 'width': int_or_none(width_height[0]), | |
2030 | 'height': int_or_none(width_height[1]), | |
2031 | } | |
2032 | for fmt in streaming_formats: | |
2033 | itag = str_or_none(fmt.get('itag')) | |
2034 | if not itag: | |
2035 | continue | |
2036 | quality = fmt.get('quality') | |
2037 | quality_label = fmt.get('qualityLabel') or quality | |
2038 | formats_spec[itag] = { | |
2039 | 'asr': int_or_none(fmt.get('audioSampleRate')), | |
2040 | 'filesize': int_or_none(fmt.get('contentLength')), | |
2041 | 'format_note': quality_label, | |
2042 | 'fps': int_or_none(fmt.get('fps')), | |
2043 | 'height': int_or_none(fmt.get('height')), | |
2044 | # bitrate for itag 43 is always 2147483647 | |
2045 | 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None, | |
2046 | 'width': int_or_none(fmt.get('width')), | |
2047 | } | |
2048 | ||
2049 | for fmt in streaming_formats: | |
2050 | if fmt.get('drmFamilies') or fmt.get('drm_families'): | |
2051 | continue | |
2052 | url = url_or_none(fmt.get('url')) | |
2053 | ||
2054 | if not url: | |
2055 | cipher = fmt.get('cipher') or fmt.get('signatureCipher') | |
2056 | if not cipher: | |
2057 | continue | |
2058 | url_data = compat_parse_qs(cipher) | |
2059 | url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str)) | |
2060 | if not url: | |
2061 | continue | |
2062 | else: | |
2063 | cipher = None | |
2064 | url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query) | |
2065 | ||
2066 | stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0])) | |
2067 | # Unsupported FORMAT_STREAM_TYPE_OTF | |
2068 | if stream_type == 3: | |
2069 | continue | |
2070 | ||
2071 | format_id = fmt.get('itag') or url_data['itag'][0] | |
2072 | if not format_id: | |
2073 | continue | |
2074 | format_id = compat_str(format_id) | |
2075 | ||
2076 | if cipher: | |
2077 | if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True): | |
2078 | ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")' | |
2079 | jsplayer_url_json = self._search_regex( | |
2080 | ASSETS_RE, | |
2081 | embed_webpage if age_gate else video_webpage, | |
2082 | 'JS player URL (1)', default=None) | |
2083 | if not jsplayer_url_json and not age_gate: | |
2084 | # We need the embed website after all | |
2085 | if embed_webpage is None: | |
2086 | embed_url = proto + '://www.youtube.com/embed/%s' % video_id | |
2087 | embed_webpage = self._download_webpage( | |
2088 | embed_url, video_id, 'Downloading embed webpage') | |
2089 | jsplayer_url_json = self._search_regex( | |
2090 | ASSETS_RE, embed_webpage, 'JS player URL') | |
2091 | ||
2092 | player_url = json.loads(jsplayer_url_json) | |
2093 | if player_url is None: | |
2094 | player_url_json = self._search_regex( | |
2095 | r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")', | |
2096 | video_webpage, 'age gate player URL') | |
2097 | player_url = json.loads(player_url_json) | |
2098 | ||
2099 | if 'sig' in url_data: | |
2100 | url += '&signature=' + url_data['sig'][0] | |
2101 | elif 's' in url_data: | |
2102 | encrypted_sig = url_data['s'][0] | |
2103 | ||
2104 | if self._downloader.params.get('verbose'): | |
2105 | if player_url is None: | |
2106 | player_desc = 'unknown' | |
2107 | else: | |
2108 | player_type, player_version = self._extract_player_info(player_url) | |
2109 | player_desc = '%s player %s' % ('flash' if player_type == 'swf' else 'html5', player_version) | |
2110 | parts_sizes = self._signature_cache_id(encrypted_sig) | |
2111 | self.to_screen('{%s} signature length %s, %s' % | |
2112 | (format_id, parts_sizes, player_desc)) | |
2113 | ||
2114 | signature = self._decrypt_signature( | |
2115 | encrypted_sig, video_id, player_url, age_gate) | |
2116 | sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature' | |
2117 | url += '&%s=%s' % (sp, signature) | |
2118 | if 'ratebypass' not in url: | |
2119 | url += '&ratebypass=yes' | |
2120 | ||
2121 | dct = { | |
2122 | 'format_id': format_id, | |
2123 | 'url': url, | |
2124 | 'player_url': player_url, | |
2125 | } | |
2126 | if format_id in self._formats: | |
2127 | dct.update(self._formats[format_id]) | |
2128 | if format_id in formats_spec: | |
2129 | dct.update(formats_spec[format_id]) | |
2130 | ||
2131 | # Some itags are not included in DASH manifest thus corresponding formats will | |
2132 | # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993). | |
2133 | # Trying to extract metadata from url_encoded_fmt_stream_map entry. | |
2134 | mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0]) | |
2135 | width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None) | |
2136 | ||
2137 | if width is None: | |
2138 | width = int_or_none(fmt.get('width')) | |
2139 | if height is None: | |
2140 | height = int_or_none(fmt.get('height')) | |
2141 | ||
2142 | filesize = int_or_none(url_data.get( | |
2143 | 'clen', [None])[0]) or _extract_filesize(url) | |
2144 | ||
2145 | quality = url_data.get('quality', [None])[0] or fmt.get('quality') | |
2146 | quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel') | |
2147 | ||
2148 | tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000) | |
2149 | or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None | |
2150 | fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps')) | |
2151 | ||
2152 | more_fields = { | |
2153 | 'filesize': filesize, | |
2154 | 'tbr': tbr, | |
2155 | 'width': width, | |
2156 | 'height': height, | |
2157 | 'fps': fps, | |
2158 | 'format_note': quality_label or quality, | |
2159 | } | |
2160 | for key, value in more_fields.items(): | |
2161 | if value: | |
2162 | dct[key] = value | |
2163 | type_ = url_data.get('type', [None])[0] or fmt.get('mimeType') | |
2164 | if type_: | |
2165 | type_split = type_.split(';') | |
2166 | kind_ext = type_split[0].split('/') | |
2167 | if len(kind_ext) == 2: | |
2168 | kind, _ = kind_ext | |
2169 | dct['ext'] = mimetype2ext(type_split[0]) | |
2170 | if kind in ('audio', 'video'): | |
2171 | codecs = None | |
2172 | for mobj in re.finditer( | |
2173 | r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_): | |
2174 | if mobj.group('key') == 'codecs': | |
2175 | codecs = mobj.group('val') | |
2176 | break | |
2177 | if codecs: | |
2178 | dct.update(parse_codecs(codecs)) | |
2179 | if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none': | |
2180 | dct['downloader_options'] = { | |
2181 | # Youtube throttles chunks >~10M | |
2182 | 'http_chunk_size': 10485760, | |
2183 | } | |
2184 | formats.append(dct) | |
2185 | else: | |
2186 | manifest_url = ( | |
2187 | url_or_none(try_get( | |
2188 | player_response, | |
2189 | lambda x: x['streamingData']['hlsManifestUrl'], | |
2190 | compat_str)) | |
2191 | or url_or_none(try_get( | |
2192 | video_info, lambda x: x['hlsvp'][0], compat_str))) | |
2193 | if manifest_url: | |
2194 | formats = [] | |
2195 | m3u8_formats = self._extract_m3u8_formats( | |
2196 | manifest_url, video_id, 'mp4', fatal=False) | |
2197 | for a_format in m3u8_formats: | |
2198 | itag = self._search_regex( | |
2199 | r'/itag/(\d+)/', a_format['url'], 'itag', default=None) | |
2200 | if itag: | |
2201 | a_format['format_id'] = itag | |
2202 | if itag in self._formats: | |
2203 | dct = self._formats[itag].copy() | |
2204 | dct.update(a_format) | |
2205 | a_format = dct | |
2206 | a_format['player_url'] = player_url | |
2207 | # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming | |
2208 | a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True' | |
2209 | formats.append(a_format) | |
2210 | else: | |
2211 | error_message = extract_unavailable_message() | |
2212 | if not error_message: | |
2213 | error_message = clean_html(try_get( | |
2214 | player_response, lambda x: x['playabilityStatus']['reason'], | |
2215 | compat_str)) | |
2216 | if not error_message: | |
2217 | error_message = clean_html( | |
2218 | try_get(video_info, lambda x: x['reason'][0], compat_str)) | |
2219 | if error_message: | |
2220 | raise ExtractorError(error_message, expected=True) | |
2221 | raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info') | |
2222 | ||
2223 | # uploader | |
2224 | video_uploader = try_get( | |
2225 | video_info, lambda x: x['author'][0], | |
2226 | compat_str) or str_or_none(video_details.get('author')) | |
2227 | if video_uploader: | |
2228 | video_uploader = compat_urllib_parse_unquote_plus(video_uploader) | |
2229 | else: | |
2230 | self._downloader.report_warning('unable to extract uploader name') | |
2231 | ||
2232 | # uploader_id | |
2233 | video_uploader_id = None | |
2234 | video_uploader_url = None | |
2235 | mobj = re.search( | |
2236 | r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">', | |
2237 | video_webpage) | |
2238 | if mobj is not None: | |
2239 | video_uploader_id = mobj.group('uploader_id') | |
2240 | video_uploader_url = mobj.group('uploader_url') | |
2241 | else: | |
2242 | owner_profile_url = url_or_none(microformat.get('ownerProfileUrl')) | |
2243 | if owner_profile_url: | |
2244 | video_uploader_id = self._search_regex( | |
2245 | r'(?:user|channel)/([^/]+)', owner_profile_url, 'uploader id', | |
2246 | default=None) | |
2247 | video_uploader_url = owner_profile_url | |
2248 | ||
2249 | channel_id = ( | |
2250 | str_or_none(video_details.get('channelId')) | |
2251 | or self._html_search_meta( | |
2252 | 'channelId', video_webpage, 'channel id', default=None) | |
2253 | or self._search_regex( | |
2254 | r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1', | |
2255 | video_webpage, 'channel id', default=None, group='id')) | |
2256 | channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None | |
2257 | ||
2258 | thumbnails = [] | |
2259 | thumbnails_list = try_get( | |
2260 | video_details, lambda x: x['thumbnail']['thumbnails'], list) or [] | |
2261 | for t in thumbnails_list: | |
2262 | if not isinstance(t, dict): | |
2263 | continue | |
2264 | thumbnail_url = url_or_none(t.get('url')) | |
2265 | if not thumbnail_url: | |
2266 | continue | |
2267 | thumbnails.append({ | |
2268 | 'url': thumbnail_url, | |
2269 | 'width': int_or_none(t.get('width')), | |
2270 | 'height': int_or_none(t.get('height')), | |
2271 | }) | |
2272 | ||
2273 | if not thumbnails: | |
2274 | video_thumbnail = None | |
2275 | # We try first to get a high quality image: | |
2276 | m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">', | |
2277 | video_webpage, re.DOTALL) | |
2278 | if m_thumb is not None: | |
2279 | video_thumbnail = m_thumb.group(1) | |
2280 | thumbnail_url = try_get(video_info, lambda x: x['thumbnail_url'][0], compat_str) | |
2281 | if thumbnail_url: | |
2282 | video_thumbnail = compat_urllib_parse_unquote_plus(thumbnail_url) | |
2283 | if video_thumbnail: | |
2284 | thumbnails.append({'url': video_thumbnail}) | |
2285 | ||
2286 | # upload date | |
2287 | upload_date = self._html_search_meta( | |
2288 | 'datePublished', video_webpage, 'upload date', default=None) | |
2289 | if not upload_date: | |
2290 | upload_date = self._search_regex( | |
2291 | [r'(?s)id="eow-date.*?>(.*?)</span>', | |
2292 | r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'], | |
2293 | video_webpage, 'upload date', default=None) | |
2294 | if not upload_date: | |
2295 | upload_date = microformat.get('publishDate') or microformat.get('uploadDate') | |
2296 | upload_date = unified_strdate(upload_date) | |
2297 | ||
2298 | video_license = self._html_search_regex( | |
2299 | r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li', | |
2300 | video_webpage, 'license', default=None) | |
2301 | ||
2302 | m_music = re.search( | |
2303 | r'''(?x) | |
2304 | <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s* | |
2305 | <ul[^>]*>\s* | |
2306 | <li>(?P<title>.+?) | |
2307 | by (?P<creator>.+?) | |
2308 | (?: | |
2309 | \(.+?\)| | |
2310 | <a[^>]* | |
2311 | (?: | |
2312 | \bhref=["\']/red[^>]*>| # drop possible | |
2313 | >\s*Listen ad-free with YouTube Red # YouTube Red ad | |
2314 | ) | |
2315 | .*? | |
2316 | )?</li | |
2317 | ''', | |
2318 | video_webpage) | |
2319 | if m_music: | |
2320 | video_alt_title = remove_quotes(unescapeHTML(m_music.group('title'))) | |
2321 | video_creator = clean_html(m_music.group('creator')) | |
2322 | else: | |
2323 | video_alt_title = video_creator = None | |
2324 | ||
        def extract_meta(field):
            # Pull one named watch-page metadata field (e.g. 'Song', 'Artist',
            # 'Album') out of the "<h4 class=title>…</h4><ul><li>…" markup;
            # returns None when the field is absent.
            return self._html_search_regex(
                r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
                video_webpage, field, default=None)
2329 | ||
2330 | track = extract_meta('Song') | |
2331 | artist = extract_meta('Artist') | |
2332 | album = extract_meta('Album') | |
2333 | ||
2334 | # Youtube Music Auto-generated description | |
2335 | release_date = release_year = None | |
2336 | if video_description: | |
2337 | mobj = re.search(r'(?s)Provided to YouTube by [^\n]+\n+(?P<track>[^·]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?', video_description) | |
2338 | if mobj: | |
2339 | if not track: | |
2340 | track = mobj.group('track').strip() | |
2341 | if not artist: | |
2342 | artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·')) | |
2343 | if not album: | |
2344 | album = mobj.group('album'.strip()) | |
2345 | release_year = mobj.group('release_year') | |
2346 | release_date = mobj.group('release_date') | |
2347 | if release_date: | |
2348 | release_date = release_date.replace('-', '') | |
2349 | if not release_year: | |
2350 | release_year = int(release_date[:4]) | |
2351 | if release_year: | |
2352 | release_year = int(release_year) | |
2353 | ||
2354 | m_episode = re.search( | |
2355 | r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>', | |
2356 | video_webpage) | |
2357 | if m_episode: | |
2358 | series = unescapeHTML(m_episode.group('series')) | |
2359 | season_number = int(m_episode.group('season')) | |
2360 | episode_number = int(m_episode.group('episode')) | |
2361 | else: | |
2362 | series = season_number = episode_number = None | |
2363 | ||
2364 | m_cat_container = self._search_regex( | |
2365 | r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>', | |
2366 | video_webpage, 'categories', default=None) | |
2367 | category = None | |
2368 | if m_cat_container: | |
2369 | category = self._html_search_regex( | |
2370 | r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category', | |
2371 | default=None) | |
2372 | if not category: | |
2373 | category = try_get( | |
2374 | microformat, lambda x: x['category'], compat_str) | |
2375 | video_categories = None if category is None else [category] | |
2376 | ||
2377 | video_tags = [ | |
2378 | unescapeHTML(m.group('content')) | |
2379 | for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)] | |
2380 | if not video_tags: | |
2381 | video_tags = try_get(video_details, lambda x: x['keywords'], list) | |
2382 | ||
        def _extract_count(count_name):
            # Read the like/dislike counter from the sentiment-bar button
            # markup; str_to_int drops the thousands separators. Returns
            # None when the button is not present in the page.
            return str_to_int(self._search_regex(
                r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
                % re.escape(count_name),
                video_webpage, count_name, default=None))
2388 | ||
2389 | like_count = _extract_count('like') | |
2390 | dislike_count = _extract_count('dislike') | |
2391 | ||
2392 | if view_count is None: | |
2393 | view_count = str_to_int(self._search_regex( | |
2394 | r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage, | |
2395 | 'view count', default=None)) | |
2396 | ||
2397 | average_rating = ( | |
2398 | float_or_none(video_details.get('averageRating')) | |
2399 | or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0]))) | |
2400 | ||
2401 | # subtitles | |
2402 | video_subtitles = self.extract_subtitles(video_id, video_webpage) | |
2403 | automatic_captions = self.extract_automatic_captions(video_id, video_webpage) | |
2404 | ||
2405 | video_duration = try_get( | |
2406 | video_info, lambda x: int_or_none(x['length_seconds'][0])) | |
2407 | if not video_duration: | |
2408 | video_duration = int_or_none(video_details.get('lengthSeconds')) | |
2409 | if not video_duration: | |
2410 | video_duration = parse_duration(self._html_search_meta( | |
2411 | 'duration', video_webpage, 'video duration')) | |
2412 | ||
2413 | # annotations | |
2414 | video_annotations = None | |
2415 | if self._downloader.params.get('writeannotations', False): | |
2416 | xsrf_token = self._search_regex( | |
2417 | r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>[A-Za-z0-9+/=]+)\2', | |
2418 | video_webpage, 'xsrf token', group='xsrf_token', fatal=False) | |
2419 | invideo_url = try_get( | |
2420 | player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str) | |
2421 | if xsrf_token and invideo_url: | |
2422 | xsrf_field_name = self._search_regex( | |
2423 | r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2', | |
2424 | video_webpage, 'xsrf field name', | |
2425 | group='xsrf_field_name', default='session_token') | |
2426 | video_annotations = self._download_webpage( | |
2427 | self._proto_relative_url(invideo_url), | |
2428 | video_id, note='Downloading annotations', | |
2429 | errnote='Unable to download video annotations', fatal=False, | |
2430 | data=urlencode_postdata({xsrf_field_name: xsrf_token})) | |
2431 | ||
2432 | chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration) | |
2433 | ||
2434 | # Look for the DASH manifest | |
2435 | if self._downloader.params.get('youtube_include_dash_manifest', True): | |
2436 | dash_mpd_fatal = True | |
2437 | for mpd_url in dash_mpds: | |
2438 | dash_formats = {} | |
2439 | try: | |
                    def decrypt_sig(mobj):
                        # re.sub callback: decrypt an encrypted DASH manifest
                        # signature and re-embed it as a /signature/ path part.
                        s = mobj.group(1)
                        dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
                        return '/signature/%s' % dec_s
2444 | ||
2445 | mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url) | |
2446 | ||
2447 | for df in self._extract_mpd_formats( | |
2448 | mpd_url, video_id, fatal=dash_mpd_fatal, | |
2449 | formats_dict=self._formats): | |
2450 | if not df.get('filesize'): | |
2451 | df['filesize'] = _extract_filesize(df['url']) | |
2452 | # Do not overwrite DASH format found in some previous DASH manifest | |
2453 | if df['format_id'] not in dash_formats: | |
2454 | dash_formats[df['format_id']] = df | |
2455 | # Additional DASH manifests may end up in HTTP Error 403 therefore | |
2456 | # allow them to fail without bug report message if we already have | |
2457 | # some DASH manifest succeeded. This is temporary workaround to reduce | |
2458 | # burst of bug reports until we figure out the reason and whether it | |
2459 | # can be fixed at all. | |
2460 | dash_mpd_fatal = False | |
2461 | except (ExtractorError, KeyError) as e: | |
2462 | self.report_warning( | |
2463 | 'Skipping DASH manifest: %r' % e, video_id) | |
2464 | if dash_formats: | |
2465 | # Remove the formats we found through non-DASH, they | |
2466 | # contain less info and it can be wrong, because we use | |
2467 | # fixed values (for example the resolution). See | |
2468 | # https://github.com/ytdl-org/youtube-dl/issues/5774 for an | |
2469 | # example. | |
2470 | formats = [f for f in formats if f['format_id'] not in dash_formats.keys()] | |
2471 | formats.extend(dash_formats.values()) | |
2472 | ||
2473 | # Check for malformed aspect ratio | |
2474 | stretched_m = re.search( | |
2475 | r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">', | |
2476 | video_webpage) | |
2477 | if stretched_m: | |
2478 | w = float(stretched_m.group('w')) | |
2479 | h = float(stretched_m.group('h')) | |
2480 | # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0). | |
2481 | # We will only process correct ratios. | |
2482 | if w > 0 and h > 0: | |
2483 | ratio = w / h | |
2484 | for f in formats: | |
2485 | if f.get('vcodec') != 'none': | |
2486 | f['stretched_ratio'] = ratio | |
2487 | ||
2488 | if not formats: | |
2489 | if 'reason' in video_info: | |
2490 | if 'The uploader has not made this video available in your country.' in video_info['reason']: | |
2491 | regions_allowed = self._html_search_meta( | |
2492 | 'regionsAllowed', video_webpage, default=None) | |
2493 | countries = regions_allowed.split(',') if regions_allowed else None | |
2494 | self.raise_geo_restricted( | |
2495 | msg=video_info['reason'][0], countries=countries) | |
2496 | reason = video_info['reason'][0] | |
2497 | if 'Invalid parameters' in reason: | |
2498 | unavailable_message = extract_unavailable_message() | |
2499 | if unavailable_message: | |
2500 | reason = unavailable_message | |
2501 | raise ExtractorError( | |
2502 | 'YouTube said: %s' % reason, | |
2503 | expected=True, video_id=video_id) | |
2504 | if video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos']): | |
2505 | raise ExtractorError('This video is DRM protected.', expected=True) | |
2506 | ||
2507 | self._sort_formats(formats) | |
2508 | ||
2509 | self.mark_watched(video_id, video_info, player_response) | |
2510 | ||
2511 | return { | |
2512 | 'id': video_id, | |
2513 | 'uploader': video_uploader, | |
2514 | 'uploader_id': video_uploader_id, | |
2515 | 'uploader_url': video_uploader_url, | |
2516 | 'channel_id': channel_id, | |
2517 | 'channel_url': channel_url, | |
2518 | 'upload_date': upload_date, | |
2519 | 'license': video_license, | |
2520 | 'creator': video_creator or artist, | |
2521 | 'title': video_title, | |
2522 | 'alt_title': video_alt_title or track, | |
2523 | 'thumbnails': thumbnails, | |
2524 | 'description': video_description, | |
2525 | 'categories': video_categories, | |
2526 | 'tags': video_tags, | |
2527 | 'subtitles': video_subtitles, | |
2528 | 'automatic_captions': automatic_captions, | |
2529 | 'duration': video_duration, | |
2530 | 'age_limit': 18 if age_gate else 0, | |
2531 | 'annotations': video_annotations, | |
2532 | 'chapters': chapters, | |
2533 | 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id, | |
2534 | 'view_count': view_count, | |
2535 | 'like_count': like_count, | |
2536 | 'dislike_count': dislike_count, | |
2537 | 'average_rating': average_rating, | |
2538 | 'formats': formats, | |
2539 | 'is_live': is_live, | |
2540 | 'start_time': start_time, | |
2541 | 'end_time': end_time, | |
2542 | 'series': series, | |
2543 | 'season_number': season_number, | |
2544 | 'episode_number': episode_number, | |
2545 | 'track': track, | |
2546 | 'artist': artist, | |
2547 | 'album': album, | |
2548 | 'release_date': release_date, | |
2549 | 'release_year': release_year, | |
2550 | } | |
2551 | ||
2552 | ||
class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
    """Extractor for YouTube playlist pages (IE name: youtube:playlist)."""
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        (?:
                            (?:
                                youtube(?:kids)?\.com|
                                invidio\.us
                            )
                            /
                            (?:
                               (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11}))
                               \? (?:.*?[&;])*? (?:p|a|list)=
                            |  p/
                            )|
                            youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist=
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        (%(playlist_id)s)
                     )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    # %s slot receives the video-id pattern; the optional tail captures the
    # playlist index and link text (title) when present in the markup.
    _VIDEO_RE_TPL = r'href="\s*/watch\?v=%s(?:&(?:[^"]*?index=(?P<index>\d+))?(?:[^>]+>(?P<title>[^<]+))?)?'
    _VIDEO_RE = _VIDEO_RE_TPL % r'(?P<id>[0-9A-Za-z_-]{11})'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
        'info_dict': {
            'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
            'uploader': 'Sergey M.',
            'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
            'title': 'youtube-dl public playlist',
        },
        'playlist_count': 1,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
        'info_dict': {
            'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
            'uploader': 'Sergey M.',
            'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
            'title': 'youtube-dl empty playlist',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
            'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
            'uploader': 'Christiaan008',
            'uploader_id': 'ChRiStIaAn008',
        },
        'playlist_count': 96,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
            'uploader': 'Wickydoo',
            'uploader_id': 'Wickydoo',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
            'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
            'uploader': 'Cauchemar',
            'uploader_id': 'Cauchemar89',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
        'skip': 'This playlist is private',
    }, {
        'note': 'embedded',
        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
            'uploader': 'milan',
            'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
        }
    }, {
        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        'playlist_mincount': 485,
        'info_dict': {
            'title': '2018 Chinese New Singles (11/6 updated)',
            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
            'uploader': 'LBK',
            'uploader_id': 'sdragonfang',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
            'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
        },
        'skip': 'This playlist does not exist',
    }, {
        'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
        'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
        'info_dict': {
            'title': 'Uploads from Interstellar Movie',
            'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
            'uploader': 'Interstellar Movie',
            'uploader_id': 'InterstellarMovie1',
        },
        'playlist_mincount': 21,
    }, {
        # Playlist URL that does not actually serve a playlist
        'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
        'info_dict': {
            'id': 'FqZTN594JQw',
            'ext': 'webm',
            'title': "Smiley's People 01 detective, Adventure Series, Action",
            'uploader': 'STREEM',
            'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
            'upload_date': '20150526',
            'license': 'Standard YouTube License',
            'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
            'categories': ['People & Blogs'],
            'tags': list,
            'view_count': int,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'This video is not available.',
        'add_ie': [YoutubeIE.ie_key()],
    }, {
        'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
        'info_dict': {
            'id': 'yeWKywCrFtk',
            'ext': 'mp4',
            'title': 'Small Scale Baler and Braiding Rugs',
            'uploader': 'Backus-Page House Museum',
            'uploader_id': 'backuspagemuseum',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
            'upload_date': '20161008',
            'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
            'categories': ['Nonprofits & Activism'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'noplaylist': True,
            'skip_download': True,
        },
    }, {
        # https://github.com/ytdl-org/youtube-dl/issues/21844
        'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
        'info_dict': {
            'title': 'Data Analysis with Dr Mike Pound',
            'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
            'uploader_id': 'Computerphile',
            'uploader': 'Computerphile',
        },
        'playlist_mincount': 11,
    }, {
        'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
        'only_matching': True,
    }, {
        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
        'only_matching': True,
    }, {
        # music album playlist
        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
        'only_matching': True,
    }, {
        'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU',
        'only_matching': True,
    }, {
        'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
        'only_matching': True,
    }]

    def _real_initialize(self):
        # Run the account login routine (inherited) before any extraction.
        self._login()

    def extract_videos_from_page(self, page):
        """Return an iterator of (video_id, title) pairs found on *page*.

        Tries the modern data-video-id attributes first, then falls back to
        progressively more relaxed href/data-video-ids patterns; titles may
        be None when the markup does not carry them.
        """
        ids_in_page = []
        titles_in_page = []

        for item in re.findall(
                r'(<[^>]*\bdata-video-id\s*=\s*["\'][0-9A-Za-z_-]{11}[^>]+>)', page):
            attrs = extract_attributes(item)
            video_id = attrs['data-video-id']
            video_title = unescapeHTML(attrs.get('data-title'))
            if video_title:
                video_title = video_title.strip()
            ids_in_page.append(video_id)
            titles_in_page.append(video_title)

        # Fallback with old _VIDEO_RE
        self.extract_videos_from_page_impl(
            self._VIDEO_RE, page, ids_in_page, titles_in_page)

        # Relaxed fallbacks
        self.extract_videos_from_page_impl(
            r'href="\s*/watch\?v\s*=\s*(?P<id>[0-9A-Za-z_-]{11})', page,
            ids_in_page, titles_in_page)
        self.extract_videos_from_page_impl(
            r'data-video-ids\s*=\s*["\'](?P<id>[0-9A-Za-z_-]{11})', page,
            ids_in_page, titles_in_page)

        return zip(ids_in_page, titles_in_page)

    def _extract_mix(self, playlist_id):
        """Extract an auto-generated mix playlist by crawling watch pages."""
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        ids = []
        last_id = playlist_id[-11:]
        for n in itertools.count(1):
            url = 'https://www.youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id)
            webpage = self._download_webpage(
                url, playlist_id, 'Downloading page {0} of Youtube mix'.format(n))
            new_ids = orderedSet(re.findall(
                r'''(?xs)data-video-username=".*?".*?
                           href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id),
                webpage))
            # Fetch new pages until all the videos are repeated, it seems that
            # there are always 51 unique videos.
            new_ids = [_id for _id in new_ids if _id not in ids]
            if not new_ids:
                break
            ids.extend(new_ids)
            last_id = ids[-1]

        url_results = self._ids_to_results(ids)

        # The title lives in differently-classed elements depending on the
        # page layout; probe the known class names in order of specificity.
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        title_span = (
            search_title('playlist-title')
            or search_title('title long-title')
            or search_title('title'))
        title = clean_html(title_span)

        return self.playlist_result(url_results, playlist_id, title)

    def _extract_playlist(self, playlist_id):
        """Download a regular playlist page.

        Returns a (has_videos, playlist_result) tuple; has_videos is False
        when the URL turned out not to serve an actual playlist. Raises
        ExtractorError for private/nonexistent playlists.
        """
        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)

        # the yt-alert-message now has tabindex attribute (see https://github.com/ytdl-org/youtube-dl/issues/11604)
        for match in re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page):
            match = match.strip()
            # Check if the playlist exists or is private
            mobj = re.match(r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', match)
            if mobj:
                reason = mobj.group('reason')
                message = 'This playlist %s' % reason
                if 'private' in reason:
                    message += ', use --username or --netrc to access it'
                message += '.'
                raise ExtractorError(message, expected=True)
            elif re.match(r'[^<]*Invalid parameters[^<]*', match):
                raise ExtractorError(
                    'Invalid parameters. Maybe URL is incorrect.',
                    expected=True)
            elif re.match(r'[^<]*Choose your language[^<]*', match):
                continue
            else:
                self.report_warning('Youtube gives an alert message: ' + match)

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',
            page, 'title', default=None)

        _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref='
        uploader = self._html_search_regex(
            r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE,
            page, 'uploader', default=None)
        mobj = re.search(
            r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % _UPLOADER_BASE,
            page)
        if mobj:
            uploader_id = mobj.group('uploader_id')
            uploader_url = compat_urlparse.urljoin(url, mobj.group('path'))
        else:
            uploader_id = uploader_url = None

        has_videos = True

        if not playlist_title:
            try:
                # Some playlist URLs don't actually serve a playlist (e.g.
                # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4)
                next(self._entries(page, playlist_id))
            except StopIteration:
                has_videos = False

        playlist = self.playlist_result(
            self._entries(page, playlist_id), playlist_id, playlist_title)
        playlist.update({
            'uploader': uploader,
            'uploader_id': uploader_id,
            'uploader_url': uploader_url,
        })

        return has_videos, playlist

    def _check_download_just_video(self, url, playlist_id):
        """If *url* pins a single video, decide whether to extract just it.

        Returns (video_id, result): result is a url_result when --no-playlist
        asks for the bare video, otherwise None; video_id is None when the
        URL carries no video reference at all.
        """
        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        video_id = query_dict.get('v', [None])[0] or self._search_regex(
            r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url,
            'video id', default=None)
        if video_id:
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return video_id, self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
                return video_id, None
        return None, None

    def _real_extract(self, url):
        """Dispatch between single-video, mix and regular playlist handling."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        video_id, video = self._check_download_just_video(url, playlist_id)
        if video:
            return video

        if playlist_id.startswith(('RD', 'UL', 'PU')):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)

        has_videos, playlist = self._extract_playlist(playlist_id)
        if has_videos or not video_id:
            return playlist

        # Some playlist URLs don't actually serve a playlist (see
        # https://github.com/ytdl-org/youtube-dl/issues/10537).
        # Fallback to plain video extraction if there is a video id
        # along with playlist id.
        return self.url_result(video_id, 'Youtube', video_id=video_id)
2914 | ||
2915 | ||
class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie|kids)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',
            'title': 'Uploads from lex will',
            'uploader': 'lex will',
            'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
        }
    }, {
        'note': 'Age restricted channel',
        # from https://www.youtube.com/user/DeusExOfficial
        'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w',
        'playlist_mincount': 64,
        'info_dict': {
            'id': 'UUs0ifCMCm1icqRbqhUINa0w',
            'title': 'Uploads from Deus Ex',
            'uploader': 'Deus Ex',
            'uploader_id': 'DeusExOfficial',
        },
    }, {
        'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA',
        'only_matching': True,
    }, {
        'url': 'https://www.youtubekids.com/channel/UCyu8StPfZWapR6rfW_JgqcA',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # Defer to the more specific playlists/live extractors when they
        # also match this URL.
        return (False if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url)
                else super(YoutubeChannelIE, cls).suitable(url))

    def _build_template_url(self, url, channel_id):
        # Overridden in YoutubeUserIE, which needs to inspect the URL itself.
        return self._TEMPLATE_URL % channel_id

    def _real_extract(self, url):
        channel_id = self._match_id(url)

        url = self._build_template_url(url, channel_id)

        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
        # Workaround by extracting as a playlist if managed to obtain channel playlist URL
        # otherwise fallback on channel by page extraction
        channel_page = self._download_webpage(
            url + '?view=57', channel_id,
            'Downloading channel page', fatal=False)
        if channel_page is False:
            channel_playlist_id = False
        else:
            channel_playlist_id = self._html_search_meta(
                'channelId', channel_page, 'channel id', default=None)
            if not channel_playlist_id:
                # Fall back to app-link meta tags, which embed the channel id
                # in a vnd.youtube:// URL.
                channel_url = self._html_search_meta(
                    ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'),
                    channel_page, 'channel url', default=None)
                if channel_url:
                    channel_playlist_id = self._search_regex(
                        r'vnd\.youtube://user/([0-9A-Za-z_-]+)',
                        channel_url, 'channel id', default=None)
        if channel_playlist_id and channel_playlist_id.startswith('UC'):
            # A 'UC...' channel id maps to the 'UU...' uploads playlist
            playlist_id = 'UU' + channel_playlist_id[2:]
            return self.url_result(
                compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')

        channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            entries = [
                self.url_result(
                    video_id, 'Youtube', video_id=video_id,
                    video_title=video_title)
                for video_id, video_title in self.extract_videos_from_page(channel_page)]
            return self.playlist_result(entries, channel_id)

        try:
            # Probe the generator: an immediately-empty channel may carry a
            # YouTube alert message explaining why (e.g. terminated account).
            next(self._entries(channel_page, channel_id))
        except StopIteration:
            alert_message = self._html_search_regex(
                r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>',
                channel_page, 'alert', default=None, group='alert')
            if alert_message:
                raise ExtractorError('Youtube said: %s' % alert_message, expected=True)

        return self.playlist_result(self._entries(channel_page, channel_id), channel_id)
3015 | ||
3016 | ||
class YoutubeUserIE(YoutubeChannelIE):
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ',
            'title': 'Uploads from The Linux Foundation',
            'uploader': 'The Linux Foundation',
            'uploader_id': 'TheLinuxFoundation',
        }
    }, {
        # Only available via https://www.youtube.com/c/12minuteathlete/videos
        # but not https://www.youtube.com/user/12minuteathlete/videos
        'url': 'https://www.youtube.com/c/12minuteathlete/videos',
        'playlist_mincount': 249,
        'info_dict': {
            'id': 'UUVjM-zV6_opMDx7WYxnjZiQ',
            'title': 'Uploads from 12 Minute Athlete',
            'uploader': '12 Minute Athlete',
            'uploader_id': 'the12minuteathlete',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/gametrailers',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/gametrailers',
        'only_matching': True,
    }, {
        # This channel is not available, geo restricted to JP
        'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # Don't return True if the URL can be extracted by another youtube
        # extractor: this regex is too permissive and would match their URLs
        # as well.  (A generator expression is already an iterator, so the
        # former iter() wrapper was redundant.)
        other_yt_ies = (
            klass for (name, klass) in globals().items()
            if name.startswith('Youtube') and name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_yt_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _build_template_url(self, url, channel_id):
        # The template needs the path kind ('user' or 'c') from the original
        # URL; plain youtube.com/<name> URLs default to 'user'.
        mobj = re.match(self._VALID_URL, url)
        return self._TEMPLATE_URL % (mobj.group('user') or 'user', mobj.group('id'))
3071 | ||
3072 | ||
class YoutubeLiveIE(YoutubeBaseInfoExtractor):
    IE_DESC = 'YouTube.com live streams'
    _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live'
    IE_NAME = 'youtube:live'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheYoungTurks/live',
        'info_dict': {
            'id': 'a48o2S1cPoo',
            'ext': 'mp4',
            'title': 'The Young Turks - Live Main Show',
            'uploader': 'The Young Turks',
            'uploader_id': 'TheYoungTurks',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
            'upload_date': '20150715',
            'license': 'Standard YouTube License',
            'description': 'md5:438179573adcdff3c97ebb1ee632b891',
            'categories': ['News & Politics'],
            'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/TheYoungTurks/live',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Resolve a /live URL to the live video, or fall back to the channel."""
        mobj = re.match(self._VALID_URL, url)
        channel_id = mobj.group('id')
        base_url = mobj.group('base_url')
        webpage = self._download_webpage(url, channel_id, fatal=False)
        if webpage:
            page_type = self._og_search_property(
                'type', webpage, 'page type', default='')
            video_id = self._html_search_meta(
                'videoId', webpage, 'video id', default=None)
            # Only trust the meta videoId on a video-typed page and when it
            # has the shape of a YouTube video id.
            is_live_video = (
                page_type.startswith('video')
                and video_id
                and re.match(r'^[0-9A-Za-z_-]{11}$', video_id))
            if is_live_video:
                return self.url_result(video_id, YoutubeIE.ie_key())
        # No live video found: hand the bare channel/user URL back.
        return self.url_result(base_url)
3123 | ||
3124 | ||
class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
    # Pure-data extractor: all extraction logic lives in the base class.
    IE_DESC = 'YouTube.com user/channel playlists'
    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel|c)/(?P<id>[^/]+)/playlists'
    IE_NAME = 'youtube:playlists'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
        'playlist_mincount': 4,
        'info_dict': {
            'id': 'ThirstForScience',
            'title': 'ThirstForScience',
        },
    }, {
        # with "Load more" button
        'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
        'playlist_mincount': 70,
        'info_dict': {
            'id': 'igorkle1',
            'title': 'Игорь Клейнер',
        },
    }, {
        'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists',
        'playlist_mincount': 17,
        'info_dict': {
            'id': 'UCiU1dHvZObB2iP6xkJ__Icw',
            'title': 'Chem Player',
        },
        'skip': 'Blocked',
    }, {
        'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
        'only_matching': True,
    }]
3157 | ||
3158 | ||
class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor):
    # Matches video links on search result pages; the title group is optional.
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?'
3161 | ||
3162 | ||
class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor):
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    _EXTRA_QUERY_ARGS = {}
    _TESTS = []

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Pages through the search results and returns a playlist_result with
        at most n video url_results.
        """
        videos = []
        limit = n

        url_query = {
            'search_query': query.encode('utf-8'),
        }
        url_query.update(self._EXTRA_QUERY_ARGS)
        result_url = 'https://www.youtube.com/results?' + compat_urllib_parse_urlencode(url_query)

        for pagenum in itertools.count(1):
            data = self._download_json(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % pagenum,
                errnote='Unable to download API page',
                query={'spf': 'navigate'})
            html_content = data[1]['body']['content']

            if 'class="search-message' in html_content:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            new_videos = list(self._process_page(html_content))
            videos += new_videos
            # Stop when a page yields nothing new or enough results were
            # collected; '>=' (rather than the previous '>') avoids
            # downloading one extra page when exactly `limit` results have
            # already been gathered.
            if not new_videos or len(videos) >= limit:
                break
            next_link = self._html_search_regex(
                r'href="(/results\?[^"]*\bsp=[^"]+)"[^>]*>\s*<span[^>]+class="[^"]*\byt-uix-button-content\b[^"]*"[^>]*>Next',
                html_content, 'next link', default=None)
            if next_link is None:
                break
            result_url = compat_urlparse.urljoin('https://www.youtube.com/', next_link)

        # Trim any overshoot from the last downloaded page.
        return self.playlist_result(videos[:n], query)
3211 | ||
3212 | ||
class YoutubeSearchDateIE(YoutubeSearchIE):
    # Same extraction as YoutubeSearchIE, but sorts results by upload date
    # via the extra 'search_sort' query argument.
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = 'YouTube.com searches, newest videos first'
    _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
3218 | ||
3219 | ||
class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor):
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Extract all videos from a search results URL as a playlist."""
        query = compat_urllib_parse_unquote_plus(
            re.match(self._VALID_URL, url).group('query'))
        webpage = self._download_webpage(url, query)
        entries = self._process_page(webpage)
        return self.playlist_result(entries, playlist_title=query)
3240 | ||
3241 | ||
class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
    IE_NAME = 'youtube:show'
    _TESTS = [{
        'url': 'https://www.youtube.com/show/airdisasters',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Delegate to the playlists extractor on the show's playlists page."""
        show_id = self._match_id(url)
        playlists_url = 'https://www.youtube.com/show/%s/playlists' % show_id
        return super(YoutubeShowIE, self)._real_extract(playlists_url)
3259 | ||
3260 | ||
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _entries(self, page):
        """Yield video url_results from the feed page, following 'load more'.

        The extraction process is the same as for playlists, but the regex
        for the video ids doesn't contain an index.
        """
        # Set instead of a list: ids are only ever used for membership
        # tests, and a set makes each test O(1) instead of O(n).
        seen_ids = set()
        more_widget_html = content_html = page
        for page_num in itertools.count(1):
            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)

            # 'recommended' feed has infinite 'load more' and each new portion spins
            # the same videos in (sometimes) slightly different order, so we'll check
            # for unicity and break when portion has no new videos
            new_ids = [video_id for video_id in orderedSet(matches)
                       if video_id not in seen_ids]
            if not new_ids:
                break

            seen_ids.update(new_ids)

            for entry in self._ids_to_results(new_ids):
                yield entry

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://www.youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape,
                headers=self._YOUTUBE_CLIENT_HEADERS)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

    def _real_extract(self, url):
        """Download the feed page and return its entries as a playlist."""
        page = self._download_webpage(
            'https://www.youtube.com/feed/%s' % self._FEED_NAME,
            self._PLAYLIST_TITLE)
        return self.playlist_result(
            self._entries(page), playlist_title=self._PLAYLIST_TITLE)
3313 | ||
3314 | ||
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater'

    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=WL',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Extract the 'WL' playlist, honouring --no-playlist for watch URLs."""
        video = self._check_download_just_video(url, 'WL')[1]
        if video:
            return video
        return self._extract_playlist('WL')[1]
3334 | ||
3335 | ||
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Find the favourites playlist id on the user's page and delegate."""
        page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        favourites_id = self._search_regex(
            r'list=(.+?)["&]', page, 'favourites playlist id')
        return self.url_result(favourites_id, 'YoutubePlaylist')
3346 | ||
3347 | ||
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # Data-only subclass: _FEED_NAME selects the /feed/<name> URL and the
    # IE_NAME suffix in YoutubeFeedsInfoExtractor.
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
3353 | ||
3354 | ||
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    # Data-only subclass: _FEED_NAME selects the /feed/<name> URL and the
    # IE_NAME suffix in YoutubeFeedsInfoExtractor.
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
3360 | ||
3361 | ||
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    # Data-only subclass: _FEED_NAME selects the /feed/<name> URL and the
    # IE_NAME suffix in YoutubeFeedsInfoExtractor.
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
3367 | ||
3368 | ||
class YoutubeTruncatedURLIE(InfoExtractor):
    # Catch-all for watch/attribution_link URLs whose query string lost its
    # video id (typically because '&' was interpreted by the shell); always
    # raises with a hint instead of extracting.
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Intentionally unconditional: a URL matching _VALID_URL carries no
        # usable video id, so explain the likely quoting mistake to the user.
        raise ExtractorError(
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .',
            expected=True)
3416 | ||
3417 | ||
class YoutubeTruncatedIDIE(InfoExtractor):
    # Catches watch URLs whose video id is shorter than the 11 characters a
    # real YouTube id has; always raises a descriptive error.
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        raise ExtractorError(
            'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url),
            expected=True)