jfr.im git - yt-dlp.git/blame_incremental - youtube

Commit	Line	Data
	1	# coding: utf-8
	2
	3	from __future__ import unicode_literals
	4
	5
	6	import itertools
	7	import json
	8	import os.path
	9	import random
	10	import re
	11	import time
	12	import traceback
	13
	14	from .common import InfoExtractor, SearchInfoExtractor
	15	from ..jsinterp import JSInterpreter
	16	from ..swfinterp import SWFInterpreter
	17	from ..compat import (
	18	compat_chr,
	19	compat_HTTPError,
	20	compat_kwargs,
	21	compat_parse_qs,
	22	compat_urllib_parse_unquote,
	23	compat_urllib_parse_unquote_plus,
	24	compat_urllib_parse_urlencode,
	25	compat_urllib_parse_urlparse,
	26	compat_urlparse,
	27	compat_str,
	28	)
	29	from ..utils import (
	30	bool_or_none,
	31	clean_html,
	32	dict_get,
	33	error_to_compat_str,
	34	extract_attributes,
	35	ExtractorError,
	36	float_or_none,
	37	get_element_by_attribute,
	38	get_element_by_id,
	39	int_or_none,
	40	mimetype2ext,
	41	orderedSet,
	42	parse_codecs,
	43	parse_duration,
	44	remove_quotes,
	45	remove_start,
	46	smuggle_url,
	47	str_or_none,
	48	str_to_int,
	49	try_get,
	50	unescapeHTML,
	51	unified_strdate,
	52	unsmuggle_url,
	53	uppercase_escape,
	54	url_or_none,
	55	urlencode_postdata,
	56	)
	57
	58
	59	class YoutubeBaseInfoExtractor(InfoExtractor):
	60	"""Provide base functions for Youtube extractors"""
	61	_LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
	62	_TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'
	63
	64	_LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
	65	_CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
	66	_TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'
	67
	68	_NETRC_MACHINE = 'youtube'
	69	# If True it will raise an error if no login info is provided
	70	_LOGIN_REQUIRED = False
	71
	72	_PLAYLIST_ID_RE = r'(?:PL\|LL\|EC\|UU\|FL\|RD\|UL\|TL\|PU\|OLAK5uy_)[0-9A-Za-z-_]{10,}'
	73
	74	def _set_language(self):
	75	self._set_cookie(
	76	'.youtube.com', 'PREF', 'f1=50000000&hl=en',
	77	# YouTube sets the expire time to about two months
	78	expire_time=time.time() + 2 * 30 * 24 * 3600)
	79
	80	def _ids_to_results(self, ids):
	81	return [
	82	self.url_result(vid_id, 'Youtube', video_id=vid_id)
	83	for vid_id in ids]
	84
	85	def _login(self):
	86	"""
	87	Attempt to log in to YouTube.
	88	True is returned if successful or skipped.
	89	False is returned if login failed.
	90
	91	If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
	92	"""
	93	username, password = self._get_login_info()
	94	# No authentication to be performed
	95	if username is None:
	96	if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
	97	raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
	98	return True
	99
	100	login_page = self._download_webpage(
	101	self._LOGIN_URL, None,
	102	note='Downloading login page',
	103	errnote='unable to fetch login page', fatal=False)
	104	if login_page is False:
	105	return
	106
	107	login_form = self._hidden_inputs(login_page)
	108
	109	def req(url, f_req, note, errnote):
	110	data = login_form.copy()
	111	data.update({
	112	'pstMsg': 1,
	113	'checkConnection': 'youtube',
	114	'checkedDomains': 'youtube',
	115	'hl': 'en',
	116	'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
	117	'f.req': json.dumps(f_req),
	118	'flowName': 'GlifWebSignIn',
	119	'flowEntry': 'ServiceLogin',
	120	# TODO: reverse actual botguard identifier generation algo
	121	'bgRequest': '["identifier",""]',
	122	})
	123	return self._download_json(
	124	url, None, note=note, errnote=errnote,
	125	transform_source=lambda s: re.sub(r'^[^[]*', '', s),
	126	fatal=False,
	127	data=urlencode_postdata(data), headers={
	128	'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
	129	'Google-Accounts-XSRF': 1,
	130	})
	131
	132	def warn(message):
	133	self._downloader.report_warning(message)
	134
	135	lookup_req = [
	136	username,
	137	None, [], None, 'US', None, None, 2, False, True,
	138	[
	139	None, None,
	140	[2, 1, None, 1,
	141	'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
	142	None, [], 4],
	143	1, [None, None, []], None, None, None, True
	144	],
	145	username,
	146	]
	147
	148	lookup_results = req(
	149	self._LOOKUP_URL, lookup_req,
	150	'Looking up account info', 'Unable to look up account info')
	151
	152	if lookup_results is False:
	153	return False
	154
	155	user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
	156	if not user_hash:
	157	warn('Unable to extract user hash')
	158	return False
	159
	160	challenge_req = [
	161	user_hash,
	162	None, 1, None, [1, None, None, None, [password, None, True]],
	163	[
	164	None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
	165	1, [None, None, []], None, None, None, True
	166	]]
	167
	168	challenge_results = req(
	169	self._CHALLENGE_URL, challenge_req,
	170	'Logging in', 'Unable to log in')
	171
	172	if challenge_results is False:
	173	return
	174
	175	login_res = try_get(challenge_results, lambda x: x[0][5], list)
	176	if login_res:
	177	login_msg = try_get(login_res, lambda x: x[5], compat_str)
	178	warn(
	179	'Unable to login: %s' % 'Invalid password'
	180	if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg)
	181	return False
	182
	183	res = try_get(challenge_results, lambda x: x[0][-1], list)
	184	if not res:
	185	warn('Unable to extract result entry')
	186	return False
	187
	188	login_challenge = try_get(res, lambda x: x[0][0], list)
	189	if login_challenge:
	190	challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
	191	if challenge_str == 'TWO_STEP_VERIFICATION':
	192	# SEND_SUCCESS - TFA code has been successfully sent to phone
	193	# QUOTA_EXCEEDED - reached the limit of TFA codes
	194	status = try_get(login_challenge, lambda x: x[5], compat_str)
	195	if status == 'QUOTA_EXCEEDED':
	196	warn('Exceeded the limit of TFA codes, try later')
	197	return False
	198
	199	tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
	200	if not tl:
	201	warn('Unable to extract TL')
	202	return False
	203
	204	tfa_code = self._get_tfa_info('2-step verification code')
	205
	206	if not tfa_code:
	207	warn(
	208	'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
	209	'(Note that only TOTP (Google Authenticator App) codes work at this time.)')
	210	return False
	211
	212	tfa_code = remove_start(tfa_code, 'G-')
	213
	214	tfa_req = [
	215	user_hash, None, 2, None,
	216	[
	217	9, None, None, None, None, None, None, None,
	218	[None, tfa_code, True, 2]
	219	]]
	220
	221	tfa_results = req(
	222	self._TFA_URL.format(tl), tfa_req,
	223	'Submitting TFA code', 'Unable to submit TFA code')
	224
	225	if tfa_results is False:
	226	return False
	227
	228	tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
	229	if tfa_res:
	230	tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
	231	warn(
	232	'Unable to finish TFA: %s' % 'Invalid TFA code'
	233	if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg)
	234	return False
	235
	236	check_cookie_url = try_get(
	237	tfa_results, lambda x: x[0][-1][2], compat_str)
	238	else:
	239	CHALLENGES = {
	240	'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
	241	'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
	242	'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
	243	}
	244	challenge = CHALLENGES.get(
	245	challenge_str,
	246	'%s returned error %s.' % (self.IE_NAME, challenge_str))
	247	warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
	248	return False
	249	else:
	250	check_cookie_url = try_get(res, lambda x: x[2], compat_str)
	251
	252	if not check_cookie_url:
	253	warn('Unable to extract CheckCookie URL')
	254	return False
	255
	256	check_cookie_results = self._download_webpage(
	257	check_cookie_url, None, 'Checking cookie', fatal=False)
	258
	259	if check_cookie_results is False:
	260	return False
	261
	262	if 'https://myaccount.google.com/' not in check_cookie_results:
	263	warn('Unable to log in')
	264	return False
	265
	266	return True
	267
	268	def _download_webpage_handle(self, args, *kwargs):
	269	query = kwargs.get('query', {}).copy()
	270	query['disable_polymer'] = 'true'
	271	kwargs['query'] = query
	272	return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
	273	args, *compat_kwargs(kwargs))
	274
	275	def _real_initialize(self):
	276	if self._downloader is None:
	277	return
	278	self._set_language()
	279	if not self._login():
	280	return
	281
	282
	283	class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
	284	# Extract entries from page with "Load more" button
	285	def _entries(self, page, playlist_id):
	286	more_widget_html = content_html = page
	287	for page_num in itertools.count(1):
	288	for entry in self._process_page(content_html):
	289	yield entry
	290
	291	mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
	292	if not mobj:
	293	break
	294
	295	count = 0
	296	retries = 3
	297	while count <= retries:
	298	try:
	299	# Downloading page may result in intermittent 5xx HTTP error
	300	# that is usually worked around with a retry
	301	more = self._download_json(
	302	'https://youtube.com/%s' % mobj.group('more'), playlist_id,
	303	'Downloading page #%s%s'
	304	% (page_num, ' (retry #%d)' % count if count else ''),
	305	transform_source=uppercase_escape)
	306	break
	307	except ExtractorError as e:
	308	if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503):
	309	count += 1
	310	if count <= retries:
	311	continue
	312	raise
	313
	314	content_html = more['content_html']
	315	if not content_html.strip():
	316	# Some webpages show a "Load more" button but they don't
	317	# have more videos
	318	break
	319	more_widget_html = more['load_more_widget_html']
	320
	321
	322	class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
	323	def _process_page(self, content):
	324	for video_id, video_title in self.extract_videos_from_page(content):
	325	yield self.url_result(video_id, 'Youtube', video_id, video_title)
	326
	327	def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page):
	328	for mobj in re.finditer(video_re, page):
	329	# The link with index 0 is not the first video of the playlist (not sure if still actual)
	330	if 'index' in mobj.groupdict() and mobj.group('id') == '0':
	331	continue
	332	video_id = mobj.group('id')
	333	video_title = unescapeHTML(
	334	mobj.group('title')) if 'title' in mobj.groupdict() else None
	335	if video_title:
	336	video_title = video_title.strip()
	337	if video_title == '► Play all':
	338	video_title = None
	339	try:
	340	idx = ids_in_page.index(video_id)
	341	if video_title and not titles_in_page[idx]:
	342	titles_in_page[idx] = video_title
	343	except ValueError:
	344	ids_in_page.append(video_id)
	345	titles_in_page.append(video_title)
	346
	347	def extract_videos_from_page(self, page):
	348	ids_in_page = []
	349	titles_in_page = []
	350	self.extract_videos_from_page_impl(
	351	self._VIDEO_RE, page, ids_in_page, titles_in_page)
	352	return zip(ids_in_page, titles_in_page)
	353
	354
	355	class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
	356	def _process_page(self, content):
	357	for playlist_id in orderedSet(re.findall(
	358	r'<h3[^>]+class="[^"]yt-lockup-title[^"]"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"',
	359	content)):
	360	yield self.url_result(
	361	'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist')
	362
	363	def _real_extract(self, url):
	364	playlist_id = self._match_id(url)
	365	webpage = self._download_webpage(url, playlist_id)
	366	title = self._og_search_title(webpage, fatal=False)
	367	return self.playlist_result(self._entries(webpage, playlist_id), playlist_id, title)
	368
	369
	370	class YoutubeIE(YoutubeBaseInfoExtractor):
	371	IE_DESC = 'YouTube.com'
	372	_VALID_URL = r"""(?x)^
	373	(
	374	(?:https?://\|//) # http(s):// or protocol-independent URL
	375	(?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie\|kids)?\.com/\|
	376	(?:www\.)?deturl\.com/www\.youtube\.com/\|
	377	(?:www\.)?pwnyoutube\.com/\|
	378	(?:www\.)?hooktube\.com/\|
	379	(?:www\.)?yourepeat\.com/\|
	380	tube\.majestyc\.net/\|
	381	# Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
	382	(?:(?:www\|dev)\.)?invidio\.us/\|
	383	(?:(?:www\|no)\.)?invidiou\.sh/\|
	384	(?:(?:www\|fi\|de)\.)?invidious\.snopyta\.org/\|
	385	(?:www\.)?invidious\.kabi\.tk/\|
	386	(?:www\.)?invidious\.13ad\.de/\|
	387	(?:www\.)?invidious\.mastodon\.host/\|
	388	(?:www\.)?invidious\.nixnet\.xyz/\|
	389	(?:www\.)?invidious\.drycat\.fr/\|
	390	(?:www\.)?tube\.poal\.co/\|
	391	(?:www\.)?vid\.wxzm\.sx/\|
	392	(?:www\.)?yt\.elukerio\.org/\|
	393	(?:www\.)?yt\.lelux\.fi/\|
	394	(?:www\.)?kgg2m7yk5aybusll\.onion/\|
	395	(?:www\.)?qklhadlycap4cnod\.onion/\|
	396	(?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/\|
	397	(?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/\|
	398	(?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/\|
	399	(?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/\|
	400	(?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/\|
	401	youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
	402	(?:.*?\#/)? # handle anchor (#/) redirect urls
	403	(?: # the various things that can precede the ID:
	404	(?:(?:v\|embed\|e)/(?!videoseries)) # v/ or embed/ or e/
	405	\|(?: # or the v= param in all its forms
	406	(?:(?:watch\|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup\|.php) or nothing (like /?v=xxxx)
	407	(?:\?\|\#!?) # the params delimiter ? or # or #!
	408	(?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&v=V36LpHqtcDY)
	409	v=
	410	)
	411	))
	412	\|(?:
	413	youtu\.be\| # just youtu.be/xxxx
	414	vid\.plus\| # or vid.plus/xxxx
	415	zwearz\.com/watch\| # or zwearz.com/watch/xxxx
	416	)/
	417	\|(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
	418	)
	419	)? # all until now is optional -> you can pass the naked ID
	420	([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
	421	(?!.*?\blist=
	422	(?:
	423	%(playlist_id)s\| # combined list/video URLs are handled by the playlist IE
	424	WL # WL are handled by the watch later IE
	425	)
	426	)
	427	(?(1).+)? # if we found the ID, everything can follow
	428	$""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
	429	_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
	430	_formats = {
	431	'5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
	432	'6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
	433	'13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
	434	'17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
	435	'18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
	436	'22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
	437	'34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
	438	'35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
	439	# itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
	440	'36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
	441	'37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
	442	'38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
	443	'43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
	444	'44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
	445	'45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
	446	'46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
	447	'59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
	448	'78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
	449
	450
	451	# 3D videos
	452	'82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
	453	'83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
	454	'84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
	455	'85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
	456	'100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
	457	'101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
	458	'102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
	459
	460	# Apple HTTP Live Streaming
	461	'91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
	462	'92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
	463	'93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
	464	'94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
	465	'95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
	466	'96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
	467	'132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
	468	'151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
	469
	470	# DASH mp4 video
	471	'133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
	472	'134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
	473	'135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
	474	'136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
	475	'137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
	476	'138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
	477	'160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
	478	'212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
	479	'264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
	480	'298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
	481	'299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
	482	'266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
	483
	484	# Dash mp4 audio
	485	'139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
	486	'140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
	487	'141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
	488	'256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
	489	'258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
	490	'325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
	491	'328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
	492
	493	# Dash webm
	494	'167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
	495	'168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
	496	'169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
	497	'170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
	498	'218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
	499	'219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
	500	'278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},

1

# coding: utf-8

2

3

from __future__ import unicode_literals

import itertools

import json

import os.path

import random

import re

import time

import traceback

from .common import InfoExtractor, SearchInfoExtractor

15

from ..jsinterp import JSInterpreter

16

from ..swfinterp import SWFInterpreter

17

from ..compat import (

compat_chr,

compat_HTTPError,

compat_kwargs,

compat_parse_qs,

compat_urllib_parse_unquote,

23

compat_urllib_parse_unquote_plus,

24

compat_urllib_parse_urlencode,

25

compat_urllib_parse_urlparse,

compat_urlparse,

compat_str,

)

from ..utils import (

bool_or_none,

clean_html,

dict_get,

error_to_compat_str,

extract_attributes,

ExtractorError,

float_or_none,

get_element_by_attribute,

get_element_by_id,

int_or_none,

mimetype2ext,

orderedSet,

parse_codecs,

parse_duration,

remove_quotes,

remove_start,

smuggle_url,

str_or_none,

str_to_int,

try_get,

unescapeHTML,

unified_strdate,

unsmuggle_url,

uppercase_escape,

url_or_none,

urlencode_postdata,

)

class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    # Known playlist ID prefixes followed by at least ten ID characters
    _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}'

    def _set_language(self):
        """Set the PREF cookie on .youtube.com with hl=en."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Wrap each video ID in a url_result handled by the 'Youtube' extractor."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Explicit False, as promised by the docstring
            return False

        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            """POST the hidden login form fields plus f_req (JSON-encoded); non-fatal."""
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
                # TODO: reverse actual botguard identifier generation algo
                'bgRequest': '["identifier",""]',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                # Drop everything before the first "[" so the payload parses as JSON
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            self._downloader.report_warning(message)

        # The lookup and the challenge request embed the very same sign-in
        # flow descriptor, so it is built once and shared by both.
        flow_descriptor = [
            None, None,
            [2, 1, None, 1,
             'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
             None, [], 4],
            1, [None, None, []], None, None, None, True
        ]

        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            flow_descriptor,
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            flow_descriptor,
        ]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            # Explicit False, as promised by the docstring
            return False

        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            # Parenthesised: "%" binds tighter than the conditional expression,
            # which previously dropped the prefix (and could pass None on).
            warn('Unable to login: %s' % (
                'Invalid password'
                if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg))
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
                        ' (Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                tfa_code = remove_start(tfa_code, 'G-')

                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    # Same precedence fix as for the password warning above
                    warn('Unable to finish TFA: %s' % (
                        'Invalid TFA code'
                        if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg))
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _download_webpage_handle(self, *args, **kwargs):
        """Delegate to the parent implementation with disable_polymer=true added to the query."""
        # Copy so that a caller-supplied query dict is never mutated
        query = kwargs.get('query', {}).copy()
        query['disable_polymer'] = 'true'
        kwargs['query'] = query
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _real_initialize(self):
        """Set the language cookie and attempt to log in, unless no downloader is attached."""
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return

class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
    # Extract entries from page with "Load more" button
    def _entries(self, page, playlist_id):
        """Yield the entries of page and of every follow-up "Load more" page.

        Each chunk of HTML is handed to self._process_page(); the next chunk
        is located through the data-uix-load-more-href attribute of the most
        recent "Load more" widget. Iteration stops when no such link is found
        or when the downloaded chunk is blank.
        """
        max_retries = 3
        widget_html = chunk_html = page
        for page_num in itertools.count(1):
            for entry in self._process_page(chunk_html):
                yield entry

            more_link = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', widget_html)
            if not more_link:
                break

            attempt = 0
            while attempt <= max_retries:
                if attempt:
                    retry_note = ' (retry #%d)' % attempt
                else:
                    retry_note = ''
                try:
                    # Downloading page may result in intermittent 5xx HTTP error
                    # that is usually worked around with a retry
                    more = self._download_json(
                        'https://youtube.com/%s' % more_link.group('more'), playlist_id,
                        'Downloading page #%s%s' % (page_num, retry_note),
                        transform_source=uppercase_escape)
                except ExtractorError as err:
                    if isinstance(err.cause, compat_HTTPError) and err.cause.code in (500, 503):
                        attempt += 1
                        if attempt <= max_retries:
                            continue
                    # Non-retryable error, or retries exhausted
                    raise
                else:
                    break

            chunk_html = more['content_html']
            if not chunk_html.strip():
                # Some webpages show a "Load more" button but they don't
                # have more videos
                break
            widget_html = more['load_more_widget_html']

320

321

322

class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    """Entry-list extractor whose pages contain links to individual videos.

    Subclasses are expected to provide ``_VIDEO_RE``, the pattern used by
    ``extract_videos_from_page``.
    """

    def _process_page(self, content):
        """Yield a ``url_result`` for every video found in ``content``."""
        for found_id, found_title in self.extract_videos_from_page(content):
            yield self.url_result(found_id, 'Youtube', found_id, found_title)

    def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page):
        """Collect video ids and titles matched by ``video_re`` in ``page``.

        ``ids_in_page`` and ``titles_in_page`` are parallel lists that are
        updated in place: an unseen id is appended together with its title
        (possibly ``None``); for an id that is already present, a missing
        title is filled in when this match supplies one.
        """
        for match in re.finditer(video_re, page):
            named = match.groupdict()
            video_id = match.group('id')
            # The link with index 0 is not the first video of the playlist (not sure if still actual)
            # NOTE(review): this compares the 'id' group with '0'; it looks
            # like the 'index' group was intended. Kept as-is to preserve
            # behaviour -- confirm before changing.
            if 'index' in named and video_id == '0':
                continue

            if 'title' in named:
                video_title = unescapeHTML(match.group('title'))
            else:
                video_title = None
            if video_title:
                video_title = video_title.strip()
            if video_title == '► Play all':
                # Label of the "play all" link, not a real video title.
                video_title = None

            try:
                position = ids_in_page.index(video_id)
            except ValueError:
                # First time this id is seen.
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)
            else:
                if video_title and not titles_in_page[position]:
                    titles_in_page[position] = video_title

    def extract_videos_from_page(self, page):
        """Return ``zip(ids, titles)`` for the videos matched by ``self._VIDEO_RE``."""
        collected_ids, collected_titles = [], []
        self.extract_videos_from_page_impl(
            self._VIDEO_RE, page, collected_ids, collected_titles)
        return zip(collected_ids, collected_titles)

353

354

355

class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    """Entry-list extractor whose pages contain links to playlists."""

    def _process_page(self, content):
        """Yield a ``url_result`` for each distinct playlist id linked in ``content``."""
        linked_ids = re.findall(
            r'<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"',
            content)
        # orderedSet removes repeated ids from the matches.
        for playlist_id in orderedSet(linked_ids):
            playlist_url = 'https://www.youtube.com/playlist?list=%s' % playlist_id
            yield self.url_result(playlist_url, 'YoutubePlaylist')

    def _real_extract(self, url):
        """Download ``url`` and return a playlist result of the playlists it lists."""
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)
        # The title is optional: a page without one still yields a result.
        title = self._og_search_title(webpage, fatal=False)
        entries = self._entries(webpage, playlist_id)
        return self.playlist_result(entries, playlist_id, title)

368

369

370

class YoutubeIE(YoutubeBaseInfoExtractor):

371

IE_DESC = 'YouTube.com'

372

_VALID_URL = r"""(?x)^

373

(

374

(?:https?://|//) # http(s):// or protocol-independent URL

375

(?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/|

376

(?:www\.)?deturl\.com/www\.youtube\.com/|

377

(?:www\.)?pwnyoutube\.com/|

378

(?:www\.)?hooktube\.com/|

379

(?:www\.)?yourepeat\.com/|

380

tube\.majestyc\.net/|

381

# Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances

382

(?:(?:www|dev)\.)?invidio\.us/|

383

(?:(?:www|no)\.)?invidiou\.sh/|

384

(?:(?:www|fi|de)\.)?invidious\.snopyta\.org/|

385

(?:www\.)?invidious\.kabi\.tk/|

386

(?:www\.)?invidious\.13ad\.de/|

387

(?:www\.)?invidious\.mastodon\.host/|

388

(?:www\.)?invidious\.nixnet\.xyz/|

389

(?:www\.)?invidious\.drycat\.fr/|

390

(?:www\.)?tube\.poal\.co/|

391

(?:www\.)?vid\.wxzm\.sx/|

392

(?:www\.)?yt\.elukerio\.org/|

393

(?:www\.)?yt\.lelux\.fi/|

394

(?:www\.)?kgg2m7yk5aybusll\.onion/|

395

(?:www\.)?qklhadlycap4cnod\.onion/|

396

(?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|

397

(?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/|

398

(?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|

399

(?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|

400

(?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/|

401

youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains

402

(?:.*?\#/)? # handle anchor (#/) redirect urls

403

(?: # the various things that can precede the ID:

404

(?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/

405

|(?: # or the v= param in all its forms

406

(?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)

407

(?:\?|\#!?) # the params delimiter ? or # or #!

408

(?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&v=V36LpHqtcDY)

v=

)

))

|(?:

youtu\.be| # just youtu.be/xxxx

414

vid\.plus| # or vid.plus/xxxx

415

zwearz\.com/watch| # or zwearz.com/watch/xxxx

416

)/

417

|(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=

418

)

419

)? # all until now is optional -> you can pass the naked ID

420

([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID

421

(?!.*?\blist=

422

(?:

423

%(playlist_id)s| # combined list/video URLs are handled by the playlist IE

424

WL # WL are handled by the watch later IE

425

)

426

)

427

(?(1).+)? # if we found the ID, everything can follow

428

$""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}

429

_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'

430

_formats = {

431

'5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},

432

'6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},

433

'13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},

434

'17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},

435

'18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},

436

'22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},

437

'34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},

438

'35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},

439

# itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well

440

'36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},

441

'37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},

442

'38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},

443

'43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},

444

'44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},

445

'45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},

446

'46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},

447

'59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},

448

'78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},

# 3D videos

'82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},

453

'83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},

454

'84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},

455

'85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},

456

'100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},

457

'101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},

458

'102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},

459

460

# Apple HTTP Live Streaming

461

'91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},

462

'92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},

463

'93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},

464

'94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},

465

'95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},

466

'96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},

467

'132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},

468

'151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},

469

470

# DASH mp4 video

471

'133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},

472

'134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},

473

'135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},

474

'136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},

475

'137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},

476

'138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)

477

'160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},

478

'212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},

479

'264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},

480

'298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},

481

'299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},

482

'266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},

483

484

# Dash mp4 audio

485

'139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},

486

'140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},

487

'141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},

488

'256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},

489

'258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},

490

'325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},

491

'328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},

492

493

# Dash webm

494

'167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},

495

'168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},

496

'169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},

497

'170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},

498

'218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},

499

'219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},

500

'278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},

501

'242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},

502

'243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},

503

'244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},

504

'245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},

505

'246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},

506

'247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},

507

'248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},

508

'271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},

509

# itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)

510

'272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},

511

'302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},

512

'303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},

513

'308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},

514

'313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},

515

'315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},

516

517

# Dash webm audio

518

'171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},

519

'172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},

520

521

# Dash webm audio with opus inside

522

'249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},

523

'250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},

524

'251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},

525

526

# RTMP (unnamed)

527

'_rtmp': {'protocol': 'rtmp'},

528

529

# av01 video only formats sometimes served with "unknown" codecs

530

'394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},

531

'395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},

532

'396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},

533

'397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},

534

}

535

_SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt')

_GEO_BYPASS = False

IE_NAME = 'youtube'

_TESTS = [

{

'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',

'info_dict': {

'id': 'BaW_jenozKc',

'ext': 'mp4',

'title': 'youtube-dl test video "\'/\\ä↭𝕐',

547

'uploader': 'Philipp Hagemeister',

548

'uploader_id': 'phihag',

549

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',

550

'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',

551

'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',

552

'upload_date': '20121002',

553

'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',

554

'categories': ['Science & Technology'],

555

'tags': ['youtube-dl'],

'duration': 10,

'view_count': int,

'like_count': int,

'dislike_count': int,

'start_time': 1,

'end_time': 9,

}

},

{

'url': 'https://www.youtube.com/watch?v=UxxajLWwzqY',

566

'note': 'Test generic use_cipher_signature video (#897)',

'info_dict': {

'id': 'UxxajLWwzqY',

'ext': 'mp4',

'upload_date': '20120506',

571

'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',

572

'alt_title': 'I Love It (feat. Charli XCX)',

573

'description': 'md5:f3ceb5ef83a08d95b9d146f973157cc8',

574

'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',

575

'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',

576

'iconic ep', 'iconic', 'love', 'it'],

577

'duration': 180,

578

'uploader': 'Icona Pop',

579

'uploader_id': 'IconaPop',

580

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop',

581

'creator': 'Icona Pop',

582

'track': 'I Love It (feat. Charli XCX)',

583

'artist': 'Icona Pop',

}

},

{

'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',

588

'note': 'Test VEVO video with age protection (#956)',

'info_dict': {

'id': '07FYdnEawAQ',

'ext': 'mp4',

'upload_date': '20130703',

593

'title': 'Justin Timberlake - Tunnel Vision (Official Music Video) (Explicit)',

594

'alt_title': 'Tunnel Vision',

595

'description': 'md5:07dab3356cde4199048e4c7cd93471e1',

596

'duration': 419,

597

'uploader': 'justintimberlakeVEVO',

598

'uploader_id': 'justintimberlakeVEVO',

599

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO',

600

'creator': 'Justin Timberlake',

601

'track': 'Tunnel Vision',

602

'artist': 'Justin Timberlake',

'age_limit': 18,

}

},

{

'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',

608

'note': 'Embed-only video (#1746)',

'info_dict': {

'id': 'yZIXLfi8CZQ',

'ext': 'mp4',

'upload_date': '20120608',

613

'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',

614

'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',

615

'uploader': 'SET India',

616

'uploader_id': 'setindia',

617

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',

'age_limit': 18,

}

},

{

'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY',

623

'note': 'Use the first video ID in the URL',

'info_dict': {

'id': 'BaW_jenozKc',

'ext': 'mp4',

'title': 'youtube-dl test video "\'/\\ä↭𝕐',

628

'uploader': 'Philipp Hagemeister',

629

'uploader_id': 'phihag',

630

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',

631

'upload_date': '20121002',

632

'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',

633

'categories': ['Science & Technology'],

634

'tags': ['youtube-dl'],

'duration': 10,

'view_count': int,

'like_count': int,

'dislike_count': int,

639

},

640

'params': {

641

'skip_download': True,

},

},

{

'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',

646

'note': '256k DASH audio (format 141) via DASH manifest',

'info_dict': {

'id': 'a9LDPn-MO4I',

'ext': 'm4a',

'upload_date': '20121002',

651

'uploader_id': '8KVIDEO',

652

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',

653

'description': '',

654

'uploader': '8KVIDEO',

655

'title': 'UHDTV TEST 8K VIDEO.mp4'

656

},

657

'params': {

658

'youtube_include_dash_manifest': True,

659

'format': '141',

660

},

661

'skip': 'format 141 not served anymore',

662

},

663

# DASH manifest with encrypted signature

664

{

665

'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',

'info_dict': {

'id': 'IB3lcPjvWLA',

'ext': 'm4a',

'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson',

670

'description': 'md5:8f5e2b82460520b619ccac1f509d43bf',

671

'duration': 244,

672

'uploader': 'AfrojackVEVO',

673

'uploader_id': 'AfrojackVEVO',

674

'upload_date': '20131011',

675

},

676

'params': {

677

'youtube_include_dash_manifest': True,

678

'format': '141/bestaudio[ext=m4a]',

679

},

680

},

681

# JS player signature function name containing $

682

{

683

'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',

'info_dict': {

'id': 'nfWlot6h_JM',

'ext': 'm4a',

'title': 'Taylor Swift - Shake It Off',

688

'description': 'md5:bec2185232c05479482cb5a9b82719bf',

689

'duration': 242,

690

'uploader': 'TaylorSwiftVEVO',

691

'uploader_id': 'TaylorSwiftVEVO',

692

'upload_date': '20140818',

693

'creator': 'Taylor Swift',

694

},

695

'params': {

696

'youtube_include_dash_manifest': True,

697

'format': '141/bestaudio[ext=m4a]',

},

},

# Controversy video

{

'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',

'info_dict': {

'id': 'T4XJQO3qol8',

'ext': 'mp4',

'duration': 219,

'upload_date': '20100909',

708

'uploader': 'Amazing Atheist',

709

'uploader_id': 'TheAmazingAtheist',

710

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',

711

'title': 'Burning Everyone\'s Koran',

712

'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',

713

}

714

},

715

# Normal age-gate video (No vevo, embed allowed)

716

{

717

'url': 'https://youtube.com/watch?v=HtVdAasjOgU',

'info_dict': {

'id': 'HtVdAasjOgU',

'ext': 'mp4',

'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',

722

'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',

723

'duration': 142,

724

'uploader': 'The Witcher',

725

'uploader_id': 'WitcherGame',

726

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',

727

'upload_date': '20140605',

'age_limit': 18,

},

},

# Age-gate video with encrypted signature

732

{

733

'url': 'https://www.youtube.com/watch?v=6kLq3WMV1nU',

'info_dict': {

'id': '6kLq3WMV1nU',

'ext': 'mp4',

'title': 'Dedication To My Ex (Miss That) (Lyric Video)',

738

'description': 'md5:33765bb339e1b47e7e72b5490139bb41',

739

'duration': 246,

740

'uploader': 'LloydVEVO',

741

'uploader_id': 'LloydVEVO',

742

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/LloydVEVO',

743

'upload_date': '20110629',

'age_limit': 18,

},

},

# video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421)

748

# YouTube Red ad is not captured for creator

749

{

750

'url': '__2ABJjxzNo',

'info_dict': {

'id': '__2ABJjxzNo',

'ext': 'mp4',

'duration': 266,

'upload_date': '20100430',

756

'uploader_id': 'deadmau5',

757

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',

758

'creator': 'deadmau5',

759

'description': 'md5:12c56784b8032162bb936a5f76d55360',

760

'uploader': 'deadmau5',

761

'title': 'Deadmau5 - Some Chords (HD)',

762

'alt_title': 'Some Chords',

763

},

764

'expected_warnings': [

765

'DASH manifest missing',

766

]

767

},

768

# Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)

769

{

770

'url': 'lqQg6PlCWgI',

'info_dict': {

'id': 'lqQg6PlCWgI',

'ext': 'mp4',

'duration': 6085,

'upload_date': '20150827',

776

'uploader_id': 'olympic',

777

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',

778

'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',

779

'uploader': 'Olympic',

780

'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',

781

},

782

'params': {

783

'skip_download': 'requires avconv',

}

},

# Non-square pixels

{

'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',

'info_dict': {

'id': '_b-2C3KPAM0',

'ext': 'mp4',

'stretched_ratio': 16 / 9.,

793

'duration': 85,

794

'upload_date': '20110310',

795

'uploader_id': 'AllenMeow',

796

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',

797

'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',

798

'uploader': '孫ᄋᄅ',

799

'title': '[A-made] 變態妍字幕版太妍我就是這樣的人',

800

},

801

},

802

# url_encoded_fmt_stream_map is empty string

803

{

804

'url': 'qEJwOuvDf7I',

'info_dict': {

'id': 'qEJwOuvDf7I',

'ext': 'webm',

'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',

809

'description': '',

810

'upload_date': '20150404',

811

'uploader_id': 'spbelect',

812

'uploader': 'Наблюдатели Петербурга',

813

},

814

'params': {

815

'skip_download': 'requires avconv',

816

},

817

'skip': 'This live event has ended.',

818

},

819

# Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)

820

{

821

'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',

'info_dict': {

'id': 'FIl7x6_3R5Y',

'ext': 'webm',

'title': 'md5:7b81415841e02ecd4313668cde88737a',

826

'description': 'md5:116377fd2963b81ec4ce64b542173306',

827

'duration': 220,

828

'upload_date': '20150625',

829

'uploader_id': 'dorappi2000',

830

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',

831

'uploader': 'dorappi2000',

832

'formats': 'mincount:31',

833

},

834

'skip': 'not actual anymore',

835

},

836

# DASH manifest with segment_list

837

{

838

'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',

839

'md5': '8ce563a1d667b599d21064e982ab9e31',

'info_dict': {

'id': 'CsmdDsKjzN8',

'ext': 'mp4',

'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510

844

'uploader': 'Airtek',

845

'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',

846

'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',

847

'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',

848

},

849

'params': {

850

'youtube_include_dash_manifest': True,

851

'format': '135', # bestvideo

852

},

853

'skip': 'This live event has ended.',

854

},

855

{

856

# Multifeed videos (multiple cameras), URL is for Main Camera

857

'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',

858

'info_dict': {

859

'id': 'jqWvoWXjCVs',

860

'title': 'teamPGP: Rocket League Noob Stream',

861

'description': 'md5:dc7872fb300e143831327f1bae3af010',

},

'playlist': [{

'info_dict': {

'id': 'jqWvoWXjCVs',

'ext': 'mp4',

'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',

868

'description': 'md5:dc7872fb300e143831327f1bae3af010',

869

'duration': 7335,

870

'upload_date': '20150721',

871

'uploader': 'Beer Games Beer',

872

'uploader_id': 'beergamesbeer',

873

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',

874

'license': 'Standard YouTube License',

},

}, {

'info_dict': {

'id': '6h8e8xoXJzg',

'ext': 'mp4',

'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',

881

'description': 'md5:dc7872fb300e143831327f1bae3af010',

882

'duration': 7337,

883

'upload_date': '20150721',

884

'uploader': 'Beer Games Beer',

885

'uploader_id': 'beergamesbeer',

886

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',

887

'license': 'Standard YouTube License',

},

}, {

'info_dict': {

'id': 'PUOgX5z9xZw',

'ext': 'mp4',

'title': 'teamPGP: Rocket League Noob Stream (grizzle)',

894

'description': 'md5:dc7872fb300e143831327f1bae3af010',

895

'duration': 7337,

896

'upload_date': '20150721',

897

'uploader': 'Beer Games Beer',

898

'uploader_id': 'beergamesbeer',

899

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',

900

'license': 'Standard YouTube License',

},

}, {

'info_dict': {

'id': 'teuwxikvS5k',

'ext': 'mp4',

'title': 'teamPGP: Rocket League Noob Stream (zim)',

907

'description': 'md5:dc7872fb300e143831327f1bae3af010',

908

'duration': 7334,

909

'upload_date': '20150721',

910

'uploader': 'Beer Games Beer',

911

'uploader_id': 'beergamesbeer',

912

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',

913

'license': 'Standard YouTube License',

},

}],

'params': {

'skip_download': True,

918

},

919

'skip': 'This video is not available.',

920

},

921

{

922

# Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)

923

'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',

924

'info_dict': {

925

'id': 'gVfLd0zydlo',

926

'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',

927

},

928

'playlist_count': 2,

929

'skip': 'Not multifeed anymore',

930

},

931

{

932

'url': 'https://vid.plus/FlRa-iH7PGw',

933

'only_matching': True,

934

},

935

{

936

'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',

937

'only_matching': True,

938

},

939

{

940

# Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)

941

# Also tests cut-off URL expansion in video description (see

942

# https://github.com/ytdl-org/youtube-dl/issues/1892,

943

# https://github.com/ytdl-org/youtube-dl/issues/8164)

944

'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',

'info_dict': {

'id': 'lsguqyKfVQg',

'ext': 'mp4',

'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',

949

'alt_title': 'Dark Walk - Position Music',

950

'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',

951

'duration': 133,

952

'upload_date': '20151119',

953

'uploader_id': 'IronSoulElf',

954

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',

955

'uploader': 'IronSoulElf',

956

'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',

957

'track': 'Dark Walk - Position Music',

958

'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',

959

'album': 'Position Music - Production Music Vol. 143 - Dark Walk',

960

},

961

'params': {

962

'skip_download': True,

},

},

{

# Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)

967

'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',

968

'only_matching': True,

969

},

970

{

971

# Video with yt:stretch=17:0

972

'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',

'info_dict': {

'id': 'Q39EVAstoRM',

'ext': 'mp4',

'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',

977

'description': 'md5:ee18a25c350637c8faff806845bddee9',

978

'upload_date': '20151107',

979

'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',

980

'uploader': 'CH GAMER DROID',

981

},

982

'params': {

983

'skip_download': True,

984

},

985

'skip': 'This video does not exist.',

986

},

987

{

988

# Video licensed under Creative Commons

989

'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',

'info_dict': {

'id': 'M4gD1WSo5mA',

'ext': 'mp4',

'title': 'md5:e41008789470fc2533a3252216f1c1d1',

994

'description': 'md5:a677553cf0840649b731a3024aeff4cc',

995

'duration': 721,

996

'upload_date': '20150127',

997

'uploader_id': 'BerkmanCenter',

998

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',

999

'uploader': 'The Berkman Klein Center for Internet & Society',

1000

'license': 'Creative Commons Attribution license (reuse allowed)',

1001

},

1002

'params': {

1003

'skip_download': True,

},

},

{

# Channel-like uploader_url

1008

'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',

'info_dict': {

'id': 'eQcmzGIKrzg',

'ext': 'mp4',

'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',

1013

'description': 'md5:dda0d780d5a6e120758d1711d062a867',

1014

'duration': 4060,

1015

'upload_date': '20151119',

1016

'uploader': 'Bernie Sanders',

1017

'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',

1018

'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',

1019

'license': 'Creative Commons Attribution license (reuse allowed)',

1020

},

1021

'params': {

1022

'skip_download': True,

},

},

{

'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;v=V36LpHqtcDY',

1027

'only_matching': True,

1028

},

1029

{

1030

# YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)

1031

'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',

1032

'only_matching': True,

1033

},

1034

{

1035

# Rental video preview

1036

'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',

'info_dict': {

'id': 'uGpuVWrhIzE',

'ext': 'mp4',

'title': 'Piku - Trailer',

1041

'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',

1042

'upload_date': '20150811',

1043

'uploader': 'FlixMatrix',

1044

'uploader_id': 'FlixMatrixKaravan',

1045

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',

1046

'license': 'Standard YouTube License',

1047

},

1048

'params': {

1049

'skip_download': True,

1050

},

1051

'skip': 'This video is not available.',

1052

},

1053

{

1054

# YouTube Red video with episode data

1055

'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',

'info_dict': {

'id': 'iqKdEhx-dD4',

'ext': 'mp4',

'title': 'Isolation - Mind Field (Ep 1)',

1060

'description': 'md5:46a29be4ceffa65b92d277b93f463c0f',

1061

'duration': 2085,

1062

'upload_date': '20170118',

1063

'uploader': 'Vsauce',

1064

'uploader_id': 'Vsauce',

1065

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',

1066

'series': 'Mind Field',

'season_number': 1,

'episode_number': 1,

},

'params': {

'skip_download': True,

1072

},

1073

'expected_warnings': [

1074

'Skipping DASH manifest',

],

},

{

# The following content has been identified by the YouTube community

1079

# as inappropriate or offensive to some audiences.

1080

'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',

'info_dict': {

'id': '6SJNVb0GnPI',

'ext': 'mp4',

'title': 'Race Differences in Intelligence',

1085

'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',

1086

'duration': 965,

1087

'upload_date': '20140124',

1088

'uploader': 'New Century Foundation',

1089

'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',

1090

'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',

1091

},

1092

'params': {

1093

'skip_download': True,

},

},

{

# itag 212

'url': '1t24XAntNCY',

1099

'only_matching': True,

1100

},

1101

{

1102

# geo restricted to JP

1103

'url': 'sJL6WA-aGkQ',

1104

'only_matching': True,

1105

},

1106

{

1107

'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',

1108

'only_matching': True,

1109

},

1110

{

1111

'url': 'https://invidio.us/watch?v=BaW_jenozKc',

1112

'only_matching': True,

},

{

# DRM protected

'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',

1117

'only_matching': True,

1118

},

1119

{

1120

# Video with unsupported adaptive stream type formats

1121

'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',

'info_dict': {

'id': 'Z4Vy8R84T1U',

'ext': 'mp4',

'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',

1126

'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',

1127

'duration': 433,

1128

'upload_date': '20130923',

1129

'uploader': 'Amelia Putri Harwita',

1130

'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',

1131

'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',

1132

'formats': 'maxcount:10',

1133

},

1134

'params': {

1135

'skip_download': True,

1136

'youtube_include_dash_manifest': False,

},

},

{

# Youtube Music Auto-generated description

1141

'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',

'info_dict': {

'id': 'MgNrAu2pzNs',

'ext': 'mp4',

'title': 'Voyeur Girl',

1146

'description': 'md5:7ae382a65843d6df2685993e90a8628f',

1147

'upload_date': '20190312',

1148

'uploader': 'Various Artists - Topic',

1149

'uploader_id': 'UCVWKBi1ELZn0QX2CBLSkiyw',

1150

'artist': 'Stephen',

1151

'track': 'Voyeur Girl',

1152

'album': 'it\'s too much love to know my dear',

1153

'release_date': '20190313',

1154

'release_year': 2019,

1155

},

1156

'params': {

1157

'skip_download': True,

},

},

{

# Youtube Music Auto-generated description

1162

# Retrieve 'artist' field from 'Artist:' in video description

1163

# when it is present on youtube music video

1164

'url': 'https://www.youtube.com/watch?v=k0jLE7tTwjY',

'info_dict': {

'id': 'k0jLE7tTwjY',

'ext': 'mp4',

'title': 'Latch Feat. Sam Smith',

1169

'description': 'md5:3cb1e8101a7c85fcba9b4fb41b951335',

1170

'upload_date': '20150110',

1171

'uploader': 'Various Artists - Topic',

1172

'uploader_id': 'UCNkEcmYdjrH4RqtNgh7BZ9w',

1173

'artist': 'Disclosure',

1174

'track': 'Latch Feat. Sam Smith',

1175

'album': 'Latch Featuring Sam Smith',

1176

'release_date': '20121008',

1177

'release_year': 2012,

1178

},

1179

'params': {

1180

'skip_download': True,

},

},

{

# Youtube Music Auto-generated description

1185

# handle multiple artists on youtube music video

1186

'url': 'https://www.youtube.com/watch?v=74qn0eJSjpA',

'info_dict': {

'id': '74qn0eJSjpA',

'ext': 'mp4',

'title': 'Eastside',

'description': 'md5:290516bb73dcbfab0dcc4efe6c3de5f2',

1192

'upload_date': '20180710',

1193

'uploader': 'Benny Blanco - Topic',

1194

'uploader_id': 'UCzqz_ksRu_WkIzmivMdIS7A',

1195

'artist': 'benny blanco, Halsey, Khalid',

1196

'track': 'Eastside',

1197

'album': 'Eastside',

1198

'release_date': '20180713',

1199

'release_year': 2018,

1200

},

1201

'params': {

1202

'skip_download': True,

},

},

{

# Youtube Music Auto-generated description

1207

# handle youtube music video with release_year and no release_date

1208

'url': 'https://www.youtube.com/watch?v=-hcAI0g-f5M',

'info_dict': {

'id': '-hcAI0g-f5M',

'ext': 'mp4',

'title': 'Put It On Me',

1213

'description': 'md5:93c55acc682ae7b0c668f2e34e1c069e',

1214

'upload_date': '20180426',

1215

'uploader': 'Matt Maeson - Topic',

1216

'uploader_id': 'UCnEkIGqtGcQMLk73Kp-Q5LQ',

1217

'artist': 'Matt Maeson',

1218

'track': 'Put It On Me',

1219

'album': 'The Hearse',

1220

'release_date': None,

1221

'release_year': 2018,

1222

},

1223

'params': {

1224

'skip_download': True,

},

},

{

'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',

1229

'only_matching': True,

},

]

def __init__(self, *args, **kwargs):
    """Set up the extractor and an empty per-instance player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Filled lazily by _decrypt_signature; keys are
    # (player_url, signature cache id) tuples, values are decipher functions.
    self._player_cache = dict()

1236

1237

def report_video_info_webpage_download(self, video_id):
    """Print a notice that the video info webpage of ``video_id`` is being downloaded."""
    notice = '%s: Downloading video info webpage' % video_id
    self.to_screen(notice)

1240

1241

def report_information_extraction(self, video_id):
    """Print a notice that information for ``video_id`` is being extracted."""
    notice = '%s: Extracting video information' % video_id
    self.to_screen(notice)

1244

1245

def report_unavailable_format(self, video_id, format):
    """Print a notice that ``format`` is not available for ``video_id``."""
    notice = '%s: Format %s not available' % (video_id, format)
    self.to_screen(notice)

1248

1249

def report_rtmp_download(self):
    """Print a notice that the download will go through the RTMP protocol."""
    notice = 'RTMP download detected'
    self.to_screen(notice)

1252

1253

def _signature_cache_id(self, example_sig):
    """Return a string describing the shape of a signature.

    The result is the lengths of the dot-separated parts of
    ``example_sig``, themselves joined with dots.
    """
    part_lengths = [compat_str(len(chunk)) for chunk in example_sig.split('.')]
    return '.'.join(part_lengths)

1256

1257

def _extract_signature_function(self, video_id, player_url, example_sig):
    """Return a callable that deciphers signatures for the given player.

    The player id and type (file extension, 'js' or 'swf') are parsed out
    of ``player_url``. If the downloader cache already holds an index list
    under 'youtube-sigfuncs' for this player and signature shape, a
    function applying that list is returned. Otherwise the player is
    downloaded and parsed, and the index list derived from it is stored in
    the cache before the parsed function is returned.

    Raises ExtractorError when ``player_url`` does not look like a player URL.
    """
    player_match = re.match(
        r'.*?[-.](?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2,3}_[A-Z]{2})?/base)?\.(?P<ext>[a-z]+)$',
        player_url)
    if player_match is None:
        raise ExtractorError('Cannot identify player %r' % player_url)
    player_id = player_match.group('id')
    player_type = player_match.group('ext')

    # Cache key: player type, player id and the shape of the signature.
    func_id = '_'.join(
        (player_type, player_id, self._signature_cache_id(example_sig)))
    # The key is used as a cache file name, so it must not contain a path.
    assert os.path.basename(func_id) == func_id

    # Read from filesystem cache
    cached_indices = self._downloader.cache.load('youtube-sigfuncs', func_id)
    if cached_indices is not None:
        def decipher_from_cache(sig):
            return ''.join(sig[idx] for idx in cached_indices)
        return decipher_from_cache

    if self._downloader.params.get('verbose'):
        download_note = 'Downloading player %s' % player_url
    else:
        download_note = 'Downloading %s player %s' % (player_type, player_id)
    failure_note = 'Download of %s failed' % player_url

    if player_type == 'js':
        player_code = self._download_webpage(
            player_url, video_id, note=download_note, errnote=failure_note)
        decipher = self._parse_sig_js(player_code)
    elif player_type == 'swf':
        handle = self._request_webpage(
            player_url, video_id, note=download_note, errnote=failure_note)
        decipher = self._parse_sig_swf(handle.read())
    else:
        assert False, 'Invalid player type %r' % player_type

    # Run a probe string made of the code points 0..len-1 through the
    # function; the ordinals of the output are the source index of every
    # output character, which is what gets cached.
    probe = ''.join(
        compat_chr(code_point) for code_point in range(len(example_sig)))
    index_list = [ord(ch) for ch in decipher(probe)]

    self._downloader.cache.store('youtube-sigfuncs', func_id, index_list)
    return decipher

1302

1303

def _print_sig_code(self, func, example_sig):
    """Print Python code equivalent to the signature function ``func``.

    ``func`` is applied to a probe string whose characters are the code
    points 0..len(example_sig)-1, so the ordinals of its output are the
    source indices it picks. Those indices are rendered as a sum of
    ``s[...]`` index and slice expressions and passed to ``to_screen``.
    """
    def _slice_expr(first, last, stride):
        # Render the run first..last (inclusive) walked with the given stride.
        lower = str(first) if first != 0 else ''
        bound = last + stride
        upper = (':%d' % bound) if bound >= 0 else ':'
        tail = (':%d' % stride) if stride != 1 else ''
        return 's[%s%s%s]' % (lower, upper, tail)

    def _index_exprs(positions):
        stride = None
        # Placeholder only; run_start is always assigned together with stride.
        run_start = '(Never used)'
        for cur, before in zip(positions[1:], positions[:-1]):
            delta = cur - before
            if stride is not None:
                if delta != stride:
                    # The current run ends at the previous index.
                    yield _slice_expr(run_start, before, stride)
                    stride = None
                continue
            if delta in (-1, 1):
                # Start of a run of consecutive indices.
                stride = delta
                run_start = before
            else:
                yield 's[%d]' % before
        if stride is None:
            yield 's[%d]' % cur
        else:
            yield _slice_expr(run_start, cur, stride)

    probe = ''.join(
        compat_chr(code_point) for code_point in range(len(example_sig)))
    picked_indices = [ord(ch) for ch in func(probe)]
    expr_code = ' + '.join(_index_exprs(picked_indices))
    part_lengths = ', '.join(
        compat_str(len(chunk)) for chunk in example_sig.split('.'))
    signature_id_tuple = '(%s)' % part_lengths
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            ' return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)

1341

1342

def _parse_sig_js(self, jscode):
    """Find the signature function in JS player code and wrap it.

    ``jscode`` is searched with the patterns below, in order; the ``sig``
    group of the first match is taken as the function name. That function
    is then extracted from the code with JSInterpreter.

    Returns a callable that takes the signature string and passes it to
    the extracted function as its single argument.
    """
    # Literal parentheses of the JS source must be written as \( and \).
    # A bare `$` outside a character class is an end-of-string anchor and
    # would make a pattern unmatchable when more characters are required
    # after it.
    funcname_patterns = (
        r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
        r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
        # Obsolete patterns
        r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
        r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
    )
    funcname = self._search_regex(
        funcname_patterns, jscode,
        'Initial JS player signature function name', group='sig')

    jsi = JSInterpreter(jscode)
    initial_function = jsi.extract_function(funcname)
    return lambda s: initial_function([s])

1362

1363

def _parse_sig_swf(self, file_contents):
    """Extract the signature function from the contents of an SWF player.

    The ``decipher`` function of the ``SignatureDecipher`` class is looked
    up with SWFInterpreter. The returned callable takes the signature
    string and passes it to that function as its single argument.
    """
    TARGET_CLASSNAME = 'SignatureDecipher'
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class(TARGET_CLASSNAME)
    swf_decipher = interpreter.extract_function(decipher_class, 'decipher')

    def decipher(sig):
        return swf_decipher([sig])

    return decipher

1369

1370

def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
    """Turn the encrypted s field into a working signature.

    The decipher function for ``player_url`` is taken from
    ``self._player_cache`` or, on a miss, built with
    ``_extract_signature_function`` and stored there. ``age_gate`` is not
    used in this method.

    Raises ExtractorError when ``player_url`` is None, and wraps any
    exception raised while obtaining or applying the function in an
    ExtractorError that carries the traceback text.
    """
    if player_url is None:
        raise ExtractorError('Cannot decrypt signature without player_url')

    # Turn protocol-relative and site-relative player URLs into absolute ones.
    if player_url.startswith('//'):
        player_url = 'https:' + player_url
    elif re.match(r'https?://', player_url) is None:
        player_url = compat_urlparse.urljoin(
            'https://www.youtube.com', player_url)

    try:
        cache_key = (player_url, self._signature_cache_id(s))
        decipher = self._player_cache.get(cache_key)
        if decipher is None:
            decipher = self._extract_signature_function(
                video_id, player_url, s)
            self._player_cache[cache_key] = decipher
        if self._downloader.params.get('youtube_print_sig_code'):
            self._print_sig_code(decipher, s)
        return decipher(s)
    except Exception as err:
        raise ExtractorError(
            'Signature extraction failed: ' + traceback.format_exc(),
            cause=err)

1396

1397

def _get_subtitles(self, video_id, webpage):
    """Return the subtitle tracks listed for ``video_id``.

    The track list is downloaded as XML from the timedtext endpoint. The
    result maps each language code to a list of {'url', 'ext'} dicts, one
    per entry of ``self._SUBTITLE_FORMATS``. Only the first track of each
    language is kept. An empty dict is returned, after a warning, when the
    list cannot be downloaded or contains no tracks. ``webpage`` is not
    used in this method.
    """
    list_url = 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        track_list = self._download_xml(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning(
            'unable to download video subtitles: %s' % error_to_compat_str(err))
        return {}

    subtitles = {}
    for track in track_list.findall('track'):
        lang = track.attrib['lang_code']
        if lang in subtitles:
            continue
        subtitles[lang] = [{
            'url': 'https://www.youtube.com/api/timedtext?' + compat_urllib_parse_urlencode({
                'lang': lang,
                'v': video_id,
                'fmt': ext,
                'name': track.attrib['name'].encode('utf-8'),
            }),
            'ext': ext,
        } for ext in self._SUBTITLE_FORMATS]

    if not subtitles:
        self._downloader.report_warning('video doesn\'t have subtitles')
    return subtitles

def _get_ytplayer_config(self, video_id, webpage):
    """Extract and parse the ``ytplayer.config`` JSON object from ``webpage``.

    Returns the result of ``_parse_json`` (called with fatal=False), or
    None when no pattern matches.
    """
    # User data may contain arbitrary character sequences that may affect
    # JSON extraction with regex, e.g. when '};' is contained the second
    # regex won't capture the whole JSON. Yet working around by trying more
    # concrete regex first keeping in mind proper quoted string handling
    # to be implemented in future that will replace this workaround (see
    # https://github.com/ytdl-org/youtube-dl/issues/7468,
    # https://github.com/ytdl-org/youtube-dl/pull/7599)
    config_patterns = (
        r';ytplayer\.config\s*=\s*({.+?});ytplayer',
        r';ytplayer\.config\s*=\s*({.+?});',
    )
    raw_config = self._search_regex(
        config_patterns, webpage, 'ytplayer.config', default=None)
    if not raw_config:
        return None
    return self._parse_json(
        uppercase_escape(raw_config), video_id, fatal=False)

1446

1447

def _get_automatic_captions(self, video_id, webpage):
    """Return automatic caption URLs for ``video_id``.

    The already downloaded ``webpage`` is passed in so that the player
    config can be read from it instead of being fetched again.

    Three sources in the player config ``args`` are tried in order: the
    ``ttsurl`` entry, the ``player_response`` JSON string, and the
    ``caption_tracks`` / ``caption_translation_languages`` pair. The
    result maps each language code to a list of {'url', 'ext'} dicts, one
    per entry of ``self._SUBTITLE_FORMATS``. An empty dict is returned,
    after a warning, when nothing can be found.
    """
    self.to_screen('%s: Looking for automatic captions' % video_id)
    player_config = self._get_ytplayer_config(video_id, webpage)
    err_msg = 'Couldn\'t find automatic captions for %s' % video_id
    if not player_config:
        self._downloader.report_warning(err_msg)
        return {}

    def make_captions(track_url, languages):
        # Build one URL per language and format by overriding the 'tlang'
        # and 'fmt' query parameters of the given caption track URL.
        parsed_track_url = compat_urllib_parse_urlparse(track_url)
        query = compat_parse_qs(parsed_track_url.query)
        captions = {}
        for language in languages:
            variants = []
            for ext in self._SUBTITLE_FORMATS:
                query['tlang'] = [language]
                query['fmt'] = [ext]
                variants.append({
                    'url': compat_urlparse.urlunparse(parsed_track_url._replace(
                        query=compat_urllib_parse_urlencode(query, True))),
                    'ext': ext,
                })
            captions[language] = variants
        return captions

    try:
        args = player_config['args']
        tts_url = args.get('ttsurl')
        if tts_url:
            timestamp = args['timestamp']
            # We get the available subtitles
            listing_query = compat_urllib_parse_urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            caption_list = self._download_xml(
                tts_url + '&' + listing_query, video_id)
            source_track = caption_list.find('track')
            if source_track is None:
                self._downloader.report_warning('Video doesn\'t have automatic captions')
                return {}
            source_lang = source_track.attrib['lang_code']
            caption_kind = source_track.attrib.get('kind', '')

            translated = {}
            for target in caption_list.findall('target'):
                target_lang = target.attrib['lang_code']
                variants = []
                for ext in self._SUBTITLE_FORMATS:
                    variant_query = compat_urllib_parse_urlencode({
                        'lang': source_lang,
                        'tlang': target_lang,
                        'fmt': ext,
                        'ts': timestamp,
                        'kind': caption_kind,
                    })
                    variants.append({
                        'url': tts_url + '&' + variant_query,
                        'ext': ext,
                    })
                translated[target_lang] = variants
            return translated

        # New captions format as of 22.06.2017
        player_response = args.get('player_response')
        if player_response and isinstance(player_response, compat_str):
            player_response = self._parse_json(
                player_response, video_id, fatal=False)
            if player_response:
                renderer = player_response['captions']['playerCaptionsTracklistRenderer']
                base_url = renderer['captionTracks'][0]['baseUrl']
                language_codes = []
                for entry in renderer['translationLanguages']:
                    code = entry.get('languageCode')
                    if code:
                        language_codes.append(code)
                return make_captions(base_url, language_codes)

        # Some videos don't provide ttsurl but rather caption_tracks and
        # caption_translation_languages (e.g. 20LmZk1hakA)
        # Not used anymore as of 22.06.2017
        caption_tracks = args['caption_tracks']
        translation_languages = args['caption_translation_languages']
        legacy_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
        language_codes = []
        for item in translation_languages.split(','):
            item_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(item))
            code = item_qs.get('lc', [None])[0]
            if code:
                language_codes.append(code)
        return make_captions(legacy_url, language_codes)
    # An extractor error can be raised by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, IndexError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}

1548

1549

def _mark_watched(self, video_id, video_info, player_response):
    """Request the playback statistics URL so the video is marked as watched.

    The base URL is read from ``player_response`` and, failing that, from
    ``video_info``. When neither yields a valid URL nothing is done. The
    request is made with fatal=False.
    """
    base_url = try_get(
        player_response,
        lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl'])
    if not base_url:
        base_url = try_get(
            video_info, lambda x: x['videostats_playback_base_url'][0])
    playback_url = url_or_none(base_url)
    if not playback_url:
        return

    parsed_playback_url = compat_urlparse.urlparse(playback_url)
    query = compat_urlparse.parse_qs(parsed_playback_url.query)

    # cpn generation algorithm is reverse engineered from base.js.
    # In fact it works even with dummy cpn.
    CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
    cpn_chars = [
        CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(16)]
    cpn = ''.join(cpn_chars)

    query['ver'] = ['2']
    query['cpn'] = [cpn]
    new_query = compat_urllib_parse_urlencode(query, True)
    playback_url = compat_urlparse.urlunparse(
        parsed_playback_url._replace(query=new_query))

    self._download_webpage(
        playback_url, video_id, 'Marking watched',
        'Unable to mark watched', fatal=False)

1574

1575

@staticmethod
def _extract_urls(webpage):
    """Return the YouTube embeds found in the HTML of ``webpage``.

    Three kinds of embed are collected, in this order: player URLs in
    iframe/embed/object/SWFObject markup, ids from lazyYT elements, and
    ids from the Wordpress "YouTube Video Importer" plugin markup.
    """
    # Embedded YouTube player
    # NOTE(review): `embedSWF\(?:\s*` is kept exactly as found; it may have
    # been meant as `embedSWF\((?:\s*` - confirm before changing.
    embed_re = r'''(?x)
        (?:
            <iframe[^>]+?src=|
            data-video-url=|
            <embed[^>]+?src=|
            embedSWF\(?:\s*|
            <object[^>]+data=|
            new\s+SWFObject\(
        )
        (["\'])
        (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
        (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
        \1'''
    entries = []
    for mobj in re.finditer(embed_re, webpage):
        entries.append(unescapeHTML(mobj.group('url')))

    # lazyYT YouTube embed
    lazy_ids = re.findall(
        r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
    entries.extend([unescapeHTML(lazy_id) for lazy_id in lazy_ids])

    # Wordpress "YouTube Video Importer" plugin
    importer_matches = re.findall(r'''(?x)<div[^>]+
        class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
        data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
    # Every match is a tuple of groups; the video id is the last one.
    for groups in importer_matches:
        entries.append(groups[-1])

    return entries

@staticmethod
def _extract_url(webpage):
    """Return the first entry of ``_extract_urls(webpage)``, or None if it is empty."""
    found = YoutubeIE._extract_urls(webpage)
    if not found:
        return None
    return found[0]

1611

1612

@classmethod
def extract_id(cls, url):
    """Return the video id, i.e. group 2 of ``cls._VALID_URL`` matched against ``url``.

    Raises ExtractorError when ``url`` does not match.
    """
    url_match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if url_match is not None:
        return url_match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)

@staticmethod
def _extract_chapters(description, duration):
    """Build a chapter list from the seek links in an HTML description.

    Every line of ``description`` that contains a
    ``yt.www.watch.player.seekTo`` link with a time point is treated as a
    chapter line. A chapter starts at its own time point and ends at the
    next line's time point, or at ``duration`` for the last line; end
    times are capped at ``duration``.

    Returns a list of {'start_time', 'end_time', 'title'} dicts, or None
    when ``description`` is empty or has no chapter lines.
    """
    if not description:
        return None
    chapter_lines = re.findall(
        r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
        description)
    if not chapter_lines:
        return None

    line_count = len(chapter_lines)
    chapters = []
    for position, (line_html, time_point) in enumerate(chapter_lines):
        start_time = parse_duration(time_point)
        if start_time is None:
            continue
        if start_time > duration:
            break
        following = position + 1
        if following == line_count:
            end_time = duration
        else:
            end_time = parse_duration(chapter_lines[following][1])
        if end_time is None:
            continue
        if end_time > duration:
            end_time = duration
        if start_time > end_time:
            break
        # Drop the link markup, then tidy separators and whitespace.
        title = re.sub(r'<a[^>]+>[^<]+</a>', '', line_html).strip(' \t-')
        title = re.sub(r'\s+', ' ', title)
        chapters.append({
            'start_time': start_time,
            'end_time': end_time,
            'title': title,
        })
    return chapters

def _real_extract(self, url):

1656

url, smuggled_data = unsmuggle_url(url, {})

1657

1658

proto = (

1659

'http' if self._downloader.params.get('prefer_insecure', False)

else 'https')

start_time = None

end_time = None

parsed_url = compat_urllib_parse_urlparse(url)

1665

for component in [parsed_url.fragment, parsed_url.query]:

1666

query = compat_parse_qs(component)

1667

if start_time is None and 't' in query:

1668

start_time = parse_duration(query['t'][0])

1669

if start_time is None and 'start' in query:

1670

start_time = parse_duration(query['start'][0])

1671

if end_time is None and 'end' in query:

1672

end_time = parse_duration(query['end'][0])

1673

1674

# Extract original video URL from URL with redirection, like age verification, using next_url parameter

1675

mobj = re.search(self._NEXT_URL_RE, url)

1676

if mobj:

1677

url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')

1678

video_id = self.extract_id(url)

1679

1680

# Get video webpage

1681

url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id

1682

video_webpage = self._download_webpage(url, video_id)

1683

1684

# Attempt to extract SWF player URL

1685

mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)

1686

if mobj is not None:

1687

player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

else:

player_url = None

dash_mpds = []

def add_dash_mpd(video_info):

1694

dash_mpd = video_info.get('dashmpd')

1695

if dash_mpd and dash_mpd[0] not in dash_mpds:

1696

dash_mpds.append(dash_mpd[0])

1697

1698

def add_dash_mpd_pr(pl_response):

1699

dash_mpd = url_or_none(try_get(

1700

pl_response, lambda x: x['streamingData']['dashManifestUrl'],

1701

compat_str))

1702

if dash_mpd and dash_mpd not in dash_mpds:

1703

dash_mpds.append(dash_mpd)

is_live = None

view_count = None

def extract_view_count(v_info):

1709

return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))

1710

1711

def extract_token(v_info):

1712

return dict_get(v_info, ('account_playback_token', 'accountPlaybackToken', 'token'))

1713

1714

def extract_player_response(player_response, video_id):

1715

pl_response = str_or_none(player_response)

1716

if not pl_response:

1717

return

1718

pl_response = self._parse_json(pl_response, video_id, fatal=False)

1719

if isinstance(pl_response, dict):

1720

add_dash_mpd_pr(pl_response)

return pl_response

player_response = {}

# Get video info

embed_webpage = None

if re.search(r'player-age-gate-content">', video_webpage) is not None:

1728

age_gate = True

1729

# We simulate the access to the video from www.youtube.com/v/{video_id}

1730

# this can be viewed without login into Youtube

1731

url = proto + '://www.youtube.com/embed/%s' % video_id

1732

embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')

1733

data = compat_urllib_parse_urlencode({

1734

'video_id': video_id,

1735

'eurl': 'https://youtube.googleapis.com/v/' + video_id,

1736

'sts': self._search_regex(

1737

r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),

1738

})

1739

video_info_url = proto + '://www.youtube.com/get_video_info?' + data

1740

video_info_webpage = self._download_webpage(

1741

video_info_url, video_id,

1742

note='Refetching age-gated info webpage',

1743

errnote='unable to download video info webpage')

1744

video_info = compat_parse_qs(video_info_webpage)

1745

pl_response = video_info.get('player_response', [None])[0]

1746

player_response = extract_player_response(pl_response, video_id)

1747

add_dash_mpd(video_info)

1748

view_count = extract_view_count(video_info)

else:

age_gate = False

video_info = None

sts = None

# Try looking directly into the video webpage

1754

ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)

1755

if ytplayer_config:

1756

args = ytplayer_config['args']

1757

if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):

1758

# Convert to the same format returned by compat_parse_qs

1759

video_info = dict((k, [v]) for k, v in args.items())

1760

add_dash_mpd(video_info)

1761

# Rental video is not rented but preview is available (e.g.

1762

# https://www.youtube.com/watch?v=yYr8q0y5Jfg,

1763

# https://github.com/ytdl-org/youtube-dl/issues/10532)

1764

if not video_info and args.get('ypc_vid'):

1765

return self.url_result(

1766

args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])

1767

if args.get('livestream') == '1' or args.get('live_playback') == 1:

1768

is_live = True

1769

sts = ytplayer_config.get('sts')

1770

if not player_response:

1771

player_response = extract_player_response(args.get('player_response'), video_id)

1772

if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):

1773

add_dash_mpd_pr(player_response)

1774

# We also try looking in get_video_info since it may contain different dashmpd

1775

# URL that points to a DASH manifest with possibly different itag set (some itags

1776

# are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH

1777

# manifest pointed by get_video_info's dashmpd).

1778

# The general idea is to take a union of itags of both DASH manifests (for example

1779

# video with such 'manifest behavior' see https://github.com/ytdl-org/youtube-dl/issues/6093)

1780

self.report_video_info_webpage_download(video_id)

1781

for el in ('embedded', 'detailpage', 'vevo', ''):

1782

query = {

1783

'video_id': video_id,

'ps': 'default',

'eurl': '',

'gl': 'US',

'hl': 'en',

}

if el:

query['el'] = el

if sts:

query['sts'] = sts

video_info_webpage = self._download_webpage(

1794

'%s://www.youtube.com/get_video_info' % proto,

1795

video_id, note=False,

1796

errnote='unable to download video info webpage',

1797

fatal=False, query=query)

1798

if not video_info_webpage:

1799

continue

1800

get_video_info = compat_parse_qs(video_info_webpage)

1801

if not player_response:

1802

pl_response = get_video_info.get('player_response', [None])[0]

1803

player_response = extract_player_response(pl_response, video_id)

1804

add_dash_mpd(get_video_info)

1805

if view_count is None:

1806

view_count = extract_view_count(get_video_info)

1807

if not video_info:

1808

video_info = get_video_info

1809

get_token = extract_token(get_video_info)

1810

if get_token:

1811

# Different get_video_info requests may report different results, e.g.

1812

# some may report video unavailability, but some may serve it without

1813

# any complaint (see https://github.com/ytdl-org/youtube-dl/issues/7362,

1814

# the original webpage as well as el=info and el=embedded get_video_info

1815

# requests report video unavailability due to geo restriction while

1816

# el=detailpage succeeds and returns valid data). This is probably

1817

# due to YouTube measures against IP ranges of hosting providers.

1818

# Working around by preferring the first succeeded video_info containing

1819

# the token if no such video_info yet was found.

1820

token = extract_token(video_info)

1821

if not token:

1822

video_info = get_video_info

1823

break

1824

1825

def extract_unavailable_message():

1826

messages = []

1827

for tag, kind in (('h1', 'message'), ('div', 'submessage')):

1828

msg = self._html_search_regex(

1829

r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)</{tag}>'.format(tag=tag, kind=kind),

1830

video_webpage, 'unavailable %s' % kind, default=None)

if msg:

messages.append(msg)

if messages:

return '\n'.join(messages)

1835

1836

if not video_info:

1837

unavailable_message = extract_unavailable_message()

1838

if not unavailable_message:

1839

unavailable_message = 'Unable to extract video data'

1840

raise ExtractorError(

1841

'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id)

1842

1843

video_details = try_get(

1844

player_response, lambda x: x['videoDetails'], dict) or {}

1845

1846

video_title = video_info.get('title', [None])[0] or video_details.get('title')

1847

if not video_title:

1848

self._downloader.report_warning('Unable to extract video title')

1849

video_title = '_'

1850

1851

description_original = video_description = get_element_by_id("eow-description", video_webpage)

1852

if video_description:

1853

1854

def replace_url(m):

1855

redir_url = compat_urlparse.urljoin(url, m.group(1))

1856

parsed_redir_url = compat_urllib_parse_urlparse(redir_url)

1857

if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':

1858

qs = compat_parse_qs(parsed_redir_url.query)

q = qs.get('q')

if q and q[0]:

return q[0]

return redir_url

description_original = video_description = re.sub(r'''(?x)

1865

<a\s+

1866

(?:[a-zA-Z-]+="[^"]*"\s+)*?

1867

(?:title|href)="([^"]+)"\s+

1868

(?:[a-zA-Z-]+="[^"]*"\s+)*?

class="[^"]*"[^>]*>

[^<]+\.{3}\s*

</a>

''', replace_url, video_description)

1873

video_description = clean_html(video_description)

1874

else:

1875

video_description = self._html_search_meta('description', video_webpage) or video_details.get('shortDescription')

1876

1877

if not smuggled_data.get('force_singlefeed', False):

1878

if not self._downloader.params.get('noplaylist'):

1879

multifeed_metadata_list = try_get(

1880

player_response,

1881

lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],

1882

compat_str) or try_get(

1883

video_info, lambda x: x['multifeed_metadata_list'][0], compat_str)

1884

if multifeed_metadata_list:

1885

entries = []

1886

feed_ids = []

1887

for feed in multifeed_metadata_list.split(','):

1888

# Unquote should take place before split on comma (,) since textual

1889

# fields may contain comma as well (see

1890

# https://github.com/ytdl-org/youtube-dl/issues/8536)

1891

feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))

1892

entries.append({

1893

'_type': 'url_transparent',

1894

'ie_key': 'Youtube',

1895

'url': smuggle_url(

1896

'%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),

1897

{'force_singlefeed': True}),

1898

'title': '%s (%s)' % (video_title, feed_data['title'][0]),

1899

})

1900

feed_ids.append(feed_data['id'][0])

1901

self.to_screen(

1902

'Downloading multifeed video (%s) - add --no-playlist to just download video %s'

1903

% (', '.join(feed_ids), video_id))

1904

return self.playlist_result(entries, video_id, video_title, video_description)

1905

else:

1906

self.to_screen('Downloading just video %s because of --no-playlist' % video_id)

1907

1908

if view_count is None:

1909

view_count = extract_view_count(video_info)

1910

if view_count is None and video_details:

1911

view_count = int_or_none(video_details.get('viewCount'))

1912

1913

if is_live is None:

1914

is_live = bool_or_none(video_details.get('isLive'))

1915

1916

# Check for "rental" videos

1917

if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:

1918

raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True)

1919

1920

def _extract_filesize(media_url):
    """Return the integer captured after ``clen=`` / ``clen/`` in *media_url*.

    Yields whatever ``int_or_none`` produces for a missing match (the
    regex search falls back to ``None`` instead of raising).
    """
    clen = self._search_regex(
        r'\bclen[=/](\d+)', media_url, 'filesize', default=None)
    return int_or_none(clen)

1923

1924

streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or []

1925

streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or [])

1926

1927

if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):

1928

self.report_rtmp_download()

1929

formats = [{

1930

'format_id': '_rtmp',

1931

'protocol': 'rtmp',

1932

'url': video_info['conn'][0],

1933

'player_url': player_url,

1934

}]

1935

elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):

1936

encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]

1937

if 'rtmpe%3Dyes' in encoded_url_map:

1938

raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True)

1939

formats = []

1940

formats_spec = {}

1941

fmt_list = video_info.get('fmt_list', [''])[0]

1942

if fmt_list:

1943

for fmt in fmt_list.split(','):

1944

spec = fmt.split('/')

1945

if len(spec) > 1:

1946

width_height = spec[1].split('x')

1947

if len(width_height) == 2:

1948

formats_spec[spec[0]] = {

1949

'resolution': spec[1],

1950

'width': int_or_none(width_height[0]),

1951

'height': int_or_none(width_height[1]),

1952

}

1953

for fmt in streaming_formats:

1954

itag = str_or_none(fmt.get('itag'))

1955

if not itag:

1956

continue

1957

quality = fmt.get('quality')

1958

quality_label = fmt.get('qualityLabel') or quality

1959

formats_spec[itag] = {

1960

'asr': int_or_none(fmt.get('audioSampleRate')),

1961

'filesize': int_or_none(fmt.get('contentLength')),

1962

'format_note': quality_label,

1963

'fps': int_or_none(fmt.get('fps')),

1964

'height': int_or_none(fmt.get('height')),

1965

# bitrate for itag 43 is always 2147483647

1966

'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,

1967

'width': int_or_none(fmt.get('width')),

1968

}

1969

1970

for fmt in streaming_formats:

1971

if fmt.get('drm_families'):

1972

continue

1973

url = url_or_none(fmt.get('url'))

1974

1975

if not url:

1976

cipher = fmt.get('cipher')

1977

if not cipher:

1978

continue

1979

url_data = compat_parse_qs(cipher)

1980

url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str))

if not url:

continue

else:

cipher = None

url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query)

1986

1987

stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))

1988

# Unsupported FORMAT_STREAM_TYPE_OTF

if stream_type == 3:

continue

format_id = fmt.get('itag') or url_data['itag'][0]

1993

if not format_id:

1994

continue

1995

format_id = compat_str(format_id)

1996

1997

if cipher:

1998

if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):

1999

ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'

2000

jsplayer_url_json = self._search_regex(

2001

ASSETS_RE,

2002

embed_webpage if age_gate else video_webpage,

2003

'JS player URL (1)', default=None)

2004

if not jsplayer_url_json and not age_gate:

2005

# We need the embed website after all

2006

if embed_webpage is None:

2007

embed_url = proto + '://www.youtube.com/embed/%s' % video_id

2008

embed_webpage = self._download_webpage(

2009

embed_url, video_id, 'Downloading embed webpage')

2010

jsplayer_url_json = self._search_regex(

2011

ASSETS_RE, embed_webpage, 'JS player URL')

2012

2013

player_url = json.loads(jsplayer_url_json)

2014

if player_url is None:

2015

player_url_json = self._search_regex(

2016

r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',

2017

video_webpage, 'age gate player URL')

2018

player_url = json.loads(player_url_json)

2019

2020

if 'sig' in url_data:

2021

url += '&signature=' + url_data['sig'][0]

2022

elif 's' in url_data:

2023

encrypted_sig = url_data['s'][0]

2024

2025

if self._downloader.params.get('verbose'):

2026

if player_url is None:

2027

player_version = 'unknown'

2028

player_desc = 'unknown'

2029

else:

2030

if player_url.endswith('swf'):

2031

player_version = self._search_regex(

2032

r'-(.+?)(?:/watch_as3)?\.swf$', player_url,

2033

'flash player', fatal=False)

2034

player_desc = 'flash player %s' % player_version

2035

else:

2036

player_version = self._search_regex(

2037

[r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js',

2038

r'(?:www|player(?:_ias)?)[-.]([^/]+)(?:/[a-z]{2,3}_[A-Z]{2})?/base\.js'],

2039

player_url,

2040

'html5 player', fatal=False)

2041

player_desc = 'html5 player %s' % player_version

2042

2043

parts_sizes = self._signature_cache_id(encrypted_sig)

2044

self.to_screen('{%s} signature length %s, %s' %

2045

(format_id, parts_sizes, player_desc))

2046

2047

signature = self._decrypt_signature(

2048

encrypted_sig, video_id, player_url, age_gate)

2049

sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature'

2050

url += '&%s=%s' % (sp, signature)

2051

if 'ratebypass' not in url:

2052

url += '&ratebypass=yes'

2053

2054

dct = {

2055

'format_id': format_id,

2056

'url': url,

2057

'player_url': player_url,

2058

}

2059

if format_id in self._formats:

2060

dct.update(self._formats[format_id])

2061

if format_id in formats_spec:

2062

dct.update(formats_spec[format_id])

2063

2064

# Some itags are not included in DASH manifest thus corresponding formats will

2065

# lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993).

2066

# Trying to extract metadata from url_encoded_fmt_stream_map entry.

2067

mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])

2068

width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)

2069

2070

if width is None:

2071

width = int_or_none(fmt.get('width'))

2072

if height is None:

2073

height = int_or_none(fmt.get('height'))

2074

2075

filesize = int_or_none(url_data.get(

2076

'clen', [None])[0]) or _extract_filesize(url)

2077

2078

quality = url_data.get('quality', [None])[0] or fmt.get('quality')

2079

quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel')

2080

2081

tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000)

2082

or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None

2083

fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps'))

2084

2085

more_fields = {

2086

'filesize': filesize,

'tbr': tbr,

'width': width,

'height': height,

'fps': fps,

'format_note': quality_label or quality,

2092

}

2093

for key, value in more_fields.items():

2094

if value:

2095

dct[key] = value

2096

type_ = url_data.get('type', [None])[0] or fmt.get('mimeType')

2097

if type_:

2098

type_split = type_.split(';')

2099

kind_ext = type_split[0].split('/')

2100

if len(kind_ext) == 2:

2101

kind, _ = kind_ext

2102

dct['ext'] = mimetype2ext(type_split[0])

2103

if kind in ('audio', 'video'):

2104

codecs = None

2105

for mobj in re.finditer(

2106

r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):

2107

if mobj.group('key') == 'codecs':

2108

codecs = mobj.group('val')

2109

break

2110

if codecs:

2111

dct.update(parse_codecs(codecs))

2112

if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':

2113

dct['downloader_options'] = {

2114

# Youtube throttles chunks >~10M

2115

'http_chunk_size': 10485760,

}

formats.append(dct)

else:

manifest_url = (

url_or_none(try_get(

player_response,

lambda x: x['streamingData']['hlsManifestUrl'],

2123

compat_str))

2124

or url_or_none(try_get(

2125

video_info, lambda x: x['hlsvp'][0], compat_str)))

2126

if manifest_url:

2127

formats = []

2128

m3u8_formats = self._extract_m3u8_formats(

2129

manifest_url, video_id, 'mp4', fatal=False)

2130

for a_format in m3u8_formats:

2131

itag = self._search_regex(

2132

r'/itag/(\d+)/', a_format['url'], 'itag', default=None)

2133

if itag:

2134

a_format['format_id'] = itag

2135

if itag in self._formats:

2136

dct = self._formats[itag].copy()

2137

dct.update(a_format)

2138

a_format = dct

2139

a_format['player_url'] = player_url

2140

# Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming

2141

a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'

2142

formats.append(a_format)

2143

else:

2144

error_message = extract_unavailable_message()

2145

if not error_message:

2146

error_message = clean_html(try_get(

2147

player_response, lambda x: x['playabilityStatus']['reason'],

2148

compat_str))

2149

if not error_message:

2150

error_message = clean_html(

2151

try_get(video_info, lambda x: x['reason'][0], compat_str))

2152

if error_message:

2153

raise ExtractorError(error_message, expected=True)

2154

raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info')

2155

2156

# uploader

2157

video_uploader = try_get(

2158

video_info, lambda x: x['author'][0],

2159

compat_str) or str_or_none(video_details.get('author'))

2160

if video_uploader:

2161

video_uploader = compat_urllib_parse_unquote_plus(video_uploader)

2162

else:

2163

self._downloader.report_warning('unable to extract uploader name')

2164

2165

# uploader_id

2166

video_uploader_id = None

2167

video_uploader_url = None

2168

mobj = re.search(

2169

r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',

2170

video_webpage)

2171

if mobj is not None:

2172

video_uploader_id = mobj.group('uploader_id')

2173

video_uploader_url = mobj.group('uploader_url')

2174

else:

2175

self._downloader.report_warning('unable to extract uploader nickname')

2176

2177

channel_id = (

2178

str_or_none(video_details.get('channelId'))

2179

or self._html_search_meta(

2180

'channelId', video_webpage, 'channel id', default=None)

2181

or self._search_regex(

2182

r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1',

2183

video_webpage, 'channel id', default=None, group='id'))

2184

channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None

2185

2186

# thumbnail image

2187

# We try first to get a high quality image:

2188

m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',

2189

video_webpage, re.DOTALL)

2190

if m_thumb is not None:

2191

video_thumbnail = m_thumb.group(1)

2192

elif 'thumbnail_url' not in video_info:

2193

self._downloader.report_warning('unable to extract video thumbnail')

2194

video_thumbnail = None

2195

else: # don't panic if we can't find it

2196

video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])

2197

2198

# upload date

2199

upload_date = self._html_search_meta(

2200

'datePublished', video_webpage, 'upload date', default=None)

2201

if not upload_date:

2202

upload_date = self._search_regex(

2203

[r'(?s)id="eow-date.*?>(.*?)</span>',

2204

r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],

2205

video_webpage, 'upload date', default=None)

2206

upload_date = unified_strdate(upload_date)

2207

2208

video_license = self._html_search_regex(

2209

r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',

2210

video_webpage, 'license', default=None)

m_music = re.search(

r'''(?x)

<h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*

<ul[^>]*>\s*

<li>(?P<title>.+?)

by (?P<creator>.+?)

(?:

$.+?$|

<a[^>]*

(?:

\bhref=["\']/red[^>]*>| # drop possible

2223

>\s*Listen ad-free with YouTube Red # YouTube Red ad

)

.*?

)?</li

''',

video_webpage)

if m_music:

video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))

2231

video_creator = clean_html(m_music.group('creator'))

2232

else:

2233

video_alt_title = video_creator = None

2234

2235

def extract_meta(field):
    """Search the watch page for the first ``<li>`` under the ``<h4>`` titled *field*.

    *field* is interpolated into the regex as-is and also used as the
    search's display name; the result is ``None`` when nothing matches.
    """
    field_re = (
        r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*'
        % field)
    return self._html_search_regex(
        field_re, video_webpage, field, default=None)

2239

2240

track = extract_meta('Song')

2241

artist = extract_meta('Artist')

2242

album = extract_meta('Album')

2243

2244

# Youtube Music Auto-generated description

2245

release_date = release_year = None

2246

if video_description:

2247

mobj = re.search(r'(?s)Provided to YouTube by [^\n]+\n+(?P<track>[^·]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?', video_description)

2248

if mobj:

2249

if not track:

2250

track = mobj.group('track').strip()

2251

if not artist:

2252

artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·'))

2253

if not album:

2254

# Strip the matched album text, not the group name: the previous
# ``mobj.group('album'.strip())`` left surrounding whitespace in place.
album = mobj.group('album').strip()

2255

release_year = mobj.group('release_year')

2256

release_date = mobj.group('release_date')

2257

if release_date:

2258

release_date = release_date.replace('-', '')

2259

if not release_year:

2260

release_year = int(release_date[:4])

2261

if release_year:

2262

release_year = int(release_year)

2263

2264

m_episode = re.search(

2265

r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',

2266

video_webpage)

2267

if m_episode:

2268

series = unescapeHTML(m_episode.group('series'))

2269

season_number = int(m_episode.group('season'))

2270

episode_number = int(m_episode.group('episode'))

2271

else:

2272

series = season_number = episode_number = None

2273

2274

m_cat_container = self._search_regex(

2275

r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',

2276

video_webpage, 'categories', default=None)

2277

if m_cat_container:

2278

category = self._html_search_regex(

2279

r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',

2280

default=None)

2281

video_categories = None if category is None else [category]

2282

else:

2283

video_categories = None

2284

2285

video_tags = [

2286

unescapeHTML(m.group('content'))

2287

for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]

2288

2289

def _extract_count(count_name):
    """Read the number shown on the ``-<count_name>-button`` element of the watch page.

    The captured digits (commas allowed) are passed through ``str_to_int``;
    a missing button makes the regex search fall back to ``None``.
    """
    button_re = (
        r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
        % re.escape(count_name))
    raw_count = self._search_regex(
        button_re, video_webpage, count_name, default=None)
    return str_to_int(raw_count)

2294

2295

like_count = _extract_count('like')

2296

dislike_count = _extract_count('dislike')

2297

2298

if view_count is None:

2299

view_count = str_to_int(self._search_regex(

2300

r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage,

2301

'view count', default=None))

2302

2303

average_rating = (

2304

float_or_none(video_details.get('averageRating'))

2305

or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0])))

2306

2307

# subtitles

2308

video_subtitles = self.extract_subtitles(video_id, video_webpage)

2309

automatic_captions = self.extract_automatic_captions(video_id, video_webpage)

2310

2311

video_duration = try_get(

2312

video_info, lambda x: int_or_none(x['length_seconds'][0]))

2313

if not video_duration:

2314

video_duration = int_or_none(video_details.get('lengthSeconds'))

2315

if not video_duration:

2316

video_duration = parse_duration(self._html_search_meta(

2317

'duration', video_webpage, 'video duration'))

2318

2319

# annotations

2320

video_annotations = None

2321

if self._downloader.params.get('writeannotations', False):

2322

xsrf_token = self._search_regex(

2323

r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>[A-Za-z0-9+/=]+)\2',

2324

video_webpage, 'xsrf token', group='xsrf_token', fatal=False)

2325

invideo_url = try_get(

2326

player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)

2327

if xsrf_token and invideo_url:

2328

xsrf_field_name = self._search_regex(

2329

r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',

2330

video_webpage, 'xsrf field name',

2331

group='xsrf_field_name', default='session_token')

2332

video_annotations = self._download_webpage(

2333

self._proto_relative_url(invideo_url),

2334

video_id, note='Downloading annotations',

2335

errnote='Unable to download video annotations', fatal=False,

2336

data=urlencode_postdata({xsrf_field_name: xsrf_token}))

2337

2338

chapters = self._extract_chapters(description_original, video_duration)

2339

2340

# Look for the DASH manifest

2341

if self._downloader.params.get('youtube_include_dash_manifest', True):

2342

dash_mpd_fatal = True

2343

for mpd_url in dash_mpds:

2344

dash_formats = {}

2345

try:

2346

def decrypt_sig(mobj):
    """``re.sub`` callback: swap an ``/s/<sig>`` path segment for ``/signature/<decrypted>``."""
    return '/signature/%s' % self._decrypt_signature(
        mobj.group(1), video_id, player_url, age_gate)

2350

2351

mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)

2352

2353

for df in self._extract_mpd_formats(

2354

mpd_url, video_id, fatal=dash_mpd_fatal,

2355

formats_dict=self._formats):

2356

if not df.get('filesize'):

2357

df['filesize'] = _extract_filesize(df['url'])

2358

# Do not overwrite DASH format found in some previous DASH manifest

2359

if df['format_id'] not in dash_formats:

2360

dash_formats[df['format_id']] = df

2361

# Additional DASH manifests may end up in HTTP Error 403 therefore

2362

# allow them to fail without bug report message if we already have

2363

# some DASH manifest succeeded. This is temporary workaround to reduce

2364

# burst of bug reports until we figure out the reason and whether it

2365

# can be fixed at all.

2366

dash_mpd_fatal = False

2367

except (ExtractorError, KeyError) as e:

2368

self.report_warning(

2369

'Skipping DASH manifest: %r' % e, video_id)

2370

if dash_formats:

2371

# Remove the formats we found through non-DASH, they

2372

# contain less info and it can be wrong, because we use

2373

# fixed values (for example the resolution). See

2374

# https://github.com/ytdl-org/youtube-dl/issues/5774 for an

2375

# example.

2376

formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]

2377

formats.extend(dash_formats.values())

2378

2379

# Check for malformed aspect ratio

2380

stretched_m = re.search(

2381

r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',

2382

video_webpage)

2383

if stretched_m:

2384

w = float(stretched_m.group('w'))

2385

h = float(stretched_m.group('h'))

2386

# yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).

2387

# We will only process correct ratios.

if w > 0 and h > 0:

ratio = w / h

for f in formats:

if f.get('vcodec') != 'none':

2392

f['stretched_ratio'] = ratio

2393

2394

if not formats:

2395

token = extract_token(video_info)

2396

if not token:

2397

if 'reason' in video_info:

2398

if 'The uploader has not made this video available in your country.' in video_info['reason']:

2399

regions_allowed = self._html_search_meta(

2400

'regionsAllowed', video_webpage, default=None)

2401

countries = regions_allowed.split(',') if regions_allowed else None

2402

self.raise_geo_restricted(

2403

msg=video_info['reason'][0], countries=countries)

2404

reason = video_info['reason'][0]

2405

if 'Invalid parameters' in reason:

2406

unavailable_message = extract_unavailable_message()

2407

if unavailable_message:

2408

reason = unavailable_message

2409

raise ExtractorError(

2410

'YouTube said: %s' % reason,

2411

expected=True, video_id=video_id)

2412

else:

2413

raise ExtractorError(

2414

'"token" parameter not in video info for unknown reason',

2415

video_id=video_id)

2416

2417

if not formats and (video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos'])):

2418

raise ExtractorError('This video is DRM protected.', expected=True)

2419

2420

self._sort_formats(formats)

2421

2422

self.mark_watched(video_id, video_info, player_response)

return {

'id': video_id,

'uploader': video_uploader,

2427

'uploader_id': video_uploader_id,

2428

'uploader_url': video_uploader_url,

2429

'channel_id': channel_id,

2430

'channel_url': channel_url,

2431

'upload_date': upload_date,

2432

'license': video_license,

2433

'creator': video_creator or artist,

2434

'title': video_title,

2435

'alt_title': video_alt_title or track,

2436

'thumbnail': video_thumbnail,

2437

'description': video_description,

2438

'categories': video_categories,

2439

'tags': video_tags,

2440

'subtitles': video_subtitles,

2441

'automatic_captions': automatic_captions,

2442

'duration': video_duration,

2443

'age_limit': 18 if age_gate else 0,

2444

'annotations': video_annotations,

2445

'chapters': chapters,

2446

'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,

2447

'view_count': view_count,

2448

'like_count': like_count,

2449

'dislike_count': dislike_count,

2450

'average_rating': average_rating,

2451

'formats': formats,

2452

'is_live': is_live,

2453

'start_time': start_time,

2454

'end_time': end_time,

2455

'series': series,

2456

'season_number': season_number,

2457

'episode_number': episode_number,

'track': track,

'artist': artist,

'album': album,

'release_date': release_date,

2462

'release_year': release_year,

}

class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):

2467

IE_DESC = 'YouTube.com playlists'

2468

_VALID_URL = r"""(?x)(?:

(?:https?://)?

(?:\w+\.)?

(?:

(?:

youtube(?:kids)?\.com|

invidio\.us

)

/

(?:

(?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11}))

2479

\? (?:.*?[&;])*? (?:p|a|list)=

2480

| p/

2481

)|

2482

youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist=

2483

)

2484

(

2485

(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)?[0-9A-Za-z-_]{10,}

2486

# Top tracks, they can also include dots

|(?:MC)[\w\.]*

)

.*

|

(%(playlist_id)s)

)""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}

2493

_TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'

2494

_VIDEO_RE_TPL = r'href="\s*/watch\?v=%s(?:&(?:[^"]*?index=(?P<index>\d+))?(?:[^>]+>(?P<title>[^<]+))?)?'

2495

_VIDEO_RE = _VIDEO_RE_TPL % r'(?P<id>[0-9A-Za-z_-]{11})'

2496

IE_NAME = 'youtube:playlist'

2497

_TESTS = [{

2498

'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',

2499

'info_dict': {

2500

'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',

2501

'uploader': 'Sergey M.',

2502

'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',

2503

'title': 'youtube-dl public playlist',

},

'playlist_count': 1,

}, {

'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',

2508

'info_dict': {

2509

'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',

2510

'uploader': 'Sergey M.',

2511

'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',

2512

'title': 'youtube-dl empty playlist',

},

'playlist_count': 0,

}, {

'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',

2517

'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',

2518

'info_dict': {

2519

'title': '29C3: Not my department',

2520

'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',

2521

'uploader': 'Christiaan008',

2522

'uploader_id': 'ChRiStIaAn008',

2523

},

2524

'playlist_count': 96,

2525

}, {

2526

'note': 'issue #673',

2527

'url': 'PLBB231211A4F62143',

2528

'info_dict': {

2529

'title': '[OLD]Team Fortress 2 (Class-based LP)',

2530

'id': 'PLBB231211A4F62143',

2531

'uploader': 'Wickydoo',

2532

'uploader_id': 'Wickydoo',

2533

},

2534

'playlist_mincount': 26,

2535

}, {

2536

'note': 'Large playlist',

2537

'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',

2538

'info_dict': {

2539

'title': 'Uploads from Cauchemar',

2540

'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',

2541

'uploader': 'Cauchemar',

2542

'uploader_id': 'Cauchemar89',

2543

},

2544

'playlist_mincount': 799,

2545

}, {

2546

'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',

2547

'info_dict': {

2548

'title': 'YDL_safe_search',

2549

'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',

2550

},

2551

'playlist_count': 2,

2552

'skip': 'This playlist is private',

2553

}, {

2554

'note': 'embedded',

2555

'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',

'playlist_count': 4,

'info_dict': {

'title': 'JODA15',

'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',

2560

'uploader': 'milan',

2561

'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',

2562

}

2563

}, {

2564

'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',

2565

'playlist_mincount': 485,

2566

'info_dict': {

2567

'title': '2018 Chinese New Singles (11/6 updated)',

2568

'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',

2569

'uploader': 'LBK',

2570

'uploader_id': 'sdragonfang',

2571

}

2572

}, {

2573

'note': 'Embedded SWF player',

2574

'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',

'playlist_count': 4,

'info_dict': {

'title': 'JODA7',

'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',

2579

},

2580

'skip': 'This playlist does not exist',

2581

}, {

2582

'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',

2583

'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',

2584

'info_dict': {

2585

'title': 'Uploads from Interstellar Movie',

2586

'id': 'UUXw-G3eDE9trcvY2sBMM_aA',

2587

'uploader': 'Interstellar Movie',

2588

'uploader_id': 'InterstellarMovie1',

2589

},

2590

'playlist_mincount': 21,

2591

}, {

2592

# Playlist URL that does not actually serve a playlist

2593

'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',

'info_dict': {

'id': 'FqZTN594JQw',

'ext': 'webm',

'title': "Smiley's People 01 detective, Adventure Series, Action",

2598

'uploader': 'STREEM',

2599

'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',

2600

'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',

2601

'upload_date': '20150526',

2602

'license': 'Standard YouTube License',

2603

'description': 'md5:507cdcb5a49ac0da37a920ece610be80',

2604

'categories': ['People & Blogs'],

'tags': list,

'view_count': int,

'like_count': int,

'dislike_count': int,

2609

},

2610

'params': {

2611

'skip_download': True,

2612

},

2613

'skip': 'This video is not available.',

2614

'add_ie': [YoutubeIE.ie_key()],

2615

}, {

2616

'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',

'info_dict': {

'id': 'yeWKywCrFtk',

'ext': 'mp4',

'title': 'Small Scale Baler and Braiding Rugs',

2621

'uploader': 'Backus-Page House Museum',

2622

'uploader_id': 'backuspagemuseum',

2623

'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',

2624

'upload_date': '20161008',

2625

'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',

2626

'categories': ['Nonprofits & Activism'],

2627

'tags': list,

2628

'like_count': int,

2629

'dislike_count': int,

},

'params': {

'noplaylist': True,

'skip_download': True,

2634

},

2635

}, {

2636

# https://github.com/ytdl-org/youtube-dl/issues/21844

2637

'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',

2638

'info_dict': {

2639

'title': 'Data Analysis with Dr Mike Pound',

2640

'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',

2641

'uploader_id': 'Computerphile',

2642

'uploader': 'Computerphile',

2643

},

2644

'playlist_mincount': 11,

2645

}, {

2646

'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',

2647

'only_matching': True,

2648

}, {

2649

'url': 'TLGGrESM50VT6acwMjAyMjAxNw',

2650

'only_matching': True,

2651

}, {

2652

# music album playlist

2653

'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',

2654

'only_matching': True,

2655

}, {

2656

'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU',

2657

'only_matching': True,

2658

}, {

2659

'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',

2660

'only_matching': True,

2661

}]

2662

2663

def _real_initialize(self):
    """Run the login routine (``self._login``).

    NOTE(review): presumably invoked by the extractor framework before
    extraction starts -- confirm against ``InfoExtractor``.
    """
    self._login()

2665

2666

def extract_videos_from_page(self, page):
    """Collect ``(video_id, title)`` pairs from a playlist page.

    Tags carrying a ``data-video-id`` attribute are scanned first; the two
    lists are then handed to ``extract_videos_from_page_impl`` with a
    series of fallback patterns. Returns ``zip(ids, titles)``.
    """
    ids_in_page, titles_in_page = [], []

    tag_re = r'(<[^>]*\bdata-video-id\s*=\s*["\'][0-9A-Za-z_-]{11}[^>]+>)'
    for tag in re.findall(tag_re, page):
        attributes = extract_attributes(tag)
        found_id = attributes['data-video-id']
        found_title = unescapeHTML(attributes.get('data-title'))
        # Falsy titles (missing attribute or empty string) are stored as-is.
        ids_in_page.append(found_id)
        titles_in_page.append(found_title.strip() if found_title else found_title)

    fallback_patterns = (
        # Fallback with old _VIDEO_RE
        self._VIDEO_RE,
        # Relaxed fallbacks
        r'href="\s*/watch\?v\s*=\s*(?P<id>[0-9A-Za-z_-]{11})',
        r'data-video-ids\s*=\s*["\'](?P<id>[0-9A-Za-z_-]{11})',
    )
    for pattern in fallback_patterns:
        self.extract_videos_from_page_impl(
            pattern, page, ids_in_page, titles_in_page)

    return zip(ids_in_page, titles_in_page)

2693

2694

def _extract_mix(self, playlist_id):
    """Build a playlist result for a YouTube mix.

    The mixes are generated from a single video; the id of the playlist
    is just 'RD' + video_id. Watch pages are fetched one after another,
    each seeded with the last video id collected so far, until a page
    yields no id that has not been seen already.
    """
    collected = []
    seed_id = playlist_id[-11:]
    for page_num in itertools.count(1):
        watch_url = 'https://youtube.com/watch?v=%s&list=%s' % (seed_id, playlist_id)
        webpage = self._download_webpage(
            watch_url, playlist_id,
            'Downloading page {0} of Youtube mix'.format(page_num))
        page_ids = orderedSet(re.findall(
            r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id),
            webpage))
        # Fetch new pages until all the videos are repeated, it seems that
        # there are always 51 unique videos.
        unseen = [vid for vid in page_ids if vid not in collected]
        if not unseen:
            break
        collected.extend(unseen)
        seed_id = collected[-1]

    url_results = self._ids_to_results(collected)

    # Try the candidate title elements in order on the last downloaded
    # page and keep the first truthy one (or the final lookup's value).
    title_span = None
    for class_name in ('playlist-title', 'title long-title', 'title'):
        title_span = get_element_by_attribute('class', class_name, webpage)
        if title_span:
            break
    title = clean_html(title_span)

    return self.playlist_result(url_results, playlist_id, title)

2725

2726

def _extract_playlist(self, playlist_id):
    """Download a playlist page and turn it into a playlist result.

    Returns a ``(has_videos, playlist)`` tuple.  ``has_videos`` is False
    only when the page has no playlist title and ``self._entries`` yields
    nothing for it.  ``playlist`` is the playlist result extended with the
    ``uploader``, ``uploader_id`` and ``uploader_url`` keys.

    Raises ExtractorError (expected) when an alert on the page says the
    playlist does not exist, is private, or that the parameters are invalid.
    Any other alert, except the "Choose your language" one, is reported as
    a warning.
    """
    url = self._TEMPLATE_URL % playlist_id
    page = self._download_webpage(url, playlist_id)

    # the yt-alert-message now has tabindex attribute (see https://github.com/ytdl-org/youtube-dl/issues/11604)
    for alert in re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page):
        alert = alert.strip()
        # Check if the playlist exists or is private
        reason_match = re.match(
            r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', alert)
        if reason_match:
            reason = reason_match.group('reason')
            message = 'This playlist %s' % reason
            if 'private' in reason:
                message += ', use --username or --netrc to access it'
            message += '.'
            raise ExtractorError(message, expected=True)
        if re.match(r'[^<]*Invalid parameters[^<]*', alert):
            raise ExtractorError(
                'Invalid parameters. Maybe URL is incorrect.',
                expected=True)
        if re.match(r'[^<]*Choose your language[^<]*', alert):
            continue
        self.report_warning('Youtube gives an alert message: ' + alert)

    playlist_title = self._html_search_regex(
        r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',
        page, 'title', default=None)

    # Common prefix of the two uploader patterns below.
    uploader_base = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref='
    uploader = self._html_search_regex(
        r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % uploader_base,
        page, 'uploader', default=None)

    uploader_id = uploader_url = None
    uploader_match = re.search(
        r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % uploader_base,
        page)
    if uploader_match:
        uploader_id = uploader_match.group('uploader_id')
        uploader_url = compat_urlparse.urljoin(url, uploader_match.group('path'))

    has_videos = True
    if not playlist_title:
        # Some playlist URLs don't actually serve a playlist (e.g.
        # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4)
        try:
            next(self._entries(page, playlist_id))
        except StopIteration:
            has_videos = False

    playlist = self.playlist_result(
        self._entries(page, playlist_id), playlist_id, playlist_title)
    playlist.update(
        uploader=uploader,
        uploader_id=uploader_id,
        uploader_url=uploader_url)

    return has_videos, playlist

2787

2788

def _check_download_just_video(self, url, playlist_id):
    """Decide whether ``url`` should be handled as a single video.

    Returns a ``(video_id, video)`` tuple:

    * ``(None, None)`` when no video id is found in ``url``;
    * ``(video_id, url_result)`` when a video id is found and the
      ``noplaylist`` downloader param is set;
    * ``(video_id, None)`` when a video id is found but the playlist
      should be downloaded.
    """
    # Check if it's a video-specific URL
    query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
    video_id = query_dict.get('v', [None])[0] or self._search_regex(
        r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url,
        'video id', default=None)
    if not video_id:
        return None, None

    if self._downloader.params.get('noplaylist'):
        self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
        return video_id, self.url_result(video_id, 'Youtube', video_id=video_id)

    self.to_screen(
        'Downloading playlist %s - add --no-playlist to just download video %s'
        % (playlist_id, video_id))
    return video_id, None

2802

2803

def _real_extract(self, url):
    """Extract a playlist, a mix, or a single video from ``url``.

    Raises ExtractorError when ``url`` does not match ``_VALID_URL``.
    """
    # Extract playlist id
    url_match = re.match(self._VALID_URL, url)
    if url_match is None:
        raise ExtractorError('Invalid URL: %s' % url)
    playlist_id = url_match.group(1) or url_match.group(2)

    video_id, video = self._check_download_just_video(url, playlist_id)
    if video:
        return video

    if playlist_id.startswith(('RD', 'UL', 'PU')):
        # Mixes require a custom extraction process
        return self._extract_mix(playlist_id)

    has_videos, playlist = self._extract_playlist(playlist_id)
    if not has_videos and video_id:
        # Some playlist URLs don't actually serve a playlist (see
        # https://github.com/ytdl-org/youtube-dl/issues/10537).
        # Fallback to plain video extraction if there is a video id
        # along with playlist id.
        return self.url_result(video_id, 'Youtube', video_id=video_id)
    return playlist

2827

2828

2829

class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
    IE_NAME = 'youtube:channel'
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie|kids)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)'
    # Filled with the channel id to get the listing of the channel's videos.
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',
            'title': 'Uploads from lex will',
            'uploader': 'lex will',
            'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
        }
    }, {
        'note': 'Age restricted channel',
        # from https://www.youtube.com/user/DeusExOfficial
        'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w',
        'playlist_mincount': 64,
        'info_dict': {
            'id': 'UUs0ifCMCm1icqRbqhUINa0w',
            'title': 'Uploads from Deus Ex',
            'uploader': 'Deus Ex',
            'uploader_id': 'DeusExOfficial',
        },
    }, {
        'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA',
        'only_matching': True,
    }, {
        'url': 'https://www.youtubekids.com/channel/UCyu8StPfZWapR6rfW_JgqcA',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Reject URLs that the playlists or live extractors accept."""
        if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url):
            return False
        return super(YoutubeChannelIE, cls).suitable(url)

    def _build_template_url(self, url, channel_id):
        """Return the videos listing URL for ``channel_id``."""
        return self._TEMPLATE_URL % channel_id

    def _real_extract(self, url):
        """Extract the channel's videos.

        When an id starting with 'UC' can be read from the channel page,
        the work is delegated to the 'YoutubePlaylist' extractor with the
        corresponding 'UU' playlist.  Otherwise the listing page itself is
        processed.

        Raises ExtractorError (expected) when the listing yields no entry
        and the page carries an alert message.
        """
        channel_id = self._match_id(url)

        url = self._build_template_url(url, channel_id)

        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
        # Workaround by extracting as a playlist if managed to obtain channel playlist URL
        # otherwise fallback on channel by page extraction
        channel_page = self._download_webpage(
            url + '?view=57', channel_id,
            'Downloading channel page', fatal=False)
        if channel_page is False:
            channel_playlist_id = False
        else:
            channel_playlist_id = self._html_search_meta(
                'channelId', channel_page, 'channel id', default=None)
            if not channel_playlist_id:
                # Second chance: read the id from one of the app link metas.
                app_url = self._html_search_meta(
                    ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'),
                    channel_page, 'channel url', default=None)
                if app_url:
                    channel_playlist_id = self._search_regex(
                        r'vnd\.youtube://user/([0-9A-Za-z_-]+)',
                        app_url, 'channel id', default=None)
        if channel_playlist_id and channel_playlist_id.startswith('UC'):
            uploads_id = 'UU' + channel_playlist_id[2:]
            uploads_url = compat_urlparse.urljoin(
                url, '/playlist?list=%s' % uploads_id)
            return self.url_result(uploads_url, 'YoutubePlaylist')

        channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
        autogenerated_label = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page)

        if autogenerated_label is not None:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            entries = []
            for video_id, video_title in self.extract_videos_from_page(channel_page):
                entries.append(self.url_result(
                    video_id, 'Youtube', video_id=video_id,
                    video_title=video_title))
            return self.playlist_result(entries, channel_id)

        # Probe for a first entry; an empty listing may be explained by an
        # alert message on the page.
        try:
            next(self._entries(channel_page, channel_id))
        except StopIteration:
            alert_message = self._html_search_regex(
                r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>',
                channel_page, 'alert', default=None, group='alert')
            if alert_message:
                raise ExtractorError('Youtube said: %s' % alert_message, expected=True)

        return self.playlist_result(self._entries(channel_page, channel_id), channel_id)

2928

2929

2930

class YoutubeUserIE(YoutubeChannelIE):
    IE_NAME = 'youtube:user'
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'

    # Filled with (path prefix, id) by _build_template_url.
    _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ',
            'title': 'Uploads from The Linux Foundation',
            'uploader': 'The Linux Foundation',
            'uploader_id': 'TheLinuxFoundation',
        }
    }, {
        # Only available via https://www.youtube.com/c/12minuteathlete/videos
        # but not https://www.youtube.com/user/12minuteathlete/videos
        'url': 'https://www.youtube.com/c/12minuteathlete/videos',
        'playlist_mincount': 249,
        'info_dict': {
            'id': 'UUVjM-zV6_opMDx7WYxnjZiQ',
            'title': 'Uploads from 12 Minute Athlete',
            'uploader': '12 Minute Athlete',
            'uploader_id': 'the12minuteathlete',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/gametrailers',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/gametrailers',
        'only_matching': True,
    }, {
        # This channel is not available, geo restricted to JP
        'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Accept ``url`` only when no other Youtube extractor does."""
        # Don't return True if the url can be extracted with another youtube
        # extractor: the regex is too permissive and it would match.
        for name, klass in globals().items():
            if klass is cls:
                continue
            if not (name.startswith('Youtube') and name.endswith('IE')):
                continue
            if klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _build_template_url(self, url, channel_id):
        """Return the videos listing URL, keeping the URL's path prefix.

        The prefix is the 'user' group matched in ``url``, or 'user' when
        that group is empty.
        """
        # NOTE(review): this reads a named group 'user' from _VALID_URL, but
        # no _VALID_URL defining that group is visible for this class -
        # confirm that one is defined here.
        url_match = re.match(self._VALID_URL, url)
        path_prefix = url_match.group('user') or 'user'
        return self._TEMPLATE_URL % (path_prefix, url_match.group('id'))

2984

2985

2986

class YoutubeLiveIE(YoutubeBaseInfoExtractor):
    IE_NAME = 'youtube:live'
    IE_DESC = 'YouTube.com live streams'
    # 'base_url' is the matched URL without its trailing '/live' part.
    _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheYoungTurks/live',
        'info_dict': {
            'id': 'a48o2S1cPoo',
            'ext': 'mp4',
            'title': 'The Young Turks - Live Main Show',
            'uploader': 'The Young Turks',
            'uploader_id': 'TheYoungTurks',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
            'upload_date': '20150715',
            'license': 'Standard YouTube License',
            'description': 'md5:438179573adcdff3c97ebb1ee632b891',
            'categories': ['News & Politics'],
            'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/TheYoungTurks/live',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Resolve a '/live' URL to the video it shows, if any.

        Returns a url_result for the video when the downloaded page is of
        a 'video...' type and exposes an 11-character video id.  In every
        other case, including a failed download, returns a url_result for
        the base URL.
        """
        url_match = re.match(self._VALID_URL, url)
        channel_id = url_match.group('id')
        base_url = url_match.group('base_url')

        webpage = self._download_webpage(url, channel_id, fatal=False)
        if not webpage:
            return self.url_result(base_url)

        page_type = self._og_search_property(
            'type', webpage, 'page type', default='')
        video_id = self._html_search_meta(
            'videoId', webpage, 'video id', default=None)
        shows_video = (
            page_type.startswith('video')
            and video_id
            and re.match(r'^[0-9A-Za-z_-]{11}$', video_id))
        if shows_video:
            return self.url_result(video_id, YoutubeIE.ie_key())
        return self.url_result(base_url)

3036

3037

3038

class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
    # Only class-level configuration is declared here; no method is
    # overridden in this class.
    IE_NAME = 'youtube:playlists'
    IE_DESC = 'YouTube.com user/channel playlists'
    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+)/playlists'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
        'playlist_mincount': 4,
        'info_dict': {
            'id': 'ThirstForScience',
            'title': 'ThirstForScience',
        },
    }, {
        # with "Load more" button
        'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
        'playlist_mincount': 70,
        'info_dict': {
            'id': 'igorkle1',
            'title': 'Игорь Клейнер',
        },
    }, {
        'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists',
        'playlist_mincount': 17,
        'info_dict': {
            'id': 'UCiU1dHvZObB2iP6xkJ__Icw',
            'title': 'Chem Player',
        },
        'skip': 'Blocked',
    }]

class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor):
    # Matches a watch link (11-character 'id' group), optionally followed
    # by a 'title' attribute captured in the 'title' group.
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?'

3071

3072

3073

class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor):
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # Extra parameters merged into the query string of the first request.
    _EXTRA_QUERY_ARGS = {}
    _TESTS = []

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Result pages are downloaded one after another, following the
        "Next" link of each page, until ``n`` videos have been collected,
        a page yields no video, or no "Next" link is found.  At most ``n``
        videos are returned, wrapped in a playlist result titled ``query``.

        Raises ExtractorError (expected) when a downloaded page contains a
        search message instead of results.
        """
        videos = []

        url_query = {
            'search_query': query.encode('utf-8'),
        }
        url_query.update(self._EXTRA_QUERY_ARGS)
        result_url = 'https://www.youtube.com/results?' + compat_urllib_parse_urlencode(url_query)

        for pagenum in itertools.count(1):
            data = self._download_json(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % pagenum,
                errnote='Unable to download API page',
                query={'spf': 'navigate'})
            html_content = data[1]['body']['content']

            if 'class="search-message' in html_content:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            new_videos = list(self._process_page(html_content))
            videos += new_videos
            # Stop as soon as enough videos are collected.  Using ">=" (not
            # ">") avoids downloading one more page when exactly n results
            # are already available.
            if not new_videos or len(videos) >= n:
                break
            next_link = self._html_search_regex(
                r'href="(/results\?[^"]*\bsp=[^"]+)"[^>]*>\s*<span[^>]+class="[^"]*\byt-uix-button-content\b[^"]*"[^>]*>Next',
                html_content, 'next link', default=None)
            if next_link is None:
                break
            result_url = compat_urlparse.urljoin('https://www.youtube.com/', next_link)

        # The last page may have pushed the total past n.
        if len(videos) > n:
            videos = videos[:n]
        return self.playlist_result(videos, query)

3121

3122

3123

class YoutubeSearchDateIE(YoutubeSearchIE):
    # Same search as the parent class, with an extra sort parameter.
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    IE_DESC = 'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}

3128

3129

3130

class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor):
    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the results of the search page at ``url`` as a playlist.

        The playlist title is the unquoted search query taken from the URL.
        """
        raw_query = re.match(self._VALID_URL, url).group('query')
        query = compat_urllib_parse_unquote_plus(raw_query)
        webpage = self._download_webpage(url, query)
        entries = self._process_page(webpage)
        return self.playlist_result(entries, playlist_title=query)

3150

3151

3152

class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'https://www.youtube.com/show/airdisasters',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Delegate to the parent extractor using the show's playlists URL."""
        show_id = self._match_id(url)
        playlists_url = 'https://www.youtube.com/show/%s/playlists' % show_id
        return super(YoutubeShowIE, self)._real_extract(playlists_url)

3169

3170

3171

class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Common base for the feed extractors.
    Subclasses have to provide the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _entries(self, page):
        """Yield one result per video id found in the feed.

        ``page`` is the first feed page.  Further portions are fetched
        through the "load more" link for as long as one is present and the
        current portion contains at least one id not seen before.
        """
        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        seen_ids = []
        more_widget_html = content_html = page
        for page_num in itertools.count(1):
            found_ids = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)

            # 'recommended' feed has infinite 'load more' and each new portion spins
            # the same videos in (sometimes) slightly different order, so we'll check
            # for unicity and break when portion has no new videos
            new_ids = [
                video_id for video_id in orderedSet(found_ids)
                if video_id not in seen_ids]
            if not new_ids:
                break

            seen_ids.extend(new_ids)

            for entry in self._ids_to_results(new_ids):
                yield entry

            load_more = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not load_more:
                break

            more = self._download_json(
                'https://youtube.com/%s' % load_more.group('more'), self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

    def _real_extract(self, url):
        """Download the feed page and return its videos as a playlist."""
        feed_url = 'https://www.youtube.com/feed/%s' % self._FEED_NAME
        page = self._download_webpage(feed_url, self._PLAYLIST_TITLE)
        return self.playlist_result(
            self._entries(page), playlist_title=self._PLAYLIST_TITLE)

3222

3223

3224

class YoutubeWatchLaterIE(YoutubePlaylistIE):
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater'

    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=WL',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the single video requested by ``url`` or the 'WL' playlist."""
        video = self._check_download_just_video(url, 'WL')[1]
        if video:
            return video
        # Only the playlist half of the (has_videos, playlist) pair is used.
        playlist = self._extract_playlist('WL')[1]
        return playlist

class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Hand the favourites playlist over to the 'YoutubePlaylist' extractor.

        The playlist id is read from the first ``list=`` parameter found in
        the favourites page.
        """
        favourites_url = 'https://www.youtube.com/my_favorites'
        webpage = self._download_webpage(favourites_url, 'Youtube Favourites videos')
        playlist_id = self._search_regex(
            r'list=(.+?)["&]', webpage, 'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')

3255

3256

3257

class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # Feed settings read by the base class.
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?'

3262

3263

3264

class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    # Feed settings read by the base class.
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'

3269

3270

3271

class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    # Feed settings read by the base class.
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory'

3276

3277

3278

class YoutubeTruncatedURLIE(InfoExtractor):
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    # Matches watch URLs whose query holds at most one of the listed
    # parameters, and attribution links holding only the "a" parameter.
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always fail with a hint about quoting the URL in the shell."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(hint, expected=True)

class YoutubeTruncatedIDIE(InfoExtractor):
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    # Matches watch URLs ending in a video id of 1 to 10 characters.
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always fail, reporting the incomplete video id found in ``url``."""
        short_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (short_id, url)
        raise ExtractorError(message, expected=True)