# Source: youtube_dl/extractor/youtube.py (from the yt-dlp git history)
# Commit: "[youtube] Simplify and make sure header values are strings"
1# coding: utf-8
2
3from __future__ import unicode_literals
4
5
6import itertools
7import json
8import os.path
9import re
10import time
11import traceback
12
13from .common import InfoExtractor, SearchInfoExtractor
14from ..jsinterp import JSInterpreter
15from ..swfinterp import SWFInterpreter
16from ..compat import (
17 compat_chr,
18 compat_parse_qs,
19 compat_urllib_parse,
20 compat_urllib_parse_unquote,
21 compat_urllib_parse_unquote_plus,
22 compat_urllib_parse_urlparse,
23 compat_urlparse,
24 compat_str,
25)
26from ..utils import (
27 clean_html,
28 encode_dict,
29 ExtractorError,
30 float_or_none,
31 get_element_by_attribute,
32 get_element_by_id,
33 int_or_none,
34 orderedSet,
35 parse_duration,
36 remove_start,
37 sanitized_Request,
38 smuggle_url,
39 str_to_int,
40 unescapeHTML,
41 unified_strdate,
42 unsmuggle_url,
43 uppercase_escape,
44 ISO3166Utils,
45)
46
47
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        # Request the English interface so regex-based scraping sees
        # predictable strings.
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Turn a list of video IDs into a list of url_result dicts."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Fix: previously returned bare None here; the documented
            # contract is that False is returned when login failed.
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        login_data = compat_urllib_parse.urlencode(encode_dict(login_form_strs)).encode('ascii')

        req = sanitized_Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="challenge"', login_results) is not None:
            tfa_code = self._get_tfa_info('2-step verification code')

            if not tfa_code:
                self._downloader.report_warning(
                    'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
                    '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Accept codes pasted in Google's "G-123456" display format.
            tfa_code = remove_start(tfa_code, 'G-')

            # Carry over all hidden inputs of the challenge form.
            tfa_form_strs = self._form_hidden_inputs('challenge', login_results)

            tfa_form_strs.update({
                'Pin': tfa_code,
                'TrustDevice': 'on',
            })

            tfa_data = compat_urllib_parse.urlencode(encode_dict(tfa_form_strs)).encode('ascii')

            tfa_req = sanitized_Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            # A challenge form in the response means the code was rejected.
            if re.search(r'(?i)<form[^>]* id="challenge"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired or invalid. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        # Being served the login form again means the credentials were rejected.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _real_initialize(self):
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return
179
180
class YoutubeEntryListBaseInfoExtractor(InfoExtractor):
    # Extract entries from page with "Load more" button
    def _entries(self, page, playlist_id):
        """Generate entries from *page*, following AJAX "Load more" pagination."""
        content_html = page
        more_widget_html = page
        page_num = 0
        while True:
            page_num += 1
            for entry in self._process_page(content_html):
                yield entry

            mobj = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
                more_widget_html)
            if mobj is None:
                # No "Load more" button -> last page reached.
                return

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            if not content_html.strip():
                # Some webpages show a "Load more" button but they don't
                # have more videos
                return
            more_widget_html = more['load_more_widget_html']
203
204
class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    def _process_page(self, content):
        """Yield url_result dicts for every video found in *content*."""
        for video_id, video_title in self.extract_videos_from_page(content):
            yield self.url_result(video_id, 'Youtube', video_id, video_title)

    def extract_videos_from_page(self, page):
        """Return (video_id, title) pairs scraped from *page* via _VIDEO_RE.

        Order of first appearance is preserved; duplicate IDs are collapsed,
        with a later occurrence allowed to fill in a missing title.
        """
        ids_in_page = []
        titles_in_page = []
        # Map video_id -> index in the lists above for O(1) duplicate
        # detection (the previous list.index() scan made this loop
        # quadratic on long pages).
        pos_by_id = {}
        for mobj in re.finditer(self._VIDEO_RE, page):
            # The link with index 0 is not the first video of the playlist (not sure if still actual)
            if 'index' in mobj.groupdict() and mobj.group('id') == '0':
                continue
            video_id = mobj.group('id')
            video_title = unescapeHTML(mobj.group('title'))
            if video_title:
                video_title = video_title.strip()
            if video_id in pos_by_id:
                idx = pos_by_id[video_id]
                if video_title and not titles_in_page[idx]:
                    titles_in_page[idx] = video_title
            else:
                pos_by_id[video_id] = len(ids_in_page)
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)
        return zip(ids_in_page, titles_in_page)
229
230
class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    def _process_page(self, content):
        """Yield url_result dicts for every playlist link found in *content*."""
        playlist_ids = re.findall(r'href="/?playlist\?list=(.+?)"', content)
        for playlist_id in playlist_ids:
            playlist_url = 'https://www.youtube.com/playlist?list=%s' % playlist_id
            yield self.url_result(playlist_url, 'YoutubePlaylist')

    def _real_extract(self, url):
        """Extract all playlists linked from a playlists overview page."""
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)
        title = self._og_search_title(webpage, fatal=False)
        entries = self._entries(webpage, playlist_id)
        return self.playlist_result(entries, playlist_id, title)
242
243
244class YoutubeIE(YoutubeBaseInfoExtractor):
245 IE_DESC = 'YouTube.com'
246 _VALID_URL = r"""(?x)^
247 (
248 (?:https?://|//) # http(s):// or protocol-independent URL
249 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
250 (?:www\.)?deturl\.com/www\.youtube\.com/|
251 (?:www\.)?pwnyoutube\.com/|
252 (?:www\.)?yourepeat\.com/|
253 tube\.majestyc\.net/|
254 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
255 (?:.*?\#/)? # handle anchor (#/) redirect urls
256 (?: # the various things that can precede the ID:
257 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
258 |(?: # or the v= param in all its forms
259 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
260 (?:\?|\#!?) # the params delimiter ? or # or #!
261 (?:.*?&)?? # any other preceding param (like /?s=tuff&v=xxxx)
262 v=
263 )
264 ))
265 |(?:
266 youtu\.be| # just youtu.be/xxxx
267 vid\.plus # or vid.plus/xxxx
268 )/
269 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
270 )
271 )? # all until now is optional -> you can pass the naked ID
272 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
273 (?!.*?&list=) # combined list/video URLs are handled by the playlist IE
274 (?(1).+)? # if we found the ID, everything can follow
275 $"""
276 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
277 _formats = {
278 '5': {'ext': 'flv', 'width': 400, 'height': 240},
279 '6': {'ext': 'flv', 'width': 450, 'height': 270},
280 '13': {'ext': '3gp'},
281 '17': {'ext': '3gp', 'width': 176, 'height': 144},
282 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
283 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
284 '34': {'ext': 'flv', 'width': 640, 'height': 360},
285 '35': {'ext': 'flv', 'width': 854, 'height': 480},
286 '36': {'ext': '3gp', 'width': 320, 'height': 240},
287 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
288 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
289 '43': {'ext': 'webm', 'width': 640, 'height': 360},
290 '44': {'ext': 'webm', 'width': 854, 'height': 480},
291 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
292 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
293 '59': {'ext': 'mp4', 'width': 854, 'height': 480},
294 '78': {'ext': 'mp4', 'width': 854, 'height': 480},
295
296
297 # 3d videos
298 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
299 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
300 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
301 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
302 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
303 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
304 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
305
306 # Apple HTTP Live Streaming
307 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
308 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
309 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
310 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
311 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
312 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
313 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
314
315 # DASH mp4 video
316 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
317 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
318 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
319 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
320 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
321 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
322 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
323 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
324 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
325 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
326 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},
327
328 # Dash mp4 audio
329 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'},
330 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'},
331 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'},
332
333 # Dash webm
334 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
335 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
336 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
337 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
338 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
339 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
340 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'vp9'},
341 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
342 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
343 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
344 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
345 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
346 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
347 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
348 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
349 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
350 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
351 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
352 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
353 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'vp9'},
354 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
355
356 # Dash webm audio
357 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
358 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
359
360 # Dash webm audio with opus inside
361 '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
362 '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
363 '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
364
365 # RTMP (unnamed)
366 '_rtmp': {'protocol': 'rtmp'},
367 }
368
369 IE_NAME = 'youtube'
370 _TESTS = [
371 {
372 'url': 'http://www.youtube.com/watch?v=BaW_jenozKcj&t=1s&end=9',
373 'info_dict': {
374 'id': 'BaW_jenozKc',
375 'ext': 'mp4',
376 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
377 'uploader': 'Philipp Hagemeister',
378 'uploader_id': 'phihag',
379 'upload_date': '20121002',
380 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
381 'categories': ['Science & Technology'],
382 'tags': ['youtube-dl'],
383 'like_count': int,
384 'dislike_count': int,
385 'start_time': 1,
386 'end_time': 9,
387 }
388 },
389 {
390 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
391 'note': 'Test generic use_cipher_signature video (#897)',
392 'info_dict': {
393 'id': 'UxxajLWwzqY',
394 'ext': 'mp4',
395 'upload_date': '20120506',
396 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
397 'description': 'md5:782e8651347686cba06e58f71ab51773',
398 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
399 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
400 'iconic ep', 'iconic', 'love', 'it'],
401 'uploader': 'Icona Pop',
402 'uploader_id': 'IconaPop',
403 }
404 },
405 {
406 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
407 'note': 'Test VEVO video with age protection (#956)',
408 'info_dict': {
409 'id': '07FYdnEawAQ',
410 'ext': 'mp4',
411 'upload_date': '20130703',
412 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
413 'description': 'md5:64249768eec3bc4276236606ea996373',
414 'uploader': 'justintimberlakeVEVO',
415 'uploader_id': 'justintimberlakeVEVO',
416 'age_limit': 18,
417 }
418 },
419 {
420 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
421 'note': 'Embed-only video (#1746)',
422 'info_dict': {
423 'id': 'yZIXLfi8CZQ',
424 'ext': 'mp4',
425 'upload_date': '20120608',
426 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
427 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
428 'uploader': 'SET India',
429 'uploader_id': 'setindia',
430 'age_limit': 18,
431 }
432 },
433 {
434 'url': 'http://www.youtube.com/watch?v=BaW_jenozKcj&v=UxxajLWwzqY',
435 'note': 'Use the first video ID in the URL',
436 'info_dict': {
437 'id': 'BaW_jenozKc',
438 'ext': 'mp4',
439 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
440 'uploader': 'Philipp Hagemeister',
441 'uploader_id': 'phihag',
442 'upload_date': '20121002',
443 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
444 'categories': ['Science & Technology'],
445 'tags': ['youtube-dl'],
446 'like_count': int,
447 'dislike_count': int,
448 },
449 'params': {
450 'skip_download': True,
451 },
452 },
453 {
454 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
455 'note': '256k DASH audio (format 141) via DASH manifest',
456 'info_dict': {
457 'id': 'a9LDPn-MO4I',
458 'ext': 'm4a',
459 'upload_date': '20121002',
460 'uploader_id': '8KVIDEO',
461 'description': '',
462 'uploader': '8KVIDEO',
463 'title': 'UHDTV TEST 8K VIDEO.mp4'
464 },
465 'params': {
466 'youtube_include_dash_manifest': True,
467 'format': '141',
468 },
469 },
470 # DASH manifest with encrypted signature
471 {
472 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
473 'info_dict': {
474 'id': 'IB3lcPjvWLA',
475 'ext': 'm4a',
476 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
477 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
478 'uploader': 'AfrojackVEVO',
479 'uploader_id': 'AfrojackVEVO',
480 'upload_date': '20131011',
481 },
482 'params': {
483 'youtube_include_dash_manifest': True,
484 'format': '141',
485 },
486 },
487 # JS player signature function name containing $
488 {
489 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
490 'info_dict': {
491 'id': 'nfWlot6h_JM',
492 'ext': 'm4a',
493 'title': 'Taylor Swift - Shake It Off',
494 'description': 'md5:95f66187cd7c8b2c13eb78e1223b63c3',
495 'uploader': 'TaylorSwiftVEVO',
496 'uploader_id': 'TaylorSwiftVEVO',
497 'upload_date': '20140818',
498 },
499 'params': {
500 'youtube_include_dash_manifest': True,
501 'format': '141',
502 },
503 },
504 # Controversy video
505 {
506 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
507 'info_dict': {
508 'id': 'T4XJQO3qol8',
509 'ext': 'mp4',
510 'upload_date': '20100909',
511 'uploader': 'The Amazing Atheist',
512 'uploader_id': 'TheAmazingAtheist',
513 'title': 'Burning Everyone\'s Koran',
514 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
515 }
516 },
517 # Normal age-gate video (No vevo, embed allowed)
518 {
519 'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
520 'info_dict': {
521 'id': 'HtVdAasjOgU',
522 'ext': 'mp4',
523 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
524 'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
525 'uploader': 'The Witcher',
526 'uploader_id': 'WitcherGame',
527 'upload_date': '20140605',
528 'age_limit': 18,
529 },
530 },
531 # Age-gate video with encrypted signature
532 {
533 'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU',
534 'info_dict': {
535 'id': '6kLq3WMV1nU',
536 'ext': 'mp4',
537 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
538 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
539 'uploader': 'LloydVEVO',
540 'uploader_id': 'LloydVEVO',
541 'upload_date': '20110629',
542 'age_limit': 18,
543 },
544 },
545 # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
546 {
547 'url': '__2ABJjxzNo',
548 'info_dict': {
549 'id': '__2ABJjxzNo',
550 'ext': 'mp4',
551 'upload_date': '20100430',
552 'uploader_id': 'deadmau5',
553 'description': 'md5:12c56784b8032162bb936a5f76d55360',
554 'uploader': 'deadmau5',
555 'title': 'Deadmau5 - Some Chords (HD)',
556 },
557 'expected_warnings': [
558 'DASH manifest missing',
559 ]
560 },
561 # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
562 {
563 'url': 'lqQg6PlCWgI',
564 'info_dict': {
565 'id': 'lqQg6PlCWgI',
566 'ext': 'mp4',
567 'upload_date': '20150827',
568 'uploader_id': 'olympic',
569 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
570 'uploader': 'Olympics',
571 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
572 },
573 'params': {
574 'skip_download': 'requires avconv',
575 }
576 },
577 # Non-square pixels
578 {
579 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
580 'info_dict': {
581 'id': '_b-2C3KPAM0',
582 'ext': 'mp4',
583 'stretched_ratio': 16 / 9.,
584 'upload_date': '20110310',
585 'uploader_id': 'AllenMeow',
586 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
587 'uploader': '孫艾倫',
588 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
589 },
590 },
591 # url_encoded_fmt_stream_map is empty string
592 {
593 'url': 'qEJwOuvDf7I',
594 'info_dict': {
595 'id': 'qEJwOuvDf7I',
596 'ext': 'webm',
597 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
598 'description': '',
599 'upload_date': '20150404',
600 'uploader_id': 'spbelect',
601 'uploader': 'Наблюдатели Петербурга',
602 },
603 'params': {
604 'skip_download': 'requires avconv',
605 }
606 },
607 # Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097)
608 {
609 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
610 'info_dict': {
611 'id': 'FIl7x6_3R5Y',
612 'ext': 'mp4',
613 'title': 'md5:7b81415841e02ecd4313668cde88737a',
614 'description': 'md5:116377fd2963b81ec4ce64b542173306',
615 'upload_date': '20150625',
616 'uploader_id': 'dorappi2000',
617 'uploader': 'dorappi2000',
618 'formats': 'mincount:33',
619 },
620 },
621 # DASH manifest with segment_list
622 {
623 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
624 'md5': '8ce563a1d667b599d21064e982ab9e31',
625 'info_dict': {
626 'id': 'CsmdDsKjzN8',
627 'ext': 'mp4',
628 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
629 'uploader': 'Airtek',
630 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
631 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
632 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
633 },
634 'params': {
635 'youtube_include_dash_manifest': True,
636 'format': '135', # bestvideo
637 }
638 },
639 {
640 # Multifeed videos (multiple cameras), URL is for Main Camera
641 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
642 'info_dict': {
643 'id': 'jqWvoWXjCVs',
644 'title': 'teamPGP: Rocket League Noob Stream',
645 'description': 'md5:dc7872fb300e143831327f1bae3af010',
646 },
647 'playlist': [{
648 'info_dict': {
649 'id': 'jqWvoWXjCVs',
650 'ext': 'mp4',
651 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
652 'description': 'md5:dc7872fb300e143831327f1bae3af010',
653 'upload_date': '20150721',
654 'uploader': 'Beer Games Beer',
655 'uploader_id': 'beergamesbeer',
656 },
657 }, {
658 'info_dict': {
659 'id': '6h8e8xoXJzg',
660 'ext': 'mp4',
661 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
662 'description': 'md5:dc7872fb300e143831327f1bae3af010',
663 'upload_date': '20150721',
664 'uploader': 'Beer Games Beer',
665 'uploader_id': 'beergamesbeer',
666 },
667 }, {
668 'info_dict': {
669 'id': 'PUOgX5z9xZw',
670 'ext': 'mp4',
671 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
672 'description': 'md5:dc7872fb300e143831327f1bae3af010',
673 'upload_date': '20150721',
674 'uploader': 'Beer Games Beer',
675 'uploader_id': 'beergamesbeer',
676 },
677 }, {
678 'info_dict': {
679 'id': 'teuwxikvS5k',
680 'ext': 'mp4',
681 'title': 'teamPGP: Rocket League Noob Stream (zim)',
682 'description': 'md5:dc7872fb300e143831327f1bae3af010',
683 'upload_date': '20150721',
684 'uploader': 'Beer Games Beer',
685 'uploader_id': 'beergamesbeer',
686 },
687 }],
688 'params': {
689 'skip_download': True,
690 },
691 },
692 {
693 'url': 'http://vid.plus/FlRa-iH7PGw',
694 'only_matching': True,
695 },
696 {
697 # Title with JS-like syntax "};" (see https://github.com/rg3/youtube-dl/issues/7468)
698 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
699 'info_dict': {
700 'id': 'lsguqyKfVQg',
701 'ext': 'mp4',
702 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
703 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
704 'upload_date': '20151119',
705 'uploader_id': 'IronSoulElf',
706 'uploader': 'IronSoulElf',
707 },
708 'params': {
709 'skip_download': True,
710 },
711 },
712 {
713 # Tags with '};' (see https://github.com/rg3/youtube-dl/issues/7468)
714 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
715 'only_matching': True,
716 },
717 {
718 # Video with yt:stretch=17:0
719 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
720 'info_dict': {
721 'id': 'Q39EVAstoRM',
722 'ext': 'mp4',
723 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
724 'description': 'md5:ee18a25c350637c8faff806845bddee9',
725 'upload_date': '20151107',
726 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
727 'uploader': 'CH GAMER DROID',
728 },
729 'params': {
730 'skip_download': True,
731 },
732 },
733 ]
734
    def __init__(self, *args, **kwargs):
        super(YoutubeIE, self).__init__(*args, **kwargs)
        # Per-instance cache of extracted signature-decryption functions,
        # keyed by (player_url, signature cache id); see _decrypt_signature.
        self._player_cache = {}
738
    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download the video info webpage for video_id."""
        self.to_screen('%s: Downloading video info webpage' % video_id)
742
    def report_information_extraction(self, video_id):
        """Report attempt to extract video information for video_id."""
        self.to_screen('%s: Extracting video information' % video_id)
746
    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available for video_id."""
        self.to_screen('%s: Format %s not available' % (video_id, format))
750
    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen('RTMP download detected')
754
755 def _signature_cache_id(self, example_sig):
756 """ Return a string representation of a signature """
757 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
758
    def _extract_signature_function(self, video_id, player_url, example_sig):
        """Build (or load from the filesystem cache) the signature-decryption
        function for the player at player_url.

        example_sig is used only to derive the cache key (the pattern of
        dot-separated part lengths) and a probe input for caching.
        Returns a callable mapping an encrypted signature string to the
        decrypted one.  Raises ExtractorError for unrecognized player URLs.
        """
        id_m = re.match(
            r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|/base)?\.(?P<ext>[a-z]+)$',
            player_url)
        if not id_m:
            raise ExtractorError('Cannot identify player %r' % player_url)
        # Player type is taken from the URL extension ('js' or 'swf').
        player_type = id_m.group('ext')
        player_id = id_m.group('id')

        # Read from filesystem cache
        func_id = '%s_%s_%s' % (
            player_type, player_id, self._signature_cache_id(example_sig))
        # func_id doubles as a cache filename component; make sure it cannot
        # escape the cache directory.
        assert os.path.basename(func_id) == func_id

        cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
        if cache_spec is not None:
            # Cached form: a list of input-character indices to emit in order.
            return lambda s: ''.join(s[i] for i in cache_spec)

        download_note = (
            'Downloading player %s' % player_url
            if self._downloader.params.get('verbose') else
            'Downloading %s player %s' % (player_type, player_id)
        )
        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=download_note,
                errnote='Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=download_note,
                errnote='Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        # Run the extracted function on a probe string of distinct characters
        # to record which input position feeds each output position, so the
        # function can later be replayed from cache without re-interpreting
        # the JS/SWF player.  NOTE(review): this assumes the function only
        # permutes/drops characters, never synthesizes new ones.
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = res(test_string)
        cache_spec = [ord(c) for c in cache_res]

        self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
        return res
804
    def _print_sig_code(self, func, example_sig):
        """Print Python source equivalent to the extracted signature function
        (debug aid for the youtube_print_sig_code option)."""
        def gen_sig_code(idxs):
            # Compress the index list into slice expressions wherever the
            # indices form arithmetic runs with step +1 or -1.
            def _genslice(start, end, step):
                starts = '' if start == 0 else str(start)
                # end is inclusive here; extend by one step for slice syntax,
                # falling back to ':' when that would go negative.
                ends = (':%d' % (end + step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            # Quelch pyflakes warnings - start will be set when step is set
            start = '(Never used)'
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    # Inside a run: extend it, or emit it when it breaks.
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    # Two adjacent indices start a new sliceable run.
                    step = i - prev
                    start = prev
                    continue
                else:
                    # Isolated index: emit a single subscript.
                    yield 's[%d]' % prev
            # Flush the final element or the still-open run.
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        # Probe with distinct characters to recover the permutation (same
        # technique as _extract_signature_function).
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                '    return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
843
844 def _parse_sig_js(self, jscode):
845 funcname = self._search_regex(
846 r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode,
847 'Initial JS player signature function name')
848
849 jsi = JSInterpreter(jscode)
850 initial_function = jsi.extract_function(funcname)
851 return lambda s: initial_function([s])
852
853 def _parse_sig_swf(self, file_contents):
854 swfi = SWFInterpreter(file_contents)
855 TARGET_CLASSNAME = 'SignatureDecipher'
856 searched_class = swfi.extract_class(TARGET_CLASSNAME)
857 initial_function = swfi.extract_function(searched_class, 'decipher')
858 return lambda s: initial_function([s])
859
860 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
861 """Turn the encrypted s field into a working signature"""
862
863 if player_url is None:
864 raise ExtractorError('Cannot decrypt signature without player_url')
865
866 if player_url.startswith('//'):
867 player_url = 'https:' + player_url
868 try:
869 player_id = (player_url, self._signature_cache_id(s))
870 if player_id not in self._player_cache:
871 func = self._extract_signature_function(
872 video_id, player_url, s
873 )
874 self._player_cache[player_id] = func
875 func = self._player_cache[player_id]
876 if self._downloader.params.get('youtube_print_sig_code'):
877 self._print_sig_code(func, s)
878 return func(s)
879 except Exception as e:
880 tb = traceback.format_exc()
881 raise ExtractorError(
882 'Signature extraction failed: ' + tb, cause=e)
883
884 def _get_subtitles(self, video_id, webpage):
885 try:
886 subs_doc = self._download_xml(
887 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
888 video_id, note=False)
889 except ExtractorError as err:
890 self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
891 return {}
892
893 sub_lang_list = {}
894 for track in subs_doc.findall('track'):
895 lang = track.attrib['lang_code']
896 if lang in sub_lang_list:
897 continue
898 sub_formats = []
899 for ext in ['sbv', 'vtt', 'srt']:
900 params = compat_urllib_parse.urlencode({
901 'lang': lang,
902 'v': video_id,
903 'fmt': ext,
904 'name': track.attrib['name'].encode('utf-8'),
905 })
906 sub_formats.append({
907 'url': 'https://www.youtube.com/api/timedtext?' + params,
908 'ext': ext,
909 })
910 sub_lang_list[lang] = sub_formats
911 if not sub_lang_list:
912 self._downloader.report_warning('video doesn\'t have subtitles')
913 return {}
914 return sub_lang_list
915
916 def _get_ytplayer_config(self, video_id, webpage):
917 patterns = (
918 # User data may contain arbitrary character sequences that may affect
919 # JSON extraction with regex, e.g. when '};' is contained the second
920 # regex won't capture the whole JSON. Yet working around by trying more
921 # concrete regex first keeping in mind proper quoted string handling
922 # to be implemented in future that will replace this workaround (see
923 # https://github.com/rg3/youtube-dl/issues/7468,
924 # https://github.com/rg3/youtube-dl/pull/7599)
925 r';ytplayer\.config\s*=\s*({.+?});ytplayer',
926 r';ytplayer\.config\s*=\s*({.+?});',
927 )
928 config = self._search_regex(
929 patterns, webpage, 'ytplayer.config', default=None)
930 if config:
931 return self._parse_json(
932 uppercase_escape(config), video_id, fatal=False)
933
934 def _get_automatic_captions(self, video_id, webpage):
935 """We need the webpage for getting the captions url, pass it as an
936 argument to speed up the process."""
937 self.to_screen('%s: Looking for automatic captions' % video_id)
938 player_config = self._get_ytplayer_config(video_id, webpage)
939 err_msg = 'Couldn\'t find automatic captions for %s' % video_id
940 if not player_config:
941 self._downloader.report_warning(err_msg)
942 return {}
943 try:
944 args = player_config['args']
945 caption_url = args['ttsurl']
946 timestamp = args['timestamp']
947 # We get the available subtitles
948 list_params = compat_urllib_parse.urlencode({
949 'type': 'list',
950 'tlangs': 1,
951 'asrs': 1,
952 })
953 list_url = caption_url + '&' + list_params
954 caption_list = self._download_xml(list_url, video_id)
955 original_lang_node = caption_list.find('track')
956 if original_lang_node is None:
957 self._downloader.report_warning('Video doesn\'t have automatic captions')
958 return {}
959 original_lang = original_lang_node.attrib['lang_code']
960 caption_kind = original_lang_node.attrib.get('kind', '')
961
962 sub_lang_list = {}
963 for lang_node in caption_list.findall('target'):
964 sub_lang = lang_node.attrib['lang_code']
965 sub_formats = []
966 for ext in ['sbv', 'vtt', 'srt']:
967 params = compat_urllib_parse.urlencode({
968 'lang': original_lang,
969 'tlang': sub_lang,
970 'fmt': ext,
971 'ts': timestamp,
972 'kind': caption_kind,
973 })
974 sub_formats.append({
975 'url': caption_url + '&' + params,
976 'ext': ext,
977 })
978 sub_lang_list[sub_lang] = sub_formats
979 return sub_lang_list
980 # An extractor error can be raise by the download process if there are
981 # no automatic captions but there are subtitles
982 except (KeyError, ExtractorError):
983 self._downloader.report_warning(err_msg)
984 return {}
985
986 @classmethod
987 def extract_id(cls, url):
988 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
989 if mobj is None:
990 raise ExtractorError('Invalid URL: %s' % url)
991 video_id = mobj.group(2)
992 return video_id
993
994 def _extract_from_m3u8(self, manifest_url, video_id):
995 url_map = {}
996
997 def _get_urls(_manifest):
998 lines = _manifest.split('\n')
999 urls = filter(lambda l: l and not l.startswith('#'),
1000 lines)
1001 return urls
1002 manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
1003 formats_urls = _get_urls(manifest)
1004 for format_url in formats_urls:
1005 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1006 url_map[itag] = format_url
1007 return url_map
1008
1009 def _extract_annotations(self, video_id):
1010 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
1011 return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
1012
    def _parse_dash_manifest(
            self, video_id, dash_manifest_url, player_url, age_gate, fatal=True):
        """Download and parse a DASH MPD manifest into a list of format dicts.

        Encrypted '/s/<sig>' path components in the manifest URL are deciphered
        in place before downloading. Returns [] when the download fails and
        fatal is False.
        """
        def decrypt_sig(mobj):
            # re.sub replacement callback: decipher one '/s/...' signature.
            s = mobj.group(1)
            dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
            return '/signature/%s' % dec_s
        dash_manifest_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, dash_manifest_url)
        dash_doc = self._download_xml(
            dash_manifest_url, video_id,
            note='Downloading DASH manifest',
            errnote='Could not download DASH manifest',
            fatal=fatal)

        # _download_xml returns False (not None) on non-fatal failure.
        if dash_doc is False:
            return []

        formats = []
        for a in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}AdaptationSet'):
            mime_type = a.attrib.get('mimeType')
            for r in a.findall('{urn:mpeg:DASH:schema:MPD:2011}Representation'):
                url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
                if url_el is None:
                    continue
                if mime_type == 'text/vtt':
                    # TODO implement WebVTT downloading
                    pass
                elif mime_type.startswith('audio/') or mime_type.startswith('video/'):
                    segment_list = r.find('{urn:mpeg:DASH:schema:MPD:2011}SegmentList')
                    format_id = r.attrib['id']
                    video_url = url_el.text
                    # File size is carried in a YouTube-specific XML attribute.
                    filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
                    f = {
                        'format_id': format_id,
                        'url': video_url,
                        'width': int_or_none(r.attrib.get('width')),
                        'height': int_or_none(r.attrib.get('height')),
                        'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                        'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                        'filesize': filesize,
                        'fps': int_or_none(r.attrib.get('frameRate')),
                    }
                    if segment_list is not None:
                        f.update({
                            'initialization_url': segment_list.find('{urn:mpeg:DASH:schema:MPD:2011}Initialization').attrib['sourceURL'],
                            'segment_urls': [segment.attrib.get('media') for segment in segment_list.findall('{urn:mpeg:DASH:schema:MPD:2011}SegmentURL')],
                            'protocol': 'http_dash_segments',
                        })
                    # Deduplicate by format_id: merge into an existing entry if
                    # this itag was already seen, otherwise enrich with the
                    # static metadata from self._formats and append.
                    try:
                        existing_format = next(
                            fo for fo in formats
                            if fo['format_id'] == format_id)
                    except StopIteration:
                        full_info = self._formats.get(format_id, {}).copy()
                        full_info.update(f)
                        codecs = r.attrib.get('codecs')
                        if codecs:
                            # The manifest gives one codec string; attribute it
                            # to whichever side (audio/video) is still unknown.
                            if full_info.get('acodec') == 'none' and 'vcodec' not in full_info:
                                full_info['vcodec'] = codecs
                            elif full_info.get('vcodec') == 'none' and 'acodec' not in full_info:
                                full_info['acodec'] = codecs
                        formats.append(full_info)
                    else:
                        existing_format.update(f)
                else:
                    self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
        return formats
1079
1080 def _real_extract(self, url):
1081 url, smuggled_data = unsmuggle_url(url, {})
1082
1083 proto = (
1084 'http' if self._downloader.params.get('prefer_insecure', False)
1085 else 'https')
1086
1087 start_time = None
1088 end_time = None
1089 parsed_url = compat_urllib_parse_urlparse(url)
1090 for component in [parsed_url.fragment, parsed_url.query]:
1091 query = compat_parse_qs(component)
1092 if start_time is None and 't' in query:
1093 start_time = parse_duration(query['t'][0])
1094 if start_time is None and 'start' in query:
1095 start_time = parse_duration(query['start'][0])
1096 if end_time is None and 'end' in query:
1097 end_time = parse_duration(query['end'][0])
1098
1099 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1100 mobj = re.search(self._NEXT_URL_RE, url)
1101 if mobj:
1102 url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
1103 video_id = self.extract_id(url)
1104
1105 # Get video webpage
1106 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
1107 video_webpage = self._download_webpage(url, video_id)
1108
1109 # Attempt to extract SWF player URL
1110 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1111 if mobj is not None:
1112 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1113 else:
1114 player_url = None
1115
1116 dash_mpds = []
1117
1118 def add_dash_mpd(video_info):
1119 dash_mpd = video_info.get('dashmpd')
1120 if dash_mpd and dash_mpd[0] not in dash_mpds:
1121 dash_mpds.append(dash_mpd[0])
1122
1123 # Get video info
1124 embed_webpage = None
1125 is_live = None
1126 if re.search(r'player-age-gate-content">', video_webpage) is not None:
1127 age_gate = True
1128 # We simulate the access to the video from www.youtube.com/v/{video_id}
1129 # this can be viewed without login into Youtube
1130 url = proto + '://www.youtube.com/embed/%s' % video_id
1131 embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
1132 data = compat_urllib_parse.urlencode({
1133 'video_id': video_id,
1134 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1135 'sts': self._search_regex(
1136 r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
1137 })
1138 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1139 video_info_webpage = self._download_webpage(
1140 video_info_url, video_id,
1141 note='Refetching age-gated info webpage',
1142 errnote='unable to download video info webpage')
1143 video_info = compat_parse_qs(video_info_webpage)
1144 add_dash_mpd(video_info)
1145 else:
1146 age_gate = False
1147 video_info = None
1148 # Try looking directly into the video webpage
1149 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1150 if ytplayer_config:
1151 args = ytplayer_config['args']
1152 if args.get('url_encoded_fmt_stream_map'):
1153 # Convert to the same format returned by compat_parse_qs
1154 video_info = dict((k, [v]) for k, v in args.items())
1155 add_dash_mpd(video_info)
1156 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1157 is_live = True
1158 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1159 # We also try looking in get_video_info since it may contain different dashmpd
1160 # URL that points to a DASH manifest with possibly different itag set (some itags
1161 # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
1162 # manifest pointed by get_video_info's dashmpd).
1163 # The general idea is to take a union of itags of both DASH manifests (for example
1164 # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
1165 self.report_video_info_webpage_download(video_id)
1166 for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']:
1167 video_info_url = (
1168 '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1169 % (proto, video_id, el_type))
1170 video_info_webpage = self._download_webpage(
1171 video_info_url,
1172 video_id, note=False,
1173 errnote='unable to download video info webpage')
1174 get_video_info = compat_parse_qs(video_info_webpage)
1175 if get_video_info.get('use_cipher_signature') != ['True']:
1176 add_dash_mpd(get_video_info)
1177 if not video_info:
1178 video_info = get_video_info
1179 if 'token' in get_video_info:
1180 # Different get_video_info requests may report different results, e.g.
1181 # some may report video unavailability, but some may serve it without
1182 # any complaint (see https://github.com/rg3/youtube-dl/issues/7362,
1183 # the original webpage as well as el=info and el=embedded get_video_info
1184 # requests report video unavailability due to geo restriction while
1185 # el=detailpage succeeds and returns valid data). This is probably
1186 # due to YouTube measures against IP ranges of hosting providers.
1187 # Working around by preferring the first succeeded video_info containing
1188 # the token if no such video_info yet was found.
1189 if 'token' not in video_info:
1190 video_info = get_video_info
1191 break
1192 if 'token' not in video_info:
1193 if 'reason' in video_info:
1194 if 'The uploader has not made this video available in your country.' in video_info['reason']:
1195 regions_allowed = self._html_search_meta('regionsAllowed', video_webpage, default=None)
1196 if regions_allowed:
1197 raise ExtractorError('YouTube said: This video is available in %s only' % (
1198 ', '.join(map(ISO3166Utils.short2full, regions_allowed.split(',')))),
1199 expected=True)
1200 raise ExtractorError(
1201 'YouTube said: %s' % video_info['reason'][0],
1202 expected=True, video_id=video_id)
1203 else:
1204 raise ExtractorError(
1205 '"token" parameter not in video info for unknown reason',
1206 video_id=video_id)
1207
1208 # title
1209 if 'title' in video_info:
1210 video_title = video_info['title'][0]
1211 else:
1212 self._downloader.report_warning('Unable to extract video title')
1213 video_title = '_'
1214
1215 # description
1216 video_description = get_element_by_id("eow-description", video_webpage)
1217 if video_description:
1218 video_description = re.sub(r'''(?x)
1219 <a\s+
1220 (?:[a-zA-Z-]+="[^"]+"\s+)*?
1221 title="([^"]+)"\s+
1222 (?:[a-zA-Z-]+="[^"]+"\s+)*?
1223 class="yt-uix-redirect-link"\s*>
1224 [^<]+
1225 </a>
1226 ''', r'\1', video_description)
1227 video_description = clean_html(video_description)
1228 else:
1229 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1230 if fd_mobj:
1231 video_description = unescapeHTML(fd_mobj.group(1))
1232 else:
1233 video_description = ''
1234
1235 if 'multifeed_metadata_list' in video_info and not smuggled_data.get('force_singlefeed', False):
1236 if not self._downloader.params.get('noplaylist'):
1237 entries = []
1238 feed_ids = []
1239 multifeed_metadata_list = compat_urllib_parse_unquote_plus(video_info['multifeed_metadata_list'][0])
1240 for feed in multifeed_metadata_list.split(','):
1241 feed_data = compat_parse_qs(feed)
1242 entries.append({
1243 '_type': 'url_transparent',
1244 'ie_key': 'Youtube',
1245 'url': smuggle_url(
1246 '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
1247 {'force_singlefeed': True}),
1248 'title': '%s (%s)' % (video_title, feed_data['title'][0]),
1249 })
1250 feed_ids.append(feed_data['id'][0])
1251 self.to_screen(
1252 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
1253 % (', '.join(feed_ids), video_id))
1254 return self.playlist_result(entries, video_id, video_title, video_description)
1255 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
1256
1257 if 'view_count' in video_info:
1258 view_count = int(video_info['view_count'][0])
1259 else:
1260 view_count = None
1261
1262 # Check for "rental" videos
1263 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1264 raise ExtractorError('"rental" videos not supported')
1265
1266 # Start extracting information
1267 self.report_information_extraction(video_id)
1268
1269 # uploader
1270 if 'author' not in video_info:
1271 raise ExtractorError('Unable to extract uploader name')
1272 video_uploader = compat_urllib_parse_unquote_plus(video_info['author'][0])
1273
1274 # uploader_id
1275 video_uploader_id = None
1276 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
1277 if mobj is not None:
1278 video_uploader_id = mobj.group(1)
1279 else:
1280 self._downloader.report_warning('unable to extract uploader nickname')
1281
1282 # thumbnail image
1283 # We try first to get a high quality image:
1284 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1285 video_webpage, re.DOTALL)
1286 if m_thumb is not None:
1287 video_thumbnail = m_thumb.group(1)
1288 elif 'thumbnail_url' not in video_info:
1289 self._downloader.report_warning('unable to extract video thumbnail')
1290 video_thumbnail = None
1291 else: # don't panic if we can't find it
1292 video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])
1293
1294 # upload date
1295 upload_date = self._html_search_meta(
1296 'datePublished', video_webpage, 'upload date', default=None)
1297 if not upload_date:
1298 upload_date = self._search_regex(
1299 [r'(?s)id="eow-date.*?>(.*?)</span>',
1300 r'id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live|Started) on (.+?)</strong>'],
1301 video_webpage, 'upload date', default=None)
1302 if upload_date:
1303 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1304 upload_date = unified_strdate(upload_date)
1305
1306 m_cat_container = self._search_regex(
1307 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
1308 video_webpage, 'categories', default=None)
1309 if m_cat_container:
1310 category = self._html_search_regex(
1311 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
1312 default=None)
1313 video_categories = None if category is None else [category]
1314 else:
1315 video_categories = None
1316
1317 video_tags = [
1318 unescapeHTML(m.group('content'))
1319 for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
1320
1321 def _extract_count(count_name):
1322 return str_to_int(self._search_regex(
1323 r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
1324 % re.escape(count_name),
1325 video_webpage, count_name, default=None))
1326
1327 like_count = _extract_count('like')
1328 dislike_count = _extract_count('dislike')
1329
1330 # subtitles
1331 video_subtitles = self.extract_subtitles(video_id, video_webpage)
1332 automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
1333
1334 if 'length_seconds' not in video_info:
1335 self._downloader.report_warning('unable to extract video duration')
1336 video_duration = None
1337 else:
1338 video_duration = int(compat_urllib_parse_unquote_plus(video_info['length_seconds'][0]))
1339
1340 # annotations
1341 video_annotations = None
1342 if self._downloader.params.get('writeannotations', False):
1343 video_annotations = self._extract_annotations(video_id)
1344
1345 def _map_to_format_list(urlmap):
1346 formats = []
1347 for itag, video_real_url in urlmap.items():
1348 dct = {
1349 'format_id': itag,
1350 'url': video_real_url,
1351 'player_url': player_url,
1352 }
1353 if itag in self._formats:
1354 dct.update(self._formats[itag])
1355 formats.append(dct)
1356 return formats
1357
1358 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1359 self.report_rtmp_download()
1360 formats = [{
1361 'format_id': '_rtmp',
1362 'protocol': 'rtmp',
1363 'url': video_info['conn'][0],
1364 'player_url': player_url,
1365 }]
1366 elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1:
1367 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
1368 if 'rtmpe%3Dyes' in encoded_url_map:
1369 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
1370 formats = []
1371 for url_data_str in encoded_url_map.split(','):
1372 url_data = compat_parse_qs(url_data_str)
1373 if 'itag' not in url_data or 'url' not in url_data:
1374 continue
1375 format_id = url_data['itag'][0]
1376 url = url_data['url'][0]
1377
1378 if 'sig' in url_data:
1379 url += '&signature=' + url_data['sig'][0]
1380 elif 's' in url_data:
1381 encrypted_sig = url_data['s'][0]
1382 ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'
1383
1384 jsplayer_url_json = self._search_regex(
1385 ASSETS_RE,
1386 embed_webpage if age_gate else video_webpage,
1387 'JS player URL (1)', default=None)
1388 if not jsplayer_url_json and not age_gate:
1389 # We need the embed website after all
1390 if embed_webpage is None:
1391 embed_url = proto + '://www.youtube.com/embed/%s' % video_id
1392 embed_webpage = self._download_webpage(
1393 embed_url, video_id, 'Downloading embed webpage')
1394 jsplayer_url_json = self._search_regex(
1395 ASSETS_RE, embed_webpage, 'JS player URL')
1396
1397 player_url = json.loads(jsplayer_url_json)
1398 if player_url is None:
1399 player_url_json = self._search_regex(
1400 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
1401 video_webpage, 'age gate player URL')
1402 player_url = json.loads(player_url_json)
1403
1404 if self._downloader.params.get('verbose'):
1405 if player_url is None:
1406 player_version = 'unknown'
1407 player_desc = 'unknown'
1408 else:
1409 if player_url.endswith('swf'):
1410 player_version = self._search_regex(
1411 r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
1412 'flash player', fatal=False)
1413 player_desc = 'flash player %s' % player_version
1414 else:
1415 player_version = self._search_regex(
1416 [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', r'(?:www|player)-([^/]+)/base\.js'],
1417 player_url,
1418 'html5 player', fatal=False)
1419 player_desc = 'html5 player %s' % player_version
1420
1421 parts_sizes = self._signature_cache_id(encrypted_sig)
1422 self.to_screen('{%s} signature length %s, %s' %
1423 (format_id, parts_sizes, player_desc))
1424
1425 signature = self._decrypt_signature(
1426 encrypted_sig, video_id, player_url, age_gate)
1427 url += '&signature=' + signature
1428 if 'ratebypass' not in url:
1429 url += '&ratebypass=yes'
1430
1431 # Some itags are not included in DASH manifest thus corresponding formats will
1432 # lack metadata (see https://github.com/rg3/youtube-dl/pull/5993).
1433 # Trying to extract metadata from url_encoded_fmt_stream_map entry.
1434 mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
1435 width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
1436 dct = {
1437 'format_id': format_id,
1438 'url': url,
1439 'player_url': player_url,
1440 'filesize': int_or_none(url_data.get('clen', [None])[0]),
1441 'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000),
1442 'width': width,
1443 'height': height,
1444 'fps': int_or_none(url_data.get('fps', [None])[0]),
1445 'format_note': url_data.get('quality_label', [None])[0] or url_data.get('quality', [None])[0],
1446 }
1447 type_ = url_data.get('type', [None])[0]
1448 if type_:
1449 type_split = type_.split(';')
1450 kind_ext = type_split[0].split('/')
1451 if len(kind_ext) == 2:
1452 kind, ext = kind_ext
1453 dct['ext'] = ext
1454 if kind in ('audio', 'video'):
1455 codecs = None
1456 for mobj in re.finditer(
1457 r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
1458 if mobj.group('key') == 'codecs':
1459 codecs = mobj.group('val')
1460 break
1461 if codecs:
1462 codecs = codecs.split(',')
1463 if len(codecs) == 2:
1464 acodec, vcodec = codecs[0], codecs[1]
1465 else:
1466 acodec, vcodec = (codecs[0], 'none') if kind == 'audio' else ('none', codecs[0])
1467 dct.update({
1468 'acodec': acodec,
1469 'vcodec': vcodec,
1470 })
1471 if format_id in self._formats:
1472 dct.update(self._formats[format_id])
1473 formats.append(dct)
1474 elif video_info.get('hlsvp'):
1475 manifest_url = video_info['hlsvp'][0]
1476 url_map = self._extract_from_m3u8(manifest_url, video_id)
1477 formats = _map_to_format_list(url_map)
1478 # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
1479 for a_format in formats:
1480 a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
1481 else:
1482 raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
1483
1484 # Look for the DASH manifest
1485 if self._downloader.params.get('youtube_include_dash_manifest', True):
1486 dash_mpd_fatal = True
1487 for dash_manifest_url in dash_mpds:
1488 dash_formats = {}
1489 try:
1490 for df in self._parse_dash_manifest(
1491 video_id, dash_manifest_url, player_url, age_gate, dash_mpd_fatal):
1492 # Do not overwrite DASH format found in some previous DASH manifest
1493 if df['format_id'] not in dash_formats:
1494 dash_formats[df['format_id']] = df
1495 # Additional DASH manifests may end up in HTTP Error 403 therefore
1496 # allow them to fail without bug report message if we already have
1497 # some DASH manifest succeeded. This is temporary workaround to reduce
1498 # burst of bug reports until we figure out the reason and whether it
1499 # can be fixed at all.
1500 dash_mpd_fatal = False
1501 except (ExtractorError, KeyError) as e:
1502 self.report_warning(
1503 'Skipping DASH manifest: %r' % e, video_id)
1504 if dash_formats:
1505 # Remove the formats we found through non-DASH, they
1506 # contain less info and it can be wrong, because we use
1507 # fixed values (for example the resolution). See
1508 # https://github.com/rg3/youtube-dl/issues/5774 for an
1509 # example.
1510 formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
1511 formats.extend(dash_formats.values())
1512
1513 # Check for malformed aspect ratio
1514 stretched_m = re.search(
1515 r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
1516 video_webpage)
1517 if stretched_m:
1518 w = float(stretched_m.group('w'))
1519 h = float(stretched_m.group('h'))
1520 # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
1521 # We will only process correct ratios.
1522 if w > 0 and h > 0:
1523 ratio = w / h
1524 for f in formats:
1525 if f.get('vcodec') != 'none':
1526 f['stretched_ratio'] = ratio
1527
1528 self._sort_formats(formats)
1529
1530 return {
1531 'id': video_id,
1532 'uploader': video_uploader,
1533 'uploader_id': video_uploader_id,
1534 'upload_date': upload_date,
1535 'title': video_title,
1536 'thumbnail': video_thumbnail,
1537 'description': video_description,
1538 'categories': video_categories,
1539 'tags': video_tags,
1540 'subtitles': video_subtitles,
1541 'automatic_captions': automatic_captions,
1542 'duration': video_duration,
1543 'age_limit': 18 if age_gate else 0,
1544 'annotations': video_annotations,
1545 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
1546 'view_count': view_count,
1547 'like_count': like_count,
1548 'dislike_count': dislike_count,
1549 'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
1550 'formats': formats,
1551 'is_live': is_live,
1552 'start_time': start_time,
1553 'end_time': end_time,
1554 }
1555
1556
1557class YoutubePlaylistIE(YoutubeBaseInfoExtractor, YoutubePlaylistBaseInfoExtractor):
1558 IE_DESC = 'YouTube.com playlists'
1559 _VALID_URL = r"""(?x)(?:
1560 (?:https?://)?
1561 (?:\w+\.)?
1562 youtube\.com/
1563 (?:
1564 (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
1565 \? (?:.*?&)*? (?:p|a|list)=
1566 | p/
1567 )
1568 (
1569 (?:PL|LL|EC|UU|FL|RD|UL)?[0-9A-Za-z-_]{10,}
1570 # Top tracks, they can also include dots
1571 |(?:MC)[\w\.]*
1572 )
1573 .*
1574 |
1575 ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,})
1576 )"""
1577 _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
1578 _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?'
1579 IE_NAME = 'youtube:playlist'
1580 _TESTS = [{
1581 'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
1582 'info_dict': {
1583 'title': 'ytdl test PL',
1584 'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
1585 },
1586 'playlist_count': 3,
1587 }, {
1588 'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
1589 'info_dict': {
1590 'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
1591 'title': 'YDL_Empty_List',
1592 },
1593 'playlist_count': 0,
1594 }, {
1595 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
1596 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
1597 'info_dict': {
1598 'title': '29C3: Not my department',
1599 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
1600 },
1601 'playlist_count': 95,
1602 }, {
1603 'note': 'issue #673',
1604 'url': 'PLBB231211A4F62143',
1605 'info_dict': {
1606 'title': '[OLD]Team Fortress 2 (Class-based LP)',
1607 'id': 'PLBB231211A4F62143',
1608 },
1609 'playlist_mincount': 26,
1610 }, {
1611 'note': 'Large playlist',
1612 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
1613 'info_dict': {
1614 'title': 'Uploads from Cauchemar',
1615 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
1616 },
1617 'playlist_mincount': 799,
1618 }, {
1619 'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
1620 'info_dict': {
1621 'title': 'YDL_safe_search',
1622 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
1623 },
1624 'playlist_count': 2,
1625 }, {
1626 'note': 'embedded',
1627 'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
1628 'playlist_count': 4,
1629 'info_dict': {
1630 'title': 'JODA15',
1631 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
1632 }
1633 }, {
1634 'note': 'Embedded SWF player',
1635 'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
1636 'playlist_count': 4,
1637 'info_dict': {
1638 'title': 'JODA7',
1639 'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
1640 }
1641 }, {
1642 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
1643 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
1644 'info_dict': {
1645 'title': 'Uploads from Interstellar Movie',
1646 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
1647 },
1648 'playlist_mincout': 21,
1649 }]
1650
    def _real_initialize(self):
        # Private playlists require authentication; attempt login up front.
        self._login()
1653
1654 def _extract_mix(self, playlist_id):
1655 # The mixes are generated from a single video
1656 # the id of the playlist is just 'RD' + video_id
1657 url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
1658 webpage = self._download_webpage(
1659 url, playlist_id, 'Downloading Youtube mix')
1660 search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
1661 title_span = (
1662 search_title('playlist-title') or
1663 search_title('title long-title') or
1664 search_title('title'))
1665 title = clean_html(title_span)
1666 ids = orderedSet(re.findall(
1667 r'''(?xs)data-video-username=".*?".*?
1668 href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
1669 webpage))
1670 url_results = self._ids_to_results(ids)
1671
1672 return self.playlist_result(url_results, playlist_id, title)
1673
1674 def _extract_playlist(self, playlist_id):
1675 url = self._TEMPLATE_URL % playlist_id
1676 page = self._download_webpage(url, playlist_id)
1677
1678 for match in re.findall(r'<div class="yt-alert-message">([^<]+)</div>', page):
1679 match = match.strip()
1680 # Check if the playlist exists or is private
1681 if re.match(r'[^<]*(The|This) playlist (does not exist|is private)[^<]*', match):
1682 raise ExtractorError(
1683 'The playlist doesn\'t exist or is private, use --username or '
1684 '--netrc to access it.',
1685 expected=True)
1686 elif re.match(r'[^<]*Invalid parameters[^<]*', match):
1687 raise ExtractorError(
1688 'Invalid parameters. Maybe URL is incorrect.',
1689 expected=True)
1690 elif re.match(r'[^<]*Choose your language[^<]*', match):
1691 continue
1692 else:
1693 self.report_warning('Youtube gives an alert message: ' + match)
1694
1695 playlist_title = self._html_search_regex(
1696 r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',
1697 page, 'title')
1698
1699 return self.playlist_result(self._entries(page, playlist_id), playlist_id, playlist_title)
1700
1701 def _real_extract(self, url):
1702 # Extract playlist id
1703 mobj = re.match(self._VALID_URL, url)
1704 if mobj is None:
1705 raise ExtractorError('Invalid URL: %s' % url)
1706 playlist_id = mobj.group(1) or mobj.group(2)
1707
1708 # Check if it's a video-specific URL
1709 query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
1710 if 'v' in query_dict:
1711 video_id = query_dict['v'][0]
1712 if self._downloader.params.get('noplaylist'):
1713 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
1714 return self.url_result(video_id, 'Youtube', video_id=video_id)
1715 else:
1716 self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
1717
1718 if playlist_id.startswith('RD') or playlist_id.startswith('UL'):
1719 # Mixes require a custom extraction process
1720 return self._extract_mix(playlist_id)
1721
1722 return self._extract_playlist(playlist_id)
1723
1724
class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',
            'title': 'Uploads from lex will',
        }
    }, {
        'note': 'Age restricted channel',
        # from https://www.youtube.com/user/DeusExOfficial
        'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w',
        'playlist_mincount': 64,
        'info_dict': {
            'id': 'UUs0ifCMCm1icqRbqhUINa0w',
            'title': 'Uploads from Deus Ex',
        },
    }]

    def _real_extract(self, url):
        """Extract all videos of a channel.

        Channel-by-page listing is restricted to 35 pages of 30 items,
        i.e. 1050 videos total (see #5778), so the channel's uploads
        playlist ('UU' + channel id suffix) is preferred whenever it can
        be determined; page-by-page scraping is the fallback.
        """
        channel_id = self._match_id(url)
        base_url = self._TEMPLATE_URL % channel_id

        probe_page = self._download_webpage(
            base_url + '?view=57', channel_id,
            'Downloading channel page', fatal=False)
        external_id = None
        if probe_page is not False:
            external_id = self._html_search_meta(
                'channelId', probe_page, 'channel id', default=None)
            if not external_id:
                external_id = self._search_regex(
                    r'data-(?:channel-external-|yt)id="([^"]+)"',
                    probe_page, 'channel id', default=None)
        if external_id and external_id.startswith('UC'):
            # Hand over to the playlist extractor via the uploads list.
            uploads_id = 'UU' + external_id[2:]
            return self.url_result(
                compat_urlparse.urljoin(base_url, '/playlist?list=%s' % uploads_id), 'YoutubePlaylist')

        channel_page = self._download_webpage(base_url, channel_id, 'Downloading page #1')
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # Autogenerated channels list every video on the first page;
            # their ajax continuation pages come back empty.
            return self.playlist_result([
                self.url_result(
                    video_id, 'Youtube', video_id=video_id,
                    video_title=video_title)
                for video_id, video_title in self.extract_videos_from_page(channel_page)
            ], channel_id)

        return self.playlist_result(self._entries(channel_page, channel_id), channel_id)
1793
1794
class YoutubeUserIE(YoutubeChannelIE):
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/user/%s/videos'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # This regex is too permissive, so never claim a URL that any
        # other YouTube extractor already matches.
        for name, klass in globals().items():
            if name.endswith('IE') and klass is not cls and klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)
1821
1822
class YoutubeUserPlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
    # Pure configuration class: page fetching and entry parsing are
    # inherited from YoutubePlaylistsBaseInfoExtractor.
    IE_DESC = 'YouTube.com user playlists'
    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/user/(?P<id>[^/]+)/playlists'
    IE_NAME = 'youtube:user:playlists'

    _TESTS = [{
        'url': 'http://www.youtube.com/user/ThirstForScience/playlists',
        'playlist_mincount': 4,
        'info_dict': {
            'id': 'ThirstForScience',
            'title': 'Thirst for Science',
        },
    }, {
        # with "Load more" button
        'url': 'http://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
        'playlist_mincount': 70,
        'info_dict': {
            'id': 'igorkle1',
            'title': 'Игорь Клейнер',
        },
    }]
1844
1845
class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE):
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    _EXTRA_QUERY_ARGS = {}
    _TESTS = []

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Pages through the HTML search results, collecting unique video
        ids, until n results have been gathered or a page yields nothing
        new.  n may be float('inf') (no limit).
        """
        videos = []

        for pagenum in itertools.count(1):
            url_query = {
                'search_query': query.encode('utf-8'),
                'page': pagenum,
                'spf': 'navigate',
            }
            url_query.update(self._EXTRA_QUERY_ARGS)
            result_url = 'https://www.youtube.com/results?' + compat_urllib_parse.urlencode(url_query)
            data = self._download_json(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % pagenum,
                errnote='Unable to download API page')
            html_content = data[1]['body']['content']

            if 'class="search-message' in html_content:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            new_videos = self._ids_to_results(orderedSet(re.findall(
                r'href="/watch\?v=(.{11})', html_content)))
            videos += new_videos
            # Stop once enough results were collected (>=, not >, so that
            # hitting exactly n does not trigger a useless extra page
            # download) or when the page yielded nothing new.
            if not new_videos or len(videos) >= n:
                break

        # n may be float('inf'), which is not a valid slice index, so
        # only truncate when there is actually a surplus.
        if len(videos) > n:
            videos = videos[:n]
        return self.playlist_result(videos, query)
1889
1890
class YoutubeSearchDateIE(YoutubeSearchIE):
    # Same search implementation, but asks YouTube to sort by upload date.
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = 'YouTube.com searches, newest videos first'
    _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
1896
1897
class YoutubeSearchURLIE(InfoExtractor):
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Scrape a search-results page and expose the hits as a playlist."""
        query = compat_urllib_parse_unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol[^>]+class="item-section"(.*?)</ol>', webpage, 'result HTML')

        entries = []
        # Each result is rendered as an <h3 class="...yt-lockup-title..."> block.
        for item_html in re.findall(
                r'(?s)<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*>(.*?)</h3>', result_code):
            item_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], item_html, 'item title', fatal=False)
            item_href = self._html_search_regex(
                r'(?s)href="([^"]+)"', item_html, 'item URL')
            entries.append({
                '_type': 'url',
                'url': compat_urlparse.urljoin('https://www.youtube.com/', item_href),
                'title': item_title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1939
1940
class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    IE_NAME = 'youtube:show'
    _TESTS = [{
        'url': 'https://www.youtube.com/show/airdisasters',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        # A show is just a collection of season playlists; rewrite the
        # URL to its /playlists listing and reuse the base extraction.
        return super(YoutubeShowIE, self)._real_extract(
            'https://www.youtube.com/show/%s/playlists' % self._match_id(url))
1958
1959
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        page = self._download_webpage(
            'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE)

        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        ids = []
        more_widget_html = content_html = page
        for page_num in itertools.count(1):
            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)

            # 'recommended' feed has infinite 'load more' and each new portion spins
            # the same videos in (sometimes) slightly different order, so we'll check
            # for uniqueness and break when a portion has no new videos.
            # A list comprehension (not filter()) is required here: on
            # Python 3 filter() returns an iterator, which is always
            # truthy, so the emptiness test below would never fire.
            new_ids = [video_id for video_id in orderedSet(matches) if video_id not in ids]
            if not new_ids:
                break

            ids.extend(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        return self.playlist_result(
            self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE)
2007
2008
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater'

    _TESTS = []  # override PlaylistIE tests

    def _real_extract(self, url):
        # The watch-later list is served as the special playlist 'WL';
        # delegate to the inherited playlist extraction.
        return self._extract_playlist('WL')
2018
2019
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page embeds the id of the underlying playlist;
        # pull it out and delegate to the playlist extractor.
        webpage = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        favourites_list_id = self._search_regex(
            r'list=(.+?)["&]', webpage, 'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')
2030
2031
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # Pure configuration class: feed extraction logic lives in
    # YoutubeFeedsInfoExtractor and is parameterized by the attributes below.
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
2037
2038
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    # Pure configuration class: feed extraction logic lives in
    # YoutubeFeedsInfoExtractor and is parameterized by the attributes below.
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
2044
2045
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    # Pure configuration class: feed extraction logic lives in
    # YoutubeFeedsInfoExtractor and is parameterized by the attributes below.
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    # Raw string (r'') like every sibling extractor's pattern; without
    # the prefix, '\.' is an invalid escape sequence that raises a
    # DeprecationWarning on modern Python.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'
2051
2052
class YoutubeTruncatedURLIE(InfoExtractor):
    # Catch-all for watch/attribution URLs whose query string no longer
    # carries a video id — typically because an unquoted '&' in the shell
    # cut the URL short.  Exists only to emit a helpful error.
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
        attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Never extractable: always raise with quoting advice.
        raise ExtractorError(
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .',
            expected=True)
2100
2101
class YoutubeTruncatedIDIE(InfoExtractor):
    # Matches watch URLs whose video id is shorter than the required 11
    # characters (i.e. got truncated); exists only to emit a clear error.
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Never extractable: report the truncated id back to the user.
        video_id = self._match_id(url)
        raise ExtractorError(
            'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url),
            expected=True)