youtube_dl/extractor/youtube.py
1# coding: utf-8
2
3from __future__ import unicode_literals
4
5
6import itertools
7import json
8import os.path
9import random
10import re
11import time
12import traceback
13
14from .common import InfoExtractor, SearchInfoExtractor
15from ..jsinterp import JSInterpreter
16from ..swfinterp import SWFInterpreter
17from ..compat import (
18 compat_chr,
19 compat_parse_qs,
20 compat_urllib_parse,
21 compat_urllib_parse_unquote,
22 compat_urllib_parse_unquote_plus,
23 compat_urllib_parse_urlparse,
24 compat_urlparse,
25 compat_str,
26)
27from ..utils import (
28 clean_html,
29 encode_dict,
30 error_to_compat_str,
31 ExtractorError,
32 float_or_none,
33 get_element_by_attribute,
34 get_element_by_id,
35 int_or_none,
36 mimetype2ext,
37 orderedSet,
38 parse_duration,
39 remove_quotes,
40 remove_start,
41 sanitized_Request,
42 smuggle_url,
43 str_to_int,
44 unescapeHTML,
45 unified_strdate,
46 unsmuggle_url,
47 uppercase_escape,
48 ISO3166Utils,
49)
50
51
52class YoutubeBaseInfoExtractor(InfoExtractor):
53 """Provide base functions for Youtube extractors"""
54 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
55 _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'
56 _NETRC_MACHINE = 'youtube'
57 # If True it will raise an error if no login info is provided
58 _LOGIN_REQUIRED = False
59
60 def _set_language(self):
61 self._set_cookie(
62 '.youtube.com', 'PREF', 'f1=50000000&hl=en',
63 # YouTube sets the expire time to about two months
64 expire_time=time.time() + 2 * 30 * 24 * 3600)
65
66 def _ids_to_results(self, ids):
67 return [
68 self.url_result(vid_id, 'Youtube', video_id=vid_id)
69 for vid_id in ids]
70
71 def _login(self):
72 """
73 Attempt to log in to YouTube.
74 True is returned if successful or skipped.
75 False is returned if login failed.
76
77 If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
78 """
79 (username, password) = self._get_login_info()
80 # No authentication to be performed
81 if username is None:
82 if self._LOGIN_REQUIRED:
83 raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
84 return True
85
86 login_page = self._download_webpage(
87 self._LOGIN_URL, None,
88 note='Downloading login page',
89 errnote='unable to fetch login page', fatal=False)
90 if login_page is False:
91 return
92
93 galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
94 login_page, 'Login GALX parameter')
95
96 # Log in
97 login_form_strs = {
98 'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
99 'Email': username,
100 'GALX': galx,
101 'Passwd': password,
102
103 'PersistentCookie': 'yes',
104 '_utf8': '霱',
105 'bgresponse': 'js_disabled',
106 'checkConnection': '',
107 'checkedDomains': 'youtube',
108 'dnConn': '',
109 'pstMsg': '0',
110 'rmShown': '1',
111 'secTok': '',
112 'signIn': 'Sign in',
113 'timeStmp': '',
114 'service': 'youtube',
115 'uilel': '3',
116 'hl': 'en_US',
117 }
118
119 login_data = compat_urllib_parse.urlencode(encode_dict(login_form_strs)).encode('ascii')
120
121 req = sanitized_Request(self._LOGIN_URL, login_data)
122 login_results = self._download_webpage(
123 req, None,
124 note='Logging in', errnote='unable to log in', fatal=False)
125 if login_results is False:
126 return False
127
128 if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
129 raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)
130
131 # Two-Factor
132 # TODO add SMS and phone call support - these require making a request and then prompting the user
133
134 if re.search(r'(?i)<form[^>]* id="challenge"', login_results) is not None:
135 tfa_code = self._get_tfa_info('2-step verification code')
136
137 if not tfa_code:
138 self._downloader.report_warning(
139 'Two-factor authentication required. Provide it either interactively or with --twofactor <code> '
140 '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
141 return False
142
143 tfa_code = remove_start(tfa_code, 'G-')
144
145 tfa_form_strs = self._form_hidden_inputs('challenge', login_results)
146
147 tfa_form_strs.update({
148 'Pin': tfa_code,
149 'TrustDevice': 'on',
150 })
151
152 tfa_data = compat_urllib_parse.urlencode(encode_dict(tfa_form_strs)).encode('ascii')
153
154 tfa_req = sanitized_Request(self._TWOFACTOR_URL, tfa_data)
155 tfa_results = self._download_webpage(
156 tfa_req, None,
157 note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)
158
159 if tfa_results is False:
160 return False
161
162 if re.search(r'(?i)<form[^>]* id="challenge"', tfa_results) is not None:
163 self._downloader.report_warning('Two-factor code expired or invalid. Please try again, or use a one-use backup code instead.')
164 return False
165 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
166 self._downloader.report_warning('unable to log in - did the page structure change?')
167 return False
168 if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
169 self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
170 return False
171
172 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
173 self._downloader.report_warning('unable to log in: bad username or password')
174 return False
175 return True
176
177 def _real_initialize(self):
178 if self._downloader is None:
179 return
180 self._set_language()
181 if not self._login():
182 return
183
184
185class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
186 # Extract entries from page with "Load more" button
187 def _entries(self, page, playlist_id):
188 more_widget_html = content_html = page
189 for page_num in itertools.count(1):
190 for entry in self._process_page(content_html):
191 yield entry
192
193 mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
194 if not mobj:
195 break
196
197 more = self._download_json(
198 'https://youtube.com/%s' % mobj.group('more'), playlist_id,
199 'Downloading page #%s' % page_num,
200 transform_source=uppercase_escape)
201 content_html = more['content_html']
202 if not content_html.strip():
203 # Some webpages show a "Load more" button but they don't
204 # have more videos
205 break
206 more_widget_html = more['load_more_widget_html']
207
208
209class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
210 def _process_page(self, content):
211 for video_id, video_title in self.extract_videos_from_page(content):
212 yield self.url_result(video_id, 'Youtube', video_id, video_title)
213
214 def extract_videos_from_page(self, page):
215 ids_in_page = []
216 titles_in_page = []
217 for mobj in re.finditer(self._VIDEO_RE, page):
218 # The link with index 0 is not the first video of the playlist (not sure if this is still the case)
219 if 'index' in mobj.groupdict() and mobj.group('id') == '0':
220 continue
221 video_id = mobj.group('id')
222 video_title = unescapeHTML(mobj.group('title'))
223 if video_title:
224 video_title = video_title.strip()
225 try:
226 idx = ids_in_page.index(video_id)
227 if video_title and not titles_in_page[idx]:
228 titles_in_page[idx] = video_title
229 except ValueError:
230 ids_in_page.append(video_id)
231 titles_in_page.append(video_title)
232 return zip(ids_in_page, titles_in_page)
233
234
235class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
236 def _process_page(self, content):
237 for playlist_id in orderedSet(re.findall(r'href="/?playlist\?list=([0-9A-Za-z-_]{10,})"', content)):
238 yield self.url_result(
239 'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist')
240
241 def _real_extract(self, url):
242 playlist_id = self._match_id(url)
243 webpage = self._download_webpage(url, playlist_id)
244 title = self._og_search_title(webpage, fatal=False)
245 return self.playlist_result(self._entries(webpage, playlist_id), playlist_id, title)
246
247
248class YoutubeIE(YoutubeBaseInfoExtractor):
249 IE_DESC = 'YouTube.com'
250 _VALID_URL = r"""(?x)^
251 (
252 (?:https?://|//) # http(s):// or protocol-independent URL
253 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
254 (?:www\.)?deturl\.com/www\.youtube\.com/|
255 (?:www\.)?pwnyoutube\.com/|
256 (?:www\.)?yourepeat\.com/|
257 tube\.majestyc\.net/|
258 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
259 (?:.*?\#/)? # handle anchor (#/) redirect urls
260 (?: # the various things that can precede the ID:
261 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
262 |(?: # or the v= param in all its forms
263 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
264 (?:\?|\#!?) # the params delimiter ? or # or #!
265 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
266 v=
267 )
268 ))
269 |(?:
270 youtu\.be| # just youtu.be/xxxx
271 vid\.plus # or vid.plus/xxxx
272 )/
273 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
274 )
275 )? # all until now is optional -> you can pass the naked ID
276 ([0-9A-Za-z_-]{11}) # here it is! the YouTube video ID
277 (?!.*?&list=) # combined list/video URLs are handled by the playlist IE
278 (?(1).+)? # if we found the ID, everything can follow
279 $"""
280 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
281 _formats = {
282 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
283 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
284 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
285 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
286 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
287 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
288 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
289 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
290 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
291 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
292 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
293 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
294 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
295 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
296 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
297 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
298 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
299 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
300
301
302 # 3D videos
303 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
304 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
305 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
306 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
307 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
308 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
309 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
310
311 # Apple HTTP Live Streaming
312 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
313 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
314 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
315 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
316 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
317 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
318 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
319
320 # DASH mp4 video
321 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
322 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
323 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
324 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
325 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
326 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
327 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
328 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
329 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60, 'preference': -40},
330 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60, 'preference': -40},
331 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
332
333 # Dash mp4 audio
334 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'},
335 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'},
336 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'},
337
338 # Dash webm
339 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
340 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
341 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
342 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
343 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
344 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
345 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9', 'preference': -40},
346 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
347 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
348 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
349 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
350 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
351 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
352 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
353 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
354 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
355 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
356 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},
357 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},
358 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},
359 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
360 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},
361
362 # Dash webm audio
363 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
364 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
365
366 # Dash webm audio with opus inside
367 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
368 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
369 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
370
371 # RTMP (unnamed)
372 '_rtmp': {'protocol': 'rtmp'},
373 }
374 _SUBTITLE_FORMATS = ('ttml', 'vtt')
375
376 IE_NAME = 'youtube'
377 _TESTS = [
378 {
379 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
380 'info_dict': {
381 'id': 'BaW_jenozKc',
382 'ext': 'mp4',
383 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
384 'uploader': 'Philipp Hagemeister',
385 'uploader_id': 'phihag',
386 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/phihag',
387 'upload_date': '20121002',
388 'license': 'Standard YouTube License',
389 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
390 'categories': ['Science & Technology'],
391 'tags': ['youtube-dl'],
392 'like_count': int,
393 'dislike_count': int,
394 'start_time': 1,
395 'end_time': 9,
396 }
397 },
398 {
399 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
400 'note': 'Test generic use_cipher_signature video (#897)',
401 'info_dict': {
402 'id': 'UxxajLWwzqY',
403 'ext': 'mp4',
404 'upload_date': '20120506',
405 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
406 'alt_title': 'I Love It (feat. Charli XCX)',
407 'description': 'md5:f3ceb5ef83a08d95b9d146f973157cc8',
408 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
409 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
410 'iconic ep', 'iconic', 'love', 'it'],
411 'uploader': 'Icona Pop',
412 'uploader_id': 'IconaPop',
413 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/IconaPop',
414 'license': 'Standard YouTube License',
415 'creator': 'Icona Pop',
416 }
417 },
418 {
419 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
420 'note': 'Test VEVO video with age protection (#956)',
421 'info_dict': {
422 'id': '07FYdnEawAQ',
423 'ext': 'mp4',
424 'upload_date': '20130703',
425 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
426 'alt_title': 'Tunnel Vision',
427 'description': 'md5:64249768eec3bc4276236606ea996373',
428 'uploader': 'justintimberlakeVEVO',
429 'uploader_id': 'justintimberlakeVEVO',
430 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO',
431 'license': 'Standard YouTube License',
432 'creator': 'Justin Timberlake',
433 'age_limit': 18,
434 }
435 },
436 {
437 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
438 'note': 'Embed-only video (#1746)',
439 'info_dict': {
440 'id': 'yZIXLfi8CZQ',
441 'ext': 'mp4',
442 'upload_date': '20120608',
443 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
444 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
445 'uploader': 'SET India',
446 'uploader_id': 'setindia',
447 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/setindia',
448 'license': 'Standard YouTube License',
449 'age_limit': 18,
450 }
451 },
452 {
453 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY',
454 'note': 'Use the first video ID in the URL',
455 'info_dict': {
456 'id': 'BaW_jenozKc',
457 'ext': 'mp4',
458 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
459 'uploader': 'Philipp Hagemeister',
460 'uploader_id': 'phihag',
461 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/phihag',
462 'upload_date': '20121002',
463 'license': 'Standard YouTube License',
464 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
465 'categories': ['Science & Technology'],
466 'tags': ['youtube-dl'],
467 'like_count': int,
468 'dislike_count': int,
469 },
470 'params': {
471 'skip_download': True,
472 },
473 },
474 {
475 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
476 'note': '256k DASH audio (format 141) via DASH manifest',
477 'info_dict': {
478 'id': 'a9LDPn-MO4I',
479 'ext': 'm4a',
480 'upload_date': '20121002',
481 'uploader_id': '8KVIDEO',
482 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
483 'description': '',
484 'uploader': '8KVIDEO',
485 'license': 'Standard YouTube License',
486 'title': 'UHDTV TEST 8K VIDEO.mp4'
487 },
488 'params': {
489 'youtube_include_dash_manifest': True,
490 'format': '141',
491 },
492 },
493 # DASH manifest with encrypted signature
494 {
495 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
496 'info_dict': {
497 'id': 'IB3lcPjvWLA',
498 'ext': 'm4a',
499 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
500 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
501 'uploader': 'AfrojackVEVO',
502 'uploader_id': 'AfrojackVEVO',
503 'upload_date': '20131011',
504 'license': 'Standard YouTube License',
505 },
506 'params': {
507 'youtube_include_dash_manifest': True,
508 'format': '141',
509 },
510 },
511 # JS player signature function name containing $
512 {
513 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
514 'info_dict': {
515 'id': 'nfWlot6h_JM',
516 'ext': 'm4a',
517 'title': 'Taylor Swift - Shake It Off',
518 'alt_title': 'Shake It Off',
519 'description': 'md5:95f66187cd7c8b2c13eb78e1223b63c3',
520 'uploader': 'TaylorSwiftVEVO',
521 'uploader_id': 'TaylorSwiftVEVO',
522 'upload_date': '20140818',
523 'license': 'Standard YouTube License',
524 'creator': 'Taylor Swift',
525 },
526 'params': {
527 'youtube_include_dash_manifest': True,
528 'format': '141',
529 },
530 },
531 # Controversy video
532 {
533 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
534 'info_dict': {
535 'id': 'T4XJQO3qol8',
536 'ext': 'mp4',
537 'upload_date': '20100909',
538 'uploader': 'The Amazing Atheist',
539 'uploader_id': 'TheAmazingAtheist',
540 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
541 'license': 'Standard YouTube License',
542 'title': 'Burning Everyone\'s Koran',
543 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
544 }
545 },
546 # Normal age-gate video (No vevo, embed allowed)
547 {
548 'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
549 'info_dict': {
550 'id': 'HtVdAasjOgU',
551 'ext': 'mp4',
552 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
553 'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
554 'uploader': 'The Witcher',
555 'uploader_id': 'WitcherGame',
556 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
557 'upload_date': '20140605',
558 'license': 'Standard YouTube License',
559 'age_limit': 18,
560 },
561 },
562 # Age-gate video with encrypted signature
563 {
564 'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU',
565 'info_dict': {
566 'id': '6kLq3WMV1nU',
567 'ext': 'mp4',
568 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
569 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
570 'uploader': 'LloydVEVO',
571 'uploader_id': 'LloydVEVO',
572 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/LloydVEVO',
573 'upload_date': '20110629',
574 'license': 'Standard YouTube License',
575 'age_limit': 18,
576 },
577 },
578 # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
579 {
580 'url': '__2ABJjxzNo',
581 'info_dict': {
582 'id': '__2ABJjxzNo',
583 'ext': 'mp4',
584 'upload_date': '20100430',
585 'uploader_id': 'deadmau5',
586 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/deadmau5',
587 'creator': 'deadmau5',
588 'description': 'md5:12c56784b8032162bb936a5f76d55360',
589 'uploader': 'deadmau5',
590 'license': 'Standard YouTube License',
591 'title': 'Deadmau5 - Some Chords (HD)',
592 'alt_title': 'Some Chords',
593 },
594 'expected_warnings': [
595 'DASH manifest missing',
596 ]
597 },
598 # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
599 {
600 'url': 'lqQg6PlCWgI',
601 'info_dict': {
602 'id': 'lqQg6PlCWgI',
603 'ext': 'mp4',
604 'upload_date': '20150827',
605 'uploader_id': 'olympic',
606 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/olympic',
607 'license': 'Standard YouTube License',
608 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
609 'uploader': 'Olympics',
610 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
611 },
612 'params': {
613 'skip_download': 'requires avconv',
614 }
615 },
616 # Non-square pixels
617 {
618 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
619 'info_dict': {
620 'id': '_b-2C3KPAM0',
621 'ext': 'mp4',
622 'stretched_ratio': 16 / 9.,
623 'upload_date': '20110310',
624 'uploader_id': 'AllenMeow',
625 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
626 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
627 'uploader': '孫艾倫',
628 'license': 'Standard YouTube License',
629 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
630 },
631 },
632 # url_encoded_fmt_stream_map is empty string
633 {
634 'url': 'qEJwOuvDf7I',
635 'info_dict': {
636 'id': 'qEJwOuvDf7I',
637 'ext': 'webm',
638 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
639 'description': '',
640 'upload_date': '20150404',
641 'uploader_id': 'spbelect',
642 'uploader': 'Наблюдатели Петербурга',
643 },
644 'params': {
645 'skip_download': 'requires avconv',
646 },
647 'skip': 'This live event has ended.',
648 },
649 # Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097)
650 {
651 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
652 'info_dict': {
653 'id': 'FIl7x6_3R5Y',
654 'ext': 'mp4',
655 'title': 'md5:7b81415841e02ecd4313668cde88737a',
656 'description': 'md5:116377fd2963b81ec4ce64b542173306',
657 'upload_date': '20150625',
658 'uploader_id': 'dorappi2000',
659 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
660 'uploader': 'dorappi2000',
661 'license': 'Standard YouTube License',
662 'formats': 'mincount:33',
663 },
664 },
665 # DASH manifest with segment_list
666 {
667 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
668 'md5': '8ce563a1d667b599d21064e982ab9e31',
669 'info_dict': {
670 'id': 'CsmdDsKjzN8',
671 'ext': 'mp4',
672 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
673 'uploader': 'Airtek',
674 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
675 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
676 'license': 'Standard YouTube License',
677 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
678 },
679 'params': {
680 'youtube_include_dash_manifest': True,
681 'format': '135', # bestvideo
682 }
683 },
684 {
685 # Multifeed videos (multiple cameras), URL is for Main Camera
686 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
687 'info_dict': {
688 'id': 'jqWvoWXjCVs',
689 'title': 'teamPGP: Rocket League Noob Stream',
690 'description': 'md5:dc7872fb300e143831327f1bae3af010',
691 },
692 'playlist': [{
693 'info_dict': {
694 'id': 'jqWvoWXjCVs',
695 'ext': 'mp4',
696 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
697 'description': 'md5:dc7872fb300e143831327f1bae3af010',
698 'upload_date': '20150721',
699 'uploader': 'Beer Games Beer',
700 'uploader_id': 'beergamesbeer',
701 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
702 'license': 'Standard YouTube License',
703 },
704 }, {
705 'info_dict': {
706 'id': '6h8e8xoXJzg',
707 'ext': 'mp4',
708 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
709 'description': 'md5:dc7872fb300e143831327f1bae3af010',
710 'upload_date': '20150721',
711 'uploader': 'Beer Games Beer',
712 'uploader_id': 'beergamesbeer',
713 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
714 'license': 'Standard YouTube License',
715 },
716 }, {
717 'info_dict': {
718 'id': 'PUOgX5z9xZw',
719 'ext': 'mp4',
720 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
721 'description': 'md5:dc7872fb300e143831327f1bae3af010',
722 'upload_date': '20150721',
723 'uploader': 'Beer Games Beer',
724 'uploader_id': 'beergamesbeer',
725 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
726 'license': 'Standard YouTube License',
727 },
728 }, {
729 'info_dict': {
730 'id': 'teuwxikvS5k',
731 'ext': 'mp4',
732 'title': 'teamPGP: Rocket League Noob Stream (zim)',
733 'description': 'md5:dc7872fb300e143831327f1bae3af010',
734 'upload_date': '20150721',
735 'uploader': 'Beer Games Beer',
736 'uploader_id': 'beergamesbeer',
737 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
738 'license': 'Standard YouTube License',
739 },
740 }],
741 'params': {
742 'skip_download': True,
743 },
744 },
745 {
746 # Multifeed video with comma in title (see https://github.com/rg3/youtube-dl/issues/8536)
747 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
748 'info_dict': {
749 'id': 'gVfLd0zydlo',
750 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
751 },
752 'playlist_count': 2,
753 },
754 {
755 'url': 'http://vid.plus/FlRa-iH7PGw',
756 'only_matching': True,
757 },
758 {
759 # Title with JS-like syntax "};" (see https://github.com/rg3/youtube-dl/issues/7468)
760 # Also tests cut-off URL expansion in video description (see
761 # https://github.com/rg3/youtube-dl/issues/1892,
762 # https://github.com/rg3/youtube-dl/issues/8164)
763 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
764 'info_dict': {
765 'id': 'lsguqyKfVQg',
766 'ext': 'mp4',
767 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
768 'alt_title': 'Dark Walk',
769 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
770 'upload_date': '20151119',
771 'uploader_id': 'IronSoulElf',
772 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
773 'uploader': 'IronSoulElf',
774 'license': 'Standard YouTube License',
775 'creator': 'Todd Haberman, Daniel Law Heath & Aaron Kaplan',
776 },
777 'params': {
778 'skip_download': True,
779 },
780 },
781 {
782 # Tags with '};' (see https://github.com/rg3/youtube-dl/issues/7468)
783 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
784 'only_matching': True,
785 },
786 {
787 # Video with yt:stretch=17:0
788 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
789 'info_dict': {
790 'id': 'Q39EVAstoRM',
791 'ext': 'mp4',
792 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
793 'description': 'md5:ee18a25c350637c8faff806845bddee9',
794 'upload_date': '20151107',
795 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
796 'uploader': 'CH GAMER DROID',
797 },
798 'params': {
799 'skip_download': True,
800 },
801 },
802 {
803 # Video licensed under Creative Commons
804 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
805 'info_dict': {
806 'id': 'M4gD1WSo5mA',
807 'ext': 'mp4',
808 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
809 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
810 'upload_date': '20150127',
811 'uploader_id': 'BerkmanCenter',
812 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
813 'uploader': 'BerkmanCenter',
814 'license': 'Creative Commons Attribution license (reuse allowed)',
815 },
816 'params': {
817 'skip_download': True,
818 },
819 },
820 {
821 # Channel-like uploader_url
822 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
823 'info_dict': {
824 'id': 'eQcmzGIKrzg',
825 'ext': 'mp4',
826 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
827 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
828 'upload_date': '20151119',
829 'uploader': 'Bernie 2016',
830 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
831 'uploader_url': 're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
832 'license': 'Creative Commons Attribution license (reuse allowed)',
833 },
834 'params': {
835 'skip_download': True,
836 },
837 },
838 {
839 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
840 'only_matching': True,
841 }
842 ]
843
844 def __init__(self, *args, **kwargs):
845 super(YoutubeIE, self).__init__(*args, **kwargs)
846 self._player_cache = {}
847
848 def report_video_info_webpage_download(self, video_id):
849 """Report attempt to download video info webpage."""
850 self.to_screen('%s: Downloading video info webpage' % video_id)
851
852 def report_information_extraction(self, video_id):
853 """Report attempt to extract video information."""
854 self.to_screen('%s: Extracting video information' % video_id)
855
856 def report_unavailable_format(self, video_id, format):
857 """Report extracted video URL."""
858 self.to_screen('%s: Format %s not available' % (video_id, format))
859
860 def report_rtmp_download(self):
861 """Indicate the download will use the RTMP protocol."""
862 self.to_screen('RTMP download detected')
863
864 def _signature_cache_id(self, example_sig):
865 """ Return a string representation of a signature """
866 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
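# Illustrative example (hypothetical signature): for 'AAAAA.BBB.CC' this
# returns '5.3.2' -- only the lengths of the dot-separated parts are kept,
# which is enough to identify a cached deciphering function.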
867
868 def _extract_signature_function(self, video_id, player_url, example_sig):
869 id_m = re.match(
870 r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|/base)?\.(?P<ext>[a-z]+)$',
871 player_url)
872 if not id_m:
873 raise ExtractorError('Cannot identify player %r' % player_url)
874 player_type = id_m.group('ext')
875 player_id = id_m.group('id')
876
877 # Read from filesystem cache
878 func_id = '%s_%s_%s' % (
879 player_type, player_id, self._signature_cache_id(example_sig))
880 assert os.path.basename(func_id) == func_id
881
882 cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
883 if cache_spec is not None:
884 return lambda s: ''.join(s[i] for i in cache_spec)
885
886 download_note = (
887 'Downloading player %s' % player_url
888 if self._downloader.params.get('verbose') else
889 'Downloading %s player %s' % (player_type, player_id)
890 )
891 if player_type == 'js':
892 code = self._download_webpage(
893 player_url, video_id,
894 note=download_note,
895 errnote='Download of %s failed' % player_url)
896 res = self._parse_sig_js(code)
897 elif player_type == 'swf':
898 urlh = self._request_webpage(
899 player_url, video_id,
900 note=download_note,
901 errnote='Download of %s failed' % player_url)
902 code = urlh.read()
903 res = self._parse_sig_swf(code)
904 else:
905 assert False, 'Invalid player type %r' % player_type
906
907 test_string = ''.join(map(compat_chr, range(len(example_sig))))
908 cache_res = res(test_string)
909 cache_spec = [ord(c) for c in cache_res]
910
911 self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
912 return res
913
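# Note: _print_sig_code() reconstructs the deciphering as a chain of index/slice
# expressions over the scrambled string (e.g. 's[2] + s[3:10]', illustrative only)
# so it can be printed when the 'youtube_print_sig_code' param is set;
# gen_sig_code() compresses runs of consecutive indices into slice notation.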
914 def _print_sig_code(self, func, example_sig):
915 def gen_sig_code(idxs):
916 def _genslice(start, end, step):
917 starts = '' if start == 0 else str(start)
918 ends = (':%d' % (end + step)) if end + step >= 0 else ':'
919 steps = '' if step == 1 else (':%d' % step)
920 return 's[%s%s%s]' % (starts, ends, steps)
921
922 step = None
923 # Quell pyflakes warnings - start will be set when step is set
924 start = '(Never used)'
925 for i, prev in zip(idxs[1:], idxs[:-1]):
926 if step is not None:
927 if i - prev == step:
928 continue
929 yield _genslice(start, prev, step)
930 step = None
931 continue
932 if i - prev in [-1, 1]:
933 step = i - prev
934 start = prev
935 continue
936 else:
937 yield 's[%d]' % prev
938 if step is None:
939 yield 's[%d]' % i
940 else:
941 yield _genslice(start, i, step)
942
943 test_string = ''.join(map(compat_chr, range(len(example_sig))))
944 cache_res = func(test_string)
945 cache_spec = [ord(c) for c in cache_res]
946 expr_code = ' + '.join(gen_sig_code(cache_spec))
947 signature_id_tuple = '(%s)' % (
948 ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
949 code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
950 ' return %s\n') % (signature_id_tuple, expr_code)
951 self.to_screen('Extracted signature function:\n' + code)
952
953 def _parse_sig_js(self, jscode):
954 funcname = self._search_regex(
955 r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode,
956 'Initial JS player signature function name')
957
958 jsi = JSInterpreter(jscode)
959 initial_function = jsi.extract_function(funcname)
960 return lambda s: initial_function([s])
961
962 def _parse_sig_swf(self, file_contents):
963 swfi = SWFInterpreter(file_contents)
964 TARGET_CLASSNAME = 'SignatureDecipher'
965 searched_class = swfi.extract_class(TARGET_CLASSNAME)
966 initial_function = swfi.extract_function(searched_class, 'decipher')
967 return lambda s: initial_function([s])
968
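# Note: deciphered signature functions are cached twice -- in self._player_cache
# for the current run (keyed by player_url plus the signature's length pattern)
# and on disk via self._downloader.cache inside _extract_signature_function().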
969 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
970 """Turn the encrypted s field into a working signature"""
971
972 if player_url is None:
973 raise ExtractorError('Cannot decrypt signature without player_url')
974
975 if player_url.startswith('//'):
976 player_url = 'https:' + player_url
977 try:
978 player_id = (player_url, self._signature_cache_id(s))
979 if player_id not in self._player_cache:
980 func = self._extract_signature_function(
981 video_id, player_url, s
982 )
983 self._player_cache[player_id] = func
984 func = self._player_cache[player_id]
985 if self._downloader.params.get('youtube_print_sig_code'):
986 self._print_sig_code(func, s)
987 return func(s)
988 except Exception as e:
989 tb = traceback.format_exc()
990 raise ExtractorError(
991 'Signature extraction failed: ' + tb, cause=e)
992
993 def _get_subtitles(self, video_id, webpage):
994 try:
995 subs_doc = self._download_xml(
996 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
997 video_id, note=False)
998 except ExtractorError as err:
999 self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
1000 return {}
1001
1002 sub_lang_list = {}
1003 for track in subs_doc.findall('track'):
1004 lang = track.attrib['lang_code']
1005 if lang in sub_lang_list:
1006 continue
1007 sub_formats = []
1008 for ext in self._SUBTITLE_FORMATS:
1009 params = compat_urllib_parse.urlencode({
1010 'lang': lang,
1011 'v': video_id,
1012 'fmt': ext,
1013 'name': track.attrib['name'].encode('utf-8'),
1014 })
1015 sub_formats.append({
1016 'url': 'https://www.youtube.com/api/timedtext?' + params,
1017 'ext': ext,
1018 })
1019 sub_lang_list[lang] = sub_formats
1020 if not sub_lang_list:
1021 self._downloader.report_warning('video doesn\'t have subtitles')
1022 return {}
1023 return sub_lang_list
1024
1025 def _get_ytplayer_config(self, video_id, webpage):
1026 patterns = (
1027 # User data may contain arbitrary character sequences that may affect
1028 # JSON extraction with regex, e.g. when '};' is contained the second
1029 # regex won't capture the whole JSON. For now we work around this by trying
1030 # the more specific regex first; proper quoted-string handling, to be
1031 # implemented in the future, will replace this workaround (see
1032 # https://github.com/rg3/youtube-dl/issues/7468,
1033 # https://github.com/rg3/youtube-dl/pull/7599)
1034 r';ytplayer\.config\s*=\s*({.+?});ytplayer',
1035 r';ytplayer\.config\s*=\s*({.+?});',
1036 )
1037 config = self._search_regex(
1038 patterns, webpage, 'ytplayer.config', default=None)
1039 if config:
1040 return self._parse_json(
1041 uppercase_escape(config), video_id, fatal=False)
1042
1043 def _get_automatic_captions(self, video_id, webpage):
1044 """We need the webpage for getting the captions url, pass it as an
1045 argument to speed up the process."""
1046 self.to_screen('%s: Looking for automatic captions' % video_id)
1047 player_config = self._get_ytplayer_config(video_id, webpage)
1048 err_msg = 'Couldn\'t find automatic captions for %s' % video_id
1049 if not player_config:
1050 self._downloader.report_warning(err_msg)
1051 return {}
1052 try:
1053 args = player_config['args']
1054 caption_url = args.get('ttsurl')
1055 if caption_url:
1056 timestamp = args['timestamp']
1057 # We get the available subtitles
1058 list_params = compat_urllib_parse.urlencode({
1059 'type': 'list',
1060 'tlangs': 1,
1061 'asrs': 1,
1062 })
1063 list_url = caption_url + '&' + list_params
1064 caption_list = self._download_xml(list_url, video_id)
1065 original_lang_node = caption_list.find('track')
1066 if original_lang_node is None:
1067 self._downloader.report_warning('Video doesn\'t have automatic captions')
1068 return {}
1069 original_lang = original_lang_node.attrib['lang_code']
1070 caption_kind = original_lang_node.attrib.get('kind', '')
1071
1072 sub_lang_list = {}
1073 for lang_node in caption_list.findall('target'):
1074 sub_lang = lang_node.attrib['lang_code']
1075 sub_formats = []
1076 for ext in self._SUBTITLE_FORMATS:
1077 params = compat_urllib_parse.urlencode({
1078 'lang': original_lang,
1079 'tlang': sub_lang,
1080 'fmt': ext,
1081 'ts': timestamp,
1082 'kind': caption_kind,
1083 })
1084 sub_formats.append({
1085 'url': caption_url + '&' + params,
1086 'ext': ext,
1087 })
1088 sub_lang_list[sub_lang] = sub_formats
1089 return sub_lang_list
1090
1091 # Some videos don't provide ttsurl but rather caption_tracks and
1092 # caption_translation_languages (e.g. 20LmZk1hakA)
1093 caption_tracks = args['caption_tracks']
1094 caption_translation_languages = args['caption_translation_languages']
1095 caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
1096 parsed_caption_url = compat_urlparse.urlparse(caption_url)
1097 caption_qs = compat_parse_qs(parsed_caption_url.query)
1098
1099 sub_lang_list = {}
1100 for lang in caption_translation_languages.split(','):
1101 lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
1102 sub_lang = lang_qs.get('lc', [None])[0]
1103 if not sub_lang:
1104 continue
1105 sub_formats = []
1106 for ext in self._SUBTITLE_FORMATS:
1107 caption_qs.update({
1108 'tlang': [sub_lang],
1109 'fmt': [ext],
1110 })
1111 sub_url = compat_urlparse.urlunparse(parsed_caption_url._replace(
1112 query=compat_urllib_parse.urlencode(caption_qs, True)))
1113 sub_formats.append({
1114 'url': sub_url,
1115 'ext': ext,
1116 })
1117 sub_lang_list[sub_lang] = sub_formats
1118 return sub_lang_list
1119 # An extractor error can be raised by the download process if there are
1120 # no automatic captions but there are subtitles
1121 except (KeyError, ExtractorError):
1122 self._downloader.report_warning(err_msg)
1123 return {}
1124
1125 def _mark_watched(self, video_id, video_info):
1126 playback_url = video_info.get('videostats_playback_base_url', [None])[0]
1127 if not playback_url:
1128 return
1129 parsed_playback_url = compat_urlparse.urlparse(playback_url)
1130 qs = compat_urlparse.parse_qs(parsed_playback_url.query)
1131
1132 # cpn generation algorithm is reverse engineered from base.js.
1133 # In fact it works even with dummy cpn.
1134 CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
1135 cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)))
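# Note: '& 63' keeps only the low 6 bits of each random value, i.e. an index
# into the 64-character alphabet above, yielding a 16-character dummy cpn.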
1136
1137 qs.update({
1138 'ver': ['2'],
1139 'cpn': [cpn],
1140 })
1141 playback_url = compat_urlparse.urlunparse(
1142 parsed_playback_url._replace(query=compat_urllib_parse.urlencode(qs, True)))
1143
1144 self._download_webpage(
1145 playback_url, video_id, 'Marking watched',
1146 'Unable to mark watched', fatal=False)
1147
1148 @classmethod
1149 def extract_id(cls, url):
1150 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
1151 if mobj is None:
1152 raise ExtractorError('Invalid URL: %s' % url)
1153 video_id = mobj.group(2)
1154 return video_id
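# Illustrative example: both 'https://www.youtube.com/watch?v=BaW_jenozKc' and
# the bare ID 'BaW_jenozKc' should yield 'BaW_jenozKc', since everything before
# the 11-character ID in _VALID_URL is optional (group 2 is the ID itself).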
1155
1156 def _extract_from_m3u8(self, manifest_url, video_id):
1157 url_map = {}
1158
1159 def _get_urls(_manifest):
1160 lines = _manifest.split('\n')
1161 urls = filter(lambda l: l and not l.startswith('#'),
1162 lines)
1163 return urls
1164 manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
1165 formats_urls = _get_urls(manifest)
1166 for format_url in formats_urls:
1167 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1168 url_map[itag] = format_url
1169 return url_map
1170
1171 def _extract_annotations(self, video_id):
1172 url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
1173 return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
1174
1175 def _real_extract(self, url):
1176 url, smuggled_data = unsmuggle_url(url, {})
1177
1178 proto = (
1179 'http' if self._downloader.params.get('prefer_insecure', False)
1180 else 'https')
1181
1182 start_time = None
1183 end_time = None
1184 parsed_url = compat_urllib_parse_urlparse(url)
1185 for component in [parsed_url.fragment, parsed_url.query]:
1186 query = compat_parse_qs(component)
1187 if start_time is None and 't' in query:
1188 start_time = parse_duration(query['t'][0])
1189 if start_time is None and 'start' in query:
1190 start_time = parse_duration(query['start'][0])
1191 if end_time is None and 'end' in query:
1192 end_time = parse_duration(query['end'][0])
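# Illustrative example: with the first test URL above ('...&t=1s&end=9'),
# parse_duration() gives start_time=1 and end_time=9; a value like 't=1m30s'
# would give 90.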
1193
1194 # Extract the original video URL from a redirection URL (e.g. age verification) using the next_url parameter
1195 mobj = re.search(self._NEXT_URL_RE, url)
1196 if mobj:
1197 url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
1198 video_id = self.extract_id(url)
1199
1200 # Get video webpage
1201 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
1202 video_webpage = self._download_webpage(url, video_id)
1203
1204 # Attempt to extract SWF player URL
1205 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1206 if mobj is not None:
1207 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1208 else:
1209 player_url = None
1210
1211 dash_mpds = []
1212
1213 def add_dash_mpd(video_info):
1214 dash_mpd = video_info.get('dashmpd')
1215 if dash_mpd and dash_mpd[0] not in dash_mpds:
1216 dash_mpds.append(dash_mpd[0])
1217
1218 # Get video info
1219 embed_webpage = None
1220 is_live = None
1221 if re.search(r'player-age-gate-content">', video_webpage) is not None:
1222 age_gate = True
1223 # We simulate access to the video from www.youtube.com/v/{video_id},
1224 # which can be viewed without logging in to YouTube
1225 url = proto + '://www.youtube.com/embed/%s' % video_id
1226 embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
1227 data = compat_urllib_parse.urlencode({
1228 'video_id': video_id,
1229 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1230 'sts': self._search_regex(
1231 r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
1232 })
1233 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1234 video_info_webpage = self._download_webpage(
1235 video_info_url, video_id,
1236 note='Refetching age-gated info webpage',
1237 errnote='unable to download video info webpage')
1238 video_info = compat_parse_qs(video_info_webpage)
1239 add_dash_mpd(video_info)
1240 else:
1241 age_gate = False
1242 video_info = None
1243 # Try looking directly into the video webpage
1244 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1245 if ytplayer_config:
1246 args = ytplayer_config['args']
1247 if args.get('url_encoded_fmt_stream_map'):
1248 # Convert to the same format returned by compat_parse_qs
1249 video_info = dict((k, [v]) for k, v in args.items())
1250 add_dash_mpd(video_info)
1251 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1252 is_live = True
1253 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1254 # We also try looking in get_video_info since it may contain a different dashmpd
1255 # URL that points to a DASH manifest with a possibly different itag set (some itags
1256 # are missing from the DASH manifest pointed to by the webpage's dashmpd, some from
1257 # the DASH manifest pointed to by get_video_info's dashmpd).
1258 # The general idea is to take the union of the itags of both DASH manifests (for an
1259 # example of a video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
1260 self.report_video_info_webpage_download(video_id)
1261 for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']:
1262 video_info_url = (
1263 '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1264 % (proto, video_id, el_type))
1265 video_info_webpage = self._download_webpage(
1266 video_info_url,
1267 video_id, note=False,
1268 errnote='unable to download video info webpage')
1269 get_video_info = compat_parse_qs(video_info_webpage)
1270 if get_video_info.get('use_cipher_signature') != ['True']:
1271 add_dash_mpd(get_video_info)
1272 if not video_info:
1273 video_info = get_video_info
1274 if 'token' in get_video_info:
1275 # Different get_video_info requests may report different results, e.g.
1276 # some may report video unavailability, but some may serve it without
1277 # any complaint (see https://github.com/rg3/youtube-dl/issues/7362,
1278 # the original webpage as well as el=info and el=embedded get_video_info
1279 # requests report video unavailability due to geo restriction while
1280 # el=detailpage succeeds and returns valid data). This is probably
1281 # due to YouTube measures against IP ranges of hosting providers.
1282 # We work around this by preferring the first successful video_info containing
1283 # the token, if no such video_info has been found yet.
1284 if 'token' not in video_info:
1285 video_info = get_video_info
1286 break
1287 if 'token' not in video_info:
1288 if 'reason' in video_info:
1289 if 'The uploader has not made this video available in your country.' in video_info['reason']:
1290 regions_allowed = self._html_search_meta('regionsAllowed', video_webpage, default=None)
1291 if regions_allowed:
1292 raise ExtractorError('YouTube said: This video is available in %s only' % (
1293 ', '.join(map(ISO3166Utils.short2full, regions_allowed.split(',')))),
1294 expected=True)
1295 raise ExtractorError(
1296 'YouTube said: %s' % video_info['reason'][0],
1297 expected=True, video_id=video_id)
1298 else:
1299 raise ExtractorError(
1300 '"token" parameter not in video info for unknown reason',
1301 video_id=video_id)
1302
1303 # title
1304 if 'title' in video_info:
1305 video_title = video_info['title'][0]
1306 else:
1307 self._downloader.report_warning('Unable to extract video title')
1308 video_title = '_'
1309
1310 # description
1311 video_description = get_element_by_id("eow-description", video_webpage)
1312 if video_description:
1313 video_description = re.sub(r'''(?x)
1314 <a\s+
1315 (?:[a-zA-Z-]+="[^"]+"\s+)*?
1316 (?:title|href)="([^"]+)"\s+
1317 (?:[a-zA-Z-]+="[^"]+"\s+)*?
1318 class="(?:yt-uix-redirect-link|yt-uix-sessionlink[^"]*)"[^>]*>
1319 [^<]+\.{3}\s*
1320 </a>
1321 ''', r'\1', video_description)
1322 video_description = clean_html(video_description)
1323 else:
1324 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1325 if fd_mobj:
1326 video_description = unescapeHTML(fd_mobj.group(1))
1327 else:
1328 video_description = ''
1329
1330 if 'multifeed_metadata_list' in video_info and not smuggled_data.get('force_singlefeed', False):
1331 if not self._downloader.params.get('noplaylist'):
1332 entries = []
1333 feed_ids = []
1334 multifeed_metadata_list = video_info['multifeed_metadata_list'][0]
1335 for feed in multifeed_metadata_list.split(','):
1336 # The split on comma (,) must happen before unquoting, since unquoted
1337 # textual fields may themselves contain commas (see
1338 # https://github.com/rg3/youtube-dl/issues/8536)
1339 feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
1340 entries.append({
1341 '_type': 'url_transparent',
1342 'ie_key': 'Youtube',
1343 'url': smuggle_url(
1344 '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
1345 {'force_singlefeed': True}),
1346 'title': '%s (%s)' % (video_title, feed_data['title'][0]),
1347 })
1348 feed_ids.append(feed_data['id'][0])
1349 self.to_screen(
1350 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
1351 % (', '.join(feed_ids), video_id))
1352 return self.playlist_result(entries, video_id, video_title, video_description)
1353 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
1354
1355 if 'view_count' in video_info:
1356 view_count = int(video_info['view_count'][0])
1357 else:
1358 view_count = None
1359
1360 # Check for "rental" videos
1361 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1362 raise ExtractorError('"rental" videos not supported')
1363
1364 # Start extracting information
1365 self.report_information_extraction(video_id)
1366
1367 # uploader
1368 if 'author' not in video_info:
1369 raise ExtractorError('Unable to extract uploader name')
1370 video_uploader = compat_urllib_parse_unquote_plus(video_info['author'][0])
1371
1372 # uploader_id
1373 video_uploader_id = None
1374 video_uploader_url = None
1375 mobj = re.search(
1376 r'<link itemprop="url" href="(?P<uploader_url>https?://www.youtube.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
1377 video_webpage)
1378 if mobj is not None:
1379 video_uploader_id = mobj.group('uploader_id')
1380 video_uploader_url = mobj.group('uploader_url')
1381 else:
1382 self._downloader.report_warning('unable to extract uploader nickname')
1383
1384 # thumbnail image
1385 # We first try to get a high-quality image:
1386 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1387 video_webpage, re.DOTALL)
1388 if m_thumb is not None:
1389 video_thumbnail = m_thumb.group(1)
1390 elif 'thumbnail_url' not in video_info:
1391 self._downloader.report_warning('unable to extract video thumbnail')
1392 video_thumbnail = None
1393 else: # don't panic if we can't find it
1394 video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])
1395
1396 # upload date
1397 upload_date = self._html_search_meta(
1398 'datePublished', video_webpage, 'upload date', default=None)
1399 if not upload_date:
1400 upload_date = self._search_regex(
1401 [r'(?s)id="eow-date.*?>(.*?)</span>',
1402 r'id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live|Started) on (.+?)</strong>'],
1403 video_webpage, 'upload date', default=None)
1404 if upload_date:
1405 upload_date = ' '.join(re.sub(r'[/,-]', r' ', upload_date).split())
1406 upload_date = unified_strdate(upload_date)
1407
1408 video_license = self._html_search_regex(
1409 r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
1410 video_webpage, 'license', default=None)
1411
1412 m_music = re.search(
1413 r'<h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*<ul[^>]*>\s*<li>(?P<title>.+?) by (?P<creator>.+?)(?:\(.+?\))?</li',
1414 video_webpage)
1415 if m_music:
1416 video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
1417 video_creator = clean_html(m_music.group('creator'))
1418 else:
1419 video_alt_title = video_creator = None
1420
1421 m_cat_container = self._search_regex(
1422 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
1423 video_webpage, 'categories', default=None)
1424 if m_cat_container:
1425 category = self._html_search_regex(
1426 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
1427 default=None)
1428 video_categories = None if category is None else [category]
1429 else:
1430 video_categories = None
1431
1432 video_tags = [
1433 unescapeHTML(m.group('content'))
1434 for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
1435
1436 def _extract_count(count_name):
1437 return str_to_int(self._search_regex(
1438 r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
1439 % re.escape(count_name),
1440 video_webpage, count_name, default=None))
1441
1442 like_count = _extract_count('like')
1443 dislike_count = _extract_count('dislike')
1444
1445 # subtitles
1446 video_subtitles = self.extract_subtitles(video_id, video_webpage)
1447 automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
1448
1449 if 'length_seconds' not in video_info:
1450 self._downloader.report_warning('unable to extract video duration')
1451 video_duration = None
1452 else:
1453 video_duration = int(compat_urllib_parse_unquote_plus(video_info['length_seconds'][0]))
1454
1455 # annotations
1456 video_annotations = None
1457 if self._downloader.params.get('writeannotations', False):
1458 video_annotations = self._extract_annotations(video_id)
1459
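# Helper: turn an {itag: URL} map (e.g. parsed from an HLS manifest) into
# format dicts, merging the static per-itag metadata from self._formats.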
1460 def _map_to_format_list(urlmap):
1461 formats = []
1462 for itag, video_real_url in urlmap.items():
1463 dct = {
1464 'format_id': itag,
1465 'url': video_real_url,
1466 'player_url': player_url,
1467 }
1468 if itag in self._formats:
1469 dct.update(self._formats[itag])
1470 formats.append(dct)
1471 return formats
1472
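# Three delivery mechanisms are handled below: RTMP ('conn'), progressive and
# adaptive HTTP streams ('url_encoded_fmt_stream_map' / 'adaptive_fmts'), and
# HLS manifests ('hlsvp') for live content.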
1473 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1474 self.report_rtmp_download()
1475 formats = [{
1476 'format_id': '_rtmp',
1477 'protocol': 'rtmp',
1478 'url': video_info['conn'][0],
1479 'player_url': player_url,
1480 }]
1481 elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1:
1482 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
1483 if 'rtmpe%3Dyes' in encoded_url_map:
1484 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
1485 formats_spec = {}
1486 fmt_list = video_info.get('fmt_list', [''])[0]
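# fmt_list entries look roughly like '22/1280x720/9/0/115' (illustrative);
# only the leading itag and the WxH resolution are used here.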
1487 if fmt_list:
1488 for fmt in fmt_list.split(','):
1489 spec = fmt.split('/')
1490 if len(spec) > 1:
1491 width_height = spec[1].split('x')
1492 if len(width_height) == 2:
1493 formats_spec[spec[0]] = {
1494 'resolution': spec[1],
1495 'width': int_or_none(width_height[0]),
1496 'height': int_or_none(width_height[1]),
1497 }
1498 formats = []
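# Each comma-separated entry is itself a URL-encoded query string, roughly of
# the form itag=22&url=https%3A%2F%2F...&type=video%2Fmp4&quality=hd720
# (illustrative only).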
1499 for url_data_str in encoded_url_map.split(','):
1500 url_data = compat_parse_qs(url_data_str)
1501 if 'itag' not in url_data or 'url' not in url_data:
1502 continue
1503 format_id = url_data['itag'][0]
1504 url = url_data['url'][0]
1505
1506 if 'sig' in url_data:
1507 url += '&signature=' + url_data['sig'][0]
1508 elif 's' in url_data:
1509 encrypted_sig = url_data['s'][0]
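# Encrypted signatures ('s') must be decrypted with the cipher defined in the
# JS/SWF player, so the player URL is located first (watch or embed page assets).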
1510 ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'
1511
1512 jsplayer_url_json = self._search_regex(
1513 ASSETS_RE,
1514 embed_webpage if age_gate else video_webpage,
1515 'JS player URL (1)', default=None)
1516 if not jsplayer_url_json and not age_gate:
1517 # We need the embed website after all
1518 if embed_webpage is None:
1519 embed_url = proto + '://www.youtube.com/embed/%s' % video_id
1520 embed_webpage = self._download_webpage(
1521 embed_url, video_id, 'Downloading embed webpage')
1522 jsplayer_url_json = self._search_regex(
1523 ASSETS_RE, embed_webpage, 'JS player URL')
1524
1525 player_url = json.loads(jsplayer_url_json)
1526 if player_url is None:
1527 player_url_json = self._search_regex(
1528 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
1529 video_webpage, 'age gate player URL')
1530 player_url = json.loads(player_url_json)
1531
1532 if self._downloader.params.get('verbose'):
1533 if player_url is None:
1534 player_version = 'unknown'
1535 player_desc = 'unknown'
1536 else:
1537 if player_url.endswith('swf'):
1538 player_version = self._search_regex(
1539 r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
1540 'flash player', fatal=False)
1541 player_desc = 'flash player %s' % player_version
1542 else:
1543 player_version = self._search_regex(
1544 [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', r'(?:www|player)-([^/]+)/base\.js'],
1545 player_url,
1546 'html5 player', fatal=False)
1547 player_desc = 'html5 player %s' % player_version
1548
1549 parts_sizes = self._signature_cache_id(encrypted_sig)
1550 self.to_screen('{%s} signature length %s, %s' %
1551 (format_id, parts_sizes, player_desc))
1552
1553 signature = self._decrypt_signature(
1554 encrypted_sig, video_id, player_url, age_gate)
1555 url += '&signature=' + signature
1556 if 'ratebypass' not in url:
1557 url += '&ratebypass=yes'
1558
1559 dct = {
1560 'format_id': format_id,
1561 'url': url,
1562 'player_url': player_url,
1563 }
1564 if format_id in self._formats:
1565 dct.update(self._formats[format_id])
1566 if format_id in formats_spec:
1567 dct.update(formats_spec[format_id])
1568
1569 # Some itags are not included in the DASH manifest, so the corresponding
1570 # formats would lack metadata (see https://github.com/rg3/youtube-dl/pull/5993).
1571 # Try to extract the metadata from the url_encoded_fmt_stream_map entry instead.
1572 mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
1573 width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
1574
1575 more_fields = {
1576 'filesize': int_or_none(url_data.get('clen', [None])[0]),
1577 'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000),
1578 'width': width,
1579 'height': height,
1580 'fps': int_or_none(url_data.get('fps', [None])[0]),
1581 'format_note': url_data.get('quality_label', [None])[0] or url_data.get('quality', [None])[0],
1582 }
1583 for key, value in more_fields.items():
1584 if value:
1585 dct[key] = value
1586 type_ = url_data.get('type', [None])[0]
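# type_ is a MIME type with an optional codecs parameter, e.g.
# 'video/mp4; codecs="avc1.64001F, mp4a.40.2"' (illustrative); it is used to
# derive ext, acodec and vcodec below.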
1587 if type_:
1588 type_split = type_.split(';')
1589 kind_ext = type_split[0].split('/')
1590 if len(kind_ext) == 2:
1591 kind, _ = kind_ext
1592 dct['ext'] = mimetype2ext(type_split[0])
1593 if kind in ('audio', 'video'):
1594 codecs = None
1595 for mobj in re.finditer(
1596 r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
1597 if mobj.group('key') == 'codecs':
1598 codecs = mobj.group('val')
1599 break
1600 if codecs:
1601 codecs = codecs.split(',')
1602 if len(codecs) == 2:
1603 acodec, vcodec = codecs[1], codecs[0]
1604 else:
1605 acodec, vcodec = (codecs[0], 'none') if kind == 'audio' else ('none', codecs[0])
1606 dct.update({
1607 'acodec': acodec,
1608 'vcodec': vcodec,
1609 })
1610 formats.append(dct)
1611 elif video_info.get('hlsvp'):
1612 manifest_url = video_info['hlsvp'][0]
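# Live streams are served through an HLS master playlist ('hlsvp') rather than
# itag URL maps.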
1613 url_map = self._extract_from_m3u8(manifest_url, video_id)
1614 formats = _map_to_format_list(url_map)
1615 # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
1616 for a_format in formats:
1617 a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
1618 else:
1619 unavailable_message = self._html_search_regex(
1620 r'(?s)<h1[^>]+id="unavailable-message"[^>]*>(.+?)</h1>',
1621 video_webpage, 'unavailable message', default=None)
1622 if unavailable_message:
1623 raise ExtractorError(unavailable_message, expected=True)
1624 raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
1625
1626 # Look for the DASH manifest
1627 if self._downloader.params.get('youtube_include_dash_manifest', True):
1628 dash_mpd_fatal = True
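# Try every candidate DASH manifest URL; formats found in an earlier manifest
# are never overwritten by a later one.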
1629 for mpd_url in dash_mpds:
1630 dash_formats = {}
1631 try:
1632 def decrypt_sig(mobj):
1633 s = mobj.group(1)
1634 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
1635 return '/signature/%s' % dec_s
1636
1637 mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)
1638
1639 for df in self._extract_mpd_formats(
1640 mpd_url, video_id, fatal=dash_mpd_fatal,
1641 formats_dict=self._formats):
1642 # Do not overwrite DASH format found in some previous DASH manifest
1643 if df['format_id'] not in dash_formats:
1644 dash_formats[df['format_id']] = df
1645 # Additional DASH manifests may result in HTTP Error 403, therefore
1646 # allow them to fail without a bug report message if some DASH
1647 # manifest has already succeeded. This is a temporary workaround to
1648 # reduce the burst of bug reports until we figure out the reason and
1649 # whether it can be fixed at all.
1650 dash_mpd_fatal = False
1651 except (ExtractorError, KeyError) as e:
1652 self.report_warning(
1653 'Skipping DASH manifest: %r' % e, video_id)
1654 if dash_formats:
1655 # Remove the formats found through non-DASH extraction; they
1656 # contain less info and it can be wrong because we use
1657 # fixed values (for example the resolution). See
1658 # https://github.com/rg3/youtube-dl/issues/5774 for an
1659 # example.
1660 formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
1661 formats.extend(dash_formats.values())
1662
1663 # Check for malformed aspect ratio
1664 stretched_m = re.search(
1665 r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
1666 video_webpage)
1667 if stretched_m:
1668 w = float(stretched_m.group('w'))
1669 h = float(stretched_m.group('h'))
1670 # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
1671 # We will only process correct ratios.
1672 if w > 0 and h > 0:
1673 ratio = w / h
1674 for f in formats:
1675 if f.get('vcodec') != 'none':
1676 f['stretched_ratio'] = ratio
1677
1678 self._sort_formats(formats)
1679
1680 self.mark_watched(video_id, video_info)
1681
1682 return {
1683 'id': video_id,
1684 'uploader': video_uploader,
1685 'uploader_id': video_uploader_id,
1686 'uploader_url': video_uploader_url,
1687 'upload_date': upload_date,
1688 'license': video_license,
1689 'creator': video_creator,
1690 'title': video_title,
1691 'alt_title': video_alt_title,
1692 'thumbnail': video_thumbnail,
1693 'description': video_description,
1694 'categories': video_categories,
1695 'tags': video_tags,
1696 'subtitles': video_subtitles,
1697 'automatic_captions': automatic_captions,
1698 'duration': video_duration,
1699 'age_limit': 18 if age_gate else 0,
1700 'annotations': video_annotations,
1701 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
1702 'view_count': view_count,
1703 'like_count': like_count,
1704 'dislike_count': dislike_count,
1705 'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
1706 'formats': formats,
1707 'is_live': is_live,
1708 'start_time': start_time,
1709 'end_time': end_time,
1710 }
1711
1712
1713class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
1714 IE_DESC = 'YouTube.com playlists'
1715 _VALID_URL = r"""(?x)(?:
1716 (?:https?://)?
1717 (?:\w+\.)?
1718 youtube\.com/
1719 (?:
1720 (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
1721 \? (?:.*?[&;])*? (?:p|a|list)=
1722 | p/
1723 )
1724 (
1725 (?:PL|LL|EC|UU|FL|RD|UL)?[0-9A-Za-z-_]{10,}
1726 # Top tracks, they can also include dots
1727 |(?:MC)[\w\.]*
1728 )
1729 .*
1730 |
1731 ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,})
1732 )"""
1733 _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
1734 _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?'
1735 IE_NAME = 'youtube:playlist'
1736 _TESTS = [{
1737 'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
1738 'info_dict': {
1739 'title': 'ytdl test PL',
1740 'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
1741 },
1742 'playlist_count': 3,
1743 }, {
1744 'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
1745 'info_dict': {
1746 'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
1747 'title': 'YDL_Empty_List',
1748 },
1749 'playlist_count': 0,
1750 }, {
1751 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
1752 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
1753 'info_dict': {
1754 'title': '29C3: Not my department',
1755 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
1756 },
1757 'playlist_count': 95,
1758 }, {
1759 'note': 'issue #673',
1760 'url': 'PLBB231211A4F62143',
1761 'info_dict': {
1762 'title': '[OLD]Team Fortress 2 (Class-based LP)',
1763 'id': 'PLBB231211A4F62143',
1764 },
1765 'playlist_mincount': 26,
1766 }, {
1767 'note': 'Large playlist',
1768 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
1769 'info_dict': {
1770 'title': 'Uploads from Cauchemar',
1771 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
1772 },
1773 'playlist_mincount': 799,
1774 }, {
1775 'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
1776 'info_dict': {
1777 'title': 'YDL_safe_search',
1778 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
1779 },
1780 'playlist_count': 2,
1781 }, {
1782 'note': 'embedded',
1783 'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
1784 'playlist_count': 4,
1785 'info_dict': {
1786 'title': 'JODA15',
1787 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
1788 }
1789 }, {
1790 'note': 'Embedded SWF player',
1791 'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
1792 'playlist_count': 4,
1793 'info_dict': {
1794 'title': 'JODA7',
1795 'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
1796 }
1797 }, {
1798 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
1799 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
1800 'info_dict': {
1801 'title': 'Uploads from Interstellar Movie',
1802 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
1803 },
1804 'playlist_mincount': 21,
1805 }]
1806
1807 def _real_initialize(self):
1808 self._login()
1809
1810 def _extract_mix(self, playlist_id):
1811 # Mixes are generated from a single video;
1812 # the playlist id is just 'RD' + video_id
1813 url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
1814 webpage = self._download_webpage(
1815 url, playlist_id, 'Downloading Youtube mix')
1816 search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
1817 title_span = (
1818 search_title('playlist-title') or
1819 search_title('title long-title') or
1820 search_title('title'))
1821 title = clean_html(title_span)
1822 ids = orderedSet(re.findall(
1823 r'''(?xs)data-video-username=".*?".*?
1824 href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
1825 webpage))
1826 url_results = self._ids_to_results(ids)
1827
1828 return self.playlist_result(url_results, playlist_id, title)
1829
1830 def _extract_playlist(self, playlist_id):
1831 url = self._TEMPLATE_URL % playlist_id
1832 page = self._download_webpage(url, playlist_id)
1833
1834 for match in re.findall(r'<div class="yt-alert-message">([^<]+)</div>', page):
1835 match = match.strip()
1836 # Check if the playlist exists or is private
1837 if re.match(r'[^<]*(The|This) playlist (does not exist|is private)[^<]*', match):
1838 raise ExtractorError(
1839 'The playlist doesn\'t exist or is private, use --username or '
1840 '--netrc to access it.',
1841 expected=True)
1842 elif re.match(r'[^<]*Invalid parameters[^<]*', match):
1843 raise ExtractorError(
1844 'Invalid parameters. Maybe URL is incorrect.',
1845 expected=True)
1846 elif re.match(r'[^<]*Choose your language[^<]*', match):
1847 continue
1848 else:
1849 self.report_warning('Youtube gives an alert message: ' + match)
1850
1851 playlist_title = self._html_search_regex(
1852 r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',
1853 page, 'title')
1854
1855 return self.playlist_result(self._entries(page, playlist_id), playlist_id, playlist_title)
1856
1857 def _check_download_just_video(self, url, playlist_id):
1858 # Check if it's a video-specific URL
1859 query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
1860 if 'v' in query_dict:
1861 video_id = query_dict['v'][0]
1862 if self._downloader.params.get('noplaylist'):
1863 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
1864 return self.url_result(video_id, 'Youtube', video_id=video_id)
1865 else:
1866 self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
1867
1868 def _real_extract(self, url):
1869 # Extract playlist id
1870 mobj = re.match(self._VALID_URL, url)
1871 if mobj is None:
1872 raise ExtractorError('Invalid URL: %s' % url)
1873 playlist_id = mobj.group(1) or mobj.group(2)
1874
1875 video = self._check_download_just_video(url, playlist_id)
1876 if video:
1877 return video
1878
1879 if playlist_id.startswith('RD') or playlist_id.startswith('UL'):
1880 # Mixes require a custom extraction process
1881 return self._extract_mix(playlist_id)
1882
1883 return self._extract_playlist(playlist_id)
1884
1885
1886class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
1887 IE_DESC = 'YouTube.com channels'
1888 _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
1889 _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
1890 _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
1891 IE_NAME = 'youtube:channel'
1892 _TESTS = [{
1893 'note': 'paginated channel',
1894 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
1895 'playlist_mincount': 91,
1896 'info_dict': {
1897 'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',
1898 'title': 'Uploads from lex will',
1899 }
1900 }, {
1901 'note': 'Age restricted channel',
1902 # from https://www.youtube.com/user/DeusExOfficial
1903 'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w',
1904 'playlist_mincount': 64,
1905 'info_dict': {
1906 'id': 'UUs0ifCMCm1icqRbqhUINa0w',
1907 'title': 'Uploads from Deus Ex',
1908 },
1909 }]
1910
1911 @classmethod
1912 def suitable(cls, url):
1913 return False if YoutubePlaylistsIE.suitable(url) else super(YoutubeChannelIE, cls).suitable(url)
1914
1915 def _real_extract(self, url):
1916 channel_id = self._match_id(url)
1917
1918 url = self._TEMPLATE_URL % channel_id
1919
1920 # Channel-by-page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
1921 # Work around this by extracting as a playlist if we manage to obtain the channel playlist URL,
1922 # otherwise fall back on channel-by-page extraction
1923 channel_page = self._download_webpage(
1924 url + '?view=57', channel_id,
1925 'Downloading channel page', fatal=False)
1926 if channel_page is False:
1927 channel_playlist_id = False
1928 else:
1929 channel_playlist_id = self._html_search_meta(
1930 'channelId', channel_page, 'channel id', default=None)
1931 if not channel_playlist_id:
1932 channel_playlist_id = self._search_regex(
1933 r'data-(?:channel-external-|yt)id="([^"]+)"',
1934 channel_page, 'channel id', default=None)
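# A channel id of the form 'UC...' maps to its uploads playlist 'UU...',
# which is then handed off to YoutubePlaylistIE.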
1935 if channel_playlist_id and channel_playlist_id.startswith('UC'):
1936 playlist_id = 'UU' + channel_playlist_id[2:]
1937 return self.url_result(
1938 compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')
1939
1940 channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
1941 autogenerated = re.search(r'''(?x)
1942 class="[^"]*?(?:
1943 channel-header-autogenerated-label|
1944 yt-channel-title-autogenerated
1945 )[^"]*"''', channel_page) is not None
1946
1947 if autogenerated:
1948 # The videos are contained in a single page;
1949 # the ajax pages can't be used because they are empty
1950 entries = [
1951 self.url_result(
1952 video_id, 'Youtube', video_id=video_id,
1953 video_title=video_title)
1954 for video_id, video_title in self.extract_videos_from_page(channel_page)]
1955 return self.playlist_result(entries, channel_id)
1956
1957 return self.playlist_result(self._entries(channel_page, channel_id), channel_id)
1958
1959
1960class YoutubeUserIE(YoutubeChannelIE):
1961 IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
1962 _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
1963 _TEMPLATE_URL = 'https://www.youtube.com/user/%s/videos'
1964 IE_NAME = 'youtube:user'
1965
1966 _TESTS = [{
1967 'url': 'https://www.youtube.com/user/TheLinuxFoundation',
1968 'playlist_mincount': 320,
1969 'info_dict': {
1970 'title': 'TheLinuxFoundation',
1971 }
1972 }, {
1973 'url': 'ytuser:phihag',
1974 'only_matching': True,
1975 }]
1976
1977 @classmethod
1978 def suitable(cls, url):
1979 # Don't return True if the url can be extracted with another youtube
1980 # extractor; the regex is too permissive and it would match.
1981 other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
1982 if any(ie.suitable(url) for ie in other_ies):
1983 return False
1984 else:
1985 return super(YoutubeUserIE, cls).suitable(url)
1986
1987
1988class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
1989 IE_DESC = 'YouTube.com user/channel playlists'
1990 _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+)/playlists'
1991 IE_NAME = 'youtube:playlists'
1992
1993 _TESTS = [{
1994 'url': 'http://www.youtube.com/user/ThirstForScience/playlists',
1995 'playlist_mincount': 4,
1996 'info_dict': {
1997 'id': 'ThirstForScience',
1998 'title': 'Thirst for Science',
1999 },
2000 }, {
2001 # with "Load more" button
2002 'url': 'http://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
2003 'playlist_mincount': 70,
2004 'info_dict': {
2005 'id': 'igorkle1',
2006 'title': 'Игорь Клейнер',
2007 },
2008 }, {
2009 'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists',
2010 'playlist_mincount': 17,
2011 'info_dict': {
2012 'id': 'UCiU1dHvZObB2iP6xkJ__Icw',
2013 'title': 'Chem Player',
2014 },
2015 }]
2016
2017
2018class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE):
2019 IE_DESC = 'YouTube.com searches'
2020 # there doesn't appear to be a real limit, for example if you search for
2021 # 'python' you get more than 8,000,000 results
2022 _MAX_RESULTS = float('inf')
2023 IE_NAME = 'youtube:search'
2024 _SEARCH_KEY = 'ytsearch'
2025 _EXTRA_QUERY_ARGS = {}
2026 _TESTS = []
2027
2028 def _get_n_results(self, query, n):
2029 """Get a specified number of results for a query"""
2030
2031 videos = []
2032 limit = n
2033
2034 for pagenum in itertools.count(1):
2035 url_query = {
2036 'search_query': query.encode('utf-8'),
2037 'page': pagenum,
2038 'spf': 'navigate',
2039 }
2040 url_query.update(self._EXTRA_QUERY_ARGS)
2041 result_url = 'https://www.youtube.com/results?' + compat_urllib_parse.urlencode(url_query)
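# With spf=navigate YouTube responds with a JSON (SPF) document; the rendered
# results HTML lives under data[1]['body']['content'].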
2042 data = self._download_json(
2043 result_url, video_id='query "%s"' % query,
2044 note='Downloading page %s' % pagenum,
2045 errnote='Unable to download API page')
2046 html_content = data[1]['body']['content']
2047
2048 if 'class="search-message' in html_content:
2049 raise ExtractorError(
2050 '[youtube] No video results', expected=True)
2051
2052 new_videos = self._ids_to_results(orderedSet(re.findall(
2053 r'href="/watch\?v=(.{11})', html_content)))
2054 videos += new_videos
2055 if not new_videos or len(videos) > limit:
2056 break
2057
2058 if len(videos) > n:
2059 videos = videos[:n]
2060 return self.playlist_result(videos, query)
2061
2062
2063class YoutubeSearchDateIE(YoutubeSearchIE):
2064 IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
2065 _SEARCH_KEY = 'ytsearchdate'
2066 IE_DESC = 'YouTube.com searches, newest videos first'
2067 _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
2068
2069
2070class YoutubeSearchURLIE(InfoExtractor):
2071 IE_DESC = 'YouTube.com search URLs'
2072 IE_NAME = 'youtube:search_url'
2073 _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
2074 _TESTS = [{
2075 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
2076 'playlist_mincount': 5,
2077 'info_dict': {
2078 'title': 'youtube-dl test video',
2079 }
2080 }, {
2081 'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
2082 'only_matching': True,
2083 }]
2084
2085 def _real_extract(self, url):
2086 mobj = re.match(self._VALID_URL, url)
2087 query = compat_urllib_parse_unquote_plus(mobj.group('query'))
2088
2089 webpage = self._download_webpage(url, query)
2090 result_code = self._search_regex(
2091 r'(?s)<ol[^>]+class="item-section"(.*?)</ol>', webpage, 'result HTML')
2092
2093 part_codes = re.findall(
2094 r'(?s)<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*>(.*?)</h3>', result_code)
2095 entries = []
2096 for part_code in part_codes:
2097 part_title = self._html_search_regex(
2098 [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)
2099 part_url_snippet = self._html_search_regex(
2100 r'(?s)href="([^"]+)"', part_code, 'item URL')
2101 part_url = compat_urlparse.urljoin(
2102 'https://www.youtube.com/', part_url_snippet)
2103 entries.append({
2104 '_type': 'url',
2105 'url': part_url,
2106 'title': part_title,
2107 })
2108
2109 return {
2110 '_type': 'playlist',
2111 'entries': entries,
2112 'title': query,
2113 }
2114
2115
2116class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
2117 IE_DESC = 'YouTube.com (multi-season) shows'
2118 _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
2119 IE_NAME = 'youtube:show'
2120 _TESTS = [{
2121 'url': 'https://www.youtube.com/show/airdisasters',
2122 'playlist_mincount': 5,
2123 'info_dict': {
2124 'id': 'airdisasters',
2125 'title': 'Air Disasters',
2126 }
2127 }]
2128
2129 def _real_extract(self, url):
2130 playlist_id = self._match_id(url)
2131 return super(YoutubeShowIE, self)._real_extract(
2132 'https://www.youtube.com/show/%s/playlists' % playlist_id)
2133
2134
2135class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
2136 """
2137 Base class for feed extractors
2138 Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
2139 """
2140 _LOGIN_REQUIRED = True
2141
2142 @property
2143 def IE_NAME(self):
2144 return 'youtube:%s' % self._FEED_NAME
2145
2146 def _real_initialize(self):
2147 self._login()
2148
2149 def _real_extract(self, url):
2150 page = self._download_webpage(
2151 'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE)
2152
2153 # The extraction process is the same as for playlists, but the regex
2154 # for the video ids doesn't contain an index
2155 ids = []
2156 more_widget_html = content_html = page
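# The first pass scrapes the feed page itself; later passes follow the
# "load more" AJAX endpoint, whose JSON provides content_html and
# load_more_widget_html for the next iteration.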
2157 for page_num in itertools.count(1):
2158 matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
2159
2160 # The 'recommended' feed has an infinite 'load more' and each new portion spins
2161 # the same videos in a (sometimes) slightly different order, so we check
2162 # for uniqueness and break when a portion has no new videos
2163 new_ids = filter(lambda video_id: video_id not in ids, orderedSet(matches))
2164 if not new_ids:
2165 break
2166
2167 ids.extend(new_ids)
2168
2169 mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
2170 if not mobj:
2171 break
2172
2173 more = self._download_json(
2174 'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
2175 'Downloading page #%s' % page_num,
2176 transform_source=uppercase_escape)
2177 content_html = more['content_html']
2178 more_widget_html = more['load_more_widget_html']
2179
2180 return self.playlist_result(
2181 self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE)
2182
2183
2184class YoutubeWatchLaterIE(YoutubePlaylistIE):
2185 IE_NAME = 'youtube:watchlater'
2186 IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
2187 _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater'
2188
2189 _TESTS = [{
2190 'url': 'https://www.youtube.com/playlist?list=WL',
2191 'only_matching': True,
2192 }, {
2193 'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL',
2194 'only_matching': True,
2195 }]
2196
2197 def _real_extract(self, url):
2198 video = self._check_download_just_video(url, 'WL')
2199 if video:
2200 return video
2201 return self._extract_playlist('WL')
2202
2203
2204class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
2205 IE_NAME = 'youtube:favorites'
2206 IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
2207 _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
2208 _LOGIN_REQUIRED = True
2209
2210 def _real_extract(self, url):
2211 webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
2212 playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, 'favourites playlist id')
2213 return self.url_result(playlist_id, 'YoutubePlaylist')
2214
2215
2216class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
2217 IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
2218 _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
2219 _FEED_NAME = 'recommended'
2220 _PLAYLIST_TITLE = 'Youtube Recommended videos'
2221
2222
2223class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
2224 IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
2225 _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
2226 _FEED_NAME = 'subscriptions'
2227 _PLAYLIST_TITLE = 'Youtube Subscriptions'
2228
2229
2230class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
2231 IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
2232 _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
2233 _FEED_NAME = 'history'
2234 _PLAYLIST_TITLE = 'Youtube History'
2235
2236
2237class YoutubeTruncatedURLIE(InfoExtractor):
2238 IE_NAME = 'youtube:truncated_url'
2239 IE_DESC = False # Do not list
2240 _VALID_URL = r'''(?x)
2241 (?:https?://)?
2242 (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
2243 (?:watch\?(?:
2244 feature=[a-z_]+|
2245 annotation_id=annotation_[^&]+|
2246 x-yt-cl=[0-9]+|
2247 hl=[^&]*|
2248 t=[0-9]+
2249 )?
2250 |
2251 attribution_link\?a=[^&]+
2252 )
2253 $
2254 '''
2255
2256 _TESTS = [{
2257 'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
2258 'only_matching': True,
2259 }, {
2260 'url': 'http://www.youtube.com/watch?',
2261 'only_matching': True,
2262 }, {
2263 'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
2264 'only_matching': True,
2265 }, {
2266 'url': 'https://www.youtube.com/watch?feature=foo',
2267 'only_matching': True,
2268 }, {
2269 'url': 'https://www.youtube.com/watch?hl=en-GB',
2270 'only_matching': True,
2271 }, {
2272 'url': 'https://www.youtube.com/watch?t=2372',
2273 'only_matching': True,
2274 }]
2275
2276 def _real_extract(self, url):
2277 raise ExtractorError(
2278 'Did you forget to quote the URL? Remember that & is a meta '
2279 'character in most shells, so you want to put the URL in quotes, '
2280 'like youtube-dl '
2281 '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
2282 ' or simply youtube-dl BaW_jenozKc .',
2283 expected=True)
2284
2285
2286class YoutubeTruncatedIDIE(InfoExtractor):
2287 IE_NAME = 'youtube:truncated_id'
2288 IE_DESC = False # Do not list
2289 _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'
2290
2291 _TESTS = [{
2292 'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
2293 'only_matching': True,
2294 }]
2295
2296 def _real_extract(self, url):
2297 video_id = self._match_id(url)
2298 raise ExtractorError(
2299 'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url),
2300 expected=True)