jfr.im git - yt-dlp.git/blame_incremental - youtube

Commit	Line	Data
	1	# coding: utf-8
	2
	3	from __future__ import unicode_literals
	4
	5
	6	import itertools
	7	import json
	8	import os.path
	9	import random
	10	import re
	11	import time
	12	import traceback
	13
	14	from .common import InfoExtractor, SearchInfoExtractor
	15	from ..jsinterp import JSInterpreter
	16	from ..swfinterp import SWFInterpreter
	17	from ..compat import (
	18	compat_chr,
	19	compat_parse_qs,
	20	compat_urllib_parse_unquote,
	21	compat_urllib_parse_unquote_plus,
	22	compat_urllib_parse_urlencode,
	23	compat_urllib_parse_urlparse,
	24	compat_urlparse,
	25	compat_str,
	26	)
	27	from ..utils import (
	28	clean_html,
	29	error_to_compat_str,
	30	ExtractorError,
	31	float_or_none,
	32	get_element_by_attribute,
	33	get_element_by_id,
	34	int_or_none,
	35	mimetype2ext,
	36	orderedSet,
	37	parse_duration,
	38	remove_quotes,
	39	remove_start,
	40	sanitized_Request,
	41	smuggle_url,
	42	str_to_int,
	43	unescapeHTML,
	44	unified_strdate,
	45	unsmuggle_url,
	46	uppercase_escape,
	47	urlencode_postdata,
	48	ISO3166Utils,
	49	)
	50
	51
	52	class YoutubeBaseInfoExtractor(InfoExtractor):
	53	"""Provide base functions for Youtube extractors"""
	54	_LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
	55	_TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'
	56	_NETRC_MACHINE = 'youtube'
	57	# If True it will raise an error if no login info is provided
	58	_LOGIN_REQUIRED = False
	59
	60	def _set_language(self):
	61	self._set_cookie(
	62	'.youtube.com', 'PREF', 'f1=50000000&hl=en',
	63	# YouTube sets the expire time to about two months
	64	expire_time=time.time() + 2 * 30 * 24 * 3600)
	65
	66	def _ids_to_results(self, ids):
	67	return [
	68	self.url_result(vid_id, 'Youtube', video_id=vid_id)
	69	for vid_id in ids]
	70
	71	def _login(self):
	72	"""
	73	Attempt to log in to YouTube.
	74	True is returned if successful or skipped.
	75	False is returned if login failed.
	76
	77	If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
	78	"""
	79	(username, password) = self._get_login_info()
	80	# No authentication to be performed
	81	if username is None:
	82	if self._LOGIN_REQUIRED:
	83	raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
	84	return True
	85
	86	login_page = self._download_webpage(
	87	self._LOGIN_URL, None,
	88	note='Downloading login page',
	89	errnote='unable to fetch login page', fatal=False)
	90	if login_page is False:
	91	return
	92
	93	galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
	94	login_page, 'Login GALX parameter')
	95
	96	# Log in
	97	login_form_strs = {
	98	'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
	99	'Email': username,
	100	'GALX': galx,
	101	'Passwd': password,
	102
	103	'PersistentCookie': 'yes',
	104	'_utf8': '霱',
	105	'bgresponse': 'js_disabled',
	106	'checkConnection': '',
	107	'checkedDomains': 'youtube',
	108	'dnConn': '',
	109	'pstMsg': '0',
	110	'rmShown': '1',
	111	'secTok': '',
	112	'signIn': 'Sign in',
	113	'timeStmp': '',
	114	'service': 'youtube',
	115	'uilel': '3',
	116	'hl': 'en_US',
	117	}
	118
	119	login_data = urlencode_postdata(login_form_strs)
	120
	121	req = sanitized_Request(self._LOGIN_URL, login_data)
	122	login_results = self._download_webpage(
	123	req, None,
	124	note='Logging in', errnote='unable to log in', fatal=False)
	125	if login_results is False:
	126	return False
	127
	128	error_msg = self._html_search_regex(
	129	r'<[^>]+id="errormsg_0_Passwd"[^>]*>([^<]+)<',
	130	login_results, 'error message', default=None)
	131	if error_msg:
	132	raise ExtractorError('Unable to login: %s' % error_msg, expected=True)
	133
	134	if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
	135	raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)
	136
	137	# Two-Factor
	138	# TODO add SMS and phone call support - these require making a request and then prompting the user
	139
	140	if re.search(r'(?i)<form[^>]* id="challenge"', login_results) is not None:
	141	tfa_code = self._get_tfa_info('2-step verification code')
	142
	143	if not tfa_code:
	144	self._downloader.report_warning(
	145	'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
	146	'(Note that only TOTP (Google Authenticator App) codes work at this time.)')
	147	return False
	148
	149	tfa_code = remove_start(tfa_code, 'G-')
	150
	151	tfa_form_strs = self._form_hidden_inputs('challenge', login_results)
	152
	153	tfa_form_strs.update({
	154	'Pin': tfa_code,
	155	'TrustDevice': 'on',
	156	})
	157
	158	tfa_data = urlencode_postdata(tfa_form_strs)
	159
	160	tfa_req = sanitized_Request(self._TWOFACTOR_URL, tfa_data)
	161	tfa_results = self._download_webpage(
	162	tfa_req, None,
	163	note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)
	164
	165	if tfa_results is False:
	166	return False
	167
	168	if re.search(r'(?i)<form[^>]* id="challenge"', tfa_results) is not None:
	169	self._downloader.report_warning('Two-factor code expired or invalid. Please try again, or use a one-use backup code instead.')
	170	return False
	171	if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
	172	self._downloader.report_warning('unable to log in - did the page structure change?')
	173	return False
	174	if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
	175	self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
	176	return False
	177
	178	if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
	179	self._downloader.report_warning('unable to log in: bad username or password')
	180	return False
	181	return True
	182
	183	def _real_initialize(self):
	184	if self._downloader is None:
	185	return
	186	self._set_language()
	187	if not self._login():
	188	return
	189
	190
	191	class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
	192	# Extract entries from page with "Load more" button
	193	def _entries(self, page, playlist_id):
	194	more_widget_html = content_html = page
	195	for page_num in itertools.count(1):
	196	for entry in self._process_page(content_html):
	197	yield entry
	198
	199	mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
	200	if not mobj:
	201	break
	202
	203	more = self._download_json(
	204	'https://youtube.com/%s' % mobj.group('more'), playlist_id,
	205	'Downloading page #%s' % page_num,
	206	transform_source=uppercase_escape)
	207	content_html = more['content_html']
	208	if not content_html.strip():
	209	# Some webpages show a "Load more" button but they don't
	210	# have more videos
	211	break
	212	more_widget_html = more['load_more_widget_html']
	213
	214
	215	class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
	216	def _process_page(self, content):
	217	for video_id, video_title in self.extract_videos_from_page(content):
	218	yield self.url_result(video_id, 'Youtube', video_id, video_title)
	219
	220	def extract_videos_from_page(self, page):
	221	ids_in_page = []
	222	titles_in_page = []
	223	for mobj in re.finditer(self._VIDEO_RE, page):
	224	# The link with index 0 is not the first video of the playlist (not sure if still actual)
	225	if 'index' in mobj.groupdict() and mobj.group('id') == '0':
	226	continue
	227	video_id = mobj.group('id')
	228	video_title = unescapeHTML(mobj.group('title'))
	229	if video_title:
	230	video_title = video_title.strip()
	231	try:
	232	idx = ids_in_page.index(video_id)
	233	if video_title and not titles_in_page[idx]:
	234	titles_in_page[idx] = video_title
	235	except ValueError:
	236	ids_in_page.append(video_id)
	237	titles_in_page.append(video_title)
	238	return zip(ids_in_page, titles_in_page)
	239
	240
	241	class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
	242	def _process_page(self, content):
	243	for playlist_id in orderedSet(re.findall(
	244	r'<h3[^>]+class="[^"]yt-lockup-title[^"]"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"',
	245	content)):
	246	yield self.url_result(
	247	'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist')
	248
	249	def _real_extract(self, url):
	250	playlist_id = self._match_id(url)
	251	webpage = self._download_webpage(url, playlist_id)
	252	title = self._og_search_title(webpage, fatal=False)
	253	return self.playlist_result(self._entries(webpage, playlist_id), playlist_id, title)
	254
	255
	256	class YoutubeIE(YoutubeBaseInfoExtractor):
	257	IE_DESC = 'YouTube.com'
	258	_VALID_URL = r"""(?x)^
	259	(
	260	(?:https?://\|//) # http(s):// or protocol-independent URL
	261	(?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/\|
	262	(?:www\.)?deturl\.com/www\.youtube\.com/\|
	263	(?:www\.)?pwnyoutube\.com/\|
	264	(?:www\.)?yourepeat\.com/\|
	265	tube\.majestyc\.net/\|
	266	youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
	267	(?:.*?\#/)? # handle anchor (#/) redirect urls
	268	(?: # the various things that can precede the ID:
	269	(?:(?:v\|embed\|e)/(?!videoseries)) # v/ or embed/ or e/
	270	\|(?: # or the v= param in all its forms
	271	(?:(?:watch\|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup\|.php) or nothing (like /?v=xxxx)
	272	(?:\?\|\#!?) # the params delimiter ? or # or #!
	273	(?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&v=V36LpHqtcDY)
	274	v=
	275	)
	276	))
	277	\|(?:
	278	youtu\.be\| # just youtu.be/xxxx
	279	vid\.plus\| # or vid.plus/xxxx
	280	zwearz\.com/watch\| # or zwearz.com/watch/xxxx
	281	)/
	282	\|(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
	283	)
	284	)? # all until now is optional -> you can pass the naked ID
	285	([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
	286	(?!.*?&list=) # combined list/video URLs are handled by the playlist IE
	287	(?(1).+)? # if we found the ID, everything can follow
	288	$"""
	289	_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
	290	_formats = {
	291	'5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
	292	'6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
	293	'13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
	294	'17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
	295	'18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
	296	'22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
	297	'34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
	298	'35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
	299	# itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
	300	'36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
	301	'37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
	302	'38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
	303	'43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
	304	'44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
	305	'45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
	306	'46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
	307	'59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
	308	'78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
	309
	310
	311	# 3D videos
	312	'82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
	313	'83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
	314	'84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
	315	'85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
	316	'100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
	317	'101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
	318	'102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
	319
	320	# Apple HTTP Live Streaming
	321	'91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
	322	'92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
	323	'93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
	324	'94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
	325	'95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
	326	'96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
	327	'132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
	328	'151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
	329
	330	# DASH mp4 video
	331	'133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
	332	'134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
	333	'135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
	334	'136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
	335	'137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
	336	'138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
	337	'160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
	338	'264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
	339	'298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60, 'preference': -40},
	340	'299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60, 'preference': -40},
	341	'266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
	342
	343	# Dash mp4 audio
	344	'139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'},
	345	'140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'},
	346	'141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'},
	347	'256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'preference': -50, 'container': 'm4a_dash'},
	348	'258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'preference': -50, 'container': 'm4a_dash'},
	349
	350	# Dash webm
	351	'167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
	352	'168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
	353	'169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
	354	'170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
	355	'218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
	356	'219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
	357	'278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9', 'preference': -40},
	358	'242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
	359	'243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
	360	'244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
	361	'245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
	362	'246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
	363	'247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
	364	'248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
	365	'271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
	366	# itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
	367	'272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
	368	'302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},
	369	'303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},
	370	'308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},
	371	'313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
	372	'315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},
	373
	374	# Dash webm audio
	375	'171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
	376	'172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
	377
	378	# Dash webm audio with opus inside
	379	'249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
	380	'250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
	381	'251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
	382
	383	# RTMP (unnamed)
	384	'_rtmp': {'protocol': 'rtmp'},
	385	}
	386	_SUBTITLE_FORMATS = ('ttml', 'vtt')
	387
	388	IE_NAME = 'youtube'
	389	_TESTS = [
	390	{
	391	'url': 'http://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
	392	'info_dict': {
	393	'id': 'BaW_jenozKc',
	394	'ext': 'mp4',
	395	'title': 'youtube-dl test video "\'/\\ä↭𝕐',
	396	'uploader': 'Philipp Hagemeister',
	397	'uploader_id': 'phihag',
	398	'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/phihag',
	399	'upload_date': '20121002',
	400	'license': 'Standard YouTube License',
	401	'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
	402	'categories': ['Science & Technology'],
	403	'tags': ['youtube-dl'],
	404	'like_count': int,
	405	'dislike_count': int,
	406	'start_time': 1,
	407	'end_time': 9,
	408	}
	409	},
	410	{
	411	'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
	412	'note': 'Test generic use_cipher_signature video (#897)',
	413	'info_dict': {
	414	'id': 'UxxajLWwzqY',
	415	'ext': 'mp4',
	416	'upload_date': '20120506',
	417	'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
	418	'alt_title': 'I Love It (feat. Charli XCX)',
	419	'description': 'md5:f3ceb5ef83a08d95b9d146f973157cc8',
	420	'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
	421	'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
	422	'iconic ep', 'iconic', 'love', 'it'],
	423	'uploader': 'Icona Pop',
	424	'uploader_id': 'IconaPop',
	425	'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/IconaPop',
	426	'license': 'Standard YouTube License',
	427	'creator': 'Icona Pop',
	428	}
	429	},
	430	{
	431	'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
	432	'note': 'Test VEVO video with age protection (#956)',
	433	'info_dict': {
	434	'id': '07FYdnEawAQ',
	435	'ext': 'mp4',
	436	'upload_date': '20130703',
	437	'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
	438	'alt_title': 'Tunnel Vision',
	439	'description': 'md5:64249768eec3bc4276236606ea996373',
	440	'uploader': 'justintimberlakeVEVO',
	441	'uploader_id': 'justintimberlakeVEVO',
	442	'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO',
	443	'license': 'Standard YouTube License',
	444	'creator': 'Justin Timberlake',
	445	'age_limit': 18,
	446	}
	447	},
	448	{
	449	'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
	450	'note': 'Embed-only video (#1746)',
	451	'info_dict': {
	452	'id': 'yZIXLfi8CZQ',
	453	'ext': 'mp4',
	454	'upload_date': '20120608',
	455	'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
	456	'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
	457	'uploader': 'SET India',
	458	'uploader_id': 'setindia',
	459	'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/setindia',
	460	'license': 'Standard YouTube License',
	461	'age_limit': 18,
	462	}
	463	},
	464	{
	465	'url': 'http://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY',
	466	'note': 'Use the first video ID in the URL',
	467	'info_dict': {
	468	'id': 'BaW_jenozKc',
	469	'ext': 'mp4',
	470	'title': 'youtube-dl test video "\'/\\ä↭𝕐',
	471	'uploader': 'Philipp Hagemeister',
	472	'uploader_id': 'phihag',
	473	'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/phihag',
	474	'upload_date': '20121002',
	475	'license': 'Standard YouTube License',
	476	'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
	477	'categories': ['Science & Technology'],
	478	'tags': ['youtube-dl'],
	479	'like_count': int,
	480	'dislike_count': int,
	481	},
	482	'params': {
	483	'skip_download': True,
	484	},
	485	},
	486	{
	487	'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
	488	'note': '256k DASH audio (format 141) via DASH manifest',
	489	'info_dict': {
	490	'id': 'a9LDPn-MO4I',
	491	'ext': 'm4a',
	492	'upload_date': '20121002',
	493	'uploader_id': '8KVIDEO',
	494	'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
	495	'description': '',
	496	'uploader': '8KVIDEO',
	497	'license': 'Standard YouTube License',
	498	'title': 'UHDTV TEST 8K VIDEO.mp4'
	499	},
	500	'params': {

1

# coding: utf-8

2

3

from __future__ import unicode_literals

import itertools

import json

import os.path

import random

import re

import time

import traceback

from .common import InfoExtractor, SearchInfoExtractor

15

from ..jsinterp import JSInterpreter

16

from ..swfinterp import SWFInterpreter

17

from ..compat import (

18

compat_chr,

19

compat_parse_qs,

20

compat_urllib_parse_unquote,

21

compat_urllib_parse_unquote_plus,

22

compat_urllib_parse_urlencode,

23

compat_urllib_parse_urlparse,

compat_urlparse,

compat_str,

)

from ..utils import (

clean_html,

error_to_compat_str,

ExtractorError,

float_or_none,

get_element_by_attribute,

get_element_by_id,

int_or_none,

mimetype2ext,

orderedSet,

parse_duration,

remove_quotes,

remove_start,

sanitized_Request,

smuggle_url,

str_to_int,

unescapeHTML,

unified_strdate,

unsmuggle_url,

uppercase_escape,

urlencode_postdata,

ISO3166Utils,

)

class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Set the ``PREF`` cookie (carrying ``hl=en``) for ``.youtube.com``."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Return a list with one ``url_result`` (ie key 'Youtube') per id in *ids*."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        An ExtractorError is also raised when the login response contains the
        password error element (id "errormsg_0_Passwd").
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Explicit False (not a bare return) so the result matches the
            # documented True/False contract; callers testing truthiness are
            # unaffected.
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        login_data = urlencode_postdata(login_form_strs)

        req = sanitized_Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        error_msg = self._html_search_regex(
            r'<[^>]+id="errormsg_0_Passwd"[^>]*>([^<]+)<',
            login_results, 'error message', default=None)
        if error_msg:
            raise ExtractorError('Unable to login: %s' % error_msg, expected=True)

        # Reached only when the error element is present but no message text
        # could be extracted from it by the search above.
        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="challenge"', login_results) is not None:
            tfa_code = self._get_tfa_info('2-step verification code')

            if not tfa_code:
                # The trailing space keeps "<code>" and "(Note" from running
                # together when the two literals are concatenated.
                self._downloader.report_warning(
                    'Two-factor authentication required. Provide it either interactively or with --twofactor <code> '
                    '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            tfa_code = remove_start(tfa_code, 'G-')

            # presumably the hidden inputs of the form with id "challenge" - verify against _form_hidden_inputs
            tfa_form_strs = self._form_hidden_inputs('challenge', login_results)

            tfa_form_strs.update({
                'Pin': tfa_code,
                'TrustDevice': 'on',
            })

            tfa_data = urlencode_postdata(tfa_form_strs)

            tfa_req = sanitized_Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            # Being served the challenge form again means the code was rejected
            if re.search(r'(?i)<form[^>]* id="challenge"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired or invalid. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        # The login form being present in the response is treated as a failed login
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _real_initialize(self):
        """Set the language cookie and attempt to log in; does nothing without a downloader."""
        if self._downloader is None:
            return
        self._set_language()
        # A failed login is not fatal here, so the result is deliberately ignored
        self._login()

class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
    # Extract entries from page with "Load more" button
    def _entries(self, page, playlist_id):
        """Yield the entries of *page*, then of every continuation it links to.

        Entries come from ``self._process_page``. A continuation is fetched as
        JSON whenever the current widget markup has a
        ``data-uix-load-more-href`` attribute; iteration ends when that
        attribute is absent or the fetched ``content_html`` is blank.
        """
        widget_html = page
        current_html = page
        page_num = 0
        while True:
            page_num += 1
            for entry in self._process_page(current_html):
                yield entry

            load_more = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', widget_html)
            if not load_more:
                return

            next_url = 'https://youtube.com/%s' % load_more.group('more')
            note = 'Downloading page #%s' % page_num
            more = self._download_json(
                next_url, playlist_id, note,
                transform_source=uppercase_escape)
            current_html = more['content_html']
            if not current_html.strip():
                # Some webpages show a "Load more" button but they don't
                # have more videos
                return
            widget_html = more['load_more_widget_html']

213

214

215

class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    """Entry-list extractor whose entries are videos matched by ``self._VIDEO_RE``.

    ``_VIDEO_RE`` is not defined in this class; it is read from ``self``.
    """

    def _process_page(self, content):
        """Yield one 'Youtube' url_result per (id, title) pair found in *content*."""
        for vid, title in self.extract_videos_from_page(content):
            yield self.url_result(vid, 'Youtube', vid, title)

    def extract_videos_from_page(self, page):
        """Return zipped (video id, title) pairs matched by ``self._VIDEO_RE`` in *page*.

        Each id is kept once, at the position of its first match. A later match
        of the same id only supplies its title when the stored one is empty.
        """
        seen_ids = []
        seen_titles = []
        for match in re.finditer(self._VIDEO_RE, page):
            # The link with index 0 is not the first video of the playlist (not sure if still actual)
            # NOTE(review): the test below inspects the 'id' group rather than 'index' - confirm intent
            if 'index' in match.groupdict() and match.group('id') == '0':
                continue
            vid = match.group('id')
            title = unescapeHTML(match.group('title'))
            if title:
                title = title.strip()
            if vid not in seen_ids:
                seen_ids.append(vid)
                seen_titles.append(title)
                continue
            pos = seen_ids.index(vid)
            if title and not seen_titles[pos]:
                seen_titles[pos] = title
        return zip(seen_ids, seen_titles)

239

240

241

class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    """Entry-list extractor whose entries are playlists linked from the page."""

    def _process_page(self, content):
        """Yield a 'YoutubePlaylist' url_result for each playlist id linked in *content*.

        Ids are taken from ``<h3 class="...yt-lockup-title...">`` headings and
        passed through ``orderedSet`` (presumably order-preserving
        de-duplication - verify against utils).
        """
        found_ids = re.findall(
            r'<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"',
            content)
        for list_id in orderedSet(found_ids):
            playlist_url = 'https://www.youtube.com/playlist?list=%s' % list_id
            yield self.url_result(playlist_url, 'YoutubePlaylist')

    def _real_extract(self, url):
        """Download *url* and return a playlist result built from its paged entries."""
        list_id = self._match_id(url)
        page = self._download_webpage(url, list_id)
        # fatal=False is passed so the title lookup is requested as non-fatal
        page_title = self._og_search_title(page, fatal=False)
        entries = self._entries(page, list_id)
        return self.playlist_result(entries, list_id, page_title)

254

255

256

class YoutubeIE(YoutubeBaseInfoExtractor):

257

IE_DESC = 'YouTube.com'

258

_VALID_URL = r"""(?x)^

259

(

260

(?:https?://|//) # http(s):// or protocol-independent URL

261

(?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|

262

(?:www\.)?deturl\.com/www\.youtube\.com/|

263

(?:www\.)?pwnyoutube\.com/|

264

(?:www\.)?yourepeat\.com/|

265

tube\.majestyc\.net/|

266

youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains

267

(?:.*?\#/)? # handle anchor (#/) redirect urls

268

(?: # the various things that can precede the ID:

269

(?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/

270

|(?: # or the v= param in all its forms

271

(?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)

272

(?:\?|\#!?) # the params delimiter ? or # or #!

273

(?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&v=V36LpHqtcDY)

v=

)

))

|(?:

youtu\.be| # just youtu.be/xxxx

279

vid\.plus| # or vid.plus/xxxx

280

zwearz\.com/watch| # or zwearz.com/watch/xxxx

281

)/

282

|(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=

283

)

284

)? # all until now is optional -> you can pass the naked ID

285

([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID

286

(?!.*?&list=) # combined list/video URLs are handled by the playlist IE

287

(?(1).+)? # if we found the ID, everything can follow

288

$"""

289

_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'

290

_formats = {

291

'5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},

292

'6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},

293

'13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},

294

'17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},

295

'18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},

296

'22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},

297

'34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},

298

'35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},

299

# itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well

300

'36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},

301

'37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},

302

'38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},

303

'43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},

304

'44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},

305

'45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},

306

'46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},

307

'59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},

308

'78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},

# 3D videos

'82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},

313

'83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},

314

'84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},

315

'85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},

316

'100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},

317

'101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},

318

'102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},

319

320

# Apple HTTP Live Streaming

321

'91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},

322

'92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},

323

'93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},

324

'94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},

325

'95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},

326

'96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},

327

'132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},

328

'151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},

329

330

# DASH mp4 video

331

'133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},

332

'134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},

333

'135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},

334

'136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},

335

'137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},

336

'138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)

337

'160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},

338

'264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},

339

'298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60, 'preference': -40},

340

'299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60, 'preference': -40},

341

'266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},

342

343

# Dash mp4 audio

344

'139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'},

345

'140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'},

346

'141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'},

347

'256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'preference': -50, 'container': 'm4a_dash'},

348

'258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'preference': -50, 'container': 'm4a_dash'},

349

350

# Dash webm

351

'167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},

352

'168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},

353

'169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},

354

'170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},

355

'218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},

356

'219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},

357

'278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9', 'preference': -40},

358

'242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},

359

'243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},

360

'244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},

361

'245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},

362

'246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},

363

'247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},

364

'248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},

365

'271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},

366

# itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)

367

'272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},

368

'302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},

369

'303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},

370

'308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},

371

'313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},

372

'315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},

373

374

# Dash webm audio

375

'171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},

376

'172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},

377

378

# Dash webm audio with opus inside

379

'249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},

380

'250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},

381

'251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},

382

383

# RTMP (unnamed)

384

'_rtmp': {'protocol': 'rtmp'},

385

}

386

_SUBTITLE_FORMATS = ('ttml', 'vtt')

IE_NAME = 'youtube'

_TESTS = [

{

'url': 'http://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',

'info_dict': {

'id': 'BaW_jenozKc',

'ext': 'mp4',

'title': 'youtube-dl test video "\'/\\ä↭𝕐',

396

'uploader': 'Philipp Hagemeister',

397

'uploader_id': 'phihag',

398

'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/phihag',

399

'upload_date': '20121002',

400

'license': 'Standard YouTube License',

401

'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',

402

'categories': ['Science & Technology'],

403

'tags': ['youtube-dl'],

404

'like_count': int,

405

'dislike_count': int,

'start_time': 1,

'end_time': 9,

}

},

{

'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',

412

'note': 'Test generic use_cipher_signature video (#897)',

'info_dict': {

'id': 'UxxajLWwzqY',

'ext': 'mp4',

'upload_date': '20120506',

417

'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',

418

'alt_title': 'I Love It (feat. Charli XCX)',

419

'description': 'md5:f3ceb5ef83a08d95b9d146f973157cc8',

420

'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',

421

'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',

422

'iconic ep', 'iconic', 'love', 'it'],

423

'uploader': 'Icona Pop',

424

'uploader_id': 'IconaPop',

425

'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/IconaPop',

426

'license': 'Standard YouTube License',

427

'creator': 'Icona Pop',

}

},

{

'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',

432

'note': 'Test VEVO video with age protection (#956)',

'info_dict': {

'id': '07FYdnEawAQ',

'ext': 'mp4',

'upload_date': '20130703',

437

'title': 'Justin Timberlake - Tunnel Vision (Explicit)',

438

'alt_title': 'Tunnel Vision',

439

'description': 'md5:64249768eec3bc4276236606ea996373',

440

'uploader': 'justintimberlakeVEVO',

441

'uploader_id': 'justintimberlakeVEVO',

442

'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO',

443

'license': 'Standard YouTube License',

444

'creator': 'Justin Timberlake',

'age_limit': 18,

}

},

{

'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',

450

'note': 'Embed-only video (#1746)',

'info_dict': {

'id': 'yZIXLfi8CZQ',

'ext': 'mp4',

'upload_date': '20120608',

455

'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',

456

'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',

457

'uploader': 'SET India',

458

'uploader_id': 'setindia',

459

'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/setindia',

460

'license': 'Standard YouTube License',

'age_limit': 18,

}

},

{

'url': 'http://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY',

466

'note': 'Use the first video ID in the URL',

'info_dict': {

'id': 'BaW_jenozKc',

'ext': 'mp4',

'title': 'youtube-dl test video "\'/\\ä↭𝕐',

471

'uploader': 'Philipp Hagemeister',

472

'uploader_id': 'phihag',

473

'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/phihag',

474

'upload_date': '20121002',

475

'license': 'Standard YouTube License',

476

'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',

477

'categories': ['Science & Technology'],

478

'tags': ['youtube-dl'],

479

'like_count': int,

480

'dislike_count': int,

481

},

482

'params': {

483

'skip_download': True,

},

},

{

'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',

488

'note': '256k DASH audio (format 141) via DASH manifest',

'info_dict': {

'id': 'a9LDPn-MO4I',

'ext': 'm4a',

'upload_date': '20121002',

493

'uploader_id': '8KVIDEO',

494

'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',

495

'description': '',

496

'uploader': '8KVIDEO',

497

'license': 'Standard YouTube License',

498

'title': 'UHDTV TEST 8K VIDEO.mp4'

499

},

500

'params': {

501

'youtube_include_dash_manifest': True,

502

'format': '141',

503

},

504

'skip': 'format 141 not served anymore',

505

},

506

# DASH manifest with encrypted signature

507

{

508

'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',

'info_dict': {

'id': 'IB3lcPjvWLA',

'ext': 'm4a',

'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',

513

'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',

514

'uploader': 'AfrojackVEVO',

515

'uploader_id': 'AfrojackVEVO',

516

'upload_date': '20131011',

517

'license': 'Standard YouTube License',

518

},

519

'params': {

520

'youtube_include_dash_manifest': True,

521

'format': '141/bestaudio[ext=m4a]',

522

},

523

},

524

# JS player signature function name containing $

525

{

526

'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',

'info_dict': {

'id': 'nfWlot6h_JM',

'ext': 'm4a',

'title': 'Taylor Swift - Shake It Off',

531

'alt_title': 'Shake It Off',

532

'description': 'md5:95f66187cd7c8b2c13eb78e1223b63c3',

533

'uploader': 'TaylorSwiftVEVO',

534

'uploader_id': 'TaylorSwiftVEVO',

535

'upload_date': '20140818',

536

'license': 'Standard YouTube License',

537

'creator': 'Taylor Swift',

538

},

539

'params': {

540

'youtube_include_dash_manifest': True,

541

'format': '141/bestaudio[ext=m4a]',

},

},

# Controversy video

{

'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',

'info_dict': {

'id': 'T4XJQO3qol8',

'ext': 'mp4',

'upload_date': '20100909',

551

'uploader': 'The Amazing Atheist',

552

'uploader_id': 'TheAmazingAtheist',

553

'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',

554

'license': 'Standard YouTube License',

555

'title': 'Burning Everyone\'s Koran',

556

'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',

557

}

558

},

559

# Normal age-gate video (No vevo, embed allowed)

560

{

561

'url': 'http://youtube.com/watch?v=HtVdAasjOgU',

'info_dict': {

'id': 'HtVdAasjOgU',

'ext': 'mp4',

'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',

566

'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',

567

'uploader': 'The Witcher',

568

'uploader_id': 'WitcherGame',

569

'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/WitcherGame',

570

'upload_date': '20140605',

571

'license': 'Standard YouTube License',

'age_limit': 18,

},

},

# Age-gate video with encrypted signature

576

{

577

'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU',

'info_dict': {

'id': '6kLq3WMV1nU',

'ext': 'mp4',

'title': 'Dedication To My Ex (Miss That) (Lyric Video)',

582

'description': 'md5:33765bb339e1b47e7e72b5490139bb41',

583

'uploader': 'LloydVEVO',

584

'uploader_id': 'LloydVEVO',

585

'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/LloydVEVO',

586

'upload_date': '20110629',

587

'license': 'Standard YouTube License',

'age_limit': 18,

},

},

# video_info is None (https://github.com/rg3/youtube-dl/issues/4421)

592

{

593

'url': '__2ABJjxzNo',

'info_dict': {

'id': '__2ABJjxzNo',

'ext': 'mp4',

'upload_date': '20100430',

598

'uploader_id': 'deadmau5',

599

'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/deadmau5',

600

'creator': 'deadmau5',

601

'description': 'md5:12c56784b8032162bb936a5f76d55360',

602

'uploader': 'deadmau5',

603

'license': 'Standard YouTube License',

604

'title': 'Deadmau5 - Some Chords (HD)',

605

'alt_title': 'Some Chords',

606

},

607

'expected_warnings': [

608

'DASH manifest missing',

609

]

610

},

611

# Olympics (https://github.com/rg3/youtube-dl/issues/4431)

612

{

613

'url': 'lqQg6PlCWgI',

'info_dict': {

'id': 'lqQg6PlCWgI',

'ext': 'mp4',

'upload_date': '20150827',

618

'uploader_id': 'olympic',

619

'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/olympic',

620

'license': 'Standard YouTube License',

621

'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',

622

'uploader': 'Olympics',

623

'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',

624

},

625

'params': {

626

'skip_download': 'requires avconv',

}

},

# Non-square pixels

{

'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',

'info_dict': {

'id': '_b-2C3KPAM0',

'ext': 'mp4',

'stretched_ratio': 16 / 9.,

636

'upload_date': '20110310',

637

'uploader_id': 'AllenMeow',

638

'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/AllenMeow',

639

'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',

640

'uploader': '孫艾倫',

641

'license': 'Standard YouTube License',

642

'title': '[A-made] 變態妍字幕版太妍我就是這樣的人',

643

},

644

},

645

# url_encoded_fmt_stream_map is empty string

646

{

647

'url': 'qEJwOuvDf7I',

'info_dict': {

'id': 'qEJwOuvDf7I',

'ext': 'webm',

'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',

652

'description': '',

653

'upload_date': '20150404',

654

'uploader_id': 'spbelect',

655

'uploader': 'Наблюдатели Петербурга',

656

},

657

'params': {

658

'skip_download': 'requires avconv',

659

},

660

'skip': 'This live event has ended.',

661

},

662

# Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097)

663

{

664

'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',

'info_dict': {

'id': 'FIl7x6_3R5Y',

'ext': 'mp4',

'title': 'md5:7b81415841e02ecd4313668cde88737a',

669

'description': 'md5:116377fd2963b81ec4ce64b542173306',

670

'upload_date': '20150625',

671

'uploader_id': 'dorappi2000',

672

'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/dorappi2000',

673

'uploader': 'dorappi2000',

674

'license': 'Standard YouTube License',

675

'formats': 'mincount:33',

676

},

677

},

678

# DASH manifest with segment_list

679

{

680

'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',

681

'md5': '8ce563a1d667b599d21064e982ab9e31',

'info_dict': {

'id': 'CsmdDsKjzN8',

'ext': 'mp4',

'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510

686

'uploader': 'Airtek',

687

'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',

688

'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',

689

'license': 'Standard YouTube License',

690

'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',

691

},

692

'params': {

693

'youtube_include_dash_manifest': True,

694

'format': '135', # bestvideo

}

},

{

# Multifeed videos (multiple cameras), URL is for Main Camera

699

'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',

700

'info_dict': {

701

'id': 'jqWvoWXjCVs',

702

'title': 'teamPGP: Rocket League Noob Stream',

703

'description': 'md5:dc7872fb300e143831327f1bae3af010',

},

'playlist': [{

'info_dict': {

'id': 'jqWvoWXjCVs',

'ext': 'mp4',

'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',

710

'description': 'md5:dc7872fb300e143831327f1bae3af010',

711

'upload_date': '20150721',

712

'uploader': 'Beer Games Beer',

713

'uploader_id': 'beergamesbeer',

714

'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',

715

'license': 'Standard YouTube License',

},

}, {

'info_dict': {

'id': '6h8e8xoXJzg',

'ext': 'mp4',

'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',

722

'description': 'md5:dc7872fb300e143831327f1bae3af010',

723

'upload_date': '20150721',

724

'uploader': 'Beer Games Beer',

725

'uploader_id': 'beergamesbeer',

726

'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',

727

'license': 'Standard YouTube License',

},

}, {

'info_dict': {

'id': 'PUOgX5z9xZw',

'ext': 'mp4',

'title': 'teamPGP: Rocket League Noob Stream (grizzle)',

734

'description': 'md5:dc7872fb300e143831327f1bae3af010',

735

'upload_date': '20150721',

736

'uploader': 'Beer Games Beer',

737

'uploader_id': 'beergamesbeer',

738

'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',

739

'license': 'Standard YouTube License',

},

}, {

'info_dict': {

'id': 'teuwxikvS5k',

'ext': 'mp4',

'title': 'teamPGP: Rocket League Noob Stream (zim)',

746

'description': 'md5:dc7872fb300e143831327f1bae3af010',

747

'upload_date': '20150721',

748

'uploader': 'Beer Games Beer',

749

'uploader_id': 'beergamesbeer',

750

'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',

751

'license': 'Standard YouTube License',

},

}],

'params': {

'skip_download': True,

},

},

{

# Multifeed video with comma in title (see https://github.com/rg3/youtube-dl/issues/8536)

760

'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',

761

'info_dict': {

762

'id': 'gVfLd0zydlo',

763

'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',

},

'playlist_count': 2,

},

{

'url': 'http://vid.plus/FlRa-iH7PGw',

769

'only_matching': True,

770

},

771

{

772

'url': 'http://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',

773

'only_matching': True,

774

},

775

{

776

# Title with JS-like syntax "};" (see https://github.com/rg3/youtube-dl/issues/7468)

777

# Also tests cut-off URL expansion in video description (see

778

# https://github.com/rg3/youtube-dl/issues/1892,

779

# https://github.com/rg3/youtube-dl/issues/8164)

780

'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',

'info_dict': {

'id': 'lsguqyKfVQg',

'ext': 'mp4',

'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',

785

'alt_title': 'Dark Walk',

786

'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',

787

'upload_date': '20151119',

788

'uploader_id': 'IronSoulElf',

789

'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',

790

'uploader': 'IronSoulElf',

791

'license': 'Standard YouTube License',

792

'creator': 'Todd Haberman, Daniel Law Heath & Aaron Kaplan',

793

},

794

'params': {

795

'skip_download': True,

},

},

{

# Tags with '};' (see https://github.com/rg3/youtube-dl/issues/7468)

800

'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',

801

'only_matching': True,

802

},

803

{

804

# Video with yt:stretch=17:0

805

'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',

'info_dict': {

'id': 'Q39EVAstoRM',

'ext': 'mp4',

'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',

810

'description': 'md5:ee18a25c350637c8faff806845bddee9',

811

'upload_date': '20151107',

812

'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',

813

'uploader': 'CH GAMER DROID',

814

},

815

'params': {

816

'skip_download': True,

},

},

{

# Video licensed under Creative Commons

821

'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',

'info_dict': {

'id': 'M4gD1WSo5mA',

'ext': 'mp4',

'title': 'md5:e41008789470fc2533a3252216f1c1d1',

826

'description': 'md5:a677553cf0840649b731a3024aeff4cc',

827

'upload_date': '20150127',

828

'uploader_id': 'BerkmanCenter',

829

'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',

830

'uploader': 'BerkmanCenter',

831

'license': 'Creative Commons Attribution license (reuse allowed)',

832

},

833

'params': {

834

'skip_download': True,

},

},

{

# Channel-like uploader_url

839

'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',

'info_dict': {

'id': 'eQcmzGIKrzg',

'ext': 'mp4',

'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',

844

'description': 'md5:dda0d780d5a6e120758d1711d062a867',

845

'upload_date': '20151119',

846

'uploader': 'Bernie 2016',

847

'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',

848

'uploader_url': 're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',

849

'license': 'Creative Commons Attribution license (reuse allowed)',

850

},

851

'params': {

852

'skip_download': True,

},

},

{

'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;v=V36LpHqtcDY',

857

'only_matching': True,

}

]

def __init__(self, *args, **kwargs):
    """Initialise the base extractor and start with an empty player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Signature functions already extracted in this session; filled and
    # read by _decrypt_signature.
    self._player_cache = {}

864

865

def report_video_info_webpage_download(self, video_id):
    """Tell the user that the video info webpage is being downloaded."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)

868

869

def report_information_extraction(self, video_id):
    """Tell the user that video information is being extracted."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)

872

873

def report_unavailable_format(self, video_id, format):
    """Tell the user that the given format is not available for the video."""
    message = '%s: Format %s not available' % (video_id, format)
    self.to_screen(message)

876

877

def report_rtmp_download(self):
    """Tell the user that the download will go through the RTMP protocol."""
    message = 'RTMP download detected'
    self.to_screen(message)

880

881

def _signature_cache_id(self, example_sig):
    """Return the lengths of the dot-separated signature parts, joined by dots."""
    part_lengths = [compat_str(len(chunk)) for chunk in example_sig.split('.')]
    return '.'.join(part_lengths)

884

885

def _extract_signature_function(self, video_id, player_url, example_sig):

886

id_m = re.match(

887

r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|/base)?\.(?P<ext>[a-z]+)$',

888

player_url)

889

if not id_m:

890

raise ExtractorError('Cannot identify player %r' % player_url)

891

player_type = id_m.group('ext')

892

player_id = id_m.group('id')

893

894

# Read from filesystem cache

895

func_id = '%s_%s_%s' % (

896

player_type, player_id, self._signature_cache_id(example_sig))

897

assert os.path.basename(func_id) == func_id

898

899

cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)

900

if cache_spec is not None:

901

return lambda s: ''.join(s[i] for i in cache_spec)

902

903

download_note = (

904

'Downloading player %s' % player_url

905

if self._downloader.params.get('verbose') else

906

'Downloading %s player %s' % (player_type, player_id)

907

)

908

if player_type == 'js':

909

code = self._download_webpage(

910

player_url, video_id,

911

note=download_note,

912

errnote='Download of %s failed' % player_url)

913

res = self._parse_sig_js(code)

914

elif player_type == 'swf':

915

urlh = self._request_webpage(

916

player_url, video_id,

917

note=download_note,

918

errnote='Download of %s failed' % player_url)

919

code = urlh.read()

920

res = self._parse_sig_swf(code)

921

else:

922

assert False, 'Invalid player type %r' % player_type

923

924

test_string = ''.join(map(compat_chr, range(len(example_sig))))

925

cache_res = res(test_string)

926

cache_spec = [ord(c) for c in cache_res]

927

928

self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)

929

return res

930

931

def _print_sig_code(self, func, example_sig):
    """Print Python code equivalent to the signature function *func*.

    *func* is applied to a string of distinct characters as long as
    *example_sig*; the resulting character order is then written out as a
    sum of index and slice expressions on ``s`` and shown via to_screen.
    """
    def gen_sig_code(idxs):
        def _genslice(start, end, step):
            stop = end + step
            starts = str(start) if start != 0 else ''
            # A negative stop cannot be written explicitly, leave it open
            ends = ':' if stop < 0 else (':%d' % stop)
            steps = (':%d' % step) if step != 1 else ''
            return 's[%s%s%s]' % (starts, ends, steps)

        step = None
        # Quelch pyflakes warnings - start will be set when step is set
        start = '(Never used)'
        for i, prev in zip(idxs[1:], idxs[:-1]):
            delta = i - prev
            if step is not None:
                # Inside a run: emit it as soon as the stride changes
                if delta != step:
                    yield _genslice(start, prev, step)
                    step = None
                continue
            if delta in (-1, 1):
                # A new ascending or descending run begins at prev
                step = delta
                start = prev
            else:
                yield 's[%d]' % prev
        if step is None:
            yield 's[%d]' % i
        else:
            yield _genslice(start, i, step)

    probe = ''.join(map(compat_chr, range(len(example_sig))))
    positions = [ord(c) for c in func(probe)]
    expr_code = ' + '.join(gen_sig_code(positions))
    part_lengths = ', '.join(compat_str(len(p)) for p in example_sig.split('.'))
    signature_id_tuple = '(%s)' % part_lengths
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            ' return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)

969

970

def _parse_sig_js(self, jscode):
    """Return a callable wrapping the signature function found in *jscode*.

    The function name is located with a regular expression and the function
    itself is extracted through JSInterpreter.
    """
    sig_func_name = self._search_regex(
        r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode,
        'Initial JS player signature function name')

    interpreter = JSInterpreter(jscode)
    js_function = interpreter.extract_function(sig_func_name)

    def decipher(s):
        # The extracted function takes its arguments as a list
        return js_function([s])

    return decipher

978

979

def _parse_sig_swf(self, file_contents):
    """Return a callable wrapping the 'decipher' function of the SWF player.

    The function is looked up in the 'SignatureDecipher' class of
    *file_contents* through SWFInterpreter.
    """
    TARGET_CLASSNAME = 'SignatureDecipher'
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class(TARGET_CLASSNAME)
    swf_function = interpreter.extract_function(decipher_class, 'decipher')

    def decipher(s):
        # The extracted function takes its arguments as a list
        return swf_function([s])

    return decipher

985

986

def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
    """Turn the encrypted s field into a working signature.

    The signature function is looked up in `self._player_cache`, keyed by
    the player URL and `self._signature_cache_id(s)`, and extracted on a
    cache miss. Raises ExtractorError when `player_url` is None or when
    anything fails during extraction or application (the traceback is
    embedded in the message).
    """
    if player_url is None:
        raise ExtractorError('Cannot decrypt signature without player_url')

    # Protocol-relative player URLs are fetched over HTTPS.
    if player_url.startswith('//'):
        player_url = 'https:' + player_url

    try:
        cache_key = (player_url, self._signature_cache_id(s))
        if cache_key not in self._player_cache:
            self._player_cache[cache_key] = self._extract_signature_function(
                video_id, player_url, s)
        sig_func = self._player_cache[cache_key]
        if self._downloader.params.get('youtube_print_sig_code'):
            self._print_sig_code(sig_func, s)
        return sig_func(s)
    except Exception as e:
        raise ExtractorError(
            'Signature extraction failed: ' + traceback.format_exc(), cause=e)

1009

1010

def _get_subtitles(self, video_id, webpage):
    """Return a dict mapping language code to a list of subtitle formats.

    The track list is downloaded from the timedtext service; for every
    distinct `lang_code` one entry per extension in
    `self._SUBTITLE_FORMATS` is produced. An empty dict is returned (after
    a warning) when the list cannot be downloaded or contains no tracks.
    """
    list_url = 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        tracks_doc = self._download_xml(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
        return {}

    subtitles = {}
    for track in tracks_doc.findall('track'):
        lang = track.attrib['lang_code']
        # Only the first track of each language is kept.
        if lang in subtitles:
            continue
        lang_formats = []
        for ext in self._SUBTITLE_FORMATS:
            query = compat_urllib_parse_urlencode({
                'lang': lang,
                'v': video_id,
                'fmt': ext,
                'name': track.attrib['name'].encode('utf-8'),
            })
            lang_formats.append({
                'url': 'https://www.youtube.com/api/timedtext?' + query,
                'ext': ext,
            })
        subtitles[lang] = lang_formats

    if subtitles:
        return subtitles
    self._downloader.report_warning('video doesn\'t have subtitles')
    return {}

def _get_ytplayer_config(self, video_id, webpage):
    """Return the parsed `ytplayer.config` JSON found in `webpage`.

    Returns None when no pattern matches; parsing is non-fatal
    (`fatal=False` is passed to `self._parse_json`).
    """
    # User data may contain arbitrary character sequences that may affect
    # JSON extraction with regex, e.g. when '};' is contained the second
    # regex won't capture the whole JSON. Yet working around by trying more
    # concrete regex first keeping in mind proper quoted string handling
    # to be implemented in future that will replace this workaround (see
    # https://github.com/rg3/youtube-dl/issues/7468,
    # https://github.com/rg3/youtube-dl/pull/7599)
    config_patterns = (
        r';ytplayer\.config\s*=\s*({.+?});ytplayer',
        r';ytplayer\.config\s*=\s*({.+?});',
    )
    raw_config = self._search_regex(
        config_patterns, webpage, 'ytplayer.config', default=None)
    if not raw_config:
        return None
    return self._parse_json(
        uppercase_escape(raw_config), video_id, fatal=False)

1059

1060

def _get_automatic_captions(self, video_id, webpage):
    """Return automatic captions as a dict of language code -> formats.

    The webpage is needed to find the captions URL, so it is passed in to
    avoid downloading it again. Two sources are tried: the `ttsurl` player
    argument, and otherwise `caption_tracks` together with
    `caption_translation_languages`. An empty dict is returned (after a
    warning) when nothing usable is found.
    """
    self.to_screen('%s: Looking for automatic captions' % video_id)
    player_config = self._get_ytplayer_config(video_id, webpage)
    err_msg = 'Couldn\'t find automatic captions for %s' % video_id
    if not player_config:
        self._downloader.report_warning(err_msg)
        return {}
    try:
        args = player_config['args']
        tts_url = args.get('ttsurl')
        if tts_url:
            timestamp = args['timestamp']
            # We get the available subtitles
            list_query = compat_urllib_parse_urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            caption_list = self._download_xml(
                tts_url + '&' + list_query, video_id)
            source_track = caption_list.find('track')
            if source_track is None:
                self._downloader.report_warning('Video doesn\'t have automatic captions')
                return {}
            original_lang = source_track.attrib['lang_code']
            caption_kind = source_track.attrib.get('kind', '')

            captions = {}
            for target in caption_list.findall('target'):
                target_lang = target.attrib['lang_code']
                target_formats = []
                for ext in self._SUBTITLE_FORMATS:
                    query = compat_urllib_parse_urlencode({
                        'lang': original_lang,
                        'tlang': target_lang,
                        'fmt': ext,
                        'ts': timestamp,
                        'kind': caption_kind,
                    })
                    target_formats.append({
                        'url': tts_url + '&' + query,
                        'ext': ext,
                    })
                captions[target_lang] = target_formats
            return captions

        # Some videos don't provide ttsurl but rather caption_tracks and
        # caption_translation_languages (e.g. 20LmZk1hakA)
        caption_tracks = args['caption_tracks']
        translation_languages = args['caption_translation_languages']
        first_track = compat_parse_qs(caption_tracks.split(',')[0])
        base_url = compat_urllib_parse_urlparse(first_track['u'][0])
        base_qs = compat_parse_qs(base_url.query)

        captions = {}
        for lang_entry in translation_languages.split(','):
            entry_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang_entry))
            target_lang = entry_qs.get('lc', [None])[0]
            if not target_lang:
                continue
            target_formats = []
            for ext in self._SUBTITLE_FORMATS:
                # The shared query dict is re-targeted for every language/format.
                base_qs.update({
                    'tlang': [target_lang],
                    'fmt': [ext],
                })
                new_query = compat_urllib_parse_urlencode(base_qs, True)
                target_formats.append({
                    'url': compat_urlparse.urlunparse(
                        base_url._replace(query=new_query)),
                    'ext': ext,
                })
            captions[target_lang] = target_formats
        return captions
    # An extractor error can be raised by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}

1141

1142

def _mark_watched(self, video_id, video_info):
    """Request the video's playback stats URL to mark it as watched.

    Does nothing when `video_info` has no `videostats_playback_base_url`.
    The download is non-fatal.
    """
    base_url = video_info.get('videostats_playback_base_url', [None])[0]
    if not base_url:
        return
    parsed = compat_urlparse.urlparse(base_url)
    query = compat_urlparse.parse_qs(parsed.query)

    # cpn generation algorithm is reverse engineered from base.js.
    # In fact it works even with dummy cpn.
    CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
    cpn_chars = [CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)]

    query.update({
        'ver': ['2'],
        'cpn': [''.join(cpn_chars)],
    })
    watched_url = compat_urlparse.urlunparse(
        parsed._replace(query=compat_urllib_parse_urlencode(query, True)))

    self._download_webpage(
        watched_url, video_id, 'Marking watched',
        'Unable to mark watched', fatal=False)

1164

1165

@classmethod
def extract_id(cls, url):
    """Return the video id (group 2 of `cls._VALID_URL`) found in `url`.

    Raises ExtractorError when `url` does not match.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)

1172

1173

def _extract_from_m3u8(self, manifest_url, video_id):
    """Download an m3u8 manifest and return a dict of itag -> format URL.

    Every non-empty manifest line that does not start with '#' is treated
    as a format URL; its itag is taken from the `itag/<digits>/` path part.
    """
    manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
    url_map = {}
    for line in manifest.split('\n'):
        # Skip blank lines and playlist tags/comments.
        if not line or line.startswith('#'):
            continue
        itag = self._search_regex(r'itag/(\d+?)/', line, 'itag')
        url_map[itag] = line
    return url_map

1187

1188

def _extract_annotations(self, video_id):
    """Download and return the annotations document for `video_id`."""
    annotations_url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
    return self._download_webpage(
        annotations_url, video_id,
        note='Searching for annotations.',
        errnote='Unable to download video annotations.')

1191

1192

def _real_extract(self, url):
    """Extract metadata and formats for a single YouTube video URL.

    Downloads the watch page (and, where needed, the embed page and
    get_video_info responses), collects metadata scraped from those
    sources, builds the list of formats (RTMP, url_encoded_fmt_stream_map /
    adaptive_fmts, HLS, plus DASH manifests) and returns the info dict.
    For multifeed videos a playlist result is returned instead unless
    `noplaylist` is set or the URL was smuggled with `force_singlefeed`.

    Raises ExtractorError when the video info carries no token, for
    "rental" videos, when the uploader cannot be found, for rtmpe streams,
    or when no format information is available.
    """
    url, smuggled_data = unsmuggle_url(url, {})

    proto = (
        'http' if self._downloader.params.get('prefer_insecure', False)
        else 'https')

    # Start/end offsets may be given in the fragment or the query string;
    # the first value found wins.
    start_time = None
    end_time = None
    parsed_url = compat_urllib_parse_urlparse(url)
    for component in [parsed_url.fragment, parsed_url.query]:
        query = compat_parse_qs(component)
        if start_time is None and 't' in query:
            start_time = parse_duration(query['t'][0])
        if start_time is None and 'start' in query:
            start_time = parse_duration(query['start'][0])
        if end_time is None and 'end' in query:
            end_time = parse_duration(query['end'][0])

    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
    video_id = self.extract_id(url)

    # Get video webpage
    url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
    video_webpage = self._download_webpage(url, video_id)

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    dash_mpds = []

    def add_dash_mpd(video_info):
        # Remember each distinct DASH manifest URL, preserving discovery order.
        dash_mpd = video_info.get('dashmpd')
        if dash_mpd and dash_mpd[0] not in dash_mpds:
            dash_mpds.append(dash_mpd[0])

    # Get video info
    embed_webpage = None
    is_live = None
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        url = proto + '://www.youtube.com/embed/%s' % video_id
        embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
        data = compat_urllib_parse_urlencode({
            'video_id': video_id,
            'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            'sts': self._search_regex(
                r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
        })
        video_info_url = proto + '://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(
            video_info_url, video_id,
            note='Refetching age-gated info webpage',
            errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
        add_dash_mpd(video_info)
    else:
        age_gate = False
        video_info = None
        # Try looking directly into the video webpage
        ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
        if ytplayer_config:
            args = ytplayer_config['args']
            if args.get('url_encoded_fmt_stream_map'):
                # Convert to the same format returned by compat_parse_qs
                video_info = dict((k, [v]) for k, v in args.items())
                add_dash_mpd(video_info)
            if args.get('livestream') == '1' or args.get('live_playback') == 1:
                is_live = True
        if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
            # We also try looking in get_video_info since it may contain different dashmpd
            # URL that points to a DASH manifest with possibly different itag set (some itags
            # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
            # manifest pointed by get_video_info's dashmpd).
            # The general idea is to take a union of itags of both DASH manifests (for example
            # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
            self.report_video_info_webpage_download(video_id)
            for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = (
                    '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (proto, video_id, el_type))
                video_info_webpage = self._download_webpage(
                    video_info_url,
                    video_id, note=False,
                    errnote='unable to download video info webpage')
                get_video_info = compat_parse_qs(video_info_webpage)
                if get_video_info.get('use_cipher_signature') != ['True']:
                    add_dash_mpd(get_video_info)
                if not video_info:
                    video_info = get_video_info
                if 'token' in get_video_info:
                    # Different get_video_info requests may report different results, e.g.
                    # some may report video unavailability, but some may serve it without
                    # any complaint (see https://github.com/rg3/youtube-dl/issues/7362,
                    # the original webpage as well as el=info and el=embedded get_video_info
                    # requests report video unavailability due to geo restriction while
                    # el=detailpage succeeds and returns valid data). This is probably
                    # due to YouTube measures against IP ranges of hosting providers.
                    # Working around by preferring the first succeeded video_info containing
                    # the token if no such video_info yet was found.
                    if 'token' not in video_info:
                        video_info = get_video_info
                    break
    if 'token' not in video_info:
        if 'reason' in video_info:
            if 'The uploader has not made this video available in your country.' in video_info['reason']:
                regions_allowed = self._html_search_meta('regionsAllowed', video_webpage, default=None)
                if regions_allowed:
                    raise ExtractorError('YouTube said: This video is available in %s only' % (
                        ', '.join(map(ISO3166Utils.short2full, regions_allowed.split(',')))),
                        expected=True)
            raise ExtractorError(
                'YouTube said: %s' % video_info['reason'][0],
                expected=True, video_id=video_id)
        else:
            raise ExtractorError(
                '"token" parameter not in video info for unknown reason',
                video_id=video_id)

    # title
    if 'title' in video_info:
        video_title = video_info['title'][0]
    else:
        self._downloader.report_warning('Unable to extract video title')
        video_title = '_'

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        # Replace shortened link texts ("http://exam...") by the full target
        # taken from the anchor's title/href attribute.
        video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]*"\s+)*?
                (?:title|href)="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]*"\s+)*?
                class="[^"]*"[^>]*>
            [^<]+\.{3}\s*
            </a>
        ''', r'\1', video_description)
        video_description = clean_html(video_description)
    else:
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = ''

    if 'multifeed_metadata_list' in video_info and not smuggled_data.get('force_singlefeed', False):
        if not self._downloader.params.get('noplaylist'):
            entries = []
            feed_ids = []
            multifeed_metadata_list = video_info['multifeed_metadata_list'][0]
            for feed in multifeed_metadata_list.split(','):
                # Unquote should take place before split on comma (,) since textual
                # fields may contain comma as well (see
                # https://github.com/rg3/youtube-dl/issues/8536)
                feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
                entries.append({
                    '_type': 'url_transparent',
                    'ie_key': 'Youtube',
                    'url': smuggle_url(
                        '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
                        {'force_singlefeed': True}),
                    'title': '%s (%s)' % (video_title, feed_data['title'][0]),
                })
                feed_ids.append(feed_data['id'][0])
            self.to_screen(
                'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
                % (', '.join(feed_ids), video_id))
            return self.playlist_result(entries, video_id, video_title, video_description)
        self.to_screen('Downloading just video %s because of --no-playlist' % video_id)

    if 'view_count' in video_info:
        view_count = int(video_info['view_count'][0])
    else:
        view_count = None

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError('"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError('Unable to extract uploader name')
    video_uploader = compat_urllib_parse_unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    video_uploader_url = None
    mobj = re.search(
        r'<link itemprop="url" href="(?P<uploader_url>https?://www.youtube.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
        video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group('uploader_id')
        video_uploader_url = mobj.group('uploader_url')
    else:
        self._downloader.report_warning('unable to extract uploader nickname')

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning('unable to extract video thumbnail')
        video_thumbnail = None
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = self._html_search_meta(
        'datePublished', video_webpage, 'upload date', default=None)
    if not upload_date:
        upload_date = self._search_regex(
            [r'(?s)id="eow-date.*?>(.*?)</span>',
             r'id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live|Started) on (.+?)</strong>'],
            video_webpage, 'upload date', default=None)
        if upload_date:
            # Normalise separators of the scraped date text itself. The
            # previous code read `mobj.group(1)` here, but `mobj` is the
            # unrelated uploader-link match from above (possibly None).
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', upload_date).split())
    upload_date = unified_strdate(upload_date)

    video_license = self._html_search_regex(
        r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
        video_webpage, 'license', default=None)

    m_music = re.search(
        r'<h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*<ul[^>]*>\s*<li>(?P<title>.+?) by (?P<creator>.+?)(?:$.+?$)?</li',
        video_webpage)
    if m_music:
        video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
        video_creator = clean_html(m_music.group('creator'))
    else:
        video_alt_title = video_creator = None

    m_cat_container = self._search_regex(
        r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
        video_webpage, 'categories', default=None)
    if m_cat_container:
        category = self._html_search_regex(
            r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
            default=None)
        video_categories = None if category is None else [category]
    else:
        video_categories = None

    video_tags = [
        unescapeHTML(m.group('content'))
        for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]

    def _extract_count(count_name):
        return str_to_int(self._search_regex(
            r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
            % re.escape(count_name),
            video_webpage, count_name, default=None))

    like_count = _extract_count('like')
    dislike_count = _extract_count('dislike')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)
    automatic_captions = self.extract_automatic_captions(video_id, video_webpage)

    if 'length_seconds' not in video_info:
        self._downloader.report_warning('unable to extract video duration')
        video_duration = None
    else:
        video_duration = int(compat_urllib_parse_unquote_plus(video_info['length_seconds'][0]))

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    def _map_to_format_list(urlmap):
        formats = []
        for itag, video_real_url in urlmap.items():
            dct = {
                'format_id': itag,
                'url': video_real_url,
                'player_url': player_url,
            }
            if itag in self._formats:
                dct.update(self._formats[itag])
            formats.append(dct)
        return formats

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        formats = [{
            'format_id': '_rtmp',
            'protocol': 'rtmp',
            'url': video_info['conn'][0],
            'player_url': player_url,
        }]
    elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1:
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        formats_spec = {}
        fmt_list = video_info.get('fmt_list', [''])[0]
        if fmt_list:
            for fmt in fmt_list.split(','):
                spec = fmt.split('/')
                if len(spec) > 1:
                    width_height = spec[1].split('x')
                    if len(width_height) == 2:
                        formats_spec[spec[0]] = {
                            'resolution': spec[1],
                            'width': int_or_none(width_height[0]),
                            'height': int_or_none(width_height[1]),
                        }
        formats = []
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' not in url_data or 'url' not in url_data:
                continue
            format_id = url_data['itag'][0]
            url = url_data['url'][0]

            if 'sig' in url_data:
                url += '&signature=' + url_data['sig'][0]
            elif 's' in url_data:
                encrypted_sig = url_data['s'][0]
                ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'

                jsplayer_url_json = self._search_regex(
                    ASSETS_RE,
                    embed_webpage if age_gate else video_webpage,
                    'JS player URL (1)', default=None)
                if not jsplayer_url_json and not age_gate:
                    # We need the embed website after all
                    if embed_webpage is None:
                        embed_url = proto + '://www.youtube.com/embed/%s' % video_id
                        embed_webpage = self._download_webpage(
                            embed_url, video_id, 'Downloading embed webpage')
                    jsplayer_url_json = self._search_regex(
                        ASSETS_RE, embed_webpage, 'JS player URL')

                # `jsplayer_url_json` can still be None on the age-gate path;
                # json.loads(None) would raise TypeError and make the
                # fallback below unreachable, so guard the call.
                player_url = json.loads(jsplayer_url_json) if jsplayer_url_json else None
                if player_url is None:
                    player_url_json = self._search_regex(
                        r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
                        video_webpage, 'age gate player URL')
                    player_url = json.loads(player_url_json)

                if self._downloader.params.get('verbose'):
                    if player_url is None:
                        player_version = 'unknown'
                        player_desc = 'unknown'
                    else:
                        if player_url.endswith('swf'):
                            player_version = self._search_regex(
                                r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
                                'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', r'(?:www|player)-([^/]+)/base\.js'],
                                player_url,
                                'html5 player', fatal=False)
                            player_desc = 'html5 player %s' % player_version

                    parts_sizes = self._signature_cache_id(encrypted_sig)
                    self.to_screen('{%s} signature length %s, %s' %
                                   (format_id, parts_sizes, player_desc))

                signature = self._decrypt_signature(
                    encrypted_sig, video_id, player_url, age_gate)
                url += '&signature=' + signature
            if 'ratebypass' not in url:
                url += '&ratebypass=yes'

            dct = {
                'format_id': format_id,
                'url': url,
                'player_url': player_url,
            }
            if format_id in self._formats:
                dct.update(self._formats[format_id])
            if format_id in formats_spec:
                dct.update(formats_spec[format_id])

            # Some itags are not included in DASH manifest thus corresponding formats will
            # lack metadata (see https://github.com/rg3/youtube-dl/pull/5993).
            # Trying to extract metadata from url_encoded_fmt_stream_map entry.
            mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
            width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)

            more_fields = {
                'filesize': int_or_none(url_data.get('clen', [None])[0]),
                'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000),
                'width': width,
                'height': height,
                'fps': int_or_none(url_data.get('fps', [None])[0]),
                'format_note': url_data.get('quality_label', [None])[0] or url_data.get('quality', [None])[0],
            }
            # Only truthy values override what the static tables provided.
            for key, value in more_fields.items():
                if value:
                    dct[key] = value
            type_ = url_data.get('type', [None])[0]
            if type_:
                type_split = type_.split(';')
                kind_ext = type_split[0].split('/')
                if len(kind_ext) == 2:
                    kind, _ = kind_ext
                    dct['ext'] = mimetype2ext(type_split[0])
                    if kind in ('audio', 'video'):
                        codecs = None
                        for mobj in re.finditer(
                                r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
                            if mobj.group('key') == 'codecs':
                                codecs = mobj.group('val')
                                break
                        if codecs:
                            codecs = codecs.split(',')
                            if len(codecs) == 2:
                                acodec, vcodec = codecs[1], codecs[0]
                            else:
                                acodec, vcodec = (codecs[0], 'none') if kind == 'audio' else ('none', codecs[0])
                            dct.update({
                                'acodec': acodec,
                                'vcodec': vcodec,
                            })
            formats.append(dct)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        formats = _map_to_format_list(url_map)
        # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
        for a_format in formats:
            a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
    else:
        unavailable_message = self._html_search_regex(
            r'(?s)<h1[^>]+id="unavailable-message"[^>]*>(.+?)</h1>',
            video_webpage, 'unavailable message', default=None)
        if unavailable_message:
            raise ExtractorError(unavailable_message, expected=True)
        raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    # Look for the DASH manifest
    if self._downloader.params.get('youtube_include_dash_manifest', True):
        dash_mpd_fatal = True
        for mpd_url in dash_mpds:
            dash_formats = {}
            try:
                def decrypt_sig(mobj):
                    s = mobj.group(1)
                    dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
                    return '/signature/%s' % dec_s

                mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)

                for df in self._extract_mpd_formats(
                        mpd_url, video_id, fatal=dash_mpd_fatal,
                        formats_dict=self._formats):
                    # Do not overwrite DASH format found in some previous DASH manifest
                    if df['format_id'] not in dash_formats:
                        dash_formats[df['format_id']] = df
                    # Additional DASH manifests may end up in HTTP Error 403 therefore
                    # allow them to fail without bug report message if we already have
                    # some DASH manifest succeeded. This is temporary workaround to reduce
                    # burst of bug reports until we figure out the reason and whether it
                    # can be fixed at all.
                    dash_mpd_fatal = False
            except (ExtractorError, KeyError) as e:
                self.report_warning(
                    'Skipping DASH manifest: %r' % e, video_id)
            if dash_formats:
                # Remove the formats we found through non-DASH, they
                # contain less info and it can be wrong, because we use
                # fixed values (for example the resolution). See
                # https://github.com/rg3/youtube-dl/issues/5774 for an
                # example.
                formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
                formats.extend(dash_formats.values())

    # Check for malformed aspect ratio
    stretched_m = re.search(
        r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
        video_webpage)
    if stretched_m:
        w = float(stretched_m.group('w'))
        h = float(stretched_m.group('h'))
        # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
        # We will only process correct ratios.
        if w > 0 and h > 0:
            ratio = w / h
            for f in formats:
                if f.get('vcodec') != 'none':
                    f['stretched_ratio'] = ratio

    self._sort_formats(formats)

    self.mark_watched(video_id, video_info)

    return {
        'id': video_id,
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'uploader_url': video_uploader_url,
        'upload_date': upload_date,
        'license': video_license,
        'creator': video_creator,
        'title': video_title,
        'alt_title': video_alt_title,
        'thumbnail': video_thumbnail,
        'description': video_description,
        'categories': video_categories,
        'tags': video_tags,
        'subtitles': video_subtitles,
        'automatic_captions': automatic_captions,
        'duration': video_duration,
        'age_limit': 18 if age_gate else 0,
        'annotations': video_annotations,
        'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
        'view_count': view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
        'formats': formats,
        'is_live': is_live,
        'start_time': start_time,
        'end_time': end_time,
    }

class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):

1731

IE_DESC = 'YouTube.com playlists'

1732

_VALID_URL = r"""(?x)(?:

(?:https?://)?

(?:\w+\.)?

youtube\.com/

(?:

\? (?:.*?[&;])*? (?:p|a|list)=

| p/

)

(

(?:PL|LL|EC|UU|FL|RD|UL)?[0-9A-Za-z-_]{10,}

1743

# Top tracks, they can also include dots

|(?:MC)[\w\.]*

)

.*

|

((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,})

1749

)"""

1750

_TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'

1751

_VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?'

1752

IE_NAME = 'youtube:playlist'

1753

# Test cases for the playlist extractor: each entry gives a URL (or bare
# playlist id) together with the expected title/id and entry count.
_TESTS = [{
    'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
    'info_dict': {
        'title': 'ytdl test PL',
        'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
    },
    'playlist_count': 3,
}, {
    'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
    'info_dict': {
        'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'title': 'YDL_Empty_List',
    },
    'playlist_count': 0,
}, {
    'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
    'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
    'info_dict': {
        'title': '29C3: Not my department',
        'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
    },
    'playlist_count': 95,
}, {
    'note': 'issue #673',
    'url': 'PLBB231211A4F62143',
    'info_dict': {
        'title': '[OLD]Team Fortress 2 (Class-based LP)',
        'id': 'PLBB231211A4F62143',
    },
    'playlist_mincount': 26,
}, {
    'note': 'Large playlist',
    'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
    'info_dict': {
        'title': 'Uploads from Cauchemar',
        'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
    },
    'playlist_mincount': 799,
}, {
    'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
    'info_dict': {
        'title': 'YDL_safe_search',
        'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
    },
    'playlist_count': 2,
}, {
    'note': 'embedded',
    'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
    'playlist_count': 4,
    'info_dict': {
        'title': 'JODA15',
        'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
    }
}, {
    'note': 'Embedded SWF player',
    'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
    'playlist_count': 4,
    'info_dict': {
        'title': 'JODA7',
        'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
    }
}, {
    'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
    'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
    'info_dict': {
        'title': 'Uploads from Interstellar Movie',
        'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
    },
    # Was misspelled 'playlist_mincout', which is not the key used by the
    # other entries, so the minimum-count expectation was never applied.
    'playlist_mincount': 21,
}]

1823

1824

def _real_initialize(self):
    """Log in to YouTube by delegating to ``self._login()``."""
    self._login()

1826

1827

def _extract_mix(self, playlist_id):
    """Build a playlist result for a YouTube mix.

    The mixes are generated from a single video; the id of the playlist
    is just 'RD' + video_id.  Watch pages are downloaded repeatedly, each
    time starting from the last video id collected so far, until a page
    yields no video id that has not been seen already.

    Returns the value of ``self.playlist_result`` for the collected ids,
    titled with the text found on the last downloaded page.
    """
    # The pattern only depends on the playlist id, so build it once.
    video_link_re = re.compile(
        r'''(?xs)data-video-username=".*?".*?
            href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id))

    collected = []
    seed_id = playlist_id[-11:]
    for page_no in itertools.count(1):
        page_url = 'https://youtube.com/watch?v=%s&list=%s' % (seed_id, playlist_id)
        webpage = self._download_webpage(
            page_url, playlist_id,
            'Downloading page {0} of Youtube mix'.format(page_no))
        # Fetch new pages until all the videos are repeated, it seems that
        # there are always 51 unique videos.
        unseen = [
            candidate
            for candidate in orderedSet(video_link_re.findall(webpage))
            if candidate not in collected]
        if not unseen:
            break
        collected.extend(unseen)
        seed_id = collected[-1]

    entries = self._ids_to_results(collected)

    # Try the known title containers in order of preference, on the last
    # page that was downloaded; stop at the first one that yields content.
    title_html = None
    for class_name in ('playlist-title', 'title long-title', 'title'):
        title_html = get_element_by_attribute('class', class_name, webpage)
        if title_html:
            break

    return self.playlist_result(entries, playlist_id, clean_html(title_html))

1858

1859

def _extract_playlist(self, playlist_id):
    """Download a playlist page and return its playlist result.

    Every ``yt-alert-message`` block on the page is inspected first:
    a missing/private playlist or an "Invalid parameters" alert raises
    ExtractorError (expected=True), the language chooser alert is
    ignored, and any other alert is reported as a warning.
    """
    page = self._download_webpage(self._TEMPLATE_URL % playlist_id, playlist_id)

    alerts = re.findall(r'<div class="yt-alert-message">([^<]+)</div>', page)
    for alert in alerts:
        alert = alert.strip()

        # Check if the playlist exists or is private
        if re.match(r'[^<]*(The|This) playlist (does not exist|is private)[^<]*', alert):
            raise ExtractorError(
                'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        if re.match(r'[^<]*Invalid parameters[^<]*', alert):
            raise ExtractorError(
                'Invalid parameters. Maybe URL is incorrect.',
                expected=True)

        # The language chooser is harmless, skip it silently.
        if re.match(r'[^<]*Choose your language[^<]*', alert):
            continue

        self.report_warning('Youtube gives an alert message: ' + alert)

    playlist_title = self._html_search_regex(
        r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',
        page, 'title')

    entries = self._entries(page, playlist_id)
    return self.playlist_result(entries, playlist_id, playlist_title)

1885

1886

def _check_download_just_video(self, url, playlist_id):
    """Decide whether only the single video named in *url* is wanted.

    Returns a url_result for the video when the URL query carries a ``v``
    parameter and the 'noplaylist' downloader param is set; otherwise
    returns None (after printing a hint when a ``v`` parameter exists).
    """
    # Check if it's a video-specific URL
    query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
    if 'v' not in query:
        return None

    video_id = query['v'][0]
    if not self._downloader.params.get('noplaylist'):
        self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
        return None

    self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
    return self.url_result(video_id, 'Youtube', video_id=video_id)

1896

1897

def _real_extract(self, url):
    """Extract a playlist (or a single video, or a mix) from *url*.

    Raises ExtractorError when *url* does not match ``_VALID_URL``.
    """
    # Extract playlist id
    match = re.match(self._VALID_URL, url)
    if match is None:
        raise ExtractorError('Invalid URL: %s' % url)
    playlist_id = match.group(1) or match.group(2)

    single_video = self._check_download_just_video(url, playlist_id)
    if single_video:
        return single_video

    # Mixes require a custom extraction process
    is_mix = playlist_id.startswith(('RD', 'UL', 'PU'))
    if is_mix:
        return self._extract_mix(playlist_id)
    return self._extract_playlist(playlist_id)

1913

1914

1915

class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
    """Extractor for ``/channel/<id>`` URLs."""

    IE_NAME = 'youtube:channel'
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',
            'title': 'Uploads from lex will',
        }
    }, {
        'note': 'Age restricted channel',
        # from https://www.youtube.com/user/DeusExOfficial
        'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w',
        'playlist_mincount': 64,
        'info_dict': {
            'id': 'UUs0ifCMCm1icqRbqhUINa0w',
            'title': 'Uploads from Deus Ex',
        },
    }]

    @classmethod
    def suitable(cls, url):
        """Reject URLs claimed by the playlists or live extractors."""
        if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url):
            return False
        return super(YoutubeChannelIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return the channel's videos, preferring its uploads playlist."""
        channel_id = self._match_id(url)
        url = self._TEMPLATE_URL % channel_id

        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
        # Workaround by extracting as a playlist if managed to obtain channel playlist URL
        # otherwise fallback on channel by page extraction
        channel_page = self._download_webpage(
            url + '?view=57', channel_id,
            'Downloading channel page', fatal=False)

        channel_playlist_id = False
        if channel_page is not False:
            # Prefer the <meta> channel id, fall back to the data-* attribute.
            channel_playlist_id = self._html_search_meta(
                'channelId', channel_page, 'channel id', default=None
            ) or self._search_regex(
                r'data-(?:channel-external-|yt)id="([^"]+)"',
                channel_page, 'channel id', default=None)

        if channel_playlist_id and channel_playlist_id.startswith('UC'):
            # Swap the 'UC' prefix for 'UU' to form the playlist id.
            playlist_id = 'UU' + channel_playlist_id[2:]
            playlist_url = compat_urlparse.urljoin(
                url, '/playlist?list=%s' % playlist_id)
            return self.url_result(playlist_url, 'YoutubePlaylist')

        channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
        autogenerated_marker = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page)

        if autogenerated_marker is None:
            return self.playlist_result(
                self._entries(channel_page, channel_id), channel_id)

        # The videos are contained in a single page
        # the ajax pages can't be used, they are empty
        entries = []
        for video_id, video_title in self.extract_videos_from_page(channel_page):
            entries.append(self.url_result(
                video_id, 'Youtube', video_id=video_id,
                video_title=video_title))
        return self.playlist_result(entries, channel_id)

1988

1989

1990

class YoutubeUserIE(YoutubeChannelIE):
    """Extractor for user video listings (``/user/<name>/videos``)."""

    IE_NAME = 'youtube:user'
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    # NOTE(review): no _VALID_URL assignment is visible in this class body,
    # so the pattern is inherited from YoutubeChannelIE; the 'ytuser:' and
    # '/c/' test URLs below would not match that pattern - confirm that a
    # line was not lost here.
    _TEMPLATE_URL = 'https://www.youtube.com/user/%s/videos'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/gametrailers',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Accept *url* only if no other Youtube*IE in this module does."""
        # Don't return True if the url can be extracted with another youtube
        # extractor: the regex is too permissive and it would match.
        for name, klass in globals().items():
            if not (name.startswith('Youtube') and name.endswith('IE')):
                continue
            if klass is cls:
                continue
            if klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)

2019

2020

2021

class YoutubeLiveIE(YoutubeBaseInfoExtractor):
    """Extractor for ``/user/<id>/live`` and ``/channel/<id>/live`` URLs."""

    IE_DESC = 'YouTube.com live streams'
    _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+))/live'
    IE_NAME = 'youtube:live'

    _TESTS = [{
        'url': 'http://www.youtube.com/user/TheYoungTurks/live',
        'info_dict': {
            'id': 'a48o2S1cPoo',
            'ext': 'mp4',
            'title': 'The Young Turks - Live Main Show',
            'uploader': 'The Young Turks',
            'uploader_id': 'TheYoungTurks',
            # Raw string: the pattern contains regex escapes such as '\.',
            # which are invalid escape sequences in a normal string literal.
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
            'upload_date': '20150715',
            'license': 'Standard YouTube License',
            'description': 'md5:438179573adcdff3c97ebb1ee632b891',
            'categories': ['News & Politics'],
            'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Resolve a ``/live`` URL to a video, or fall back to its base URL.

        The page download is non-fatal.  When the page is an ``og:type``
        'video' page carrying an 11-character ``videoId`` meta value, a
        url_result for that video is returned; in every other case a
        url_result for the user/channel base URL is returned.
        """
        mobj = re.match(self._VALID_URL, url)
        channel_id = mobj.group('id')
        base_url = mobj.group('base_url')
        webpage = self._download_webpage(url, channel_id, fatal=False)
        if webpage:
            page_type = self._og_search_property(
                'type', webpage, 'page type', default=None)
            video_id = self._html_search_meta(
                'videoId', webpage, 'video id', default=None)
            # Only trust the meta value if it has the shape of a video id.
            if page_type == 'video' and video_id and re.match(r'^[0-9A-Za-z_-]{11}$', video_id):
                return self.url_result(video_id, YoutubeIE.ie_key())
        return self.url_result(base_url)

2064

2065

2066

class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
    """Extractor for ``/user/<id>/playlists`` and ``/channel/<id>/playlists``."""

    IE_NAME = 'youtube:playlists'
    IE_DESC = 'YouTube.com user/channel playlists'
    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+)/playlists'

    _TESTS = [
        {
            'url': 'http://www.youtube.com/user/ThirstForScience/playlists',
            'playlist_mincount': 4,
            'info_dict': {
                'id': 'ThirstForScience',
                'title': 'Thirst for Science',
            },
        },
        {
            # with "Load more" button
            'url': 'http://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
            'playlist_mincount': 70,
            'info_dict': {
                'id': 'igorkle1',
                'title': 'Игорь Клейнер',
            },
        },
        {
            'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists',
            'playlist_mincount': 17,
            'info_dict': {
                'id': 'UCiU1dHvZObB2iP6xkJ__Icw',
                'title': 'Chem Player',
            },
        },
    ]

class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE):
    """Search extractor driven by the ``ytsearch`` key."""

    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # Extra query-string arguments merged into every results request.
    _EXTRA_QUERY_ARGS = {}
    _TESTS = []

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query

        Result pages are downloaded one at a time until a page yields no
        video or at least *n* videos have been collected; the list is then
        cut down to *n* entries.

        Raises ExtractorError (expected=True) when a downloaded page
        contains a search message instead of results.
        """
        videos = []

        for pagenum in itertools.count(1):
            url_query = {
                'search_query': query.encode('utf-8'),
                'page': pagenum,
                'spf': 'navigate',
            }
            url_query.update(self._EXTRA_QUERY_ARGS)
            result_url = 'https://www.youtube.com/results?' + compat_urllib_parse_urlencode(url_query)
            data = self._download_json(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % pagenum,
                errnote='Unable to download API page')
            html_content = data[1]['body']['content']

            if 'class="search-message' in html_content:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            new_videos = self._ids_to_results(orderedSet(re.findall(
                r'href="/watch\?v=(.{11})', html_content)))
            videos += new_videos
            # Stop as soon as enough results are available: '>=' rather than
            # '>' so that collecting exactly n results does not trigger one
            # more (useless, and possibly failing) page download.
            if not new_videos or len(videos) >= n:
                break

        if len(videos) > n:
            videos = videos[:n]
        return self.playlist_result(videos, query)

2139

2140

2141

class YoutubeSearchDateIE(YoutubeSearchIE):
    """Search variant that adds a ``search_sort`` argument to each query."""

    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    IE_DESC = 'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    _EXTRA_QUERY_ARGS = {
        'search_sort': 'video_date_uploaded',
    }

2146

2147

2148

class YoutubeSearchURLIE(YoutubePlaylistBaseInfoExtractor):
    """Extractor for ``/results?search_query=...`` (or ``q=...``) URLs."""

    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Download the results page and return its videos as a playlist.

        The playlist title is the URL-decoded query string.
        """
        raw_query = re.match(self._VALID_URL, url).group('query')
        query = compat_urllib_parse_unquote_plus(raw_query)
        webpage = self._download_webpage(url, query)
        entries = self._process_page(webpage)
        return self.playlist_result(entries, playlist_title=query)

2169

2170

2171

class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
    """Extractor for ``/show/<id>`` URLs."""

    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'https://www.youtube.com/show/airdisasters',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Delegate to the base extractor using the show's playlists URL."""
        show_id = self._match_id(url)
        playlists_url = 'https://www.youtube.com/show/%s/playlists' % show_id
        return super(YoutubeShowIE, self)._real_extract(playlists_url)

2188

2189

2190

class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        """Extractor name derived from the subclass' ``_FEED_NAME``."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        """Log in by delegating to ``self._login()``."""
        self._login()

    def _real_extract(self, url):
        """Collect the video ids of the feed and return them as a playlist.

        The feed page is downloaded first, then every "load more" portion
        it links to, until a portion brings no new video id or no further
        "load more" link is present.
        """
        page = self._download_webpage(
            'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE)

        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        ids = []
        more_widget_html = content_html = page
        for page_num in itertools.count(1):
            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)

            # 'recommended' feed has infinite 'load more' and each new portion spins
            # the same videos in (sometimes) slightly different order, so we'll check
            # for unicity and break when portion has no new videos
            #
            # This must be a real list: on Python 3 ``filter()`` returns a
            # lazy iterator that is always truthy, so the emptiness check
            # below would never stop the loop.
            new_ids = [
                video_id for video_id in orderedSet(matches)
                if video_id not in ids]
            if not new_ids:
                break

            ids.extend(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        return self.playlist_result(
            self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE)

2237

2238

2239

class YoutubeWatchLaterIE(YoutubePlaylistIE):
    """Extractor for the 'WL' (watch later) playlist."""

    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater'

    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=WL',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the single requested video if any, else the 'WL' playlist."""
        single_video = self._check_download_just_video(url, 'WL')
        return single_video if single_video else self._extract_playlist('WL')

2257

2258

2259

class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor that resolves the favourites page to its playlist."""

    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Find the playlist id on the favourites page and hand it over."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        favourites_list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')

2269

2270

2271

class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'recommended' feed."""

    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'

2276

2277

2278

class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'subscriptions' feed."""

    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'

2283

2284

2285

class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'history' feed."""

    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    # Raw string, like the sibling feed extractors: '\.' is a regex escape,
    # not a valid escape sequence of a normal string literal.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'

2290

2291

2292

class YoutubeTruncatedURLIE(InfoExtractor):
    """Matches watch/attribution URLs without a video id and reports them."""

    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    # Verbose pattern: whitespace inside it is not significant.
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [
        {
            'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
            'only_matching': True,
        },
        {
            'url': 'http://www.youtube.com/watch?',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/watch?feature=foo',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/watch?hl=en-GB',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/watch?t=2372',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Always raise an ExtractorError (expected=True) with a quoting hint."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .')
        raise ExtractorError(hint, expected=True)

class YoutubeTruncatedIDIE(InfoExtractor):
    """Matches watch URLs whose ``v`` value is only 1 to 10 characters long."""

    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an ExtractorError (expected=True) naming the short id."""
        short_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (short_id, url)
        raise ExtractorError(message, expected=True)

expected=True)