jfr.im git - yt-dlp.git/blame_incremental - youtube

Commit	Line	Data
	1	# coding: utf-8
	2
	3	from __future__ import unicode_literals
	4
	5
	6	import itertools
	7	import json
	8	import os.path
	9	import random
	10	import re
	11	import time
	12	import traceback
	13
	14	from .common import InfoExtractor, SearchInfoExtractor
	15	from ..jsinterp import JSInterpreter
	16	from ..swfinterp import SWFInterpreter
	17	from ..compat import (
	18	compat_chr,
	19	compat_parse_qs,
	20	compat_urllib_parse,
	21	compat_urllib_parse_unquote,
	22	compat_urllib_parse_unquote_plus,
	23	compat_urllib_parse_urlparse,
	24	compat_urlparse,
	25	compat_str,
	26	)
	27	from ..utils import (
	28	clean_html,
	29	encode_dict,
	30	error_to_compat_str,
	31	ExtractorError,
	32	float_or_none,
	33	get_element_by_attribute,
	34	get_element_by_id,
	35	int_or_none,
	36	mimetype2ext,
	37	orderedSet,
	38	parse_duration,
	39	remove_quotes,
	40	remove_start,
	41	sanitized_Request,
	42	smuggle_url,
	43	str_to_int,
	44	unescapeHTML,
	45	unified_strdate,
	46	unsmuggle_url,
	47	uppercase_escape,
	48	ISO3166Utils,
	49	)
	50
	51
	52	class YoutubeBaseInfoExtractor(InfoExtractor):
	53	"""Provide base functions for Youtube extractors"""
	54	_LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
	55	_TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'
	56	_NETRC_MACHINE = 'youtube'
	57	# If True it will raise an error if no login info is provided
	58	_LOGIN_REQUIRED = False
	59
	60	def _set_language(self):
	61	self._set_cookie(
	62	'.youtube.com', 'PREF', 'f1=50000000&hl=en',
	63	# YouTube sets the expire time to about two months
	64	expire_time=time.time() + 2 * 30 * 24 * 3600)
	65
	66	def _ids_to_results(self, ids):
	67	return [
	68	self.url_result(vid_id, 'Youtube', video_id=vid_id)
	69	for vid_id in ids]
	70
	71	def _login(self):
	72	"""
	73	Attempt to log in to YouTube.
	74	True is returned if successful or skipped.
	75	False is returned if login failed.
	76
	77	If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
	78	"""
	79	(username, password) = self._get_login_info()
	80	# No authentication to be performed
	81	if username is None:
	82	if self._LOGIN_REQUIRED:
	83	raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
	84	return True
	85
	86	login_page = self._download_webpage(
	87	self._LOGIN_URL, None,
	88	note='Downloading login page',
	89	errnote='unable to fetch login page', fatal=False)
	90	if login_page is False:
	91	return
	92
	93	galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
	94	login_page, 'Login GALX parameter')
	95
	96	# Log in
	97	login_form_strs = {
	98	'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
	99	'Email': username,
	100	'GALX': galx,
	101	'Passwd': password,
	102
	103	'PersistentCookie': 'yes',
	104	'_utf8': '霱',
	105	'bgresponse': 'js_disabled',
	106	'checkConnection': '',
	107	'checkedDomains': 'youtube',
	108	'dnConn': '',
	109	'pstMsg': '0',
	110	'rmShown': '1',
	111	'secTok': '',
	112	'signIn': 'Sign in',
	113	'timeStmp': '',
	114	'service': 'youtube',
	115	'uilel': '3',
	116	'hl': 'en_US',
	117	}
	118
	119	login_data = compat_urllib_parse.urlencode(encode_dict(login_form_strs)).encode('ascii')
	120
	121	req = sanitized_Request(self._LOGIN_URL, login_data)
	122	login_results = self._download_webpage(
	123	req, None,
	124	note='Logging in', errnote='unable to log in', fatal=False)
	125	if login_results is False:
	126	return False
	127
	128	if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
	129	raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)
	130
	131	# Two-Factor
	132	# TODO add SMS and phone call support - these require making a request and then prompting the user
	133
	134	if re.search(r'(?i)<form[^>]* id="challenge"', login_results) is not None:
	135	tfa_code = self._get_tfa_info('2-step verification code')
	136
	137	if not tfa_code:
	138	self._downloader.report_warning(
	139	'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
	140	'(Note that only TOTP (Google Authenticator App) codes work at this time.)')
	141	return False
	142
	143	tfa_code = remove_start(tfa_code, 'G-')
	144
	145	tfa_form_strs = self._form_hidden_inputs('challenge', login_results)
	146
	147	tfa_form_strs.update({
	148	'Pin': tfa_code,
	149	'TrustDevice': 'on',
	150	})
	151
	152	tfa_data = compat_urllib_parse.urlencode(encode_dict(tfa_form_strs)).encode('ascii')
	153
	154	tfa_req = sanitized_Request(self._TWOFACTOR_URL, tfa_data)
	155	tfa_results = self._download_webpage(
	156	tfa_req, None,
	157	note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)
	158
	159	if tfa_results is False:
	160	return False
	161
	162	if re.search(r'(?i)<form[^>]* id="challenge"', tfa_results) is not None:
	163	self._downloader.report_warning('Two-factor code expired or invalid. Please try again, or use a one-use backup code instead.')
	164	return False
	165	if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
	166	self._downloader.report_warning('unable to log in - did the page structure change?')
	167	return False
	168	if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
	169	self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
	170	return False
	171
	172	if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
	173	self._downloader.report_warning('unable to log in: bad username or password')
	174	return False
	175	return True
	176
	177	def _real_initialize(self):
	178	if self._downloader is None:
	179	return
	180	self._set_language()
	181	if not self._login():
	182	return
	183
	184
	185	class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
	186	# Extract entries from page with "Load more" button
	187	def _entries(self, page, playlist_id):
	188	more_widget_html = content_html = page
	189	for page_num in itertools.count(1):
	190	for entry in self._process_page(content_html):
	191	yield entry
	192
	193	mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
	194	if not mobj:
	195	break
	196
	197	more = self._download_json(
	198	'https://youtube.com/%s' % mobj.group('more'), playlist_id,
	199	'Downloading page #%s' % page_num,
	200	transform_source=uppercase_escape)
	201	content_html = more['content_html']
	202	if not content_html.strip():
	203	# Some webpages show a "Load more" button but they don't
	204	# have more videos
	205	break
	206	more_widget_html = more['load_more_widget_html']
	207
	208
	209	class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
	210	def _process_page(self, content):
	211	for video_id, video_title in self.extract_videos_from_page(content):
	212	yield self.url_result(video_id, 'Youtube', video_id, video_title)
	213
	214	def extract_videos_from_page(self, page):
	215	ids_in_page = []
	216	titles_in_page = []
	217	for mobj in re.finditer(self._VIDEO_RE, page):
	218	# The link with index 0 is not the first video of the playlist (not sure if still actual)
	219	if 'index' in mobj.groupdict() and mobj.group('id') == '0':
	220	continue
	221	video_id = mobj.group('id')
	222	video_title = unescapeHTML(mobj.group('title'))
	223	if video_title:
	224	video_title = video_title.strip()
	225	try:
	226	idx = ids_in_page.index(video_id)
	227	if video_title and not titles_in_page[idx]:
	228	titles_in_page[idx] = video_title
	229	except ValueError:
	230	ids_in_page.append(video_id)
	231	titles_in_page.append(video_title)
	232	return zip(ids_in_page, titles_in_page)
	233
	234
	235	class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
	236	def _process_page(self, content):
	237	for playlist_id in orderedSet(re.findall(r'href="/?playlist\?list=([0-9A-Za-z-_]{10,})"', content)):
	238	yield self.url_result(
	239	'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist')
	240
	241	def _real_extract(self, url):
	242	playlist_id = self._match_id(url)
	243	webpage = self._download_webpage(url, playlist_id)
	244	title = self._og_search_title(webpage, fatal=False)
	245	return self.playlist_result(self._entries(webpage, playlist_id), playlist_id, title)
	246
	247
	248	class YoutubeIE(YoutubeBaseInfoExtractor):
	249	IE_DESC = 'YouTube.com'
	250	_VALID_URL = r"""(?x)^
	251	(
	252	(?:https?://\|//) # http(s):// or protocol-independent URL
	253	(?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/\|
	254	(?:www\.)?deturl\.com/www\.youtube\.com/\|
	255	(?:www\.)?pwnyoutube\.com/\|
	256	(?:www\.)?yourepeat\.com/\|
	257	tube\.majestyc\.net/\|
	258	youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
	259	(?:.*?\#/)? # handle anchor (#/) redirect urls
	260	(?: # the various things that can precede the ID:
	261	(?:(?:v\|embed\|e)/(?!videoseries)) # v/ or embed/ or e/
	262	\|(?: # or the v= param in all its forms
	263	(?:(?:watch\|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup\|.php) or nothing (like /?v=xxxx)
	264	(?:\?\|\#!?) # the params delimiter ? or # or #!
	265	(?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&v=V36LpHqtcDY)
	266	v=
	267	)
	268	))
	269	\|(?:
	270	youtu\.be\| # just youtu.be/xxxx
	271	vid\.plus # or vid.plus/xxxx
	272	)/
	273	\|(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
	274	)
	275	)? # all until now is optional -> you can pass the naked ID
	276	([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
	277	(?!.*?&list=) # combined list/video URLs are handled by the playlist IE
	278	(?(1).+)? # if we found the ID, everything can follow
	279	$"""
	280	_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
	281	# tbr was extracted from com/google/youtube/model/VideoFormat.as in watch_as3.swf and converted from Bytes/S to KBits/S
	282	_formats = {
	283	'5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263', 'tbr': 320},
	284	'6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263', 'tbr': 896},
	285	'13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v', 'tbr': 60},
	286	'17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v', 'tbr': 80},
	287	'18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264', 'tbr': 736},
	288	'22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'tbr': 3192},
	289	'34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'tbr': 928},
	290	'35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'tbr': 1280},
	291	# itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
	292	'36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v', 'tbr': 256},
	293	'37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'tbr': 6192},
	294	'38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'tbr': 10128},
	295	'43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'tbr': 928},
	296	'44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'tbr': 1280},
	297	'45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'tbr': 3192},
	298	'46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
	299	'59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'tbr': 1280},
	300	'78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'tbr': 1280},
	301
	302
	303	# 3D videos
	304	'82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20, 'tbr': 800},
	305	'83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20, 'tbr': 1152},
	306	'84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20, 'tbr': 3000},
	307	'85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20, 'tbr': 6000},
	308	'100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
	309	'101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
	310	'102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
	311
	312	# Apple HTTP Live Streaming
	313	'92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10, 'tbr': 186.625},
	314	'93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10, 'tbr': 951.5625},
	315	'94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10, 'tbr': 1312.5},
	316	'95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10, 'tbr': 3207.421875},
	317	'96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10, 'tbr': 6349.21875},
	318	'132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
	319	'151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
	320
	321	# DASH mp4 video
	322	'133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40, 'tbr': 261.71875},
	323	'134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40, 'tbr': 951.5625},
	324	'135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40, 'tbr': 1312.5},
	325	'136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40, 'tbr': 3207.421875},
	326	'137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40, 'tbr': 6349.21875},
	327	'138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40, 'tbr': 10128.0}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
	328	'160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40, 'tbr': 91.796875},
	329	'264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
	330	'298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60, 'preference': -40},
	331	'299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60, 'preference': -40},
	332	'266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
	333
	334	# Dash mp4 audio
	335	'139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'preference': -50, 'container': 'm4a_dash', 'tbr': 32},
	336	'140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'preference': -50, 'container': 'm4a_dash', 'tbr': 128},
	337	'141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'preference': -50, 'container': 'm4a_dash', 'tbr': 320},
	338
	339	# Dash webm
	340	'167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
	341	'168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
	342	'169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
	343	'170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
	344	'218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
	345	'219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
	346	'278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9', 'preference': -40},
	347	'242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
	348	'243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
	349	'244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
	350	'245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
	351	'246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
	352	'247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
	353	'248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
	354	'271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
	355	# itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
	356	'272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
	357	'302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},
	358	'303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},
	359	'308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},
	360	'313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
	361	'315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},
	362
	363	# Dash webm audio
	364	'171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
	365	'172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
	366
	367	# Dash webm audio with opus inside
	368	'249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
	369	'250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
	370	'251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
	371
	372	# RTMP (unnamed)
	373	'_rtmp': {'protocol': 'rtmp'},
	374	}
	375	_SUBTITLE_FORMATS = ('ttml', 'vtt')
	376
	377	IE_NAME = 'youtube'
	378	_TESTS = [
	379	{
	380	'url': 'http://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
	381	'info_dict': {
	382	'id': 'BaW_jenozKc',
	383	'ext': 'mp4',
	384	'title': 'youtube-dl test video "\'/\\ä↭𝕐',
	385	'uploader': 'Philipp Hagemeister',
	386	'uploader_id': 'phihag',
	387	'upload_date': '20121002',
	388	'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
	389	'categories': ['Science & Technology'],
	390	'tags': ['youtube-dl'],
	391	'like_count': int,
	392	'dislike_count': int,
	393	'start_time': 1,
	394	'end_time': 9,
	395	}
	396	},
	397	{
	398	'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
	399	'note': 'Test generic use_cipher_signature video (#897)',
	400	'info_dict': {
	401	'id': 'UxxajLWwzqY',
	402	'ext': 'mp4',
	403	'upload_date': '20120506',
	404	'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
	405	'alt_title': 'I Love It (feat. Charli XCX)',
	406	'description': 'md5:782e8651347686cba06e58f71ab51773',
	407	'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
	408	'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
	409	'iconic ep', 'iconic', 'love', 'it'],
	410	'uploader': 'Icona Pop',
	411	'uploader_id': 'IconaPop',
	412	'creator': 'Icona Pop',
	413	}
	414	},
	415	{
	416	'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
	417	'note': 'Test VEVO video with age protection (#956)',
	418	'info_dict': {
	419	'id': '07FYdnEawAQ',
	420	'ext': 'mp4',
	421	'upload_date': '20130703',
	422	'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
	423	'alt_title': 'Tunnel Vision',
	424	'description': 'md5:64249768eec3bc4276236606ea996373',
	425	'uploader': 'justintimberlakeVEVO',
	426	'uploader_id': 'justintimberlakeVEVO',
	427	'creator': 'Justin Timberlake',
	428	'age_limit': 18,
	429	}
	430	},
	431	{
	432	'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
	433	'note': 'Embed-only video (#1746)',
	434	'info_dict': {
	435	'id': 'yZIXLfi8CZQ',
	436	'ext': 'mp4',
	437	'upload_date': '20120608',
	438	'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
	439	'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
	440	'uploader': 'SET India',
	441	'uploader_id': 'setindia',
	442	'age_limit': 18,
	443	}
	444	},
	445	{
	446	'url': 'http://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY',
	447	'note': 'Use the first video ID in the URL',
	448	'info_dict': {
	449	'id': 'BaW_jenozKc',
	450	'ext': 'mp4',
	451	'title': 'youtube-dl test video "\'/\\ä↭𝕐',
	452	'uploader': 'Philipp Hagemeister',
	453	'uploader_id': 'phihag',
	454	'upload_date': '20121002',
	455	'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
	456	'categories': ['Science & Technology'],
	457	'tags': ['youtube-dl'],
	458	'like_count': int,
	459	'dislike_count': int,
	460	},
	461	'params': {
	462	'skip_download': True,
	463	},
	464	},
	465	{
	466	'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
	467	'note': '256k DASH audio (format 141) via DASH manifest',
	468	'info_dict': {
	469	'id': 'a9LDPn-MO4I',
	470	'ext': 'm4a',
	471	'upload_date': '20121002',
	472	'uploader_id': '8KVIDEO',
	473	'description': '',
	474	'uploader': '8KVIDEO',
	475	'title': 'UHDTV TEST 8K VIDEO.mp4'
	476	},
	477	'params': {
	478	'youtube_include_dash_manifest': True,
	479	'format': '141',
	480	},
	481	},
	482	# DASH manifest with encrypted signature
	483	{
	484	'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
	485	'info_dict': {
	486	'id': 'IB3lcPjvWLA',
	487	'ext': 'm4a',
	488	'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
	489	'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
	490	'uploader': 'AfrojackVEVO',
	491	'uploader_id': 'AfrojackVEVO',
	492	'upload_date': '20131011',
	493	},
	494	'params': {
	495	'youtube_include_dash_manifest': True,
	496	'format': '141',
	497	},
	498	},
	499	# JS player signature function name containing $
	500	{

1

# coding: utf-8

2

3

from __future__ import unicode_literals

import itertools

import json

import os.path

import random

import re

import time

import traceback

from .common import InfoExtractor, SearchInfoExtractor

15

from ..jsinterp import JSInterpreter

16

from ..swfinterp import SWFInterpreter

17

from ..compat import (

compat_chr,

compat_parse_qs,

compat_urllib_parse,

compat_urllib_parse_unquote,

22

compat_urllib_parse_unquote_plus,

23

compat_urllib_parse_urlparse,

compat_urlparse,

compat_str,

)

from ..utils import (

clean_html,

encode_dict,

error_to_compat_str,

ExtractorError,

float_or_none,

get_element_by_attribute,

get_element_by_id,

int_or_none,

mimetype2ext,

orderedSet,

parse_duration,

remove_quotes,

remove_start,

sanitized_Request,

smuggle_url,

str_to_int,

unescapeHTML,

unified_strdate,

unsmuggle_url,

uppercase_escape,

ISO3166Utils,

)

class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Set the ``PREF`` cookie for ``.youtube.com`` to ``f1=50000000&hl=en``."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Return a list with one 'Youtube' url_result per video id in ``ids``."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed (including when the login page or
        one of the form submissions could not be downloaded).

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        An ExtractorError is also raised when the login response contains the
        ``errormsg_0_Passwd`` element.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Report the failure explicitly, like every other failure path
            # in this method (a bare return would yield None instead).
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        login_data = compat_urllib_parse.urlencode(encode_dict(login_form_strs)).encode('ascii')

        req = sanitized_Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="challenge"', login_results) is not None:
            tfa_code = self._get_tfa_info('2-step verification code')

            if not tfa_code:
                self._downloader.report_warning(
                    'Two-factor authentication required. Provide it either interactively or with --twofactor <code> '
                    '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Drop a leading 'G-' from the code before submitting it
            tfa_code = remove_start(tfa_code, 'G-')

            # Start from the hidden inputs of the challenge form and add the code
            tfa_form_strs = self._form_hidden_inputs('challenge', login_results)

            tfa_form_strs.update({
                'Pin': tfa_code,
                'TrustDevice': 'on',
            })

            tfa_data = compat_urllib_parse.urlencode(encode_dict(tfa_form_strs)).encode('ascii')

            tfa_req = sanitized_Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            # The challenge form being shown again means the code was rejected
            if re.search(r'(?i)<form[^>]* id="challenge"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired or invalid. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        # Still seeing the login form after submitting credentials
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _real_initialize(self):
        """Set the language cookie and attempt to log in, if a downloader is attached."""
        if self._downloader is None:
            return
        self._set_language()
        # The result is deliberately ignored: a failed login has already been
        # reported as a warning by _login() and does not abort initialization.
        self._login()

class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
    # Extract entries from page with "Load more" button
    def _entries(self, page, playlist_id):
        """Yield entries from ``page`` and from each follow-up "Load more" page.

        Every chunk of HTML is handed to ``self._process_page``. The next
        chunk is located through the ``data-uix-load-more-href`` attribute of
        the current "Load more" widget and downloaded as JSON; iteration ends
        when no such attribute is found or the downloaded content is blank.
        """
        current_html = page
        widget_html = page
        page_num = 0
        while True:
            page_num += 1
            for item in self._process_page(current_html):
                yield item

            load_more = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', widget_html)
            if load_more is None:
                return

            next_url = 'https://youtube.com/%s' % load_more.group('more')
            response = self._download_json(
                next_url, playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            current_html = response['content_html']
            if not current_html.strip():
                # Some webpages show a "Load more" button but they don't
                # have more videos
                return
            widget_html = response['load_more_widget_html']

207

208

209

class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    def _process_page(self, content):
        """Yield a 'Youtube' url_result for every (id, title) pair found in ``content``."""
        for vid, title in self.extract_videos_from_page(content):
            yield self.url_result(vid, 'Youtube', vid, title)

    def extract_videos_from_page(self, page):
        """Return (video id, title) pairs for the ``self._VIDEO_RE`` matches in ``page``.

        Each id appears once, in order of first occurrence. When an id is
        matched again, its title is only filled in if the stored one is empty.
        """
        ids_in_page = []
        titles_in_page = []
        # Maps an already-seen video id to its position in the two lists
        position_of = {}
        for match in re.finditer(self._VIDEO_RE, page):
            # The link with index 0 is not the first video of the playlist (not sure if still actual)
            # NOTE(review): this compares the 'id' group, not 'index', with '0' - confirm intent
            if 'index' in match.groupdict() and match.group('id') == '0':
                continue
            video_id = match.group('id')
            title = unescapeHTML(match.group('title'))
            if title:
                title = title.strip()
            seen_at = position_of.get(video_id)
            if seen_at is None:
                position_of[video_id] = len(ids_in_page)
                ids_in_page.append(video_id)
                titles_in_page.append(title)
            elif title and not titles_in_page[seen_at]:
                titles_in_page[seen_at] = title
        return zip(ids_in_page, titles_in_page)

233

234

235

class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    def _process_page(self, content):
        """Yield a 'YoutubePlaylist' url_result for each distinct playlist link in ``content``."""
        found_ids = re.findall(r'href="/?playlist\?list=([0-9A-Za-z-_]{10,})"', content)
        for playlist_id in orderedSet(found_ids):
            playlist_url = 'https://www.youtube.com/playlist?list=%s' % playlist_id
            yield self.url_result(playlist_url, 'YoutubePlaylist')

    def _real_extract(self, url):
        """Download ``url`` and return a playlist result built from its entries.

        The id comes from ``self._match_id(url)``; the title is looked up
        non-fatally with ``self._og_search_title``.
        """
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)
        title = self._og_search_title(webpage, fatal=False)
        entries = self._entries(webpage, playlist_id)
        return self.playlist_result(entries, playlist_id, title)

246

247

248

class YoutubeIE(YoutubeBaseInfoExtractor):

249

IE_DESC = 'YouTube.com'

250

_VALID_URL = r"""(?x)^

251

(

252

(?:https?://|//) # http(s):// or protocol-independent URL

253

(?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|

254

(?:www\.)?deturl\.com/www\.youtube\.com/|

255

(?:www\.)?pwnyoutube\.com/|

256

(?:www\.)?yourepeat\.com/|

257

tube\.majestyc\.net/|

258

youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains

259

(?:.*?\#/)? # handle anchor (#/) redirect urls

260

(?: # the various things that can precede the ID:

261

(?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/

262

|(?: # or the v= param in all its forms

263

(?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)

264

(?:\?|\#!?) # the params delimiter ? or # or #!

265

(?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&v=V36LpHqtcDY)

v=

)

))

|(?:

youtu\.be| # just youtu.be/xxxx

271

vid\.plus # or vid.plus/xxxx

272

)/

273

|(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=

274

)

275

)? # all until now is optional -> you can pass the naked ID

276

([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID

277

(?!.*?&list=) # combined list/video URLs are handled by the playlist IE

278

(?(1).+)? # if we found the ID, everything can follow

279

$"""

280

_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'

281

# tbr was extracted from com/google/youtube/model/VideoFormat.as in watch_as3.swf and converted from Bytes/S to KBits/S

282

_formats = {

283

'5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263', 'tbr': 320},

284

'6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263', 'tbr': 896},

285

'13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v', 'tbr': 60},

286

'17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v', 'tbr': 80},

287

'18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264', 'tbr': 736},

288

'22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'tbr': 3192},

289

'34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'tbr': 928},

290

'35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'tbr': 1280},

291

# itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well

292

'36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v', 'tbr': 256},

293

'37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'tbr': 6192},

294

'38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'tbr': 10128},

295

'43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'tbr': 928},

296

'44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'tbr': 1280},

297

'45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'tbr': 3192},

298

'46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},

299

'59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'tbr': 1280},

300

'78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'tbr': 1280},

# 3D videos

'82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20, 'tbr': 800},

305

'83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20, 'tbr': 1152},

306

'84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20, 'tbr': 3000},

307

'85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20, 'tbr': 6000},

308

'100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},

309

'101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},

310

'102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},

311

312

# Apple HTTP Live Streaming

313

'92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10, 'tbr': 186.625},

314

'93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10, 'tbr': 951.5625},

315

'94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10, 'tbr': 1312.5},

316

'95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10, 'tbr': 3207.421875},

317

'96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10, 'tbr': 6349.21875},

318

'132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},

319

'151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},

320

321

# DASH mp4 video

322

'133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40, 'tbr': 261.71875},

323

'134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40, 'tbr': 951.5625},

324

'135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40, 'tbr': 1312.5},

325

'136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40, 'tbr': 3207.421875},

326

'137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40, 'tbr': 6349.21875},

327

'138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40, 'tbr': 10128.0}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)

328

'160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40, 'tbr': 91.796875},

329

'264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},

330

'298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60, 'preference': -40},

331

'299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60, 'preference': -40},

332

'266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},

333

334

# DASH mp4 audio

335

'139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'preference': -50, 'container': 'm4a_dash', 'tbr': 32},

336

'140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'preference': -50, 'container': 'm4a_dash', 'tbr': 128},

337

'141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'preference': -50, 'container': 'm4a_dash', 'tbr': 320},

338

339

# DASH webm video

340

'167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},

341

'168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},

342

'169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},

343

'170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},

344

'218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},

345

'219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},

346

'278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9', 'preference': -40},

347

'242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},

348

'243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},

349

'244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},

350

'245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},

351

'246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},

352

'247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},

353

'248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},

354

'271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},

355

# itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)

356

'272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},

357

'302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},

358

'303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},

359

'308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},

360

'313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},

361

'315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},

362

363

# Dash webm audio

364

'171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},

365

'172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},

366

367

# Dash webm audio with opus inside

368

'249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},

369

'250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},

370

'251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},

371

372

# RTMP (unnamed)

373

'_rtmp': {'protocol': 'rtmp'},

374

}

375

_SUBTITLE_FORMATS = ('ttml', 'vtt')

IE_NAME = 'youtube'

_TESTS = [

{

'url': 'http://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',

'info_dict': {

'id': 'BaW_jenozKc',

'ext': 'mp4',

'title': 'youtube-dl test video "\'/\\ä↭𝕐',

385

'uploader': 'Philipp Hagemeister',

386

'uploader_id': 'phihag',

387

'upload_date': '20121002',

388

'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',

389

'categories': ['Science & Technology'],

390

'tags': ['youtube-dl'],

391

'like_count': int,

392

'dislike_count': int,

'start_time': 1,

'end_time': 9,

}

},

{

'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',

399

'note': 'Test generic use_cipher_signature video (#897)',

'info_dict': {

'id': 'UxxajLWwzqY',

'ext': 'mp4',

'upload_date': '20120506',

404

'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',

405

'alt_title': 'I Love It (feat. Charli XCX)',

406

'description': 'md5:782e8651347686cba06e58f71ab51773',

407

'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',

408

'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',

409

'iconic ep', 'iconic', 'love', 'it'],

410

'uploader': 'Icona Pop',

411

'uploader_id': 'IconaPop',

412

'creator': 'Icona Pop',

}

},

{

'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',

417

'note': 'Test VEVO video with age protection (#956)',

'info_dict': {

'id': '07FYdnEawAQ',

'ext': 'mp4',

'upload_date': '20130703',

422

'title': 'Justin Timberlake - Tunnel Vision (Explicit)',

423

'alt_title': 'Tunnel Vision',

424

'description': 'md5:64249768eec3bc4276236606ea996373',

425

'uploader': 'justintimberlakeVEVO',

426

'uploader_id': 'justintimberlakeVEVO',

427

'creator': 'Justin Timberlake',

'age_limit': 18,

}

},

{

'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',

433

'note': 'Embed-only video (#1746)',

'info_dict': {

'id': 'yZIXLfi8CZQ',

'ext': 'mp4',

'upload_date': '20120608',

438

'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',

439

'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',

440

'uploader': 'SET India',

441

'uploader_id': 'setindia',

'age_limit': 18,

}

},

{

'url': 'http://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY',

447

'note': 'Use the first video ID in the URL',

'info_dict': {

'id': 'BaW_jenozKc',

'ext': 'mp4',

'title': 'youtube-dl test video "\'/\\ä↭𝕐',

452

'uploader': 'Philipp Hagemeister',

453

'uploader_id': 'phihag',

454

'upload_date': '20121002',

455

'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',

456

'categories': ['Science & Technology'],

457

'tags': ['youtube-dl'],

458

'like_count': int,

459

'dislike_count': int,

460

},

461

'params': {

462

'skip_download': True,

},

},

{

'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',

467

'note': '256k DASH audio (format 141) via DASH manifest',

'info_dict': {

'id': 'a9LDPn-MO4I',

'ext': 'm4a',

'upload_date': '20121002',

472

'uploader_id': '8KVIDEO',

473

'description': '',

474

'uploader': '8KVIDEO',

475

'title': 'UHDTV TEST 8K VIDEO.mp4'

476

},

477

'params': {

478

'youtube_include_dash_manifest': True,

'format': '141',

},

},

# DASH manifest with encrypted signature

483

{

484

'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',

'info_dict': {

'id': 'IB3lcPjvWLA',

'ext': 'm4a',

'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',

489

'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',

490

'uploader': 'AfrojackVEVO',

491

'uploader_id': 'AfrojackVEVO',

492

'upload_date': '20131011',

493

},

494

'params': {

495

'youtube_include_dash_manifest': True,

'format': '141',

},

},

# JS player signature function name containing $

500

{

501

'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',

'info_dict': {

'id': 'nfWlot6h_JM',

'ext': 'm4a',

'title': 'Taylor Swift - Shake It Off',

506

'alt_title': 'Shake It Off',

507

'description': 'md5:95f66187cd7c8b2c13eb78e1223b63c3',

508

'uploader': 'TaylorSwiftVEVO',

509

'uploader_id': 'TaylorSwiftVEVO',

510

'upload_date': '20140818',

511

'creator': 'Taylor Swift',

512

},

513

'params': {

514

'youtube_include_dash_manifest': True,

'format': '141',

},

},

# Controversy video

{

'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',

'info_dict': {

'id': 'T4XJQO3qol8',

'ext': 'mp4',

'upload_date': '20100909',

525

'uploader': 'The Amazing Atheist',

526

'uploader_id': 'TheAmazingAtheist',

527

'title': 'Burning Everyone\'s Koran',

528

'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',

529

}

530

},

531

# Normal age-gate video (No vevo, embed allowed)

532

{

533

'url': 'http://youtube.com/watch?v=HtVdAasjOgU',

'info_dict': {

'id': 'HtVdAasjOgU',

'ext': 'mp4',

'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',

538

'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',

539

'uploader': 'The Witcher',

540

'uploader_id': 'WitcherGame',

541

'upload_date': '20140605',

'age_limit': 18,

},

},

# Age-gate video with encrypted signature

546

{

547

'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU',

'info_dict': {

'id': '6kLq3WMV1nU',

'ext': 'mp4',

'title': 'Dedication To My Ex (Miss That) (Lyric Video)',

552

'description': 'md5:33765bb339e1b47e7e72b5490139bb41',

553

'uploader': 'LloydVEVO',

554

'uploader_id': 'LloydVEVO',

555

'upload_date': '20110629',

'age_limit': 18,

},

},

# video_info is None (https://github.com/rg3/youtube-dl/issues/4421)

560

{

561

'url': '__2ABJjxzNo',

'info_dict': {

'id': '__2ABJjxzNo',

'ext': 'mp4',

'upload_date': '20100430',

566

'uploader_id': 'deadmau5',

567

'creator': 'deadmau5',

568

'description': 'md5:12c56784b8032162bb936a5f76d55360',

569

'uploader': 'deadmau5',

570

'title': 'Deadmau5 - Some Chords (HD)',

571

'alt_title': 'Some Chords',

572

},

573

'expected_warnings': [

574

'DASH manifest missing',

575

]

576

},

577

# Olympics (https://github.com/rg3/youtube-dl/issues/4431)

578

{

579

'url': 'lqQg6PlCWgI',

'info_dict': {

'id': 'lqQg6PlCWgI',

'ext': 'mp4',

'upload_date': '20150827',

584

'uploader_id': 'olympic',

585

'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',

586

'uploader': 'Olympics',

587

'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',

588

},

589

'params': {

590

'skip_download': 'requires avconv',

}

},

# Non-square pixels

{

'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',

'info_dict': {

'id': '_b-2C3KPAM0',

'ext': 'mp4',

'stretched_ratio': 16 / 9.,

600

'upload_date': '20110310',

601

'uploader_id': 'AllenMeow',

602

'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',

603

'uploader': '孫艾倫',

604

'title': '[A-made] 變態妍字幕版太妍我就是這樣的人',

605

},

606

},

607

# url_encoded_fmt_stream_map is empty string

608

{

609

'url': 'qEJwOuvDf7I',

'info_dict': {

'id': 'qEJwOuvDf7I',

'ext': 'webm',

'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',

614

'description': '',

615

'upload_date': '20150404',

616

'uploader_id': 'spbelect',

617

'uploader': 'Наблюдатели Петербурга',

618

},

619

'params': {

620

'skip_download': 'requires avconv',

621

},

622

'skip': 'This live event has ended.',

623

},

624

# Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097)

625

{

626

'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',

'info_dict': {

'id': 'FIl7x6_3R5Y',

'ext': 'mp4',

'title': 'md5:7b81415841e02ecd4313668cde88737a',

631

'description': 'md5:116377fd2963b81ec4ce64b542173306',

632

'upload_date': '20150625',

633

'uploader_id': 'dorappi2000',

634

'uploader': 'dorappi2000',

635

'formats': 'mincount:33',

636

},

637

},

638

# DASH manifest with segment_list

639

{

640

'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',

641

'md5': '8ce563a1d667b599d21064e982ab9e31',

'info_dict': {

'id': 'CsmdDsKjzN8',

'ext': 'mp4',

'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510

646

'uploader': 'Airtek',

647

'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',

648

'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',

649

'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',

650

},

651

'params': {

652

'youtube_include_dash_manifest': True,

653

'format': '135', # bestvideo

}

},

{

# Multifeed videos (multiple cameras), URL is for Main Camera

658

'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',

659

'info_dict': {

660

'id': 'jqWvoWXjCVs',

661

'title': 'teamPGP: Rocket League Noob Stream',

662

'description': 'md5:dc7872fb300e143831327f1bae3af010',

},

'playlist': [{

'info_dict': {

'id': 'jqWvoWXjCVs',

'ext': 'mp4',

'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',

669

'description': 'md5:dc7872fb300e143831327f1bae3af010',

670

'upload_date': '20150721',

671

'uploader': 'Beer Games Beer',

672

'uploader_id': 'beergamesbeer',

},

}, {

'info_dict': {

'id': '6h8e8xoXJzg',

'ext': 'mp4',

'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',

679

'description': 'md5:dc7872fb300e143831327f1bae3af010',

680

'upload_date': '20150721',

681

'uploader': 'Beer Games Beer',

682

'uploader_id': 'beergamesbeer',

},

}, {

'info_dict': {

'id': 'PUOgX5z9xZw',

'ext': 'mp4',

'title': 'teamPGP: Rocket League Noob Stream (grizzle)',

689

'description': 'md5:dc7872fb300e143831327f1bae3af010',

690

'upload_date': '20150721',

691

'uploader': 'Beer Games Beer',

692

'uploader_id': 'beergamesbeer',

},

}, {

'info_dict': {

'id': 'teuwxikvS5k',

'ext': 'mp4',

'title': 'teamPGP: Rocket League Noob Stream (zim)',

699

'description': 'md5:dc7872fb300e143831327f1bae3af010',

700

'upload_date': '20150721',

701

'uploader': 'Beer Games Beer',

702

'uploader_id': 'beergamesbeer',

},

}],

'params': {

'skip_download': True,

},

},

{

# Multifeed video with comma in title (see https://github.com/rg3/youtube-dl/issues/8536)

711

'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',

712

'info_dict': {

713

'id': 'gVfLd0zydlo',

714

'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',

},

'playlist_count': 2,

},

{

'url': 'http://vid.plus/FlRa-iH7PGw',

720

'only_matching': True,

721

},

722

{

723

# Title with JS-like syntax "};" (see https://github.com/rg3/youtube-dl/issues/7468)

724

# Also tests cut-off URL expansion in video description (see

725

# https://github.com/rg3/youtube-dl/issues/1892,

726

# https://github.com/rg3/youtube-dl/issues/8164)

727

'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',

'info_dict': {

'id': 'lsguqyKfVQg',

'ext': 'mp4',

'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',

732

'alt_title': 'Dark Walk',

733

'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',

734

'upload_date': '20151119',

735

'uploader_id': 'IronSoulElf',

736

'uploader': 'IronSoulElf',

737

'creator': 'Todd Haberman, Daniel Law Heath & Aaron Kaplan',

738

},

739

'params': {

740

'skip_download': True,

},

},

{

# Tags with '};' (see https://github.com/rg3/youtube-dl/issues/7468)

745

'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',

746

'only_matching': True,

747

},

748

{

749

# Video with yt:stretch=17:0

750

'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',

'info_dict': {

'id': 'Q39EVAstoRM',

'ext': 'mp4',

'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',

755

'description': 'md5:ee18a25c350637c8faff806845bddee9',

756

'upload_date': '20151107',

757

'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',

758

'uploader': 'CH GAMER DROID',

759

},

760

'params': {

761

'skip_download': True,

},

},

{

'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;v=V36LpHqtcDY',

766

'only_matching': True,

}

]

def __init__(self, *args, **kwargs):
    """Initialise the extractor and start with an empty player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Signature functions already extracted by this instance; filled and
    # read by _decrypt_signature.
    self._player_cache = {}

773

774

def report_video_info_webpage_download(self, video_id):
    """Report attempt to download video info webpage."""
    message = '%s: Downloading video info webpage' % video_id
    self.to_screen(message)

777

778

def report_information_extraction(self, video_id):
    """Report attempt to extract video information."""
    message = '%s: Extracting video information' % video_id
    self.to_screen(message)

781

782

def report_unavailable_format(self, video_id, format):
    """Report that the requested format is not available for the video."""
    message = '%s: Format %s not available' % (video_id, format)
    self.to_screen(message)

785

786

def report_rtmp_download(self):
    """Indicate the download will use the RTMP protocol."""
    message = 'RTMP download detected'
    self.to_screen(message)

789

790

def _signature_cache_id(self, example_sig):
    """Return a string describing the shape of a signature.

    The result is the length of every dot-separated part of *example_sig*,
    joined with dots again.
    """
    part_lengths = [compat_str(len(part)) for part in example_sig.split('.')]
    return '.'.join(part_lengths)

793

794

def _extract_signature_function(self, video_id, player_url, example_sig):
    """Return a callable that deciphers signatures for the given player.

    The player id and type ('js' or 'swf') are parsed from *player_url*.
    If the 'youtube-sigfuncs' cache already holds an entry for this player
    and signature shape, the callable is rebuilt from it without any
    download.  Otherwise the player is downloaded and parsed, and the
    resulting function is stored in the cache as a list of indices.

    Raises ExtractorError if the player URL cannot be parsed, if the cache
    id is not a plain file name, or if the player type is neither 'js' nor
    'swf'.
    """
    id_m = re.match(
        r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|/base)?\.(?P<ext>[a-z]+)$',
        player_url)
    if not id_m:
        raise ExtractorError('Cannot identify player %r' % player_url)
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%s' % (
        player_type, player_id, self._signature_cache_id(example_sig))
    # func_id is handed to the cache as the entry key, so it must not carry
    # a directory component.  Checked with an explicit raise because an
    # assert would be stripped when Python runs with -O.
    if os.path.basename(func_id) != func_id:
        raise ExtractorError('Invalid signature cache id %r' % func_id)

    cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
    if cache_spec is not None:
        # cache_spec[n] is the input position that output character n is
        # taken from.
        return lambda s: ''.join(s[i] for i in cache_spec)

    download_note = (
        'Downloading player %s' % player_url
        if self._downloader.params.get('verbose') else
        'Downloading %s player %s' % (player_type, player_id)
    )
    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note=download_note,
            errnote='Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note=download_note,
            errnote='Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        # Explicit raise instead of `assert False`: with asserts stripped
        # the code below would fail on the unbound name `res`.
        raise ExtractorError('Invalid player type %r' % player_type)

    # Run the function on a string whose n-th character has code point n;
    # the code points of the output are then the source positions.
    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = res(test_string)
    cache_spec = [ord(c) for c in cache_res]

    self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
    return res

839

840

def _print_sig_code(self, func, example_sig):
    """Print Python code equivalent to the signature function *func*.

    *func* is run on a string whose n-th character has code point n and as
    many characters as *example_sig*; the code points of the result give
    the source position of every output character.  Those positions are
    written as a sum of index and slice expressions on ``s`` and sent to
    the screen together with a guard on the signature's part lengths.
    """
    def gen_sig_code(idxs):
        """Yield index/slice expressions on ``s`` reproducing *idxs*."""
        def _genslice(start, end, step):
            starts = '' if start == 0 else str(start)
            # A stop below zero cannot be written as a number for a
            # backwards slice, so the slice is left open-ended.
            ends = (':%d' % (end + step)) if end + step >= 0 else ':'
            steps = '' if step == 1 else (':%d' % step)
            return 's[%s%s%s]' % (starts, ends, steps)

        if not idxs:
            # Nothing to express for an empty index list.
            return
        step = None
        # Quelch pyflakes warnings - start will be set when step is set
        start = '(Never used)'
        # Seed i so that a one-element list, for which the loop below never
        # runs, still emits its only index instead of hitting an unbound i.
        i = idxs[0]
        for i, prev in zip(idxs[1:], idxs[:-1]):
            if step is not None:
                if i - prev == step:
                    # Still inside the current run.
                    continue
                yield _genslice(start, prev, step)
                step = None
                continue
            if i - prev in [-1, 1]:
                # A run of consecutive indices starts at prev.
                step = i - prev
                start = prev
                continue
            else:
                yield 's[%d]' % prev
        if step is None:
            yield 's[%d]' % i
        else:
            yield _genslice(start, i, step)

    test_string = ''.join(map(compat_chr, range(len(example_sig))))
    cache_res = func(test_string)
    cache_spec = [ord(c) for c in cache_res]
    expr_code = ' + '.join(gen_sig_code(cache_spec))
    signature_id_tuple = '(%s)' % (
        ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
    code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
            ' return %s\n') % (signature_id_tuple, expr_code)
    self.to_screen('Extracted signature function:\n' + code)

878

879

def _parse_sig_js(self, jscode):
    """Return a callable wrapping the signature function found in *jscode*.

    The function name is the identifier called right after ``.sig||`` in
    the player code.
    """
    funcname = self._search_regex(
        r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode,
        'Initial JS player signature function name')

    initial_function = JSInterpreter(jscode).extract_function(funcname)

    def decipher(s):
        # The extracted function takes its arguments as a list.
        return initial_function([s])

    return decipher

887

888

def _parse_sig_swf(self, file_contents):
    """Return a callable wrapping ``SignatureDecipher.decipher`` of the SWF."""
    TARGET_CLASSNAME = 'SignatureDecipher'
    interpreter = SWFInterpreter(file_contents)
    decipher_class = interpreter.extract_class(TARGET_CLASSNAME)
    initial_function = interpreter.extract_function(decipher_class, 'decipher')

    def decipher(s):
        # The extracted function takes its arguments as a list.
        return initial_function([s])

    return decipher

894

895

def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
    """Turn the encrypted s field into a working signature

    The signature function is looked up in (and added to) the per-instance
    player cache, keyed by player URL and signature shape.  Any failure
    while obtaining or applying it is re-raised as ExtractorError carrying
    the formatted traceback.
    """
    if player_url is None:
        raise ExtractorError('Cannot decrypt signature without player_url')

    # Protocol-relative player URLs are completed with https.
    if player_url.startswith('//'):
        player_url = 'https:' + player_url
    try:
        cache_key = (player_url, self._signature_cache_id(s))
        if cache_key not in self._player_cache:
            self._player_cache[cache_key] = self._extract_signature_function(
                video_id, player_url, s)
        decipher = self._player_cache[cache_key]
        if self._downloader.params.get('youtube_print_sig_code'):
            self._print_sig_code(decipher, s)
        return decipher(s)
    except Exception as e:
        raise ExtractorError(
            'Signature extraction failed: ' + traceback.format_exc(), cause=e)

918

919

def _get_subtitles(self, video_id, webpage):
    """Return a dict mapping language codes to lists of subtitle formats.

    Each format is a dict with 'url' and 'ext', one per entry of
    _SUBTITLE_FORMATS.  An empty dict is returned, after a warning, when
    the track list cannot be downloaded or contains no track.
    """
    try:
        subs_doc = self._download_xml(
            'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
            video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
        return {}

    sub_lang_list = {}
    for track in subs_doc.findall('track'):
        lang = track.attrib['lang_code']
        # Only the first track of every language is used.
        if lang not in sub_lang_list:
            sub_lang_list[lang] = [{
                'url': 'https://www.youtube.com/api/timedtext?' + compat_urllib_parse.urlencode({
                    'lang': lang,
                    'v': video_id,
                    'fmt': ext,
                    'name': track.attrib['name'].encode('utf-8'),
                }),
                'ext': ext,
            } for ext in self._SUBTITLE_FORMATS]
    if sub_lang_list:
        return sub_lang_list
    self._downloader.report_warning('video doesn\'t have subtitles')
    return {}

def _get_ytplayer_config(self, video_id, webpage):
    """Return the parsed ``ytplayer.config`` object from *webpage*.

    Returns None when no pattern matches; a JSON parsing failure is not
    fatal (fatal=False is passed to _parse_json).
    """
    # User data may contain arbitrary character sequences that may affect
    # JSON extraction with regex, e.g. when '};' is contained the second
    # regex won't capture the whole JSON. Yet working around by trying more
    # concrete regex first keeping in mind proper quoted string handling
    # to be implemented in future that will replace this workaround (see
    # https://github.com/rg3/youtube-dl/issues/7468,
    # https://github.com/rg3/youtube-dl/pull/7599)
    patterns = (
        r';ytplayer\.config\s*=\s*({.+?});ytplayer',
        r';ytplayer\.config\s*=\s*({.+?});',
    )
    config = self._search_regex(
        patterns, webpage, 'ytplayer.config', default=None)
    if not config:
        return None
    return self._parse_json(
        uppercase_escape(config), video_id, fatal=False)

968

969

def _get_automatic_captions(self, video_id, webpage):
    """Return the automatic (translated) caption formats for a video.

    The webpage is passed in so the captions URL can be taken from the
    already-downloaded player config instead of fetching the page again.

    The result maps each target language code to a list of
    ``{'url': ..., 'ext': ...}`` entries, one per extension in
    ``self._SUBTITLE_FORMATS``. An empty dict is returned, after a
    warning, when the player config or the caption data is unavailable.
    """
    self.to_screen('%s: Looking for automatic captions' % video_id)
    player_config = self._get_ytplayer_config(video_id, webpage)
    err_msg = 'Couldn\'t find automatic captions for %s' % video_id
    if not player_config:
        self._downloader.report_warning(err_msg)
        return {}
    try:
        args = player_config['args']
        caption_url = args.get('ttsurl')
        if caption_url:
            timestamp = args['timestamp']
            # We get the available subtitles
            list_query = compat_urllib_parse.urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            caption_list = self._download_xml(
                caption_url + '&' + list_query, video_id)
            source_track = caption_list.find('track')
            if source_track is None:
                self._downloader.report_warning('Video doesn\'t have automatic captions')
                return {}
            original_lang = source_track.attrib['lang_code']
            caption_kind = source_track.attrib.get('kind', '')

            captions = {}
            for target in caption_list.findall('target'):
                target_lang = target.attrib['lang_code']
                entries = []
                for ext in self._SUBTITLE_FORMATS:
                    query = compat_urllib_parse.urlencode({
                        'lang': original_lang,
                        'tlang': target_lang,
                        'fmt': ext,
                        'ts': timestamp,
                        'kind': caption_kind,
                    })
                    entries.append({
                        'url': caption_url + '&' + query,
                        'ext': ext,
                    })
                captions[target_lang] = entries
            return captions

        # Some videos don't provide ttsurl but rather caption_tracks and
        # caption_translation_languages (e.g. 20LmZk1hakA)
        caption_tracks = args['caption_tracks']
        caption_translation_languages = args['caption_translation_languages']
        first_track = caption_tracks.split(',')[0]
        caption_url = compat_parse_qs(first_track)['u'][0]
        parsed_caption_url = compat_urlparse.urlparse(caption_url)
        caption_qs = compat_parse_qs(parsed_caption_url.query)

        captions = {}
        for lang_spec in caption_translation_languages.split(','):
            lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang_spec))
            target_lang = lang_qs.get('lc', [None])[0]
            if not target_lang:
                continue
            entries = []
            for ext in self._SUBTITLE_FORMATS:
                # The shared query dict is updated in place for each format
                caption_qs['tlang'] = [target_lang]
                caption_qs['fmt'] = [ext]
                new_query = compat_urllib_parse.urlencode(caption_qs, True)
                entries.append({
                    'url': compat_urlparse.urlunparse(
                        parsed_caption_url._replace(query=new_query)),
                    'ext': ext,
                })
            captions[target_lang] = entries
        return captions
    # An extractor error can be raised by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}

1050

1051

def _mark_watched(self, video_id, video_info):

1052

playback_url = video_info.get('videostats_playback_base_url', [None])[0]

1053

if not playback_url:

1054

return

1055

parsed_playback_url = compat_urlparse.urlparse(playback_url)

1056

qs = compat_urlparse.parse_qs(parsed_playback_url.query)

1057

1058

# cpn generation algorithm is reverse engineered from base.js.

1059

# In fact it works even with dummy cpn.

1060

CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'

1061

cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)))

qs.update({

'ver': ['2'],

'cpn': [cpn],

})

playback_url = compat_urlparse.urlunparse(

1068

parsed_playback_url._replace(query=compat_urllib_parse.urlencode(qs, True)))

1069

1070

self._download_webpage(

1071

playback_url, video_id, 'Marking watched',

1072

'Unable to mark watched', fatal=False)

1073

1074

@classmethod
def extract_id(cls, url):
    """Return the video id captured by group 2 of ``cls._VALID_URL``.

    The pattern is matched in verbose mode. Raises ExtractorError when
    ``url`` does not match.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError('Invalid URL: %s' % url)

1081

1082

def _extract_from_m3u8(self, manifest_url, video_id):

1083

url_map = {}

1084

1085

def _get_urls(_manifest):

1086

lines = _manifest.split('\n')

1087

urls = filter(lambda l: l and not l.startswith('#'),

1088

lines)

1089

return urls

1090

manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')

1091

formats_urls = _get_urls(manifest)

1092

for format_url in formats_urls:

1093

itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')

1094

url_map[itag] = format_url

1095

return url_map

1096

1097

def _extract_annotations(self, video_id):

1098

url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id

1099

return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')

1100

1101

def _real_extract(self, url):
    """Extract metadata and formats for a single YouTube video.

    Steps, in order: parse start/end times from the URL, download the
    watch page, obtain ``video_info`` (from the embed page plus
    get_video_info for age-gated videos, otherwise from the page's player
    config and/or get_video_info), scrape metadata from the watch page,
    build the format list (RTMP, url_encoded_fmt_stream_map/adaptive_fmts
    or HLS), merge in DASH manifest formats, and return the info dict.

    Returns a playlist result instead when the video carries multifeed
    metadata and neither ``noplaylist`` nor the smuggled
    ``force_singlefeed`` flag is set.

    Raises ExtractorError when no usable ``video_info`` token is found,
    for rental or rtmpe videos, when the uploader name is missing, or
    when no format information is available.
    """
    url, smuggled_data = unsmuggle_url(url, {})

    proto = (
        'http' if self._downloader.params.get('prefer_insecure', False)
        else 'https')

    start_time = None
    end_time = None
    parsed_url = compat_urllib_parse_urlparse(url)
    # The fragment is inspected first, so its values win over the query's
    for component in [parsed_url.fragment, parsed_url.query]:
        query = compat_parse_qs(component)
        if start_time is None and 't' in query:
            start_time = parse_duration(query['t'][0])
        if start_time is None and 'start' in query:
            start_time = parse_duration(query['start'][0])
        if end_time is None and 'end' in query:
            end_time = parse_duration(query['end'][0])

    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
    video_id = self.extract_id(url)

    # Get video webpage
    url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
    video_webpage = self._download_webpage(url, video_id)

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    dash_mpds = []

    def add_dash_mpd(video_info):
        # Remember each distinct DASH manifest URL, preserving order
        dash_mpd = video_info.get('dashmpd')
        if dash_mpd and dash_mpd[0] not in dash_mpds:
            dash_mpds.append(dash_mpd[0])

    # Get video info
    embed_webpage = None
    is_live = None
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        url = proto + '://www.youtube.com/embed/%s' % video_id
        embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
        data = compat_urllib_parse.urlencode({
            'video_id': video_id,
            'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            'sts': self._search_regex(
                r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
        })
        video_info_url = proto + '://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(
            video_info_url, video_id,
            note='Refetching age-gated info webpage',
            errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
        add_dash_mpd(video_info)
    else:
        age_gate = False
        video_info = None
        # Try looking directly into the video webpage
        ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
        if ytplayer_config:
            args = ytplayer_config['args']
            if args.get('url_encoded_fmt_stream_map'):
                # Convert to the same format returned by compat_parse_qs
                video_info = dict((k, [v]) for k, v in args.items())
                add_dash_mpd(video_info)
            if args.get('livestream') == '1' or args.get('live_playback') == 1:
                is_live = True
        if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
            # We also try looking in get_video_info since it may contain different dashmpd
            # URL that points to a DASH manifest with possibly different itag set (some itags
            # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
            # manifest pointed by get_video_info's dashmpd).
            # The general idea is to take a union of itags of both DASH manifests (for example
            # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
            self.report_video_info_webpage_download(video_id)
            for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']:
                video_info_url = (
                    '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (proto, video_id, el_type))
                video_info_webpage = self._download_webpage(
                    video_info_url,
                    video_id, note=False,
                    errnote='unable to download video info webpage')
                get_video_info = compat_parse_qs(video_info_webpage)
                if get_video_info.get('use_cipher_signature') != ['True']:
                    add_dash_mpd(get_video_info)
                if not video_info:
                    video_info = get_video_info
                if 'token' in get_video_info:
                    # Different get_video_info requests may report different results, e.g.
                    # some may report video unavailability, but some may serve it without
                    # any complaint (see https://github.com/rg3/youtube-dl/issues/7362,
                    # the original webpage as well as el=info and el=embedded get_video_info
                    # requests report video unavailability due to geo restriction while
                    # el=detailpage succeeds and returns valid data). This is probably
                    # due to YouTube measures against IP ranges of hosting providers.
                    # Working around by preferring the first succeeded video_info containing
                    # the token if no such video_info yet was found.
                    if 'token' not in video_info:
                        video_info = get_video_info
                    break
    if 'token' not in video_info:
        if 'reason' in video_info:
            if 'The uploader has not made this video available in your country.' in video_info['reason']:
                regions_allowed = self._html_search_meta('regionsAllowed', video_webpage, default=None)
                if regions_allowed:
                    raise ExtractorError('YouTube said: This video is available in %s only' % (
                        ', '.join(map(ISO3166Utils.short2full, regions_allowed.split(',')))),
                        expected=True)
            raise ExtractorError(
                'YouTube said: %s' % video_info['reason'][0],
                expected=True, video_id=video_id)
        else:
            raise ExtractorError(
                '"token" parameter not in video info for unknown reason',
                video_id=video_id)

    # title
    if 'title' in video_info:
        video_title = video_info['title'][0]
    else:
        self._downloader.report_warning('Unable to extract video title')
        video_title = '_'

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        # Replace shortened links ("text...") by the full target they point to
        video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                (?:title|href)="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="(?:yt-uix-redirect-link|yt-uix-sessionlink[^"]*)"[^>]*>
            [^<]+\.{3}\s*
            </a>
        ''', r'\1', video_description)
        video_description = clean_html(video_description)
    else:
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = ''

    if 'multifeed_metadata_list' in video_info and not smuggled_data.get('force_singlefeed', False):
        if not self._downloader.params.get('noplaylist'):
            entries = []
            feed_ids = []
            multifeed_metadata_list = video_info['multifeed_metadata_list'][0]
            for feed in multifeed_metadata_list.split(','):
                # Unquote should take place before split on comma (,) since textual
                # fields may contain comma as well (see
                # https://github.com/rg3/youtube-dl/issues/8536)
                feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
                entries.append({
                    '_type': 'url_transparent',
                    'ie_key': 'Youtube',
                    'url': smuggle_url(
                        '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
                        {'force_singlefeed': True}),
                    'title': '%s (%s)' % (video_title, feed_data['title'][0]),
                })
                feed_ids.append(feed_data['id'][0])
            self.to_screen(
                'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
                % (', '.join(feed_ids), video_id))
            return self.playlist_result(entries, video_id, video_title, video_description)
        self.to_screen('Downloading just video %s because of --no-playlist' % video_id)

    if 'view_count' in video_info:
        view_count = int(video_info['view_count'][0])
    else:
        view_count = None

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError('"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError('Unable to extract uploader name')
    video_uploader = compat_urllib_parse_unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning('unable to extract uploader nickname')

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning('unable to extract video thumbnail')
        video_thumbnail = None
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = self._html_search_meta(
        'datePublished', video_webpage, 'upload date', default=None)
    if not upload_date:
        upload_date = self._search_regex(
            [r'(?s)id="eow-date.*?>(.*?)</span>',
             r'id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live|Started) on (.+?)</strong>'],
            video_webpage, 'upload date', default=None)
        if upload_date:
            # Normalise separators in the date text that was just extracted.
            # This must use upload_date itself: mobj at this point still
            # holds the uploader-id match (or None) from further above.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', upload_date).split())
    upload_date = unified_strdate(upload_date)

    m_music = re.search(
        r'<h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*<ul[^>]*>\s*<li>(?P<title>.+?) by (?P<creator>.+?)(?:$.+?$)?</li',
        video_webpage)
    if m_music:
        video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
        video_creator = clean_html(m_music.group('creator'))
    else:
        video_alt_title = video_creator = None

    m_cat_container = self._search_regex(
        r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
        video_webpage, 'categories', default=None)
    if m_cat_container:
        category = self._html_search_regex(
            r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
            default=None)
        video_categories = None if category is None else [category]
    else:
        video_categories = None

    video_tags = [
        unescapeHTML(m.group('content'))
        for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]

    def _extract_count(count_name):
        return str_to_int(self._search_regex(
            r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
            % re.escape(count_name),
            video_webpage, count_name, default=None))

    like_count = _extract_count('like')
    dislike_count = _extract_count('dislike')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)
    automatic_captions = self.extract_automatic_captions(video_id, video_webpage)

    if 'length_seconds' not in video_info:
        self._downloader.report_warning('unable to extract video duration')
        video_duration = None
    else:
        video_duration = int(compat_urllib_parse_unquote_plus(video_info['length_seconds'][0]))

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    def _map_to_format_list(urlmap):
        formats = []
        for itag, video_real_url in urlmap.items():
            dct = {
                'format_id': itag,
                'url': video_real_url,
                'player_url': player_url,
            }
            if itag in self._formats:
                dct.update(self._formats[itag])
            formats.append(dct)
        return formats

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        formats = [{
            'format_id': '_rtmp',
            'protocol': 'rtmp',
            'url': video_info['conn'][0],
            'player_url': player_url,
        }]
    elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1:
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        fmt_list = video_info.get('fmt_list', [''])[0]
        if fmt_list:
            for fmt in fmt_list.split(','):
                spec = fmt.split('/')
                width, height = spec[1].split('x')
                self._formats[spec[0]].update({
                    'resolution': spec[1],
                    'width': int_or_none(width),
                    'height': int_or_none(height),
                })
        formats = []
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' not in url_data or 'url' not in url_data:
                continue
            format_id = url_data['itag'][0]
            url = url_data['url'][0]

            if 'sig' in url_data:
                url += '&signature=' + url_data['sig'][0]
            elif 's' in url_data:
                encrypted_sig = url_data['s'][0]
                ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'

                jsplayer_url_json = self._search_regex(
                    ASSETS_RE,
                    embed_webpage if age_gate else video_webpage,
                    'JS player URL (1)', default=None)
                if not jsplayer_url_json and not age_gate:
                    # We need the embed website after all
                    if embed_webpage is None:
                        embed_url = proto + '://www.youtube.com/embed/%s' % video_id
                        embed_webpage = self._download_webpage(
                            embed_url, video_id, 'Downloading embed webpage')
                    jsplayer_url_json = self._search_regex(
                        ASSETS_RE, embed_webpage, 'JS player URL')

                player_url = json.loads(jsplayer_url_json)
                if player_url is None:
                    player_url_json = self._search_regex(
                        r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
                        video_webpage, 'age gate player URL')
                    player_url = json.loads(player_url_json)

                if self._downloader.params.get('verbose'):
                    if player_url is None:
                        player_version = 'unknown'
                        player_desc = 'unknown'
                    else:
                        if player_url.endswith('swf'):
                            player_version = self._search_regex(
                                r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
                                'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', r'(?:www|player)-([^/]+)/base\.js'],
                                player_url,
                                'html5 player', fatal=False)
                            player_desc = 'html5 player %s' % player_version

                    parts_sizes = self._signature_cache_id(encrypted_sig)
                    self.to_screen('{%s} signature length %s, %s' %
                                   (format_id, parts_sizes, player_desc))

                signature = self._decrypt_signature(
                    encrypted_sig, video_id, player_url, age_gate)
                url += '&signature=' + signature
            if 'ratebypass' not in url:
                url += '&ratebypass=yes'

            dct = {
                'format_id': format_id,
                'url': url,
                'player_url': player_url,
            }
            if format_id in self._formats:
                dct.update(self._formats[format_id])

            # Some itags are not included in DASH manifest thus corresponding formats will
            # lack metadata (see https://github.com/rg3/youtube-dl/pull/5993).
            # Trying to extract metadata from url_encoded_fmt_stream_map entry.
            mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
            width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)

            more_fields = {
                'filesize': int_or_none(url_data.get('clen', [None])[0]),
                'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000),
                'width': width,
                'height': height,
                'fps': int_or_none(url_data.get('fps', [None])[0]),
                'format_note': url_data.get('quality_label', [None])[0] or url_data.get('quality', [None])[0],
            }
            # Only truthy values override what self._formats provided
            for key, value in more_fields.items():
                if value:
                    dct[key] = value
            type_ = url_data.get('type', [None])[0]
            if type_:
                type_split = type_.split(';')
                kind_ext = type_split[0].split('/')
                if len(kind_ext) == 2:
                    kind, _ = kind_ext
                    dct['ext'] = mimetype2ext(type_split[0])
                    if kind in ('audio', 'video'):
                        codecs = None
                        for mobj in re.finditer(
                                r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
                            if mobj.group('key') == 'codecs':
                                codecs = mobj.group('val')
                                break
                        if codecs:
                            codecs = codecs.split(',')
                            if len(codecs) == 2:
                                acodec, vcodec = codecs[1], codecs[0]
                            else:
                                acodec, vcodec = (codecs[0], 'none') if kind == 'audio' else ('none', codecs[0])
                            dct.update({
                                'acodec': acodec,
                                'vcodec': vcodec,
                            })
            formats.append(dct)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        formats = _map_to_format_list(url_map)
        # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
        for a_format in formats:
            a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
    else:
        unavailable_message = self._html_search_regex(
            r'(?s)<h1[^>]+id="unavailable-message"[^>]*>(.+?)</h1>',
            video_webpage, 'unavailable message', default=None)
        if unavailable_message:
            raise ExtractorError(unavailable_message, expected=True)
        raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    # Look for the DASH manifest
    if self._downloader.params.get('youtube_include_dash_manifest', True):
        dash_mpd_fatal = True
        for mpd_url in dash_mpds:
            dash_formats = {}
            try:
                def decrypt_sig(mobj):
                    s = mobj.group(1)
                    dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
                    return '/signature/%s' % dec_s

                mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)

                for df in self._extract_mpd_formats(
                        mpd_url, video_id, fatal=dash_mpd_fatal,
                        formats_dict=self._formats):
                    # Do not overwrite DASH format found in some previous DASH manifest
                    if df['format_id'] not in dash_formats:
                        dash_formats[df['format_id']] = df
                    # Additional DASH manifests may end up in HTTP Error 403 therefore
                    # allow them to fail without bug report message if we already have
                    # some DASH manifest succeeded. This is temporary workaround to reduce
                    # burst of bug reports until we figure out the reason and whether it
                    # can be fixed at all.
                    dash_mpd_fatal = False
            except (ExtractorError, KeyError) as e:
                self.report_warning(
                    'Skipping DASH manifest: %r' % e, video_id)
            if dash_formats:
                # Remove the formats we found through non-DASH, they
                # contain less info and it can be wrong, because we use
                # fixed values (for example the resolution). See
                # https://github.com/rg3/youtube-dl/issues/5774 for an
                # example.
                formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
                formats.extend(dash_formats.values())

    # Check for malformed aspect ratio
    stretched_m = re.search(
        r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
        video_webpage)
    if stretched_m:
        w = float(stretched_m.group('w'))
        h = float(stretched_m.group('h'))
        # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
        # We will only process correct ratios.
        if w > 0 and h > 0:
            ratio = w / h
            for f in formats:
                if f.get('vcodec') != 'none':
                    f['stretched_ratio'] = ratio

    self._sort_formats(formats)

    self.mark_watched(video_id, video_info)

    return {
        'id': video_id,
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': upload_date,
        'creator': video_creator,
        'title': video_title,
        'alt_title': video_alt_title,
        'thumbnail': video_thumbnail,
        'description': video_description,
        'categories': video_categories,
        'tags': video_tags,
        'subtitles': video_subtitles,
        'automatic_captions': automatic_captions,
        'duration': video_duration,
        'age_limit': 18 if age_gate else 0,
        'annotations': video_annotations,
        'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
        'view_count': view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
        'formats': formats,
        'is_live': is_live,
        'start_time': start_time,
        'end_time': end_time,
    }

class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):

1625

IE_DESC = 'YouTube.com playlists'

1626

_VALID_URL = r"""(?x)(?:

(?:https?://)?

(?:\w+\.)?

youtube\.com/

(?:

\? (?:.*?[&;])*? (?:p|a|list)=

| p/

)

(

(?:PL|LL|EC|UU|FL|RD|UL)?[0-9A-Za-z-_]{10,}

1637

# Top tracks, they can also include dots

|(?:MC)[\w\.]*

)

.*

|

((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,})

1643

)"""

1644

_TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'

1645

_VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?'

1646

IE_NAME = 'youtube:playlist'

1647

_TESTS = [{

1648

'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',

1649

'info_dict': {

1650

'title': 'ytdl test PL',

1651

'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',

},

'playlist_count': 3,

}, {

'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',

1656

'info_dict': {

1657

'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',

1658

'title': 'YDL_Empty_List',

},

'playlist_count': 0,

}, {

'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',

1663

'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',

1664

'info_dict': {

1665

'title': '29C3: Not my department',

1666

'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',

1667

},

1668

'playlist_count': 95,

1669

}, {

1670

'note': 'issue #673',

1671

'url': 'PLBB231211A4F62143',

1672

'info_dict': {

1673

'title': '[OLD]Team Fortress 2 (Class-based LP)',

1674

'id': 'PLBB231211A4F62143',

1675

},

1676

'playlist_mincount': 26,

1677

}, {

1678

'note': 'Large playlist',

1679

'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',

1680

'info_dict': {

1681

'title': 'Uploads from Cauchemar',

1682

'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',

1683

},

1684

'playlist_mincount': 799,

1685

}, {

1686

'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',

1687

'info_dict': {

1688

'title': 'YDL_safe_search',

1689

'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',

},

'playlist_count': 2,

}, {

'note': 'embedded',

'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',

'playlist_count': 4,

'info_dict': {

'title': 'JODA15',

'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',

1699

}

1700

}, {

1701

'note': 'Embedded SWF player',

1702

'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',

'playlist_count': 4,

'info_dict': {

'title': 'JODA7',

'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',

1707

}

1708

}, {

1709

'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',

1710

'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',

1711

'info_dict': {

1712

'title': 'Uploads from Interstellar Movie',

1713

'id': 'UUXw-G3eDE9trcvY2sBMM_aA',

1714

},

1715

'playlist_mincout': 21,

1716

}]

1717

1718

def _real_initialize(self):

1719

self._login()

1720

1721

def _extract_mix(self, playlist_id):

1722

# The mixes are generated from a single video

1723

# the id of the playlist is just 'RD' + video_id

1724

url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)

1725

webpage = self._download_webpage(

1726

url, playlist_id, 'Downloading Youtube mix')

1727

search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)

1728

title_span = (

1729

search_title('playlist-title') or

1730

search_title('title long-title') or

1731

search_title('title'))

1732

title = clean_html(title_span)

1733

ids = orderedSet(re.findall(

1734

r'''(?xs)data-video-username=".*?".*?

1735

href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id),

1736

webpage))

1737

url_results = self._ids_to_results(ids)

1738

1739

return self.playlist_result(url_results, playlist_id, title)

1740

1741

def _extract_playlist(self, playlist_id):

1742

url = self._TEMPLATE_URL % playlist_id

1743

page = self._download_webpage(url, playlist_id)

1744

1745

for match in re.findall(r'<div class="yt-alert-message">([^<]+)</div>', page):

1746

match = match.strip()

1747

# Check if the playlist exists or is private

1748

if re.match(r'[^<]*(The|This) playlist (does not exist|is private)[^<]*', match):

1749

raise ExtractorError(

1750

'The playlist doesn\'t exist or is private, use --username or '

1751

'--netrc to access it.',

1752

expected=True)

1753

elif re.match(r'[^<]*Invalid parameters[^<]*', match):

1754

raise ExtractorError(

1755

'Invalid parameters. Maybe URL is incorrect.',

1756

expected=True)

1757

elif re.match(r'[^<]*Choose your language[^<]*', match):

1758

continue

1759

else:

1760

self.report_warning('Youtube gives an alert message: ' + match)

1761

1762

playlist_title = self._html_search_regex(

1763

r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',

1764

page, 'title')

1765

1766

return self.playlist_result(self._entries(page, playlist_id), playlist_id, playlist_title)

1767

1768

def _check_download_just_video(self, url, playlist_id):
    """Decide whether only the single video named in the URL is wanted.

    Returns a url_result for the video when the URL query carries a 'v'
    parameter and the 'noplaylist' downloader param is set; otherwise
    returns None (after printing a hint when a video id is present).
    """
    params = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)

    # Not a video-specific URL: nothing to decide
    if 'v' not in params:
        return None

    video_id = params['v'][0]

    if not self._downloader.params.get('noplaylist'):
        self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
        return None

    self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
    return self.url_result(video_id, 'Youtube', video_id=video_id)

1778

1779

def _real_extract(self, url):
    """Extract a playlist (or a single video / mix) from a playlist URL.

    Raises ExtractorError when the URL does not match _VALID_URL.
    """
    match = re.match(self._VALID_URL, url)
    if match is None:
        raise ExtractorError('Invalid URL: %s' % url)
    # The playlist id is captured by either the first or the second group
    playlist_id = match.group(1) or match.group(2)

    single_video = self._check_download_just_video(url, playlist_id)
    if single_video:
        return single_video

    # Mixes require a custom extraction process
    is_mix = playlist_id.startswith(('RD', 'UL'))
    if is_mix:
        return self._extract_mix(playlist_id)

    return self._extract_playlist(playlist_id)

1795

1796

1797

class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
    """Extractor for the videos listed on a /channel/<id> page."""

    IE_NAME = 'youtube:channel'
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',
            'title': 'Uploads from lex will',
        }
    }, {
        'note': 'Age restricted channel',
        # from https://www.youtube.com/user/DeusExOfficial
        'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w',
        'playlist_mincount': 64,
        'info_dict': {
            'id': 'UUs0ifCMCm1icqRbqhUINa0w',
            'title': 'Uploads from Deus Ex',
        },
    }]

    @classmethod
    def suitable(cls, url):
        """Decline URLs that YoutubePlaylistsIE accepts; otherwise defer to the parent."""
        if YoutubePlaylistsIE.suitable(url):
            return False
        return super(YoutubeChannelIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return the channel's videos, preferably via its uploads playlist."""
        channel_id = self._match_id(url)
        listing_url = self._TEMPLATE_URL % channel_id

        # Channel by page listing is restricted to 35 pages of 30 items,
        # i.e. 1050 videos total (see #5778).
        # Work around this by extracting as a playlist when the channel
        # playlist id can be obtained; otherwise fall back on the
        # page-by-page channel extraction below.
        probe_page = self._download_webpage(
            listing_url + '?view=57', channel_id,
            'Downloading channel page', fatal=False)

        external_id = False
        if probe_page is not False:
            # Try the meta tag first, then the data-* attributes
            external_id = self._html_search_meta(
                'channelId', probe_page, 'channel id', default=None
            ) or self._search_regex(
                r'data-(?:channel-external-|yt)id="([^"]+)"',
                probe_page, 'channel id', default=None)

        if external_id and external_id.startswith('UC'):
            # Swap the 'UC' prefix for 'UU' to build the playlist id
            uploads_id = 'UU' + external_id[2:]
            playlist_url = compat_urlparse.urljoin(
                listing_url, '/playlist?list=%s' % uploads_id)
            return self.url_result(playlist_url, 'YoutubePlaylist')

        first_page = self._download_webpage(
            listing_url, channel_id, 'Downloading page #1')
        autogenerated_marker = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', first_page)

        if autogenerated_marker is None:
            return self.playlist_result(
                self._entries(first_page, channel_id), channel_id)

        # The videos are contained in a single page;
        # the ajax pages can't be used, they are empty
        videos = []
        for video_id, video_title in self.extract_videos_from_page(first_page):
            videos.append(self.url_result(
                video_id, 'Youtube', video_id=video_id,
                video_title=video_title))
        return self.playlist_result(videos, channel_id)

1869

1870

1871

class YoutubeUserIE(YoutubeChannelIE):
    """Extractor for a user's videos, by /user/ URL or "ytuser:" keyword."""

    IE_NAME = 'youtube:user'
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/user/%s/videos'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Accept the URL only if no other *IE class in this module does."""
        # Don't return True if the url can be extracted with another youtube
        # extractor: this regex is too permissive and would match it as well.
        for name, klass in globals().items():
            if name.endswith('IE') and klass is not cls and klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)

1897

1898

1899

class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
    """Extractor for the /playlists page of a user or channel."""

    IE_NAME = 'youtube:playlists'
    IE_DESC = 'YouTube.com user/channel playlists'
    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+)/playlists'

    _TESTS = [
        {
            'url': 'http://www.youtube.com/user/ThirstForScience/playlists',
            'playlist_mincount': 4,
            'info_dict': {
                'id': 'ThirstForScience',
                'title': 'Thirst for Science',
            },
        },
        {
            # with "Load more" button
            'url': 'http://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
            'playlist_mincount': 70,
            'info_dict': {
                'id': 'igorkle1',
                'title': 'Игорь Клейнер',
            },
        },
        {
            'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists',
            'playlist_mincount': 17,
            'info_dict': {
                'id': 'UCiU1dHvZObB2iP6xkJ__Icw',
                'title': 'Chem Player',
            },
        },
    ]

class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE):
    """Extractor for "ytsearch" keyword searches on YouTube."""

    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # Extra query-string arguments merged into every results request;
    # subclasses override this to change e.g. the sort order.
    _EXTRA_QUERY_ARGS = {}
    _TESTS = []

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Result pages are downloaded one by one until a page yields no
        videos or at least n videos have been collected; the collected
        list is then cut down to n entries and returned as a playlist
        result titled with the query.

        Raises ExtractorError (expected=True) when a downloaded page
        contains a search message instead of results.
        """
        videos = []

        for pagenum in itertools.count(1):
            url_query = {
                'search_query': query.encode('utf-8'),
                'page': pagenum,
                'spf': 'navigate',
            }
            url_query.update(self._EXTRA_QUERY_ARGS)
            result_url = 'https://www.youtube.com/results?' + compat_urllib_parse.urlencode(url_query)
            data = self._download_json(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % pagenum,
                errnote='Unable to download API page')
            html_content = data[1]['body']['content']

            if 'class="search-message' in html_content:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            new_videos = self._ids_to_results(orderedSet(re.findall(
                r'href="/watch\?v=(.{11})', html_content)))
            videos += new_videos
            # Stop as soon as n results are available (>=, not >), so that
            # no extra page is requested when exactly n were collected.
            if not new_videos or len(videos) >= n:
                break

        if len(videos) > n:
            videos = videos[:n]
        return self.playlist_result(videos, query)

1972

1973

1974

class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of the YouTube search extractor that sorts by upload date."""

    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    IE_DESC = 'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    # Added to the query string of every results request
    _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}

1979

1980

1981

class YoutubeSearchURLIE(InfoExtractor):
    """Extractor for youtube.com/results?... search result URLs."""

    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the items of a search results page as a playlist dict.

        The playlist title is the (unquoted) query taken from the URL.
        """
        query = compat_urllib_parse_unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        results_html = self._search_regex(
            r'(?s)<ol[^>]+class="item-section"(.*?)</ol>', webpage, 'result HTML')

        entries = []
        # Each result's title block holds both its title and its link
        for item_html in re.findall(
                r'(?s)<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*>(.*?)</h3>', results_html):
            item_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], item_html, 'item title', fatal=False)
            item_href = self._html_search_regex(
                r'(?s)href="([^"]+)"', item_html, 'item URL')
            entries.append({
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', item_href),
                'title': item_title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }

class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
    """Extractor for /show/<id> pages."""

    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'https://www.youtube.com/show/airdisasters',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Delegate to the parent extractor using the show's /playlists URL."""
        show_id = self._match_id(url)
        playlists_url = 'https://www.youtube.com/show/%s/playlists' % show_id
        return super(YoutubeShowIE, self)._real_extract(playlists_url)

2044

2045

2046

class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        """Extractor name derived from the subclass' _FEED_NAME."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Collect the video ids of the feed and return them as a playlist.

        The feed page is scanned for watch links, then the "load more"
        continuation pages are followed until either no continuation link
        is found or a page contributes no video id that was not seen before.
        """
        page = self._download_webpage(
            'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE)

        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        ids = []
        more_widget_html = content_html = page
        for page_num in itertools.count(1):
            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)

            # 'recommended' feed has infinite 'load more' and each new portion spins
            # the same videos in (sometimes) slightly different order, so we'll check
            # for unicity and break when portion has no new videos
            #
            # This must be a real list: on Python 3 filter() returns an
            # iterator that is always truthy, so the emptiness check below
            # would never stop the loop.
            new_ids = [
                video_id for video_id in orderedSet(matches)
                if video_id not in ids]
            if not new_ids:
                break

            ids.extend(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        return self.playlist_result(
            self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE)

2093

2094

2095

class YoutubeWatchLaterIE(YoutubePlaylistIE):
    """Extractor for the "watch later" list, i.e. the playlist with id 'WL'."""

    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater'

    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=WL',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the single video named in the URL if wanted, else the 'WL' playlist."""
        single_video = self._check_download_just_video(url, 'WL')
        return single_video or self._extract_playlist('WL')

2113

2114

2115

class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor that resolves the favourites page to its playlist."""

    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Find the playlist id on the favourites page and hand it to YoutubePlaylist."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        favourites_list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')

2125

2126

2127

class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'recommended' feed."""

    # Feed configuration required by YoutubeFeedsInfoExtractor
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'

    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'

2132

2133

2134

class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'subscriptions' feed."""

    # Feed configuration required by YoutubeFeedsInfoExtractor
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = 'Youtube Subscriptions'

    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    # Note that the keyword form matched here carries a leading colon
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'

2139

2140

2141

class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor configured for the 'history' feed."""

    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    # Raw string: '\.' in a normal string literal is an invalid escape
    # sequence. The resulting pattern text is unchanged.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    # Feed configuration required by YoutubeFeedsInfoExtractor
    _FEED_NAME = 'history'
    _PLAYLIST_TITLE = 'Youtube History'

2146

2147

2148

class YoutubeTruncatedURLIE(InfoExtractor):
    """Matches watch/attribution URLs that lack a video id and explains why they fail."""

    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    # Verbose pattern: a watch URL carrying at most one of the listed
    # non-video parameters, or an attribution link, and nothing after it.
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [
        {
            'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
            'only_matching': True,
        },
        {
            'url': 'http://www.youtube.com/watch?',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/watch?feature=foo',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/watch?hl=en-GB',
            'only_matching': True,
        },
        {
            'url': 'https://www.youtube.com/watch?t=2372',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Always raise an ExtractorError hinting that the URL should be quoted."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(hint, expected=True)

class YoutubeTruncatedIDIE(InfoExtractor):
    """Matches watch URLs whose video id is only 1 to 10 characters long."""

    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an ExtractorError reporting the incomplete id."""
        partial_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (partial_id, url)
        raise ExtractorError(message, expected=True)