jfr.im git - yt-dlp.git/blame_incremental - youtube

... / ...

Commit	Line	Data
	1	import base64
	2	import hashlib
	3	import json
	4	import netrc
	5	import os
	6	import re
	7	import socket
	8	import sys
	9	import time
	10	import xml.etree.ElementTree
	11
	12	from ..utils import (
	13	compat_http_client,
	14	compat_urllib_error,
	15	compat_urllib_parse_urlparse,
	16	compat_str,
	17
	18	clean_html,
	19	compiled_regex_type,
	20	ExtractorError,
	21	RegexNotFoundError,
	22	sanitize_filename,
	23	unescapeHTML,
	24	)
	25	_NO_DEFAULT = object()
	26
	27
	28	class InfoExtractor(object):
	29	"""Information Extractor class.
	30
	31	Information extractors are the classes that, given a URL, extract
	32	information about the video (or videos) the URL refers to. This
	33	information includes the real video URL, the video title, author and
	34	others. The information is stored in a dictionary which is then
	35	passed to the FileDownloader. The FileDownloader processes this
	36	information possibly downloading the video to the file system, among
	37	other possible outcomes.
	38
	39	The dictionaries must include the following fields:
	40
	41	id: Video identifier.
	42	title: Video title, unescaped.
	43
	44	Additionally, it must contain either a formats entry or a url one:
	45
	46	formats: A list of dictionaries for each format available, ordered
	47	from worst to best quality.
	48
	49	Potential fields:
	50	* url Mandatory. The URL of the video file
	51	* ext Will be calculated from url if missing
	52	* format A human-readable description of the format
	53	("mp4 container with h264/opus").
	54	Calculated from the format_id, width, height
	55	and format_note fields if missing.
	56	* format_id A short description of the format
	57	("mp4_h264_opus" or "19").
	58	Technically optional, but strongly recommended.
	59	* format_note Additional info about the format
	60	("3D" or "DASH video")
	61	* width Width of the video, if known
	62	* height Height of the video, if known
	63	* resolution Textual description of width and height
	64	* tbr Average bitrate of audio and video in KBit/s
	65	* abr Average audio bitrate in KBit/s
	66	* acodec Name of the audio codec in use
	67	* asr Audio sampling rate in Hertz
	68	* vbr Average video bitrate in KBit/s
	69	* vcodec Name of the video codec in use
	70	* container Name of the container format
	71	* filesize The number of bytes, if known in advance
	72	* player_url SWF Player URL (used for rtmpdump).
	73	* protocol The protocol that will be used for the actual
	74	download, lower-case.
	75	"http", "https", "rtsp", "rtmp", "m3u8" or so.
	76	* preference Order number of this format. If this field is
	77	present and not None, the formats get sorted
	78	by this field, regardless of all other values.
	79	-1 for default (order by other properties),
	80	-2 or smaller for less than default.
	81	* quality Order number of the video quality of this
	82	format, irrespective of the file format.
	83	-1 for default (order by other properties),
	84	-2 or smaller for less than default.
	85	url: Final video URL.
	86	ext: Video filename extension.
	87	format: The video format, defaults to ext (used for --get-format)
	88	player_url: SWF Player URL (used for rtmpdump).
	89
	90	The following fields are optional:
	91
	92	display_id An alternative identifier for the video, not necessarily
	93	unique, but available before title. Typically, id is
	94	something like "4234987", title "Dancing naked mole rats",
	95	and display_id "dancing-naked-mole-rats"
	96	thumbnails: A list of dictionaries, with the following entries:
	97	* "url"
	98	* "width" (optional, int)
	99	* "height" (optional, int)
	100	* "resolution" (optional, string "{width}x{height}",
	101	deprecated)
	102	thumbnail: Full URL to a video thumbnail image.
	103	description: One-line video description.
	104	uploader: Full name of the video uploader.
	105	timestamp: UNIX timestamp of the moment the video became available.
	106	upload_date: Video upload date (YYYYMMDD).
	107	If not explicitly set, calculated from timestamp.
	108	uploader_id: Nickname or id of the video uploader.
	109	location: Physical location of the video.
	110	subtitles: The subtitle file contents as a dictionary in the format
	111	{language: subtitles}.
	112	duration: Length of the video in seconds, as an integer.
	113	view_count: How many users have watched the video on the platform.
	114	like_count: Number of positive ratings of the video
	115	dislike_count: Number of negative ratings of the video
	116	comment_count: Number of comments on the video
	117	age_limit: Age restriction for the video, as an integer (years)
	118	webpage_url: The url to the video webpage, if given to youtube-dl it
	119	should allow to get the same result again. (It will be set
	120	by YoutubeDL if it's missing)
	121	categories: A list of categories that the video falls in, for example
	122	["Sports", "Berlin"]
	123
	124	Unless mentioned otherwise, the fields should be Unicode strings.
	125
	126	Subclasses of this one should re-define the _real_initialize() and
	127	_real_extract() methods and define a _VALID_URL regexp.
	128	Probably, they should also be added to the list of extractors.
	129
	130	Finally, the _WORKING attribute should be set to False for broken IEs
	131	in order to warn the users and skip the tests.
	132	"""
	133
	134	_ready = False
	135	_downloader = None
	136	_WORKING = True
	137
	138	def __init__(self, downloader=None):
	139	"""Constructor. Receives an optional downloader."""
	140	self._ready = False
	141	self.set_downloader(downloader)
	142
	143	@classmethod
	144	def suitable(cls, url):
	145	"""Receives a URL and returns True if suitable for this IE."""
	146
	147	# This does not use has/getattr intentionally - we want to know whether
	148	# we have cached the regexp for this class, whereas getattr would also
	149	# match the superclass
	150	if '_VALID_URL_RE' not in cls.__dict__:
	151	cls._VALID_URL_RE = re.compile(cls._VALID_URL)
	152	return cls._VALID_URL_RE.match(url) is not None
	153
	154	@classmethod
	155	def working(cls):
	156	"""Getter method for _WORKING."""
	157	return cls._WORKING
	158
	159	def initialize(self):
	160	"""Initializes an instance (authentication, etc)."""
	161	if not self._ready:
	162	self._real_initialize()
	163	self._ready = True
	164
	165	def extract(self, url):
	166	"""Extracts URL information and returns it in list of dicts."""
	167	self.initialize()
	168	return self._real_extract(url)
	169
	170	def set_downloader(self, downloader):
	171	"""Sets the downloader for this IE."""
	172	self._downloader = downloader
	173
	174	def _real_initialize(self):
	175	"""Real initialization process. Redefine in subclasses."""
	176	pass
	177
	178	def _real_extract(self, url):
	179	"""Real extraction process. Redefine in subclasses."""
	180	pass
	181
	182	@classmethod
	183	def ie_key(cls):
	184	"""A string for getting the InfoExtractor with get_info_extractor"""
	185	return cls.__name__[:-2]
	186
	187	@property
	188	def IE_NAME(self):
	189	return type(self).__name__[:-2]
	190
	191	def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
	192	""" Returns the response handle """
	193	if note is None:
	194	self.report_download_webpage(video_id)
	195	elif note is not False:
	196	if video_id is None:
	197	self.to_screen(u'%s' % (note,))
	198	else:
	199	self.to_screen(u'%s: %s' % (video_id, note))
	200	try:
	201	return self._downloader.urlopen(url_or_request)
	202	except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
	203	if errnote is False:
	204	return False
	205	if errnote is None:
	206	errnote = u'Unable to download webpage'
	207	errmsg = u'%s: %s' % (errnote, compat_str(err))
	208	if fatal:
	209	raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
	210	else:
	211	self._downloader.report_warning(errmsg)
	212	return False
	213
	214	def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
	215	""" Returns a tuple (page content as string, URL handle) """
	216
	217	# Strip hashes from the URL (#1038)
	218	if isinstance(url_or_request, (compat_str, str)):
	219	url_or_request = url_or_request.partition('#')[0]
	220
	221	urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
	222	if urlh is False:
	223	assert not fatal
	224	return False
	225	content_type = urlh.headers.get('Content-Type', '')
	226	webpage_bytes = urlh.read()
	227	m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s;\scharset=(.+)', content_type)
	228	if m:
	229	encoding = m.group(1)
	230	else:
	231	m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
	232	webpage_bytes[:1024])
	233	if m:
	234	encoding = m.group(1).decode('ascii')
	235	elif webpage_bytes.startswith(b'\xff\xfe'):
	236	encoding = 'utf-16'
	237	else:
	238	encoding = 'utf-8'
	239	if self._downloader.params.get('dump_intermediate_pages', False):
	240	try:
	241	url = url_or_request.get_full_url()
	242	except AttributeError:
	243	url = url_or_request
	244	self.to_screen(u'Dumping request to ' + url)
	245	dump = base64.b64encode(webpage_bytes).decode('ascii')
	246	self._downloader.to_screen(dump)
	247	if self._downloader.params.get('write_pages', False):
	248	try:
	249	url = url_or_request.get_full_url()
	250	except AttributeError:
	251	url = url_or_request
	252	basen = '%s_%s' % (video_id, url)
	253	if len(basen) > 240:
	254	h = u'___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
	255	basen = basen[:240 - len(h)] + h
	256	raw_filename = basen + '.dump'
	257	filename = sanitize_filename(raw_filename, restricted=True)
	258	self.to_screen(u'Saving request to ' + filename)
	259	with open(filename, 'wb') as outf:
	260	outf.write(webpage_bytes)
	261
	262	try:
	263	content = webpage_bytes.decode(encoding, 'replace')
	264	except LookupError:
	265	content = webpage_bytes.decode('utf-8', 'replace')
	266
	267	if (u'<title>Access to this site is blocked</title>' in content and
	268	u'Websense' in content[:512]):
	269	msg = u'Access to this webpage has been blocked by Websense filtering software in your network.'
	270	blocked_iframe = self._html_search_regex(
	271	r'<iframe src="([^"]+)"', content,
	272	u'Websense information URL', default=None)
	273	if blocked_iframe:
	274	msg += u' Visit %s for more details' % blocked_iframe
	275	raise ExtractorError(msg, expected=True)
	276
	277	return (content, urlh)
	278
	279	def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
	280	""" Returns the data of the page as a string """
	281	res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
	282	if res is False:
	283	return res
	284	else:
	285	content, _ = res
	286	return content
	287
	288	def _download_xml(self, url_or_request, video_id,
	289	note=u'Downloading XML', errnote=u'Unable to download XML',
	290	transform_source=None, fatal=True):
	291	"""Return the xml as an xml.etree.ElementTree.Element"""
	292	xml_string = self._download_webpage(
	293	url_or_request, video_id, note, errnote, fatal=fatal)
	294	if xml_string is False:
	295	return xml_string
	296	if transform_source:
	297	xml_string = transform_source(xml_string)
	298	return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
	299
	300	def _download_json(self, url_or_request, video_id,
	301	note=u'Downloading JSON metadata',
	302	errnote=u'Unable to download JSON metadata',
	303	transform_source=None):
	304	json_string = self._download_webpage(url_or_request, video_id, note, errnote)
	305	if transform_source:
	306	json_string = transform_source(json_string)
	307	try:
	308	return json.loads(json_string)
	309	except ValueError as ve:
	310	raise ExtractorError('Failed to download JSON', cause=ve)
	311
	312	def report_warning(self, msg, video_id=None):
	313	idstr = u'' if video_id is None else u'%s: ' % video_id
	314	self._downloader.report_warning(
	315	u'[%s] %s%s' % (self.IE_NAME, idstr, msg))
	316
	317	def to_screen(self, msg):
	318	"""Print msg to screen, prefixing it with '[ie_name]'"""
	319	self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
	320
	321	def report_extraction(self, id_or_name):
	322	"""Report information extraction."""
	323	self.to_screen(u'%s: Extracting information' % id_or_name)
	324
	325	def report_download_webpage(self, video_id):
	326	"""Report webpage download."""
	327	self.to_screen(u'%s: Downloading webpage' % video_id)
	328
	329	def report_age_confirmation(self):
	330	"""Report attempt to confirm age."""
	331	self.to_screen(u'Confirming age')
	332
	333	def report_login(self):
	334	"""Report attempt to log in."""
	335	self.to_screen(u'Logging in')
	336
	337	#Methods for following #608
	338	@staticmethod
	339	def url_result(url, ie=None, video_id=None):
	340	"""Returns a url that points to a page that should be processed"""
	341	#TODO: ie should be the class used for getting the info
	342	video_info = {'_type': 'url',
	343	'url': url,
	344	'ie_key': ie}
	345	if video_id is not None:
	346	video_info['id'] = video_id
	347	return video_info
	348	@staticmethod
	349	def playlist_result(entries, playlist_id=None, playlist_title=None):
	350	"""Returns a playlist"""
	351	video_info = {'_type': 'playlist',
	352	'entries': entries}
	353	if playlist_id:
	354	video_info['id'] = playlist_id
	355	if playlist_title:
	356	video_info['title'] = playlist_title
	357	return video_info
	358
	359	def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
	360	"""
	361	Perform a regex search on the given string, using a single or a list of
	362	patterns returning the first matching group.
	363	In case of failure return a default value or raise a WARNING or a
	364	RegexNotFoundError, depending on fatal, specifying the field name.
	365	"""
	366	if isinstance(pattern, (str, compat_str, compiled_regex_type)):
	367	mobj = re.search(pattern, string, flags)
	368	else:
	369	for p in pattern:
	370	mobj = re.search(p, string, flags)
	371	if mobj: break
	372
	373	if os.name != 'nt' and sys.stderr.isatty():
	374	_name = u'\033[0;34m%s\033[0m' % name
	375	else:
	376	_name = name
	377
	378	if mobj:
	379	# return the first matching group
	380	return next(g for g in mobj.groups() if g is not None)
	381	elif default is not _NO_DEFAULT:
	382	return default
	383	elif fatal:
	384	raise RegexNotFoundError(u'Unable to extract %s' % _name)
	385	else:
	386	self._downloader.report_warning(u'unable to extract %s; '
	387	u'please report this issue on http://yt-dl.org/bug' % _name)
	388	return None
	389
	390	def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
	391	"""
	392	Like _search_regex, but strips HTML tags and unescapes entities.
	393	"""
	394	res = self._search_regex(pattern, string, name, default, fatal, flags)
	395	if res:
	396	return clean_html(res).strip()
	397	else:
	398	return res
	399
	400	def _get_login_info(self):
	401	"""
	402	Get the the login info as (username, password)
	403	It will look in the netrc file using the _NETRC_MACHINE value
	404	If there's no info available, return (None, None)
	405	"""
	406	if self._downloader is None:
	407	return (None, None)
	408
	409	username = None
	410	password = None
	411	downloader_params = self._downloader.params
	412
	413	# Attempt to use provided username and password or .netrc data
	414	if downloader_params.get('username', None) is not None:
	415	username = downloader_params['username']
	416	password = downloader_params['password']
	417	elif downloader_params.get('usenetrc', False):
	418	try:
	419	info = netrc.netrc().authenticators(self._NETRC_MACHINE)
	420	if info is not None:
	421	username = info[0]
	422	password = info[2]
	423	else:
	424	raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
	425	except (IOError, netrc.NetrcParseError) as err:
	426	self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
	427
	428	return (username, password)
	429
	430	# Helper functions for extracting OpenGraph info
	431	@staticmethod
	432	def _og_regexes(prop):
	433	content_re = r'content=(?:"([^>]+?)"\|\'([^>]+?)\')'
	434	property_re = r'(?:name\|property)=[\'"]og:%s[\'"]' % re.escape(prop)
	435	template = r'<meta[^>]+?%s[^>]+?%s'
	436	return [
	437	template % (property_re, content_re),
	438	template % (content_re, property_re),
	439	]
	440
	441	def _og_search_property(self, prop, html, name=None, **kargs):
	442	if name is None:
	443	name = 'OpenGraph %s' % prop
	444	escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
	445	if escaped is None:
	446	return None
	447	return unescapeHTML(escaped)
	448
	449	def _og_search_thumbnail(self, html, **kargs):
	450	return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
	451
	452	def _og_search_description(self, html, **kargs):
	453	return self._og_search_property('description', html, fatal=False, **kargs)
	454
	455	def _og_search_title(self, html, **kargs):
	456	return self._og_search_property('title', html, **kargs)
	457
	458	def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
	459	regexes = self._og_regexes('video')
	460	if secure: regexes = self._og_regexes('video:secure_url') + regexes
	461	return self._html_search_regex(regexes, html, name, **kargs)
	462
	463	def _og_search_url(self, html, **kargs):
	464	return self._og_search_property('url', html, **kargs)
	465
	466	def _html_search_meta(self, name, html, display_name=None, fatal=False):
	467	if display_name is None:
	468	display_name = name
	469	return self._html_search_regex(
	470	r'''(?ix)<meta
	471	(?=[^>]+(?:itemprop\|name\|property)=["\']%s["\'])
	472	[^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
	473	html, display_name, fatal=fatal)
	474
	475	def _dc_search_uploader(self, html):
	476	return self._html_search_meta('dc.creator', html, 'uploader')
	477
	478	def _rta_search(self, html):
	479	# See http://www.rtalabel.org/index.php?content=howtofaq#single
	480	if re.search(r'(?ix)<meta\s+name="rating"\s+'
	481	r' content="RTA-5042-1996-1400-1577-RTA"',
	482	html):
	483	return 18
	484	return 0
	485
	486	def _media_rating_search(self, html):
	487	# See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
	488	rating = self._html_search_meta('rating', html)
	489
	490	if not rating:
	491	return None
	492
	493	RATING_TABLE = {
	494	'safe for kids': 0,
	495	'general': 8,
	496	'14 years': 14,
	497	'mature': 17,
	498	'restricted': 19,
	499	}
	500	return RATING_TABLE.get(rating.lower(), None)
	501
	502	def _twitter_search_player(self, html):
	503	return self._html_search_meta('twitter:player', html,
	504	'twitter card player')
	505
	506	def _sort_formats(self, formats):
	507	if not formats:
	508	raise ExtractorError(u'No video formats found')
	509
	510	def _formats_key(f):
	511	# TODO remove the following workaround
	512	from ..utils import determine_ext
	513	if not f.get('ext') and 'url' in f:
	514	f['ext'] = determine_ext(f['url'])
	515
	516	preference = f.get('preference')
	517	if preference is None:
	518	proto = f.get('protocol')
	519	if proto is None:
	520	proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
	521
	522	preference = 0 if proto in ['http', 'https'] else -0.1
	523	if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
	524	preference -= 0.5
	525
	526	if f.get('vcodec') == 'none': # audio only
	527	if self._downloader.params.get('prefer_free_formats'):
	528	ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
	529	else:
	530	ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
	531	ext_preference = 0
	532	try:
	533	audio_ext_preference = ORDER.index(f['ext'])
	534	except ValueError:
	535	audio_ext_preference = -1
	536	else:
	537	if self._downloader.params.get('prefer_free_formats'):
	538	ORDER = [u'flv', u'mp4', u'webm']
	539	else:
	540	ORDER = [u'webm', u'flv', u'mp4']
	541	try:
	542	ext_preference = ORDER.index(f['ext'])
	543	except ValueError:
	544	ext_preference = -1
	545	audio_ext_preference = 0
	546
	547	return (
	548	preference,
	549	f.get('quality') if f.get('quality') is not None else -1,
	550	f.get('height') if f.get('height') is not None else -1,
	551	f.get('width') if f.get('width') is not None else -1,
	552	ext_preference,
	553	f.get('tbr') if f.get('tbr') is not None else -1,
	554	f.get('vbr') if f.get('vbr') is not None else -1,
	555	f.get('abr') if f.get('abr') is not None else -1,
	556	audio_ext_preference,
	557	f.get('filesize') if f.get('filesize') is not None else -1,
	558	f.get('format_id'),
	559	)
	560	formats.sort(key=_formats_key)
	561
	562	def http_scheme(self):
	563	""" Either "https:" or "https:", depending on the user's preferences """
	564	return (
	565	'http:'
	566	if self._downloader.params.get('prefer_insecure', False)
	567	else 'https:')
	568
	569	def _proto_relative_url(self, url, scheme=None):
	570	if url is None:
	571	return url
	572	if url.startswith('//'):
	573	if scheme is None:
	574	scheme = self.http_scheme()
	575	return scheme + url
	576	else:
	577	return url
	578
	579	def _sleep(self, timeout, video_id, msg_template=None):
	580	if msg_template is None:
	581	msg_template = u'%(video_id)s: Waiting for %(timeout)s seconds'
	582	msg = msg_template % {'video_id': video_id, 'timeout': timeout}
	583	self.to_screen(msg)
	584	time.sleep(timeout)
	585
	586
	587	class SearchInfoExtractor(InfoExtractor):
	588	"""
	589	Base class for paged search queries extractors.
	590	They accept urls in the format _SEARCH_KEY(\|all\|[0-9]):{query}
	591	Instances should define _SEARCH_KEY and _MAX_RESULTS.
	592	"""
	593
	594	@classmethod
	595	def _make_valid_url(cls):
	596	return r'%s(?P<prefix>\|[1-9][0-9]*\|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
	597
	598	@classmethod
	599	def suitable(cls, url):
	600	return re.match(cls._make_valid_url(), url) is not None
	601
	602	def _real_extract(self, query):
	603	mobj = re.match(self._make_valid_url(), query)
	604	if mobj is None:
	605	raise ExtractorError(u'Invalid search query "%s"' % query)
	606
	607	prefix = mobj.group('prefix')
	608	query = mobj.group('query')
	609	if prefix == '':
	610	return self._get_n_results(query, 1)
	611	elif prefix == 'all':
	612	return self._get_n_results(query, self._MAX_RESULTS)
	613	else:
	614	n = int(prefix)
	615	if n <= 0:
	616	raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
	617	elif n > self._MAX_RESULTS:
	618	self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
	619	n = self._MAX_RESULTS
	620	return self._get_n_results(query, n)
	621
	622	def _get_n_results(self, query, n):
	623	"""Get a specified number of results for a query"""
	624	raise NotImplementedError("This method must be implemented by subclasses")
	625
	626	@property
	627	def SEARCH_KEY(self):
	628	return self._SEARCH_KEY