jfr.im git - yt-dlp.git/blame_incremental - youtube

... / ...

Commit	Line	Data
	1	import base64
	2	import json
	3	import os
	4	import re
	5	import socket
	6	import sys
	7	import netrc
	8	import xml.etree.ElementTree
	9
	10	from ..utils import (
	11	compat_http_client,
	12	compat_urllib_error,
	13	compat_urllib_parse_urlparse,
	14	compat_str,
	15
	16	clean_html,
	17	compiled_regex_type,
	18	ExtractorError,
	19	RegexNotFoundError,
	20	sanitize_filename,
	21	unescapeHTML,
	22	)
# Unique sentinel meaning "no default value was supplied". _search_regex()
# compares against it by identity, so None remains usable as a real default.
_NO_DEFAULT = object()
	24
	25
	26	class InfoExtractor(object):
	27	"""Information Extractor class.
	28
	29	Information extractors are the classes that, given a URL, extract
	30	information about the video (or videos) the URL refers to. This
	31	information includes the real video URL, the video title, author and
	32	others. The information is stored in a dictionary which is then
	33	passed to the FileDownloader. The FileDownloader processes this
	34	information possibly downloading the video to the file system, among
	35	other possible outcomes.
	36
	37	The dictionaries must include the following fields:
	38
	39	id: Video identifier.
	40	title: Video title, unescaped.
	41
	42	Additionally, it must contain either a formats entry or a url one:
	43
	44	formats: A list of dictionaries for each format available, ordered
	45	from worst to best quality.
	46
	47	Potential fields:
	48	* url Mandatory. The URL of the video file
	49	* ext Will be calculated from url if missing
	50	* format A human-readable description of the format
	51	("mp4 container with h264/opus").
	52	Calculated from the format_id, width, height.
	53	and format_note fields if missing.
	54	* format_id A short description of the format
	55	("mp4_h264_opus" or "19").
	56	Technically optional, but strongly recommended.
	57	* format_note Additional info about the format
	58	("3D" or "DASH video")
	59	* width Width of the video, if known
	60	* height Height of the video, if known
	61	* resolution Textual description of width and height
	62	* tbr Average bitrate of audio and video in KBit/s
	63	* abr Average audio bitrate in KBit/s
	64	* acodec Name of the audio codec in use
	65	* vbr Average video bitrate in KBit/s
	66	* vcodec Name of the video codec in use
	67	* filesize The number of bytes, if known in advance
	68	* player_url SWF Player URL (used for rtmpdump).
	69	* protocol The protocol that will be used for the actual
	70	download, lower-case.
	71	"http", "https", "rtsp", "rtmp" or so.
	72	* preference Order number of this format. If this field is
	73	present and not None, the formats get sorted
	74	by this field.
	75	-1 for default (order by other properties),
	76	-2 or smaller for less than default.
	77	* quality Order number of the video quality of this
	78	format, irrespective of the file format.
	79	-1 for default (order by other properties),
	80	-2 or smaller for less than default.
	81	url: Final video URL.
	82	ext: Video filename extension.
	83	format: The video format, defaults to ext (used for --get-format)
	84	player_url: SWF Player URL (used for rtmpdump).
	85
	86	The following fields are optional:
	87
	88	thumbnails: A list of dictionaries (with the entries "resolution" and
	89	"url") for the varying thumbnails
	90	thumbnail: Full URL to a video thumbnail image.
	91	description: One-line video description.
	92	uploader: Full name of the video uploader.
	93	upload_date: Video upload date (YYYYMMDD).
	94	uploader_id: Nickname or id of the video uploader.
	95	location: Physical location of the video.
	96	subtitles: The subtitle file contents as a dictionary in the format
	97	{language: subtitles}.
	98	duration: Length of the video in seconds, as an integer.
	99	view_count: How many users have watched the video on the platform.
	100	like_count: Number of positive ratings of the video
	101	dislike_count: Number of negative ratings of the video
	102	comment_count: Number of comments on the video
	103	age_limit: Age restriction for the video, as an integer (years)
	104	webpage_url: The url to the video webpage, if given to youtube-dl it
	105	should allow to get the same result again. (It will be set
	106	by YoutubeDL if it's missing)
	107
	108	Unless mentioned otherwise, the fields should be Unicode strings.
	109
	110	Subclasses of this one should re-define the _real_initialize() and
	111	_real_extract() methods and define a _VALID_URL regexp.
	112	Probably, they should also be added to the list of extractors.
	113
	114	_real_extract() must return a list of information dictionaries as
	115	described above.
	116
	117	Finally, the _WORKING attribute should be set to False for broken IEs
	118	in order to warn the users and skip the tests.
	119	"""
	120
	121	_ready = False
	122	_downloader = None
	123	_WORKING = True
	124
	125	def __init__(self, downloader=None):
	126	"""Constructor. Receives an optional downloader."""
	127	self._ready = False
	128	self.set_downloader(downloader)
	129
	130	@classmethod
	131	def suitable(cls, url):
	132	"""Receives a URL and returns True if suitable for this IE."""
	133
	134	# This does not use has/getattr intentionally - we want to know whether
	135	# we have cached the regexp for this class, whereas getattr would also
	136	# match the superclass
	137	if '_VALID_URL_RE' not in cls.__dict__:
	138	cls._VALID_URL_RE = re.compile(cls._VALID_URL)
	139	return cls._VALID_URL_RE.match(url) is not None
	140
	141	@classmethod
	142	def working(cls):
	143	"""Getter method for _WORKING."""
	144	return cls._WORKING
	145
	146	def initialize(self):
	147	"""Initializes an instance (authentication, etc)."""
	148	if not self._ready:
	149	self._real_initialize()
	150	self._ready = True
	151
	152	def extract(self, url):
	153	"""Extracts URL information and returns it in list of dicts."""
	154	self.initialize()
	155	return self._real_extract(url)
	156
	157	def set_downloader(self, downloader):
	158	"""Sets the downloader for this IE."""
	159	self._downloader = downloader
	160
	161	def _real_initialize(self):
	162	"""Real initialization process. Redefine in subclasses."""
	163	pass
	164
	165	def _real_extract(self, url):
	166	"""Real extraction process. Redefine in subclasses."""
	167	pass
	168
	169	@classmethod
	170	def ie_key(cls):
	171	"""A string for getting the InfoExtractor with get_info_extractor"""
	172	return cls.__name__[:-2]
	173
	174	@property
	175	def IE_NAME(self):
	176	return type(self).__name__[:-2]
	177
	178	def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
	179	""" Returns the response handle """
	180	if note is None:
	181	self.report_download_webpage(video_id)
	182	elif note is not False:
	183	if video_id is None:
	184	self.to_screen(u'%s' % (note,))
	185	else:
	186	self.to_screen(u'%s: %s' % (video_id, note))
	187	try:
	188	return self._downloader.urlopen(url_or_request)
	189	except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
	190	if errnote is False:
	191	return False
	192	if errnote is None:
	193	errnote = u'Unable to download webpage'
	194	errmsg = u'%s: %s' % (errnote, compat_str(err))
	195	if fatal:
	196	raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
	197	else:
	198	self._downloader.report_warning(errmsg)
	199	return False
	200
	201	def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
	202	""" Returns a tuple (page content as string, URL handle) """
	203
	204	# Strip hashes from the URL (#1038)
	205	if isinstance(url_or_request, (compat_str, str)):
	206	url_or_request = url_or_request.partition('#')[0]
	207
	208	urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
	209	if urlh is False:
	210	assert not fatal
	211	return False
	212	content_type = urlh.headers.get('Content-Type', '')
	213	webpage_bytes = urlh.read()
	214	m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s;\scharset=(.+)', content_type)
	215	if m:
	216	encoding = m.group(1)
	217	else:
	218	m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
	219	webpage_bytes[:1024])
	220	if m:
	221	encoding = m.group(1).decode('ascii')
	222	else:
	223	encoding = 'utf-8'
	224	if self._downloader.params.get('dump_intermediate_pages', False):
	225	try:
	226	url = url_or_request.get_full_url()
	227	except AttributeError:
	228	url = url_or_request
	229	self.to_screen(u'Dumping request to ' + url)
	230	dump = base64.b64encode(webpage_bytes).decode('ascii')
	231	self._downloader.to_screen(dump)
	232	if self._downloader.params.get('write_pages', False):
	233	try:
	234	url = url_or_request.get_full_url()
	235	except AttributeError:
	236	url = url_or_request
	237	raw_filename = ('%s_%s.dump' % (video_id, url))
	238	filename = sanitize_filename(raw_filename, restricted=True)
	239	self.to_screen(u'Saving request to ' + filename)
	240	with open(filename, 'wb') as outf:
	241	outf.write(webpage_bytes)
	242
	243	content = webpage_bytes.decode(encoding, 'replace')
	244	return (content, urlh)
	245
	246	def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
	247	""" Returns the data of the page as a string """
	248	res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
	249	if res is False:
	250	return res
	251	else:
	252	content, _ = res
	253	return content
	254
	255	def _download_xml(self, url_or_request, video_id,
	256	note=u'Downloading XML', errnote=u'Unable to download XML',
	257	transform_source=None):
	258	"""Return the xml as an xml.etree.ElementTree.Element"""
	259	xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
	260	if transform_source:
	261	xml_string = transform_source(xml_string)
	262	return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
	263
	264	def _download_json(self, url_or_request, video_id,
	265	note=u'Downloading JSON metadata',
	266	errnote=u'Unable to download JSON metadata'):
	267	json_string = self._download_webpage(url_or_request, video_id, note, errnote)
	268	try:
	269	return json.loads(json_string)
	270	except ValueError as ve:
	271	raise ExtractorError('Failed to download JSON', cause=ve)
	272
	273	def report_warning(self, msg, video_id=None):
	274	idstr = u'' if video_id is None else u'%s: ' % video_id
	275	self._downloader.report_warning(
	276	u'[%s] %s%s' % (self.IE_NAME, idstr, msg))
	277
	278	def to_screen(self, msg):
	279	"""Print msg to screen, prefixing it with '[ie_name]'"""
	280	self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
	281
	282	def report_extraction(self, id_or_name):
	283	"""Report information extraction."""
	284	self.to_screen(u'%s: Extracting information' % id_or_name)
	285
	286	def report_download_webpage(self, video_id):
	287	"""Report webpage download."""
	288	self.to_screen(u'%s: Downloading webpage' % video_id)
	289
	290	def report_age_confirmation(self):
	291	"""Report attempt to confirm age."""
	292	self.to_screen(u'Confirming age')
	293
	294	def report_login(self):
	295	"""Report attempt to log in."""
	296	self.to_screen(u'Logging in')
	297
	298	#Methods for following #608
	299	@staticmethod
	300	def url_result(url, ie=None, video_id=None):
	301	"""Returns a url that points to a page that should be processed"""
	302	#TODO: ie should be the class used for getting the info
	303	video_info = {'_type': 'url',
	304	'url': url,
	305	'ie_key': ie}
	306	if video_id is not None:
	307	video_info['id'] = video_id
	308	return video_info
	309	@staticmethod
	310	def playlist_result(entries, playlist_id=None, playlist_title=None):
	311	"""Returns a playlist"""
	312	video_info = {'_type': 'playlist',
	313	'entries': entries}
	314	if playlist_id:
	315	video_info['id'] = playlist_id
	316	if playlist_title:
	317	video_info['title'] = playlist_title
	318	return video_info
	319
	320	def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
	321	"""
	322	Perform a regex search on the given string, using a single or a list of
	323	patterns returning the first matching group.
	324	In case of failure return a default value or raise a WARNING or a
	325	RegexNotFoundError, depending on fatal, specifying the field name.
	326	"""
	327	if isinstance(pattern, (str, compat_str, compiled_regex_type)):
	328	mobj = re.search(pattern, string, flags)
	329	else:
	330	for p in pattern:
	331	mobj = re.search(p, string, flags)
	332	if mobj: break
	333
	334	if os.name != 'nt' and sys.stderr.isatty():
	335	_name = u'\033[0;34m%s\033[0m' % name
	336	else:
	337	_name = name
	338
	339	if mobj:
	340	# return the first matching group
	341	return next(g for g in mobj.groups() if g is not None)
	342	elif default is not _NO_DEFAULT:
	343	return default
	344	elif fatal:
	345	raise RegexNotFoundError(u'Unable to extract %s' % _name)
	346	else:
	347	self._downloader.report_warning(u'unable to extract %s; '
	348	u'please report this issue on http://yt-dl.org/bug' % _name)
	349	return None
	350
	351	def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
	352	"""
	353	Like _search_regex, but strips HTML tags and unescapes entities.
	354	"""
	355	res = self._search_regex(pattern, string, name, default, fatal, flags)
	356	if res:
	357	return clean_html(res).strip()
	358	else:
	359	return res
	360
	361	def _get_login_info(self):
	362	"""
	363	Get the the login info as (username, password)
	364	It will look in the netrc file using the _NETRC_MACHINE value
	365	If there's no info available, return (None, None)
	366	"""
	367	if self._downloader is None:
	368	return (None, None)
	369
	370	username = None
	371	password = None
	372	downloader_params = self._downloader.params
	373
	374	# Attempt to use provided username and password or .netrc data
	375	if downloader_params.get('username', None) is not None:
	376	username = downloader_params['username']
	377	password = downloader_params['password']
	378	elif downloader_params.get('usenetrc', False):
	379	try:
	380	info = netrc.netrc().authenticators(self._NETRC_MACHINE)
	381	if info is not None:
	382	username = info[0]
	383	password = info[2]
	384	else:
	385	raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
	386	except (IOError, netrc.NetrcParseError) as err:
	387	self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
	388
	389	return (username, password)
	390
	391	# Helper functions for extracting OpenGraph info
	392	@staticmethod
	393	def _og_regexes(prop):
	394	content_re = r'content=(?:"([^>]+?)"\|\'(.+?)\')'
	395	property_re = r'(?:name\|property)=[\'"]og:%s[\'"]' % re.escape(prop)
	396	template = r'<meta[^>]+?%s[^>]+?%s'
	397	return [
	398	template % (property_re, content_re),
	399	template % (content_re, property_re),
	400	]
	401
	402	def _og_search_property(self, prop, html, name=None, **kargs):
	403	if name is None:
	404	name = 'OpenGraph %s' % prop
	405	escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
	406	if escaped is None:
	407	return None
	408	return unescapeHTML(escaped)
	409
	410	def _og_search_thumbnail(self, html, **kargs):
	411	return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
	412
	413	def _og_search_description(self, html, **kargs):
	414	return self._og_search_property('description', html, fatal=False, **kargs)
	415
	416	def _og_search_title(self, html, **kargs):
	417	return self._og_search_property('title', html, **kargs)
	418
	419	def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
	420	regexes = self._og_regexes('video')
	421	if secure: regexes = self._og_regexes('video:secure_url') + regexes
	422	return self._html_search_regex(regexes, html, name, **kargs)
	423
	424	def _html_search_meta(self, name, html, display_name=None):
	425	if display_name is None:
	426	display_name = name
	427	return self._html_search_regex(
	428	r'''(?ix)<meta
	429	(?=[^>]+(?:itemprop\|name\|property)=["\']%s["\'])
	430	[^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
	431	html, display_name, fatal=False)
	432
	433	def _dc_search_uploader(self, html):
	434	return self._html_search_meta('dc.creator', html, 'uploader')
	435
	436	def _rta_search(self, html):
	437	# See http://www.rtalabel.org/index.php?content=howtofaq#single
	438	if re.search(r'(?ix)<meta\s+name="rating"\s+'
	439	r' content="RTA-5042-1996-1400-1577-RTA"',
	440	html):
	441	return 18
	442	return 0
	443
	444	def _media_rating_search(self, html):
	445	# See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
	446	rating = self._html_search_meta('rating', html)
	447
	448	if not rating:
	449	return None
	450
	451	RATING_TABLE = {
	452	'safe for kids': 0,
	453	'general': 8,
	454	'14 years': 14,
	455	'mature': 17,
	456	'restricted': 19,
	457	}
	458	return RATING_TABLE.get(rating.lower(), None)
	459
	460	def _sort_formats(self, formats):
	461	def _formats_key(f):
	462	# TODO remove the following workaround
	463	from ..utils import determine_ext
	464	if not f.get('ext') and 'url' in f:
	465	f['ext'] = determine_ext(f['url'])
	466
	467	preference = f.get('preference')
	468	if preference is None:
	469	proto = f.get('protocol')
	470	if proto is None:
	471	proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
	472
	473	preference = 0 if proto in ['http', 'https'] else -0.1
	474	if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
	475	preference -= 0.5
	476
	477	if f.get('vcodec') == 'none': # audio only
	478	if self._downloader.params.get('prefer_free_formats'):
	479	ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
	480	else:
	481	ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
	482	ext_preference = 0
	483	try:
	484	audio_ext_preference = ORDER.index(f['ext'])
	485	except ValueError:
	486	audio_ext_preference = -1
	487	else:
	488	if self._downloader.params.get('prefer_free_formats'):
	489	ORDER = [u'flv', u'mp4', u'webm']
	490	else:
	491	ORDER = [u'webm', u'flv', u'mp4']
	492	try:
	493	ext_preference = ORDER.index(f['ext'])
	494	except ValueError:
	495	ext_preference = -1
	496	audio_ext_preference = 0
	497
	498	return (
	499	preference,
	500	f.get('quality') if f.get('quality') is not None else -1,
	501	f.get('height') if f.get('height') is not None else -1,
	502	f.get('width') if f.get('width') is not None else -1,
	503	ext_preference,
	504	f.get('tbr') if f.get('tbr') is not None else -1,
	505	f.get('vbr') if f.get('vbr') is not None else -1,
	506	f.get('abr') if f.get('abr') is not None else -1,
	507	audio_ext_preference,
	508	f.get('filesize') if f.get('filesize') is not None else -1,
	509	f.get('format_id'),
	510	)
	511	formats.sort(key=_formats_key)
	512
	513
	514	class SearchInfoExtractor(InfoExtractor):
	515	"""
	516	Base class for paged search queries extractors.
	517	They accept urls in the format _SEARCH_KEY(\|all\|[0-9]):{query}
	518	Instances should define _SEARCH_KEY and _MAX_RESULTS.
	519	"""
	520
	521	@classmethod
	522	def _make_valid_url(cls):
	523	return r'%s(?P<prefix>\|[1-9][0-9]*\|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
	524
	525	@classmethod
	526	def suitable(cls, url):
	527	return re.match(cls._make_valid_url(), url) is not None
	528
	529	def _real_extract(self, query):
	530	mobj = re.match(self._make_valid_url(), query)
	531	if mobj is None:
	532	raise ExtractorError(u'Invalid search query "%s"' % query)
	533
	534	prefix = mobj.group('prefix')
	535	query = mobj.group('query')
	536	if prefix == '':
	537	return self._get_n_results(query, 1)
	538	elif prefix == 'all':
	539	return self._get_n_results(query, self._MAX_RESULTS)
	540	else:
	541	n = int(prefix)
	542	if n <= 0:
	543	raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
	544	elif n > self._MAX_RESULTS:
	545	self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
	546	n = self._MAX_RESULTS
	547	return self._get_n_results(query, n)
	548
	549	def _get_n_results(self, query, n):
	550	"""Get a specified number of results for a query"""
	551	raise NotImplementedError("This method must be implemented by subclasses")
	552
	553	@property
	554	def SEARCH_KEY(self):
	555	return self._SEARCH_KEY