]>
Commit | Line | Data |
---|---|---|
1 | # Erebus IRC bot - Author: Conny Sjoblom | |
2 | # vim: fileencoding=utf-8 | |
3 | # URL Checker | |
4 | # This file is released into the public domain; see http://unlicense.org/ | |
5 | ||
6 | # module info | |
# module info — metadata dict read by the Erebus module loader
modinfo = {
	'author': 'Erebus Team',
	'license': 'public domain',
	'compatible': [0],  # presumably bot core API versions this module works with — TODO confirm against loader
	'depends': [],  # hard dependencies (modules that must load first)
	'softdeps': [],  # optional dependencies
}
14 | ||
15 | # http://embed.ly/tools/generator | |
16 | ||
# preamble
import modlib
lib = modlib.modlib(__name__)  # per-module helper: hook registration, parent bot access
modstart = lib.modstart  # loader entry points re-exported at module level
modstop = lib.modstop
22 | ||
# module code
import sys
if sys.version_info.major < 3:
	# Python 2: alias py2 modules to the py3 names used throughout this file
	stringbase = basestring  # common base type for "is this a string" checks
	import urllib2
	import urlparse
	import HTMLParser
	html = HTMLParser.HTMLParser()  # instance exposes .unescape(), mirroring the py3 html module
	from BeautifulSoup import BeautifulSoup
else:
	stringbase = str
	import urllib.request as urllib2
	import urllib.parse as urlparse
	import html
	from bs4 import BeautifulSoup
	import http.client

import re, json, datetime
41 | ||
# Optional AIA (Authority Information Access) support: lets TLS requests
# succeed against sites that do not serve their full certificate chain.
# If anything here fails, fall back to aia = None and plain TLS verification.
try:
	import aia
	aia_session = aia.AIASession()
	# aia is broken on capath systems, needs cafile to work
	aia_session._context.load_verify_locations(cafile='/etc/ssl/certs/ca-certificates.crt')
	aia_session._trusted = {
		aia.openssl_get_cert_info(ca_der)["subject"]: ca_der
		for ca_der in aia_session._context.get_ca_certs(True)
	}
	print("aia loaded")
except (ImportError, OSError) as e:
	# ImportError: aia not installed. OSError/FileNotFoundError: the hard-coded
	# Debian CA-bundle path is absent — previously this crashed module import.
	print(repr(e))
	aia = None
55 | ||
hostmask_regex = re.compile(r'^(.*)!(.*)@(.*)$')

def parser_hostmask(hostmask):
	"""Split an IRC hostmask 'nick!user@host' into a dict.

	A dict argument is returned unchanged; a string that does not look like
	a full hostmask is treated as a bare nick (user/host stay None).
	"""
	if isinstance(hostmask, dict):
		return hostmask

	nick = user = host = None

	if hostmask is not None:
		match = hostmask_regex.match(hostmask)
		if match:
			nick, user, host = match.groups()
		else:
			nick = hostmask

	return {'nick': nick, 'user': user, 'host': host}
81 | ||
class SmartRedirectHandler(urllib2.HTTPRedirectHandler):
	"""Redirect handler that records the redirect status code on the response.

	The stock handler follows 301/302 transparently; this subclass stamps the
	original code onto the returned object as .status so callers can see that
	a redirect happened.
	"""

	def _stamp(self, response, code):
		# Attach the redirect code to the response produced by the base class.
		response.status = code
		return response

	def http_error_301(self, req, fp, code, msg, headers):
		return self._stamp(
			urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers), code)

	def http_error_302(self, req, fp, code, msg, headers):
		return self._stamp(
			urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers), code)
94 | ||
def _get_blocked_chans():
	"""Return the list of channels where URL lookups are disabled (config 'urls.blocked', comma-separated)."""
	blocked = lib.parent.cfg.get('urls', 'blocked', '')
	return blocked.split(',')
97 | ||
def process_line(line):
	"""Run every registered URL pattern over one message and collect replies.

	Stops once more than 'urls.limit' (default 2) matches have been seen,
	returning whatever responses were gathered so far. Empty/None handler
	results are dropped.
	"""
	responses = []
	num_found = 0
	limit = lib.parent.cfg.getint('urls', 'limit', 2)
	for action, group in regexes:
		for regex in group:
			for match in regex.findall(line):
				if not match:
					continue
				num_found += 1
				if num_found > limit:
					return responses
				# findall yields a plain string for single-group patterns,
				# a tuple when the pattern has multiple groups
				resp = action(match) if isinstance(match, stringbase) else action(*match)
				if resp is not None and resp != "":
					responses.append(resp)
	return responses
116 | ||
@lib.hooknum("PRIVMSG")
def privmsg_hook(bot, textline):
	"""PRIVMSG hook: scan a channel message for URLs and reply with summaries."""
	sender = parser_hostmask(textline[1:textline.find(' ')])
	chan = textline.split()[2]

	if chan in _get_blocked_chans():
		return

	# Message body is everything after the third space, minus the leading ':'
	try:
		line = textline.split(None, 3)[3][1:]
	except IndexError:
		line = ''

	send_response(bot, chan, process_line(line))
131 | ||
def send_response(bot, chan, responses):
	"""Deliver collected replies: one message each if 'urls.multiline', else joined with ' | '."""
	if not responses:
		return
	if lib.parent.cfg.getboolean('urls', 'multiline'):
		for reply in responses:
			bot.msg(chan, reply, True)
	else:
		bot.msg(chan, ' | '.join(responses), True)
139 | ||
def unescape(line):
	"""Decode HTML entities in *line* and collapse whitespace runs to single spaces."""
	# raw string for the pattern: '\s' inside a plain literal is an invalid
	# escape (DeprecationWarning today, a SyntaxError in future Python)
	return re.sub(r'\s+', ' ', html.unescape(line))
142 | ||
def gotspotify(type, track):
	"""Look up a Spotify URI and return a one-line description.

	type  -- URI kind from the matched link ('track', 'album', ...)
	track -- the base62 identifier
	Returns a formatted summary string, or 'Unsupported type.' for other kinds.
	"""
	url = 'http://ws.spotify.com/lookup/1/?uri=spotify:%s:%s' % (type, track)
	xml = urllib2.urlopen(url).read()
	# NOTE(review): convertEntities is a BeautifulSoup 3 (py2) keyword; under
	# bs4 on py3 this call raises TypeError — confirm which path is in use.
	soup = BeautifulSoup(xml, convertEntities=BeautifulSoup.HTML_ENTITIES)
	lookup_type = soup.contents[2].name

	if lookup_type == 'track':
		name = soup.find('name').string
		album_name = soup.find('album').find('name').string
		artist_name = soup.find('artist').find('name').string
		popularity = soup.find('popularity')
		if popularity:
			popularity = float(popularity.string)*100
		else:
			popularity = 0  # missing element: show 0% instead of crashing on %2d with a non-number
		length = float(soup.find('length').string)
		minutes = int(length)//60  # floor division: plain '/' yields a float on py3 ("3.55:33")
		seconds = int(length)%60

		return unescape('Track: %s - %s / %s %s:%.2d %2d%%' % (artist_name, name, album_name, minutes, seconds, popularity))

	elif lookup_type == 'album':
		album_name = soup.find('album').find('name').string
		artist_name = soup.find('artist').find('name').string
		released = soup.find('released').string
		return unescape('Album: %s - %s - %s' % (artist_name, album_name, released))

	else:
		return 'Unsupported type.'
170 | ||
171 | def _yt_duration(s): | |
172 | mo = re.match(r'P(\d+D)?T(\d+H)?(\d+M)?(\d+S)?', s) | |
173 | pcs = [x for x in mo.groups() if x] | |
174 | return ''.join(pcs).lower() | |
175 | def _yt_date(s, f): | |
176 | mo = re.match(r'(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2})\.(\d+)Z', s) | |
177 | return datetime.datetime(*(int(x) for x in mo.groups())).strftime(f) | |
178 | def _yt_round(n): | |
179 | n = float(n) | |
180 | if n >= 10**12: | |
181 | return '%.1ft' % (n/10**12) | |
182 | elif n >= 10**9: | |
183 | return '%.1fb' % (n/10**9) | |
184 | elif n >= 10**6: | |
185 | return '%.1fm' % (n/10**6) | |
186 | elif n >= 10**3: | |
187 | return '%.1fk' % (n/10**3) | |
188 | else: | |
189 | return int(n) | |
190 | ||
def gotyoutube(url):
	"""Fetch YouTube Data API v3 metadata for a watch URL and format a summary.

	Returns the formatted line, or an error string ('API limit exceeded',
	'no results', or the stringified exception).
	"""
	parsed = urlparse.urlparse(url)
	# Missing ?v= deliberately raises outside the try, as before
	video_id = urlparse.parse_qs(parsed.query)["v"][0]
	api_url = 'https://www.googleapis.com/youtube/v3/videos?part=snippet,contentDetails,statistics&id=%s&key=%s' % (video_id, lib.parent.cfg.get('urls', 'api_key'))
	try:
		payload = json.loads(urllib2.urlopen(api_url).read())
		item = payload['items'][0]

		fmt = lib.parent.cfg.get('urls', 'yt_format', "\002%(author)s\002: \037%(title)s\037 [%(duration)s, uploaded %(uploaded)s, %(views)s v/%(likes)s l/%(dislikes)s d]")
		snippet = item['snippet']
		stats = item['statistics']
		return unescape(fmt % {
			'title': snippet['title'],
			'author': snippet['channelTitle'],
			'duration': _yt_duration(item['contentDetails']['duration']),
			'uploaded': _yt_date(snippet['publishedAt'], lib.parent.cfg.get('urls', 'yt_date_format', '%b %d %Y')),
			'views': _yt_round(stats['viewCount']),
			'likes': _yt_round(stats['likeCount']),
			'dislikes': _yt_round(stats['dislikeCount']),
		})
	except urllib2.HTTPError as e:
		return 'API limit exceeded' if e.getcode() == 403 else str(e)
	except IndexError:
		return 'no results'
	except Exception as e:
		return str(e)
219 | ||
def gottwitch(uri):
	"""Query the Twitch Helix API for a channel's live status.

	uri is the path after twitch.tv/, e.g. 'somestreamer' or 'somestreamer/...'.
	Returns a status line, or 'Channel offline.' when no stream data exists.
	"""
	url = 'https://api.twitch.tv/helix/streams?user_login=%s' % uri.split('/')[0]
	opener = urllib2.build_opener()
	opener.addheaders = [('Client-ID', lib.parent.cfg.get('urls', 'twitch_api_key'))]
	respdata = opener.open(url).read()
	twitch = json.loads(respdata)['data']
	try:
		# TODO: add current game.
		return unescape('\037%s\037 is %s (%s)' % (twitch[0]['user_name'], twitch[0]['type'], twitch[0]['title']))
	except (IndexError, KeyError):
		# empty 'data' list (or missing fields) means not live; the old bare
		# except also swallowed KeyboardInterrupt/SystemExit
		return 'Channel offline.'
231 | ||
232 | def _humanize_bytes(b): | |
233 | b = int(b) | |
234 | i = 0 | |
235 | table = " kMGTPEZYRQ" | |
236 | while b > 1024: | |
237 | i += 1 | |
238 | b /= 1024.0 | |
239 | if i == 0: | |
240 | return "%dB" % (b) | |
241 | else: | |
242 | return "%.2f%siB" % (b, table[i]) | |
243 | ||
def _do_request(url, try_aia=False):
	"""
	Fetch *url* with browser-like headers and a 2-second timeout.

	Return value is a tuple consisting of:
	- the HTTPResponse object, or a string on error. Empty string -> no response.
	- and a flag indicating whether AIA was used
	"""
	# Impersonate desktop Chrome: some sites refuse or alter responses for the
	# default urllib User-Agent.
	request = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36', 'Sec-Ch-Ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"', 'Sec-Ch-Ua-Mobile': '?0', 'Sec-Ch-Ua-Platform': '"Linux"', 'Sec-Fetch-Dest': 'document', 'Sec-Fetch-Mode': 'navigate', 'Sec-Fetch-Site': 'same-origin', 'Sec-Fetch-User': '?1', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', 'Accept-Language': 'en-US,en;q=0.9', 'Cache-Control': 'no-cache', 'Pragma': 'no-cache', 'Upgrade-Insecure-Requests': '1'})
	if try_aia:
		# Build an SSL context through the aia library so missing intermediate
		# certificates can be fetched (AIA chasing).
		opener = urllib2.build_opener(urllib2.HTTPSHandler(context=aia_session.ssl_context_from_url(url)), SmartRedirectHandler())
	else:
		opener = urllib2.build_opener(SmartRedirectHandler())

	# Send request and handle errors
	try:
		response = opener.open(request, timeout=2)
	except http.client.InvalidURL as e: # why does a method under urllib.request raise an exception under http.client???
		# Malformed URL: report nothing at all (empty string sentinel).
		return '', False
	except urllib2.HTTPError as e:
		return 'Request error: %s %s' % (e.code, e.reason), False
	except urllib2.URLError as e:
		if "certificate verify failed: unable to get local issuer certificate" in str(e.reason):
			if aia: # Retry with AIA enabled, if module is present
				return _do_request(url, True)
			else:
				lib.parent.log('urls', '?', 'If the site is not serving the certificate chain, installing the aia library might make this request work: pip install aia')
				return 'Request error: site may have broken TLS configuration (%s)' % (e.reason), False
		else:
			return 'Request error: %s' % (e.reason), False
	except TimeoutError as e:
		return 'Request error: request timed out', False
	except Exception as e:
		# Catch-all boundary: any other failure becomes an error string reply.
		return 'Unknown error: %s %r' % (type(e).__name__, e.args), False

	return response, try_aia
278 | ||
279 | ||
def goturl(url):
	"""Fetch *url* and build a summary reply: [type] [size] and, for HTML, the page title.

	Returns None for URLs matching a suppression pattern, an error string from
	_do_request on failure, or the assembled summary string.
	"""
	output = []
	# Suppressed sites (twitter/reddit/gitweb) also match the generic URL
	# regex, so re-check them here and stay silent.
	for _, group in other_regexes:
		for regex in group:
			if regex.match(url):
				return None

	response, used_aia = _do_request(url)
	if isinstance(response, stringbase):
		# _do_request returned an error message instead of a response object
		# ('' means: report nothing).
		return response

	# Try to add type and length headers to reply
	c_type = response.getheader('Content-Type', '').split(';', 1)[0]
	c_len = response.getheader('Content-Length')
	if c_type != '':
		output.append("[%s] " % (c_type))
	else:
		output.append("[no type] ")
	if c_type != "text/html": # else length will be provided by HTML code below
		if c_len is not None:
			output.append("[%s] " % (_humanize_bytes(c_len)))
		else:
			output.append("[no length] ")

	if used_aia:
		output.append("[AIA] ")

	# Try to add title if HTML
	if c_type == 'text/html':
		try:
			# Cap the read at 1 MiB so huge pages can't stall the bot.
			responsebody = response.read(1024*1024)
		except Exception as e:
			output.append('Error reading response body: %s %r' % (type(e).__name__, e.args))
		else:
			# Flag a mismatch between advertised and actual body size.
			if c_len is not None and len(responsebody) != int(c_len):
				output.append("[actual %s; Content-Length %s] " % (_humanize_bytes(len(responsebody)), _humanize_bytes(c_len)))
			else:
				output.append("[%s] " % (_humanize_bytes(len(responsebody))))
			try:
				soup = BeautifulSoup(responsebody)
				if soup.title:
					output.append('Title: ' + unescape('%s' % (soup.find('title').string.strip())))
				else:
					output.append('No title')
			except Exception as e:
				# Parser failures become part of the reply rather than crashing.
				output.append('Title error: %s %r ' % (type(e).__name__, e.args))

	return ''.join(output)
328 | ||
# Generic matcher for bare http(s) URLs (host must contain at least one dot).
url_regex = (
	re.compile(r'https?://(?:[^/\s.]+\.)+[^/\s.]+(?:/\S+)?'),
)
# Suppression patterns: handlers return '' (filtered out by process_line);
# goturl() also consults this table directly to skip these sites.
other_regexes = (
	(lambda x: '', (re.compile(r"""https?://(?:www\.)?(?:twitter|x)\.com/""", re.I),)), # skip twitter
	(lambda x: '', (re.compile(r"""https?://(?:www\.)?reddit\.com/""", re.I),)), # skip new-reddit
	(lambda x: '', (re.compile(r"""https?://jfr\.im/git/""", re.I),)), # skip my gitweb
)
# Full dispatch table scanned by process_line(), suppressions first.
regexes = other_regexes + (
	(goturl, url_regex),
)