# Erebus IRC bot - Author: Conny Sjoblom
# vim: fileencoding=utf-8
# URL Checker
# This file is released into the public domain; see http://unlicense.org/

# module info
modinfo = {
	'author': 'Erebus Team',
	'license': 'public domain',
	'compatible': [0],
	'depends': [],
	'softdeps': [],
}

# http://embed.ly/tools/generator

# preamble
import modlib
lib = modlib.modlib(__name__)
modstart = lib.modstart
modstop = lib.modstop

# module code
import sys
if sys.version_info.major < 3:
	stringbase = basestring
	import urllib2
	import urlparse
	import HTMLParser
	html = HTMLParser.HTMLParser()
	import httplib as http_client # Python 2 name for http.client
	from BeautifulSoup import BeautifulSoup
else:
	stringbase = str
	import urllib.request as urllib2
	import urllib.parse as urlparse
	import html
	import http.client as http_client
	from bs4 import BeautifulSoup

import re, json, datetime

try:
	import aia
	aia_session = aia.AIASession()
	# aia is broken on capath systems, needs cafile to work
	aia_session._context.load_verify_locations(cafile='/etc/ssl/certs/ca-certificates.crt')
	aia_session._trusted = {
		aia.openssl_get_cert_info(ca_der)["subject"]: ca_der
		for ca_der in aia_session._context.get_ca_certs(True)
	}
	print("aia loaded")
except ImportError as e:
	print(repr(e))
	aia = None

hostmask_regex = re.compile(r'^(.*)!(.*)@(.*)$')

def parser_hostmask(hostmask):
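	"""Split an IRC hostmask ('nick!user@host') into a dict of its parts.

	Illustrative example (not from the original source):
	parser_hostmask('alice!ident@host.example')
	-> {'nick': 'alice', 'user': 'ident', 'host': 'host.example'}
	A dict passes through unchanged; an unparseable value becomes the nick.
	"""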
	if isinstance(hostmask, dict):
		return hostmask

	nick = None
	user = None
	host = None

	if hostmask is not None:
		match = hostmask_regex.match(hostmask)

		if not match:
			nick = hostmask
		else:
			nick = match.group(1)
			user = match.group(2)
			host = match.group(3)

	return {
		'nick': nick,
		'user': user,
		'host': host
	}

class SmartRedirectHandler(urllib2.HTTPRedirectHandler):
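	"""Redirect handler that behaves like the stock one but also records
	the redirect status code on the returned response as .status."""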
	def http_error_301(self, req, fp, code, msg, headers):
		result = urllib2.HTTPRedirectHandler.http_error_301(
			self, req, fp, code, msg, headers)
		result.status = code
		return result

	def http_error_302(self, req, fp, code, msg, headers):
		result = urllib2.HTTPRedirectHandler.http_error_302(
			self, req, fp, code, msg, headers)
		result.status = code
		return result

def _get_blocked_chans():
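	"""Channels where URL lookups are disabled: the 'urls'/'blocked' config
	option, split on commas."""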
	return lib.parent.cfg.get('urls', 'blocked', '').split(',')

def process_line(line):
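	"""Run every registered regex over one message and collect handler
	responses, bailing out once the per-line limit ('urls'/'limit' config
	option, default 2) is exceeded."""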
	responses = []
	num_found = 0
	limit = lib.parent.cfg.getint('urls', 'limit', 2)
	for action, group in regexes:
		for regex in group:
			for match in regex.findall(line):
				if match:
					num_found += 1
					if num_found > limit:
						return responses
					if isinstance(match, stringbase):
						resp = action(match)
					else:
						resp = action(*match)
					if resp is not None and resp != "":
						responses.append(resp)
	return responses

@lib.hooknum("PRIVMSG")
def privmsg_hook(bot, textline):
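	"""PRIVMSG hook: pull the channel and message text out of the raw IRC
	line, then reply with a summary for each URL found (unless the channel
	is in the blocked list)."""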
	user = parser_hostmask(textline[1:textline.find(' ')])
	chan = textline.split()[2]

	if chan in _get_blocked_chans(): return

	try:
		line = textline.split(None, 3)[3][1:]
	except IndexError:
		line = ''

	responses = process_line(line)
	send_response(bot, chan, responses)

def send_response(bot, chan, responses):
	if len(responses) > 0:
		if lib.parent.cfg.getboolean('urls', 'multiline'):
			for r in responses:
				bot.msg(chan, r, True)
		else:
			bot.msg(chan, ' | '.join(responses), True)

def unescape(line):
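	"""Decode HTML entities and collapse whitespace runs to single spaces."""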
	return re.sub(r'\s+', ' ', html.unescape(line))

def gotspotify(type, track):
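	"""Summarise a Spotify track or album via the legacy ws.spotify.com
	lookup API (retired upstream, so this path is likely dead)."""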
	url = 'http://ws.spotify.com/lookup/1/?uri=spotify:%s:%s' % (type, track)
	xml = urllib2.urlopen(url).read()
	soup = BeautifulSoup(xml, convertEntities=BeautifulSoup.HTML_ENTITIES)
	lookup_type = soup.contents[2].name

	if lookup_type == 'track':
		name = soup.find('name').string
		album_name = soup.find('album').find('name').string
		artist_name = soup.find('artist').find('name').string
		popularity = soup.find('popularity')
		if popularity:
			popularity = float(popularity.string)*100
		length = float(soup.find('length').string)
		minutes = int(length)//60 # integer division; '/' would print a float under Python 3
		seconds = int(length)%60

		return unescape('Track: %s - %s / %s %s:%.2d %2d%%' % (artist_name, name, album_name, minutes, seconds, popularity))

	elif lookup_type == 'album':
		album_name = soup.find('album').find('name').string
		artist_name = soup.find('artist').find('name').string
		released = soup.find('released').string
		return unescape('Album: %s - %s - %s' % (artist_name, album_name, released))

	else:
		return 'Unsupported type.'

def _yt_duration(s):
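	"""Compact an ISO 8601 duration, e.g. 'PT4M13S' -> '4m13s' (illustrative example)."""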
	mo = re.match(r'P(\d+D)?T(\d+H)?(\d+M)?(\d+S)?', s)
	pcs = [x for x in mo.groups() if x]
	return ''.join(pcs).lower()
def _yt_date(s, f):
	mo = re.match(r'(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2})\.(\d+)Z', s)
	return datetime.datetime(*(int(x) for x in mo.groups())).strftime(f)
def _yt_round(n):
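	"""Abbreviate a count with a k/m/b/t suffix, e.g. 1234 -> '1.2k' (illustrative example)."""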
	n = float(n)
	if n >= 10**12:
		return '%.1ft' % (n/10**12)
	elif n >= 10**9:
		return '%.1fb' % (n/10**9)
	elif n >= 10**6:
		return '%.1fm' % (n/10**6)
	elif n >= 10**3:
		return '%.1fk' % (n/10**3)
	else:
		return int(n)

def gotyoutube(url):
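	"""Summarise a YouTube watch URL via the YouTube Data API v3, keyed by
	the 'urls'/'api_key' config option."""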
	url_data = urlparse.urlparse(url)
	query = urlparse.parse_qs(url_data.query)
	video = query["v"][0]
	api_url = 'https://www.googleapis.com/youtube/v3/videos?part=snippet,contentDetails,statistics&id=%s&key=%s' % (video, lib.parent.cfg.get('urls', 'api_key'))
	try:
		respdata = urllib2.urlopen(api_url).read()
		v = json.loads(respdata)
		v = v['items'][0]

		return unescape(lib.parent.cfg.get('urls', 'yt_format', "\002%(author)s\002: \037%(title)s\037 [%(duration)s, uploaded %(uploaded)s, %(views)s v/%(likes)s l/%(dislikes)s d]") % {
			'title': v['snippet']['title'],
			'author': v['snippet']['channelTitle'],
			'duration': _yt_duration(v['contentDetails']['duration']),
			'uploaded': _yt_date(v['snippet']['publishedAt'], lib.parent.cfg.get('urls', 'yt_date_format', '%b %d %Y')),
			'views': _yt_round(v['statistics']['viewCount']),
			'likes': _yt_round(v['statistics']['likeCount']),
			'dislikes': _yt_round(v['statistics']['dislikeCount']),
		})
	except urllib2.HTTPError as e:
		if e.getcode() == 403:
			return 'API limit exceeded'
		else:
			return str(e)
	except IndexError:
		return 'no results'
	except Exception as e:
		return str(e)

def gottwitch(uri):
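	"""Report whether a Twitch channel is live via the Helix streams API,
	sending the 'urls'/'twitch_api_key' config option as the Client-ID."""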
	url = 'https://api.twitch.tv/helix/streams?user_login=%s' % uri.split('/')[0]
	opener = urllib2.build_opener()
	opener.addheaders = [('Client-ID', lib.parent.cfg.get('urls', 'twitch_api_key'))]
	respdata = opener.open(url).read()
	twitch = json.loads(respdata)['data']
	try:
		# TODO: add current game.
		return unescape('\037%s\037 is %s (%s)' % (twitch[0]['user_name'], twitch[0]['type'], twitch[0]['title']))
	except (IndexError, KeyError): # empty 'data' list: nobody by that name is streaming
		return 'Channel offline.'

def _humanize_bytes(b):
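	"""Format a byte count in binary (IEC) units, e.g. 2048 -> '2.00kiB' (illustrative example)."""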
	b = int(b)
	i = 0
	table = " kMGTPEZYRQ"
	while b >= 1024: # '>=' so that exactly 1024 bytes rolls over to '1.00kiB'
		i += 1
		b /= 1024.0
	if i == 0:
		return "%dB" % (b)
	else:
		return "%.2f%siB" % (b, table[i])

def _do_request(url, try_aia=False):
	"""
	Return value is a tuple consisting of:
	- the HTTPResponse object, or a string on error. Empty string -> no response.
	- and a flag indicating whether AIA was used
	"""
	try:
		request = urllib2.Request(url, headers={
			'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
			'Sec-Ch-Ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
			'Sec-Ch-Ua-Mobile': '?0',
			'Sec-Ch-Ua-Platform': '"Linux"',
			'Sec-Fetch-Dest': 'document',
			'Sec-Fetch-Mode': 'navigate',
			'Sec-Fetch-Site': 'same-origin',
			'Sec-Fetch-User': '?1',
			'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
			'Accept-Language': 'en-US,en;q=0.9',
			'Cache-Control': 'no-cache',
			'Pragma': 'no-cache',
			'Upgrade-Insecure-Requests': '1'
		})
	except ValueError:
		return '', False
	if try_aia:
		opener = urllib2.build_opener(urllib2.HTTPSHandler(context=aia_session.ssl_context_from_url(url)), SmartRedirectHandler())
	else:
		opener = urllib2.build_opener(SmartRedirectHandler())

	# Send request and handle errors
	try:
		response = opener.open(request, timeout=2)
	except http_client.InvalidURL as e: # why does a method under urllib.request raise an exception under http.client???
		return '', False
	except urllib2.HTTPError as e:
		return 'Request error: %s %s' % (e.code, e.reason), False
	except urllib2.URLError as e:
		if "certificate verify failed: unable to get local issuer certificate" in str(e.reason):
			if aia: # Retry with AIA enabled, if module is present
				return _do_request(url, True)
			else:
				lib.parent.log('urls', '?', 'If the site is not serving the certificate chain, installing the aia library might make this request work: pip install aia')
				return 'Request error: site may have broken TLS configuration (%s)' % (e.reason), False
		else:
			return 'Request error: %s' % (e.reason), False
	except TimeoutError as e:
		return 'Request error: request timed out', False
	except Exception as e:
		return 'Unknown error: %s %r' % (type(e).__name__, e.args), False

	return response, try_aia


def goturl(url):
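	"""Build the reply for a generic URL: content type, size, an [AIA]
	marker when AIA chain-fetching was used, and the title for HTML pages.
	Returns None for URLs matching a skip pattern, or an error string on
	failure."""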
	output = []
	for _, group in other_regexes:
		for regex in group:
			if regex.match(url):
				return None

	response, used_aia = _do_request(url)
	if isinstance(response, stringbase):
		return response

	# Try to add type and length headers to reply
	c_type_fields = response.getheader('Content-Type', '').split(';')
	c_type = c_type_fields.pop(0)
	c_charset = None
	for f in c_type_fields:
		f = f.strip()
		if len(f) > 8 and f.startswith('charset='):
			c_charset = f[8:]
	c_len = response.getheader('Content-Length')
	if c_type != '':
		output.append("[%s] " % (c_type))
	else:
		output.append("[no type] ")
	if c_type != "text/html": # else the length is reported from the body by the HTML branch below
		if c_len is not None:
			output.append("[%s] " % (_humanize_bytes(c_len)))
		else:
			output.append("[no length] ")

	if used_aia:
		output.append("[AIA] ")

	# Try to add the title if the response is HTML
	if c_type == 'text/html':
		try:
			responsebody = response.read(1024*1024)
		except Exception as e:
			output.append('Error reading response body: %s %r' % (type(e).__name__, e.args))
		else:
			if c_len is not None and len(responsebody) != int(c_len):
				output.append("[actual %s; Content-Length %s] " % (_humanize_bytes(len(responsebody)), _humanize_bytes(c_len)))
			else:
				output.append("[%s] " % (_humanize_bytes(len(responsebody))))
			try:
				soup = BeautifulSoup(responsebody, from_encoding=c_charset)
				if soup.title:
					output.append('Title: ' + unescape('%s' % (soup.find('title').string.strip())))
				else:
					output.append('No title')
			except Exception as e:
				output.append('Title error: %s %r ' % (type(e).__name__, e.args))

	return ''.join(output)

url_regex = (
	re.compile(r'https?://(?:[^/\s.]+\.)+[a-z0-9-]+(?:/[^\s\]>)}]+)?', re.I),
)
other_regexes = (
	(lambda x: '', (re.compile(r"""https?://(?:www\.)?(?:twitter|x)\.com/""", re.I),)), # skip twitter
	(lambda x: '', (re.compile(r"""https?://(?:www\.)?reddit\.com/""", re.I),)), # skip new-reddit
	(lambda x: '', (re.compile(r"""https?://jfr\.im/git/""", re.I),)), # skip my gitweb
	(lambda x: '', (re.compile(r"""https?://(?:www\.)?wunderground\.com/""", re.I),)), # skip wunderground, they time us out
)
regexes = other_regexes + (
	(goturl, url_regex),
)
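
# A minimal offline sanity check for the pure helpers above (added;
# illustrative only, and assumes modlib imports cleanly outside the bot).
if __name__ == '__main__':
	assert parser_hostmask('alice!ident@host.example') == {'nick': 'alice', 'user': 'ident', 'host': 'host.example'}
	assert _humanize_bytes(2048) == '2.00kiB'
	assert _yt_duration('PT4M13S') == '4m13s'
	assert _yt_round(1234) == '1.2k'
	print('urls.py self-test OK')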