# Erebus IRC bot - Author: Conny Sjoblom
# vim: fileencoding=utf-8
# URL Checker
# This file is released into the public domain; see http://unlicense.org/

# module info
modinfo = {
	'author': 'Erebus Team',
	'license': 'public domain',
	'compatible': [0],
	'depends': [],
	'softdeps': [],
}

# http://embed.ly/tools/generator

# preamble
import modlib
lib = modlib.modlib(__name__)
modstart = lib.modstart
modstop = lib.modstop
# module code
import sys
if sys.version_info.major < 3:
	stringbase = basestring
	import urllib2
	import urlparse
	import HTMLParser
	html = HTMLParser.HTMLParser()
	import httplib as httpclient # py2 counterpart of py3's http.client
	from BeautifulSoup import BeautifulSoup
else:
	stringbase = str
	import urllib.request as urllib2
	import urllib.parse as urlparse
	import html
	import http.client as httpclient # importing this unconditionally would break py2
	from bs4 import BeautifulSoup
import re, json, datetime

try:
	import aia
	aia_session = aia.AIASession()
	# aia is broken on capath systems, needs cafile to work
	aia_session._context.load_verify_locations(cafile='/etc/ssl/certs/ca-certificates.crt')
	aia_session._trusted = {
		aia.openssl_get_cert_info(ca_der)["subject"]: ca_der
		for ca_der in aia_session._context.get_ca_certs(True)
	}
	print("aia loaded")
except ImportError as e:
	print(repr(e))
	aia = None

hostmask_regex = re.compile(r'^(.*)!(.*)@(.*)$')
def parser_hostmask(hostmask):
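	"""Parse an IRC hostmask ('nick!user@host') into a dict.

	Example: parser_hostmask('alice!ident@example.com')
	returns {'nick': 'alice', 'user': 'ident', 'host': 'example.com'}.
	Dicts pass through unchanged; a bare nick yields user=host=None.
	"""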
	if isinstance(hostmask, dict):
		return hostmask

	nick = None
	user = None
	host = None

	if hostmask is not None:
		match = hostmask_regex.match(hostmask)

		if not match:
			nick = hostmask
		else:
			nick = match.group(1)
			user = match.group(2)
			host = match.group(3)

	return {
		'nick': nick,
		'user': user,
		'host': host
	}

class SmartRedirectHandler(urllib2.HTTPRedirectHandler):
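	"""Redirect handler that records the redirect status code (301/302) on
	the returned response object, so callers can tell a redirect happened."""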
	def http_error_301(self, req, fp, code, msg, headers):
		result = urllib2.HTTPRedirectHandler.http_error_301(
			self, req, fp, code, msg, headers)
		result.status = code
		return result

	def http_error_302(self, req, fp, code, msg, headers):
		result = urllib2.HTTPRedirectHandler.http_error_302(
			self, req, fp, code, msg, headers)
		result.status = code
		return result

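# Channels where URL handling is disabled: comma-separated list in the
# 'urls.blocked' config option.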
def _get_blocked_chans():
	return lib.parent.cfg.get('urls', 'blocked', '').split(',')

def process_line(line):
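	"""Scan one message for URLs and return a list of response strings.

	Each (action, regexes) pair in `regexes` is tried; string matches are
	passed to the action as one argument, tuple matches are unpacked. At
	most 'urls.limit' (default 2) URLs are processed per line, and empty
	responses are dropped.
	"""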
	responses = []
	num_found = 0
	limit = lib.parent.cfg.getint('urls', 'limit', 2)
	for action, group in regexes:
		for regex in group:
			for match in regex.findall(line):
				if match:
					num_found += 1
					if num_found > limit:
						return responses
					if isinstance(match, stringbase):
						resp = action(match)
					else:
						resp = action(*match)
					if resp is not None and resp != "":
						responses.append(resp)
	return responses

@lib.hooknum("PRIVMSG")
def privmsg_hook(bot, textline):
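	"""PRIVMSG hook: pull the channel and message text out of the raw IRC
	line, skip blocked channels, then announce any URLs found."""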
	user = parser_hostmask(textline[1:textline.find(' ')])
	chan = textline.split()[2]

	if chan in _get_blocked_chans(): return

	try:
		line = textline.split(None, 3)[3][1:]
	except IndexError:
		line = ''

	responses = process_line(line)
	send_response(bot, chan, responses)

def send_response(bot, chan, responses):
	if len(responses) > 0:
		if lib.parent.cfg.getboolean('urls', 'multiline'):
			for r in responses:
				bot.msg(chan, r, True)
		else:
			bot.msg(chan, ' | '.join(responses), True)

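# Collapse runs of whitespace and decode HTML entities, for one-line IRC output.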
def unescape(line):
	return re.sub(r'\s+', ' ', html.unescape(line))

def gotspotify(type, track):
	url = 'http://ws.spotify.com/lookup/1/?uri=spotify:%s:%s' % (type, track)
	xml = urllib2.urlopen(url).read()
	if sys.version_info.major < 3:
		soup = BeautifulSoup(xml, convertEntities=BeautifulSoup.HTML_ENTITIES)
	else: # bs4 dropped convertEntities; it decodes entities by default
		soup = BeautifulSoup(xml, 'html.parser')
	lookup_type = soup.contents[2].name

	if lookup_type == 'track':
		name = soup.find('name').string
		album_name = soup.find('album').find('name').string
		artist_name = soup.find('artist').find('name').string
		popularity = soup.find('popularity')
		if popularity:
			popularity = float(popularity.string)*100
		length = float(soup.find('length').string)
		minutes = int(length)//60 # integer division; '/' would give a float on py3
		seconds = int(length)%60

		return unescape('Track: %s - %s / %s %s:%.2d %2d%%' % (artist_name, name, album_name, minutes, seconds, popularity))

	elif lookup_type == 'album':
		album_name = soup.find('album').find('name').string
		artist_name = soup.find('artist').find('name').string
		released = soup.find('released').string
		return unescape('Album: %s - %s - %s' % (artist_name, album_name, released))

	else:
		return 'Unsupported type.'

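# Convert an ISO 8601 duration from the YouTube API ('PT1H2M3S') to a compact
# human form ('1h2m3s').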
def _yt_duration(s):
	mo = re.match(r'P(\d+D)?T(\d+H)?(\d+M)?(\d+S)?', s)
	pcs = [x for x in mo.groups() if x]
	return ''.join(pcs).lower()
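# Parse the API's 'publishedAt' timestamp and render it with format f.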
def _yt_date(s, f):
	# The fractional-second part is optional; the API does not always send it.
	mo = re.match(r'(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2})(?:\.\d+)?Z', s)
	return datetime.datetime(*(int(x) for x in mo.groups())).strftime(f)
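# Abbreviate large counts: 1234 -> '1.2k', 5600000 -> '5.6m'.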
def _yt_round(n):
	n = float(n)
	if n >= 10**12:
		return '%.1ft' % (n/10**12)
	elif n >= 10**9:
		return '%.1fb' % (n/10**9)
	elif n >= 10**6:
		return '%.1fm' % (n/10**6)
	elif n >= 10**3:
		return '%.1fk' % (n/10**3)
	else:
		return int(n)

def gotyoutube(url):
	url_data = urlparse.urlparse(url)
	query = urlparse.parse_qs(url_data.query)
	video = query["v"][0]
	api_url = 'https://www.googleapis.com/youtube/v3/videos?part=snippet,contentDetails,statistics&id=%s&key=%s' % (video, lib.parent.cfg.get('urls', 'api_key'))
	try:
		respdata = urllib2.urlopen(api_url).read()
		v = json.loads(respdata)
		v = v['items'][0]

		return unescape(lib.parent.cfg.get('urls', 'yt_format', "\002%(author)s\002: \037%(title)s\037 [%(duration)s, uploaded %(uploaded)s, %(views)s v/%(likes)s l/%(dislikes)s d]") % {
			'title': v['snippet']['title'],
			'author': v['snippet']['channelTitle'],
			'duration': _yt_duration(v['contentDetails']['duration']),
			'uploaded': _yt_date(v['snippet']['publishedAt'], lib.parent.cfg.get('urls', 'yt_date_format', '%b %d %Y')),
			'views': _yt_round(v['statistics']['viewCount']),
			# like/dislike counts may be missing (dislikes are no longer public)
			'likes': _yt_round(v['statistics'].get('likeCount', 0)),
			'dislikes': _yt_round(v['statistics'].get('dislikeCount', 0)),
		})
	except urllib2.HTTPError as e:
		if e.getcode() == 403:
			return 'API limit exceeded'
		else:
			return str(e)
	except IndexError:
		return 'no results'
	except Exception as e:
		return str(e)

def gottwitch(uri):
	url = 'https://api.twitch.tv/helix/streams?user_login=%s' % uri.split('/')[0]
	opener = urllib2.build_opener()
	opener.addheaders = [('Client-ID', lib.parent.cfg.get('urls', 'twitch_api_key'))]
	respdata = opener.open(url).read()
	twitch = json.loads(respdata)['data']
	try:
		# TODO: add current game.
		return unescape('\037%s\037 is %s (%s)' % (twitch[0]['user_name'], twitch[0]['type'], twitch[0]['title']))
	except (IndexError, KeyError): # 'data' is empty when the channel is not live
		return 'Channel offline.'

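# Format a byte count with binary prefixes, e.g. 2048 -> '2.00kiB'.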
def _humanize_bytes(b):
	b = int(b)
	i = 0
	table = " kMGTPEZYRQ"
	while b > 1024:
		i += 1
		b /= 1024.0
	if i == 0:
		return "%dB" % (b)
	else:
		return "%.2f%siB" % (b, table[i])

def _do_request(url, try_aia=False):
	"""
	Return value is a tuple consisting of:
	- the HTTPResponse object, or a string on error. Empty string -> no response.
	- and a flag indicating whether AIA was used
	"""
	try:
		request = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36', 'Sec-Ch-Ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"', 'Sec-Ch-Ua-Mobile': '?0', 'Sec-Ch-Ua-Platform': '"Linux"', 'Sec-Fetch-Dest': 'document', 'Sec-Fetch-Mode': 'navigate', 'Sec-Fetch-Site': 'same-origin', 'Sec-Fetch-User': '?1', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', 'Accept-Language': 'en-US,en;q=0.9', 'Cache-Control': 'no-cache', 'Pragma': 'no-cache', 'Upgrade-Insecure-Requests': '1'})
	except ValueError:
		return '', False
	if try_aia:
		opener = urllib2.build_opener(urllib2.HTTPSHandler(context=aia_session.ssl_context_from_url(url)), SmartRedirectHandler())
	else:
		opener = urllib2.build_opener(SmartRedirectHandler())

	# Send request and handle errors
	try:
		response = opener.open(request, timeout=2)
	except httpclient.InvalidURL as e: # why does a method under urllib.request raise an exception under http.client???
		return '', False
	except urllib2.HTTPError as e:
		return 'Request error: %s %s' % (e.code, e.reason), False
	except urllib2.URLError as e:
		if "certificate verify failed: unable to get local issuer certificate" in str(e.reason):
			if aia: # Retry with AIA enabled, if module is present
				return _do_request(url, True)
			else:
				lib.parent.log('urls', '?', 'If the site is not serving the certificate chain, installing the aia library might make this request work: pip install aia')
				return 'Request error: site may have broken TLS configuration (%s)' % (e.reason), False
		else:
			return 'Request error: %s' % (e.reason), False
	except TimeoutError as e:
		return 'Request error: request timed out', False
	except Exception as e:
		return 'Unknown error: %s %r' % (type(e).__name__, e.args), False

	return response, try_aia


def goturl(url):
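	"""Fetch a URL and build the announcement text: content type, size,
	whether AIA chain-fetching was used, and the page title for HTML."""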
	output = []
	for _, group in other_regexes:
		for regex in group:
			if regex.match(url):
				return None

	response, used_aia = _do_request(url)
	if isinstance(response, stringbase):
		return response

	# Try to add type and length headers to reply
	c_type_fields = response.getheader('Content-Type', '').split(';')
	c_type = c_type_fields.pop(0)
	c_charset = None
	for f in c_type_fields:
		f = f.strip()
		if f.startswith('charset='):
			c_charset = f[8:]
	c_len = response.getheader('Content-Length')
	if c_type != '':
		output.append("[%s] " % (c_type))
	else:
		output.append("[no type] ")
	if c_type != "text/html": # else length will be provided by HTML code below
		if c_len is not None:
			output.append("[%s] " % (_humanize_bytes(c_len)))
		else:
			output.append("[no length] ")

	if used_aia:
		output.append("[AIA] ")

	# Try to add title if HTML
	if c_type == 'text/html':
		try:
			responsebody = response.read(1024*1024)
		except Exception as e:
			output.append('Error reading response body: %s %r' % (type(e).__name__, e.args))
		else:
			if c_len is not None and len(responsebody) != int(c_len):
				output.append("[actual %s; Content-Length %s] " % (_humanize_bytes(len(responsebody)), _humanize_bytes(c_len)))
			else:
				output.append("[%s] " % (_humanize_bytes(len(responsebody))))
			try:
				soup = BeautifulSoup(responsebody, 'html.parser', from_encoding=c_charset) # bs4 wants an explicit parser
				if soup.title:
					output.append('Title: ' + unescape('%s' % (soup.find('title').string.strip())))
				else:
					output.append('No title')
			except Exception as e:
				output.append('Title error: %s %r ' % (type(e).__name__, e.args))

	return ''.join(output)

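# Each entry below pairs an action with a tuple of regexes; process_line()
# feeds every match to the action. The skip actions return '', which
# process_line() discards, so those URLs produce no output.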
url_regex = (
	re.compile(r'https?://(?:[^/\s.]+\.)+[^/\s.]+(?:/\S+)?'),
)
other_regexes = (
	(lambda x: '', (re.compile(r"""https?://(?:www\.)?(?:twitter|x)\.com/""", re.I),)), # skip twitter
	(lambda x: '', (re.compile(r"""https?://(?:www\.)?reddit\.com/""", re.I),)), # skip new-reddit
	(lambda x: '', (re.compile(r"""https?://jfr\.im/git/""", re.I),)), # skip my gitweb
	(lambda x: '', (re.compile(r"""https?://(?:www\.)?wunderground\.com/""", re.I),)), # skip wunderground, they time us out
)
regexes = other_regexes + (
	(goturl, url_regex),
)