[erebus.git] / modules / urls.py

# Erebus IRC bot - Author: Conny Sjoblom
# vim: fileencoding=utf-8
# URL Checker
# This file is released into the public domain; see http://unlicense.org/

# module info
# Static metadata describing this module.
# NOTE(review): presumably consumed by the bot's module loader; the meaning of
# 'compatible', 'depends' and 'softdeps' is defined outside this file - confirm.
modinfo = {
	'author': 'Erebus Team',
	'license': 'public domain',
	'compatible': [0],
	'depends': [],
	'softdeps': [],
}

# http://embed.ly/tools/generator

# preamble
# Create this module's modlib handle; the rest of the file uses it for hook
# registration (lib.hooknum) and for config access (lib.parent.cfg).
import modlib
lib = modlib.modlib(__name__)
# Re-export the handle's start/stop callables under module-level names.
# NOTE(review): presumably the loader looks these up by name - confirm.
modstart = lib.modstart
modstop = lib.modstop

# module code
import sys
if sys.version_info.major < 3:
	stringbase = basestring
	import urllib2
	import urlparse
	import HTMLParser
	html = HTMLParser.HTMLParser()
	from BeautifulSoup import BeautifulSoup
else:
	stringbase = str
	import urllib.request as urllib2
	import urllib.parse as urlparse
	import html
	from bs4 import BeautifulSoup

import re, json, datetime

# Splits a "nick!user@host" string into its three components.
hostmask_regex = re.compile(r'^(.*)!(.*)@(.*)$')

def parser_hostmask(hostmask):
	"""Normalize a hostmask into a dict with 'nick', 'user' and 'host' keys.

	A dict argument is returned untouched. None yields a dict whose three
	values are all None. A string that does not look like "nick!user@host"
	is treated as a bare nick, leaving 'user' and 'host' as None.
	"""
	if isinstance(hostmask, dict):
		return hostmask

	parts = {'nick': None, 'user': None, 'host': None}
	if hostmask is None:
		return parts

	found = hostmask_regex.match(hostmask)
	if found:
		parts['nick'], parts['user'], parts['host'] = found.groups()
	else:
		# No "!" / "@" structure: the whole string is the nick.
		parts['nick'] = hostmask
	return parts

class SmartRedirectHandler(urllib2.HTTPRedirectHandler):
	"""Redirect handler that stamps the redirect's HTTP code onto the result.

	For 301 and 302 responses the stock handler is asked to follow the
	redirect, and the object it returns gets its ``status`` attribute set to
	the redirect code that was seen.
	"""

	def _tag_status(self, result, code):
		"""Set result.status to code, tolerating a missing result.

		The stock handler returns None when the redirect response carries no
		Location/URI header; pass that through instead of raising
		AttributeError so the opener can continue its normal error handling.
		"""
		if result is not None:
			result.status = code
		return result

	def http_error_301(self, req, fp, code, msg, headers):
		"""Follow a 301 via the stock handler and record the code on the result."""
		result = urllib2.HTTPRedirectHandler.http_error_301(
				self, req, fp, code, msg, headers)
		return self._tag_status(result, code)

	def http_error_302(self, req, fp, code, msg, headers):
		"""Follow a 302 via the stock handler and record the code on the result."""
		result = urllib2.HTTPRedirectHandler.http_error_302(
				self, req, fp, code, msg, headers)
		return self._tag_status(result, code)

def _get_blocked_chans():
	"""Return the channel names listed in the [urls] 'blocked' setting.

	The setting is a comma-separated string (default: empty). Whitespace
	around each entry is stripped and empty entries are dropped, so a value
	such as "#a, #b" yields ['#a', '#b'] and an unset value yields [].
	"""
	raw = lib.parent.cfg.get('urls', 'blocked', '')
	return [chan.strip() for chan in raw.split(',') if chan.strip()]

def process_line(line):
	"""Run every registered pattern over line and collect handler replies.

	Walks the module-level ``regexes`` table of (handler, patterns) pairs.
	Each non-empty findall() result counts toward the [urls] 'limit' setting
	(default 2); once that many matches have been seen, whatever has been
	collected so far is returned. A string match is passed to the handler as
	one argument, anything else is unpacked into positional arguments.
	Replies that are None or "" are discarded.
	"""
	max_matches = lib.parent.cfg.getint('urls', 'limit', 2)
	collected = []
	seen = 0
	for handler, patterns in regexes:
		for pattern in patterns:
			for hit in pattern.findall(line):
				if not hit:
					continue
				seen += 1
				if seen > max_matches:
					return collected
				if isinstance(hit, stringbase):
					reply = handler(hit)
				else:
					reply = handler(*hit)
				if reply is None or reply == "":
					continue
				collected.append(reply)
	return collected

@lib.hooknum("PRIVMSG")
def privmsg_hook(bot, textline):
	"""Handle a raw PRIVMSG line: scan its text and reply with any results.

	NOTE(review): assumes textline looks like ":nick!user@host PRIVMSG <target>
	:<text>" - the third whitespace-separated token is used as the reply
	target and the fourth field, minus its first character, as the text.
	"""
	# Sender prefix without its leading character; parsed but not used below.
	user = parser_hostmask(textline[1:textline.find(' ')])
	chan = textline.split()[2]

	if chan in _get_blocked_chans():
		return

	# Everything after the third token is the message; drop its first char.
	pieces = textline.split(None, 3)
	line = pieces[3][1:] if len(pieces) > 3 else ''

	responses = process_line(line)
	if not responses:
		return

	if lib.parent.cfg.getboolean('urls', 'multiline'):
		# One message per reply.
		for reply in responses:
			bot.msg(chan, reply, True)
	else:
		# All replies joined into a single message.
		bot.msg(chan, ' | '.join(responses), True)

def unescape(line):
	"""Decode HTML entities in line and collapse whitespace runs to one space."""
	# Raw string: '\s' in a plain literal is an invalid escape sequence and
	# triggers a warning on modern Python versions.
	return re.sub(r'\s+', ' ', html.unescape(line))

def gotspotify(type, track):
	"""Look up a Spotify track or album and return a one-line summary.

	type  -- the URI kind inserted into "spotify:<type>:<track>"
	track -- the Spotify identifier

	Returns a "Track: ..." or "Album: ..." string, or 'Unsupported type.' when
	the lookup response is neither. Network and parse errors propagate.
	"""
	url = 'http://ws.spotify.com/lookup/1/?uri=spotify:%s:%s' % (type, track)
	response = urllib2.urlopen(url)
	try:
		xml = response.read()
	finally:
		# Always release the connection, even if the read fails.
		response.close()
	# NOTE(review): convertEntities / HTML_ENTITIES is the BeautifulSoup 3 API;
	# confirm this call still works with the bs4 import used on Python 3.
	soup = BeautifulSoup(xml, convertEntities=BeautifulSoup.HTML_ENTITIES)
	lookup_type = soup.contents[2].name

	if lookup_type == 'track':
		name = soup.find('name').string
		album_name = soup.find('album').find('name').string
		artist_name = soup.find('artist').find('name').string
		popularity = soup.find('popularity')
		length = float(soup.find('length').string)
		# divmod keeps minutes an integer on Python 3 as well ("/" would
		# produce a float such as 3.5833).
		minutes, seconds = divmod(int(length), 60)

		summary = 'Track: %s - %s / %s %d:%.2d' % (artist_name, name, album_name, minutes, seconds)
		# Popularity is optional; only append it when the element exists
		# (formatting None with %d would raise TypeError).
		if popularity is not None:
			summary += ' %2d%%' % (float(popularity.string)*100)
		return unescape(summary)

	elif lookup_type == 'album':
		album_name = soup.find('album').find('name').string
		artist_name = soup.find('artist').find('name').string
		released = soup.find('released').string
		return unescape('Album: %s - %s - %s' % (artist_name, album_name, released))

	else:
		return 'Unsupported type.'

def _yt_duration(s):
	mo = re.match(r'P(\d+D)?T(\d+H)?(\d+M)?(\d+S)?', s)
	pcs = [x for x in mo.groups() if x]
	return ''.join(pcs).lower()
def _yt_date(s, f):
	mo = re.match(r'(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2})\.(\d+)Z', s)
	return datetime.datetime(*(int(x) for x in mo.groups())).strftime(f)
def _yt_round(n):
	n = float(n)
	if n >= 10**12:
		return '%.1ft' % (n/10**12)
	elif n >= 10**9:
		return '%.1fb' % (n/10**9)
	elif n >= 10**6:
		return '%.1fm' % (n/10**6)
	elif n >= 10**3:
		return '%.1fk' % (n/10**3)
	else:
		return int(n)

def gotyoutube(url):
	"""Describe the YouTube video referenced by url's "v" query parameter.

	Returns the [urls] 'yt_format' template filled with title, author,
	duration, upload date and view/like/dislike counts. Returns None when the
	URL carries no "v" parameter, 'no results' when the API returns no item,
	'API limit exceeded' on HTTP 403, and the error text for anything else.
	"""
	url_data = urlparse.urlparse(url)
	query = urlparse.parse_qs(url_data.query)
	try:
		video = query["v"][0]
	except (KeyError, IndexError):
		# Not a watch URL (no video id): nothing to report.
		return None
	api_url = 'https://www.googleapis.com/youtube/v3/videos?part=snippet,contentDetails,statistics&id=%s&key=%s' % (video, lib.parent.cfg.get('urls', 'api_key'))
	try:
		response = urllib2.urlopen(api_url)
		try:
			respdata = response.read()
		finally:
			# Always release the connection, even if the read fails.
			response.close()
		v = json.loads(respdata)
		v = v['items'][0]

		# Individual counters can be absent from the statistics object;
		# show a placeholder instead of failing the whole reply.
		stats = v.get('statistics', {})
		counts = {}
		for label, key in (('views', 'viewCount'), ('likes', 'likeCount'), ('dislikes', 'dislikeCount')):
			counts[label] = _yt_round(stats[key]) if key in stats else 'n/a'

		return unescape(lib.parent.cfg.get('urls', 'yt_format', "\002%(author)s\002: \037%(title)s\037 [%(duration)s, uploaded %(uploaded)s, %(views)s v/%(likes)s l/%(dislikes)s d]") % {
			'title': v['snippet']['title'],
			'author': v['snippet']['channelTitle'],
			'duration': _yt_duration(v['contentDetails']['duration']),
			'uploaded': _yt_date(v['snippet']['publishedAt'], lib.parent.cfg.get('urls', 'yt_date_format', '%b %d %Y')),
			'views': counts['views'],
			'likes': counts['likes'],
			'dislikes': counts['dislikes'],
		})
	except urllib2.HTTPError as e:
		if e.getcode() == 403:
			return 'API limit exceeded'
		else:
			return str(e)
	except IndexError:
		return 'no results'
	except Exception as e:
		# Best-effort handler: report any other failure as text.
		return str(e)

def gottwitch(uri):
	"""Report the live status of the Twitch channel named by uri.

	Only the part of uri before the first "/" is used as the login name.
	Returns "<user> is <type> (<title>)" for a live stream, or
	'Channel offline.' when the API returns no stream entry. Network and
	JSON errors propagate to the caller.
	"""
	url = 'https://api.twitch.tv/helix/streams?user_login=%s' % uri.split('/')[0]
	opener = urllib2.build_opener()
	opener.addheaders = [('Client-ID', lib.parent.cfg.get('urls', 'twitch_api_key'))]
	response = opener.open(url)
	try:
		respdata = response.read()
	finally:
		# Always release the connection, even if the read fails.
		response.close()
	twitch = json.loads(respdata)['data']
	# An empty list means there is no live stream for this login.
	if not twitch:
		return 'Channel offline.'
	try:
		# TODO: add current game.
		stream = twitch[0]
		return unescape('\037%s\037 is %s (%s)' % (stream['user_name'], stream['type'], stream['title']))
	except (IndexError, KeyError, TypeError):
		# Unexpected entry shape: keep the original best-effort answer, but
		# no longer swallow unrelated errors such as KeyboardInterrupt.
		return 'Channel offline.'

def _humanize_bytes(b):
	b = int(b)
	i = 0
	table = " kMGTPEZYRQ"
	while b > 1024:
		i += 1
		b /= 1024.0
	if i == 0:
		return "%dB" % (b)
	else:
		return "%.2f%siB" % (b, table[i])

def goturl(url):
	"""Fetch url and return a one-line summary of what it points to.

	The summary starts with the content type, followed by the size. For
	text/html responses up to 1 MiB of the body is read and the page title is
	appended. Returns None when url matches one of the patterns in
	``other_regexes`` (those have their own handlers), and a
	'Request error: ...' / 'Unknown error: ...' string when the request fails.

	NOTE(review): the URL comes straight from chat text, so this will fetch
	whatever host a user names, including internal addresses - consider
	restricting targets if that matters for the deployment.
	"""
	# Local import: only needed for the timeout exception class below.
	import socket

	for _, group in other_regexes:
		for regex in group:
			if regex.match(url):
				return None
	request = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'})
	opener = urllib2.build_opener(SmartRedirectHandler())

	# Send request and handle errors
	try:
		response = opener.open(request, timeout=2)
	except urllib2.HTTPError as e:
		return 'Request error: %s %s' % (e.code, e.reason)
	except urllib2.URLError as e:
		return 'Request error: %s' % (e.reason)
	except socket.timeout:
		# socket.timeout is what a read timeout raises; on newer Pythons it
		# is the same class as the builtin TimeoutError.
		return 'Request error: request timed out'
	except Exception as e:
		return 'Unknown error: %s %r' % (type(e).__name__, e.args)

	output = []
	try:
		# Try to add type and length headers to reply
		headers = response.headers
		c_type = headers.get('Content-Type', '').split(';', 1)[0].strip()
		is_html = c_type.lower() == 'text/html'
		raw_len = headers.get('Content-Length')
		try:
			c_len = int(raw_len) if raw_len is not None else None
		except ValueError:
			# Malformed Content-Length: treat it as not provided.
			c_len = None

		if c_type != '':
			output.append("[%s] " % (c_type))
		else:
			output.append("[no type] ")

		if not is_html:
			if c_len is not None:
				output.append("[%s] " % (_humanize_bytes(c_len)))
			else:
				output.append("[no length] ")
		else:
			# HTML: report the size actually read and try to add the title.
			try:
				responsebody = response.read(1024*1024)
			except Exception as e:
				output.append('Error reading response body: %s %r' % (type(e).__name__, e.args))
			else:
				if c_len is not None and len(responsebody) != c_len:
					output.append("[actual %s; Content-Length %s] " % (_humanize_bytes(len(responsebody)), _humanize_bytes(c_len)))
				else:
					output.append("[%s] " % (_humanize_bytes(len(responsebody))))
				try:
					soup = BeautifulSoup(responsebody)
					title = soup.title
					# .string is None for an empty title or one with nested
					# markup; report that as "no title" rather than an error.
					if title is not None and title.string is not None:
						output.append('Title: ' + unescape('%s' % (title.string.strip())))
					else:
						output.append('No title')
				except Exception as e:
					output.append('Title error: %s %r ' % (type(e).__name__, e.args))
	finally:
		# Release the connection no matter how summarizing went.
		response.close()

	return ''.join(output)

# Patterns handled by goturl: an http(s) URL with a dotted hostname and an
# optional path.
url_regex = (
	re.compile(r'https?://(?:[^/\s.]+\.)+[^/\s.]+(?:/\S+)?'),
)
# (handler, patterns) pairs for site-specific handlers. Empty here, so none of
# the site-specific functions above are registered; goturl returns None for any
# URL that matches an entry in this table.
other_regexes = (
)
# Full dispatch table walked by process_line: the site-specific entries first,
# then the generic URL handler.
regexes = other_regexes + (
	(goturl, url_regex),
)
Commit	Line	Data
58cd0191	1	# Erebus IRC bot - Author: Conny Sjoblom
4477123d	2	# vim: fileencoding=utf-8
a83e1f9c	3	# URL Checker
	4	# This file is released into the public domain; see http://unlicense.org/
	5
	6	# module info
	7	modinfo = {
	8	'author': 'Erebus Team',
	9	'license': 'public domain',
fa93b933	10	'compatible': [0],
a62d0d18	11	'depends': [],
a62d0d18	12	'softdeps': [],
a83e1f9c	13	}
a83e1f9c	14
99366200 CS	15	# http://embed.ly/tools/generator
99366200 CS	16
a83e1f9c	17	# preamble
	18	import modlib
	19	lib = modlib.modlib(__name__)
	20	modstart = lib.modstart
	21	modstop = lib.modstop
	22
	23	# module code
a28e2ae9	24	import sys
a28e2ae9	25	if sys.version_info.major < 3:
55bfe803	26	stringbase = basestring
a28e2ae9	27	import urllib2
	28	import urlparse
	29	import HTMLParser
d266ce49	30	html = HTMLParser.HTMLParser()
a28e2ae9	31	from BeautifulSoup import BeautifulSoup
a28e2ae9	32	else:
55bfe803	33	stringbase = str
a28e2ae9	34	import urllib.request as urllib2
a28e2ae9	35	import urllib.parse as urlparse
d266ce49	36	import html
a28e2ae9	37	from bs4 import BeautifulSoup
a28e2ae9	38
467acacf	39	import re, json, datetime
a83e1f9c	40
390fbad4	41	hostmask_regex = re.compile(r'^(.)!(.)@(.*)$')
a83e1f9c	42
	43	def parser_hostmask(hostmask):
	44	if isinstance(hostmask, dict):
	45	return hostmask
	46
	47	nick = None
	48	user = None
	49	host = None
	50
	51	if hostmask is not None:
	52	match = hostmask_regex.match(hostmask)
	53
	54	if not match:
	55	nick = hostmask
	56	else:
	57	nick = match.group(1)
	58	user = match.group(2)
	59	host = match.group(3)
	60
	61	return {
	62	'nick': nick,
	63	'user': user,
	64	'host': host
	65	}
	66
394a7b69 CS	67	class SmartRedirectHandler(urllib2.HTTPRedirectHandler):
	68	def http_error_301(self, req, fp, code, msg, headers):
	69	result = urllib2.HTTPRedirectHandler.http_error_301(
	70	self, req, fp, code, msg, headers)
	71	result.status = code
	72	return result
	73
	74	def http_error_302(self, req, fp, code, msg, headers):
	75	result = urllib2.HTTPRedirectHandler.http_error_302(
	76	self, req, fp, code, msg, headers)
	77	result.status = code
	78	return result
	79
87f0733f JR	80	def _get_blocked_chans():
	81	return lib.parent.cfg.get('urls', 'blocked', '').split(',')
	82
467acacf	83	def process_line(line):
	84	responses = []
	85	num_found = 0
	86	limit = lib.parent.cfg.getint('urls', 'limit', 2)
ecbed328	87	for action, group in regexes:
467acacf	88	for regex in group:
	89	for match in regex.findall(line):
	90	if match:
	91	num_found += 1
	92	if num_found > limit:
	93	return responses
55bfe803 JR	94	if isinstance(match, stringbase):
	95	resp = action(match)
	96	else:
	97	resp = action(*match)
ecbed328 JR	98	if resp is not None and resp != "":
ecbed328 JR	99	responses.append(resp)
467acacf	100	return responses
467acacf	101
a83e1f9c	102	@lib.hooknum("PRIVMSG")
390fbad4 CS	103	def privmsg_hook(bot, textline):
	104	user = parser_hostmask(textline[1:textline.find(' ')])
	105	chan = textline.split()[2]
a83e1f9c	106
87f0733f JR	107	if chan in _get_blocked_chans(): return
87f0733f JR	108
a83e1f9c	109	try:
390fbad4	110	line = textline.split(None, 3)[3][1:]
a83e1f9c	111	except IndexError:
390fbad4	112	line = ''
a83e1f9c	113
467acacf	114	responses = process_line(line)
04d48353	115	if len(responses) > 0:
467acacf	116	if lib.parent.cfg.getboolean('urls', 'multiline'):
	117	for r in responses:
	118	bot.msg(chan, r, True)
	119	else:
	120	bot.msg(chan, ' \| '.join(responses), True)
a83e1f9c	121
390fbad4	122	def unescape(line):
d266ce49	123	return re.sub('\s+', ' ', html.unescape(line))
a83e1f9c	124
	125	def gotspotify(type, track):
	126	url = 'http://ws.spotify.com/lookup/1/?uri=spotify:%s:%s' % (type, track)
	127	xml = urllib2.urlopen(url).read()
390fbad4	128	soup = BeautifulSoup(xml, convertEntities=BeautifulSoup.HTML_ENTITIES)
a83e1f9c	129	lookup_type = soup.contents[2].name
390fbad4	130
a83e1f9c	131	if lookup_type == 'track':
	132	name = soup.find('name').string
	133	album_name = soup.find('album').find('name').string
	134	artist_name = soup.find('artist').find('name').string
	135	popularity = soup.find('popularity')
	136	if popularity:
	137	popularity = float(popularity.string)*100
	138	length = float(soup.find('length').string)
	139	minutes = int(length)/60
467acacf	140	seconds = int(length)%60
390fbad4	141
dafa38fc	142	return unescape('Track: %s - %s / %s %s:%.2d %2d%%' % (artist_name, name, album_name, minutes, seconds, popularity))
390fbad4	143
a83e1f9c	144	elif lookup_type == 'album':
	145	album_name = soup.find('album').find('name').string
	146	artist_name = soup.find('artist').find('name').string
	147	released = soup.find('released').string
dafa38fc	148	return unescape('Album: %s - %s - %s' % (artist_name, album_name, released))
390fbad4	149
a83e1f9c	150	else:
	151	return 'Unsupported type.'
	152
467acacf	153	def _yt_duration(s):
	154	mo = re.match(r'P(\d+D)?T(\d+H)?(\d+M)?(\d+S)?', s)
	155	pcs = [x for x in mo.groups() if x]
	156	return ''.join(pcs).lower()
	157	def _yt_date(s, f):
	158	mo = re.match(r'(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2})\.(\d+)Z', s)
	159	return datetime.datetime(*(int(x) for x in mo.groups())).strftime(f)
	160	def _yt_round(n):
	161	n = float(n)
	162	if n >= 10**12:
	163	return '%.1ft' % (n/10**12)
	164	elif n >= 10**9:
	165	return '%.1fb' % (n/10**9)
	166	elif n >= 10**6:
	167	return '%.1fm' % (n/10**6)
	168	elif n >= 10**3:
	169	return '%.1fk' % (n/10**3)
	170	else:
	171	return int(n)
	172
a83e1f9c	173	def gotyoutube(url):
	174	url_data = urlparse.urlparse(url)
	175	query = urlparse.parse_qs(url_data.query)
	176	video = query["v"][0]
467acacf	177	api_url = 'https://www.googleapis.com/youtube/v3/videos?part=snippet,contentDetails,statistics&id=%s&key=%s' % (video, lib.parent.cfg.get('urls', 'api_key'))
a83e1f9c	178	try:
a83e1f9c	179	respdata = urllib2.urlopen(api_url).read()
467acacf	180	v = json.loads(respdata)
	181	v = v['items'][0]
	182
	183	return unescape(lib.parent.cfg.get('urls', 'yt_format', "\002%(author)s\002: \037%(title)s\037 [%(duration)s, uploaded %(uploaded)s, %(views)s v/%(likes)s l/%(dislikes)s d]") % {
	184	'title': v['snippet']['title'],
	185	'author': v['snippet']['channelTitle'],
	186	'duration': _yt_duration(v['contentDetails']['duration']),
	187	'uploaded': _yt_date(v['snippet']['publishedAt'], lib.parent.cfg.get('urls', 'yt_date_format', '%b %d %Y')),
	188	'views': _yt_round(v['statistics']['viewCount']),
	189	'likes': _yt_round(v['statistics']['likeCount']),
	190	'dislikes': _yt_round(v['statistics']['dislikeCount']),
	191	})
	192	except urllib2.HTTPError as e:
	193	if e.getcode() == 403:
	194	return 'API limit exceeded'
	195	else:
	196	return str(e)
	197	except IndexError:
	198	return 'no results'
	199	except Exception as e:
	200	return str(e)
a83e1f9c	201
390fbad4	202	def gottwitch(uri):
467acacf	203	url = 'https://api.twitch.tv/helix/streams?user_login=%s' % uri.split('/')[0]
	204	opener = urllib2.build_opener()
	205	opener.addheaders = [('Client-ID', lib.parent.cfg.get('urls', 'twitch_api_key'))]
	206	respdata = opener.open(url).read()
	207	twitch = json.loads(respdata)['data']
	208	try:
	209	# TODO: add current game.
	210	return unescape('\037%s\037 is %s (%s)' % (twitch[0]['user_name'], twitch[0]['type'], twitch[0]['title']))
	211	except:
	212	return 'Channel offline.'
390fbad4	213
ecbed328 JR	214	def _humanize_bytes(b):
	215	b = int(b)
	216	i = 0
	217	table = " kMGTPEZYRQ"
	218	while b > 1024:
	219	i += 1
	220	b /= 1024.0
	221	if i == 0:
	222	return "%dB" % (b)
	223	else:
	224	return "%.2f%siB" % (b, table[i])
	225
390fbad4	226	def goturl(url):
ecbed328 JR	227	output = []
ecbed328 JR	228	for _, group in other_regexes:
467acacf	229	for regex in group:
	230	if regex.match(url):
	231	return None
9df62f90	232	request = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'})
394a7b69	233	opener = urllib2.build_opener(SmartRedirectHandler())
ecbed328 JR	234
ecbed328 JR	235	# Send request and handle errors
993046cc	236	try:
ecbed328	237	response = opener.open(request, timeout=2)
de8ab9cb	238	except urllib2.HTTPError as e:
ecbed328	239	return 'Request error: %s %s' % (e.code, e.reason)
74dc2a9d	240	except urllib2.URLError as e:
ecbed328	241	return 'Request error: %s' % (e.reason)
9df62f90	242	except TimeoutError as e:
ecbed328	243	return 'Request error: request timed out'
04d48353	244	except Exception as e:
ecbed328 JR	245	return 'Unknown error: %s %r' % (type(e).__name__, e.args)
	246
	247	# Try to add type and length headers to reply
	248	c_type = response.getheader('Content-Type', '').split(';', 1)[0]
	249	c_len = response.getheader('Content-Length')
	250	if c_type != '':
	251	output.append("[%s] " % (c_type))
	252	else:
	253	output.append("[no type] ")
	254	if c_type != "text/html": # else length will be provided by HTML code below
	255	if c_len is not None:
	256	output.append("[%s] " % (_humanize_bytes(c_len)))
	257	else:
	258	output.append("[no length] ")
	259
	260	# Try to add title if HTML
	261	if c_type == 'text/html':
	262	try:
	263	responsebody = response.read(1024*1024)
	264	print(type(responsebody))
	265	except Exception as e:
	266	output.append('Error reading response body: %s %r' % (type(e).__name__, e.args))
	267	else:
	268	if c_len is not None and len(responsebody) != int(c_len):
	269	output.append("[actual %s; Content-Length %s] " % (_humanize_bytes(len(responsebody)), _humanize_bytes(c_len)))
	270	else:
	271	output.append("[%s] " % (_humanize_bytes(len(responsebody))))
	272	try:
	273	soup = BeautifulSoup(responsebody)
	274	if soup.title:
07fbfaa6	275	output.append('Title: ' + unescape('%s' % (soup.find('title').string.strip())))
ecbed328 JR	276	else:
	277	output.append('No title')
	278	except Exception as e:
	279	output.append('Title error: %s %r ' % (type(e).__name__, e.args))
	280
	281	return ''.join(output)
467acacf	282
467acacf	283	url_regex = (
ecbed328	284	re.compile(r'https?://(?:[^/\s.]+\.)+[^/\s.]+(?:/\S+)?'),
467acacf	285	)
467acacf	286	other_regexes = (
467acacf	287	)
467acacf	288	regexes = other_regexes + (
ecbed328	289	(goturl, url_regex),
467acacf	290	)