# Erebus IRC bot - Author: Conny Sjoblom
# vim: fileencoding=utf-8
# URL Checker
# This file is released into the public domain; see http://unlicense.org/

# module info
modinfo = {
	'author': 'Erebus Team',
	'license': 'public domain',
	'compatible': [0],
	'depends': [],
	'softdeps': [],
}

# http://embed.ly/tools/generator

# preamble
import modlib
lib = modlib.modlib(__name__)
modstart = lib.modstart
modstop = lib.modstop

# module code
import sys
if sys.version_info.major < 3:
	stringbase = basestring
	import urllib2
	import urlparse
	import HTMLParser
	html = HTMLParser.HTMLParser()
	from BeautifulSoup import BeautifulSoup
else:
	stringbase = str
	import urllib.request as urllib2
	import urllib.parse as urlparse
	import html
	from bs4 import BeautifulSoup

import re, json, datetime

hostmask_regex = re.compile(r'^(.*)!(.*)@(.*)$')

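# Parse an IRC hostmask ("nick!user@host") into its parts.
# Example: parser_hostmask('nick!user@example.com') ->
#   {'nick': 'nick', 'user': 'user', 'host': 'example.com'}
# Dicts pass through unchanged; a bare nick yields user=host=None.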
def parser_hostmask(hostmask):
	if isinstance(hostmask, dict):
		return hostmask

	nick = None
	user = None
	host = None

	if hostmask is not None:
		match = hostmask_regex.match(hostmask)

		if not match:
			nick = hostmask
		else:
			nick = match.group(1)
			user = match.group(2)
			host = match.group(3)

	return {
		'nick': nick,
		'user': user,
		'host': host
	}

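# urllib's redirect handler follows 301/302s transparently; this subclass
# also records the redirect status code on the returned response object.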
class SmartRedirectHandler(urllib2.HTTPRedirectHandler):
	def http_error_301(self, req, fp, code, msg, headers):
		result = urllib2.HTTPRedirectHandler.http_error_301(
			self, req, fp, code, msg, headers)
		result.status = code
		return result

	def http_error_302(self, req, fp, code, msg, headers):
		result = urllib2.HTTPRedirectHandler.http_error_302(
			self, req, fp, code, msg, headers)
		result.status = code
		return result

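# Scan one message for recognized URLs and collect handler responses,
# stopping once the configured per-line limit (default 2) is reached.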
def process_line(line):
	responses = []
	num_found = 0
	limit = lib.parent.cfg.getint('urls', 'limit', 2)
	for action, group in regexes:
		for regex in group:
			for match in regex.findall(line):
				if match:
					num_found += 1
					if num_found > limit:
						return responses
					if isinstance(match, stringbase):
						resp = action(match)
					else:
						resp = action(*match)
					if resp is not None and resp != "":
						responses.append(resp)
	return responses

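# Raw PRIVMSG hook: pull the channel and message text out of the raw line,
# run the URL handlers, and reply either one line per URL or joined with
# ' | ', depending on the [urls] multiline config option.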
@lib.hooknum("PRIVMSG")
def privmsg_hook(bot, textline):
	user = parser_hostmask(textline[1:textline.find(' ')])
	chan = textline.split()[2]

	try:
		line = textline.split(None, 3)[3][1:]
	except IndexError:
		line = ''

	responses = process_line(line)
	if len(responses) > 0:
		if lib.parent.cfg.getboolean('urls', 'multiline'):
			for r in responses:
				bot.msg(chan, r, True)
		else:
			bot.msg(chan, ' | '.join(responses), True)

def unescape(line):
	return re.sub(r'\s+', ' ', html.unescape(line))

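# Note: the ws.spotify.com lookup API used below has been retired, and the
# 'convertEntities' argument is BeautifulSoup 3 (Python 2) API that bs4
# rejects, so this handler only functions on the legacy Python 2 path.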
def gotspotify(type, track):
	url = 'http://ws.spotify.com/lookup/1/?uri=spotify:%s:%s' % (type, track)
	xml = urllib2.urlopen(url).read()
	soup = BeautifulSoup(xml, convertEntities=BeautifulSoup.HTML_ENTITIES)
	lookup_type = soup.contents[2].name

	if lookup_type == 'track':
		name = soup.find('name').string
		album_name = soup.find('album').find('name').string
		artist_name = soup.find('artist').find('name').string
		popularity = soup.find('popularity')
		if popularity:
			popularity = float(popularity.string)*100
		length = float(soup.find('length').string)
		minutes = int(length)//60 # floor division: '/' would give a float on Py3
		seconds = int(length)%60

		return unescape('Track: %s - %s / %s %s:%.2d %2d%%' % (artist_name, name, album_name, minutes, seconds, popularity))

	elif lookup_type == 'album':
		album_name = soup.find('album').find('name').string
		artist_name = soup.find('artist').find('name').string
		released = soup.find('released').string
		return unescape('Album: %s - %s - %s' % (artist_name, album_name, released))

	else:
		return 'Unsupported type.'

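# Collapse an ISO 8601 duration such as 'PT4M13S' into a compact '4m13s'.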
def _yt_duration(s):
	mo = re.match(r'P(\d+D)?T(\d+H)?(\d+M)?(\d+S)?', s)
	pcs = [x for x in mo.groups() if x]
	return ''.join(pcs).lower()
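
# Parse the API's ISO 8601 timestamp (e.g. '2013-01-01T12:00:00.000Z') and
# render it with the given strftime format.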
def _yt_date(s, f):
	mo = re.match(r'(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2})\.(\d+)Z', s)
	return datetime.datetime(*(int(x) for x in mo.groups())).strftime(f)
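
# Abbreviate large counts for display, e.g. _yt_round(1234567) -> '1.2m'.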
def _yt_round(n):
	n = float(n)
	if n >= 10**12:
		return '%.1ft' % (n/10**12)
	elif n >= 10**9:
		return '%.1fb' % (n/10**9)
	elif n >= 10**6:
		return '%.1fm' % (n/10**6)
	elif n >= 10**3:
		return '%.1fk' % (n/10**3)
	else:
		return int(n)

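# Look up video metadata via the YouTube Data API v3; needs an API key in the
# [urls] api_key config option. Note: the API no longer returns dislikeCount
# for most videos, so the 'dislikes' field may raise KeyError and fall
# through to the generic exception handler below.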
def gotyoutube(url):
	url_data = urlparse.urlparse(url)
	query = urlparse.parse_qs(url_data.query)
	video = query["v"][0]
	api_url = 'https://www.googleapis.com/youtube/v3/videos?part=snippet,contentDetails,statistics&id=%s&key=%s' % (video, lib.parent.cfg.get('urls', 'api_key'))
	try:
		respdata = urllib2.urlopen(api_url).read()
		v = json.loads(respdata)
		v = v['items'][0]

		return unescape(lib.parent.cfg.get('urls', 'yt_format', "\002%(author)s\002: \037%(title)s\037 [%(duration)s, uploaded %(uploaded)s, %(views)s v/%(likes)s l/%(dislikes)s d]") % {
			'title': v['snippet']['title'],
			'author': v['snippet']['channelTitle'],
			'duration': _yt_duration(v['contentDetails']['duration']),
			'uploaded': _yt_date(v['snippet']['publishedAt'], lib.parent.cfg.get('urls', 'yt_date_format', '%b %d %Y')),
			'views': _yt_round(v['statistics']['viewCount']),
			'likes': _yt_round(v['statistics']['likeCount']),
			'dislikes': _yt_round(v['statistics']['dislikeCount']),
		})
	except urllib2.HTTPError as e:
		if e.getcode() == 403:
			return 'API limit exceeded'
		else:
			return str(e)
	except IndexError:
		return 'no results'
	except Exception as e:
		return str(e)

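# Query the Twitch Helix API for a channel's stream status. Note: Helix now
# also requires an OAuth bearer token alongside Client-ID, so this request
# may be rejected until an Authorization header is added.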
def gottwitch(uri):
	url = 'https://api.twitch.tv/helix/streams?user_login=%s' % uri.split('/')[0]
	opener = urllib2.build_opener()
	opener.addheaders = [('Client-ID', lib.parent.cfg.get('urls', 'twitch_api_key'))]
	respdata = opener.open(url).read()
	twitch = json.loads(respdata)['data']
	try:
		# TODO: add current game.
		return unescape('\037%s\037 is %s (%s)' % (twitch[0]['user_name'], twitch[0]['type'], twitch[0]['title']))
	except IndexError: # 'data' is empty when the channel is not live
		return 'Channel offline.'

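# Render a byte count with binary (IEC) units, e.g.
# _humanize_bytes(2048) -> '2.00kiB'.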
def _humanize_bytes(b):
	b = int(b)
	i = 0
	table = " kMGTPEZYRQ"
	while b >= 1024: # '>=' so that exactly 1024 bytes reads as 1.00kiB
		i += 1
		b /= 1024.0
	if i == 0:
		return "%dB" % (b)
	else:
		return "%.2f%siB" % (b, table[i])

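# Fetch a URL and build a one-line summary: content type, size and, for
# text/html, the page title. URLs matched by other_regexes are skipped so
# their dedicated handlers can respond instead.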
def goturl(url):
	output = []
	for _, group in other_regexes:
		for regex in group:
			if regex.match(url):
				return None
	request = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'})
	opener = urllib2.build_opener(SmartRedirectHandler())

	# Send request and handle errors
	try:
		response = opener.open(request, timeout=2)
	except urllib2.HTTPError as e:
		return 'Request error: %s %s' % (e.code, e.reason)
	except urllib2.URLError as e:
		return 'Request error: %s' % (e.reason)
	except TimeoutError:
		return 'Request error: request timed out'
	except Exception as e:
		return 'Unknown error: %s %r' % (type(e).__name__, e.args)

	# Try to add type and length headers to reply
	c_type = response.getheader('Content-Type', '').split(';', 1)[0]
	c_len = response.getheader('Content-Length')
	if c_type != '':
		output.append("[%s] " % (c_type))
	else:
		output.append("[no type] ")
	if c_type != "text/html": # else length will be provided by HTML code below
		if c_len is not None:
			output.append("[%s] " % (_humanize_bytes(c_len)))
		else:
			output.append("[no length] ")

	# Try to add title if HTML
	if c_type == 'text/html':
		try:
			responsebody = response.read(1024*1024) # cap the read at 1MiB
		except Exception as e:
			output.append('Error reading response body: %s %r' % (type(e).__name__, e.args))
		else:
			if c_len is not None and len(responsebody) != int(c_len):
				output.append("[actual %s; Content-Length %s] " % (_humanize_bytes(len(responsebody)), _humanize_bytes(c_len)))
			else:
				output.append("[%s] " % (_humanize_bytes(len(responsebody))))
			try:
				soup = BeautifulSoup(responsebody)
				if soup.title:
					output.append('Title: ' + unescape('%s' % (soup.find('title').string.strip())))
				else:
					output.append('No title')
			except Exception as e:
				output.append('Title error: %s %r ' % (type(e).__name__, e.args))

	return ''.join(output)

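# Handler tables: each entry pairs a handler with a tuple of regexes whose
# matches on a message are passed to that handler. other_regexes doubles as
# goturl()'s exclusion list.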
url_regex = (
	re.compile(r'https?://(?:[^/\s.]+\.)+[^/\s.]+(?:/\S+)?'),
)
other_regexes = (
)
regexes = other_regexes + (
	(goturl, url_regex),
)