[erebus.git] / modules / urls.py

# Erebus IRC bot - Author: Erebus Team
# vim: fileencoding=utf-8
# URL Checker
# This file is released into the public domain; see http://unlicense.org/

# module info
# Static metadata describing this module.
# NOTE(review): presumably consumed by the bot's module loader; the meaning of
# 'compatible', 'depends' and 'softdeps' is defined outside this file - confirm
# against the loader before changing them.
modinfo = {
	'author': 'Erebus Team',
	'license': 'public domain',
	'compatible': [0],
	'depends': [],
	'softdeps': [],
}

# http://embed.ly/tools/generator

# preamble
import modlib
# Helper object for this module, constructed with the module's own name.
# The code below uses it for hook registration (lib.hooknum) and for
# configuration access (lib.parent.cfg).
lib = modlib.modlib(__name__)
# Expose the helper's start/stop callables under module-level names.
# NOTE(review): presumably invoked by the loader on load/unload - confirm.
modstart = lib.modstart
modstop = lib.modstop

# module code
import sys
if sys.version_info.major < 3:
	import urllib2
	import urlparse
	import HTMLParser
	from BeautifulSoup import BeautifulSoup
else:
	import urllib.request as urllib2
	import urllib.parse as urlparse
	import html.parser as HTMLParser
	from bs4 import BeautifulSoup

import re, json, datetime

html_parser = HTMLParser.HTMLParser()

# Splits "nick!user@host" into its three components.
hostmask_regex = re.compile(r'^(.*)!(.*)@(.*)$')

def parser_hostmask(hostmask):
	"""Break a hostmask string into a dict with 'nick', 'user' and 'host'.

	A dict argument is handed back untouched.  None yields a dict whose
	three values are all None.  A string that is not of the form
	nick!user@host is treated as a bare nick, leaving user and host None.
	"""
	if isinstance(hostmask, dict):
		return hostmask

	parts = {'nick': None, 'user': None, 'host': None}
	if hostmask is None:
		return parts

	found = hostmask_regex.match(hostmask)
	if found:
		parts['nick'], parts['user'], parts['host'] = found.groups()
	else:
		# No '!' / '@' separators: the whole string is the nick.
		parts['nick'] = hostmask
	return parts


class SmartRedirectHandler(urllib2.HTTPRedirectHandler):
	"""Redirect handler that records the redirect status code on the result.

	After the base class has followed a 301/302 redirect, the object it
	returns gets a ``status`` attribute set to the redirect code.
	"""

	def http_error_301(self, req, fp, code, msg, headers):
		"""Follow a 301 redirect and tag the result with ``code``."""
		result = urllib2.HTTPRedirectHandler.http_error_301(
				self, req, fp, code, msg, headers)
		# The base handler returns None when the response carries no
		# Location/URI header; there is nothing to tag in that case.
		if result is not None:
			result.status = code
		return result

	def http_error_302(self, req, fp, code, msg, headers):
		"""Follow a 302 redirect and tag the result with ``code``."""
		result = urllib2.HTTPRedirectHandler.http_error_302(
				self, req, fp, code, msg, headers)
		# See http_error_301: None means "no redirect target was given".
		if result is not None:
			result.status = code
		return result


def process_line(line):
	"""Run every URL handler over a chat line and collect their replies.

	Each entry of the module-level ``regexes`` table is a
	(handler, patterns, prefix) triple.  Every non-empty match of every
	pattern is passed to the handler; a non-None result is recorded as
	"<prefix>: <result>".

	At most ``limit`` matches (config option urls/limit, default 2) are
	processed; once that many have been seen the replies gathered so far
	are returned.  Matches whose handler returns None still count.

	Returns a list of reply strings, possibly empty.
	"""
	responses = []
	num_found = 0
	limit = lib.parent.cfg.getint('urls', 'limit', 2)
	for action, group, prefix in regexes:
		for regex in group:
			for match in regex.findall(line):
				if not match:
					continue
				num_found += 1
				if num_found > limit:
					return responses
				# findall() yields a tuple when the pattern has more than one
				# group; spread it so multi-argument handlers receive one
				# argument per group instead of a single tuple.
				if isinstance(match, tuple):
					resp = action(*match)
				else:
					resp = action(match)
				# Reuse the result rather than calling the handler a second
				# time, which would repeat its network request.
				if resp is not None:
					responses.append("%s: %s" % (prefix, resp))
	return responses


@lib.hooknum("PRIVMSG")
def privmsg_hook(bot, textline):
	"""Handle a PRIVMSG line: look for URLs and send back their descriptions.

	The third whitespace-separated token of ``textline`` is used as the reply
	target; everything after it, minus one leading character, is the text
	that gets scanned.  Replies go out one per message when the config option
	urls/multiline is true, otherwise joined with ' | ' into one message.
	"""
	# The sender prefix sits between the first character and the first space.
	# It is parsed here but not used further down.
	sender = parser_hostmask(textline[1:textline.find(' ')])
	chan = textline.split()[2]

	pieces = textline.split(None, 3)
	# A line with no trailing text has only three pieces.
	line = pieces[3][1:] if len(pieces) > 3 else ''

	responses = process_line(line)
	if not responses:
		return

	if lib.parent.cfg.getboolean('urls', 'multiline'):
		for reply in responses:
			bot.msg(chan, reply, True)
	else:
		bot.msg(chan, ' | '.join(responses), True)


def unescape(line):
	"""Decode HTML entities in ``line`` and collapse whitespace runs.

	Every run of whitespace (including newlines) becomes a single space so
	the result fits on one IRC line.
	"""
	try:
		# Python 3: HTMLParser.unescape() was removed in 3.9; html.unescape
		# is the supported replacement.
		from html import unescape as html_unescape
	except ImportError:
		# Python 2 has no html module; fall back to the parser instance.
		html_unescape = html_parser.unescape
	return re.sub(r'\s+', ' ', html_unescape(line))

def gotspotify(type, track):
	"""Look up a Spotify URI and return a one-line description.

	type  -- the kind of Spotify object (placed in the URI as spotify:<type>:<id>)
	track -- the object's id

	Returns 'Track: artist - name / album m:ss [pp%]' for tracks,
	'Album: artist - album - released' for albums and 'Unsupported type.'
	for anything else.  Network and parse errors are not caught here.
	"""
	url = 'http://ws.spotify.com/lookup/1/?uri=spotify:%s:%s' % (type, track)
	xml = urllib2.urlopen(url).read()
	# NOTE(review): convertEntities / HTML_ENTITIES is the BeautifulSoup 3
	# calling convention - confirm this line works under the bs4 import.
	soup = BeautifulSoup(xml, convertEntities=BeautifulSoup.HTML_ENTITIES)
	lookup_type = soup.contents[2].name

	if lookup_type == 'track':
		name = soup.find('name').string
		album_name = soup.find('album').find('name').string
		artist_name = soup.find('artist').find('name').string
		length = float(soup.find('length').string)
		# divmod keeps minutes an integer on Python 3 as well ('/' would
		# produce e.g. "3.5:30").
		minutes, seconds = divmod(int(length), 60)

		summary = 'Track: %s - %s / %s %s:%.2d' % (artist_name, name, album_name, minutes, seconds)
		popularity = soup.find('popularity')
		# Only append the popularity when the element exists; formatting
		# None with %d would raise TypeError.
		if popularity is not None:
			summary += ' %2d%%' % (float(popularity.string)*100)
		return unescape(summary)

	elif lookup_type == 'album':
		album_name = soup.find('album').find('name').string
		artist_name = soup.find('artist').find('name').string
		released = soup.find('released').string
		return unescape('Album: %s - %s - %s' % (artist_name, album_name, released))

	else:
		return 'Unsupported type.'


def _yt_duration(s):
	mo = re.match(r'P(\d+D)?T(\d+H)?(\d+M)?(\d+S)?', s)
	pcs = [x for x in mo.groups() if x]
	return ''.join(pcs).lower()
def _yt_date(s, f):
	mo = re.match(r'(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2})\.(\d+)Z', s)
	return datetime.datetime(*(int(x) for x in mo.groups())).strftime(f)
def _yt_round(n):
	n = float(n)
	if n >= 10**12:
		return '%.1ft' % (n/10**12)
	elif n >= 10**9:
		return '%.1fb' % (n/10**9)
	elif n >= 10**6:
		return '%.1fm' % (n/10**6)
	elif n >= 10**3:
		return '%.1fk' % (n/10**3)
	else:
		return int(n)

def gotyoutube(url):
	"""Describe the YouTube video behind a watch URL.

	The video id is taken from the ``v`` query parameter and looked up via
	the YouTube Data API using the key from config option urls/api_key.
	The reply is built from config option urls/yt_format.

	Returns the formatted description, None when the URL has no ``v``
	parameter, or a short error string when the lookup fails.
	"""
	url_data = urlparse.urlparse(url)
	query = urlparse.parse_qs(url_data.query)
	try:
		video = query["v"][0]
	except (KeyError, IndexError):
		# A watch URL without a video id: nothing to look up.
		return None
	api_url = 'https://www.googleapis.com/youtube/v3/videos?part=snippet,contentDetails,statistics&id=%s&key=%s' % (video, lib.parent.cfg.get('urls', 'api_key'))
	try:
		respdata = urllib2.urlopen(api_url).read()
		v = json.loads(respdata)
		v = v['items'][0]
		stats = v.get('statistics', {})

		def stat(key):
			# Individual counters may be absent from the response; show a
			# placeholder rather than failing the whole reply.
			if key in stats:
				return _yt_round(stats[key])
			return '?'

		return unescape(lib.parent.cfg.get('urls', 'yt_format', "\002%(author)s\002: \037%(title)s\037 [%(duration)s, uploaded %(uploaded)s, %(views)s v/%(likes)s l/%(dislikes)s d]") % {
			'title': v['snippet']['title'],
			'author': v['snippet']['channelTitle'],
			'duration': _yt_duration(v['contentDetails']['duration']),
			'uploaded': _yt_date(v['snippet']['publishedAt'], lib.parent.cfg.get('urls', 'yt_date_format', '%b %d %Y')),
			'views': stat('viewCount'),
			'likes': stat('likeCount'),
			'dislikes': stat('dislikeCount'),
		})
	except urllib2.HTTPError as e:
		if e.getcode() == 403:
			return 'API limit exceeded'
		else:
			return str(e)
	except IndexError:
		# Empty 'items' list: the API knows no such video.
		return 'no results'
	except Exception as e:
		return str(e)

def gottwitch(uri):
	"""Report whether a Twitch channel is currently streaming.

	uri -- channel login, optionally followed by '/...' which is ignored.

	Queries the Helix streams endpoint with the Client-ID from config option
	urls/twitch_api_key.  Returns '<user> is <type> (<title>)' for a live
	channel, 'Channel offline.' when no stream entry is returned, or an
	'Error: ...' string when the HTTP request is rejected.
	"""
	url = 'https://api.twitch.tv/helix/streams?user_login=%s' % uri.split('/')[0]
	opener = urllib2.build_opener()
	opener.addheaders = [('Client-ID', lib.parent.cfg.get('urls', 'twitch_api_key'))]
	try:
		respdata = opener.open(url).read()
	except urllib2.HTTPError as e:
		# Same reporting style as goturl, instead of an uncaught exception.
		return 'Error: %s %s' % (e.code, e.reason)
	try:
		# TODO: add current game.
		stream = json.loads(respdata)['data'][0]
		return unescape('\037%s\037 is %s (%s)' % (stream['user_name'], stream['type'], stream['title']))
	except (IndexError, KeyError, TypeError):
		# An empty or missing 'data' list means there is no live stream.
		return 'Channel offline.'


def goturl(url):
	"""Fetch ``url`` and return the text of its HTML <title>.

	URLs that one of the specialised handlers in ``other_regexes`` already
	covers are skipped (None) so they are not answered twice.

	Returns the title with entities decoded, None when the page has no
	usable title, or an 'Error: ...' string when the fetch fails.
	"""
	for _, group, _ in other_regexes:
		for regex in group:
			if regex.match(url):
				return None
	request = urllib2.Request(url)
	opener = urllib2.build_opener(SmartRedirectHandler())
	try:
		response = opener.open(request, timeout=0.5)
		try:
			soup = BeautifulSoup(response)
		finally:
			# Release the connection once the page has been parsed.
			response.close()
		# .string is None for an empty title or one with nested markup;
		# treat that like a missing title rather than printing "None".
		if soup.title and soup.title.string:
			return unescape('%s' % (soup.title.string))
		else:
			return None
	except urllib2.HTTPError as e:
		return 'Error: %s %s' % (e.code, e.reason)
	except Exception as e:
		# Exceptions have no .message attribute on Python 3; fall back to
		# str(e) so the handler itself cannot raise.
		return 'Error: %r' % (getattr(e, 'message', None) or str(e))


# Any http(s) URL whose host part contains a dot, with an optional path.
url_regex = (
	re.compile(r'https?://[^/\s]+\.[^/\s]+(?:/\S+)?'),
)
# Spotify URIs (spotify:<type>:<id>) and open.spotify.com links.
# Both patterns capture two groups: the object type and its id.
spotify_regex = (
	re.compile(r'spotify:(?P<type>\w+):(?P<track_id>\w{22})'),
	re.compile(r'https?://open\.spotify\.com/(?P<type>\w+)/(?P<track_id>\w+)')
)
# YouTube watch URLs including their query string.
youtube_regex = (
	re.compile(r'https?://(?:www\.)?youtube\.com/watch\?[a-zA-Z0-9=&_\-]+'),
)
# Twitch channel URLs; the single group captures the channel login
# (letters, digits and underscores).
twitch_regex = (
	re.compile(r'https?://(?:www\.)?twitch\.tv/([A-Za-z0-9_]*)'),
)
# Site-specific handlers as (handler, patterns, reply prefix) triples.
# goturl() skips any URL matched by a pattern listed here.
other_regexes = (
	(gotspotify, spotify_regex, 'Spotify'),
	(gotyoutube, youtube_regex, 'YouTube'),
	(gottwitch, twitch_regex, 'Twitch'),
)
# Full table walked by process_line(): the specific handlers first, then the
# generic page-title fallback.
regexes = other_regexes + (
	(goturl, url_regex, 'Title'),
)
Commit	Line	Data
a83e1f9c	1	# Erebus IRC bot - Author: Erebus Team
4477123d	2	# vim: fileencoding=utf-8
a83e1f9c	3	# URL Checker
	4	# This file is released into the public domain; see http://unlicense.org/
	5
	6	# module info
	7	modinfo = {
	8	'author': 'Erebus Team',
	9	'license': 'public domain',
fa93b933	10	'compatible': [0],
a62d0d18	11	'depends': [],
a62d0d18	12	'softdeps': [],
a83e1f9c	13	}
a83e1f9c	14
99366200 CS	15	# http://embed.ly/tools/generator
99366200 CS	16
a83e1f9c	17	# preamble
	18	import modlib
	19	lib = modlib.modlib(__name__)
	20	modstart = lib.modstart
	21	modstop = lib.modstop
	22
	23	# module code
a28e2ae9	24	import sys
	25	if sys.version_info.major < 3:
	26	import urllib2
	27	import urlparse
	28	import HTMLParser
	29	from BeautifulSoup import BeautifulSoup
	30	else:
	31	import urllib.request as urllib2
	32	import urllib.parse as urlparse
	33	import html.parser as HTMLParser
	34	from bs4 import BeautifulSoup
	35
467acacf	36	import re, json, datetime
a83e1f9c	37
390fbad4	38	html_parser = HTMLParser.HTMLParser()
a83e1f9c	39
390fbad4	40	hostmask_regex = re.compile(r'^(.)!(.)@(.*)$')
a83e1f9c	41
	42	def parser_hostmask(hostmask):
	43	if isinstance(hostmask, dict):
	44	return hostmask
	45
	46	nick = None
	47	user = None
	48	host = None
	49
	50	if hostmask is not None:
	51	match = hostmask_regex.match(hostmask)
	52
	53	if not match:
	54	nick = hostmask
	55	else:
	56	nick = match.group(1)
	57	user = match.group(2)
	58	host = match.group(3)
	59
	60	return {
	61	'nick': nick,
	62	'user': user,
	63	'host': host
	64	}
	65
394a7b69 CS	66	class SmartRedirectHandler(urllib2.HTTPRedirectHandler):
	67	def http_error_301(self, req, fp, code, msg, headers):
	68	result = urllib2.HTTPRedirectHandler.http_error_301(
	69	self, req, fp, code, msg, headers)
	70	result.status = code
	71	return result
	72
	73	def http_error_302(self, req, fp, code, msg, headers):
	74	result = urllib2.HTTPRedirectHandler.http_error_302(
	75	self, req, fp, code, msg, headers)
	76	result.status = code
	77	return result
	78
467acacf	79	def process_line(line):
	80	responses = []
	81	num_found = 0
	82	limit = lib.parent.cfg.getint('urls', 'limit', 2)
	83	for action, group, prefix in regexes:
	84	for regex in group:
	85	for match in regex.findall(line):
	86	if match:
	87	num_found += 1
	88	if num_found > limit:
	89	return responses
	90	resp = action(match)
	91	if resp is not None:
	92	responses.append("%s: %s" % (prefix, action(match)))
	93	return responses
	94
a83e1f9c	95	@lib.hooknum("PRIVMSG")
390fbad4 CS	96	def privmsg_hook(bot, textline):
	97	user = parser_hostmask(textline[1:textline.find(' ')])
	98	chan = textline.split()[2]
a83e1f9c	99
a83e1f9c	100	try:
390fbad4	101	line = textline.split(None, 3)[3][1:]
a83e1f9c	102	except IndexError:
390fbad4	103	line = ''
a83e1f9c	104
467acacf	105	responses = process_line(line)
04d48353	106	if len(responses) > 0:
467acacf	107	if lib.parent.cfg.getboolean('urls', 'multiline'):
	108	for r in responses:
	109	bot.msg(chan, r, True)
	110	else:
	111	bot.msg(chan, ' \| '.join(responses), True)
a83e1f9c	112
390fbad4	113	def unescape(line):
d140b1af	114	return re.sub('\s+', ' ', html_parser.unescape(line))
a83e1f9c	115
	116	def gotspotify(type, track):
	117	url = 'http://ws.spotify.com/lookup/1/?uri=spotify:%s:%s' % (type, track)
	118	xml = urllib2.urlopen(url).read()
390fbad4	119	soup = BeautifulSoup(xml, convertEntities=BeautifulSoup.HTML_ENTITIES)
a83e1f9c	120	lookup_type = soup.contents[2].name
390fbad4	121
a83e1f9c	122	if lookup_type == 'track':
	123	name = soup.find('name').string
	124	album_name = soup.find('album').find('name').string
	125	artist_name = soup.find('artist').find('name').string
	126	popularity = soup.find('popularity')
	127	if popularity:
	128	popularity = float(popularity.string)*100
	129	length = float(soup.find('length').string)
	130	minutes = int(length)/60
467acacf	131	seconds = int(length)%60
390fbad4	132
dafa38fc	133	return unescape('Track: %s - %s / %s %s:%.2d %2d%%' % (artist_name, name, album_name, minutes, seconds, popularity))
390fbad4	134
a83e1f9c	135	elif lookup_type == 'album':
	136	album_name = soup.find('album').find('name').string
	137	artist_name = soup.find('artist').find('name').string
	138	released = soup.find('released').string
dafa38fc	139	return unescape('Album: %s - %s - %s' % (artist_name, album_name, released))
390fbad4	140
a83e1f9c	141	else:
	142	return 'Unsupported type.'
	143
467acacf	144	def _yt_duration(s):
	145	mo = re.match(r'P(\d+D)?T(\d+H)?(\d+M)?(\d+S)?', s)
	146	pcs = [x for x in mo.groups() if x]
	147	return ''.join(pcs).lower()
	148	def _yt_date(s, f):
	149	mo = re.match(r'(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2})\.(\d+)Z', s)
	150	return datetime.datetime(*(int(x) for x in mo.groups())).strftime(f)
	151	def _yt_round(n):
	152	n = float(n)
	153	if n >= 10**12:
	154	return '%.1ft' % (n/10**12)
	155	elif n >= 10**9:
	156	return '%.1fb' % (n/10**9)
	157	elif n >= 10**6:
	158	return '%.1fm' % (n/10**6)
	159	elif n >= 10**3:
	160	return '%.1fk' % (n/10**3)
	161	else:
	162	return int(n)
	163
a83e1f9c	164	def gotyoutube(url):
	165	url_data = urlparse.urlparse(url)
	166	query = urlparse.parse_qs(url_data.query)
	167	video = query["v"][0]
467acacf	168	api_url = 'https://www.googleapis.com/youtube/v3/videos?part=snippet,contentDetails,statistics&id=%s&key=%s' % (video, lib.parent.cfg.get('urls', 'api_key'))
a83e1f9c	169	try:
a83e1f9c	170	respdata = urllib2.urlopen(api_url).read()
467acacf	171	v = json.loads(respdata)
	172	v = v['items'][0]
	173
	174	return unescape(lib.parent.cfg.get('urls', 'yt_format', "\002%(author)s\002: \037%(title)s\037 [%(duration)s, uploaded %(uploaded)s, %(views)s v/%(likes)s l/%(dislikes)s d]") % {
	175	'title': v['snippet']['title'],
	176	'author': v['snippet']['channelTitle'],
	177	'duration': _yt_duration(v['contentDetails']['duration']),
	178	'uploaded': _yt_date(v['snippet']['publishedAt'], lib.parent.cfg.get('urls', 'yt_date_format', '%b %d %Y')),
	179	'views': _yt_round(v['statistics']['viewCount']),
	180	'likes': _yt_round(v['statistics']['likeCount']),
	181	'dislikes': _yt_round(v['statistics']['dislikeCount']),
	182	})
	183	except urllib2.HTTPError as e:
	184	if e.getcode() == 403:
	185	return 'API limit exceeded'
	186	else:
	187	return str(e)
	188	except IndexError:
	189	return 'no results'
	190	except Exception as e:
	191	return str(e)
a83e1f9c	192
390fbad4	193	def gottwitch(uri):
467acacf	194	url = 'https://api.twitch.tv/helix/streams?user_login=%s' % uri.split('/')[0]
	195	opener = urllib2.build_opener()
	196	opener.addheaders = [('Client-ID', lib.parent.cfg.get('urls', 'twitch_api_key'))]
	197	respdata = opener.open(url).read()
	198	twitch = json.loads(respdata)['data']
	199	try:
	200	# TODO: add current game.
	201	return unescape('\037%s\037 is %s (%s)' % (twitch[0]['user_name'], twitch[0]['type'], twitch[0]['title']))
	202	except:
	203	return 'Channel offline.'
390fbad4 CS	204
390fbad4 CS	205	def goturl(url):
467acacf	206	for _, group, _ in other_regexes:
	207	for regex in group:
	208	if regex.match(url):
	209	return None
394a7b69 CS	210	request = urllib2.Request(url)
394a7b69 CS	211	opener = urllib2.build_opener(SmartRedirectHandler())
993046cc	212	try:
04d48353	213	soup = BeautifulSoup(opener.open(request, timeout=0.5))
18193997	214	if soup.title:
	215	return unescape('%s' % (soup.title.string))
	216	else:
	217	return None
de8ab9cb	218	except urllib2.HTTPError as e:
acfe3d05	219	return 'Error: %s %s' % (e.code, e.reason)
04d48353	220	except Exception as e:
04d48353	221	return 'Error: %r' % (e.message)
467acacf	222
	223	url_regex = (
	224	re.compile(r'https?://[^/\s]+\.[^/\s]+(?:/\S+)?'),
	225	)
	226	spotify_regex = (
	227	re.compile(r'spotify:(?P<type>\w+):(?P<track_id>\w{22})'),
	228	re.compile(r'https?://open.spotify.com/(?P<type>\w+)/(?P<track_id>\w+)')
	229	)
	230	youtube_regex = (
	231	re.compile(r'https?://(?:www\.)?youtube\.com/watch\?[a-zA-Z0-9=&_\-]+'),
	232	)
	233	twitch_regex = (
	234	re.compile(r'https?:\/\/(?:www\.)?twitch.tv\/([A-Za-z0-9]*)'),
	235	)
	236	other_regexes = (
	237	(gotspotify, spotify_regex, 'Spotify'),
	238	(gotyoutube, youtube_regex, 'YouTube'),
	239	(gottwitch, twitch_regex, 'Twitch'),
	240	)
	241	regexes = other_regexes + (
	242	(goturl, url_regex, 'Title'),
	243	)