[erebus.git] / modules / urls.py

# Erebus IRC bot - Author: Conny Sjoblom
# vim: fileencoding=utf-8
# URL Checker
# This file is released into the public domain; see http://unlicense.org/

# module info
# Static metadata describing this module.
# NOTE(review): presumably read by the bot's module loader -- confirm against modlib.
modinfo = {
	'author': 'Erebus Team',
	'license': 'public domain',
	'compatible': [0],  # NOTE(review): meaning of the 0 entry is not visible in this file -- confirm
	'depends': [],  # no entries: nothing is declared as required
	'softdeps': [],  # no entries: nothing is declared as optional
}

# http://embed.ly/tools/generator

# preamble
import modlib
# Per-module helper object: used below to register the PRIVMSG hook
# (lib.hooknum) and to reach configuration and logging through lib.parent.
lib = modlib.modlib(__name__)
# Expose the helper's start/stop callables under module-level names.
modstart = lib.modstart
modstop = lib.modstop

# module code
import sys
if sys.version_info.major < 3:
	stringbase = basestring
	import urllib2
	import urlparse
	import HTMLParser
	html = HTMLParser.HTMLParser()
	from BeautifulSoup import BeautifulSoup
else:
	stringbase = str
	import urllib.request as urllib2
	import urllib.parse as urlparse
	import html
	from bs4 import BeautifulSoup
import http.client

import re, json, datetime

# Optional AIA support: lets _do_request() retry sites that do not serve
# their intermediate certificates. When the library is missing, or its
# session cannot be set up, `aia` is set to None and the retry is skipped.
try:
	import aia
except ImportError as e:
	print(repr(e))
	aia = None
else:
	try:
		aia_session = aia.AIASession()
		# aia is broken on capath systems, needs cafile to work
		aia_session._context.load_verify_locations(cafile='/etc/ssl/certs/ca-certificates.crt')
		aia_session._trusted = {
			aia.openssl_get_cert_info(ca_der)["subject"]: ca_der
			for ca_der in aia_session._context.get_ca_certs(True)
		}
		print("aia loaded")
	except Exception as e:
		# The setup above relies on a hard-coded CA bundle path and on private
		# attributes of AIASession. Any failure here must only disable the
		# optional AIA retry, not prevent the whole module from loading.
		print(repr(e))
		aia = None

hostmask_regex = re.compile(r'^(.*)!(.*)@(.*)$')

def parser_hostmask(hostmask):
	"""
		Split a "nick!user@host" string into its three components.

		- A dict argument is returned as-is.
		- None yields a dict whose three values are all None.
		- A string that does not have the nick!user@host shape is returned
		  whole as 'nick', with 'user' and 'host' left as None.

		Returns a dict with the keys 'nick', 'user' and 'host'.
	"""
	if isinstance(hostmask, dict):
		return hostmask

	parts = {'nick': None, 'user': None, 'host': None}
	if hostmask is None:
		return parts

	found = hostmask_regex.match(hostmask)
	if found:
		parts['nick'], parts['user'], parts['host'] = found.groups()
	else:
		parts['nick'] = hostmask
	return parts

class SmartRedirectHandler(urllib2.HTTPRedirectHandler):
	"""
		Redirect handler that records the redirect's status code (301/302)
		in the `status` attribute of the response it returns.
	"""

	def http_error_301(self, req, fp, code, msg, headers):
		"""Follow a 301 redirect and stamp the resulting response with `code`."""
		result = urllib2.HTTPRedirectHandler.http_error_301(
				self, req, fp, code, msg, headers)
		# The base handler returns None when the redirect carries no
		# Location/URI header; there is nothing to stamp in that case.
		if result is not None:
			result.status = code
		return result

	def http_error_302(self, req, fp, code, msg, headers):
		"""Follow a 302 redirect and stamp the resulting response with `code`."""
		result = urllib2.HTTPRedirectHandler.http_error_302(
				self, req, fp, code, msg, headers)
		# See http_error_301: the base handler may return None.
		if result is not None:
			result.status = code
		return result

def _get_blocked_chans():
	"""
		Return the list of channel names for which URL handling is disabled.

		Reads the comma-separated 'blocked' option of the 'urls' config
		section (default: empty). Whitespace around each entry is stripped
		and empty entries are dropped, so "#a, #b" blocks both channels and
		an unset option yields an empty list.
	"""
	raw = lib.parent.cfg.get('urls', 'blocked', '')
	return [chan.strip() for chan in raw.split(',') if chan.strip()]

def process_line(line):
	"""
		Run every (action, patterns) entry of `regexes` against line and
		collect the replies.

		Each non-empty regex match counts towards the 'limit' option of the
		'urls' config section (default 2), including matches whose action
		produces no reply; once the count exceeds the limit the replies
		gathered so far are returned. A match that is a single string is
		passed to the action as one argument, anything else is unpacked into
		positional arguments. Replies that are None or "" are dropped.

		Returns the list of replies.
	"""
	max_hits = lib.parent.cfg.getint('urls', 'limit', 2)
	replies = []
	hits = 0
	for handler, patterns in regexes:
		for pattern in patterns:
			for found in pattern.findall(line):
				if not found:
					continue
				hits += 1
				if hits > max_hits:
					return replies
				reply = handler(found) if isinstance(found, stringbase) else handler(*found)
				if reply is None or reply == "":
					continue
				replies.append(reply)
	return replies

@lib.hooknum("PRIVMSG")
def privmsg_hook(bot, textline):
	"""
		Hook for PRIVMSG lines: look for URLs in the message text and reply
		to the target with whatever process_line() produces.

		textline is split on whitespace; the third token is taken as the
		target (channel) and everything after it, minus its first character,
		as the message text. Nothing is done for targets listed by
		_get_blocked_chans().
	"""
	# One split serves both the target and the message text.
	parts = textline.split(None, 3)
	chan = parts[2]

	if chan in _get_blocked_chans(): return

	# The first character of the trailing part is dropped
	# (presumably the ':' that introduces the message text).
	line = parts[3][1:] if len(parts) > 3 else ''

	send_response(bot, chan, process_line(line))

def send_response(bot, chan, responses):
	"""
		Send the collected replies to chan through bot.msg.

		Does nothing when responses is empty. When the 'multiline' option of
		the 'urls' config section is true each reply is sent as its own
		message; otherwise all replies are joined with ' | ' into one message.
	"""
	if len(responses) == 0:
		return
	if not lib.parent.cfg.getboolean('urls', 'multiline'):
		bot.msg(chan, ' | '.join(responses), True)
		return
	for reply in responses:
		bot.msg(chan, reply, True)

def unescape(line):
	"""
		Decode HTML entities in line and collapse every run of whitespace
		(including newlines and tabs) into a single space.
	"""
	# Raw string: a plain-string backslash-s is an invalid escape sequence
	# that newer Python versions warn about.
	return re.sub(r'\s+', ' ', html.unescape(line))

def gotspotify(type, track):
	"""
		Look up a Spotify URI (spotify:<type>:<track>) and describe it.

		Returns a one-line description for tracks and albums, or
		'Unsupported type.' for any other lookup result.
	"""
	url = 'http://ws.spotify.com/lookup/1/?uri=spotify:%s:%s' % (type, track)
	xml = urllib2.urlopen(url).read()
	# NOTE(review): convertEntities/HTML_ENTITIES is the BeautifulSoup 3 API;
	# confirm it is still accepted by the bs4 import used on Python 3.
	soup = BeautifulSoup(xml, convertEntities=BeautifulSoup.HTML_ENTITIES)
	lookup_type = soup.contents[2].name

	if lookup_type == 'track':
		name = soup.find('name').string
		album_name = soup.find('album').find('name').string
		artist_name = soup.find('artist').find('name').string
		length = float(soup.find('length').string)
		# divmod keeps the minutes an integer on Python 3, where "/" is true division.
		minutes, seconds = divmod(int(length), 60)

		summary = 'Track: %s - %s / %s %s:%.2d' % (artist_name, name, album_name, minutes, seconds)
		# The popularity element is optional; only print a percentage when it exists.
		popularity = soup.find('popularity')
		if popularity is not None:
			summary += ' %2d%%' % (float(popularity.string)*100)
		return unescape(summary)

	elif lookup_type == 'album':
		album_name = soup.find('album').find('name').string
		artist_name = soup.find('artist').find('name').string
		released = soup.find('released').string
		return unescape('Album: %s - %s - %s' % (artist_name, album_name, released))

	else:
		return 'Unsupported type.'

def _yt_duration(s):
	mo = re.match(r'P(\d+D)?T(\d+H)?(\d+M)?(\d+S)?', s)
	pcs = [x for x in mo.groups() if x]
	return ''.join(pcs).lower()
def _yt_date(s, f):
	mo = re.match(r'(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2})\.(\d+)Z', s)
	return datetime.datetime(*(int(x) for x in mo.groups())).strftime(f)
def _yt_round(n):
	n = float(n)
	if n >= 10**12:
		return '%.1ft' % (n/10**12)
	elif n >= 10**9:
		return '%.1fb' % (n/10**9)
	elif n >= 10**6:
		return '%.1fm' % (n/10**6)
	elif n >= 10**3:
		return '%.1fk' % (n/10**3)
	else:
		return int(n)

def gotyoutube(url):
	"""
		Describe the YouTube video referenced by url (its "v" query
		parameter) using the YouTube Data API and the 'api_key' option of
		the 'urls' config section.

		The reply is built from the 'yt_format' option. Returns
		'no results' when the URL has no video id or the API returns no
		items, 'API limit exceeded' on HTTP 403, and the error text for any
		other failure. Counts missing from the API response are shown as '?'.
	"""
	url_data = urlparse.urlparse(url)
	query = urlparse.parse_qs(url_data.query)
	try:
		video = query["v"][0]
	except (KeyError, IndexError):
		# No video id in the URL: nothing to look up.
		return 'no results'
	api_url = 'https://www.googleapis.com/youtube/v3/videos?part=snippet,contentDetails,statistics&id=%s&key=%s' % (video, lib.parent.cfg.get('urls', 'api_key'))
	try:
		respdata = urllib2.urlopen(api_url).read()
		v = json.loads(respdata)
		v = v['items'][0]
		stats = v.get('statistics', {})

		def count(key):
			# Individual counters may be absent from the statistics; show a
			# placeholder instead of failing the whole reply.
			return _yt_round(stats[key]) if key in stats else '?'

		return unescape(lib.parent.cfg.get('urls', 'yt_format', "\002%(author)s\002: \037%(title)s\037 [%(duration)s, uploaded %(uploaded)s, %(views)s v/%(likes)s l/%(dislikes)s d]") % {
			'title': v['snippet']['title'],
			'author': v['snippet']['channelTitle'],
			'duration': _yt_duration(v['contentDetails']['duration']),
			'uploaded': _yt_date(v['snippet']['publishedAt'], lib.parent.cfg.get('urls', 'yt_date_format', '%b %d %Y')),
			'views': count('viewCount'),
			'likes': count('likeCount'),
			'dislikes': count('dislikeCount'),
		})
	except urllib2.HTTPError as e:
		if e.getcode() == 403:
			return 'API limit exceeded'
		else:
			return str(e)
	except IndexError:
		return 'no results'
	except Exception as e:
		return str(e)

def gottwitch(uri):
	"""
		Describe the Twitch stream of the channel named by the first path
		component of uri, using the 'twitch_api_key' option of the 'urls'
		config section as Client-ID.

		Returns 'Channel offline.' when the API response contains no usable
		stream entry. Network and JSON errors are not caught here.
	"""
	url = 'https://api.twitch.tv/helix/streams?user_login=%s' % uri.split('/')[0]
	opener = urllib2.build_opener()
	opener.addheaders = [('Client-ID', lib.parent.cfg.get('urls', 'twitch_api_key'))]
	respdata = opener.open(url).read()
	twitch = json.loads(respdata)['data']
	try:
		# TODO: add current game.
		stream = twitch[0]
		return unescape('\037%s\037 is %s (%s)' % (stream['user_name'], stream['type'], stream['title']))
	except (IndexError, KeyError, TypeError):
		# Empty/None data list or an entry without the expected fields.
		# Only lookup failures are handled, so KeyboardInterrupt and
		# SystemExit are no longer swallowed.
		return 'Channel offline.'

def _humanize_bytes(b):
	b = int(b)
	i = 0
	table = " kMGTPEZYRQ"
	while b > 1024:
		i += 1
		b /= 1024.0
	if i == 0:
		return "%dB" % (b)
	else:
		return "%.2f%siB" % (b, table[i])

def _do_request(url, try_aia=False):
	"""
		Return value is a tuple consisting of:
		- the HTTPResponse object, or a string on error. Empty string -> no response.
		- and a flag indicating whether AIA was used
	"""
	try:
		request = urllib2.Request(url, headers={
			'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
			'accept-language': 'en-US,en;q=0.9',
			'cache-control': 'max-age=0',
			'sec-ch-ua': '"Chromium";v="122", "Not(A:Brand";v="24", "Google Chrome";v="122"',
			'sec-ch-ua-mobile': '?0',
			'sec-ch-ua-platform': '"Linux"',
			'sec-fetch-dest': 'document',
			'sec-fetch-mode': 'navigate',
			'sec-fetch-site': 'none',
			'sec-fetch-user': '?1',
			'upgrade-insecure-requests': '1',
			'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
		})
	except ValueError:
		return '', False
	if try_aia:
		try:
			opener = urllib2.build_opener(urllib2.HTTPSHandler(context=aia_session.ssl_context_from_url(url)), SmartRedirectHandler())
		except aia.AIAError as e:
			return 'Request error: %s.%s: %s' % (e.__module__, e.__class__.__name__, e.args[0]), True
	else:
		opener = urllib2.build_opener(SmartRedirectHandler())

	# Send request and handle errors
	try:
		response = opener.open(request, timeout=2)
	except http.client.InvalidURL as e: # why does a method under urllib.request raise an exception under http.client???
		return '', False
	except urllib2.HTTPError as e:
		return 'Request error: %s %s' % (e.code, e.reason), False
	except urllib2.URLError as e:
		if "certificate verify failed: unable to get local issuer certificate" in str(e.reason):
			if aia: # Retry with AIA enabled, if module is present
				return _do_request(url, True)
			else:
				lib.parent.log('urls', '?', 'If the site is not serving the certificate chain, installing the aia library might make this request work: pip install aia')
				return 'Request error: site may have broken TLS configuration (%s)' % (e.reason), False
		else:
			return 'Request error: %s' % (e.reason), False
	except TimeoutError as e:
		return 'Request error: request timed out', False
	except Exception as e:
		return 'Unknown error: %s %r' % (type(e).__name__, e.args), False

	return response, try_aia


def goturl(url):
	"""
		Fetch url and build a one-line summary of the response.

		Returns None for URLs matching one of the skip patterns in
		other_regexes, the error string produced by _do_request() when the
		request fails, and otherwise a string made of the content type, the
		size, an "[AIA]" marker when AIA was needed, and for text/html
		responses the page title.
	"""
	for _, group in other_regexes:
		for regex in group:
			if regex.match(url):
				return None

	response, used_aia = _do_request(url)
	if isinstance(response, stringbase):
		return response

	output = []
	try:
		# Try to add type and length headers to reply
		c_type_fields = response.getheader('Content-Type', '').split(';')
		c_type = c_type_fields.pop(0).strip()
		# Media types are case-insensitive; compare in lower case but print as sent.
		is_html = c_type.lower() == 'text/html'
		c_charset = None
		for f in c_type_fields:
			f = f.strip()
			if f[0:8].lower() == 'charset=':
				value = f[8:].strip('"\' ')
				if value:
					c_charset = value

		# A missing, non-numeric or negative Content-Length is treated as absent
		# instead of letting int() raise out of the hook.
		try:
			c_len = int(response.getheader('Content-Length'))
		except (TypeError, ValueError):
			c_len = None
		if c_len is not None and c_len < 0:
			c_len = None

		if c_type != '':
			output.append("[%s] " % (c_type))
		else:
			output.append("[no type] ")
		if not is_html: # else length will be provided by HTML code below
			if c_len is not None:
				output.append("[%s] " % (_humanize_bytes(c_len)))
			else:
				output.append("[no length] ")

		if used_aia:
			output.append("[AIA] ")

		# Try to add title if HTML
		if is_html:
			try:
				responsebody = response.read(1024*1024)
			except Exception as e:
				output.append('Error reading response body: %s %r' % (type(e).__name__, e.args))
			else:
				if c_len is not None and len(responsebody) != c_len: # did we read a different amount than Content-Length?
					if response.read(1): # there's more data, we just aren't reading it
						output.append("[read %s; Content-Length %s] " % (_humanize_bytes(len(responsebody)), _humanize_bytes(c_len)))
					else:
						output.append("[actual %s; Content-Length %s] " % (_humanize_bytes(len(responsebody)), _humanize_bytes(c_len)))
				else: # Content-Length = amount read
					output.append("[%s] " % (_humanize_bytes(len(responsebody))))
				try:
					soup = BeautifulSoup(responsebody, from_encoding=c_charset)
					# get_text() also copes with empty titles and titles that
					# contain nested markup, where .string is None.
					title = soup.title.get_text().strip() if soup.title else ''
					if title:
						output.append('Title: ' + unescape('%s' % (title)))
					else:
						output.append('No title')
				except Exception as e:
					output.append('Title error: %s %r ' % (type(e).__name__, e.args))
	finally:
		# Release the connection whether or not the summary could be built.
		response.close()

	return ''.join(output)

# Patterns recognising a generic http(s) URL inside a chat line.
url_regex = (
	re.compile(r'https?://(?:[^/\s.]+\.)+[a-z0-9-]+(?:/[^\s\]>)}]+)?', re.I),
)
# (action, patterns) pairs for sites that must not be fetched. The action
# returns '' so process_line() emits no reply for them, and goturl() returns
# None for any URL matching one of these patterns.
other_regexes = (
	(lambda x: '', (re.compile(r"""https?://(?:www\.)?(?:twitter|x)\.com/""", re.I),)), # skip twitter
	(lambda x: '', (re.compile(r"""https?://(?:www\.)?reddit\.com/""", re.I),)), # skip new-reddit
	(lambda x: '', (re.compile(r"""https?://jfr\.im/git/""", re.I),)), # skip my gitweb
	(lambda x: '', (re.compile(r"""https?://(?:www\.)?wunderground\.com/""", re.I),)), # skip wunderground, they time us out
)
# Full dispatch table walked by process_line(): the skip entries come first,
# followed by the generic URL handler.
regexes = other_regexes + (
	(goturl, url_regex),
)
Commit	Line	Data
58cd0191	1	# Erebus IRC bot - Author: Conny Sjoblom
4477123d	2	# vim: fileencoding=utf-8
a83e1f9c	3	# URL Checker
	4	# This file is released into the public domain; see http://unlicense.org/
	5
	6	# module info
	7	modinfo = {
	8	'author': 'Erebus Team',
	9	'license': 'public domain',
fa93b933	10	'compatible': [0],
a62d0d18	11	'depends': [],
a62d0d18	12	'softdeps': [],
a83e1f9c	13	}
a83e1f9c	14
99366200 CS	15	# http://embed.ly/tools/generator
99366200 CS	16
a83e1f9c	17	# preamble
	18	import modlib
	19	lib = modlib.modlib(__name__)
	20	modstart = lib.modstart
	21	modstop = lib.modstop
	22
	23	# module code
a28e2ae9	24	import sys
a28e2ae9	25	if sys.version_info.major < 3:
55bfe803	26	stringbase = basestring
a28e2ae9	27	import urllib2
	28	import urlparse
	29	import HTMLParser
d266ce49	30	html = HTMLParser.HTMLParser()
a28e2ae9	31	from BeautifulSoup import BeautifulSoup
a28e2ae9	32	else:
55bfe803	33	stringbase = str
a28e2ae9	34	import urllib.request as urllib2
a28e2ae9	35	import urllib.parse as urlparse
d266ce49	36	import html
a28e2ae9	37	from bs4 import BeautifulSoup
169ed3b5	38	import http.client
a28e2ae9	39
467acacf	40	import re, json, datetime
a83e1f9c	41
8570a2ee JR	42	try:
	43	import aia
	44	aia_session = aia.AIASession()
	45	# aia is broken on capath systems, needs cafile to work
	46	aia_session._context.load_verify_locations(cafile='/etc/ssl/certs/ca-certificates.crt')
	47	aia_session._trusted = {
	48	aia.openssl_get_cert_info(ca_der)["subject"]: ca_der
	49	for ca_der in aia_session._context.get_ca_certs(True)
	50	}
	51	print("aia loaded")
	52	except ImportError as e:
	53	print(repr(e))
	54	aia = None
	55
390fbad4	56	hostmask_regex = re.compile(r'^(.)!(.)@(.*)$')
a83e1f9c	57
	58	def parser_hostmask(hostmask):
	59	if isinstance(hostmask, dict):
	60	return hostmask
	61
	62	nick = None
	63	user = None
	64	host = None
	65
	66	if hostmask is not None:
	67	match = hostmask_regex.match(hostmask)
	68
	69	if not match:
	70	nick = hostmask
	71	else:
	72	nick = match.group(1)
	73	user = match.group(2)
	74	host = match.group(3)
	75
	76	return {
	77	'nick': nick,
	78	'user': user,
	79	'host': host
	80	}
	81
394a7b69 CS	82	class SmartRedirectHandler(urllib2.HTTPRedirectHandler):
	83	def http_error_301(self, req, fp, code, msg, headers):
	84	result = urllib2.HTTPRedirectHandler.http_error_301(
	85	self, req, fp, code, msg, headers)
	86	result.status = code
	87	return result
	88
	89	def http_error_302(self, req, fp, code, msg, headers):
	90	result = urllib2.HTTPRedirectHandler.http_error_302(
	91	self, req, fp, code, msg, headers)
	92	result.status = code
	93	return result
	94
87f0733f JR	95	def _get_blocked_chans():
	96	return lib.parent.cfg.get('urls', 'blocked', '').split(',')
	97
467acacf	98	def process_line(line):
	99	responses = []
	100	num_found = 0
	101	limit = lib.parent.cfg.getint('urls', 'limit', 2)
ecbed328	102	for action, group in regexes:
467acacf	103	for regex in group:
	104	for match in regex.findall(line):
	105	if match:
	106	num_found += 1
	107	if num_found > limit:
	108	return responses
55bfe803 JR	109	if isinstance(match, stringbase):
	110	resp = action(match)
	111	else:
	112	resp = action(*match)
ecbed328 JR	113	if resp is not None and resp != "":
ecbed328 JR	114	responses.append(resp)
467acacf	115	return responses
467acacf	116
a83e1f9c	117	@lib.hooknum("PRIVMSG")
390fbad4 CS	118	def privmsg_hook(bot, textline):
	119	user = parser_hostmask(textline[1:textline.find(' ')])
	120	chan = textline.split()[2]
a83e1f9c	121
87f0733f JR	122	if chan in _get_blocked_chans(): return
87f0733f JR	123
a83e1f9c	124	try:
390fbad4	125	line = textline.split(None, 3)[3][1:]
a83e1f9c	126	except IndexError:
390fbad4	127	line = ''
a83e1f9c	128
467acacf	129	responses = process_line(line)
9de26fbb JR	130	send_response(bot, chan, responses)
	131
	132	def send_response(bot, chan, responses):
04d48353	133	if len(responses) > 0:
467acacf	134	if lib.parent.cfg.getboolean('urls', 'multiline'):
	135	for r in responses:
	136	bot.msg(chan, r, True)
	137	else:
	138	bot.msg(chan, ' \| '.join(responses), True)
a83e1f9c	139
390fbad4	140	def unescape(line):
d266ce49	141	return re.sub('\s+', ' ', html.unescape(line))
a83e1f9c	142
	143	def gotspotify(type, track):
	144	url = 'http://ws.spotify.com/lookup/1/?uri=spotify:%s:%s' % (type, track)
	145	xml = urllib2.urlopen(url).read()
390fbad4	146	soup = BeautifulSoup(xml, convertEntities=BeautifulSoup.HTML_ENTITIES)
a83e1f9c	147	lookup_type = soup.contents[2].name
390fbad4	148
a83e1f9c	149	if lookup_type == 'track':
	150	name = soup.find('name').string
	151	album_name = soup.find('album').find('name').string
	152	artist_name = soup.find('artist').find('name').string
	153	popularity = soup.find('popularity')
	154	if popularity:
	155	popularity = float(popularity.string)*100
	156	length = float(soup.find('length').string)
	157	minutes = int(length)/60
467acacf	158	seconds = int(length)%60
390fbad4	159
dafa38fc	160	return unescape('Track: %s - %s / %s %s:%.2d %2d%%' % (artist_name, name, album_name, minutes, seconds, popularity))
390fbad4	161
a83e1f9c	162	elif lookup_type == 'album':
	163	album_name = soup.find('album').find('name').string
	164	artist_name = soup.find('artist').find('name').string
	165	released = soup.find('released').string
dafa38fc	166	return unescape('Album: %s - %s - %s' % (artist_name, album_name, released))
390fbad4	167
a83e1f9c	168	else:
	169	return 'Unsupported type.'
	170
467acacf	171	def _yt_duration(s):
	172	mo = re.match(r'P(\d+D)?T(\d+H)?(\d+M)?(\d+S)?', s)
	173	pcs = [x for x in mo.groups() if x]
	174	return ''.join(pcs).lower()
	175	def _yt_date(s, f):
	176	mo = re.match(r'(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2})\.(\d+)Z', s)
	177	return datetime.datetime(*(int(x) for x in mo.groups())).strftime(f)
	178	def _yt_round(n):
	179	n = float(n)
	180	if n >= 10**12:
	181	return '%.1ft' % (n/10**12)
	182	elif n >= 10**9:
	183	return '%.1fb' % (n/10**9)
	184	elif n >= 10**6:
	185	return '%.1fm' % (n/10**6)
	186	elif n >= 10**3:
	187	return '%.1fk' % (n/10**3)
	188	else:
	189	return int(n)
	190
a83e1f9c	191	def gotyoutube(url):
	192	url_data = urlparse.urlparse(url)
	193	query = urlparse.parse_qs(url_data.query)
	194	video = query["v"][0]
467acacf	195	api_url = 'https://www.googleapis.com/youtube/v3/videos?part=snippet,contentDetails,statistics&id=%s&key=%s' % (video, lib.parent.cfg.get('urls', 'api_key'))
a83e1f9c	196	try:
a83e1f9c	197	respdata = urllib2.urlopen(api_url).read()
467acacf	198	v = json.loads(respdata)
	199	v = v['items'][0]
	200
	201	return unescape(lib.parent.cfg.get('urls', 'yt_format', "\002%(author)s\002: \037%(title)s\037 [%(duration)s, uploaded %(uploaded)s, %(views)s v/%(likes)s l/%(dislikes)s d]") % {
	202	'title': v['snippet']['title'],
	203	'author': v['snippet']['channelTitle'],
	204	'duration': _yt_duration(v['contentDetails']['duration']),
	205	'uploaded': _yt_date(v['snippet']['publishedAt'], lib.parent.cfg.get('urls', 'yt_date_format', '%b %d %Y')),
	206	'views': _yt_round(v['statistics']['viewCount']),
	207	'likes': _yt_round(v['statistics']['likeCount']),
	208	'dislikes': _yt_round(v['statistics']['dislikeCount']),
	209	})
	210	except urllib2.HTTPError as e:
	211	if e.getcode() == 403:
	212	return 'API limit exceeded'
	213	else:
	214	return str(e)
	215	except IndexError:
	216	return 'no results'
	217	except Exception as e:
	218	return str(e)
a83e1f9c	219
390fbad4	220	def gottwitch(uri):
467acacf	221	url = 'https://api.twitch.tv/helix/streams?user_login=%s' % uri.split('/')[0]
	222	opener = urllib2.build_opener()
	223	opener.addheaders = [('Client-ID', lib.parent.cfg.get('urls', 'twitch_api_key'))]
	224	respdata = opener.open(url).read()
	225	twitch = json.loads(respdata)['data']
	226	try:
	227	# TODO: add current game.
	228	return unescape('\037%s\037 is %s (%s)' % (twitch[0]['user_name'], twitch[0]['type'], twitch[0]['title']))
	229	except:
	230	return 'Channel offline.'
390fbad4	231
ecbed328 JR	232	def _humanize_bytes(b):
	233	b = int(b)
	234	i = 0
	235	table = " kMGTPEZYRQ"
	236	while b > 1024:
	237	i += 1
	238	b /= 1024.0
	239	if i == 0:
	240	return "%dB" % (b)
	241	else:
	242	return "%.2f%siB" % (b, table[i])
	243
8570a2ee	244	def _do_request(url, try_aia=False):
41e5ba5b JR	245	"""
	246	Return value is a tuple consisting of:
	247	- the HTTPResponse object, or a string on error. Empty string -> no response.
	248	- and a flag indicating whether AIA was used
	249	"""
b6a309d2	250	try:
744f1db9	251	request = urllib2.Request(url, headers={
52c80cff JR	252	'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,/;q=0.8,application/signed-exchange;v=b3;q=0.7',
	253	'accept-language': 'en-US,en;q=0.9',
	254	'cache-control': 'max-age=0',
	255	'sec-ch-ua': '"Chromium";v="122", "Not(A:Brand";v="24", "Google Chrome";v="122"',
	256	'sec-ch-ua-mobile': '?0',
	257	'sec-ch-ua-platform': '"Linux"',
	258	'sec-fetch-dest': 'document',
	259	'sec-fetch-mode': 'navigate',
	260	'sec-fetch-site': 'none',
	261	'sec-fetch-user': '?1',
	262	'upgrade-insecure-requests': '1',
	263	'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
744f1db9	264	})
b6a309d2 JR	265	except ValueError:
b6a309d2 JR	266	return '', False
8570a2ee	267	if try_aia:
c5db6025 JR	268	try:
	269	opener = urllib2.build_opener(urllib2.HTTPSHandler(context=aia_session.ssl_context_from_url(url)), SmartRedirectHandler())
	270	except aia.AIAError as e:
	271	return 'Request error: %s.%s: %s' % (e.__module__, e.__class__.__name__, e.args[0]), True
8570a2ee JR	272	else:
8570a2ee JR	273	opener = urllib2.build_opener(SmartRedirectHandler())
ecbed328 JR	274
ecbed328 JR	275	# Send request and handle errors
993046cc	276	try:
ecbed328	277	response = opener.open(request, timeout=2)
169ed3b5	278	except http.client.InvalidURL as e: # why does a method under urllib.request raise an exception under http.client???
41e5ba5b	279	return '', False
de8ab9cb	280	except urllib2.HTTPError as e:
41e5ba5b	281	return 'Request error: %s %s' % (e.code, e.reason), False
74dc2a9d	282	except urllib2.URLError as e:
8570a2ee	283	if "certificate verify failed: unable to get local issuer certificate" in str(e.reason):
41e5ba5b	284	if aia: # Retry with AIA enabled, if module is present
8570a2ee JR	285	return _do_request(url, True)
	286	else:
	287	lib.parent.log('urls', '?', 'If the site is not serving the certificate chain, installing the aia library might make this request work: pip install aia')
41e5ba5b	288	return 'Request error: site may have broken TLS configuration (%s)' % (e.reason), False
8570a2ee	289	else:
41e5ba5b	290	return 'Request error: %s' % (e.reason), False
9df62f90	291	except TimeoutError as e:
41e5ba5b	292	return 'Request error: request timed out', False
04d48353	293	except Exception as e:
41e5ba5b	294	return 'Unknown error: %s %r' % (type(e).__name__, e.args), False
ecbed328	295
bd96ac57	296	return response, try_aia
8570a2ee JR	297
	298
	299	def goturl(url):
	300	output = []
	301	for _, group in other_regexes:
	302	for regex in group:
	303	if regex.match(url):
	304	return None
	305
bd96ac57	306	response, used_aia = _do_request(url)
8570a2ee JR	307	if isinstance(response, stringbase):
	308	return response
	309
ecbed328	310	# Try to add type and length headers to reply
b91b84fa JR	311	c_type_fields = response.getheader('Content-Type', '').split(';')
	312	c_type = c_type_fields.pop(0)
	313	c_charset = None
	314	for f in c_type_fields:
	315	f = f.strip()
	316	if len(f) > 8 and f[0:8] == 'charset=':
	317	c_charset = f[8:]
ecbed328 JR	318	c_len = response.getheader('Content-Length')
	319	if c_type != '':
	320	output.append("[%s] " % (c_type))
	321	else:
	322	output.append("[no type] ")
	323	if c_type != "text/html": # else length will be provided by HTML code below
	324	if c_len is not None:
	325	output.append("[%s] " % (_humanize_bytes(c_len)))
	326	else:
	327	output.append("[no length] ")
	328
bd96ac57 JR	329	if used_aia:
	330	output.append("[AIA] ")
	331
ecbed328 JR	332	# Try to add title if HTML
	333	if c_type == 'text/html':
	334	try:
	335	responsebody = response.read(1024*1024)
ecbed328 JR	336	except Exception as e:
	337	output.append('Error reading response body: %s %r' % (type(e).__name__, e.args))
	338	else:
46ab933c JR	339	if c_len is not None and len(responsebody) != int(c_len): # did we read a different amount than Content-Length?
	340	if response.read(1): # there's more data, we just aren't reading it
	341	output.append("[read %s; Content-Length %s] " % (_humanize_bytes(len(responsebody)), _humanize_bytes(c_len)))
	342	else:
	343	output.append("[actual %s; Content-Length %s] " % (_humanize_bytes(len(responsebody)), _humanize_bytes(c_len)))
	344	else: # Content-Length = amount read
ecbed328 JR	345	output.append("[%s] " % (_humanize_bytes(len(responsebody))))
ecbed328 JR	346	try:
b91b84fa	347	soup = BeautifulSoup(responsebody, from_encoding=c_charset)
ecbed328	348	if soup.title:
07fbfaa6	349	output.append('Title: ' + unescape('%s' % (soup.find('title').string.strip())))
ecbed328 JR	350	else:
	351	output.append('No title')
	352	except Exception as e:
	353	output.append('Title error: %s %r ' % (type(e).__name__, e.args))
	354
	355	return ''.join(output)
467acacf	356
467acacf	357	url_regex = (
7429451e	358	re.compile(r'https?://(?:[^/\s.]+\.)+[a-z0-9-]+(?:/[^\s\]>)}]+)?', re.I),
467acacf	359	)
467acacf	360	other_regexes = (
2412ad6e	361	(lambda x: '', (re.compile(r"""https?://(?:www\.)?(?:twitter\|x)\.com/""", re.I),)), # skip twitter
cf848537	362	(lambda x: '', (re.compile(r"""https?://(?:www\.)?reddit\.com/""", re.I),)), # skip new-reddit
6634a02c	363	(lambda x: '', (re.compile(r"""https?://jfr\.im/git/""", re.I),)), # skip my gitweb
d2439073	364	(lambda x: '', (re.compile(r"""https?://(?:www\.)?wunderground\.com/""", re.I),)), # skip wunderground, they time us out
467acacf	365	)
467acacf	366	regexes = other_regexes + (
ecbed328	367	(goturl, url_regex),
467acacf	368	)