]> jfr.im git - erebus.git/blame - modules/urls.py
urls - remove no-cache
[erebus.git] / modules / urls.py
CommitLineData
58cd0191 1# Erebus IRC bot - Author: Conny Sjoblom
4477123d 2# vim: fileencoding=utf-8
a83e1f9c 3# URL Checker
4# This file is released into the public domain; see http://unlicense.org/
5
# module info
# Metadata consumed by the Erebus module loader.
modinfo = {
	'author': 'Erebus Team',
	'license': 'public domain',
	'compatible': [0],  # bot core version(s) this module works with — presumably checked by the loader
	'depends': [],  # modules that must be loaded for this one to work
	'softdeps': [],  # optional modules used when present
}
14
99366200
CS
# http://embed.ly/tools/generator

# preamble
import modlib
lib = modlib.modlib(__name__)  # per-module helper: provides hooks, cfg and logging (see uses below)
modstart = lib.modstart  # standard entry points re-exported for the module loader
modstop = lib.modstop
22
# module code
import sys
if sys.version_info.major < 3:
	# Python 2: keep the Py2 stdlib names
	stringbase = basestring  # base type for "is this a string?" checks below
	import urllib2
	import urlparse
	import HTMLParser
	html = HTMLParser.HTMLParser()  # instance exposes .unescape(), mirroring Py3's html module
	from BeautifulSoup import BeautifulSoup
else:
	stringbase = str
	# alias Py3 modules under their Py2 names so the rest of the file is version-agnostic
	import urllib.request as urllib2
	import urllib.parse as urlparse
	import html
	from bs4 import BeautifulSoup
# NOTE(review): unconditional Py3-only import — the Py2 branch above may be vestigial
import http.client

import re, json, datetime
a83e1f9c 41
8570a2ee
JR
# Optional dependency: the 'aia' library chases Authority Information Access
# URLs to fetch intermediate certificates missing from a server's chain.
# Used as a TLS fallback by _do_request; 'aia' is None when unavailable.
try:
	import aia
	aia_session = aia.AIASession()
	# aia is broken on capath systems, needs cafile to work
	aia_session._context.load_verify_locations(cafile='/etc/ssl/certs/ca-certificates.crt')
	# rebuild the session's trusted-root map from the freshly loaded CAs
	# (pokes aia's private attributes — may break on aia upgrades)
	aia_session._trusted = {
		aia.openssl_get_cert_info(ca_der)["subject"]: ca_der
		for ca_der in aia_session._context.get_ca_certs(True)
	}
	print("aia loaded")
except ImportError as e:
	print(repr(e))
	aia = None
55
390fbad4 56hostmask_regex = re.compile(r'^(.*)!(.*)@(.*)$')
a83e1f9c 57
58def parser_hostmask(hostmask):
59 if isinstance(hostmask, dict):
60 return hostmask
61
62 nick = None
63 user = None
64 host = None
65
66 if hostmask is not None:
67 match = hostmask_regex.match(hostmask)
68
69 if not match:
70 nick = hostmask
71 else:
72 nick = match.group(1)
73 user = match.group(2)
74 host = match.group(3)
75
76 return {
77 'nick': nick,
78 'user': user,
79 'host': host
80 }
81
394a7b69
CS
class SmartRedirectHandler(urllib2.HTTPRedirectHandler):
	"""Redirect handler that stamps the redirect's HTTP status code onto
	the response it returns (as a .status attribute)."""

	def _tag(self, result, code):
		# annotate the followed-redirect response with the code that caused it
		result.status = code
		return result

	def http_error_301(self, req, fp, code, msg, headers):
		followed = urllib2.HTTPRedirectHandler.http_error_301(
			self, req, fp, code, msg, headers)
		return self._tag(followed, code)

	def http_error_302(self, req, fp, code, msg, headers):
		followed = urllib2.HTTPRedirectHandler.http_error_302(
			self, req, fp, code, msg, headers)
		return self._tag(followed, code)
94
87f0733f
JR
def _get_blocked_chans():
	"""Channels where URL handling is disabled ([urls] blocked= in config, comma-separated)."""
	blocked = lib.parent.cfg.get('urls', 'blocked', '')
	return blocked.split(',')
97
467acacf 98def process_line(line):
99 responses = []
100 num_found = 0
101 limit = lib.parent.cfg.getint('urls', 'limit', 2)
ecbed328 102 for action, group in regexes:
467acacf 103 for regex in group:
104 for match in regex.findall(line):
105 if match:
106 num_found += 1
107 if num_found > limit:
108 return responses
55bfe803
JR
109 if isinstance(match, stringbase):
110 resp = action(match)
111 else:
112 resp = action(*match)
ecbed328
JR
113 if resp is not None and resp != "":
114 responses.append(resp)
467acacf 115 return responses
116
a83e1f9c 117@lib.hooknum("PRIVMSG")
390fbad4
CS
118def privmsg_hook(bot, textline):
119 user = parser_hostmask(textline[1:textline.find(' ')])
120 chan = textline.split()[2]
a83e1f9c 121
87f0733f
JR
122 if chan in _get_blocked_chans(): return
123
a83e1f9c 124 try:
390fbad4 125 line = textline.split(None, 3)[3][1:]
a83e1f9c 126 except IndexError:
390fbad4 127 line = ''
a83e1f9c 128
467acacf 129 responses = process_line(line)
9de26fbb
JR
130 send_response(bot, chan, responses)
131
def send_response(bot, chan, responses):
	"""Deliver collected replies to chan.

	With [urls] multiline= set, each reply goes out as its own message;
	otherwise all replies are joined with ' | ' into a single message.
	An empty list sends nothing.
	"""
	if not responses:
		return
	if lib.parent.cfg.getboolean('urls', 'multiline'):
		for reply in responses:
			bot.msg(chan, reply, True)
	else:
		bot.msg(chan, ' | '.join(responses), True)
a83e1f9c 139
390fbad4 140def unescape(line):
d266ce49 141 return re.sub('\s+', ' ', html.unescape(line))
a83e1f9c 142
def gotspotify(type, track):
	"""Describe a Spotify track or album URI via the (long-deprecated)
	ws.spotify.com lookup service. Returns a one-line summary string."""
	url = 'http://ws.spotify.com/lookup/1/?uri=spotify:%s:%s' % (type, track)
	xml = urllib2.urlopen(url).read()
	# NOTE(review): convertEntities is a BeautifulSoup 3 argument; the bs4
	# import used on Py3 rejects it — this path only works under Py2/BS3.
	soup = BeautifulSoup(xml, convertEntities=BeautifulSoup.HTML_ENTITIES)
	lookup_type = soup.contents[2].name

	if lookup_type == 'track':
		name = soup.find('name').string
		album_name = soup.find('album').find('name').string
		artist_name = soup.find('artist').find('name').string
		popularity = soup.find('popularity')
		if popularity:
			popularity = float(popularity.string)*100
		length = float(soup.find('length').string)
		# fix: floor division — on Py3, '/' made minutes a float ('3.0:45')
		minutes = int(length)//60
		seconds = int(length)%60

		return unescape('Track: %s - %s / %s %s:%.2d %2d%%' % (artist_name, name, album_name, minutes, seconds, popularity))

	elif lookup_type == 'album':
		album_name = soup.find('album').find('name').string
		artist_name = soup.find('artist').find('name').string
		released = soup.find('released').string
		return unescape('Album: %s - %s - %s' % (artist_name, album_name, released))

	else:
		return 'Unsupported type.'
170
467acacf 171def _yt_duration(s):
172 mo = re.match(r'P(\d+D)?T(\d+H)?(\d+M)?(\d+S)?', s)
173 pcs = [x for x in mo.groups() if x]
174 return ''.join(pcs).lower()
175def _yt_date(s, f):
176 mo = re.match(r'(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2})\.(\d+)Z', s)
177 return datetime.datetime(*(int(x) for x in mo.groups())).strftime(f)
178def _yt_round(n):
179 n = float(n)
180 if n >= 10**12:
181 return '%.1ft' % (n/10**12)
182 elif n >= 10**9:
183 return '%.1fb' % (n/10**9)
184 elif n >= 10**6:
185 return '%.1fm' % (n/10**6)
186 elif n >= 10**3:
187 return '%.1fk' % (n/10**3)
188 else:
189 return int(n)
190
a83e1f9c 191def gotyoutube(url):
192 url_data = urlparse.urlparse(url)
193 query = urlparse.parse_qs(url_data.query)
194 video = query["v"][0]
467acacf 195 api_url = 'https://www.googleapis.com/youtube/v3/videos?part=snippet,contentDetails,statistics&id=%s&key=%s' % (video, lib.parent.cfg.get('urls', 'api_key'))
a83e1f9c 196 try:
197 respdata = urllib2.urlopen(api_url).read()
467acacf 198 v = json.loads(respdata)
199 v = v['items'][0]
200
201 return unescape(lib.parent.cfg.get('urls', 'yt_format', "\002%(author)s\002: \037%(title)s\037 [%(duration)s, uploaded %(uploaded)s, %(views)s v/%(likes)s l/%(dislikes)s d]") % {
202 'title': v['snippet']['title'],
203 'author': v['snippet']['channelTitle'],
204 'duration': _yt_duration(v['contentDetails']['duration']),
205 'uploaded': _yt_date(v['snippet']['publishedAt'], lib.parent.cfg.get('urls', 'yt_date_format', '%b %d %Y')),
206 'views': _yt_round(v['statistics']['viewCount']),
207 'likes': _yt_round(v['statistics']['likeCount']),
208 'dislikes': _yt_round(v['statistics']['dislikeCount']),
209 })
210 except urllib2.HTTPError as e:
211 if e.getcode() == 403:
212 return 'API limit exceeded'
213 else:
214 return str(e)
215 except IndexError:
216 return 'no results'
217 except Exception as e:
218 return str(e)
a83e1f9c 219
390fbad4 220def gottwitch(uri):
467acacf 221 url = 'https://api.twitch.tv/helix/streams?user_login=%s' % uri.split('/')[0]
222 opener = urllib2.build_opener()
223 opener.addheaders = [('Client-ID', lib.parent.cfg.get('urls', 'twitch_api_key'))]
224 respdata = opener.open(url).read()
225 twitch = json.loads(respdata)['data']
226 try:
227 # TODO: add current game.
228 return unescape('\037%s\037 is %s (%s)' % (twitch[0]['user_name'], twitch[0]['type'], twitch[0]['title']))
229 except:
230 return 'Channel offline.'
390fbad4 231
ecbed328
JR
232def _humanize_bytes(b):
233 b = int(b)
234 i = 0
235 table = " kMGTPEZYRQ"
236 while b > 1024:
237 i += 1
238 b /= 1024.0
239 if i == 0:
240 return "%dB" % (b)
241 else:
242 return "%.2f%siB" % (b, table[i])
243
8570a2ee 244def _do_request(url, try_aia=False):
41e5ba5b
JR
245 """
246 Return value is a tuple consisting of:
247 - the HTTPResponse object, or a string on error. Empty string -> no response.
248 - and a flag indicating whether AIA was used
249 """
b6a309d2 250 try:
744f1db9
JR
251 request = urllib2.Request(url, headers={
252 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
253 'Sec-Ch-Ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
254 'Sec-Ch-Ua-Mobile': '?0',
255 'Sec-Ch-Ua-Platform': '"Linux"',
256 'Sec-Fetch-Dest': 'document',
257 'Sec-Fetch-Mode': 'navigate',
258 'Sec-Fetch-Site': 'same-origin',
259 'Sec-Fetch-User': '?1',
260 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
261 'Accept-Language': 'en-US,en;q=0.9',
744f1db9
JR
262 'Upgrade-Insecure-Requests': '1'
263 })
b6a309d2
JR
264 except ValueError:
265 return '', False
8570a2ee
JR
266 if try_aia:
267 opener = urllib2.build_opener(urllib2.HTTPSHandler(context=aia_session.ssl_context_from_url(url)), SmartRedirectHandler())
268 else:
269 opener = urllib2.build_opener(SmartRedirectHandler())
ecbed328
JR
270
271 # Send request and handle errors
993046cc 272 try:
ecbed328 273 response = opener.open(request, timeout=2)
169ed3b5 274 except http.client.InvalidURL as e: # why does a method under urllib.request raise an exception under http.client???
41e5ba5b 275 return '', False
de8ab9cb 276 except urllib2.HTTPError as e:
41e5ba5b 277 return 'Request error: %s %s' % (e.code, e.reason), False
74dc2a9d 278 except urllib2.URLError as e:
8570a2ee 279 if "certificate verify failed: unable to get local issuer certificate" in str(e.reason):
41e5ba5b 280 if aia: # Retry with AIA enabled, if module is present
8570a2ee
JR
281 return _do_request(url, True)
282 else:
283 lib.parent.log('urls', '?', 'If the site is not serving the certificate chain, installing the aia library might make this request work: pip install aia')
41e5ba5b 284 return 'Request error: site may have broken TLS configuration (%s)' % (e.reason), False
8570a2ee 285 else:
41e5ba5b 286 return 'Request error: %s' % (e.reason), False
9df62f90 287 except TimeoutError as e:
41e5ba5b 288 return 'Request error: request timed out', False
04d48353 289 except Exception as e:
41e5ba5b 290 return 'Unknown error: %s %r' % (type(e).__name__, e.args), False
ecbed328 291
bd96ac57 292 return response, try_aia
8570a2ee
JR
293
294
def goturl(url):
	"""Fetch url and build a one-line summary for IRC.

	Output pieces, in order: [content-type] [size] [AIA marker] and, for
	HTML pages, the actual body size and the page <title>. Returns None
	for skip-listed URLs, or an error string from _do_request on failure.
	"""
	output = []
	# skip-listed sites (see other_regexes) produce no reply at all
	for _, group in other_regexes:
		for regex in group:
			if regex.match(url):
				return None

	response, used_aia = _do_request(url)
	# a string (rather than a response object) signals an error message
	if isinstance(response, stringbase):
		return response

	# Try to add type and length headers to reply
	# Content-Type may carry parameters, e.g. "text/html; charset=utf-8"
	c_type_fields = response.getheader('Content-Type', '').split(';')
	c_type = c_type_fields.pop(0)
	c_charset = None
	for f in c_type_fields:
		f = f.strip()
		if len(f) > 8 and f[0:8] == 'charset=':
			c_charset = f[8:]
	c_len = response.getheader('Content-Length')
	if c_type != '':
		output.append("[%s] " % (c_type))
	else:
		output.append("[no type] ")
	if c_type != "text/html": # else length will be provided by HTML code below
		if c_len is not None:
			output.append("[%s] " % (_humanize_bytes(c_len)))
		else:
			output.append("[no length] ")

	if used_aia:
		output.append("[AIA] ")

	# Try to add title if HTML
	if c_type == 'text/html':
		try:
			# cap the read at 1 MiB so huge pages can't stall the bot
			responsebody = response.read(1024*1024)
		except Exception as e:
			output.append('Error reading response body: %s %r' % (type(e).__name__, e.args))
		else:
			# report a mismatch between the declared and actual body size
			if c_len is not None and len(responsebody) != int(c_len):
				output.append("[actual %s; Content-Length %s] " % (_humanize_bytes(len(responsebody)), _humanize_bytes(c_len)))
			else:
				output.append("[%s] " % (_humanize_bytes(len(responsebody))))
			try:
				soup = BeautifulSoup(responsebody, from_encoding=c_charset)
				if soup.title:
					output.append('Title: ' + unescape('%s' % (soup.find('title').string.strip())))
				else:
					output.append('No title')
			except Exception as e:
				output.append('Title error: %s %r ' % (type(e).__name__, e.args))

	return ''.join(output)
467acacf 349
# generic matcher for any http(s) URL in a message
url_regex = (
	re.compile(r'https?://(?:[^/\s.]+\.)+[a-z0-9-]+(?:/[^\s\]>)}]+)?', re.I),
)
# (action, patterns) pairs whose action returns '' — these URLs are
# recognized but deliberately produce no reply
other_regexes = (
	(lambda x: '', (re.compile(r"""https?://(?:www\.)?(?:twitter|x)\.com/""", re.I),)), # skip twitter
	(lambda x: '', (re.compile(r"""https?://(?:www\.)?reddit\.com/""", re.I),)), # skip new-reddit
	(lambda x: '', (re.compile(r"""https?://jfr\.im/git/""", re.I),)), # skip my gitweb
	(lambda x: '', (re.compile(r"""https?://(?:www\.)?wunderground\.com/""", re.I),)), # skip wunderground, they time us out
)
# full dispatch table scanned by process_line; skip-list entries come first
# so their empty action wins before the generic goturl handler
regexes = other_regexes + (
	(goturl, url_regex),
)