# Erebus IRC bot - Author: Conny Sjoblom
# vim: fileencoding=utf-8
# URL Checker
# This file is released into the public domain; see http://unlicense.org/

# module info
modinfo = {
	'author': 'Erebus Team',
	'license': 'public domain',
	'compatible': [0],
	'depends': [],
	'softdeps': [],
}

# http://embed.ly/tools/generator

# preamble
import modlib
lib = modlib.modlib(__name__)
modstart = lib.modstart
modstop = lib.modstop

# module code
import sys
if sys.version_info.major < 3:
	stringbase = basestring
	import urllib2
	import urlparse
	import HTMLParser
	html = HTMLParser.HTMLParser()
	from BeautifulSoup import BeautifulSoup
else:
	stringbase = str
	import urllib.request as urllib2
	import urllib.parse as urlparse
	import html
	from bs4 import BeautifulSoup

import re, json, datetime

# Optional AIA chasing: fetches missing intermediate certificates for
# sites that do not serve their full chain (see _do_request below).
try:
	import aia
	aia_session = aia.AIASession()
	# aia is broken on capath systems, needs cafile to work
	aia_session._context.load_verify_locations(cafile='/etc/ssl/certs/ca-certificates.crt')
	aia_session._trusted = {
		aia.openssl_get_cert_info(ca_der)["subject"]: ca_der
		for ca_der in aia_session._context.get_ca_certs(True)
	}
	print("aia loaded")
except ImportError as e:
	print(repr(e))
	aia = None

hostmask_regex = re.compile(r'^(.*)!(.*)@(.*)$')

def parser_hostmask(hostmask):
	if isinstance(hostmask, dict):
		return hostmask

	nick = None
	user = None
	host = None

	if hostmask is not None:
		match = hostmask_regex.match(hostmask)

		if not match:
			nick = hostmask
		else:
			nick = match.group(1)
			user = match.group(2)
			host = match.group(3)

	return {
		'nick': nick,
		'user': user,
		'host': host
	}

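# parser_hostmask examples (derived from the regex above):
#   parser_hostmask('nick!user@host.example.com')
#       -> {'nick': 'nick', 'user': 'user', 'host': 'host.example.com'}
#   parser_hostmask('irc.example.net')  # no '!'/'@' separators
#       -> {'nick': 'irc.example.net', 'user': None, 'host': None}
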
class SmartRedirectHandler(urllib2.HTTPRedirectHandler):
	"""Redirect handler that records the redirect status code (301/302) on the returned response."""
	def http_error_301(self, req, fp, code, msg, headers):
		result = urllib2.HTTPRedirectHandler.http_error_301(
			self, req, fp, code, msg, headers)
		result.status = code
		return result

	def http_error_302(self, req, fp, code, msg, headers):
		result = urllib2.HTTPRedirectHandler.http_error_302(
			self, req, fp, code, msg, headers)
		result.status = code
		return result

def _get_blocked_chans():
	return lib.parent.cfg.get('urls', 'blocked', '').split(',')

def process_line(line):
	responses = []
	num_found = 0
	limit = lib.parent.cfg.getint('urls', 'limit', 2)
	for action, group in regexes:
		for regex in group:
			for match in regex.findall(line):
				if match:
					num_found += 1
					if num_found > limit:
						return responses
					if isinstance(match, stringbase):
						resp = action(match)
					else:
						resp = action(*match)
					if resp is not None and resp != "":
						responses.append(resp)
	return responses

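# process_line note: with the default 'urls'.limit of 2, a line such as
#   'see https://example.com/foo and https://example.net/bar'
# yields at most two lookups. Every regex hit counts against the limit,
# including URLs matched only by the skip patterns in other_regexes below.
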
@lib.hooknum("PRIVMSG")
def privmsg_hook(bot, textline):
	user = parser_hostmask(textline[1:textline.find(' ')])
	chan = textline.split()[2]

	if chan in _get_blocked_chans(): return

	try:
		line = textline.split(None, 3)[3][1:]
	except IndexError:
		line = ''

	responses = process_line(line)
	if len(responses) > 0:
		if lib.parent.cfg.getboolean('urls', 'multiline'):
			for r in responses:
				bot.msg(chan, r, True)
		else:
			bot.msg(chan, ' | '.join(responses), True)

def unescape(line):
	# decode HTML entities and collapse runs of whitespace
	return re.sub(r'\s+', ' ', html.unescape(line))

def gotspotify(type, track):
	url = 'http://ws.spotify.com/lookup/1/?uri=spotify:%s:%s' % (type, track)
	xml = urllib2.urlopen(url).read()
	# note: convertEntities is a BeautifulSoup 3 argument (Python 2 path only)
	soup = BeautifulSoup(xml, convertEntities=BeautifulSoup.HTML_ENTITIES)
	lookup_type = soup.contents[2].name

	if lookup_type == 'track':
		name = soup.find('name').string
		album_name = soup.find('album').find('name').string
		artist_name = soup.find('artist').find('name').string
		popularity = soup.find('popularity')
		if popularity:
			popularity = float(popularity.string)*100
		length = float(soup.find('length').string)
		minutes = int(length)//60 # integer division; plain / would produce a float on Python 3
		seconds = int(length)%60

		return unescape('Track: %s - %s / %s %s:%.2d %2d%%' % (artist_name, name, album_name, minutes, seconds, popularity))

	elif lookup_type == 'album':
		album_name = soup.find('album').find('name').string
		artist_name = soup.find('artist').find('name').string
		released = soup.find('released').string
		return unescape('Album: %s - %s - %s' % (artist_name, album_name, released))

	else:
		return 'Unsupported type.'

def _yt_duration(s):
	mo = re.match(r'P(\d+D)?T(\d+H)?(\d+M)?(\d+S)?', s)
	pcs = [x for x in mo.groups() if x]
	return ''.join(pcs).lower()

def _yt_date(s, f):
	mo = re.match(r'(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2})\.(\d+)Z', s)
	return datetime.datetime(*(int(x) for x in mo.groups())).strftime(f)

def _yt_round(n):
	n = float(n)
	if n >= 10**12:
		return '%.1ft' % (n/10**12)
	elif n >= 10**9:
		return '%.1fb' % (n/10**9)
	elif n >= 10**6:
		return '%.1fm' % (n/10**6)
	elif n >= 10**3:
		return '%.1fk' % (n/10**3)
	else:
		return int(n)

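# Sanity examples for the helpers above (derived from their definitions):
#   _yt_duration('PT1H2M3S') -> '1h2m3s'
#   _yt_date('2020-01-02T03:04:05.000Z', '%b %d %Y') -> 'Jan 02 2020'
#   _yt_round(1234567) -> '1.2m'; _yt_round(999) -> 999
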
def gotyoutube(url):
	url_data = urlparse.urlparse(url)
	query = urlparse.parse_qs(url_data.query)
	video = query["v"][0]
	api_url = 'https://www.googleapis.com/youtube/v3/videos?part=snippet,contentDetails,statistics&id=%s&key=%s' % (video, lib.parent.cfg.get('urls', 'api_key'))
	try:
		respdata = urllib2.urlopen(api_url).read()
		v = json.loads(respdata)
		v = v['items'][0]

		return unescape(lib.parent.cfg.get('urls', 'yt_format', "\002%(author)s\002: \037%(title)s\037 [%(duration)s, uploaded %(uploaded)s, %(views)s v/%(likes)s l/%(dislikes)s d]") % {
			'title': v['snippet']['title'],
			'author': v['snippet']['channelTitle'],
			'duration': _yt_duration(v['contentDetails']['duration']),
			'uploaded': _yt_date(v['snippet']['publishedAt'], lib.parent.cfg.get('urls', 'yt_date_format', '%b %d %Y')),
			'views': _yt_round(v['statistics']['viewCount']),
			'likes': _yt_round(v['statistics']['likeCount']),
			'dislikes': _yt_round(v['statistics']['dislikeCount']),
		})
	except urllib2.HTTPError as e:
		if e.getcode() == 403:
			return 'API limit exceeded'
		else:
			return str(e)
	except IndexError:
		return 'no results'
	except Exception as e:
		return str(e)

def gottwitch(uri):
	url = 'https://api.twitch.tv/helix/streams?user_login=%s' % uri.split('/')[0]
	opener = urllib2.build_opener()
	opener.addheaders = [('Client-ID', lib.parent.cfg.get('urls', 'twitch_api_key'))]
	respdata = opener.open(url).read()
	twitch = json.loads(respdata)['data']
	try:
		# TODO: add current game.
		return unescape('\037%s\037 is %s (%s)' % (twitch[0]['user_name'], twitch[0]['type'], twitch[0]['title']))
	except (IndexError, KeyError): # an empty 'data' list means the channel is not live
		return 'Channel offline.'

def _humanize_bytes(b):
	b = int(b)
	i = 0
	table = " kMGTPEZYRQ"
	while b > 1024:
		i += 1
		b /= 1024.0
	if i == 0:
		return "%dB" % (b)
	else:
		return "%.2f%siB" % (b, table[i])

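# _humanize_bytes examples (derived from the definition above):
#   _humanize_bytes(512) -> '512B'
#   _humanize_bytes(2048) -> '2.00kiB'
#   _humanize_bytes(3 * 1024**3) -> '3.00GiB'
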
def _do_request(url, try_aia=False):
	"""Returns the HTTPResponse object, or a string on error"""
	request = urllib2.Request(url, headers={
		'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
		'Sec-Ch-Ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
		'Sec-Ch-Ua-Mobile': '?0',
		'Sec-Ch-Ua-Platform': '"Linux"',
		'Sec-Fetch-Dest': 'document',
		'Sec-Fetch-Mode': 'navigate',
		'Sec-Fetch-Site': 'same-origin',
		'Sec-Fetch-User': '?1',
		'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
		'Accept-Language': 'en-US,en;q=0.9',
		'Cache-Control': 'no-cache',
		'Pragma': 'no-cache',
		'Upgrade-Insecure-Requests': '1',
	})

	if try_aia:
		opener = urllib2.build_opener(urllib2.HTTPSHandler(context=aia_session.ssl_context_from_url(url)), SmartRedirectHandler())
	else:
		opener = urllib2.build_opener(SmartRedirectHandler())

	# Send request and handle errors
	try:
		response = opener.open(request, timeout=2)
	except urllib2.HTTPError as e:
		return 'Request error: %s %s' % (e.code, e.reason)
	except urllib2.URLError as e:
		if "certificate verify failed: unable to get local issuer certificate" in str(e.reason):
			if aia: # Retry with AIA enabled
				return _do_request(url, True)
			else:
				lib.parent.log('urls', '?', 'If the site is not serving the certificate chain, installing the aia library might make this request work: pip install aia')
				return 'Request error: site may have broken TLS configuration (%s)' % (e.reason)
		else:
			return 'Request error: %s' % (e.reason)
	except TimeoutError as e:
		return 'Request error: request timed out'
	except Exception as e:
		return 'Unknown error: %s %r' % (type(e).__name__, e.args)

	return response


def goturl(url):
	output = []
	for _, group in other_regexes:
		for regex in group:
			if regex.match(url):
				return None

	response = _do_request(url)
	if isinstance(response, stringbase):
		return response

	# Try to add type and length headers to reply
	c_type = response.getheader('Content-Type', '').split(';', 1)[0]
	c_len = response.getheader('Content-Length')
	if c_type != '':
		output.append("[%s] " % (c_type))
	else:
		output.append("[no type] ")
	if c_type != "text/html": # else length will be provided by HTML code below
		if c_len is not None:
			output.append("[%s] " % (_humanize_bytes(c_len)))
		else:
			output.append("[no length] ")

	# Try to add title if HTML
	if c_type == 'text/html':
		try:
			responsebody = response.read(1024*1024)
		except Exception as e:
			output.append('Error reading response body: %s %r' % (type(e).__name__, e.args))
		else:
			if c_len is not None and len(responsebody) != int(c_len):
				output.append("[actual %s; Content-Length %s] " % (_humanize_bytes(len(responsebody)), _humanize_bytes(c_len)))
			else:
				output.append("[%s] " % (_humanize_bytes(len(responsebody))))
			try:
				soup = BeautifulSoup(responsebody)
				if soup.title:
					output.append('Title: ' + unescape('%s' % (soup.find('title').string.strip())))
				else:
					output.append('No title')
			except Exception as e:
				output.append('Title error: %s %r ' % (type(e).__name__, e.args))

	return ''.join(output)

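# Illustrative reply shapes (actual values depend on the fetched page):
#   HTML page:  '[text/html] [12.34kiB] Title: Example Domain'
#   other type: '[<content-type>] [<size>] ' (or '[no length] '), no title
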
url_regex = (
	re.compile(r'https?://(?:[^/\s.]+\.)+[^/\s.]+(?:/\S+)?'),
)
other_regexes = (
	(lambda x: '', (re.compile(r"""https?://(?:www\.)?(?:twitter|x)\.com/""", re.I),)), # skip twitter
	(lambda x: '', (re.compile(r"""https?://(?:www\.)?reddit\.com/""", re.I),)), # skip new-reddit
)
regexes = other_regexes + (
	(goturl, url_regex),
)
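
# Config options read by this module (all under the 'urls' section):
#   blocked        - comma-separated list of channels to ignore
#   limit          - max URL lookups per line (default 2)
#   multiline      - send one message per result instead of joining with ' | '
#   api_key        - YouTube Data API v3 key (gotyoutube)
#   yt_format      - YouTube reply format string
#   yt_date_format - strftime format for upload dates (default '%b %d %Y')
#   twitch_api_key - Twitch Client-ID header value (gottwitch)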