# Erebus IRC bot - Author: Conny Sjoblom
# vim: fileencoding=utf-8
# URL Checker
# This file is released into the public domain; see http://unlicense.org/

# module info
modinfo = {
	'author': 'Erebus Team',
	'license': 'public domain',
	'compatible': [0],
	'depends': [],
	'softdeps': [],
}

# http://embed.ly/tools/generator

# preamble
import modlib
lib = modlib.modlib(__name__)
modstart = lib.modstart
modstop = lib.modstop
# module code
import sys
if sys.version_info.major < 3:
	stringbase = basestring
	import urllib2
	import urlparse
	import HTMLParser
	html = HTMLParser.HTMLParser()
	import httplib as httpclient # py2 counterpart of py3's http.client
	from BeautifulSoup import BeautifulSoup
else:
	stringbase = str
	import urllib.request as urllib2
	import urllib.parse as urlparse
	import html
	import http.client as httpclient # importing this unconditionally would break py2
	from bs4 import BeautifulSoup
import re, json, datetime

try:
	import aia
	aia_session = aia.AIASession()
	# aia is broken on capath systems, needs cafile to work
	aia_session._context.load_verify_locations(cafile='/etc/ssl/certs/ca-certificates.crt')
	aia_session._trusted = {
		aia.openssl_get_cert_info(ca_der)["subject"]: ca_der
		for ca_der in aia_session._context.get_ca_certs(True)
	}
	print("aia loaded")
except ImportError as e:
	print(repr(e))
	aia = None

hostmask_regex = re.compile(r'^(.*)!(.*)@(.*)$')
def parser_hostmask(hostmask):
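	"""Parse an IRC hostmask ('nick!user@host') into a dict.

	Example: parser_hostmask('alice!ident@example.com')
	returns {'nick': 'alice', 'user': 'ident', 'host': 'example.com'}.
	Dicts pass through unchanged; a bare nick yields user=host=None.
	"""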
	if isinstance(hostmask, dict):
		return hostmask

	nick = None
	user = None
	host = None

	if hostmask is not None:
		match = hostmask_regex.match(hostmask)

		if not match:
			nick = hostmask
		else:
			nick = match.group(1)
			user = match.group(2)
			host = match.group(3)

	return {
		'nick': nick,
		'user': user,
		'host': host
	}

class SmartRedirectHandler(urllib2.HTTPRedirectHandler):
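	"""Redirect handler that records the redirect status code (301/302) on
	the returned response object, so callers can tell a redirect happened."""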
	def http_error_301(self, req, fp, code, msg, headers):
		result = urllib2.HTTPRedirectHandler.http_error_301(
			self, req, fp, code, msg, headers)
		result.status = code
		return result

	def http_error_302(self, req, fp, code, msg, headers):
		result = urllib2.HTTPRedirectHandler.http_error_302(
			self, req, fp, code, msg, headers)
		result.status = code
		return result

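# Channels where URL handling is disabled: comma-separated list in the
# 'urls.blocked' config option.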
def _get_blocked_chans():
	return lib.parent.cfg.get('urls', 'blocked', '').split(',')

def process_line(line):
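	"""Scan one message for URLs and return a list of response strings.

	Each (action, regexes) pair in `regexes` is tried; string matches are
	passed to the action as one argument, tuple matches are unpacked. At
	most 'urls.limit' (default 2) URLs are processed per line, and empty
	responses are dropped.
	"""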
	responses = []
	num_found = 0
	limit = lib.parent.cfg.getint('urls', 'limit', 2)
	for action, group in regexes:
		for regex in group:
			for match in regex.findall(line):
				if match:
					num_found += 1
					if num_found > limit:
						return responses
					if isinstance(match, stringbase):
						resp = action(match)
					else:
						resp = action(*match)
					if resp is not None and resp != "":
						responses.append(resp)
	return responses

@lib.hooknum("PRIVMSG")
def privmsg_hook(bot, textline):
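	"""PRIVMSG hook: pull the channel and message text out of the raw IRC
	line, skip blocked channels, then announce any URLs found."""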
	user = parser_hostmask(textline[1:textline.find(' ')])
	chan = textline.split()[2]

	if chan in _get_blocked_chans(): return

	try:
		line = textline.split(None, 3)[3][1:]
	except IndexError:
		line = ''

	responses = process_line(line)
	send_response(bot, chan, responses)

def send_response(bot, chan, responses):
	if len(responses) > 0:
		if lib.parent.cfg.getboolean('urls', 'multiline'):
			for r in responses:
				bot.msg(chan, r, True)
		else:
			bot.msg(chan, ' | '.join(responses), True)

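# Collapse runs of whitespace and decode HTML entities, for one-line IRC output.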
def unescape(line):
	return re.sub(r'\s+', ' ', html.unescape(line))

def gotspotify(type, track):
	url = 'http://ws.spotify.com/lookup/1/?uri=spotify:%s:%s' % (type, track)
	xml = urllib2.urlopen(url).read()
	if sys.version_info.major < 3:
		soup = BeautifulSoup(xml, convertEntities=BeautifulSoup.HTML_ENTITIES)
	else: # bs4 dropped convertEntities; it decodes entities by default
		soup = BeautifulSoup(xml, 'html.parser')
	lookup_type = soup.contents[2].name

	if lookup_type == 'track':
		name = soup.find('name').string
		album_name = soup.find('album').find('name').string
		artist_name = soup.find('artist').find('name').string
		popularity = soup.find('popularity')
		if popularity:
			popularity = float(popularity.string)*100
		length = float(soup.find('length').string)
		minutes = int(length)//60 # integer division; '/' would give a float on py3
		seconds = int(length)%60

		return unescape('Track: %s - %s / %s %s:%.2d %2d%%' % (artist_name, name, album_name, minutes, seconds, popularity))

	elif lookup_type == 'album':
		album_name = soup.find('album').find('name').string
		artist_name = soup.find('artist').find('name').string
		released = soup.find('released').string
		return unescape('Album: %s - %s - %s' % (artist_name, album_name, released))

	else:
		return 'Unsupported type.'

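# Convert an ISO 8601 duration from the YouTube API ('PT1H2M3S') to a compact
# human form ('1h2m3s').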
def _yt_duration(s):
	mo = re.match(r'P(\d+D)?T(\d+H)?(\d+M)?(\d+S)?', s)
	pcs = [x for x in mo.groups() if x]
	return ''.join(pcs).lower()
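# Parse the API's 'publishedAt' timestamp and render it with format f.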
def _yt_date(s, f):
	# The fractional-second part is optional; the API does not always send it.
	mo = re.match(r'(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2})(?:\.\d+)?Z', s)
	return datetime.datetime(*(int(x) for x in mo.groups())).strftime(f)
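# Abbreviate large counts: 1234 -> '1.2k', 5600000 -> '5.6m'.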
def _yt_round(n):
	n = float(n)
	if n >= 10**12:
		return '%.1ft' % (n/10**12)
	elif n >= 10**9:
		return '%.1fb' % (n/10**9)
	elif n >= 10**6:
		return '%.1fm' % (n/10**6)
	elif n >= 10**3:
		return '%.1fk' % (n/10**3)
	else:
		return int(n)

def gotyoutube(url):
	url_data = urlparse.urlparse(url)
	query = urlparse.parse_qs(url_data.query)
	video = query["v"][0]
	api_url = 'https://www.googleapis.com/youtube/v3/videos?part=snippet,contentDetails,statistics&id=%s&key=%s' % (video, lib.parent.cfg.get('urls', 'api_key'))
	try:
		respdata = urllib2.urlopen(api_url).read()
		v = json.loads(respdata)
		v = v['items'][0]

		return unescape(lib.parent.cfg.get('urls', 'yt_format', "\002%(author)s\002: \037%(title)s\037 [%(duration)s, uploaded %(uploaded)s, %(views)s v/%(likes)s l/%(dislikes)s d]") % {
			'title': v['snippet']['title'],
			'author': v['snippet']['channelTitle'],
			'duration': _yt_duration(v['contentDetails']['duration']),
			'uploaded': _yt_date(v['snippet']['publishedAt'], lib.parent.cfg.get('urls', 'yt_date_format', '%b %d %Y')),
			'views': _yt_round(v['statistics']['viewCount']),
			# like/dislike counts may be missing (dislikes are no longer public)
			'likes': _yt_round(v['statistics'].get('likeCount', 0)),
			'dislikes': _yt_round(v['statistics'].get('dislikeCount', 0)),
		})
	except urllib2.HTTPError as e:
		if e.getcode() == 403:
			return 'API limit exceeded'
		else:
			return str(e)
	except IndexError:
		return 'no results'
	except Exception as e:
		return str(e)

def gottwitch(uri):
	url = 'https://api.twitch.tv/helix/streams?user_login=%s' % uri.split('/')[0]
	opener = urllib2.build_opener()
	opener.addheaders = [('Client-ID', lib.parent.cfg.get('urls', 'twitch_api_key'))]
	respdata = opener.open(url).read()
	twitch = json.loads(respdata)['data']
	try:
		# TODO: add current game.
		return unescape('\037%s\037 is %s (%s)' % (twitch[0]['user_name'], twitch[0]['type'], twitch[0]['title']))
	except (IndexError, KeyError): # 'data' is empty when the channel is not live
		return 'Channel offline.'

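# Format a byte count with binary prefixes, e.g. 2048 -> '2.00kiB'.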
def _humanize_bytes(b):
	b = int(b)
	i = 0
	table = " kMGTPEZYRQ"
	while b > 1024:
		i += 1
		b /= 1024.0
	if i == 0:
		return "%dB" % (b)
	else:
		return "%.2f%siB" % (b, table[i])

def _do_request(url, try_aia=False):
	"""
	Return value is a tuple consisting of:
	- the HTTPResponse object, or a string on error. Empty string -> no response.
	- and a flag indicating whether AIA was used
	"""
	try:
		request = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36', 'Sec-Ch-Ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"', 'Sec-Ch-Ua-Mobile': '?0', 'Sec-Ch-Ua-Platform': '"Linux"', 'Sec-Fetch-Dest': 'document', 'Sec-Fetch-Mode': 'navigate', 'Sec-Fetch-Site': 'same-origin', 'Sec-Fetch-User': '?1', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', 'Accept-Language': 'en-US,en;q=0.9', 'Cache-Control': 'no-cache', 'Pragma': 'no-cache', 'Upgrade-Insecure-Requests': '1'})
	except ValueError:
		return '', False
	if try_aia:
		opener = urllib2.build_opener(urllib2.HTTPSHandler(context=aia_session.ssl_context_from_url(url)), SmartRedirectHandler())
	else:
		opener = urllib2.build_opener(SmartRedirectHandler())

	# Send request and handle errors
	try:
		response = opener.open(request, timeout=2)
	except httpclient.InvalidURL as e: # why does a method under urllib.request raise an exception under http.client???
		return '', False
	except urllib2.HTTPError as e:
		return 'Request error: %s %s' % (e.code, e.reason), False
	except urllib2.URLError as e:
		if "certificate verify failed: unable to get local issuer certificate" in str(e.reason):
			if aia: # Retry with AIA enabled, if module is present
				return _do_request(url, True)
			else:
				lib.parent.log('urls', '?', 'If the site is not serving the certificate chain, installing the aia library might make this request work: pip install aia')
				return 'Request error: site may have broken TLS configuration (%s)' % (e.reason), False
		else:
			return 'Request error: %s' % (e.reason), False
	except TimeoutError as e:
		return 'Request error: request timed out', False
	except Exception as e:
		return 'Unknown error: %s %r' % (type(e).__name__, e.args), False

	return response, try_aia


def goturl(url):
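	"""Fetch a URL and build the announcement text: content type, size,
	whether AIA chain-fetching was used, and the page title for HTML."""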
	output = []
	for _, group in other_regexes:
		for regex in group:
			if regex.match(url):
				return None

	response, used_aia = _do_request(url)
	if isinstance(response, stringbase):
		return response

	# Try to add type and length headers to reply
	c_type_fields = response.getheader('Content-Type', '').split(';')
	c_type = c_type_fields.pop(0)
	c_charset = None
	for f in c_type_fields:
		f = f.strip()
		if f.startswith('charset='):
			c_charset = f[8:]
	c_len = response.getheader('Content-Length')
	if c_type != '':
		output.append("[%s] " % (c_type))
	else:
		output.append("[no type] ")
	if c_type != "text/html": # else length will be provided by HTML code below
		if c_len is not None:
			output.append("[%s] " % (_humanize_bytes(c_len)))
		else:
			output.append("[no length] ")

	if used_aia:
		output.append("[AIA] ")

	# Try to add title if HTML
	if c_type == 'text/html':
		try:
			responsebody = response.read(1024*1024)
		except Exception as e:
			output.append('Error reading response body: %s %r' % (type(e).__name__, e.args))
		else:
			if c_len is not None and len(responsebody) != int(c_len):
				output.append("[actual %s; Content-Length %s] " % (_humanize_bytes(len(responsebody)), _humanize_bytes(c_len)))
			else:
				output.append("[%s] " % (_humanize_bytes(len(responsebody))))
			try:
				soup = BeautifulSoup(responsebody, 'html.parser', from_encoding=c_charset) # bs4 wants an explicit parser
				if soup.title:
					output.append('Title: ' + unescape('%s' % (soup.find('title').string.strip())))
				else:
					output.append('No title')
			except Exception as e:
				output.append('Title error: %s %r ' % (type(e).__name__, e.args))

	return ''.join(output)

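# Each entry below pairs an action with a tuple of regexes; process_line()
# feeds every match to the action. The skip actions return '', which
# process_line() discards, so those URLs produce no output.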
url_regex = (
	re.compile(r'https?://(?:[^/\s.]+\.)+[^/\s.]+(?:/\S+)?'),
)
other_regexes = (
	(lambda x: '', (re.compile(r"""https?://(?:www\.)?(?:twitter|x)\.com/""", re.I),)), # skip twitter
	(lambda x: '', (re.compile(r"""https?://(?:www\.)?reddit\.com/""", re.I),)), # skip new-reddit
	(lambda x: '', (re.compile(r"""https?://jfr\.im/git/""", re.I),)), # skip my gitweb
	(lambda x: '', (re.compile(r"""https?://(?:www\.)?wunderground\.com/""", re.I),)), # skip wunderground, they time us out
)
regexes = other_regexes + (
	(goturl, url_regex),
)