# Erebus IRC bot - Author: Conny Sjoblom
# vim: fileencoding=utf-8
# URL Checker
# This file is released into the public domain; see http://unlicense.org/

# module info
modinfo = {
    'author': 'Erebus Team',
    'license': 'public domain',
    'compatible': [0],
    'depends': [],
    'softdeps': [],
}

# http://embed.ly/tools/generator

# preamble
import modlib
lib = modlib.modlib(__name__)
modstart = lib.modstart
modstop = lib.modstop

# module code
import sys
if sys.version_info.major < 3:
    stringbase = basestring
    import urllib2
    import urlparse
    import HTMLParser
    html = HTMLParser.HTMLParser()
    from BeautifulSoup import BeautifulSoup
    import httplib as http_client  # py2 name for http.client, so the InvalidURL catch in _do_request works on both
else:
    stringbase = str
    import urllib.request as urllib2
    import urllib.parse as urlparse
    import html
    from bs4 import BeautifulSoup
    import http.client as http_client

import re, json, datetime

try:
    import aia
    aia_session = aia.AIASession()
    # aia is broken on capath systems, needs cafile to work
    aia_session._context.load_verify_locations(cafile='/etc/ssl/certs/ca-certificates.crt')
    aia_session._trusted = {
        aia.openssl_get_cert_info(ca_der)["subject"]: ca_der
        for ca_der in aia_session._context.get_ca_certs(True)
    }
    print("aia loaded")
except ImportError as e:
    print(repr(e))
    aia = None
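
# A note on the retry logic: when a site serves an incomplete chain (a leaf
# certificate without its intermediates), verification fails with "unable to
# get local issuer certificate". The aia library follows the CA Issuers URL in
# the certificate's Authority Information Access extension to fetch the missing
# intermediates and build a verifiable SSLContext; _do_request() below retries
# with that context when it sees this specific verify error.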

hostmask_regex = re.compile(r'^(.*)!(.*)@(.*)$')

def parser_hostmask(hostmask):
    if isinstance(hostmask, dict):
        return hostmask

    nick = None
    user = None
    host = None

    if hostmask is not None:
        match = hostmask_regex.match(hostmask)

        if not match:
            nick = hostmask
        else:
            nick = match.group(1)
            user = match.group(2)
            host = match.group(3)

    return {
        'nick': nick,
        'user': user,
        'host': host
    }
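
# For illustration, a typical nick!user@host mask parses as:
#   >>> parser_hostmask('alice!ident@host.example.com')
#   {'nick': 'alice', 'user': 'ident', 'host': 'host.example.com'}
# Anything that doesn't look like a full mask is treated as a bare nick.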

class SmartRedirectHandler(urllib2.HTTPRedirectHandler):
    def http_error_301(self, req, fp, code, msg, headers):
        result = urllib2.HTTPRedirectHandler.http_error_301(
            self, req, fp, code, msg, headers)
        result.status = code
        return result

    def http_error_302(self, req, fp, code, msg, headers):
        result = urllib2.HTTPRedirectHandler.http_error_302(
            self, req, fp, code, msg, headers)
        result.status = code
        return result
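
# The stock HTTPRedirectHandler follows redirects silently; SmartRedirectHandler
# additionally stamps the 301/302 code onto the final response as .status, so a
# redirect remains visible on the object _do_request() returns.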

def _get_blocked_chans():
    return lib.parent.cfg.get('urls', 'blocked', '').split(',')
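
# A hypothetical config sketch (the section and key names come from the cfg
# calls in this module; the channel names and values are made up):
#   [urls]
#   blocked = #nobots,#serious
#   limit = 2
#   multiline = 0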

def process_line(line):
    responses = []
    num_found = 0
    limit = lib.parent.cfg.getint('urls', 'limit', 2)
    for action, group in regexes:
        for regex in group:
            for match in regex.findall(line):
                if match:
                    num_found += 1
                    if num_found > limit:
                        return responses
                    if isinstance(match, stringbase):
                        resp = action(match)
                    else:
                        resp = action(*match)
                    if resp is not None and resp != "":
                        responses.append(resp)
    return responses
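
# process_line() feeds every regex match to its handler and collects the
# replies, stopping once 'limit' matches have been seen on one line. Empty and
# None replies still count toward the limit, so a skipped link (e.g. Twitter)
# burns two slots: one for its skip pattern and one for goturl() returning
# None on it.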

@lib.hooknum("PRIVMSG")
def privmsg_hook(bot, textline):
    user = parser_hostmask(textline[1:textline.find(' ')])
    chan = textline.split()[2]

    if chan in _get_blocked_chans(): return

    try:
        line = textline.split(None, 3)[3][1:]
    except IndexError:
        line = ''

    responses = process_line(line)
    send_response(bot, chan, responses)

def send_response(bot, chan, responses):
    if len(responses) > 0:
        if lib.parent.cfg.getboolean('urls', 'multiline'):
            for r in responses:
                bot.msg(chan, r, True)
        else:
            bot.msg(chan, ' | '.join(responses), True)

def unescape(line):
    return re.sub(r'\s+', ' ', html.unescape(line))
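
# Collapses whitespace runs and decodes HTML entities, e.g.:
#   >>> unescape('Tom &amp;\n\tJerry')
#   'Tom & Jerry'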

def gotspotify(type, track):
    url = 'http://ws.spotify.com/lookup/1/?uri=spotify:%s:%s' % (type, track)
    xml = urllib2.urlopen(url).read()
    soup = BeautifulSoup(xml)  # NB: BS3's convertEntities kwarg is gone in bs4; unescape() below handles entities
    lookup_type = soup.contents[2].name

    if lookup_type == 'track':
        name = soup.find('name').string
        album_name = soup.find('album').find('name').string
        artist_name = soup.find('artist').find('name').string
        popularity = soup.find('popularity')
        if popularity:
            popularity = float(popularity.string)*100
        else:
            popularity = 0  # a missing popularity field would otherwise crash the %d format below
        length = float(soup.find('length').string)
        minutes = int(length)//60  # floor division; plain / would yield a float like 3.5 on py3
        seconds = int(length)%60

        return unescape('Track: %s - %s / %s %s:%.2d %2d%%' % (artist_name, name, album_name, minutes, seconds, popularity))

    elif lookup_type == 'album':
        album_name = soup.find('album').find('name').string
        artist_name = soup.find('artist').find('name').string
        released = soup.find('released').string
        return unescape('Album: %s - %s - %s' % (artist_name, album_name, released))

    else:
        return 'Unsupported type.'

def _yt_duration(s):
    mo = re.match(r'P(\d+D)?T(\d+H)?(\d+M)?(\d+S)?', s)
    pcs = [x for x in mo.groups() if x]
    return ''.join(pcs).lower()
def _yt_date(s, f):
    mo = re.match(r'(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2})\.(\d+)Z', s)
    return datetime.datetime(*(int(x) for x in mo.groups())).strftime(f)
def _yt_round(n):
    n = float(n)
    if n >= 10**12:
        return '%.1ft' % (n/10**12)
    elif n >= 10**9:
        return '%.1fb' % (n/10**9)
    elif n >= 10**6:
        return '%.1fm' % (n/10**6)
    elif n >= 10**3:
        return '%.1fk' % (n/10**3)
    else:
        return int(n)
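
# Reference shapes for the helpers above:
#   >>> _yt_duration('PT1H2M3S')  # ISO 8601 duration as returned by the API
#   '1h2m3s'
#   >>> _yt_date('2014-08-10T14:22:53.000Z', '%b %d %Y')
#   'Aug 10 2014'
#   >>> _yt_round(1234567)
#   '1.2m'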

def gotyoutube(url):
    url_data = urlparse.urlparse(url)
    query = urlparse.parse_qs(url_data.query)
    video = query["v"][0]
    api_url = 'https://www.googleapis.com/youtube/v3/videos?part=snippet,contentDetails,statistics&id=%s&key=%s' % (video, lib.parent.cfg.get('urls', 'api_key'))
    try:
        respdata = urllib2.urlopen(api_url).read()
        v = json.loads(respdata)
        v = v['items'][0]

        return unescape(lib.parent.cfg.get('urls', 'yt_format', "\002%(author)s\002: \037%(title)s\037 [%(duration)s, uploaded %(uploaded)s, %(views)s v/%(likes)s l/%(dislikes)s d]") % {
            'title': v['snippet']['title'],
            'author': v['snippet']['channelTitle'],
            'duration': _yt_duration(v['contentDetails']['duration']),
            'uploaded': _yt_date(v['snippet']['publishedAt'], lib.parent.cfg.get('urls', 'yt_date_format', '%b %d %Y')),
            'views': _yt_round(v['statistics']['viewCount']),
            'likes': _yt_round(v['statistics']['likeCount']),
            'dislikes': _yt_round(v['statistics']['dislikeCount']),
        })
    except urllib2.HTTPError as e:
        if e.getcode() == 403:
            return 'API limit exceeded'
        else:
            return str(e)
    except IndexError:
        return 'no results'
    except Exception as e:
        return str(e)

def gottwitch(uri):
    url = 'https://api.twitch.tv/helix/streams?user_login=%s' % uri.split('/')[0]
    opener = urllib2.build_opener()
    opener.addheaders = [('Client-ID', lib.parent.cfg.get('urls', 'twitch_api_key'))]
    respdata = opener.open(url).read()
    twitch = json.loads(respdata)['data']
    try:
        # TODO: add current game.
        return unescape('\037%s\037 is %s (%s)' % (twitch[0]['user_name'], twitch[0]['type'], twitch[0]['title']))
    except (IndexError, KeyError):  # empty 'data' means the channel is not live
        return 'Channel offline.'

def _humanize_bytes(b):
    b = int(b)
    i = 0
    table = " kMGTPEZYRQ"
    while b > 1024:
        i += 1
        b /= 1024.0
    if i == 0:
        return "%dB" % (b)
    else:
        return "%.2f%siB" % (b, table[i])
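
# Formats byte counts with binary prefixes, e.g.:
#   >>> _humanize_bytes(4)
#   '4B'
#   >>> _humanize_bytes(5*1024*1024)
#   '5.00MiB'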

def _do_request(url, try_aia=False):
    """
    Return value is a tuple consisting of:
    - the HTTPResponse object, or a string on error. Empty string -> no response.
    - and a flag indicating whether AIA was used
    """
    try:
        request = urllib2.Request(url, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
            'Sec-Ch-Ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
            'Sec-Ch-Ua-Mobile': '?0',
            'Sec-Ch-Ua-Platform': '"Linux"',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'same-origin',
            'Sec-Fetch-User': '?1',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Language': 'en-US,en;q=0.9',
            'Upgrade-Insecure-Requests': '1'
        })
    except ValueError:
        return '', False
    if try_aia:
        try:
            opener = urllib2.build_opener(urllib2.HTTPSHandler(context=aia_session.ssl_context_from_url(url)), SmartRedirectHandler())
        except aia.AIAError as e:
            return 'Request error: %s.%s: %s' % (e.__module__, e.__class__.__name__, e.args[0]), True
    else:
        opener = urllib2.build_opener(SmartRedirectHandler())

    # Send request and handle errors
    try:
        response = opener.open(request, timeout=2)
    except http_client.InvalidURL as e: # urllib.request drives http.client under the hood, so its InvalidURL (e.g. a malformed port) surfaces here
        return '', False
    except urllib2.HTTPError as e:
        return 'Request error: %s %s' % (e.code, e.reason), False
    except urllib2.URLError as e:
        if "certificate verify failed: unable to get local issuer certificate" in str(e.reason):
            if aia: # Retry with AIA enabled, if module is present
                return _do_request(url, True)
            else:
                lib.parent.log('urls', '?', 'If the site is not serving the certificate chain, installing the aia library might make this request work: pip install aia')
                return 'Request error: site may have broken TLS configuration (%s)' % (e.reason), False
        else:
            return 'Request error: %s' % (e.reason), False
    except TimeoutError as e:
        return 'Request error: request timed out', False
    except Exception as e:
        return 'Unknown error: %s %r' % (type(e).__name__, e.args), False

    return response, try_aia
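
# Flow summary: the first attempt runs without AIA; if verification fails with
# "unable to get local issuer certificate" and the aia module is importable,
# _do_request() calls itself once with try_aia=True so the missing
# intermediates get fetched. The second tuple element lets goturl() add its
# "[AIA] " marker when that fallback was used.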


def goturl(url):
    output = []
    for _, group in other_regexes:
        for regex in group:
            if regex.match(url):
                return None

    response, used_aia = _do_request(url)
    if isinstance(response, stringbase):
        return response

    # Try to add type and length headers to reply
    c_type_fields = response.getheader('Content-Type', '').split(';')
    c_type = c_type_fields.pop(0)
    c_charset = None
    for f in c_type_fields:
        f = f.strip()
        if len(f) > 8 and f[0:8] == 'charset=':
            c_charset = f[8:]
    c_len = response.getheader('Content-Length')
    if c_type != '':
        output.append("[%s] " % (c_type))
    else:
        output.append("[no type] ")
    if c_type != "text/html": # else length will be provided by HTML code below
        if c_len is not None:
            output.append("[%s] " % (_humanize_bytes(c_len)))
        else:
            output.append("[no length] ")

    if used_aia:
        output.append("[AIA] ")

    # Try to add title if HTML
    if c_type == 'text/html':
        try:
            responsebody = response.read(1024*1024)
        except Exception as e:
            output.append('Error reading response body: %s %r' % (type(e).__name__, e.args))
        else:
            if c_len is not None and len(responsebody) != int(c_len): # did we read a different amount than Content-Length?
                if response.read(1): # there's more data, we just aren't reading it
                    output.append("[read %s; Content-Length %s] " % (_humanize_bytes(len(responsebody)), _humanize_bytes(c_len)))
                else:
                    output.append("[actual %s; Content-Length %s] " % (_humanize_bytes(len(responsebody)), _humanize_bytes(c_len)))
            else: # Content-Length = amount read
                output.append("[%s] " % (_humanize_bytes(len(responsebody))))
            try:
                soup = BeautifulSoup(responsebody, from_encoding=c_charset)
                if soup.title:
                    output.append('Title: ' + unescape('%s' % (soup.find('title').string.strip())))
                else:
                    output.append('No title')
            except Exception as e:
                output.append('Title error: %s %r ' % (type(e).__name__, e.args))

    return ''.join(output)

url_regex = (
    re.compile(r'https?://(?:[^/\s.]+\.)+[a-z0-9-]+(?:/[^\s\]>)}]+)?', re.I),
)
other_regexes = (
    (lambda x: '', (re.compile(r"""https?://(?:www\.)?(?:twitter|x)\.com/""", re.I),)), # skip twitter
    (lambda x: '', (re.compile(r"""https?://(?:www\.)?reddit\.com/""", re.I),)), # skip new-reddit
    (lambda x: '', (re.compile(r"""https?://jfr\.im/git/""", re.I),)), # skip my gitweb
    (lambda x: '', (re.compile(r"""https?://(?:www\.)?wunderground\.com/""", re.I),)), # skip wunderground, they time us out
)
regexes = other_regexes + (
    (goturl, url_regex),
)
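
# Dispatch example: the skip patterns run first (their handlers return '', so
# matches are counted but never reported), then url_regex hands anything else
# to goturl(). A message like "check https://example.com/ out" ends up calling
# goturl('https://example.com') via process_line().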