import re
import sys
import time
+import socket
try:
from html.entities import name2codepoint
l = line.replace("%", "%%")
regex = "(https?://[^ )]+)"
return (
- re.sub(regex, "%s", l),
+ re.sub(regex, "%s", l),
[m.group(1) for m in re.finditer(regex, l)])
-
+
def follow_redirects(link, sites= None):
"""Follow directs for the link as long as the redirects are on the given
sites and return the resolved link."""
def follow(url):
    """Return whether redirects starting at `url` may be followed.

    Every URL is accepted when the enclosing `sites` is None; otherwise the
    URL's hostname must be a member of `sites`.
    """
    # Identity test against the None singleton: `== None` would dispatch to
    # a custom __eq__ on whatever container was passed as `sites`.
    return sites is None or urlparse.urlparse(url).hostname in sites
-
+
class RedirectHandler(urllib2.HTTPRedirectHandler):
def __init__(self):
self.last_url = None
self, req, fp, code, msg, hdrs, newurl)
r.get_method = lambda : 'HEAD'
return r
-
+
if not follow(link):
return link
redirect_handler = RedirectHandler()
req = urllib2.Request(link)
req.get_method = lambda : 'HEAD'
try:
- with contextlib.closing(opener.open(req)) as site:
+ with contextlib.closing(opener.open(req,timeout=1)) as site:
return site.url
- except (urllib2.HTTPError, urllib2.URLError):
+ except (urllib2.HTTPError, urllib2.URLError, socket.timeout):
return redirect_handler.last_url if redirect_handler.last_url else link
def expand_line(line, sites):
p = set(
m.group(1) for m in re.finditer("\s*([^,\s]+)\s*,?\s*", list_of_hosts))
return p
-
+