assert find_links("http://abc") == ("%s", ["http://abc"])
assert find_links("t http://abc") == ("t %s", ["http://abc"])
assert find_links("http://abc t") == ("%s t", ["http://abc"])
- assert find_links("1 http://a 2 http://b 3") == ("1 %s 2 %s 3",
+ assert find_links("1 http://a 2 http://b 3") == ("1 %s 2 %s 3",
["http://a", "http://b"])
assert find_links("%") == ("%%", [])
assert find_links("(http://abc)") == ("(%s)", ["http://abc"])
def start_server(*resp):
"""HTTP server replying with the given responses to the expected
requests."""
- def url(port, path):
+ def url(port, path):
return 'http://%s:%s%s' % (socket.gethostname(), port, path)
-
+
responses = list(reversed(resp))
-
+
# Request handler for the server built by start_server: every HEAD request
# consumes the next canned response from the enclosing `responses` list.
class MyHandler(BaseHTTPServer.BaseHTTPRequestHandler):
def do_HEAD(self):
# `responses` holds the start_server arguments in reverse order, so pop()
# hands them back in the order they were originally passed.
response = responses.pop()
# list(...) takes a snapshot of the header items before iterating them.
for header, value in list(response.headers.items()):
self.send_header(header, value)
self.end_headers()
# NOTE(review): no status line is sent in the lines visible here -- confirm
# whether a send_response() call was elided from this excerpt.
-
+
httpd = SocketServer.TCPServer(("", 0), MyHandler)
t = threading.Thread(target=httpd.serve_forever)
t.setDaemon(True)
port = httpd.server_address[1]
yield functools.partial(url, port)
httpd.shutdown()
-
+
def test_follow_redirects_direct_link():
link = "/resource"
with start_server(Response(link, 200, {})) as url:
redirected = "/redirected"
link = "/resource"
with start_server(
- Response(link, 301, {"Location": redirected}),
+ Response(link, 301, {"Location": redirected}),
Response(redirected, 200, {})) as url:
assert url(redirected) == follow_redirects(url(link))
-
+
def test_follow_redirects_unavailable():
link = "/resource"
with start_server(Response(link, 404, {})) as url:
unavailable = "/unavailable"
link = "/resource"
with start_server(
- Response(link, 301, {"Location": unavailable}),
+ Response(link, 301, {"Location": unavailable}),
Response(unavailable, 404, {})) as url:
assert url(unavailable) == follow_redirects(url(link))
def test_follow_redirects_no_where():
    """follow_redirects hands back http://links.nowhere/ unchanged."""
    nowhere = "http://links.nowhere/"
    resolved = follow_redirects(nowhere)
    assert nowhere == resolved
-
+
def test_follow_redirects_link_to_nowhere():
unavailable = "http://links.nowhere/"
link = "/resource"
redirected = "/redirected"
filtered = "http://dont-follow/"
with start_server(
- Response(link, 301, {"Location": redirected}),
+ Response(link, 301, {"Location": redirected}),
Response(redirected, 301, {"Location": filtered})) as url:
hosts = [socket.gethostname()]
assert filtered == follow_redirects(url(link), hosts)
redirected = "/redirected"
link = "/resource"
with start_server(
- Response(link, 301, {"Location": redirected}),
+ Response(link, 301, {"Location": redirected}),
Response(redirected, 200, {})) as url:
hosts = [socket.gethostname()]
assert url(redirected) == follow_redirects(url(link), hosts)
redirected = "/redirected"
link = "/resource"
with start_server(
- Response(link, 301, {"Location": redirected}),
+ Response(link, 301, {"Location": redirected}),
Response(redirected, 200, {})) as url:
fmt = "before %s after"
line = fmt % url(link)
l = line.replace("%", "%%")
regex = "(https?://[^ )]+)"
return (
- re.sub(regex, "%s", l),
+ re.sub(regex, "%s", l),
[m.group(1) for m in re.finditer(regex, l)])
-
+
def follow_redirects(link, sites= None):
"""Follow redirects for the link as long as the redirects are on the given
sites and return the resolved link."""
def follow(url):
    """Tell whether redirects for ``url`` may be followed.

    ``sites`` is read from the enclosing scope.  When it is ``None`` no
    restriction applies and every URL is accepted; otherwise the URL is
    accepted only if its hostname is contained in ``sites``.
    """
    # Identity test rather than ``== None``: equality can be overridden by
    # the object's __eq__, and only a real None means "unrestricted".
    return sites is None or urlparse.urlparse(url).hostname in sites
-
+
class RedirectHandler(urllib2.HTTPRedirectHandler):
def __init__(self):
self.last_url = None
self, req, fp, code, msg, hdrs, newurl)
r.get_method = lambda : 'HEAD'
return r
-
+
if not follow(link):
return link
redirect_handler = RedirectHandler()
p = set(
m.group(1) for m in re.finditer("\s*([^,\s]+)\s*,?\s*", list_of_hosts))
return p
-
+