1 from __future__
import absolute_import
4 from collections
import namedtuple
6 from ..exceptions
import LocationParseError
7 from ..packages
import six
# Field names of the ``Url`` namedtuple, in positional order.
url_attrs = ["scheme", "auth", "host", "port", "path", "query", "fragment"]

# We only want to normalize urls with an HTTP(S) scheme.
# urllib3 infers URLs without a scheme (None) to be http.
NORMALIZABLE_SCHEMES = ("http", "https", None)

# Almost all of these patterns were derived from the
# 'rfc3986' module: https://github.com/python-hyper/rfc3986

# A single percent-encoded octet such as "%2f" or "%2F".
PERCENT_RE = re.compile(r"%[a-fA-F0-9]{2}")

# Matches input that starts with "<scheme>:" or with "/". Unlike the scheme
# group of URI_RE, "." is not accepted here, so a bare "host.com:80" does not
# match this pattern.
SCHEME_RE = re.compile(r"^(?:[a-zA-Z][a-zA-Z0-9+-]*:|/)")

# Splits a URI reference into (scheme, authority, path, query, fragment).
# NOTE(review): only the scheme fragment and the flags were legible in the
# reviewed text; the four middle fragments follow the RFC 3986 Appendix B
# layout -- confirm against upstream urllib3.
URI_RE = re.compile(
    r"^(?:([a-zA-Z][a-zA-Z0-9+.-]*):)?"
    r"(?://([^\\/?#]*))?"
    r"([^?#]*)"
    r"(?:\?([^#]*))?"
    r"(?:#(.*))?$",
    re.UNICODE | re.DOTALL,
)
# Building blocks for the IPv6 grammar of RFC 3986 section 3.2.2.
IPV4_PAT = r"(?:[0-9]{1,3}\.){3}[0-9]{1,3}"
HEX_PAT = "[0-9A-Fa-f]{1,4}"
LS32_PAT = "(?:{hex}:{hex}|{ipv4})".format(hex=HEX_PAT, ipv4=IPV4_PAT)
_subs = {"hex": HEX_PAT, "ls32": LS32_PAT}
# One template per alternative of the IPv6address rule; the comment above each
# entry is the corresponding ABNF alternative.
_variations = [
    # 6( h16 ":" ) ls32
    "(?:%(hex)s:){6}%(ls32)s",
    # "::" 5( h16 ":" ) ls32
    "::(?:%(hex)s:){5}%(ls32)s",
    # [ h16 ] "::" 4( h16 ":" ) ls32
    "(?:%(hex)s)?::(?:%(hex)s:){4}%(ls32)s",
    # [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
    "(?:(?:%(hex)s:)?%(hex)s)?::(?:%(hex)s:){3}%(ls32)s",
    # [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
    "(?:(?:%(hex)s:){0,2}%(hex)s)?::(?:%(hex)s:){2}%(ls32)s",
    # [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32
    "(?:(?:%(hex)s:){0,3}%(hex)s)?::%(hex)s:%(ls32)s",
    # [ *4( h16 ":" ) h16 ] "::" ls32
    "(?:(?:%(hex)s:){0,4}%(hex)s)?::%(ls32)s",
    # [ *5( h16 ":" ) h16 ] "::" h16
    "(?:(?:%(hex)s:){0,5}%(hex)s)?::%(hex)s",
    # [ *6( h16 ":" ) h16 ] "::"
    "(?:(?:%(hex)s:){0,6}%(hex)s)?::",
]

# Written so it can be dropped inside a character class (note the escaped "-").
UNRESERVED_PAT = r"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789._\-~"
IPV6_PAT = "(?:" + "|".join([x % _subs for x in _variations]) + ")"
# Zone identifier introduced by either "%25" or a bare "%".
ZONE_ID_PAT = "(?:%25|%)(?:[" + UNRESERVED_PAT + "]|%[a-fA-F0-9]{2})+"
# Bracketed IPv6 literal with an optional zone identifier.
IPV6_ADDRZ_PAT = r"\[" + IPV6_PAT + r"(?:" + ZONE_ID_PAT + r")?\]"
REG_NAME_PAT = r"(?:[^\[\]%:/?#]|%[a-fA-F0-9]{2})*"
# Request target: a path starting with "/", an optional query, and an optional
# fragment that is matched but not captured.
TARGET_RE = re.compile(r"^(/[^?#]*)(?:\?([^#]*))?(?:#.*)?$")

IPV4_RE = re.compile("^" + IPV4_PAT + "$")
IPV6_RE = re.compile("^" + IPV6_PAT + "$")
IPV6_ADDRZ_RE = re.compile("^" + IPV6_ADDRZ_PAT + "$")
# The [2:-2] slice strips the leading r"\[" and the trailing r"\]".
BRACELESS_IPV6_ADDRZ_RE = re.compile("^" + IPV6_ADDRZ_PAT[2:-2] + "$")
# Captures the zone identifier that sits right before the closing bracket.
ZONE_ID_RE = re.compile("(" + ZONE_ID_PAT + r")\]$")

# Group 1 is the host, group 2 the port digits with leading zeros dropped
# (an empty string when the ":" is present but no digits follow).
# NOTE(review): the three format arguments were missing from the reviewed
# text; restored as reg-name, IPv4, bracketed IPv6 -- confirm against upstream.
_HOST_PORT_PAT = ("^(%s|%s|%s)(?::0*?(|0|[1-9][0-9]{0,4}))?$") % (
    REG_NAME_PAT,
    IPV4_PAT,
    IPV6_ADDRZ_PAT,
)
_HOST_PORT_RE = re.compile(_HOST_PORT_PAT, re.UNICODE | re.DOTALL)
# Character sets that may appear unescaped in each URL component. Each set
# extends the previous one; they are passed to _encode_invalid_chars() as the
# ``allowed_chars`` argument.
UNRESERVED_CHARS = set(
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789._-~"
)
SUB_DELIM_CHARS = set("!$&'()*+,;=")
USERINFO_CHARS = UNRESERVED_CHARS | SUB_DELIM_CHARS | {":"}
PATH_CHARS = USERINFO_CHARS | {"@", "/"}
# Query and fragment share one set object.
QUERY_CHARS = FRAGMENT_CHARS = PATH_CHARS | {"?"}
class Url(namedtuple("Url", url_attrs)):
    """
    Data structure for representing an HTTP URL. Used as a return value for
    :func:`parse_url`. Both the scheme and host are normalized as they are
    both case-insensitive according to RFC 3986.
    """

    # NOTE(review): ``__slots__``, the ``__new__`` signature, the property
    # decorators, some ``if`` guards and ``__str__`` were missing from the
    # reviewed text and have been restored -- confirm against upstream urllib3.
    __slots__ = ()

    def __new__(
        cls,
        scheme=None,
        auth=None,
        host=None,
        port=None,
        path=None,
        query=None,
        fragment=None,
    ):
        """Build a ``Url``; every field is optional and defaults to ``None``."""
        # A non-empty path always starts with "/".
        if path and not path.startswith("/"):
            path = "/" + path
        if scheme is not None:
            scheme = scheme.lower()
        return super(Url, cls).__new__(
            cls, scheme, auth, host, port, path, query, fragment
        )

    @property
    def hostname(self):
        """For backwards-compatibility with urlparse. We're nice like that."""
        return self.host

    @property
    def request_uri(self):
        """Absolute path including the query string."""
        uri = self.path or "/"

        if self.query is not None:
            uri += "?" + self.query

        return uri

    @property
    def netloc(self):
        """Network location including host and port"""
        if self.port:
            return "%s:%d" % (self.host, self.port)
        return self.host

    @property
    def url(self):
        """
        Convert self into a url

        This function should more or less round-trip with :func:`.parse_url`. The
        returned url may not be exactly the same as the url inputted to
        :func:`.parse_url`, but it should be equivalent by the RFC (e.g., urls
        with a blank port will have : removed).

        Example: ::

            >>> U = parse_url('http://google.com/mail/')
            >>> U.url
            'http://google.com/mail/'
            >>> Url('http', 'username:password', 'host.com', 80,
            ... '/path', 'query', 'fragment').url
            'http://username:password@host.com:80/path?query#fragment'
        """
        scheme, auth, host, port, path, query, fragment = self
        url = u""

        # We test with "is not None" (rather than truthiness) so that empty
        # strings and a port of 0 are still emitted.
        if scheme is not None:
            url += scheme + u"://"
        if auth is not None:
            url += auth + u"@"
        if host is not None:
            url += host
        if port is not None:
            url += u":" + str(port)
        if path is not None:
            url += path
        if query is not None:
            url += u"?" + query
        if fragment is not None:
            url += u"#" + fragment

        return url

    def __str__(self):
        return self.url
def split_first(s, delims):
    """
    Given a string and an iterable of delimiters, split on the first found
    delimiter. Return two split parts and the matched delimiter.

    If not found, then the first part is the full input string.

    Example::

        >>> split_first('foo/bar?baz', '?/=')
        ('foo', 'bar?baz', '/')
        >>> split_first('foo/bar?baz', '123')
        ('foo/bar?baz', '', None)

    Scales linearly with number of delims. Not ideal for large number of delims.

    Exactly one character is skipped after the split point, so each delimiter
    is expected to be a single character.
    """
    min_idx = None
    min_delim = None
    for d in delims:
        idx = s.find(d)
        if idx < 0:
            # This delimiter does not occur in ``s``.
            continue

        # Strict "<" keeps the earliest-listed delimiter on a tie.
        if min_idx is None or idx < min_idx:
            min_idx = idx
            min_delim = d

    # ``min_idx`` is only ever assigned a non-negative index, so checking for
    # None is sufficient to detect "no delimiter found".
    if min_idx is None:
        return s, "", None

    return s[:min_idx], s[min_idx + 1 :], min_delim
def _encode_invalid_chars(component, allowed_chars, encoding="utf-8"):
    """Percent-encodes a URI component without reapplying
    onto an already percent-encoded component.

    :param component: Text or bytes of the component, or ``None``.
    :param allowed_chars: Container of single characters that may be left
        unescaped.
    :param encoding: Codec used to decode the resulting byte buffer back to
        text.
    :return: The encoded component as text, or ``None`` if ``component``
        is ``None``.
    """
    # NOTE(review): the early return, the ``byte_ord`` assignment and the
    # ``continue`` were missing from the reviewed text and have been restored.
    if component is None:
        return component

    component = six.ensure_text(component)

    # Normalize existing percent-encoded bytes.
    # Try to see if the component we're encoding is already percent-encoded
    # so we can skip all '%' characters but still encode all others.
    component, percent_encodings = PERCENT_RE.subn(
        lambda match: match.group(0).upper(), component
    )

    uri_bytes = component.encode("utf-8", "surrogatepass")
    # True only when every "%" in the input begins a valid escape sequence.
    is_percent_encoded = percent_encodings == uri_bytes.count(b"%")
    encoded_component = bytearray()

    for i in range(0, len(uri_bytes)):
        # Will return a single character bytestring on both Python 2 & 3
        byte = uri_bytes[i : i + 1]
        byte_ord = ord(byte)
        if (is_percent_encoded and byte == b"%") or (
            byte_ord < 128 and byte.decode() in allowed_chars
        ):
            encoded_component += byte
            continue
        # Everything else becomes an upper-case "%XX" escape.
        encoded_component.extend(b"%" + (hex(byte_ord)[2:].encode().zfill(2).upper()))

    return encoded_component.decode(encoding)
def _remove_path_dot_segments(path):
    """Return ``path`` with its "." and ".." segments resolved.

    A ".." that has no preceding segment to cancel is dropped. A leading "/"
    is preserved, and a path ending in "/." or "/.." keeps a trailing "/".
    """
    # See http://tools.ietf.org/html/rfc3986#section-5.2.4 for pseudo-code
    segments = path.split("/")  # Turn the path into a list of segments
    output = []  # Initialize the variable to use to store output

    for segment in segments:
        # '.' is the current directory, so ignore it, it is superfluous
        if segment == ".":
            continue
        # Anything other than '..', should be appended to the output
        elif segment != "..":
            output.append(segment)
        # In this case segment == '..', if we can, we should pop the last
        # element
        elif output:
            output.pop()

    # If the path starts with '/' and the output is empty or the first string
    # is non-empty, re-insert the empty leading segment so that the joined
    # result still starts with '/'.
    if path.startswith("/") and (not output or output[0]):
        output.insert(0, "")

    # If the path ends with '/.' or '/..' ensure we add one more empty
    # string to add a trailing '/'
    if path.endswith(("/.", "/..")):
        output.append("")

    return "/".join(output)
def _normalize_host(host, scheme):
    """Normalize ``host`` when ``scheme`` is one of NORMALIZABLE_SCHEMES.

    Bracketed IPv6 literals are lower-cased and their zone identifier is
    rewritten to start with a bare "%"; IPv4 literals are returned unchanged;
    any other host is IDNA-encoded label by label. A falsy ``host``, or a
    host under any other scheme, is returned as given (bytes are first
    converted with ``six.ensure_str``).
    """
    # NOTE(review): the ``if host`` / ``if is_ipv6`` / ``if match`` guards,
    # their ``else`` branches and the final ``return host`` were missing from
    # the reviewed text and have been restored -- confirm against upstream.
    if host:
        if isinstance(host, six.binary_type):
            host = six.ensure_str(host)

        if scheme in NORMALIZABLE_SCHEMES:
            is_ipv6 = IPV6_ADDRZ_RE.match(host)
            if is_ipv6:
                # IPv6 hosts of the form 'a::b%zone' are encoded in a URL as
                # such per RFC 6874: 'a::b%25zone'. Unquote the ZoneID
                # separator as necessary to return a valid RFC 4007 scoped IP.
                match = ZONE_ID_RE.search(host)
                if match:
                    start, end = match.span(1)
                    zone_id = host[start:end]

                    # Strip the separator ("%25" or "%"). A zone id that is
                    # exactly "%25" is read as "%" followed by the zone "25".
                    if zone_id.startswith("%25") and zone_id != "%25":
                        zone_id = zone_id[3:]
                    else:
                        zone_id = zone_id[1:]
                    zone_id = "%" + _encode_invalid_chars(zone_id, UNRESERVED_CHARS)
                    return host[:start].lower() + zone_id + host[end:]
                else:
                    return host.lower()
            elif not IPV4_RE.match(host):
                return six.ensure_str(
                    b".".join([_idna_encode(label) for label in host.split(".")])
                )
    return host
def _idna_encode(name):
    """Encode a single host label to lower-cased ASCII bytes.

    Labels containing any character with a code point of 128 or above are
    encoded with the ``idna`` package; all other labels are simply
    lower-cased and ASCII-encoded.

    :raises LocationParseError: If ``idna`` cannot be imported or rejects
        the label.
    """
    # NOTE(review): the try/except scaffolding and the ``six.raise_from``
    # calls were missing from the reviewed text and have been restored.
    if name and any(ord(x) >= 128 for x in name):
        try:
            from pip._vendor import idna
        except ImportError:
            six.raise_from(
                LocationParseError("Unable to parse URL without the 'idna' module"),
                None,
            )
        try:
            return idna.encode(name.lower(), strict=True, std3_rules=True)
        except idna.IDNAError:
            six.raise_from(
                LocationParseError(u"Name '%s' is not a valid IDNA label" % name), None
            )
    return name.lower().encode("ascii")
def _encode_target(target):
    """Percent-encodes a request target so that there are no invalid characters

    ``target`` must match TARGET_RE (a path starting with "/", optionally
    followed by a query and a fragment); otherwise ``TARGET_RE.match`` returns
    ``None`` and the ``.groups()`` call raises ``AttributeError``. Any
    fragment is dropped from the result.
    """
    path, query = TARGET_RE.match(target).groups()
    target = _encode_invalid_chars(path, PATH_CHARS)
    query = _encode_invalid_chars(query, QUERY_CHARS)
    if query is not None:
        target += "?" + query
    # NOTE(review): this return was missing from the reviewed text.
    return target
def parse_url(url):
    """
    Given a url, return a parsed :class:`.Url` namedtuple. Best-effort is
    performed to parse incomplete urls. Fields not provided will be None.
    This parser is RFC 3986 and RFC 6874 compliant.

    The parser logic and helper functions are based heavily on
    work done in the ``rfc3986`` module.

    :param str url: URL to parse into a :class:`.Url` namedtuple.

    Partly backwards-compatible with :mod:`urlparse`.

    Example::

        >>> parse_url('http://google.com/mail/')
        Url(scheme='http', host='google.com', port=None, path='/mail/', ...)
        >>> parse_url('google.com:80')
        Url(scheme=None, host='google.com', port=80, path=None, ...)
        >>> parse_url('/foo?bar')
        Url(scheme=None, host=None, port=None, path='/foo', query='bar', ...)

    :raises LocationParseError: If the URL cannot be parsed or the port is
        outside the range 0-65535.
    """
    # NOTE(review): the ``def`` line, the empty-URL guard, ``try:``, several
    # branch bodies and the final ``Url(...)`` call were missing from the
    # reviewed text and have been restored -- confirm against upstream urllib3.
    if not url:
        # Empty
        return Url()

    source_url = url
    if not SCHEME_RE.search(url):
        # No scheme and no leading "/": prefix "//" so that URI_RE parses the
        # leading part as the authority rather than as a path.
        url = "//" + url

    try:
        scheme, authority, path, query, fragment = URI_RE.match(url).groups()
        normalize_uri = scheme is None or scheme.lower() in NORMALIZABLE_SCHEMES

        if scheme:
            scheme = scheme.lower()

        if authority:
            auth, _, host_port = authority.rpartition("@")
            auth = auth or None
            host, port = _HOST_PORT_RE.match(host_port).groups()
            if auth and normalize_uri:
                auth = _encode_invalid_chars(auth, USERINFO_CHARS)
            if port == "":
                port = None
        else:
            auth, host, port = None, None, None

        if port is not None:
            port = int(port)
            if not (0 <= port <= 65535):
                raise LocationParseError(url)

        host = _normalize_host(host, scheme)

        if normalize_uri and path:
            path = _remove_path_dot_segments(path)
            path = _encode_invalid_chars(path, PATH_CHARS)
        if normalize_uri and query:
            query = _encode_invalid_chars(query, QUERY_CHARS)
        if normalize_uri and fragment:
            fragment = _encode_invalid_chars(fragment, FRAGMENT_CHARS)

    except (ValueError, AttributeError):
        # A failed regex match surfaces as AttributeError (``None.groups()``).
        # Report the URL as the caller supplied it, without the "//" prefix.
        return six.raise_from(LocationParseError(source_url), None)

    # For the sake of backwards compatibility we put empty
    # string values for path if there are any defined values
    # beyond the path in the URL.
    # TODO: Remove this when we break backwards compatibility.
    if not path:
        if query is not None or fragment is not None:
            path = ""
        else:
            path = None

    # Ensure that each part of the URL is a `str` for
    # backwards compatibility.
    if isinstance(url, six.text_type):
        ensure_func = six.ensure_text
    else:
        ensure_func = six.ensure_str

    def ensure_type(x):
        return x if x is None else ensure_func(x)

    return Url(
        scheme=ensure_type(scheme),
        auth=ensure_type(auth),
        host=ensure_type(host),
        port=port,
        path=ensure_type(path),
        query=ensure_type(query),
        fragment=ensure_type(fragment),
    )
def get_host(url):
    """
    Deprecated. Use :func:`parse_url` instead.

    :return: A ``(scheme, hostname, port)`` tuple; the scheme falls back to
        ``"http"`` when the parsed URL has none.
    """
    # NOTE(review): the ``def`` line and the ``parse_url`` call were missing
    # from the reviewed text and have been restored.
    p = parse_url(url)
    return p.scheme or "http", p.hostname, p.port