venv/lib/python3.11/site-packages/pip/_vendor/urllib3/util/url.py

   1 from __future__ import absolute_import
   2
   3 import re
   4 from collections import namedtuple
   5
   6 from ..exceptions import LocationParseError
   7 from ..packages import six
   8
   9 url_attrs = ["scheme", "auth", "host", "port", "path", "query", "fragment"]
  10
  11 # We only want to normalize urls with an HTTP(S) scheme.
  12 # urllib3 infers URLs without a scheme (None) to be http.
  13 NORMALIZABLE_SCHEMES = ("http", "https", None)
  14
  15 # Almost all of these patterns were derived from the
  16 # 'rfc3986' module: https://github.com/python-hyper/rfc3986
  17 PERCENT_RE = re.compile(r"%[a-fA-F0-9]{2}")
  18 SCHEME_RE = re.compile(r"^(?:[a-zA-Z][a-zA-Z0-9+-]*:|/)")
  19 URI_RE = re.compile(
  20     r"^(?:([a-zA-Z][a-zA-Z0-9+.-]*):)?"
  21     r"(?://([^\\/?#]*))?"
  22     r"([^?#]*)"
  23     r"(?:\?([^#]*))?"
  24     r"(?:#(.*))?$",
  25     re.UNICODE | re.DOTALL,
  26 )
  27
  28 IPV4_PAT = r"(?:[0-9]{1,3}\.){3}[0-9]{1,3}"
  29 HEX_PAT = "[0-9A-Fa-f]{1,4}"
  30 LS32_PAT = "(?:{hex}:{hex}|{ipv4})".format(hex=HEX_PAT, ipv4=IPV4_PAT)
  31 _subs = {"hex": HEX_PAT, "ls32": LS32_PAT}
  32 _variations = [
  33     #                            6( h16 ":" ) ls32
  34     "(?:%(hex)s:){6}%(ls32)s",
  35     #                       "::" 5( h16 ":" ) ls32
  36     "::(?:%(hex)s:){5}%(ls32)s",
  37     # [               h16 ] "::" 4( h16 ":" ) ls32
  38     "(?:%(hex)s)?::(?:%(hex)s:){4}%(ls32)s",
  39     # [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
  40     "(?:(?:%(hex)s:)?%(hex)s)?::(?:%(hex)s:){3}%(ls32)s",
  41     # [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
  42     "(?:(?:%(hex)s:){0,2}%(hex)s)?::(?:%(hex)s:){2}%(ls32)s",
  43     # [ *3( h16 ":" ) h16 ] "::"    h16 ":"   ls32
  44     "(?:(?:%(hex)s:){0,3}%(hex)s)?::%(hex)s:%(ls32)s",
  45     # [ *4( h16 ":" ) h16 ] "::"              ls32
  46     "(?:(?:%(hex)s:){0,4}%(hex)s)?::%(ls32)s",
  47     # [ *5( h16 ":" ) h16 ] "::"              h16
  48     "(?:(?:%(hex)s:){0,5}%(hex)s)?::%(hex)s",
  49     # [ *6( h16 ":" ) h16 ] "::"
  50     "(?:(?:%(hex)s:){0,6}%(hex)s)?::",
  51 ]
  52
  53 UNRESERVED_PAT = r"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789._\-~"
  54 IPV6_PAT = "(?:" + "|".join([x % _subs for x in _variations]) + ")"
  55 ZONE_ID_PAT = "(?:%25|%)(?:[" + UNRESERVED_PAT + "]|%[a-fA-F0-9]{2})+"
  56 IPV6_ADDRZ_PAT = r"\[" + IPV6_PAT + r"(?:" + ZONE_ID_PAT + r")?\]"
  57 REG_NAME_PAT = r"(?:[^\[\]%:/?#]|%[a-fA-F0-9]{2})*"
  58 TARGET_RE = re.compile(r"^(/[^?#]*)(?:\?([^#]*))?(?:#.*)?$")
  59
  60 IPV4_RE = re.compile("^" + IPV4_PAT + "$")
  61 IPV6_RE = re.compile("^" + IPV6_PAT + "$")
  62 IPV6_ADDRZ_RE = re.compile("^" + IPV6_ADDRZ_PAT + "$")
  63 BRACELESS_IPV6_ADDRZ_RE = re.compile("^" + IPV6_ADDRZ_PAT[2:-2] + "$")
  64 ZONE_ID_RE = re.compile("(" + ZONE_ID_PAT + r")\]$")
  65
  66 _HOST_PORT_PAT = ("^(%s|%s|%s)(?::0*?(|0|[1-9][0-9]{0,4}))?$") % (
  67     REG_NAME_PAT,
  68     IPV4_PAT,
  69     IPV6_ADDRZ_PAT,
  70 )
  71 _HOST_PORT_RE = re.compile(_HOST_PORT_PAT, re.UNICODE | re.DOTALL)
  72
  73 UNRESERVED_CHARS = set(
  74     "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789._-~"
  75 )
  76 SUB_DELIM_CHARS = set("!$&'()*+,;=")
  77 USERINFO_CHARS = UNRESERVED_CHARS | SUB_DELIM_CHARS | {":"}
  78 PATH_CHARS = USERINFO_CHARS | {"@", "/"}
  79 QUERY_CHARS = FRAGMENT_CHARS = PATH_CHARS | {"?"}
  80
  81
  82 class Url(namedtuple("Url", url_attrs)):
  83     """
  84     Data structure for representing an HTTP URL. Used as a return value for
  85     :func:`parse_url`. Both the scheme and host are normalized as they are
  86     both case-insensitive according to RFC 3986.
  87     """
  88
  89     __slots__ = ()
  90
  91     def __new__(
  92         cls,
  93         scheme=None,
  94         auth=None,
  95         host=None,
  96         port=None,
  97         path=None,
  98         query=None,
  99         fragment=None,
 100     ):
 101         if path and not path.startswith("/"):
 102             path = "/" + path
 103         if scheme is not None:
 104             scheme = scheme.lower()
 105         return super(Url, cls).__new__(
 106             cls, scheme, auth, host, port, path, query, fragment
 107         )
 108
 109     @property
 110     def hostname(self):
 111         """For backwards-compatibility with urlparse. We're nice like that."""
 112         return self.host
 113
 114     @property
 115     def request_uri(self):
 116         """Absolute path including the query string."""
 117         uri = self.path or "/"
 118
 119         if self.query is not None:
 120             uri += "?" + self.query
 121
 122         return uri
 123
 124     @property
 125     def netloc(self):
 126         """Network location including host and port"""
 127         if self.port:
 128             return "%s:%d" % (self.host, self.port)
 129         return self.host
 130
 131     @property
 132     def url(self):
 133         """
 134         Convert self into a url
 135
 136         This function should more or less round-trip with :func:`.parse_url`. The
 137         returned url may not be exactly the same as the url inputted to
 138         :func:`.parse_url`, but it should be equivalent by the RFC (e.g., urls
 139         with a blank port will have : removed).
 140
 141         Example: ::
 142
 143             >>> U = parse_url('http://google.com/mail/')
 144             >>> U.url
 145             'http://google.com/mail/'
 146             >>> Url('http', 'username:password', 'host.com', 80,
 147             ... '/path', 'query', 'fragment').url
 148             'http://username:password@host.com:80/path?query#fragment'
 149         """
 150         scheme, auth, host, port, path, query, fragment = self
 151         url = u""
 152
 153         # We use "is not None" we want things to happen with empty strings (or 0 port)
 154         if scheme is not None:
 155             url += scheme + u"://"
 156         if auth is not None:
 157             url += auth + u"@"
 158         if host is not None:
 159             url += host
 160         if port is not None:
 161             url += u":" + str(port)
 162         if path is not None:
 163             url += path
 164         if query is not None:
 165             url += u"?" + query
 166         if fragment is not None:
 167             url += u"#" + fragment
 168
 169         return url
 170
 171     def __str__(self):
 172         return self.url
 173
 174
 175 def split_first(s, delims):
 176     """
 177     .. deprecated:: 1.25
 178
 179     Given a string and an iterable of delimiters, split on the first found
 180     delimiter. Return two split parts and the matched delimiter.
 181
 182     If not found, then the first part is the full input string.
 183
 184     Example::
 185
 186         >>> split_first('foo/bar?baz', '?/=')
 187         ('foo', 'bar?baz', '/')
 188         >>> split_first('foo/bar?baz', '123')
 189         ('foo/bar?baz', '', None)
 190
 191     Scales linearly with number of delims. Not ideal for large number of delims.
 192     """
 193     min_idx = None
 194     min_delim = None
 195     for d in delims:
 196         idx = s.find(d)
 197         if idx < 0:
 198             continue
 199
 200         if min_idx is None or idx < min_idx:
 201             min_idx = idx
 202             min_delim = d
 203
 204     if min_idx is None or min_idx < 0:
 205         return s, "", None
 206
 207     return s[:min_idx], s[min_idx + 1 :], min_delim
 208
 209
 210 def _encode_invalid_chars(component, allowed_chars, encoding="utf-8"):
 211     """Percent-encodes a URI component without reapplying
 212     onto an already percent-encoded component.
 213     """
 214     if component is None:
 215         return component
 216
 217     component = six.ensure_text(component)
 218
 219     # Normalize existing percent-encoded bytes.
 220     # Try to see if the component we're encoding is already percent-encoded
 221     # so we can skip all '%' characters but still encode all others.
 222     component, percent_encodings = PERCENT_RE.subn(
 223         lambda match: match.group(0).upper(), component
 224     )
 225
 226     uri_bytes = component.encode("utf-8", "surrogatepass")
 227     is_percent_encoded = percent_encodings == uri_bytes.count(b"%")
 228     encoded_component = bytearray()
 229
 230     for i in range(0, len(uri_bytes)):
 231         # Will return a single character bytestring on both Python 2 & 3
 232         byte = uri_bytes[i : i + 1]
 233         byte_ord = ord(byte)
 234         if (is_percent_encoded and byte == b"%") or (
 235             byte_ord < 128 and byte.decode() in allowed_chars
 236         ):
 237             encoded_component += byte
 238             continue
 239         encoded_component.extend(b"%" + (hex(byte_ord)[2:].encode().zfill(2).upper()))
 240
 241     return encoded_component.decode(encoding)
 242
 243
 244 def _remove_path_dot_segments(path):
 245     # See http://tools.ietf.org/html/rfc3986#section-5.2.4 for pseudo-code
 246     segments = path.split("/")  # Turn the path into a list of segments
 247     output = []  # Initialize the variable to use to store output
 248
 249     for segment in segments:
 250         # '.' is the current directory, so ignore it, it is superfluous
 251         if segment == ".":
 252             continue
 253         # Anything other than '..', should be appended to the output
 254         elif segment != "..":
 255             output.append(segment)
 256         # In this case segment == '..', if we can, we should pop the last
 257         # element
 258         elif output:
 259             output.pop()
 260
 261     # If the path starts with '/' and the output is empty or the first string
 262     # is non-empty
 263     if path.startswith("/") and (not output or output[0]):
 264         output.insert(0, "")
 265
 266     # If the path starts with '/.' or '/..' ensure we add one more empty
 267     # string to add a trailing '/'
 268     if path.endswith(("/.", "/..")):
 269         output.append("")
 270
 271     return "/".join(output)
 272
 273
 274 def _normalize_host(host, scheme):
 275     if host:
 276         if isinstance(host, six.binary_type):
 277             host = six.ensure_str(host)
 278
 279         if scheme in NORMALIZABLE_SCHEMES:
 280             is_ipv6 = IPV6_ADDRZ_RE.match(host)
 281             if is_ipv6:
 282                 # IPv6 hosts of the form 'a::b%zone' are encoded in a URL as
 283                 # such per RFC 6874: 'a::b%25zone'. Unquote the ZoneID
 284                 # separator as necessary to return a valid RFC 4007 scoped IP.
 285                 match = ZONE_ID_RE.search(host)
 286                 if match:
 287                     start, end = match.span(1)
 288                     zone_id = host[start:end]
 289
 290                     if zone_id.startswith("%25") and zone_id != "%25":
 291                         zone_id = zone_id[3:]
 292                     else:
 293                         zone_id = zone_id[1:]
 294                     zone_id = "%" + _encode_invalid_chars(zone_id, UNRESERVED_CHARS)
 295                     return host[:start].lower() + zone_id + host[end:]
 296                 else:
 297                     return host.lower()
 298             elif not IPV4_RE.match(host):
 299                 return six.ensure_str(
 300                     b".".join([_idna_encode(label) for label in host.split(".")])
 301                 )
 302     return host
 303
 304
 305 def _idna_encode(name):
 306     if name and any(ord(x) >= 128 for x in name):
 307         try:
 308             from pip._vendor import idna
 309         except ImportError:
 310             six.raise_from(
 311                 LocationParseError("Unable to parse URL without the 'idna' module"),
 312                 None,
 313             )
 314         try:
 315             return idna.encode(name.lower(), strict=True, std3_rules=True)
 316         except idna.IDNAError:
 317             six.raise_from(
 318                 LocationParseError(u"Name '%s' is not a valid IDNA label" % name), None
 319             )
 320     return name.lower().encode("ascii")
 321
 322
 323 def _encode_target(target):
 324     """Percent-encodes a request target so that there are no invalid characters"""
 325     path, query = TARGET_RE.match(target).groups()
 326     target = _encode_invalid_chars(path, PATH_CHARS)
 327     query = _encode_invalid_chars(query, QUERY_CHARS)
 328     if query is not None:
 329         target += "?" + query
 330     return target
 331
 332
 333 def parse_url(url):
 334     """
 335     Given a url, return a parsed :class:`.Url` namedtuple. Best-effort is
 336     performed to parse incomplete urls. Fields not provided will be None.
 337     This parser is RFC 3986 and RFC 6874 compliant.
 338
 339     The parser logic and helper functions are based heavily on
 340     work done in the ``rfc3986`` module.
 341
 342     :param str url: URL to parse into a :class:`.Url` namedtuple.
 343
 344     Partly backwards-compatible with :mod:`urlparse`.
 345
 346     Example::
 347
 348         >>> parse_url('http://google.com/mail/')
 349         Url(scheme='http', host='google.com', port=None, path='/mail/', ...)
 350         >>> parse_url('google.com:80')
 351         Url(scheme=None, host='google.com', port=80, path=None, ...)
 352         >>> parse_url('/foo?bar')
 353         Url(scheme=None, host=None, port=None, path='/foo', query='bar', ...)
 354     """
 355     if not url:
 356         # Empty
 357         return Url()
 358
 359     source_url = url
 360     if not SCHEME_RE.search(url):
 361         url = "//" + url
 362
 363     try:
 364         scheme, authority, path, query, fragment = URI_RE.match(url).groups()
 365         normalize_uri = scheme is None or scheme.lower() in NORMALIZABLE_SCHEMES
 366
 367         if scheme:
 368             scheme = scheme.lower()
 369
 370         if authority:
 371             auth, _, host_port = authority.rpartition("@")
 372             auth = auth or None
 373             host, port = _HOST_PORT_RE.match(host_port).groups()
 374             if auth and normalize_uri:
 375                 auth = _encode_invalid_chars(auth, USERINFO_CHARS)
 376             if port == "":
 377                 port = None
 378         else:
 379             auth, host, port = None, None, None
 380
 381         if port is not None:
 382             port = int(port)
 383             if not (0 <= port <= 65535):
 384                 raise LocationParseError(url)
 385
 386         host = _normalize_host(host, scheme)
 387
 388         if normalize_uri and path:
 389             path = _remove_path_dot_segments(path)
 390             path = _encode_invalid_chars(path, PATH_CHARS)
 391         if normalize_uri and query:
 392             query = _encode_invalid_chars(query, QUERY_CHARS)
 393         if normalize_uri and fragment:
 394             fragment = _encode_invalid_chars(fragment, FRAGMENT_CHARS)
 395
 396     except (ValueError, AttributeError):
 397         return six.raise_from(LocationParseError(source_url), None)
 398
 399     # For the sake of backwards compatibility we put empty
 400     # string values for path if there are any defined values
 401     # beyond the path in the URL.
 402     # TODO: Remove this when we break backwards compatibility.
 403     if not path:
 404         if query is not None or fragment is not None:
 405             path = ""
 406         else:
 407             path = None
 408
 409     # Ensure that each part of the URL is a `str` for
 410     # backwards compatibility.
 411     if isinstance(url, six.text_type):
 412         ensure_func = six.ensure_text
 413     else:
 414         ensure_func = six.ensure_str
 415
 416     def ensure_type(x):
 417         return x if x is None else ensure_func(x)
 418
 419     return Url(
 420         scheme=ensure_type(scheme),
 421         auth=ensure_type(auth),
 422         host=ensure_type(host),
 423         port=port,
 424         path=ensure_type(path),
 425         query=ensure_type(query),
 426         fragment=ensure_type(fragment),
 427     )
 428
 429
 430 def get_host(url):
 431     """
 432     Deprecated. Use :func:`parse_url` instead.
 433     """
 434     p = parse_url(url)
 435     return p.scheme or "http", p.hostname, p.port