venv/lib/python3.11/site-packages/werkzeug/urls.py

   1 from __future__ import annotations
   2
   3 import codecs
   4 import re
   5 import typing as t
   6 from urllib.parse import quote
   7 from urllib.parse import unquote
   8 from urllib.parse import urlencode
   9 from urllib.parse import urlsplit
  10 from urllib.parse import urlunsplit
  11
  12 from .datastructures import iter_multi_items
  13
  14
  15 def _codec_error_url_quote(e: UnicodeError) -> tuple[str, int]:
  16     """Used in :func:`uri_to_iri` after unquoting to re-quote any
  17     invalid bytes.
  18     """
  19     # the docs state that UnicodeError does have these attributes,
  20     # but mypy isn't picking them up
  21     out = quote(e.object[e.start : e.end], safe="")  # type: ignore
  22     return out, e.end  # type: ignore
  23
  24
# Register the handler under a name so it can be passed as the ``errors``
# argument when decoding, e.g. ``unquote(..., errors="werkzeug.url_quote")``.
codecs.register_error("werkzeug.url_quote", _codec_error_url_quote)
  26
  27
  28 def _make_unquote_part(name: str, chars: str) -> t.Callable[[str], str]:
  29     """Create a function that unquotes all percent encoded characters except those
  30     given. This allows working with unquoted characters if possible while not changing
  31     the meaning of a given part of a URL.
  32     """
  33     choices = "|".join(f"{ord(c):02X}" for c in sorted(chars))
  34     pattern = re.compile(f"((?:%(?:{choices}))+)", re.I)
  35
  36     def _unquote_partial(value: str) -> str:
  37         parts = iter(pattern.split(value))
  38         out = []
  39
  40         for part in parts:
  41             out.append(unquote(part, "utf-8", "werkzeug.url_quote"))
  42             out.append(next(parts, ""))
  43
  44         return "".join(out)
  45
  46     _unquote_partial.__name__ = f"_unquote_{name}"
  47     return _unquote_partial
  48
  49
# Characters that should remain quoted in URL parts.
# Based on https://url.spec.whatwg.org/#percent-encoded-bytes
# Always keep all controls (0x00-0x1F, 0x7F), space (0x20), and % (0x25) quoted.
_always_unsafe = bytes((*range(0x21), 0x25, 0x7F)).decode()
# Each URL part additionally keeps its own extra set of characters quoted.
_unquote_fragment = _make_unquote_part("fragment", _always_unsafe)
_unquote_query = _make_unquote_part("query", _always_unsafe + "&=+#")
_unquote_path = _make_unquote_part("path", _always_unsafe + "/?#")
_unquote_user = _make_unquote_part("user", _always_unsafe + ":@/?#")
  57 _unquote_user = _make_unquote_part("user", _always_unsafe + ":@/?#")
  58
  59
  60 def uri_to_iri(uri: str) -> str:
  61     """Convert a URI to an IRI. All valid UTF-8 characters are unquoted,
  62     leaving all reserved and invalid characters quoted. If the URL has
  63     a domain, it is decoded from Punycode.
  64
  65     >>> uri_to_iri("http://xn--n3h.net/p%C3%A5th?q=%C3%A8ry%DF")
  66     'http://\\u2603.net/p\\xe5th?q=\\xe8ry%DF'
  67
  68     :param uri: The URI to convert.
  69
  70     .. versionchanged:: 3.0
  71         Passing a tuple or bytes, and the ``charset`` and ``errors`` parameters,
  72         are removed.
  73
  74     .. versionchanged:: 2.3
  75         Which characters remain quoted is specific to each part of the URL.
  76
  77     .. versionchanged:: 0.15
  78         All reserved and invalid characters remain quoted. Previously,
  79         only some reserved characters were preserved, and invalid bytes
  80         were replaced instead of left quoted.
  81
  82     .. versionadded:: 0.6
  83     """
  84     parts = urlsplit(uri)
  85     path = _unquote_path(parts.path)
  86     query = _unquote_query(parts.query)
  87     fragment = _unquote_fragment(parts.fragment)
  88
  89     if parts.hostname:
  90         netloc = _decode_idna(parts.hostname)
  91     else:
  92         netloc = ""
  93
  94     if ":" in netloc:
  95         netloc = f"[{netloc}]"
  96
  97     if parts.port:
  98         netloc = f"{netloc}:{parts.port}"
  99
 100     if parts.username:
 101         auth = _unquote_user(parts.username)
 102
 103         if parts.password:
 104             password = _unquote_user(parts.password)
 105             auth = f"{auth}:{password}"
 106
 107         netloc = f"{auth}@{netloc}"
 108
 109     return urlunsplit((parts.scheme, netloc, path, query, fragment))
 110
 111
 112 def iri_to_uri(iri: str) -> str:
 113     """Convert an IRI to a URI. All non-ASCII and unsafe characters are
 114     quoted. If the URL has a domain, it is encoded to Punycode.
 115
 116     >>> iri_to_uri('http://\\u2603.net/p\\xe5th?q=\\xe8ry%DF')
 117     'http://xn--n3h.net/p%C3%A5th?q=%C3%A8ry%DF'
 118
 119     :param iri: The IRI to convert.
 120
 121     .. versionchanged:: 3.0
 122         Passing a tuple or bytes, the ``charset`` and ``errors`` parameters,
 123         and the ``safe_conversion`` parameter, are removed.
 124
 125     .. versionchanged:: 2.3
 126         Which characters remain unquoted is specific to each part of the URL.
 127
 128     .. versionchanged:: 0.15
 129         All reserved characters remain unquoted. Previously, only some reserved
 130         characters were left unquoted.
 131
 132     .. versionchanged:: 0.9.6
 133        The ``safe_conversion`` parameter was added.
 134
 135     .. versionadded:: 0.6
 136     """
 137     parts = urlsplit(iri)
 138     # safe = https://url.spec.whatwg.org/#url-path-segment-string
 139     # as well as percent for things that are already quoted
 140     path = quote(parts.path, safe="%!$&'()*+,/:;=@")
 141     query = quote(parts.query, safe="%!$&'()*+,/:;=?@")
 142     fragment = quote(parts.fragment, safe="%!#$&'()*+,/:;=?@")
 143
 144     if parts.hostname:
 145         netloc = parts.hostname.encode("idna").decode("ascii")
 146     else:
 147         netloc = ""
 148
 149     if ":" in netloc:
 150         netloc = f"[{netloc}]"
 151
 152     if parts.port:
 153         netloc = f"{netloc}:{parts.port}"
 154
 155     if parts.username:
 156         auth = quote(parts.username, safe="%!$&'()*+,;=")
 157
 158         if parts.password:
 159             password = quote(parts.password, safe="%!$&'()*+,;=")
 160             auth = f"{auth}:{password}"
 161
 162         netloc = f"{auth}@{netloc}"
 163
 164     return urlunsplit((parts.scheme, netloc, path, query, fragment))
 165
 166
 167 def _invalid_iri_to_uri(iri: str) -> str:
 168     """The URL scheme ``itms-services://`` must contain the ``//`` even though it does
 169     not have a host component. There may be other invalid schemes as well. Currently,
 170     responses will always call ``iri_to_uri`` on the redirect ``Location`` header, which
 171     removes the ``//``. For now, if the IRI only contains ASCII and does not contain
 172     spaces, pass it on as-is. In Werkzeug 3.0, this should become a
 173     ``response.process_location`` flag.
 174
 175     :meta private:
 176     """
 177     try:
 178         iri.encode("ascii")
 179     except UnicodeError:
 180         pass
 181     else:
 182         if len(iri.split(None, 1)) == 1:
 183             return iri
 184
 185     return iri_to_uri(iri)
 186
 187
 188 def _decode_idna(domain: str) -> str:
 189     try:
 190         data = domain.encode("ascii")
 191     except UnicodeEncodeError:
 192         # If the domain is not ASCII, it's decoded already.
 193         return domain
 194
 195     try:
 196         # Try decoding in one shot.
 197         return data.decode("idna")
 198     except UnicodeDecodeError:
 199         pass
 200
 201     # Decode each part separately, leaving invalid parts as punycode.
 202     parts = []
 203
 204     for part in data.split(b"."):
 205         try:
 206             parts.append(part.decode("idna"))
 207         except UnicodeDecodeError:
 208             parts.append(part.decode("ascii"))
 209
 210     return ".".join(parts)
 211
 212
 213 def _urlencode(query: t.Mapping[str, str] | t.Iterable[tuple[str, str]]) -> str:
 214     items = [x for x in iter_multi_items(query) if x[1] is not None]
 215     # safe = https://url.spec.whatwg.org/#percent-encoded-bytes
 216     return urlencode(items, safe="!$'()*,/:;?@")