]>
jfr.im git - dlqueue.git/blob - venv/lib/python3.11/site-packages/pip/_vendor/idna/core.py
5 from typing
import Union
, Optional
6 from .intranges
import intranges_contain
8 _virama_combining_class
= 9
9 _alabel_prefix
= b
'xn--'
10 _unicode_dots_re
= re
.compile('[\u002e\u3002\uff0e\uff61]')
class IDNAError(UnicodeError):
    """Base exception for all IDNA-encoding related problems."""
class IDNABidiError(IDNAError):
    """Exception when bidirectional requirements are not satisfied."""
class InvalidCodepoint(IDNAError):
    """Exception when a disallowed or unallocated codepoint is used."""
class InvalidCodepointContext(IDNAError):
    """Exception when the codepoint is not valid in the context it is used."""
32 def _combining_class(cp
: int) -> int:
33 v
= unicodedata
.combining(chr(cp
))
35 if not unicodedata
.name(chr(cp
)):
36 raise ValueError('Unknown character in unicodedata')
def _is_script(cp: str, script: str) -> bool:
    """Return whether character *cp* falls in the ranges stored for *script*.

    :param cp: a single character; its code point is obtained with ``ord``.
    :param script: key used to look up the ranges in ``idnadata.scripts``.
    :returns: the result of ``intranges_contain`` for that code point.
    """
    return intranges_contain(ord(cp), idnadata.scripts[script])
42 def _punycode(s
: str) -> bytes:
43 return s
.encode('punycode')
45 def _unot(s
: int) -> str:
46 return 'U+{:04X}'.format(s
)
49 def valid_label_length(label
: Union
[bytes, str]) -> bool:
55 def valid_string_length(label
: Union
[bytes, str], trailing_dot
: bool) -> bool:
56 if len(label
) > (254 if trailing_dot
else 253):
61 def check_bidi(label
: str, check_ltr
: bool = False) -> bool:
62 # Bidi rules should only be applied if string contains RTL characters
64 for (idx
, cp
) in enumerate(label
, 1):
65 direction
= unicodedata
.bidirectional(cp
)
67 # String likely comes from a newer version of Unicode
68 raise IDNABidiError('Unknown directionality in label {} at position {}'.format(repr(label
), idx
))
69 if direction
in ['R', 'AL', 'AN']:
71 if not bidi_label
and not check_ltr
:
75 direction
= unicodedata
.bidirectional(label
[0])
76 if direction
in ['R', 'AL']:
78 elif direction
== 'L':
81 raise IDNABidiError('First codepoint in label {} must be directionality L, R or AL'.format(repr(label
)))
84 number_type
= None # type: Optional[str]
85 for (idx
, cp
) in enumerate(label
, 1):
86 direction
= unicodedata
.bidirectional(cp
)
90 if not direction
in ['R', 'AL', 'AN', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN', 'NSM']:
91 raise IDNABidiError('Invalid direction for codepoint at position {} in a right-to-left label'.format(idx
))
93 if direction
in ['R', 'AL', 'EN', 'AN']:
95 elif direction
!= 'NSM':
98 if direction
in ['AN', 'EN']:
100 number_type
= direction
102 if number_type
!= direction
:
103 raise IDNABidiError('Can not mix numeral types in a right-to-left label')
106 if not direction
in ['L', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN', 'NSM']:
107 raise IDNABidiError('Invalid direction for codepoint at position {} in a left-to-right label'.format(idx
))
109 if direction
in ['L', 'EN']:
111 elif direction
!= 'NSM':
115 raise IDNABidiError('Label ends with illegal codepoint directionality')
120 def check_initial_combiner(label
: str) -> bool:
121 if unicodedata
.category(label
[0])[0] == 'M':
122 raise IDNAError('Label begins with an illegal combining character')
126 def check_hyphen_ok(label
: str) -> bool:
127 if label
[2:4] == '--':
128 raise IDNAError('Label has disallowed hyphens in 3rd and 4th position')
129 if label
[0] == '-' or label
[-1] == '-':
130 raise IDNAError('Label must not start or end with a hyphen')
def check_nfc(label: str) -> None:
    """Verify that *label* is already in Unicode Normalization Form C.

    :param label: the label text to check.
    :raises IDNAError: if NFC-normalizing *label* yields a different string.
    """
    if unicodedata.normalize('NFC', label) != label:
        raise IDNAError('Label must be in Normalization Form C')
139 def valid_contextj(label
: str, pos
: int) -> bool:
140 cp_value
= ord(label
[pos
])
142 if cp_value
== 0x200c:
145 if _combining_class(ord(label
[pos
- 1])) == _virama_combining_class
:
149 for i
in range(pos
-1, -1, -1):
150 joining_type
= idnadata
.joining_types
.get(ord(label
[i
]))
151 if joining_type
== ord('T'):
153 if joining_type
in [ord('L'), ord('D')]:
161 for i
in range(pos
+1, len(label
)):
162 joining_type
= idnadata
.joining_types
.get(ord(label
[i
]))
163 if joining_type
== ord('T'):
165 if joining_type
in [ord('R'), ord('D')]:
170 if cp_value
== 0x200d:
173 if _combining_class(ord(label
[pos
- 1])) == _virama_combining_class
:
182 def valid_contexto(label
: str, pos
: int, exception
: bool = False) -> bool:
183 cp_value
= ord(label
[pos
])
185 if cp_value
== 0x00b7:
186 if 0 < pos
< len(label
)-1:
187 if ord(label
[pos
- 1]) == 0x006c and ord(label
[pos
+ 1]) == 0x006c:
191 elif cp_value
== 0x0375:
192 if pos
< len(label
)-1 and len(label
) > 1:
193 return _is_script(label
[pos
+ 1], 'Greek')
196 elif cp_value
== 0x05f3 or cp_value
== 0x05f4:
198 return _is_script(label
[pos
- 1], 'Hebrew')
201 elif cp_value
== 0x30fb:
205 if _is_script(cp
, 'Hiragana') or _is_script(cp
, 'Katakana') or _is_script(cp
, 'Han'):
209 elif 0x660 <= cp_value
<= 0x669:
211 if 0x6f0 <= ord(cp
) <= 0x06f9:
215 elif 0x6f0 <= cp_value
<= 0x6f9:
217 if 0x660 <= ord(cp
) <= 0x0669:
224 def check_label(label
: Union
[str, bytes, bytearray
]) -> None:
225 if isinstance(label
, (bytes, bytearray
)):
226 label
= label
.decode('utf-8')
228 raise IDNAError('Empty Label')
231 check_hyphen_ok(label
)
232 check_initial_combiner(label
)
234 for (pos
, cp
) in enumerate(label
):
236 if intranges_contain(cp_value
, idnadata
.codepoint_classes
['PVALID']):
238 elif intranges_contain(cp_value
, idnadata
.codepoint_classes
['CONTEXTJ']):
240 if not valid_contextj(label
, pos
):
241 raise InvalidCodepointContext('Joiner {} not allowed at position {} in {}'.format(
242 _unot(cp_value
), pos
+1, repr(label
)))
244 raise IDNAError('Unknown codepoint adjacent to joiner {} at position {} in {}'.format(
245 _unot(cp_value
), pos
+1, repr(label
)))
246 elif intranges_contain(cp_value
, idnadata
.codepoint_classes
['CONTEXTO']):
247 if not valid_contexto(label
, pos
):
248 raise InvalidCodepointContext('Codepoint {} not allowed at position {} in {}'.format(_unot(cp_value
), pos
+1, repr(label
)))
250 raise InvalidCodepoint('Codepoint {} at position {} of {} not allowed'.format(_unot(cp_value
), pos
+1, repr(label
)))
255 def alabel(label
: str) -> bytes:
257 label_bytes
= label
.encode('ascii')
259 if not valid_label_length(label_bytes
):
260 raise IDNAError('Label too long')
262 except UnicodeEncodeError:
266 raise IDNAError('No Input')
270 label_bytes
= _punycode(label
)
271 label_bytes
= _alabel_prefix
+ label_bytes
273 if not valid_label_length(label_bytes
):
274 raise IDNAError('Label too long')
279 def ulabel(label
: Union
[str, bytes, bytearray
]) -> str:
280 if not isinstance(label
, (bytes, bytearray
)):
282 label_bytes
= label
.encode('ascii')
283 except UnicodeEncodeError:
289 label_bytes
= label_bytes
.lower()
290 if label_bytes
.startswith(_alabel_prefix
):
291 label_bytes
= label_bytes
[len(_alabel_prefix
):]
293 raise IDNAError('Malformed A-label, no Punycode eligible content found')
294 if label_bytes
.decode('ascii')[-1] == '-':
295 raise IDNAError('A-label must not end with a hyphen')
297 check_label(label_bytes
)
298 return label_bytes
.decode('ascii')
301 label
= label_bytes
.decode('punycode')
303 raise IDNAError('Invalid A-label')
308 def uts46_remap(domain
: str, std3_rules
: bool = True, transitional
: bool = False) -> str:
309 """Re-map the characters in the string according to UTS46 processing."""
310 from .uts46data
import uts46data
313 for pos
, char
in enumerate(domain
):
314 code_point
= ord(char
)
316 uts46row
= uts46data
[code_point
if code_point
< 256 else
317 bisect
.bisect_left(uts46data
, (code_point
, 'Z')) - 1]
319 replacement
= None # type: Optional[str]
320 if len(uts46row
) == 3:
321 replacement
= uts46row
[2] # type: ignore
323 (status
== 'D' and not transitional
) or
324 (status
== '3' and not std3_rules
and replacement
is None)):
326 elif replacement
is not None and (status
== 'M' or
327 (status
== '3' and not std3_rules
) or
328 (status
== 'D' and transitional
)):
329 output
+= replacement
333 raise InvalidCodepoint(
334 'Codepoint {} not allowed at position {} in {}'.format(
335 _unot(code_point
), pos
+ 1, repr(domain
)))
337 return unicodedata
.normalize('NFC', output
)
340 def encode(s
: Union
[str, bytes, bytearray
], strict
: bool = False, uts46
: bool = False, std3_rules
: bool = False, transitional
: bool = False) -> bytes:
341 if isinstance(s
, (bytes, bytearray
)):
343 s
= s
.decode('ascii')
344 except UnicodeDecodeError:
345 raise IDNAError('should pass a unicode string to the function rather than a byte string.')
347 s
= uts46_remap(s
, std3_rules
, transitional
)
351 labels
= s
.split('.')
353 labels
= _unicode_dots_re
.split(s
)
354 if not labels
or labels
== ['']:
355 raise IDNAError('Empty domain')
364 raise IDNAError('Empty label')
367 s
= b
'.'.join(result
)
368 if not valid_string_length(s
, trailing_dot
):
369 raise IDNAError('Domain too long')
373 def decode(s
: Union
[str, bytes, bytearray
], strict
: bool = False, uts46
: bool = False, std3_rules
: bool = False) -> str:
375 if isinstance(s
, (bytes, bytearray
)):
376 s
= s
.decode('ascii')
377 except UnicodeDecodeError:
378 raise IDNAError('Invalid ASCII in A-label')
380 s
= uts46_remap(s
, std3_rules
, False)
384 labels
= _unicode_dots_re
.split(s
)
386 labels
= s
.split('.')
387 if not labels
or labels
== ['']:
388 raise IDNAError('Empty domain')
397 raise IDNAError('Empty label')
400 return '.'.join(result
)