# jfr.im git - dlqueue.git/blob - venv/lib/python3.11/site-packages/pip/_vendor/webencodings/__init__.py
"""
    This is a Python implementation of the `WHATWG Encoding standard
    <http://encoding.spec.whatwg.org/>`_. See README for details.

    :copyright: Copyright 2012 by Simon Sapin
    :license: BSD, see LICENSE for details.

"""
from __future__ import unicode_literals

import codecs

from .labels import LABELS
# Some names in Encoding are not valid Python aliases. Remap these.
PYTHON_NAMES = {
    'iso-8859-8-i': 'iso-8859-8',
    'x-mac-cyrillic': 'mac-cyrillic',
    'macintosh': 'mac-roman',
    'windows-874': 'cp874'}

# Encoding objects that lookup() has already built, keyed by canonical name.
# NOTE(review): this definition was missing from the recovered listing and is
# restored from its use in lookup() (``CACHE.get(name)`` / ``CACHE[name] = ...``).
CACHE = {}
def ascii_lower(string):
    r"""Transform (only) ASCII letters to lower case: A-Z is mapped to a-z.

    :param string: A Unicode string.
    :returns: A new Unicode string.

    This is used for `ASCII case-insensitive
    <http://encoding.spec.whatwg.org/#ascii-case-insensitive>`_
    matching of encoding labels.
    The same matching is also used, among other things,
    for `CSS keywords <http://dev.w3.org/csswg/css-values/#keywords>`_.

    This is different from the :meth:`~py:str.lower` method of Unicode strings
    which also affects non-ASCII characters,
    sometimes mapping them into the ASCII range:

        >>> keyword = u'Bac\N{KELVIN SIGN}ground'
        >>> assert keyword.lower() == u'background'
        >>> assert ascii_lower(keyword) != keyword.lower()
        >>> assert ascii_lower(keyword) == u'bac\N{KELVIN SIGN}ground'

    """
    # Lower-casing the UTF-8 *bytes* only touches ASCII letters, so every
    # non-ASCII character survives the round trip unchanged.
    # This turns out to be faster than unicode.translate()
    return string.encode('utf8').lower().decode('utf8')
def lookup(label):
    """
    Look for an encoding by its label.
    This is the spec’s `get an encoding
    <http://encoding.spec.whatwg.org/#concept-encoding-get>`_ algorithm.
    Supported labels are listed there.

    :param label: A string.
    :returns:
        An :class:`Encoding` object, or :obj:`None` for an unknown label.

    """
    # NOTE(review): the ``def`` line, the two ``is None`` guards, the ``else``
    # and the final ``return`` were missing from the recovered listing; they
    # are restored from the docstring contract and the surviving statements.

    # Only strip ASCII whitespace: U+0009, U+000A, U+000C, U+000D, and U+0020.
    label = ascii_lower(label.strip('\t\n\f\r '))
    name = LABELS.get(label)
    if name is None:
        return None
    encoding = CACHE.get(name)
    if encoding is None:
        if name == 'x-user-defined':
            # Imported on demand, only when this encoding is requested.
            from .x_user_defined import codec_info
        else:
            python_name = PYTHON_NAMES.get(name, name)
            # Any python_name value that gets to here should be valid.
            codec_info = codecs.lookup(python_name)
        encoding = Encoding(name, codec_info)
        # Remember the object so later lookups return the same instance.
        CACHE[name] = encoding
    return encoding
def _get_encoding(encoding_or_label):
    """
    Accept either an encoding object or label.

    :param encoding_or_label:
        An :class:`Encoding` object or a label string.
    :returns: An :class:`Encoding` object.
    :raises: :exc:`~exceptions.LookupError` for an unknown label.

    """
    # Anything exposing ``codec_info`` is treated as an encoding object
    # and handed back untouched.
    if hasattr(encoding_or_label, 'codec_info'):
        return encoding_or_label

    encoding = lookup(encoding_or_label)
    if encoding is None:
        raise LookupError('Unknown encoding label: %r' % encoding_or_label)
    return encoding
class Encoding(object):
    """Represents a character encoding such as UTF-8,
    that can be used for decoding or encoding.

    .. attribute:: name

        Canonical name of the encoding

    .. attribute:: codec_info

        The actual implementation of the encoding,
        a stdlib :class:`~codecs.CodecInfo` object.
        See :func:`codecs.register`.

    """
    def __init__(self, name, codec_info):
        self.name = name
        self.codec_info = codec_info

    def __repr__(self):
        return '<Encoding %s>' % self.name
#: The UTF-8 encoding. Should be used for new content and formats.
UTF8 = lookup('utf-8')

# Encodings returned by BOM detection for the two UTF-16 byte orders.
_UTF16LE = lookup('utf-16le')
_UTF16BE = lookup('utf-16be')
def decode(input, fallback_encoding, errors='replace'):
    """
    Decode a single string.

    :param input: A byte string
    :param fallback_encoding:
        An :class:`Encoding` object or a label string.
        The encoding to use if :obj:`input` does not have a BOM.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :return:
        A ``(output, encoding)`` tuple of a Unicode string
        and an :obj:`Encoding`.

    """
    # Fail early if `encoding` is an invalid label.
    fallback_encoding = _get_encoding(fallback_encoding)
    bom_encoding, input = _detect_bom(input)
    # A BOM, when present, takes precedence over the caller's fallback.
    encoding = bom_encoding or fallback_encoding
    return encoding.codec_info.decode(input, errors)[0], encoding
def _detect_bom(input):
    """Return (bom_encoding, input), with any BOM removed from the input."""
    if input.startswith(b'\xFF\xFE'):
        return _UTF16LE, input[2:]
    if input.startswith(b'\xFE\xFF'):
        return _UTF16BE, input[2:]
    if input.startswith(b'\xEF\xBB\xBF'):
        return UTF8, input[3:]
    # No BOM: the input is returned as-is and callers use their fallback
    # (``bom_encoding or fallback_encoding`` in decode()).
    return None, input
def encode(input, encoding=UTF8, errors='strict'):
    """
    Encode a single string.

    :param input: A Unicode string.
    :param encoding: An :class:`Encoding` object or a label string.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :return: A byte string.

    """
    # The codec's encode() returns a pair; its first item is the encoded bytes.
    return _get_encoding(encoding).codec_info.encode(input, errors)[0]
def iter_decode(input, fallback_encoding, errors='replace'):
    """
    "Pull"-based decoder.

    :param input:
        An iterable of byte strings.

        The input is first consumed just enough to determine the encoding
        based on the presence of a BOM,
        then consumed on demand when the return value is.
    :param fallback_encoding:
        An :class:`Encoding` object or a label string.
        The encoding to use if :obj:`input` does not have a BOM.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :returns:
        An ``(output, encoding)`` tuple.
        :obj:`output` is an iterable of Unicode strings,
        :obj:`encoding` is the :obj:`Encoding` that is being used.

    """
    decoder = IncrementalDecoder(fallback_encoding, errors)
    generator = _iter_decode_generator(input, decoder)
    # The generator's first item is the detected encoding, not decoded text.
    encoding = next(generator)
    return generator, encoding
def _iter_decode_generator(input, decoder):
    """Return a generator that first yields the :obj:`Encoding`,
    then yields output chunks as Unicode strings.

    :param input: An iterable of byte strings.
    :param decoder:
        An object with a ``decode(chunk, final=False)`` method and an
        ``encoding`` attribute, such as :class:`IncrementalDecoder`.

    """
    # NOTE(review): the loop headers, ``if output`` guards and several
    # ``yield`` lines were missing from the recovered listing; the control
    # flow below is restored from the surviving statements.
    decode = decoder.decode
    # Use one shared iterator so the second loop resumes where the first
    # one stopped instead of restarting the input.
    input = iter(input)
    for chunk in input:
        output = decode(chunk)
        if output:
            assert decoder.encoding is not None
            yield decoder.encoding
            yield output
            break
    else:
        # Input exhausted without determining the encoding
        output = decode(b'', final=True)
        assert decoder.encoding is not None
        yield decoder.encoding
        if output:
            yield output
        return

    for chunk in input:
        output = decode(chunk)
        if output:
            yield output
    # Flush whatever the decoder is still holding.
    output = decode(b'', final=True)
    if output:
        yield output
def iter_encode(input, encoding=UTF8, errors='strict'):
    """
    “Pull”-based encoder.

    :param input: An iterable of Unicode strings.
    :param encoding: An :class:`Encoding` object or a label string.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :returns: An iterable of byte strings.

    """
    # Fail early if `encoding` is an invalid label.
    # (The encoder is built here, before the generator is first advanced.)
    encode = IncrementalEncoder(encoding, errors).encode
    return _iter_encode_generator(input, encode)
def _iter_encode_generator(input, encode):
    """Yield the non-empty byte strings produced by encoding each chunk.

    :param input: An iterable of Unicode strings.
    :param encode: A callable accepting ``(chunk, final=False)``.

    """
    # NOTE(review): the loop header and the ``if output: yield output``
    # lines were missing from the recovered listing and are restored here.
    for chunk in input:
        output = encode(chunk)
        if output:
            yield output
    # Flush whatever the encoder is still holding.
    output = encode('', final=True)
    if output:
        yield output
class IncrementalDecoder(object):
    """
    “Push”-based decoder.

    :param fallback_encoding:
        An :class:`Encoding` object or a label string.
        The encoding to use if :obj:`input` does not have a BOM.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.

    """
    # NOTE(review): the ``_buffer`` / ``_decoder`` initialisation and the
    # "BOM not yet decidable" branch were missing from the recovered listing;
    # they are restored from the statements that read those attributes.
    def __init__(self, fallback_encoding, errors='replace'):
        # Fail early if `encoding` is an invalid label.
        self._fallback_encoding = _get_encoding(fallback_encoding)
        self._errors = errors
        # Bytes held back while it is still unclear whether a BOM is present.
        self._buffer = b''
        # Bound ``decode`` of the codec's incremental decoder, once chosen.
        self._decoder = None
        #: The actual :class:`Encoding` that is being used,
        #: or :obj:`None` if that is not determined yet.
        #: (Ie. if there is not enough input yet to determine
        #: if there is a BOM.)
        self.encoding = None  # Not known yet.

    def decode(self, input, final=False):
        """Decode one chunk of the input.

        :param input: A byte string.
        :param final:
            Indicate that no more input is available.
            Must be :obj:`True` if this is the last call.
        :returns: A Unicode string.

        """
        decoder = self._decoder
        if decoder is not None:
            # Encoding already settled: delegate straight to the codec.
            return decoder(input, final)

        input = self._buffer + input
        encoding, input = _detect_bom(input)
        if encoding is None:
            if len(input) < 3 and not final:  # Not enough data yet.
                self._buffer = input
                return ''
            else:  # No BOM
                encoding = self._fallback_encoding
        decoder = encoding.codec_info.incrementaldecoder(self._errors).decode
        self._decoder = decoder
        self.encoding = encoding
        return decoder(input, final)
class IncrementalEncoder(object):
    """
    “Push”-based encoder.

    :param encoding: An :class:`Encoding` object or a label string.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.

    .. method:: encode(input, final=False)

        :param input: A Unicode string.
        :param final:
            Indicate that no more input is available.
            Must be :obj:`True` if this is the last call.
        :returns: A byte string.

    """
    def __init__(self, encoding=UTF8, errors='strict'):
        encoding = _get_encoding(encoding)
        # ``encode`` is the bound method of the codec's own incremental
        # encoder, exposed directly as an instance attribute.
        self.encode = encoding.codec_info.incrementalencoder(errors).encode