[dlqueue.git] / venv / lib / python3.11 / site-packages / pip / _vendor / webencodings / __init__.py

# coding: utf-8
"""

    webencodings
    ~~~~~~~~~~~~

    This is a Python implementation of the `WHATWG Encoding standard
    <http://encoding.spec.whatwg.org/>`_. See README for details.

    :copyright: Copyright 2012 by Simon Sapin
    :license: BSD, see LICENSE for details.

"""

from __future__ import unicode_literals

import codecs

from .labels import LABELS


VERSION = '0.5.1'


# Some names in Encoding are not valid Python aliases. Remap these.
# Keys are canonical encoding names as found in LABELS; values are the
# codec names handed to codecs.lookup() by lookup(). Names that are absent
# from this table are passed to codecs.lookup() unchanged.
PYTHON_NAMES = {
    'iso-8859-8-i': 'iso-8859-8',
    'x-mac-cyrillic': 'mac-cyrillic',
    'macintosh': 'mac-roman',
    'windows-874': 'cp874'}

# Maps a canonical encoding name to the Encoding object that lookup() built
# for it, so that repeated lookups of the same name return the same object.
CACHE = {}


def ascii_lower(string):
    r"""Transform (only) ASCII letters to lower case: A-Z is mapped to a-z.

    :param string: A Unicode string.
    :returns: A new Unicode string.

    This is used for `ASCII case-insensitive
    <http://encoding.spec.whatwg.org/#ascii-case-insensitive>`_
    matching of encoding labels.
    The same matching is also used, among other things,
    for `CSS keywords <http://dev.w3.org/csswg/css-values/#keywords>`_.

    This is different from the :meth:`~py:str.lower` method of Unicode strings
    which also affect non-ASCII characters,
    sometimes mapping them into the ASCII range:

        >>> keyword = u'Bac\N{KELVIN SIGN}ground'
        >>> assert keyword.lower() == u'background'
        >>> assert ascii_lower(keyword) != keyword.lower()
        >>> assert ascii_lower(keyword) == u'bac\N{KELVIN SIGN}ground'

    Strings containing lone surrogate code points are accepted: the
    surrogates are passed through unchanged rather than making the
    UTF-8 round-trip fail.

    """
    # Lower-casing the UTF-8 bytes only affects A-Z: bytes.lower() is
    # ASCII-only, and every byte of a multi-byte UTF-8 sequence is >= 0x80.
    # This turns out to be faster than unicode.translate().
    # 'surrogatepass' keeps a lone surrogate from raising UnicodeEncodeError,
    # so an odd label is simply reported as unknown by the caller.
    encoded = string.encode('utf8', 'surrogatepass')
    return encoded.lower().decode('utf8', 'surrogatepass')


def lookup(label):
    """
    Resolve an encoding label to an :class:`Encoding`.

    Implements the `get an encoding
    <http://encoding.spec.whatwg.org/#concept-encoding-get>`_ algorithm
    of the Encoding standard, which also lists the supported labels.

    :param label: A string.
    :returns:
        An :class:`Encoding` object, or :obj:`None` for an unknown label.

    """
    # Only ASCII whitespace is stripped: U+0009, U+000A, U+000C, U+000D
    # and U+0020. A bare ``strip()`` would also remove other whitespace.
    normalized = ascii_lower(label.strip('\t\n\f\r '))
    name = LABELS.get(normalized)
    if name is None:
        return None

    cached = CACHE.get(name)
    if cached is not None:
        return cached

    if name == 'x-user-defined':
        # This codec is provided by a sibling module; import it on demand.
        from .x_user_defined import codec_info
    else:
        # Any name that reaches this point should be a valid Python codec
        # once the remapping table has been applied.
        codec_info = codecs.lookup(PYTHON_NAMES.get(name, name))

    created = Encoding(name, codec_info)
    CACHE[name] = created
    return created


def _get_encoding(encoding_or_label):
    """
    Accept either an encoding object or label.

    :param encoding: An :class:`Encoding` object or a label string.
    :returns: An :class:`Encoding` object.
    :raises: :exc:`~exceptions.LookupError` for an unknown label.

    """
    if hasattr(encoding_or_label, 'codec_info'):
        return encoding_or_label

    encoding = lookup(encoding_or_label)
    if encoding is None:
        raise LookupError('Unknown encoding label: %r' % encoding_or_label)
    return encoding


class Encoding(object):
    """Represents a character encoding such as UTF-8,
    usable both for decoding and for encoding.

    .. attribute:: name

        Canonical name of the encoding

    .. attribute:: codec_info

        The actual implementation of the encoding,
        a stdlib :class:`~codecs.CodecInfo` object.
        See :func:`codecs.register`.

    """
    def __init__(self, name, codec_info):
        # Both values are stored as given, without validation.
        self.codec_info = codec_info
        self.name = name

    def __repr__(self):
        return '<Encoding {0}>'.format(self.name)


#: The UTF-8 encoding. Should be used for new content and formats.
#: Also the default ``encoding`` argument of the encoding functions
#: in this module.
UTF8 = lookup('utf-8')

# UTF-16 encodings, resolved once at import time. _detect_bom() returns
# them when the input starts with the corresponding byte order mark.
_UTF16LE = lookup('utf-16le')
_UTF16BE = lookup('utf-16be')


def decode(input, fallback_encoding, errors='replace'):
    """
    Decode a single byte string, honouring a leading BOM if there is one.

    :param input: A byte string
    :param fallback_encoding:
        An :class:`Encoding` object or a label string.
        The encoding to use if :obj:`input` does not have a BOM.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :return:
        A ``(output, encoding)`` tuple of a Unicode string
        and an :obj:`Encoding`.

    """
    # Resolve the fallback before anything else, so that an invalid label
    # fails early even when a BOM would have made it unnecessary.
    fallback = _get_encoding(fallback_encoding)
    bom_encoding, payload = _detect_bom(input)
    if bom_encoding:
        chosen = bom_encoding
    else:
        chosen = fallback
    # The codec returns a tuple; only its first item, the text, is wanted.
    text = chosen.codec_info.decode(payload, errors)[0]
    return text, chosen


def _detect_bom(input):
    """Return (bom_encoding, input), with any BOM removed from the input."""
    if input.startswith(b'\xFF\xFE'):
        return _UTF16LE, input[2:]
    if input.startswith(b'\xFE\xFF'):
        return _UTF16BE, input[2:]
    if input.startswith(b'\xEF\xBB\xBF'):
        return UTF8, input[3:]
    return None, input


def encode(input, encoding=UTF8, errors='strict'):
    """
    Encode a single Unicode string to bytes.

    :param input: A Unicode string.
    :param encoding: An :class:`Encoding` object or a label string.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :return: A byte string.

    """
    codec_info = _get_encoding(encoding).codec_info
    # The codec returns a tuple; only its first item, the bytes, is wanted.
    encoded = codec_info.encode(input, errors)
    return encoded[0]


def iter_decode(input, fallback_encoding, errors='replace'):
    """
    "Pull"-based decoder.

    :param input:
        An iterable of byte strings.

        The input is first consumed just enough to determine the encoding
        based on the presence of a BOM,
        then consumed on demand as the returned iterable is consumed.
    :param fallback_encoding:
        An :class:`Encoding` object or a label string.
        The encoding to use if :obj:`input` does not have a BOM.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :returns:
        An ``(output, encoding)`` tuple.
        :obj:`output` is an iterable of Unicode strings,
        :obj:`encoding` is the :obj:`Encoding` that is being used.

    """
    chunks = _iter_decode_generator(
        input, IncrementalDecoder(fallback_encoding, errors))
    # The generator's first item is the Encoding, not text; take it off
    # here so that the caller only iterates over decoded strings.
    detected = next(chunks)
    return chunks, detected


def _iter_decode_generator(input, decoder):
    """Return a generator that first yields the :obj:`Encoding`,
    then yields output chukns as Unicode strings.

    """
    decode = decoder.decode
    input = iter(input)
    for chunck in input:
        output = decode(chunck)
        if output:
            assert decoder.encoding is not None
            yield decoder.encoding
            yield output
            break
    else:
        # Input exhausted without determining the encoding
        output = decode(b'', final=True)
        assert decoder.encoding is not None
        yield decoder.encoding
        if output:
            yield output
        return

    for chunck in input:
        output = decode(chunck)
        if output:
            yield output
    output = decode(b'', final=True)
    if output:
        yield output


def iter_encode(input, encoding=UTF8, errors='strict'):
    """
    "Pull"-based encoder.

    :param input: An iterable of Unicode strings.
    :param encoding: An :class:`Encoding` object or a label string.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :returns: An iterable of byte strings.

    """
    # The encoder is built here rather than inside the generator so that
    # an invalid label fails early, at call time, and not on first
    # iteration.
    incremental = IncrementalEncoder(encoding, errors)
    return _iter_encode_generator(input, incremental.encode)


def _iter_encode_generator(input, encode):
    for chunck in input:
        output = encode(chunck)
        if output:
            yield output
    output = encode('', final=True)
    if output:
        yield output


class IncrementalDecoder(object):
    """
    "Push"-based decoder.

    :param fallback_encoding:
        An :class:`Encoding` object or a label string.
        The encoding to use if :obj:`input` does not have a BOM.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.

    """
    def __init__(self, fallback_encoding, errors='replace'):
        # Resolve the label right away so an invalid one fails early.
        self._fallback_encoding = _get_encoding(fallback_encoding)
        self._errors = errors
        # Bytes held back while it is still unknown whether there is a BOM.
        self._buffer = b''
        # Bound ``decode`` of the stdlib incremental decoder, once chosen.
        self._decoder = None
        #: The actual :class:`Encoding` that is being used,
        #: or :obj:`None` if that is not determined yet.
        #: (I.e. if there is not enough input yet to determine
        #: if there is a BOM.)
        self.encoding = None  # Not known yet.

    def decode(self, input, final=False):
        """Decode one chunk of the input.

        :param input: A byte string.
        :param final:
            Indicate that no more input is available.
            Must be :obj:`True` if this is the last call.
        :returns: A Unicode string.

        """
        if self._decoder is not None:
            # Encoding already settled: delegate directly.
            return self._decoder(input, final)

        data = self._buffer + input
        bom_encoding, data = _detect_bom(data)
        if bom_encoding is not None:
            chosen = bom_encoding
        elif final or len(data) >= 3:
            # No BOM found and no more bytes are needed to decide.
            chosen = self._fallback_encoding
        else:
            # Fewer than three bytes and more may come: not enough data
            # yet to rule out a BOM, so keep buffering.
            self._buffer = data
            return ''

        incremental = chosen.codec_info.incrementaldecoder(self._errors)
        decode_chunk = incremental.decode
        self._decoder = decode_chunk
        self.encoding = chosen
        return decode_chunk(data, final)


class IncrementalEncoder(object):
    """
    "Push"-based encoder.

    :param encoding: An :class:`Encoding` object or a label string.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.

    .. method:: encode(input, final=False)

        :param input: A Unicode string.
        :param final:
            Indicate that no more input is available.
            Must be :obj:`True` if this is the last call.
        :returns: A byte string.

    """
    def __init__(self, encoding=UTF8, errors='strict'):
        codec_info = _get_encoding(encoding).codec_info
        incremental = codec_info.incrementalencoder(errors)
        # ``encode`` is exposed as an instance attribute bound to the
        # codec's own incremental encoder, not defined as a method here.
        self.encode = incremental.encode
Commit	Line	Data
e0df8241 JR	1	# coding: utf-8
	2	"""
	3
	4	webencodings
	5	~~~~~~~~~~~~
	6
	7	This is a Python implementation of the `WHATWG Encoding standard
	8	<http://encoding.spec.whatwg.org/>`. See README for details.
	9
	10	:copyright: Copyright 2012 by Simon Sapin
	11	:license: BSD, see LICENSE for details.
	12
	13	"""
	14
	15	from __future__ import unicode_literals
	16
	17	import codecs
	18
	19	from .labels import LABELS
	20
	21
	22	VERSION = '0.5.1'
	23
	24
	25	# Some names in Encoding are not valid Python aliases. Remap these.
	26	PYTHON_NAMES = {
	27	'iso-8859-8-i': 'iso-8859-8',
	28	'x-mac-cyrillic': 'mac-cyrillic',
	29	'macintosh': 'mac-roman',
	30	'windows-874': 'cp874'}
	31
	32	CACHE = {}
	33
	34
	35	def ascii_lower(string):
	36	r"""Transform (only) ASCII letters to lower case: A-Z is mapped to a-z.
	37
	38	:param string: An Unicode string.
	39	:returns: A new Unicode string.
	40
	41	This is used for `ASCII case-insensitive
	42	<http://encoding.spec.whatwg.org/#ascii-case-insensitive>`_
	43	matching of encoding labels.
	44	The same matching is also used, among other things,
	45	for `CSS keywords <http://dev.w3.org/csswg/css-values/#keywords>`_.
	46
	47	This is different from the :meth:`~py:str.lower` method of Unicode strings
	48	which also affect non-ASCII characters,
	49	sometimes mapping them into the ASCII range:
	50
	51	>>> keyword = u'Bac\N{KELVIN SIGN}ground'
	52	>>> assert keyword.lower() == u'background'
	53	>>> assert ascii_lower(keyword) != keyword.lower()
	54	>>> assert ascii_lower(keyword) == u'bac\N{KELVIN SIGN}ground'
	55
	56	"""
	57	# This turns out to be faster than unicode.translate()
	58	return string.encode('utf8').lower().decode('utf8')
	59
	60
	61	def lookup(label):
	62	"""
	63	Look for an encoding by its label.
	64	This is the spec’s `get an encoding
65	<http://encoding.spec.whatwg.org/#concept-encoding-get>`_ algorithm.
66	Supported labels are listed there.
67
68	:param label: A string.
69	:returns:
70	An :class:`Encoding` object, or :obj:`None` for an unknown label.
71
72	"""
73	# Only strip ASCII whitespace: U+0009, U+000A, U+000C, U+000D, and U+0020.
74	label = ascii_lower(label.strip('\t\n\f\r '))
75	name = LABELS.get(label)
76	if name is None:
77	return None
78	encoding = CACHE.get(name)
79	if encoding is None:
80	if name == 'x-user-defined':
81	from .x_user_defined import codec_info
82	else:
83	python_name = PYTHON_NAMES.get(name, name)
84	# Any python_name value that gets to here should be valid.
85	codec_info = codecs.lookup(python_name)
86	encoding = Encoding(name, codec_info)
87	CACHE[name] = encoding
88	return encoding
89
90
91	def _get_encoding(encoding_or_label):
92	"""
93	Accept either an encoding object or label.
94
95	:param encoding: An :class:`Encoding` object or a label string.
96	:returns: An :class:`Encoding` object.
97	:raises: :exc:`~exceptions.LookupError` for an unknown label.
98
99	"""
100	if hasattr(encoding_or_label, 'codec_info'):
101	return encoding_or_label
102
103	encoding = lookup(encoding_or_label)
104	if encoding is None:
105	raise LookupError('Unknown encoding label: %r' % encoding_or_label)
106	return encoding
107
108
109	class Encoding(object):
110	"""Reresents a character encoding such as UTF-8,
111	that can be used for decoding or encoding.
112
113	.. attribute:: name
114
115	Canonical name of the encoding
116
117	.. attribute:: codec_info
118
119	The actual implementation of the encoding,
120	a stdlib :class:`~codecs.CodecInfo` object.
121	See :func:`codecs.register`.
122
123	"""
124	def __init__(self, name, codec_info):
125	self.name = name
126	self.codec_info = codec_info
127
128	def __repr__(self):
129	return '<Encoding %s>' % self.name
130
131
132	#: The UTF-8 encoding. Should be used for new content and formats.
133	UTF8 = lookup('utf-8')
134
135	_UTF16LE = lookup('utf-16le')
136	_UTF16BE = lookup('utf-16be')
137
138
139	def decode(input, fallback_encoding, errors='replace'):
140	"""
141	Decode a single string.
142
143	:param input: A byte string
144	:param fallback_encoding:
145	An :class:`Encoding` object or a label string.
146	The encoding to use if :obj:`input` does note have a BOM.
147	:param errors: Type of error handling. See :func:`codecs.register`.
148	:raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
149	:return:
150	A ``(output, encoding)`` tuple of an Unicode string
151	and an :obj:`Encoding`.
152
153	"""
154	# Fail early if `encoding` is an invalid label.
155	fallback_encoding = _get_encoding(fallback_encoding)
156	bom_encoding, input = _detect_bom(input)
157	encoding = bom_encoding or fallback_encoding
158	return encoding.codec_info.decode(input, errors)[0], encoding
159
160
161	def _detect_bom(input):
162	"""Return (bom_encoding, input), with any BOM removed from the input."""
163	if input.startswith(b'\xFF\xFE'):
164	return _UTF16LE, input[2:]
165	if input.startswith(b'\xFE\xFF'):
166	return _UTF16BE, input[2:]
167	if input.startswith(b'\xEF\xBB\xBF'):
168	return UTF8, input[3:]
169	return None, input
170
171
172	def encode(input, encoding=UTF8, errors='strict'):
173	"""
174	Encode a single string.
175
176	:param input: An Unicode string.
177	:param encoding: An :class:`Encoding` object or a label string.
178	:param errors: Type of error handling. See :func:`codecs.register`.
179	:raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
180	:return: A byte string.
181
182	"""
183	return _get_encoding(encoding).codec_info.encode(input, errors)[0]
184
185
186	def iter_decode(input, fallback_encoding, errors='replace'):
187	"""
188	"Pull"-based decoder.
189
190	:param input:
191	An iterable of byte strings.
192
193	The input is first consumed just enough to determine the encoding
194	based on the precense of a BOM,
195	then consumed on demand when the return value is.
196	:param fallback_encoding:
197	An :class:`Encoding` object or a label string.
198	The encoding to use if :obj:`input` does note have a BOM.
199	:param errors: Type of error handling. See :func:`codecs.register`.
200	:raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
201	:returns:
202	An ``(output, encoding)`` tuple.
203	:obj:`output` is an iterable of Unicode strings,
204	:obj:`encoding` is the :obj:`Encoding` that is being used.
205
206	"""
207
208	decoder = IncrementalDecoder(fallback_encoding, errors)
209	generator = _iter_decode_generator(input, decoder)
210	encoding = next(generator)
211	return generator, encoding
212
213
214	def _iter_decode_generator(input, decoder):
215	"""Return a generator that first yields the :obj:`Encoding`,
216	then yields output chukns as Unicode strings.
217
218	"""
219	decode = decoder.decode
220	input = iter(input)
221	for chunck in input:
222	output = decode(chunck)
223	if output:
224	assert decoder.encoding is not None
225	yield decoder.encoding
226	yield output
227	break
228	else:
229	# Input exhausted without determining the encoding
230	output = decode(b'', final=True)
231	assert decoder.encoding is not None
232	yield decoder.encoding
233	if output:
234	yield output
235	return
236
237	for chunck in input:
238	output = decode(chunck)
239	if output:
240	yield output
241	output = decode(b'', final=True)
242	if output:
243	yield output
244
245
246	def iter_encode(input, encoding=UTF8, errors='strict'):
247	"""
248	“Pull”-based encoder.
249
250	:param input: An iterable of Unicode strings.
251	:param encoding: An :class:`Encoding` object or a label string.
252	:param errors: Type of error handling. See :func:`codecs.register`.
253	:raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
254	:returns: An iterable of byte strings.
255
256	"""
257	# Fail early if `encoding` is an invalid label.
258	encode = IncrementalEncoder(encoding, errors).encode
259	return _iter_encode_generator(input, encode)
260
261
262	def _iter_encode_generator(input, encode):
263	for chunck in input:
264	output = encode(chunck)
265	if output:
266	yield output
267	output = encode('', final=True)
268	if output:
269	yield output
270
271
272	class IncrementalDecoder(object):
273	"""
274	“Push”-based decoder.
275
276	:param fallback_encoding:
277	An :class:`Encoding` object or a label string.
278	The encoding to use if :obj:`input` does note have a BOM.
279	:param errors: Type of error handling. See :func:`codecs.register`.
280	:raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
281
282	"""
283	def __init__(self, fallback_encoding, errors='replace'):
284	# Fail early if `encoding` is an invalid label.
285	self._fallback_encoding = _get_encoding(fallback_encoding)
286	self._errors = errors
287	self._buffer = b''
288	self._decoder = None
289	#: The actual :class:`Encoding` that is being used,
290	#: or :obj:`None` if that is not determined yet.
291	#: (Ie. if there is not enough input yet to determine
292	#: if there is a BOM.)
293	self.encoding = None # Not known yet.
294
295	def decode(self, input, final=False):
296	"""Decode one chunk of the input.
297
298	:param input: A byte string.
299	:param final:
300	Indicate that no more input is available.
301	Must be :obj:`True` if this is the last call.
302	:returns: An Unicode string.
303
304	"""
305	decoder = self._decoder
306	if decoder is not None:
307	return decoder(input, final)
308
309	input = self._buffer + input
310	encoding, input = _detect_bom(input)
311	if encoding is None:
312	if len(input) < 3 and not final: # Not enough data yet.
313	self._buffer = input
314	return ''
315	else: # No BOM
316	encoding = self._fallback_encoding
317	decoder = encoding.codec_info.incrementaldecoder(self._errors).decode
318	self._decoder = decoder
319	self.encoding = encoding
320	return decoder(input, final)
321
322
323	class IncrementalEncoder(object):
324	"""
325	“Push”-based encoder.
326
327	:param encoding: An :class:`Encoding` object or a label string.
328	:param errors: Type of error handling. See :func:`codecs.register`.
329	:raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
330
331	.. method:: encode(input, final=False)
332
333	:param input: An Unicode string.
334	:param final:
335	Indicate that no more input is available.
336	Must be :obj:`True` if this is the last call.
337	:returns: A byte string.
338
339	"""
340	def __init__(self, encoding=UTF8, errors='strict'):
341	encoding = _get_encoding(encoding)
342	self.encode = encoding.codec_info.incrementalencoder(errors).encode