]>
Commit | Line | Data |
---|---|---|
e0df8241 JR |
1 | # coding: utf-8 |
2 | """ | |
3 | ||
4 | webencodings | |
5 | ~~~~~~~~~~~~ | |
6 | ||
7 | This is a Python implementation of the `WHATWG Encoding standard | |
8 | <http://encoding.spec.whatwg.org/>`_. See README for details. | |
9 | ||
10 | :copyright: Copyright 2012 by Simon Sapin | |
11 | :license: BSD, see LICENSE for details. | |
12 | ||
13 | """ | |
14 | ||
15 | from __future__ import unicode_literals | |
16 | ||
17 | import codecs | |
18 | ||
19 | from .labels import LABELS | |
20 | ||
21 | ||
# Version string of this package.
VERSION = '0.5.1'


# A few encoding names used by the Encoding standard are not valid Python
# codec aliases; this maps each of them to a name Python accepts.
PYTHON_NAMES = {
    'iso-8859-8-i': 'iso-8859-8',
    'x-mac-cyrillic': 'mac-cyrillic',
    'macintosh': 'mac-roman',
    'windows-874': 'cp874',
}

# Module-level cache, keyed by encoding name; starts out empty.
CACHE = {}
33 | ||
34 | ||
def ascii_lower(string):
    r"""Transform (only) ASCII letters to lower case: A-Z is mapped to a-z.

    :param string: An Unicode string.
    :returns: A new Unicode string.

    This is used for `ASCII case-insensitive
    <http://encoding.spec.whatwg.org/#ascii-case-insensitive>`_
    matching of encoding labels.
    The same matching is also used, among other things,
    for `CSS keywords <http://dev.w3.org/csswg/css-values/#keywords>`_.

    This is different from the :meth:`~py:str.lower` method of Unicode strings
    which also affect non-ASCII characters,
    sometimes mapping them into the ASCII range:

        >>> keyword = u'Bac\N{KELVIN SIGN}ground'
        >>> assert keyword.lower() == u'background'
        >>> assert ascii_lower(keyword) != keyword.lower()
        >>> assert ascii_lower(keyword) == u'bac\N{KELVIN SIGN}ground'

    Strings that the strict UTF-8 codec refuses to encode (lone surrogate
    code points) are still handled: only their ASCII letters are lowered
    and every other character is returned untouched.

    """
    try:
        # lower() on a byte string only touches ASCII letters, and this
        # round trip turns out to be faster than unicode.translate().
        return string.encode('utf8').lower().decode('utf8')
    except UnicodeEncodeError:
        # Lone surrogates cannot be encoded as strict UTF-8.  Fall back to
        # a per-character pass so that such input is lowered rather than
        # making the caller fail.
        return ''.join(
            char.lower() if 'A' <= char <= 'Z' else char
            for char in string)
59 | ||
60 | ||
def lookup(label):
    """
    Find the encoding that a label designates.
    This is the spec's `get an encoding
    <http://encoding.spec.whatwg.org/#concept-encoding-get>`_ algorithm.
    Supported labels are listed there.

    :param label: A string.
    :returns:
        An :class:`Encoding` object, or :obj:`None` for an unknown label.

    """
    # Only ASCII whitespace is stripped (U+0009, U+000A, U+000C, U+000D
    # and U+0020); the label is then matched ASCII case-insensitively.
    stripped = label.strip('\t\n\f\r ')
    name = LABELS.get(ascii_lower(stripped))
    if name is None:
        return None

    cached = CACHE.get(name)
    if cached is not None:
        return cached

    if name == 'x-user-defined':
        # This codec comes from a sibling module, not from the registry.
        from .x_user_defined import codec_info
    else:
        # Any name that gets here should be valid for Python once it has
        # been remapped through PYTHON_NAMES.
        codec_info = codecs.lookup(PYTHON_NAMES.get(name, name))
    encoding = Encoding(name, codec_info)
    CACHE[name] = encoding
    return encoding
89 | ||
90 | ||
91 | def _get_encoding(encoding_or_label): | |
92 | """ | |
93 | Accept either an encoding object or label. | |
94 | ||
95 | :param encoding: An :class:`Encoding` object or a label string. | |
96 | :returns: An :class:`Encoding` object. | |
97 | :raises: :exc:`~exceptions.LookupError` for an unknown label. | |
98 | ||
99 | """ | |
100 | if hasattr(encoding_or_label, 'codec_info'): | |
101 | return encoding_or_label | |
102 | ||
103 | encoding = lookup(encoding_or_label) | |
104 | if encoding is None: | |
105 | raise LookupError('Unknown encoding label: %r' % encoding_or_label) | |
106 | return encoding | |
107 | ||
108 | ||
class Encoding(object):
    """Represents a character encoding such as UTF-8,
    that can be used for decoding or encoding.

    .. attribute:: name

        Canonical name of the encoding

    .. attribute:: codec_info

        The actual implementation of the encoding,
        a stdlib :class:`~codecs.CodecInfo` object.
        See :func:`codecs.register`.

    """
    def __init__(self, name, codec_info):
        # Both values are stored as given; nothing is validated here.
        self.name, self.codec_info = name, codec_info

    def __repr__(self):
        # Only the name is shown, not the codec implementation.
        return '<Encoding %s>' % self.name
130 | ||
131 | ||
#: The UTF-8 encoding. Should be used for new content and formats.
UTF8 = lookup('utf-8')

# Private handles on the two UTF-16 byte orders, looked up once at import.
_UTF16LE, _UTF16BE = lookup('utf-16le'), lookup('utf-16be')
137 | ||
138 | ||
def decode(input, fallback_encoding, errors='replace'):
    """
    Decode a single byte string in one go.

    :param input: A byte string
    :param fallback_encoding:
        An :class:`Encoding` object or a label string.
        The encoding to use if :obj:`input` does not have a BOM.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :return:
        A ``(output, encoding)`` tuple of an Unicode string
        and an :obj:`Encoding`.

    """
    # Resolve the fallback before anything else so that an invalid label
    # fails early, even when the input carries a BOM.
    fallback_encoding = _get_encoding(fallback_encoding)
    bom_encoding, payload = _detect_bom(input)
    # A BOM, when present, takes precedence over the fallback.
    encoding = bom_encoding or fallback_encoding
    decoded = encoding.codec_info.decode(payload, errors)
    # Only the first item of the codec's result is returned to the caller.
    return decoded[0], encoding
159 | ||
160 | ||
161 | def _detect_bom(input): | |
162 | """Return (bom_encoding, input), with any BOM removed from the input.""" | |
163 | if input.startswith(b'\xFF\xFE'): | |
164 | return _UTF16LE, input[2:] | |
165 | if input.startswith(b'\xFE\xFF'): | |
166 | return _UTF16BE, input[2:] | |
167 | if input.startswith(b'\xEF\xBB\xBF'): | |
168 | return UTF8, input[3:] | |
169 | return None, input | |
170 | ||
171 | ||
def encode(input, encoding=UTF8, errors='strict'):
    """
    Encode a single Unicode string in one go.

    :param input: An Unicode string.
    :param encoding: An :class:`Encoding` object or a label string.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :return: A byte string.

    """
    codec_info = _get_encoding(encoding).codec_info
    encoded = codec_info.encode(input, errors)
    # Only the first item of the codec's result is returned to the caller.
    return encoded[0]
184 | ||
185 | ||
def iter_decode(input, fallback_encoding, errors='replace'):
    """
    "Pull"-based decoder.

    :param input:
        An iterable of byte strings.

        The input is first consumed just enough to determine the encoding
        based on the presence of a BOM,
        then consumed on demand when the return value is.
    :param fallback_encoding:
        An :class:`Encoding` object or a label string.
        The encoding to use if :obj:`input` does not have a BOM.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :returns:
        An ``(output, encoding)`` tuple.
        :obj:`output` is an iterable of Unicode strings,
        :obj:`encoding` is the :obj:`Encoding` that is being used.

    """
    output = _iter_decode_generator(
        input, IncrementalDecoder(fallback_encoding, errors))
    # The first item the generator yields is the Encoding, not decoded
    # text; take it off here so that only text is left for the caller.
    encoding = next(output)
    return output, encoding
212 | ||
213 | ||
214 | def _iter_decode_generator(input, decoder): | |
215 | """Return a generator that first yields the :obj:`Encoding`, | |
216 | then yields output chukns as Unicode strings. | |
217 | ||
218 | """ | |
219 | decode = decoder.decode | |
220 | input = iter(input) | |
221 | for chunck in input: | |
222 | output = decode(chunck) | |
223 | if output: | |
224 | assert decoder.encoding is not None | |
225 | yield decoder.encoding | |
226 | yield output | |
227 | break | |
228 | else: | |
229 | # Input exhausted without determining the encoding | |
230 | output = decode(b'', final=True) | |
231 | assert decoder.encoding is not None | |
232 | yield decoder.encoding | |
233 | if output: | |
234 | yield output | |
235 | return | |
236 | ||
237 | for chunck in input: | |
238 | output = decode(chunck) | |
239 | if output: | |
240 | yield output | |
241 | output = decode(b'', final=True) | |
242 | if output: | |
243 | yield output | |
244 | ||
245 | ||
def iter_encode(input, encoding=UTF8, errors='strict'):
    """
    "Pull"-based encoder.

    :param input: An iterable of Unicode strings.
    :param encoding: An :class:`Encoding` object or a label string.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :returns: An iterable of byte strings.

    """
    # The encoder is built here, not inside the generator, so that an
    # invalid label fails early instead of on the first iteration.
    encoder = IncrementalEncoder(encoding, errors)
    return _iter_encode_generator(input, encoder.encode)
260 | ||
261 | ||
262 | def _iter_encode_generator(input, encode): | |
263 | for chunck in input: | |
264 | output = encode(chunck) | |
265 | if output: | |
266 | yield output | |
267 | output = encode('', final=True) | |
268 | if output: | |
269 | yield output | |
270 | ||
271 | ||
class IncrementalDecoder(object):
    """
    "Push"-based decoder.

    :param fallback_encoding:
        An :class:`Encoding` object or a label string.
        The encoding to use if the input does not have a BOM.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.

    """
    def __init__(self, fallback_encoding, errors='replace'):
        # Resolved first so that an invalid label fails early.
        self._fallback_encoding = _get_encoding(fallback_encoding)
        self._errors = errors
        # Bytes held back while it is still unknown whether a BOM follows.
        self._buffer = b''
        # Bound ``decode`` of the codec's incremental decoder, once chosen.
        self._decoder = None
        #: The actual :class:`Encoding` that is being used,
        #: or :obj:`None` if that is not determined yet.
        #: (Ie. if there is not enough input yet to determine
        #: if there is a BOM.)
        self.encoding = None  # Not known yet.

    def decode(self, input, final=False):
        """Decode one chunk of the input.

        :param input: A byte string.
        :param final:
            Indicate that no more input is available.
            Must be :obj:`True` if this is the last call.
        :returns: An Unicode string.

        """
        # Once the encoding is settled, delegate straight to the codec.
        if self._decoder is not None:
            return self._decoder(input, final)

        pending = self._buffer + input
        bom_encoding, pending = _detect_bom(pending)
        if bom_encoding is None and len(pending) < 3 and not final:
            # Not enough data yet to rule out a BOM: keep it for later.
            self._buffer = pending
            return ''

        if bom_encoding is None:
            # No BOM
            chosen = self._fallback_encoding
        else:
            chosen = bom_encoding
        decoder = chosen.codec_info.incrementaldecoder(self._errors).decode
        self._decoder = decoder
        self.encoding = chosen
        return decoder(pending, final)
321 | ||
322 | ||
class IncrementalEncoder(object):
    """
    "Push"-based encoder.

    :param encoding: An :class:`Encoding` object or a label string.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.

    .. method:: encode(input, final=False)

        :param input: An Unicode string.
        :param final:
            Indicate that no more input is available.
            Must be :obj:`True` if this is the last call.
        :returns: A byte string.

    """
    def __init__(self, encoding=UTF8, errors='strict'):
        codec_info = _get_encoding(encoding).codec_info
        # ``encode`` is exposed as an instance attribute bound directly to
        # the codec's incremental encoder, hence its documentation above.
        self.encode = codec_info.incrementalencoder(errors).encode
339 | """ | |
340 | def __init__(self, encoding=UTF8, errors='strict'): | |
341 | encoding = _get_encoding(encoding) | |
342 | self.encode = encoding.codec_info.incrementalencoder(errors).encode |