]>
Commit | Line | Data |
---|---|---|
e0df8241 JR |
1 | # unicode.py |
2 | ||
3 | import sys | |
4 | from itertools import filterfalse | |
5 | from typing import List, Tuple, Union | |
6 | ||
7 | ||
8 | class _lazyclassproperty: | |
9 | def __init__(self, fn): | |
10 | self.fn = fn | |
11 | self.__doc__ = fn.__doc__ | |
12 | self.__name__ = fn.__name__ | |
13 | ||
14 | def __get__(self, obj, cls): | |
15 | if cls is None: | |
16 | cls = type(obj) | |
17 | if not hasattr(cls, "_intern") or any( | |
18 | cls._intern is getattr(superclass, "_intern", []) | |
19 | for superclass in cls.__mro__[1:] | |
20 | ): | |
21 | cls._intern = {} | |
22 | attrname = self.fn.__name__ | |
23 | if attrname not in cls._intern: | |
24 | cls._intern[attrname] = self.fn(cls) | |
25 | return cls._intern[attrname] | |
26 | ||
27 | ||
# A list of code point ranges: each entry is either a (first, last) pair
# or a 1-tuple holding a single code point.
UnicodeRangeList = List[
    Union[
        Tuple[int, int],
        Tuple[int],
    ]
]
29 | ||
30 | ||
class unicode_set:
    """
    Base class describing a collection of Unicode characters, from which
    language-specific strings such as ``alphas``, ``nums``, ``alphanums``,
    and ``printables`` are derived.

    A subclass declares its members through the ``_ranges`` class attribute,
    a list of code point ranges given as 2-tuples or 1-tuples, such as::

        _ranges = [
            (0x0020, 0x007e),
            (0x00a0, 0x00ff),
            (0x0100,),
        ]

    Both ends of a range are inclusive. A 1-tuple ``(x,)`` is handled the
    same as ``(x, x)``.

    Sets can also be combined by inheriting from several unicode sets::

        class CJK(Chinese, Japanese, Korean):
            pass
    """

    _ranges: UnicodeRangeList = []

    @_lazyclassproperty
    def _chars_for_ranges(cls):
        # Merge the ranges of every class in the MRO that precedes
        # unicode_set, so multiply-inherited sets contribute all their ranges.
        code_points = set()
        for klass in cls.__mro__:
            if klass is unicode_set:
                break
            for bounds in getattr(klass, "_ranges", ()):
                # bounds[-1] is bounds[0] for a 1-tuple, giving a one-item range
                first, last = bounds[0], bounds[-1]
                code_points.update(range(first, last + 1))
        return list(map(chr, sorted(code_points)))

    @_lazyclassproperty
    def printables(cls):
        """all characters in this range that are not whitespace"""
        return "".join(ch for ch in cls._chars_for_ranges if not ch.isspace())

    @_lazyclassproperty
    def alphas(cls):
        """all characters in this range that are alphabetic"""
        return "".join(ch for ch in cls._chars_for_ranges if ch.isalpha())

    @_lazyclassproperty
    def nums(cls):
        """all characters in this range that are numeric digits"""
        return "".join(ch for ch in cls._chars_for_ranges if ch.isdigit())

    @_lazyclassproperty
    def alphanums(cls):
        """all alphabetic characters in this range followed by all its digits"""
        letters = cls.alphas
        digits = cls.nums
        return letters + digits

    @_lazyclassproperty
    def identchars(cls):
        """
        all characters in this range that are valid identifiers on their own,
        plus a fixed set of ASCII and Latin-1 letters and underscore '_'
        """
        always_included = (
            "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzªµº"
            "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ"
            "_"
        )
        members = {ch for ch in cls._chars_for_ranges if ch.isidentifier()}
        members.update(always_included)
        return "".join(sorted(members))

    @_lazyclassproperty
    def identbodychars(cls):
        """
        all characters in this range that are valid after the first character
        of an identifier, plus everything in identchars, the digits 0-9, and
        · (Unicode MIDDLE DOT)
        """
        members = set(cls.identchars)
        members.update("0123456789·")
        # Prefixing "_" tests whether ch is legal in a non-leading position.
        members.update(
            ch for ch in cls._chars_for_ranges if ("_" + ch).isidentifier()
        )
        return "".join(sorted(members))

    @_lazyclassproperty
    def identifier(cls):
        """
        a pyparsing Word expression matching an identifier, built from this
        range's identchars (leading characters) and identbodychars (the rest)
        """
        # Imported here rather than at module level, as in the original.
        from pip._vendor.pyparsing import Word

        leading, trailing = cls.identchars, cls.identbodychars
        return Word(leading, trailing)
126 | ||
127 | ||
class pyparsing_unicode(unicode_set):
    """
    Namespace class collecting the predefined language unicode_sets as
    nested classes, along with native-script aliases for several of them.
    """

    # fmt: off

    # ranges for this class itself: everything from U+0020 upward
    _ranges: UnicodeRangeList = [
        (0x0020, sys.maxunicode),
    ]

    class BasicMultilingualPlane(unicode_set):
        """Unicode set covering the Basic Multilingual Plane"""
        _ranges: UnicodeRangeList = [
            (0x0020, 0xFFFF),
        ]

    class Latin1(unicode_set):
        """Unicode set covering the Latin-1 character range"""
        _ranges: UnicodeRangeList = [
            (0x0020, 0x007E),
            (0x00A0, 0x00FF),
        ]

    class LatinA(unicode_set):
        """Unicode set covering the Latin-A character range"""
        _ranges: UnicodeRangeList = [
            (0x0100, 0x017F),
        ]

    class LatinB(unicode_set):
        """Unicode set covering the Latin-B character range"""
        _ranges: UnicodeRangeList = [
            (0x0180, 0x024F),
        ]

    class Greek(unicode_set):
        """Unicode set covering the Greek character ranges"""
        _ranges: UnicodeRangeList = [
            (0x0342, 0x0345),
            (0x0370, 0x0377),
            (0x037A, 0x037F),
            (0x0384, 0x038A),
            (0x038C,),
            (0x038E, 0x03A1),
            (0x03A3, 0x03E1),
            (0x03F0, 0x03FF),
            (0x1D26, 0x1D2A),
            (0x1D5E,),
            (0x1D60,),
            (0x1D66, 0x1D6A),
            (0x1F00, 0x1F15),
            (0x1F18, 0x1F1D),
            (0x1F20, 0x1F45),
            (0x1F48, 0x1F4D),
            (0x1F50, 0x1F57),
            (0x1F59,),
            (0x1F5B,),
            (0x1F5D,),
            (0x1F5F, 0x1F7D),
            (0x1F80, 0x1FB4),
            (0x1FB6, 0x1FC4),
            (0x1FC6, 0x1FD3),
            (0x1FD6, 0x1FDB),
            (0x1FDD, 0x1FEF),
            (0x1FF2, 0x1FF4),
            (0x1FF6, 0x1FFE),
            (0x2129,),
            (0x2719, 0x271A),
            (0xAB65,),
            (0x10140, 0x1018D),
            (0x101A0,),
            (0x1D200, 0x1D245),
            (0x1F7A1, 0x1F7A7),
        ]

    class Cyrillic(unicode_set):
        """Unicode set covering the Cyrillic character range"""
        _ranges: UnicodeRangeList = [
            (0x0400, 0x052F),
            (0x1C80, 0x1C88),
            (0x1D2B,),
            (0x1D78,),
            (0x2DE0, 0x2DFF),
            (0xA640, 0xA672),
            (0xA674, 0xA69F),
            (0xFE2E, 0xFE2F),
        ]

    class Chinese(unicode_set):
        """Unicode set covering the Chinese character range"""
        _ranges: UnicodeRangeList = [
            (0x2E80, 0x2E99),
            (0x2E9B, 0x2EF3),
            (0x31C0, 0x31E3),
            (0x3400, 0x4DB5),
            (0x4E00, 0x9FEF),
            (0xA700, 0xA707),
            (0xF900, 0xFA6D),
            (0xFA70, 0xFAD9),
            (0x16FE2, 0x16FE3),
            (0x1F210, 0x1F212),
            (0x1F214, 0x1F23B),
            (0x1F240, 0x1F248),
            (0x20000, 0x2A6D6),
            (0x2A700, 0x2B734),
            (0x2B740, 0x2B81D),
            (0x2B820, 0x2CEA1),
            (0x2CEB0, 0x2EBE0),
            (0x2F800, 0x2FA1D),
        ]

    class Japanese(unicode_set):
        """
        Unicode set covering the Japanese character range, formed by
        combining the Kanji, Hiragana, and Katakana ranges
        """

        class Kanji(unicode_set):
            """Unicode set covering the Kanji character range"""
            _ranges: UnicodeRangeList = [
                (0x4E00, 0x9FBF),
                (0x3000, 0x303F),
            ]

        class Hiragana(unicode_set):
            """Unicode set covering the Hiragana character range"""
            _ranges: UnicodeRangeList = [
                (0x3041, 0x3096),
                (0x3099, 0x30A0),
                (0x30FC,),
                (0xFF70,),
                (0x1B001,),
                (0x1B150, 0x1B152),
                (0x1F200,),
            ]

        class Katakana(unicode_set):
            """Unicode set covering the Katakana character range"""
            _ranges: UnicodeRangeList = [
                (0x3099, 0x309C),
                (0x30A0, 0x30FF),
                (0x31F0, 0x31FF),
                (0x32D0, 0x32FE),
                (0xFF65, 0xFF9F),
                (0x1B000,),
                (0x1B164, 0x1B167),
                (0x1F201, 0x1F202),
                (0x1F213,),
            ]

        # native-script aliases for the nested sets
        漢字 = Kanji
        カタカナ = Katakana
        ひらがな = Hiragana

        # Japanese itself is the concatenation of its three component sets
        _ranges = [
            *Kanji._ranges,
            *Hiragana._ranges,
            *Katakana._ranges,
        ]

    class Hangul(unicode_set):
        """Unicode set covering the Hangul (Korean) character range"""
        _ranges: UnicodeRangeList = [
            (0x1100, 0x11FF),
            (0x302E, 0x302F),
            (0x3131, 0x318E),
            (0x3200, 0x321C),
            (0x3260, 0x327B),
            (0x327E,),
            (0xA960, 0xA97C),
            (0xAC00, 0xD7A3),
            (0xD7B0, 0xD7C6),
            (0xD7CB, 0xD7FB),
            (0xFFA0, 0xFFBE),
            (0xFFC2, 0xFFC7),
            (0xFFCA, 0xFFCF),
            (0xFFD2, 0xFFD7),
            (0xFFDA, 0xFFDC),
        ]

    Korean = Hangul

    class CJK(Chinese, Japanese, Hangul):
        """Unicode set combining the Chinese, Japanese, and Korean (CJK) character ranges"""

    class Thai(unicode_set):
        """Unicode set covering the Thai character range"""
        _ranges: UnicodeRangeList = [
            (0x0E01, 0x0E3A),
            (0x0E3F, 0x0E5B),
        ]

    class Arabic(unicode_set):
        """Unicode set covering the Arabic character range"""
        _ranges: UnicodeRangeList = [
            (0x0600, 0x061B),
            (0x061E, 0x06FF),
            (0x0700, 0x077F),
        ]

    class Hebrew(unicode_set):
        """Unicode set covering the Hebrew character range"""
        _ranges: UnicodeRangeList = [
            (0x0591, 0x05C7),
            (0x05D0, 0x05EA),
            (0x05EF, 0x05F4),
            (0xFB1D, 0xFB36),
            (0xFB38, 0xFB3C),
            (0xFB3E,),
            (0xFB40, 0xFB41),
            (0xFB43, 0xFB44),
            (0xFB46, 0xFB4F),
        ]

    class Devanagari(unicode_set):
        """Unicode set covering the Devanagari character range"""
        _ranges: UnicodeRangeList = [
            (0x0900, 0x097F),
            (0xA8E0, 0xA8FF),
        ]

    BMP = BasicMultilingualPlane

    # aliases spelled in each language's own script
    العربية = Arabic
    中文 = Chinese
    кириллица = Cyrillic
    Ελληνικά = Greek
    עִברִית = Hebrew
    日本語 = Japanese
    한국어 = Korean
    ไทย = Thai
    देवनागरी = Devanagari

    # fmt: on