[dlqueue.git] / venv / lib / python3.11 / site-packages / pip / _vendor / pyparsing / unicode.py

# unicode.py

import sys
from itertools import filterfalse
from typing import List, Tuple, Union


class _lazyclassproperty:
    """
    Descriptor that evaluates a class-level function once per class and
    caches the result.

    Results are stored in a dict held in the owning class's ``_intern``
    attribute, keyed by the wrapped function's name. Every class gets its
    own ``_intern`` dict, so a subclass never reuses a value computed for
    one of its base classes.
    """

    def __init__(self, fn):
        # Keep the wrapped function and mirror its metadata on the descriptor.
        self.fn = fn
        self.__doc__ = fn.__doc__
        self.__name__ = fn.__name__

    def __get__(self, obj, cls):
        owner = type(obj) if cls is None else cls

        # The owner needs a fresh cache when it has none at all, or when the
        # ``_intern`` it sees is really the dict object of one of its bases.
        if not hasattr(owner, "_intern"):
            needs_own_cache = True
        else:
            needs_own_cache = any(
                owner._intern is getattr(base, "_intern", [])
                for base in owner.__mro__[1:]
            )
        if needs_own_cache:
            owner._intern = {}

        key = self.fn.__name__
        if key in owner._intern:
            return owner._intern[key]

        # First access for this class: compute, remember, and return.
        owner._intern[key] = self.fn(owner)
        return owner._intern[key]


# Type alias for a ``_ranges`` list: each entry is either a 2-tuple or a
# 1-tuple of integer code points.
UnicodeRangeList = List[Union[Tuple[int, int], Tuple[int]]]


class unicode_set:
    """
    Base class describing a collection of Unicode characters, exposing
    language-specific strings such as ``alphas``, ``nums``, ``alphanums``,
    and ``printables``.

    A subclass declares its characters through the class attribute
    ``_ranges``, a list of code point ranges written as 2-tuples or
    1-tuples, for example::

        _ranges = [
            (0x0020, 0x007e),
            (0x00a0, 0x00ff),
            (0x0100,),
            ]

    Both ends of a range are inclusive, and a 1-tuple ``(x,)`` is shorthand
    for ``(x, x)``.

    Sets can also be combined through multiple inheritance::

        class CJK(Chinese, Japanese, Korean):
            pass
    """

    _ranges: UnicodeRangeList = []

    @_lazyclassproperty
    def _chars_for_ranges(cls):
        """sorted list of the distinct characters covered by this class and its bases"""
        code_points = set()
        # Walk the MRO so ranges contributed by base classes are included;
        # stop at unicode_set itself, which marks the end of user-defined sets.
        for klass in cls.__mro__:
            if klass is unicode_set:
                break
            for bounds in getattr(klass, "_ranges", ()):
                # bounds[-1] equals bounds[0] for a 1-tuple, giving a single code point
                code_points.update(range(bounds[0], bounds[-1] + 1))
        return list(map(chr, sorted(code_points)))

    @_lazyclassproperty
    def printables(cls):
        """every character in this set that is not whitespace"""
        return "".join([ch for ch in cls._chars_for_ranges if not ch.isspace()])

    @_lazyclassproperty
    def alphas(cls):
        """every alphabetic character in this set"""
        return "".join([ch for ch in cls._chars_for_ranges if ch.isalpha()])

    @_lazyclassproperty
    def nums(cls):
        """every numeric digit character in this set"""
        return "".join([ch for ch in cls._chars_for_ranges if ch.isdigit()])

    @_lazyclassproperty
    def alphanums(cls):
        """the alphabetic characters of this set followed by its digit characters"""
        letters = cls.alphas
        digits = cls.nums
        return letters + digits

    @_lazyclassproperty
    def identchars(cls):
        """every character in this set that is a valid identifier character, plus underscore '_'"""
        members = {ch for ch in cls._chars_for_ranges if ch.isidentifier()}
        # These Latin letters and the underscore are always included,
        # whatever ranges the set defines.
        members.update("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzªµº")
        members.update("ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ")
        members.add("_")
        return "".join(sorted(members))

    @_lazyclassproperty
    def identbodychars(cls):
        """
        every character in this set that may appear in the body of an identifier,
        plus the digits 0-9 and · (Unicode MIDDLE DOT)
        """
        members = set(cls.identchars)
        members.update("0123456789·")
        # A character is a valid body character if it forms an identifier
        # when placed after a leading underscore.
        members.update(
            ch for ch in cls._chars_for_ranges if ("_" + ch).isidentifier()
        )
        return "".join(sorted(members))

    @_lazyclassproperty
    def identifier(cls):
        """
        a pyparsing Word expression matching an identifier, built from this set's
        identchars (initial characters) and identbodychars (body characters)
        """
        from pip._vendor.pyparsing import Word

        initial_chars = cls.identchars
        body_chars = cls.identbodychars
        return Word(initial_chars, body_chars)


class pyparsing_unicode(unicode_set):
    """
    A namespace class for defining common language unicode_sets.

    Each nested class is a ``unicode_set`` whose ``_ranges`` attribute lists the
    inclusive code point ranges for one script or block. This class is itself a
    ``unicode_set``, covering code points 0x0020 through ``sys.maxunicode``.
    Native-script aliases for several of the nested sets (for example
    ``Ελληνικά`` for ``Greek``) are bound at the end of the class body.
    """

    # The range tables below are laid out by hand; the fmt: off / fmt: on
    # markers bracket them so an auto-formatter leaves the layout alone.
    # fmt: off

    # define ranges in language character sets
    _ranges: UnicodeRangeList = [
        (0x0020, sys.maxunicode),
    ]

    class BasicMultilingualPlane(unicode_set):
        """Unicode set for the Basic Multilingual Plane"""
        _ranges: UnicodeRangeList = [
            (0x0020, 0xFFFF),
        ]

    class Latin1(unicode_set):
        """Unicode set for Latin-1 Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0020, 0x007E),
            (0x00A0, 0x00FF),
        ]

    class LatinA(unicode_set):
        """Unicode set for Latin-A Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0100, 0x017F),
        ]

    class LatinB(unicode_set):
        """Unicode set for Latin-B Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0180, 0x024F),
        ]

    class Greek(unicode_set):
        """Unicode set for Greek Unicode Character Ranges"""
        _ranges: UnicodeRangeList = [
            (0x0342, 0x0345),
            (0x0370, 0x0377),
            (0x037A, 0x037F),
            (0x0384, 0x038A),
            (0x038C,),
            (0x038E, 0x03A1),
            (0x03A3, 0x03E1),
            (0x03F0, 0x03FF),
            (0x1D26, 0x1D2A),
            (0x1D5E,),
            (0x1D60,),
            (0x1D66, 0x1D6A),
            (0x1F00, 0x1F15),
            (0x1F18, 0x1F1D),
            (0x1F20, 0x1F45),
            (0x1F48, 0x1F4D),
            (0x1F50, 0x1F57),
            (0x1F59,),
            (0x1F5B,),
            (0x1F5D,),
            (0x1F5F, 0x1F7D),
            (0x1F80, 0x1FB4),
            (0x1FB6, 0x1FC4),
            (0x1FC6, 0x1FD3),
            (0x1FD6, 0x1FDB),
            (0x1FDD, 0x1FEF),
            (0x1FF2, 0x1FF4),
            (0x1FF6, 0x1FFE),
            (0x2129,),
            (0x2719, 0x271A),
            (0xAB65,),
            (0x10140, 0x1018D),
            (0x101A0,),
            (0x1D200, 0x1D245),
            (0x1F7A1, 0x1F7A7),
        ]

    class Cyrillic(unicode_set):
        """Unicode set for Cyrillic Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0400, 0x052F),
            (0x1C80, 0x1C88),
            (0x1D2B,),
            (0x1D78,),
            (0x2DE0, 0x2DFF),
            (0xA640, 0xA672),
            (0xA674, 0xA69F),
            (0xFE2E, 0xFE2F),
        ]

    class Chinese(unicode_set):
        """Unicode set for Chinese Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x2E80, 0x2E99),
            (0x2E9B, 0x2EF3),
            (0x31C0, 0x31E3),
            (0x3400, 0x4DB5),
            (0x4E00, 0x9FEF),
            (0xA700, 0xA707),
            (0xF900, 0xFA6D),
            (0xFA70, 0xFAD9),
            (0x16FE2, 0x16FE3),
            (0x1F210, 0x1F212),
            (0x1F214, 0x1F23B),
            (0x1F240, 0x1F248),
            (0x20000, 0x2A6D6),
            (0x2A700, 0x2B734),
            (0x2B740, 0x2B81D),
            (0x2B820, 0x2CEA1),
            (0x2CEB0, 0x2EBE0),
            (0x2F800, 0x2FA1D),
        ]

    class Japanese(unicode_set):
        """Unicode set for Japanese Unicode Character Range, combining Kanji, Hiragana, and Katakana ranges"""

        class Kanji(unicode_set):
            """Unicode set for Kanji Unicode Character Range"""
            _ranges: UnicodeRangeList = [
                (0x4E00, 0x9FBF),
                (0x3000, 0x303F),
            ]

        class Hiragana(unicode_set):
            """Unicode set for Hiragana Unicode Character Range"""
            _ranges: UnicodeRangeList = [
                (0x3041, 0x3096),
                (0x3099, 0x30A0),
                (0x30FC,),
                (0xFF70,),
                (0x1B001,),
                (0x1B150, 0x1B152),
                (0x1F200,),
            ]

        class Katakana(unicode_set):
            """Unicode set for Katakana Unicode Character Range"""
            _ranges: UnicodeRangeList = [
                (0x3099, 0x309C),
                (0x30A0, 0x30FF),
                (0x31F0, 0x31FF),
                (0x32D0, 0x32FE),
                (0xFF65, 0xFF9F),
                (0x1B000,),
                (0x1B164, 0x1B167),
                (0x1F201, 0x1F202),
                (0x1F213,),
            ]

        # native-script aliases for the three nested sets
        漢字 = Kanji
        カタカナ = Katakana
        ひらがな = Hiragana

        # Kanji, Hiragana and Katakana are nested classes rather than base
        # classes of Japanese, so their ranges are concatenated explicitly
        # here to give Japanese the combined set.
        _ranges = (
            Kanji._ranges
            + Hiragana._ranges
            + Katakana._ranges
        )

    class Hangul(unicode_set):
        """Unicode set for Hangul (Korean) Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x1100, 0x11FF),
            (0x302E, 0x302F),
            (0x3131, 0x318E),
            (0x3200, 0x321C),
            (0x3260, 0x327B),
            (0x327E,),
            (0xA960, 0xA97C),
            (0xAC00, 0xD7A3),
            (0xD7B0, 0xD7C6),
            (0xD7CB, 0xD7FB),
            (0xFFA0, 0xFFBE),
            (0xFFC2, 0xFFC7),
            (0xFFCA, 0xFFCF),
            (0xFFD2, 0xFFD7),
            (0xFFDA, 0xFFDC),
        ]

    # Korean is an alias for the Hangul set
    Korean = Hangul

    class CJK(Chinese, Japanese, Hangul):
        """
        Unicode set for combined Chinese, Japanese, and Korean (CJK) Unicode Character Range

        Defines no ranges of its own; it combines the Chinese, Japanese, and
        Hangul sets through multiple inheritance.
        """

    class Thai(unicode_set):
        """Unicode set for Thai Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0E01, 0x0E3A),
            (0x0E3F, 0x0E5B)
        ]

    class Arabic(unicode_set):
        """Unicode set for Arabic Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0600, 0x061B),
            (0x061E, 0x06FF),
            (0x0700, 0x077F),
        ]

    class Hebrew(unicode_set):
        """Unicode set for Hebrew Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0591, 0x05C7),
            (0x05D0, 0x05EA),
            (0x05EF, 0x05F4),
            (0xFB1D, 0xFB36),
            (0xFB38, 0xFB3C),
            (0xFB3E,),
            (0xFB40, 0xFB41),
            (0xFB43, 0xFB44),
            (0xFB46, 0xFB4F),
        ]

    class Devanagari(unicode_set):
        """Unicode set for Devanagari Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0900, 0x097F),
            (0xA8E0, 0xA8FF)
        ]

    # short alias for BasicMultilingualPlane
    BMP = BasicMultilingualPlane

    # add language identifiers using language Unicode
    العربية = Arabic
    中文 = Chinese
    кириллица = Cyrillic
    Ελληνικά = Greek
    עִברִית = Hebrew
    日本語 = Japanese
    한국어 = Korean
    ไทย = Thai
    देवनागरी = Devanagari

    # fmt: on
Commit	Line	Data
e0df8241 JR	1	# unicode.py
	2
	3	import sys
	4	from itertools import filterfalse
	5	from typing import List, Tuple, Union
	6
	7
	8	class _lazyclassproperty:
	9	def __init__(self, fn):
	10	self.fn = fn
	11	self.__doc__ = fn.__doc__
	12	self.__name__ = fn.__name__
	13
	14	def __get__(self, obj, cls):
	15	if cls is None:
	16	cls = type(obj)
	17	if not hasattr(cls, "_intern") or any(
	18	cls._intern is getattr(superclass, "_intern", [])
	19	for superclass in cls.__mro__[1:]
	20	):
	21	cls._intern = {}
	22	attrname = self.fn.__name__
	23	if attrname not in cls._intern:
	24	cls._intern[attrname] = self.fn(cls)
	25	return cls._intern[attrname]
	26
	27
	28	UnicodeRangeList = List[Union[Tuple[int, int], Tuple[int]]]
	29
	30
	31	class unicode_set:
	32	"""
	33	A set of Unicode characters, for language-specific strings for
	34	``alphas``, ``nums``, ``alphanums``, and ``printables``.
	35	A unicode_set is defined by a list of ranges in the Unicode character
	36	set, in a class attribute ``_ranges``. Ranges can be specified using
	37	2-tuples or a 1-tuple, such as::
	38
	39	_ranges = [
	40	(0x0020, 0x007e),
	41	(0x00a0, 0x00ff),
	42	(0x0100,),
	43	]
	44
	45	Ranges are left- and right-inclusive. A 1-tuple of (x,) is treated as (x, x).
	46
	47	A unicode set can also be defined using multiple inheritance of other unicode sets::
	48
	49	class CJK(Chinese, Japanese, Korean):
	50	pass
	51	"""
	52
	53	_ranges: UnicodeRangeList = []
	54
	55	@_lazyclassproperty
	56	def _chars_for_ranges(cls):
	57	ret = []
	58	for cc in cls.__mro__:
	59	if cc is unicode_set:
	60	break
	61	for rr in getattr(cc, "_ranges", ()):
	62	ret.extend(range(rr[0], rr[-1] + 1))
	63	return [chr(c) for c in sorted(set(ret))]
	64
65	@_lazyclassproperty
66	def printables(cls):
67	"""all non-whitespace characters in this range"""
68	return "".join(filterfalse(str.isspace, cls._chars_for_ranges))
69
70	@_lazyclassproperty
71	def alphas(cls):
72	"""all alphabetic characters in this range"""
73	return "".join(filter(str.isalpha, cls._chars_for_ranges))
74
75	@_lazyclassproperty
76	def nums(cls):
77	"""all numeric digit characters in this range"""
78	return "".join(filter(str.isdigit, cls._chars_for_ranges))
79
80	@_lazyclassproperty
81	def alphanums(cls):
82	"""all alphanumeric characters in this range"""
83	return cls.alphas + cls.nums
84
85	@_lazyclassproperty
86	def identchars(cls):
87	"""all characters in this range that are valid identifier characters, plus underscore '_'"""
88	return "".join(
89	sorted(
90	set(
91	"".join(filter(str.isidentifier, cls._chars_for_ranges))
92	+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzªµº"
93	+ "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ"
94	+ "_"
95	)
96	)
97	)
98
99	@_lazyclassproperty
100	def identbodychars(cls):
101	"""
102	all characters in this range that are valid identifier body characters,
103	plus the digits 0-9, and · (Unicode MIDDLE DOT)
104	"""
105	return "".join(
106	sorted(
107	set(
108	cls.identchars
109	+ "0123456789·"
110	+ "".join(
111	[c for c in cls._chars_for_ranges if ("_" + c).isidentifier()]
112	)
113	)
114	)
115	)
116
117	@_lazyclassproperty
118	def identifier(cls):
119	"""
120	a pyparsing Word expression for an identifier using this range's definitions for
121	identchars and identbodychars
122	"""
123	from pip._vendor.pyparsing import Word
124
125	return Word(cls.identchars, cls.identbodychars)
126
127
128	class pyparsing_unicode(unicode_set):
129	"""
130	A namespace class for defining common language unicode_sets.
131	"""
132
133	# fmt: off
134
135	# define ranges in language character sets
136	_ranges: UnicodeRangeList = [
137	(0x0020, sys.maxunicode),
138	]
139
140	class BasicMultilingualPlane(unicode_set):
141	"""Unicode set for the Basic Multilingual Plane"""
142	_ranges: UnicodeRangeList = [
143	(0x0020, 0xFFFF),
144	]
145
146	class Latin1(unicode_set):
147	"""Unicode set for Latin-1 Unicode Character Range"""
148	_ranges: UnicodeRangeList = [
149	(0x0020, 0x007E),
150	(0x00A0, 0x00FF),
151	]
152
153	class LatinA(unicode_set):
154	"""Unicode set for Latin-A Unicode Character Range"""
155	_ranges: UnicodeRangeList = [
156	(0x0100, 0x017F),
157	]
158
159	class LatinB(unicode_set):
160	"""Unicode set for Latin-B Unicode Character Range"""
161	_ranges: UnicodeRangeList = [
162	(0x0180, 0x024F),
163	]
164
165	class Greek(unicode_set):
166	"""Unicode set for Greek Unicode Character Ranges"""
167	_ranges: UnicodeRangeList = [
168	(0x0342, 0x0345),
169	(0x0370, 0x0377),
170	(0x037A, 0x037F),
171	(0x0384, 0x038A),
172	(0x038C,),
173	(0x038E, 0x03A1),
174	(0x03A3, 0x03E1),
175	(0x03F0, 0x03FF),
176	(0x1D26, 0x1D2A),
177	(0x1D5E,),
178	(0x1D60,),
179	(0x1D66, 0x1D6A),
180	(0x1F00, 0x1F15),
181	(0x1F18, 0x1F1D),
182	(0x1F20, 0x1F45),
183	(0x1F48, 0x1F4D),
184	(0x1F50, 0x1F57),
185	(0x1F59,),
186	(0x1F5B,),
187	(0x1F5D,),
188	(0x1F5F, 0x1F7D),
189	(0x1F80, 0x1FB4),
190	(0x1FB6, 0x1FC4),
191	(0x1FC6, 0x1FD3),
192	(0x1FD6, 0x1FDB),
193	(0x1FDD, 0x1FEF),
194	(0x1FF2, 0x1FF4),
195	(0x1FF6, 0x1FFE),
196	(0x2129,),
197	(0x2719, 0x271A),
198	(0xAB65,),
199	(0x10140, 0x1018D),
200	(0x101A0,),
201	(0x1D200, 0x1D245),
202	(0x1F7A1, 0x1F7A7),
203	]
204
205	class Cyrillic(unicode_set):
206	"""Unicode set for Cyrillic Unicode Character Range"""
207	_ranges: UnicodeRangeList = [
208	(0x0400, 0x052F),
209	(0x1C80, 0x1C88),
210	(0x1D2B,),
211	(0x1D78,),
212	(0x2DE0, 0x2DFF),
213	(0xA640, 0xA672),
214	(0xA674, 0xA69F),
215	(0xFE2E, 0xFE2F),
216	]
217
218	class Chinese(unicode_set):
219	"""Unicode set for Chinese Unicode Character Range"""
220	_ranges: UnicodeRangeList = [
221	(0x2E80, 0x2E99),
222	(0x2E9B, 0x2EF3),
223	(0x31C0, 0x31E3),
224	(0x3400, 0x4DB5),
225	(0x4E00, 0x9FEF),
226	(0xA700, 0xA707),
227	(0xF900, 0xFA6D),
228	(0xFA70, 0xFAD9),
229	(0x16FE2, 0x16FE3),
230	(0x1F210, 0x1F212),
231	(0x1F214, 0x1F23B),
232	(0x1F240, 0x1F248),
233	(0x20000, 0x2A6D6),
234	(0x2A700, 0x2B734),
235	(0x2B740, 0x2B81D),
236	(0x2B820, 0x2CEA1),
237	(0x2CEB0, 0x2EBE0),
238	(0x2F800, 0x2FA1D),
239	]
240
241	class Japanese(unicode_set):
242	"""Unicode set for Japanese Unicode Character Range, combining Kanji, Hiragana, and Katakana ranges"""
243
244	class Kanji(unicode_set):
245	"Unicode set for Kanji Unicode Character Range"
246	_ranges: UnicodeRangeList = [
247	(0x4E00, 0x9FBF),
248	(0x3000, 0x303F),
249	]
250
251	class Hiragana(unicode_set):
252	"""Unicode set for Hiragana Unicode Character Range"""
253	_ranges: UnicodeRangeList = [
254	(0x3041, 0x3096),
255	(0x3099, 0x30A0),
256	(0x30FC,),
257	(0xFF70,),
258	(0x1B001,),
259	(0x1B150, 0x1B152),
260	(0x1F200,),
261	]
262
263	class Katakana(unicode_set):
264	"""Unicode set for Katakana Unicode Character Range"""
265	_ranges: UnicodeRangeList = [
266	(0x3099, 0x309C),
267	(0x30A0, 0x30FF),
268	(0x31F0, 0x31FF),
269	(0x32D0, 0x32FE),
270	(0xFF65, 0xFF9F),
271	(0x1B000,),
272	(0x1B164, 0x1B167),
273	(0x1F201, 0x1F202),
274	(0x1F213,),
275	]
276
277	漢字 = Kanji
278	カタカナ = Katakana
279	ひらがな = Hiragana
280
281	_ranges = (
282	Kanji._ranges
283	+ Hiragana._ranges
284	+ Katakana._ranges
285	)
286
287	class Hangul(unicode_set):
288	"""Unicode set for Hangul (Korean) Unicode Character Range"""
289	_ranges: UnicodeRangeList = [
290	(0x1100, 0x11FF),
291	(0x302E, 0x302F),
292	(0x3131, 0x318E),
293	(0x3200, 0x321C),
294	(0x3260, 0x327B),
295	(0x327E,),
296	(0xA960, 0xA97C),
297	(0xAC00, 0xD7A3),
298	(0xD7B0, 0xD7C6),
299	(0xD7CB, 0xD7FB),
300	(0xFFA0, 0xFFBE),
301	(0xFFC2, 0xFFC7),
302	(0xFFCA, 0xFFCF),
303	(0xFFD2, 0xFFD7),
304	(0xFFDA, 0xFFDC),
305	]
306
307	Korean = Hangul
308
309	class CJK(Chinese, Japanese, Hangul):
310	"""Unicode set for combined Chinese, Japanese, and Korean (CJK) Unicode Character Range"""
311
312	class Thai(unicode_set):
313	"""Unicode set for Thai Unicode Character Range"""
314	_ranges: UnicodeRangeList = [
315	(0x0E01, 0x0E3A),
316	(0x0E3F, 0x0E5B)
317	]
318
319	class Arabic(unicode_set):
320	"""Unicode set for Arabic Unicode Character Range"""
321	_ranges: UnicodeRangeList = [
322	(0x0600, 0x061B),
323	(0x061E, 0x06FF),
324	(0x0700, 0x077F),
325	]
326
327	class Hebrew(unicode_set):
328	"""Unicode set for Hebrew Unicode Character Range"""
329	_ranges: UnicodeRangeList = [
330	(0x0591, 0x05C7),
331	(0x05D0, 0x05EA),
332	(0x05EF, 0x05F4),
333	(0xFB1D, 0xFB36),
334	(0xFB38, 0xFB3C),
335	(0xFB3E,),
336	(0xFB40, 0xFB41),
337	(0xFB43, 0xFB44),
338	(0xFB46, 0xFB4F),
339	]
340
341	class Devanagari(unicode_set):
342	"""Unicode set for Devanagari Unicode Character Range"""
343	_ranges: UnicodeRangeList = [
344	(0x0900, 0x097F),
345	(0xA8E0, 0xA8FF)
346	]
347
348	BMP = BasicMultilingualPlane
349
350	# add language identifiers using language Unicode
351	العربية = Arabic
352	中文 = Chinese
353	кириллица = Cyrillic
354	Ελληνικά = Greek
355	עִברִית = Hebrew
356	日本語 = Japanese
357	한국어 = Korean
358	ไทย = Thai
359	देवनागरी = Devanagari
360
361	# fmt: on