[dlqueue.git] / venv / lib / python3.11 / site-packages / pip / _vendor / chardet / chardistribution.py

######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Communicator client code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
#   Mark Pilgrim - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301  USA
######################### END LICENSE BLOCK #########################

from typing import Tuple, Union

from .big5freq import (
    BIG5_CHAR_TO_FREQ_ORDER,
    BIG5_TABLE_SIZE,
    BIG5_TYPICAL_DISTRIBUTION_RATIO,
)
from .euckrfreq import (
    EUCKR_CHAR_TO_FREQ_ORDER,
    EUCKR_TABLE_SIZE,
    EUCKR_TYPICAL_DISTRIBUTION_RATIO,
)
from .euctwfreq import (
    EUCTW_CHAR_TO_FREQ_ORDER,
    EUCTW_TABLE_SIZE,
    EUCTW_TYPICAL_DISTRIBUTION_RATIO,
)
from .gb2312freq import (
    GB2312_CHAR_TO_FREQ_ORDER,
    GB2312_TABLE_SIZE,
    GB2312_TYPICAL_DISTRIBUTION_RATIO,
)
from .jisfreq import (
    JIS_CHAR_TO_FREQ_ORDER,
    JIS_TABLE_SIZE,
    JIS_TYPICAL_DISTRIBUTION_RATIO,
)
from .johabfreq import JOHAB_TO_EUCKR_ORDER_TABLE


class CharDistributionAnalysis:
    """Base class for two-byte character frequency distribution analysis.

    Subclasses provide the frequency-order table, its size, the typical
    distribution ratio and a ``get_order`` implementation.  This class keeps
    the running counters and converts them into a confidence value.
    """

    # More than this many counted characters is considered enough data.
    ENOUGH_DATA_THRESHOLD = 1024
    # Upper and lower bounds of the confidence that is ever reported.
    SURE_YES = 0.99
    SURE_NO = 0.01
    # At most this many frequent characters always yields SURE_NO.
    MINIMUM_DATA_THRESHOLD = 3

    def __init__(self) -> None:
        # Table translating a character order (see get_order()) into a
        # frequency order, together with the number of entries in it.
        self._char_to_freq_order: Tuple[int, ...] = ()
        self._table_size = 0
        # Language dependent constant used when computing the confidence.
        # See
        # http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
        # for further detail.
        self.typical_distribution_ratio = 0.0
        self._done = False
        self._total_chars = 0
        self._freq_chars = 0
        self.reset()

    def reset(self) -> None:
        """Clear the counters and the done flag."""
        # Number of characters whose frequency order is below 512.
        self._freq_chars = 0
        # Number of characters that produced a valid (non-negative) order.
        self._total_chars = 0
        # True would mean detection is finished and a conclusion was made.
        self._done = False

    def feed(self, char: Union[bytes, bytearray], char_len: int) -> None:
        """Account for one character of known byte length.

        Only characters with ``char_len == 2`` are looked up; anything else
        is ignored, as is any character whose order is negative.
        """
        if char_len != 2:
            return
        order = self.get_order(char)
        if order < 0:
            return
        self._total_chars += 1
        # Orders past the end of the table count towards the total only.
        if order < self._table_size and self._char_to_freq_order[order] < 512:
            self._freq_chars += 1

    def get_confidence(self) -> float:
        """Return a confidence between SURE_NO and SURE_YES for the data fed."""
        total = self._total_chars
        frequent = self._freq_chars
        # Nothing in our consideration range, or too few frequent
        # characters: answer negatively.
        if total <= 0 or frequent <= self.MINIMUM_DATA_THRESHOLD:
            return self.SURE_NO

        infrequent = total - frequent
        if infrequent != 0:
            ratio = frequent / (infrequent * self.typical_distribution_ratio)
            if ratio < self.SURE_YES:
                return ratio

        # Cap the result: we never want to report being 100% sure.
        return self.SURE_YES

    def got_enough_data(self) -> bool:
        """Return True once more than ENOUGH_DATA_THRESHOLD characters counted."""
        # A conclusion can be drawn without seeing all of the data; a
        # certain amount is enough for charset detection.
        enough = self._total_chars > self.ENOUGH_DATA_THRESHOLD
        return enough

    def get_order(self, _: Union[bytes, bytearray]) -> int:
        """Return the order of a character; the base class always gives -1.

        Characters are not handled through their encoded bytes directly but
        through a number, called the order, derived from them.  That lets
        several encodings of one language share a single frequency table.
        """
        return -1


class EUCTWDistributionAnalysis(CharDistributionAnalysis):
    """Distribution analysis configured with the EUC-TW frequency data."""

    def __init__(self) -> None:
        super().__init__()
        self.typical_distribution_ratio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
        self._table_size = EUCTW_TABLE_SIZE
        self._char_to_freq_order = EUCTW_CHAR_TO_FREQ_ORDER

    def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
        """Return the order of a two-byte EUC-TW character, or -1.

        Bytes of interest for EUC-TW:
          first  byte range: 0xc4 -- 0xfe
          second byte range: 0xa1 -- 0xfe
        Nothing is validated here; the state machine has done that.
        """
        lead = byte_str[0]
        if lead < 0xC4:
            return -1
        row = lead - 0xC4
        # 94 consecutive orders are reserved per lead byte.
        return 94 * row + byte_str[1] - 0xA1


class EUCKRDistributionAnalysis(CharDistributionAnalysis):
    """Distribution analysis configured with the EUC-KR frequency data."""

    def __init__(self) -> None:
        super().__init__()
        self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
        self._table_size = EUCKR_TABLE_SIZE
        self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER

    def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
        """Return the order of a two-byte EUC-KR character, or -1.

        Bytes of interest for EUC-KR:
          first  byte range: 0xb0 -- 0xfe
          second byte range: 0xa1 -- 0xfe
        Nothing is validated here; the state machine has done that.
        """
        lead = byte_str[0]
        if lead < 0xB0:
            return -1
        row = lead - 0xB0
        # 94 consecutive orders are reserved per lead byte.
        return 94 * row + byte_str[1] - 0xA1


class JOHABDistributionAnalysis(CharDistributionAnalysis):
    """Distribution analysis for Johab that reuses the EUC-KR frequency data."""

    def __init__(self) -> None:
        super().__init__()
        # Johab shares the EUC-KR table; get_order() translates Johab codes
        # into EUC-KR orders.
        self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
        self._table_size = EUCKR_TABLE_SIZE
        self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER

    def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
        """Return the EUC-KR order for a two-byte Johab character, or -1.

        Only lead bytes from 0x88 up to (not including) 0xD4 are looked up;
        codes missing from the translation table also give -1.
        """
        lead = byte_str[0]
        if not 0x88 <= lead < 0xD4:
            return -1
        # Lookup key is the 16-bit code: lead byte high, trail byte low.
        johab_code = 256 * lead + byte_str[1]
        return JOHAB_TO_EUCKR_ORDER_TABLE.get(johab_code, -1)


class GB2312DistributionAnalysis(CharDistributionAnalysis):
    """Distribution analysis configured with the GB2312 frequency data."""

    def __init__(self) -> None:
        super().__init__()
        self.typical_distribution_ratio = GB2312_TYPICAL_DISTRIBUTION_RATIO
        self._table_size = GB2312_TABLE_SIZE
        self._char_to_freq_order = GB2312_CHAR_TO_FREQ_ORDER

    def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
        """Return the order of a two-byte GB2312 character, or -1.

        Bytes of interest for GB2312:
          first  byte range: 0xb0 -- 0xfe
          second byte range: 0xa1 -- 0xfe
        Nothing further is validated; the state machine has done that.
        """
        lead, trail = byte_str[0], byte_str[1]
        if lead < 0xB0 or trail < 0xA1:
            return -1
        # 94 consecutive orders are reserved per lead byte.
        return 94 * (lead - 0xB0) + trail - 0xA1


class Big5DistributionAnalysis(CharDistributionAnalysis):
    """Distribution analysis configured with the Big5 frequency data."""

    def __init__(self) -> None:
        super().__init__()
        self.typical_distribution_ratio = BIG5_TYPICAL_DISTRIBUTION_RATIO
        self._table_size = BIG5_TABLE_SIZE
        self._char_to_freq_order = BIG5_CHAR_TO_FREQ_ORDER

    def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
        """Return the order of a two-byte Big5 character, or -1.

        Bytes of interest for Big5:
          first  byte range: 0xa4 -- 0xfe
          second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
        Nothing is validated here; the state machine has done that.
        """
        lead, trail = byte_str[0], byte_str[1]
        if lead < 0xA4:
            return -1
        # 157 consecutive orders are reserved per lead byte.
        row_start = 157 * (lead - 0xA4)
        if trail >= 0xA1:
            # The upper trail range follows the 63 slots (0x40 -- 0x7e) of
            # the lower one.
            return row_start + trail - 0xA1 + 63
        return row_start + trail - 0x40


class SJISDistributionAnalysis(CharDistributionAnalysis):
    """Distribution analysis for Shift_JIS, using the shared JIS frequency data."""

    def __init__(self) -> None:
        super().__init__()
        self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER
        self._table_size = JIS_TABLE_SIZE
        self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
        """Return the order of a two-byte Shift_JIS character, or -1.

        Lead bytes 0x81 -- 0x9f and 0xe0 -- 0xef are handled; any other lead
        byte gives -1.  Each lead byte owns 188 consecutive orders: trail
        bytes 0x40 -- 0x7e map to offsets 0 -- 62 and trail bytes
        0x80 -- 0xfc map to offsets 63 -- 187.
        No validation is done here; the state machine has done that.
        """
        first_char, second_char = byte_str[0], byte_str[1]
        if 0x81 <= first_char <= 0x9F:
            row = first_char - 0x81
        elif 0xE0 <= first_char <= 0xEF:
            # The second lead-byte range continues after the 31 rows
            # (0x81 -- 0x9f) of the first one.
            row = first_char - 0xE0 + 31
        else:
            return -1
        order = 188 * row + second_char - 0x40
        if second_char > 0x7F:
            # The slot for trail byte 0x7f is unused, so the upper trail
            # range is shifted down by one to keep the 188 offsets
            # contiguous.  The earlier code assigned -1 here, which dropped
            # every character with a trail byte above 0x7f from the
            # statistics; the C++ implementation this was ported from
            # decrements instead.
            order -= 1
        return order


class EUCJPDistributionAnalysis(CharDistributionAnalysis):
    """Distribution analysis for EUC-JP, using the shared JIS frequency data."""

    def __init__(self) -> None:
        super().__init__()
        self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO
        self._table_size = JIS_TABLE_SIZE
        self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER

    def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
        """Return the order of a two-byte EUC-JP character, or -1.

        Bytes of interest for EUC-JP:
          first  byte range: 0xa0 -- 0xfe
          second byte range: 0xa1 -- 0xfe
        Nothing is validated here; the state machine has done that.
        """
        lead = byte_str[0]
        if lead < 0xA0:
            return -1
        # NOTE(review): the guard admits 0xA0 while the row offset is 0xA1,
        # so a lead byte of 0xA0 produces row -1.  Kept exactly as it was.
        row = lead - 0xA1
        # 94 consecutive orders are reserved per lead byte.
        return 94 * row + byte_str[1] - 0xA1
Commit	Line	Data
e0df8241 JR	1	######################## BEGIN LICENSE BLOCK ########################
	2	# The Original Code is Mozilla Communicator client code.
	3	#
	4	# The Initial Developer of the Original Code is
	5	# Netscape Communications Corporation.
	6	# Portions created by the Initial Developer are Copyright (C) 1998
	7	# the Initial Developer. All Rights Reserved.
	8	#
	9	# Contributor(s):
	10	# Mark Pilgrim - port to Python
	11	#
	12	# This library is free software; you can redistribute it and/or
	13	# modify it under the terms of the GNU Lesser General Public
	14	# License as published by the Free Software Foundation; either
	15	# version 2.1 of the License, or (at your option) any later version.
	16	#
	17	# This library is distributed in the hope that it will be useful,
	18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	20	# Lesser General Public License for more details.
	21	#
	22	# You should have received a copy of the GNU Lesser General Public
	23	# License along with this library; if not, write to the Free Software
	24	# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
	25	# 02110-1301 USA
	26	######################### END LICENSE BLOCK #########################
	27
	28	from typing import Tuple, Union
	29
	30	from .big5freq import (
	31	BIG5_CHAR_TO_FREQ_ORDER,
	32	BIG5_TABLE_SIZE,
	33	BIG5_TYPICAL_DISTRIBUTION_RATIO,
	34	)
	35	from .euckrfreq import (
	36	EUCKR_CHAR_TO_FREQ_ORDER,
	37	EUCKR_TABLE_SIZE,
	38	EUCKR_TYPICAL_DISTRIBUTION_RATIO,
	39	)
	40	from .euctwfreq import (
	41	EUCTW_CHAR_TO_FREQ_ORDER,
	42	EUCTW_TABLE_SIZE,
	43	EUCTW_TYPICAL_DISTRIBUTION_RATIO,
	44	)
	45	from .gb2312freq import (
	46	GB2312_CHAR_TO_FREQ_ORDER,
	47	GB2312_TABLE_SIZE,
	48	GB2312_TYPICAL_DISTRIBUTION_RATIO,
	49	)
	50	from .jisfreq import (
	51	JIS_CHAR_TO_FREQ_ORDER,
	52	JIS_TABLE_SIZE,
	53	JIS_TYPICAL_DISTRIBUTION_RATIO,
	54	)
	55	from .johabfreq import JOHAB_TO_EUCKR_ORDER_TABLE
	56
	57
	58	class CharDistributionAnalysis:
	59	ENOUGH_DATA_THRESHOLD = 1024
	60	SURE_YES = 0.99
	61	SURE_NO = 0.01
	62	MINIMUM_DATA_THRESHOLD = 3
	63
	64	def __init__(self) -> None:
65	# Mapping table to get frequency order from char order (get from
66	# GetOrder())
67	self._char_to_freq_order: Tuple[int, ...] = tuple()
68	self._table_size = 0 # Size of above table
69	# This is a constant value which varies from language to language,
70	# used in calculating confidence. See
71	# http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
72	# for further detail.
73	self.typical_distribution_ratio = 0.0
74	self._done = False
75	self._total_chars = 0
76	self._freq_chars = 0
77	self.reset()
78
79	def reset(self) -> None:
80	"""reset analyser, clear any state"""
81	# If this flag is set to True, detection is done and conclusion has
82	# been made
83	self._done = False
84	self._total_chars = 0 # Total characters encountered
85	# The number of characters whose frequency order is less than 512
86	self._freq_chars = 0
87
88	def feed(self, char: Union[bytes, bytearray], char_len: int) -> None:
89	"""feed a character with known length"""
90	if char_len == 2:
91	# we only care about 2-bytes character in our distribution analysis
92	order = self.get_order(char)
93	else:
94	order = -1
95	if order >= 0:
96	self._total_chars += 1
97	# order is valid
98	if order < self._table_size:
99	if 512 > self._char_to_freq_order[order]:
100	self._freq_chars += 1
101
102	def get_confidence(self) -> float:
103	"""return confidence based on existing data"""
104	# if we didn't receive any character in our consideration range,
105	# return negative answer
106	if self._total_chars <= 0 or self._freq_chars <= self.MINIMUM_DATA_THRESHOLD:
107	return self.SURE_NO
108
109	if self._total_chars != self._freq_chars:
110	r = self._freq_chars / (
111	(self._total_chars - self._freq_chars) * self.typical_distribution_ratio
112	)
113	if r < self.SURE_YES:
114	return r
115
116	# normalize confidence (we don't want to be 100% sure)
117	return self.SURE_YES
118
119	def got_enough_data(self) -> bool:
120	# It is not necessary to receive all data to draw conclusion.
121	# For charset detection, certain amount of data is enough
122	return self._total_chars > self.ENOUGH_DATA_THRESHOLD
123
124	def get_order(self, _: Union[bytes, bytearray]) -> int:
125	# We do not handle characters based on the original encoding string,
126	# but convert this encoding string to a number, here called order.
127	# This allows multiple encodings of a language to share one frequency
128	# table.
129	return -1
130
131
132	class EUCTWDistributionAnalysis(CharDistributionAnalysis):
133	def __init__(self) -> None:
134	super().__init__()
135	self._char_to_freq_order = EUCTW_CHAR_TO_FREQ_ORDER
136	self._table_size = EUCTW_TABLE_SIZE
137	self.typical_distribution_ratio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
138
139	def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
140	# for euc-TW encoding, we are interested
141	# first byte range: 0xc4 -- 0xfe
142	# second byte range: 0xa1 -- 0xfe
143	# no validation needed here. State machine has done that
144	first_char = byte_str[0]
145	if first_char >= 0xC4:
146	return 94 * (first_char - 0xC4) + byte_str[1] - 0xA1
147	return -1
148
149
150	class EUCKRDistributionAnalysis(CharDistributionAnalysis):
151	def __init__(self) -> None:
152	super().__init__()
153	self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER
154	self._table_size = EUCKR_TABLE_SIZE
155	self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
156
157	def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
158	# for euc-KR encoding, we are interested
159	# first byte range: 0xb0 -- 0xfe
160	# second byte range: 0xa1 -- 0xfe
161	# no validation needed here. State machine has done that
162	first_char = byte_str[0]
163	if first_char >= 0xB0:
164	return 94 * (first_char - 0xB0) + byte_str[1] - 0xA1
165	return -1
166
167
168	class JOHABDistributionAnalysis(CharDistributionAnalysis):
169	def __init__(self) -> None:
170	super().__init__()
171	self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER
172	self._table_size = EUCKR_TABLE_SIZE
173	self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
174
175	def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
176	first_char = byte_str[0]
177	if 0x88 <= first_char < 0xD4:
178	code = first_char * 256 + byte_str[1]
179	return JOHAB_TO_EUCKR_ORDER_TABLE.get(code, -1)
180	return -1
181
182
183	class GB2312DistributionAnalysis(CharDistributionAnalysis):
184	def __init__(self) -> None:
185	super().__init__()
186	self._char_to_freq_order = GB2312_CHAR_TO_FREQ_ORDER
187	self._table_size = GB2312_TABLE_SIZE
188	self.typical_distribution_ratio = GB2312_TYPICAL_DISTRIBUTION_RATIO
189
190	def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
191	# for GB2312 encoding, we are interested
192	# first byte range: 0xb0 -- 0xfe
193	# second byte range: 0xa1 -- 0xfe
194	# no validation needed here. State machine has done that
195	first_char, second_char = byte_str[0], byte_str[1]
196	if (first_char >= 0xB0) and (second_char >= 0xA1):
197	return 94 * (first_char - 0xB0) + second_char - 0xA1
198	return -1
199
200
201	class Big5DistributionAnalysis(CharDistributionAnalysis):
202	def __init__(self) -> None:
203	super().__init__()
204	self._char_to_freq_order = BIG5_CHAR_TO_FREQ_ORDER
205	self._table_size = BIG5_TABLE_SIZE
206	self.typical_distribution_ratio = BIG5_TYPICAL_DISTRIBUTION_RATIO
207
208	def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
209	# for big5 encoding, we are interested
210	# first byte range: 0xa4 -- 0xfe
211	# second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
212	# no validation needed here. State machine has done that
213	first_char, second_char = byte_str[0], byte_str[1]
214	if first_char >= 0xA4:
215	if second_char >= 0xA1:
216	return 157 * (first_char - 0xA4) + second_char - 0xA1 + 63
217	return 157 * (first_char - 0xA4) + second_char - 0x40
218	return -1
219
220
221	class SJISDistributionAnalysis(CharDistributionAnalysis):
222	def __init__(self) -> None:
223	super().__init__()
224	self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER
225	self._table_size = JIS_TABLE_SIZE
226	self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO
227
228	def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
229	# for sjis encoding, we are interested
230	# first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
231	# second byte range: 0x40 -- 0x7e, 0x81 -- oxfe
232	# no validation needed here. State machine has done that
233	first_char, second_char = byte_str[0], byte_str[1]
234	if 0x81 <= first_char <= 0x9F:
235	order = 188 * (first_char - 0x81)
236	elif 0xE0 <= first_char <= 0xEF:
237	order = 188 * (first_char - 0xE0 + 31)
238	else:
239	return -1
240	order = order + second_char - 0x40
241	if second_char > 0x7F:
242	order = -1
243	return order
244
245
246	class EUCJPDistributionAnalysis(CharDistributionAnalysis):
247	def __init__(self) -> None:
248	super().__init__()
249	self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER
250	self._table_size = JIS_TABLE_SIZE
251	self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO
252
253	def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
254	# for euc-JP encoding, we are interested
255	# first byte range: 0xa0 -- 0xfe
256	# second byte range: 0xa1 -- 0xfe
257	# no validation needed here. State machine has done that
258	char = byte_str[0]
259	if char >= 0xA0:
260	return 94 * (char - 0xA1) + byte_str[1] - 0xA1
261	return -1