]>
Commit | Line | Data |
---|---|---|
e0df8241 JR |
1 | ######################## BEGIN LICENSE BLOCK ######################## |
2 | # The Original Code is Mozilla Universal charset detector code. | |
3 | # | |
4 | # The Initial Developer of the Original Code is | |
5 | # Netscape Communications Corporation. | |
6 | # Portions created by the Initial Developer are Copyright (C) 2001 | |
7 | # the Initial Developer. All Rights Reserved. | |
8 | # | |
9 | # Contributor(s): | |
10 | # Mark Pilgrim - port to Python | |
11 | # Shy Shalom - original C code | |
12 | # | |
13 | # This library is free software; you can redistribute it and/or | |
14 | # modify it under the terms of the GNU Lesser General Public | |
15 | # License as published by the Free Software Foundation; either | |
16 | # version 2.1 of the License, or (at your option) any later version. | |
17 | # | |
18 | # This library is distributed in the hope that it will be useful, | |
19 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
20 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
21 | # Lesser General Public License for more details. | |
22 | # | |
23 | # You should have received a copy of the GNU Lesser General Public | |
24 | # License along with this library; if not, write to the Free Software | |
25 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA | |
26 | # 02110-1301 USA | |
27 | ######################### END LICENSE BLOCK ######################### | |
28 | ||
29 | from typing import List, Union | |
30 | ||
31 | from .charsetprober import CharSetProber | |
32 | from .enums import ProbingState | |
33 | ||
# Number of likelihood categories a class-pair can fall into (0-3); it is
# also the length of the per-category counter kept by the prober.
FREQ_CAT_NUM = 4

# Character classes. Each byte value is assigned one of these, and the
# class doubles as a row/column index into the class-pair model.
UDF = 0  # undefined
OTH = 1  # other
ASC = 2  # ascii capital letter
ASS = 3  # ascii small letter
ACV = 4  # accent capital vowel
ACO = 5  # accent capital other
ASV = 6  # accent small vowel
ASO = 7  # accent small other
CLASS_NUM = 8  # total classes
45 | ||
# fmt: off
# Character class of every byte value; indexed directly by the byte
# (0x00-0xFF), eight entries per row.
Latin1_CharToClass = (
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 00 - 07
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 08 - 0F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 10 - 17
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 18 - 1F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 20 - 27
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 28 - 2F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 30 - 37
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 38 - 3F
    OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  # 40 - 47
    ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  # 48 - 4F
    ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  # 50 - 57
    ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH,  # 58 - 5F
    OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  # 60 - 67
    ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  # 68 - 6F
    ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  # 70 - 77
    ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH,  # 78 - 7F
    OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH,  # 80 - 87
    OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF,  # 88 - 8F
    UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 90 - 97
    OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO,  # 98 - 9F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # A0 - A7
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # A8 - AF
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # B0 - B7
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # B8 - BF
    ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO,  # C0 - C7
    ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV,  # C8 - CF
    ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH,  # D0 - D7
    ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO,  # D8 - DF
    ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO,  # E0 - E7
    ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV,  # E8 - EF
    ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH,  # F0 - F7
    ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO,  # F8 - FF
)
81 | ||
# Likelihood of each (previous class, current class) pair, flattened
# row-major: entry [previous * CLASS_NUM + current]. Rows are the previous
# character's class, columns the current character's class.
# 0 : illegal
# 1 : very unlikely
# 2 : normal
# 3 : very likely
Latin1ClassModel = (
#   UDF OTH ASC ASS ACV ACO ASV ASO
    0,  0,  0,  0,  0,  0,  0,  0,  # UDF
    0,  3,  3,  3,  3,  3,  3,  3,  # OTH
    0,  3,  3,  3,  3,  3,  3,  3,  # ASC
    0,  3,  3,  3,  1,  1,  3,  3,  # ASS
    0,  3,  3,  3,  1,  2,  1,  2,  # ACV
    0,  3,  3,  3,  3,  3,  3,  3,  # ACO
    0,  3,  1,  3,  1,  1,  1,  3,  # ASV
    0,  3,  1,  3,  1,  1,  3,  3,  # ASO
)
# fmt: on
98 | ||
99 | ||
class Latin1Prober(CharSetProber):
    """Prober that scores input as ISO-8859-1 from adjacent character classes.

    Each byte is mapped to a character class via ``Latin1_CharToClass``; the
    pair (previous class, current class) is looked up in ``Latin1ClassModel``
    and the resulting likelihood category is tallied. The tallies drive
    ``get_confidence``.
    """

    def __init__(self) -> None:
        super().__init__()
        self._last_char_class = OTH
        self._freq_counter: List[int] = []
        # reset() installs the real, correctly sized counter list.
        self.reset()

    def reset(self) -> None:
        """Clear the pair tallies and the remembered previous class."""
        self._last_char_class = OTH
        self._freq_counter = [0] * FREQ_CAT_NUM
        super().reset()

    @property
    def charset_name(self) -> str:
        """Name of the charset this prober reports."""
        return "ISO-8859-1"

    @property
    def language(self) -> str:
        """Language reported by this prober; always the empty string."""
        return ""

    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
        """Tally class-pair likelihoods for ``byte_str``.

        The input is first passed through ``remove_xml_tags``. Processing
        stops at the first pair the model marks illegal (category 0), at
        which point the prober's state is set to ``ProbingState.NOT_ME``.

        Returns:
            The prober's state (``self.state``) after processing.
        """
        byte_str = self.remove_xml_tags(byte_str)
        for c in byte_str:
            char_class = Latin1_CharToClass[c]
            # Row = previous class, column = current class (row-major table).
            freq = Latin1ClassModel[(self._last_char_class * CLASS_NUM) + char_class]
            if freq == 0:
                self._state = ProbingState.NOT_ME
                break
            self._freq_counter[freq] += 1
            self._last_char_class = char_class

        return self.state

    def get_confidence(self) -> float:
        """Return the confidence that the fed data is ISO-8859-1.

        Returns 0.01 once the prober is in the ``NOT_ME`` state. Otherwise
        the score is (very-likely pairs - 20 * very-unlikely pairs) / total
        pairs, clamped at 0.0 and scaled by 0.73; it is 0.0 when nothing has
        been tallied.
        """
        if self.state == ProbingState.NOT_ME:
            return 0.01

        total = sum(self._freq_counter)
        # Guard against division by zero when no pairs have been counted.
        confidence = (
            0.0
            if total < 0.01
            else (self._freq_counter[3] - self._freq_counter[1] * 20.0) / total
        )
        confidence = max(confidence, 0.0)
        # lower the confidence of latin1 so that other more accurate
        # detector can take priority.
        confidence *= 0.73
        return confidence