]>
jfr.im git - dlqueue.git/blob - venv/lib/python3.11/site-packages/pip/_vendor/chardet/macromanprober.py
1 ######################## BEGIN LICENSE BLOCK ########################
2 # This code was modified from latin1prober.py by Rob Speer <rob@lumino.so>.
3 # The Original Code is Mozilla Universal charset detector code.
5 # The Initial Developer of the Original Code is
6 # Netscape Communications Corporation.
7 # Portions created by the Initial Developer are Copyright (C) 2001
8 # the Initial Developer. All Rights Reserved.
11 # Rob Speer - adapt to MacRoman encoding
12 # Mark Pilgrim - port to Python
13 # Shy Shalom - original C code
15 # This library is free software; you can redistribute it and/or
16 # modify it under the terms of the GNU Lesser General Public
17 # License as published by the Free Software Foundation; either
18 # version 2.1 of the License, or (at your option) any later version.
20 # This library is distributed in the hope that it will be useful,
21 # but WITHOUT ANY WARRANTY; without even the implied warranty of
22 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23 # Lesser General Public License for more details.
25 # You should have received a copy of the GNU Lesser General Public
26 # License along with this library; if not, write to the Free Software
27 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
29 ######################### END LICENSE BLOCK #########################
31 from typing
import List
, Union
33 from .charsetprober
import CharSetProber
34 from .enums
import ProbingState
# Character-class identifiers. Each byte value is mapped to one of these by
# MacRoman_CharToClass, and pairs of them (previous class, current class)
# index MacRomanClassModel.
# NOTE(review): classes 0 and 1 (UDF and OTH, per the model's column header)
# and FREQ_CAT_NUM are used further down but are not defined in this span --
# confirm they are defined just above.
ASC = 2  # ascii capital letter
ASS = 3  # ascii small letter
ACV = 4  # accent capital vowel
ACO = 5  # accent capital other
ASV = 6  # accent small vowel
ASO = 7  # accent small other
ODD = 8  # character that is unlikely to appear
CLASS_NUM = 9  # total classes
# The change from Latin1 is that we explicitly look for extended characters
# that are infrequently-occurring symbols, and consider them to always be
# improbable. This should let MacRoman get out of the way of more likely
# encodings in most situations.
# Lookup table from byte value (0x00-0xFF) to character class. It is indexed
# directly with each input byte, so it must contain exactly 256 entries;
# the trailing comment on every row gives the byte range that row covers.
# fmt: off
MacRoman_CharToClass = (
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 00 - 07
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 08 - 0F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 10 - 17
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 18 - 1F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 20 - 27
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 28 - 2F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 30 - 37
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 38 - 3F
    OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  # 40 - 47
    ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  # 48 - 4F
    ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  # 50 - 57
    ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH,  # 58 - 5F
    OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  # 60 - 67
    ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  # 68 - 6F
    ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  # 70 - 77
    ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH,  # 78 - 7F
    ACV, ACV, ACO, ACV, ACO, ACV, ACV, ASV,  # 80 - 87
    ASV, ASV, ASV, ASV, ASV, ASO, ASV, ASV,  # 88 - 8F
    ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASV,  # 90 - 97
    ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV,  # 98 - 9F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, ASO,  # A0 - A7
    OTH, OTH, ODD, ODD, OTH, OTH, ACV, ACV,  # A8 - AF
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # B0 - B7
    OTH, OTH, OTH, OTH, OTH, OTH, ASV, ASV,  # B8 - BF
    OTH, OTH, ODD, OTH, ODD, OTH, OTH, OTH,  # C0 - C7
    OTH, OTH, OTH, ACV, ACV, ACV, ACV, ASV,  # C8 - CF
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, ODD,  # D0 - D7
    ASV, ACV, ODD, OTH, OTH, OTH, OTH, OTH,  # D8 - DF
    OTH, OTH, OTH, OTH, OTH, ACV, ACV, ACV,  # E0 - E7
    ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV,  # E8 - EF
    ODD, ACV, ACV, ACV, ACV, ASV, ODD, ODD,  # F0 - F7
    ODD, ODD, ODD, ODD, ODD, ODD, ODD, ODD,  # F8 - FF
)
# fmt: on
# Transition model stored as a flat, row-major 9 x 9 table. It is looked up as
# MacRomanClassModel[previous_class * CLASS_NUM + current_class], so each row
# is the class of the previous byte and each column the class of the current
# byte. The stored value is a frequency category that is used as an index
# into the prober's frequency counter; the confidence calculation rewards
# category 3 and heavily penalises category 1.
# NOTE(review): category 0 appears to mark an impossible transition that
# rules the encoding out -- confirm against the prober's feed() logic.
# fmt: off
MacRomanClassModel = (
    # UDF OTH ASC ASS ACV ACO ASV ASO ODD
    0, 0, 0, 0, 0, 0, 0, 0, 0,  # UDF
    0, 3, 3, 3, 3, 3, 3, 3, 1,  # OTH
    0, 3, 3, 3, 3, 3, 3, 3, 1,  # ASC
    0, 3, 3, 3, 1, 1, 3, 3, 1,  # ASS
    0, 3, 3, 3, 1, 2, 1, 2, 1,  # ACV
    0, 3, 3, 3, 3, 3, 3, 3, 1,  # ACO
    0, 3, 1, 3, 1, 1, 1, 3, 1,  # ASV
    0, 3, 1, 3, 1, 1, 3, 3, 1,  # ASO
    0, 1, 1, 1, 1, 1, 1, 1, 1,  # ODD
)
# fmt: on
109 class MacRomanProber(CharSetProber
):
110 def __init__(self
) -> None:
112 self
._last
_char
_class
= OTH
113 self
._freq
_counter
: List
[int] = []
116 def reset(self
) -> None:
117 self
._last
_char
_class
= OTH
118 self
._freq
_counter
= [0] * FREQ_CAT_NUM
120 # express the prior that MacRoman is a somewhat rare encoding;
121 # this can be done by starting out in a slightly improbable state
122 # that must be overcome
123 self
._freq
_counter
[2] = 10
128 def charset_name(self
) -> str:
132 def language(self
) -> str:
135 def feed(self
, byte_str
: Union
[bytes, bytearray
]) -> ProbingState
:
136 byte_str
= self
.remove_xml_tags(byte_str
)
138 char_class
= MacRoman_CharToClass
[c
]
139 freq
= MacRomanClassModel
[(self
._last
_char
_class
* CLASS_NUM
) + char_class
]
141 self
._state
= ProbingState
.NOT_ME
143 self
._freq
_counter
[freq
] += 1
144 self
._last
_char
_class
= char_class
148 def get_confidence(self
) -> float:
149 if self
.state
== ProbingState
.NOT_ME
:
152 total
= sum(self
._freq
_counter
)
156 else (self
._freq
_counter
[3] - self
._freq
_counter
[1] * 20.0) / total
158 confidence
= max(confidence
, 0.0)
159 # lower the confidence of MacRoman so that other more accurate
160 # detector can take priority.