venv/lib/python3.11/site-packages/pip/_vendor/chardet/macromanprober.py

   1 ######################## BEGIN LICENSE BLOCK ########################
   2 # This code was modified from latin1prober.py by Rob Speer <rob@lumino.so>.
   3 # The Original Code is Mozilla Universal charset detector code.
   4 #
   5 # The Initial Developer of the Original Code is
   6 # Netscape Communications Corporation.
   7 # Portions created by the Initial Developer are Copyright (C) 2001
   8 # the Initial Developer. All Rights Reserved.
   9 #
  10 # Contributor(s):
  11 #   Rob Speer - adapt to MacRoman encoding
  12 #   Mark Pilgrim - port to Python
  13 #   Shy Shalom - original C code
  14 #
  15 # This library is free software; you can redistribute it and/or
  16 # modify it under the terms of the GNU Lesser General Public
  17 # License as published by the Free Software Foundation; either
  18 # version 2.1 of the License, or (at your option) any later version.
  19 #
  20 # This library is distributed in the hope that it will be useful,
  21 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  22 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  23 # Lesser General Public License for more details.
  24 #
  25 # You should have received a copy of the GNU Lesser General Public
  26 # License along with this library; if not, write to the Free Software
  27 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
  28 # 02110-1301  USA
  29 ######################### END LICENSE BLOCK #########################
  30
  31 from typing import List, Union
  32
  33 from .charsetprober import CharSetProber
  34 from .enums import ProbingState
  35
  36 FREQ_CAT_NUM = 4
  37
  38 UDF = 0  # undefined
  39 OTH = 1  # other
  40 ASC = 2  # ascii capital letter
  41 ASS = 3  # ascii small letter
  42 ACV = 4  # accent capital vowel
  43 ACO = 5  # accent capital other
  44 ASV = 6  # accent small vowel
  45 ASO = 7  # accent small other
  46 ODD = 8  # character that is unlikely to appear
  47 CLASS_NUM = 9  # total classes
  48
  49 # The change from Latin1 is that we explicitly look for extended characters
  50 # that are infrequently-occurring symbols, and consider them to always be
  51 # improbable. This should let MacRoman get out of the way of more likely
  52 # encodings in most situations.
  53
  54 # fmt: off
  55 MacRoman_CharToClass = (
  56     OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 00 - 07
  57     OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 08 - 0F
  58     OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 10 - 17
  59     OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 18 - 1F
  60     OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 20 - 27
  61     OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 28 - 2F
  62     OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 30 - 37
  63     OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 38 - 3F
  64     OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  # 40 - 47
  65     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  # 48 - 4F
  66     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  # 50 - 57
  67     ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH,  # 58 - 5F
  68     OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  # 60 - 67
  69     ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  # 68 - 6F
  70     ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  # 70 - 77
  71     ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH,  # 78 - 7F
  72     ACV, ACV, ACO, ACV, ACO, ACV, ACV, ASV,  # 80 - 87
  73     ASV, ASV, ASV, ASV, ASV, ASO, ASV, ASV,  # 88 - 8F
  74     ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASV,  # 90 - 97
  75     ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV,  # 98 - 9F
  76     OTH, OTH, OTH, OTH, OTH, OTH, OTH, ASO,  # A0 - A7
  77     OTH, OTH, ODD, ODD, OTH, OTH, ACV, ACV,  # A8 - AF
  78     OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # B0 - B7
  79     OTH, OTH, OTH, OTH, OTH, OTH, ASV, ASV,  # B8 - BF
  80     OTH, OTH, ODD, OTH, ODD, OTH, OTH, OTH,  # C0 - C7
  81     OTH, OTH, OTH, ACV, ACV, ACV, ACV, ASV,  # C8 - CF
  82     OTH, OTH, OTH, OTH, OTH, OTH, OTH, ODD,  # D0 - D7
  83     ASV, ACV, ODD, OTH, OTH, OTH, OTH, OTH,  # D8 - DF
  84     OTH, OTH, OTH, OTH, OTH, ACV, ACV, ACV,  # E0 - E7
  85     ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV,  # E8 - EF
  86     ODD, ACV, ACV, ACV, ACV, ASV, ODD, ODD,  # F0 - F7
  87     ODD, ODD, ODD, ODD, ODD, ODD, ODD, ODD,  # F8 - FF
  88 )
  89
  90 # 0 : illegal
  91 # 1 : very unlikely
  92 # 2 : normal
  93 # 3 : very likely
  94 MacRomanClassModel = (
  95 # UDF OTH ASC ASS ACV ACO ASV ASO ODD
  96     0,  0,  0,  0,  0,  0,  0,  0,  0,  # UDF
  97     0,  3,  3,  3,  3,  3,  3,  3,  1,  # OTH
  98     0,  3,  3,  3,  3,  3,  3,  3,  1,  # ASC
  99     0,  3,  3,  3,  1,  1,  3,  3,  1,  # ASS
 100     0,  3,  3,  3,  1,  2,  1,  2,  1,  # ACV
 101     0,  3,  3,  3,  3,  3,  3,  3,  1,  # ACO
 102     0,  3,  1,  3,  1,  1,  1,  3,  1,  # ASV
 103     0,  3,  1,  3,  1,  1,  3,  3,  1,  # ASO
 104     0,  1,  1,  1,  1,  1,  1,  1,  1,  # ODD
 105 )
 106 # fmt: on
 107
 108
 109 class MacRomanProber(CharSetProber):
 110     def __init__(self) -> None:
 111         super().__init__()
 112         self._last_char_class = OTH
 113         self._freq_counter: List[int] = []
 114         self.reset()
 115
 116     def reset(self) -> None:
 117         self._last_char_class = OTH
 118         self._freq_counter = [0] * FREQ_CAT_NUM
 119
 120         # express the prior that MacRoman is a somewhat rare encoding;
 121         # this can be done by starting out in a slightly improbable state
 122         # that must be overcome
 123         self._freq_counter[2] = 10
 124
 125         super().reset()
 126
 127     @property
 128     def charset_name(self) -> str:
 129         return "MacRoman"
 130
 131     @property
 132     def language(self) -> str:
 133         return ""
 134
 135     def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
 136         byte_str = self.remove_xml_tags(byte_str)
 137         for c in byte_str:
 138             char_class = MacRoman_CharToClass[c]
 139             freq = MacRomanClassModel[(self._last_char_class * CLASS_NUM) + char_class]
 140             if freq == 0:
 141                 self._state = ProbingState.NOT_ME
 142                 break
 143             self._freq_counter[freq] += 1
 144             self._last_char_class = char_class
 145
 146         return self.state
 147
 148     def get_confidence(self) -> float:
 149         if self.state == ProbingState.NOT_ME:
 150             return 0.01
 151
 152         total = sum(self._freq_counter)
 153         confidence = (
 154             0.0
 155             if total < 0.01
 156             else (self._freq_counter[3] - self._freq_counter[1] * 20.0) / total
 157         )
 158         confidence = max(confidence, 0.0)
 159         # lower the confidence of MacRoman so that other more accurate
 160         # detector can take priority.
 161         confidence *= 0.73
 162         return confidence