]>
jfr.im git - dlqueue.git/blob - venv/lib/python3.11/site-packages/pip/_vendor/chardet/escprober.py
1 ######################## BEGIN LICENSE BLOCK ########################
2 # The Original Code is mozilla.org code.
4 # The Initial Developer of the Original Code is
5 # Netscape Communications Corporation.
6 # Portions created by the Initial Developer are Copyright (C) 1998
7 # the Initial Developer. All Rights Reserved.
10 # Mark Pilgrim - port to Python
12 # This library is free software; you can redistribute it and/or
13 # modify it under the terms of the GNU Lesser General Public
14 # License as published by the Free Software Foundation; either
15 # version 2.1 of the License, or (at your option) any later version.
17 # This library is distributed in the hope that it will be useful,
18 # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 # Lesser General Public License for more details.
22 # You should have received a copy of the GNU Lesser General Public
23 # License along with this library; if not, write to the Free Software
24 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26 ######################### END LICENSE BLOCK #########################
28 from typing
import Optional
, Union
30 from .charsetprober
import CharSetProber
31 from .codingstatemachine
import CodingStateMachine
32 from .enums
import LanguageFilter
, MachineState
, ProbingState
41 class EscCharSetProber(CharSetProber
):
43 This CharSetProber uses a "code scheme" approach for detecting encodings,
44 whereby easily recognizable escape or shift sequences are relied on to
45 identify these encodings.
def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
    """Set up the prober and register state machines for the enabled languages.

    Args:
        lang_filter: Value tested with ``&`` against ``LanguageFilter``
            members to decide which escape-sequence state machines are
            registered. Defaults to ``LanguageFilter.NONE``.
    """
    super().__init__(lang_filter=lang_filter)
    # NOTE(review): the listing numbers embedded in this view skip line 50,
    # and no visible statement creates ``self.coding_sm`` before it is
    # appended to below. Presumably it is initialised on the missing line or
    # by the base class -- confirm against the upstream file.
    if self.lang_filter & LanguageFilter.CHINESE_SIMPLIFIED:
        self.coding_sm.append(CodingStateMachine(HZ_SM_MODEL))
        self.coding_sm.append(CodingStateMachine(ISO2022CN_SM_MODEL))
    if self.lang_filter & LanguageFilter.JAPANESE:
        self.coding_sm.append(CodingStateMachine(ISO2022JP_SM_MODEL))
    if self.lang_filter & LanguageFilter.KOREAN:
        self.coding_sm.append(CodingStateMachine(ISO2022KR_SM_MODEL))
    self.active_sm_count = 0
    self._detected_charset: Optional[str] = None
    self._detected_language: Optional[str] = None
    self._state = ProbingState.DETECTING
    # NOTE(review): listing lines 62-63 are also absent from this view, so a
    # trailing statement may be missing here -- confirm against upstream.
def reset(self) -> None:
    """Re-activate every registered state machine and clear the detection result.

    After this call ``active_sm_count`` equals the number of registered state
    machines and both the detected charset and detected language are ``None``.
    """
    # NOTE(review): the listing numbers embedded in this view skip lines 65
    # and 68, so statements may be missing (one before the loop, one inside
    # it) -- confirm against the upstream file.
    for coding_sm in self.coding_sm:
        coding_sm.active = True
    self.active_sm_count = len(self.coding_sm)
    self._detected_charset = None
    self._detected_language = None
# NOTE(review): the listing line immediately before this definition (73) is
# absent from this view; if it carried a decorator, restore it from upstream.
def charset_name(self) -> Optional[str]:
    """Return the detected charset name, or ``None`` if nothing was detected."""
    return self._detected_charset
# NOTE(review): the listing line immediately before this definition (77) is
# absent from this view; if it carried a decorator, restore it from upstream.
def language(self) -> Optional[str]:
    """Return the detected language, or ``None`` if nothing was detected."""
    return self._detected_language
def get_confidence(self) -> float:
    """Return 0.99 when a charset has been detected, otherwise 0.00."""
    return 0.99 if self._detected_charset else 0.00
84 def feed(self
, byte_str
: Union
[bytes, bytearray
]) -> ProbingState
:
86 for coding_sm
in self
.coding_sm
:
87 if not coding_sm
.active
:
89 coding_state
= coding_sm
.next_state(c
)
90 if coding_state
== MachineState
.ERROR
:
91 coding_sm
.active
= False
92 self
.active_sm_count
-= 1
93 if self
.active_sm_count
<= 0:
94 self
._state
= ProbingState
.NOT_ME
96 elif coding_state
== MachineState
.ITS_ME
:
97 self
._state
= ProbingState
.FOUND_IT
98 self
._detected
_charset
= coding_sm
.get_coding_state_machine()
99 self
._detected
_language
= coding_sm
.language