X7ROOT File Manager
Current Path:
/opt/cloudlinux/venv/lib/python3.11/site-packages/charset_normalizer
opt
/
cloudlinux
/
venv
/
lib
/
python3.11
/
site-packages
/
charset_normalizer
/
??
..
??
__init__.py
(1.75 KB)
??
__pycache__
??
api.py
(18.74 KB)
??
assets
??
cd.py
(10.56 KB)
??
cli
??
constant.py
(18.71 KB)
??
legacy.py
(3.3 KB)
??
md.py
(17.23 KB)
??
models.py
(12.86 KB)
??
py.typed
(0 B)
??
utils.py
(11.5 KB)
??
version.py
(79 B)
Editing: md.py
from functools import lru_cache from typing import List, Optional from .constant import COMMON_SAFE_ASCII_CHARACTERS, UNICODE_SECONDARY_RANGE_KEYWORD from .utils import ( is_accentuated, is_ascii, is_case_variable, is_cjk, is_emoticon, is_hangul, is_hiragana, is_katakana, is_latin, is_punctuation, is_separator, is_symbol, is_thai, is_unprintable, remove_accent, unicode_range, ) class MessDetectorPlugin: """ Base abstract class used for mess detection plugins. All detectors MUST extend and implement given methods. """ def eligible(self, character: str) -> bool: """ Determine if given character should be fed in. """ raise NotImplementedError # pragma: nocover def feed(self, character: str) -> None: """ The main routine to be executed upon character. Insert the logic in witch the text would be considered chaotic. """ raise NotImplementedError # pragma: nocover def reset(self) -> None: # pragma: no cover """ Permit to reset the plugin to the initial state. """ raise NotImplementedError @property def ratio(self) -> float: """ Compute the chaos ratio based on what your feed() has seen. Must NOT be lower than 0.; No restriction gt 0. """ raise NotImplementedError # pragma: nocover class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin): def __init__(self) -> None: self._punctuation_count: int = 0 self._symbol_count: int = 0 self._character_count: int = 0 self._last_printable_char: Optional[str] = None self._frenzy_symbol_in_word: bool = False def eligible(self, character: str) -> bool: return character.isprintable() def feed(self, character: str) -> None: self._character_count += 1 if ( character != self._last_printable_char and character not in COMMON_SAFE_ASCII_CHARACTERS ): if is_punctuation(character): self._punctuation_count += 1 elif ( character.isdigit() is False and is_symbol(character) and is_emoticon(character) is False ): self._symbol_count += 2 self._last_printable_char = character def reset(self) -> None: # pragma: no cover self._punctuation_count = 0 self._character_count = 0 self._symbol_count = 0 @property def ratio(self) -> float: if self._character_count == 0: return 0.0 ratio_of_punctuation: float = ( self._punctuation_count + self._symbol_count ) / self._character_count return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0 class TooManyAccentuatedPlugin(MessDetectorPlugin): def __init__(self) -> None: self._character_count: int = 0 self._accentuated_count: int = 0 def eligible(self, character: str) -> bool: return character.isalpha() def feed(self, character: str) -> None: self._character_count += 1 if is_accentuated(character): self._accentuated_count += 1 def reset(self) -> None: # pragma: no cover self._character_count = 0 self._accentuated_count = 0 @property def ratio(self) -> float: if self._character_count == 0: return 0.0 ratio_of_accentuation: float = self._accentuated_count / self._character_count return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0 class UnprintablePlugin(MessDetectorPlugin): def __init__(self) -> None: self._unprintable_count: int = 0 self._character_count: int = 0 def eligible(self, character: str) -> bool: return True def feed(self, character: str) -> None: if is_unprintable(character): self._unprintable_count += 1 self._character_count += 1 def reset(self) -> None: # pragma: no cover self._unprintable_count = 0 @property def ratio(self) -> float: if self._character_count == 0: return 0.0 return (self._unprintable_count * 8) / self._character_count class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin): def __init__(self) -> None: self._successive_count: int = 0 self._character_count: int = 0 self._last_latin_character: Optional[str] = None def eligible(self, character: str) -> bool: return character.isalpha() and is_latin(character) def feed(self, character: str) -> None: self._character_count += 1 if ( self._last_latin_character is not None and is_accentuated(character) and is_accentuated(self._last_latin_character) ): if character.isupper() and self._last_latin_character.isupper(): self._successive_count += 1 # Worse if its the same char duplicated with different accent. if remove_accent(character) == remove_accent(self._last_latin_character): self._successive_count += 1 self._last_latin_character = character def reset(self) -> None: # pragma: no cover self._successive_count = 0 self._character_count = 0 self._last_latin_character = None @property def ratio(self) -> float: if self._character_count == 0: return 0.0 return (self._successive_count * 2) / self._character_count class SuspiciousRange(MessDetectorPlugin): def __init__(self) -> None: self._suspicious_successive_range_count: int = 0 self._character_count: int = 0 self._last_printable_seen: Optional[str] = None def eligible(self, character: str) -> bool: return character.isprintable() def feed(self, character: str) -> None: self._character_count += 1 if ( character.isspace() or is_punctuation(character) or character in COMMON_SAFE_ASCII_CHARACTERS ): self._last_printable_seen = None return if self._last_printable_seen is None: self._last_printable_seen = character return unicode_range_a: Optional[str] = unicode_range(self._last_printable_seen) unicode_range_b: Optional[str] = unicode_range(character) if is_suspiciously_successive_range(unicode_range_a, unicode_range_b): self._suspicious_successive_range_count += 1 self._last_printable_seen = character def reset(self) -> None: # pragma: no cover self._character_count = 0 self._suspicious_successive_range_count = 0 self._last_printable_seen = None @property def ratio(self) -> float: if self._character_count == 0: return 0.0 ratio_of_suspicious_range_usage: float = ( self._suspicious_successive_range_count * 2 ) / self._character_count if ratio_of_suspicious_range_usage < 0.1: return 0.0 return ratio_of_suspicious_range_usage class SuperWeirdWordPlugin(MessDetectorPlugin): def __init__(self) -> None: self._word_count: int = 0 self._bad_word_count: int = 0 self._foreign_long_count: int = 0 self._is_current_word_bad: bool = False self._foreign_long_watch: bool = False self._character_count: int = 0 self._bad_character_count: int = 0 self._buffer: str = "" self._buffer_accent_count: int = 0 def eligible(self, character: str) -> bool: return True def feed(self, character: str) -> None: if character.isalpha(): self._buffer += character if is_accentuated(character): self._buffer_accent_count += 1 if ( self._foreign_long_watch is False and (is_latin(character) is False or is_accentuated(character)) and is_cjk(character) is False and is_hangul(character) is False and is_katakana(character) is False and is_hiragana(character) is False and is_thai(character) is False ): self._foreign_long_watch = True return if not self._buffer: return if ( character.isspace() or is_punctuation(character) or is_separator(character) ) and self._buffer: self._word_count += 1 buffer_length: int = len(self._buffer) self._character_count += buffer_length if buffer_length >= 4: if self._buffer_accent_count / buffer_length > 0.34: self._is_current_word_bad = True # Word/Buffer ending with a upper case accentuated letter are so rare, # that we will consider them all as suspicious. Same weight as foreign_long suspicious. if is_accentuated(self._buffer[-1]) and self._buffer[-1].isupper(): self._foreign_long_count += 1 self._is_current_word_bad = True if buffer_length >= 24 and self._foreign_long_watch: self._foreign_long_count += 1 self._is_current_word_bad = True if self._is_current_word_bad: self._bad_word_count += 1 self._bad_character_count += len(self._buffer) self._is_current_word_bad = False self._foreign_long_watch = False self._buffer = "" self._buffer_accent_count = 0 elif ( character not in {"<", ">", "-", "=", "~", "|", "_"} and character.isdigit() is False and is_symbol(character) ): self._is_current_word_bad = True self._buffer += character def reset(self) -> None: # pragma: no cover self._buffer = "" self._is_current_word_bad = False self._foreign_long_watch = False self._bad_word_count = 0 self._word_count = 0 self._character_count = 0 self._bad_character_count = 0 self._foreign_long_count = 0 @property def ratio(self) -> float: if self._word_count <= 10 and self._foreign_long_count == 0: return 0.0 return self._bad_character_count / self._character_count class CjkInvalidStopPlugin(MessDetectorPlugin): """ GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and can be easily detected. Searching for the overuse of '丅' and '丄'. """ def __init__(self) -> None: self._wrong_stop_count: int = 0 self._cjk_character_count: int = 0 def eligible(self, character: str) -> bool: return True def feed(self, character: str) -> None: if character in {"丅", "丄"}: self._wrong_stop_count += 1 return if is_cjk(character): self._cjk_character_count += 1 def reset(self) -> None: # pragma: no cover self._wrong_stop_count = 0 self._cjk_character_count = 0 @property def ratio(self) -> float: if self._cjk_character_count < 16: return 0.0 return self._wrong_stop_count / self._cjk_character_count class ArchaicUpperLowerPlugin(MessDetectorPlugin): def __init__(self) -> None: self._buf: bool = False self._character_count_since_last_sep: int = 0 self._successive_upper_lower_count: int = 0 self._successive_upper_lower_count_final: int = 0 self._character_count: int = 0 self._last_alpha_seen: Optional[str] = None self._current_ascii_only: bool = True def eligible(self, character: str) -> bool: return True def feed(self, character: str) -> None: is_concerned = character.isalpha() and is_case_variable(character) chunk_sep = is_concerned is False if chunk_sep and self._character_count_since_last_sep > 0: if ( self._character_count_since_last_sep <= 64 and character.isdigit() is False and self._current_ascii_only is False ): self._successive_upper_lower_count_final += ( self._successive_upper_lower_count ) self._successive_upper_lower_count = 0 self._character_count_since_last_sep = 0 self._last_alpha_seen = None self._buf = False self._character_count += 1 self._current_ascii_only = True return if self._current_ascii_only is True and is_ascii(character) is False: self._current_ascii_only = False if self._last_alpha_seen is not None: if (character.isupper() and self._last_alpha_seen.islower()) or ( character.islower() and self._last_alpha_seen.isupper() ): if self._buf is True: self._successive_upper_lower_count += 2 self._buf = False else: self._buf = True else: self._buf = False self._character_count += 1 self._character_count_since_last_sep += 1 self._last_alpha_seen = character def reset(self) -> None: # pragma: no cover self._character_count = 0 self._character_count_since_last_sep = 0 self._successive_upper_lower_count = 0 self._successive_upper_lower_count_final = 0 self._last_alpha_seen = None self._buf = False self._current_ascii_only = True @property def ratio(self) -> float: if self._character_count == 0: return 0.0 return self._successive_upper_lower_count_final / self._character_count @lru_cache(maxsize=1024) def is_suspiciously_successive_range( unicode_range_a: Optional[str], unicode_range_b: Optional[str] ) -> bool: """ Determine if two Unicode range seen next to each other can be considered as suspicious. """ if unicode_range_a is None or unicode_range_b is None: return True if unicode_range_a == unicode_range_b: return False if "Latin" in unicode_range_a and "Latin" in unicode_range_b: return False if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b: return False # Latin characters can be accompanied with a combining diacritical mark # eg. Vietnamese. if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and ( "Combining" in unicode_range_a or "Combining" in unicode_range_b ): return False keywords_range_a, keywords_range_b = unicode_range_a.split( " " ), unicode_range_b.split(" ") for el in keywords_range_a: if el in UNICODE_SECONDARY_RANGE_KEYWORD: continue if el in keywords_range_b: return False # Japanese Exception range_a_jp_chars, range_b_jp_chars = ( unicode_range_a in ( "Hiragana", "Katakana", ), unicode_range_b in ("Hiragana", "Katakana"), ) if (range_a_jp_chars or range_b_jp_chars) and ( "CJK" in unicode_range_a or "CJK" in unicode_range_b ): return False if range_a_jp_chars and range_b_jp_chars: return False if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b: if "CJK" in unicode_range_a or "CJK" in unicode_range_b: return False if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin": return False # Chinese/Japanese use dedicated range for punctuation and/or separators. if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or ( unicode_range_a in ["Katakana", "Hiragana"] and unicode_range_b in ["Katakana", "Hiragana"] ): if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b: return False if "Forms" in unicode_range_a or "Forms" in unicode_range_b: return False return True @lru_cache(maxsize=2048) def mess_ratio( decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False ) -> float: """ Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier. """ detectors: List[MessDetectorPlugin] = [ md_class() for md_class in MessDetectorPlugin.__subclasses__() ] length: int = len(decoded_sequence) + 1 mean_mess_ratio: float = 0.0 if length < 512: intermediary_mean_mess_ratio_calc: int = 32 elif length <= 1024: intermediary_mean_mess_ratio_calc = 64 else: intermediary_mean_mess_ratio_calc = 128 for character, index in zip(decoded_sequence + "\n", range(length)): for detector in detectors: if detector.eligible(character): detector.feed(character) if ( index > 0 and index % intermediary_mean_mess_ratio_calc == 0 ) or index == length - 1: mean_mess_ratio = sum(dt.ratio for dt in detectors) if mean_mess_ratio >= maximum_threshold: break if debug: for dt in detectors: # pragma: nocover print(dt.__class__, dt.ratio) return round(mean_mess_ratio, 3)
Upload File
Create Folder