File Manager

X7ROOT File Manager

Current Path: /opt/cloudlinux/venv/lib/python3.11/site-packages/charset_normalizer

opt / cloudlinux / venv / lib / python3.11 / site-packages / charset_normalizer /

?? ..
?? __init__.py(1.75 KB)
?? __pycache__
?? api.py(18.74 KB)
?? assets
?? cd.py(10.56 KB)
?? cli
?? constant.py(18.71 KB)
?? legacy.py(3.3 KB)
?? md.py(17.23 KB)
?? models.py(12.86 KB)
?? py.typed(0 B)
?? utils.py(11.5 KB)
?? version.py(79 B)

Editing: md.py

from functools import lru_cache
from typing import List, Optional

from .constant import COMMON_SAFE_ASCII_CHARACTERS, UNICODE_SECONDARY_RANGE_KEYWORD
from .utils import (
    is_accentuated,
    is_ascii,
    is_case_variable,
    is_cjk,
    is_emoticon,
    is_hangul,
    is_hiragana,
    is_katakana,
    is_latin,
    is_punctuation,
    is_separator,
    is_symbol,
    is_thai,
    is_unprintable,
    remove_accent,
    unicode_range,
)

class MessDetectorPlugin:
    """
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement given methods.
    """

def eligible(self, character: str) -> bool:
        """
        Determine if given character should be fed in.
        """
        raise NotImplementedError  # pragma: nocover

def feed(self, character: str) -> None:
        """
        The main routine to be executed upon character.
        Insert the logic in witch the text would be considered chaotic.
        """
        raise NotImplementedError  # pragma: nocover

def reset(self) -> None:  # pragma: no cover
        """
        Permit to reset the plugin to the initial state.
        """
        raise NotImplementedError

@property
    def ratio(self) -> float:
        """
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; No restriction gt 0.
        """
        raise NotImplementedError  # pragma: nocover

class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._punctuation_count: int = 0
        self._symbol_count: int = 0
        self._character_count: int = 0

self._last_printable_char: Optional[str] = None
        self._frenzy_symbol_in_word: bool = False

def eligible(self, character: str) -> bool:
        return character.isprintable()

def feed(self, character: str) -> None:
        self._character_count += 1

if (
            character != self._last_printable_char
            and character not in COMMON_SAFE_ASCII_CHARACTERS
        ):
            if is_punctuation(character):
                self._punctuation_count += 1
            elif (
                character.isdigit() is False
                and is_symbol(character)
                and is_emoticon(character) is False
            ):
                self._symbol_count += 2

self._last_printable_char = character

def reset(self) -> None:  # pragma: no cover
        self._punctuation_count = 0
        self._character_count = 0
        self._symbol_count = 0

@property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

ratio_of_punctuation: float = (
            self._punctuation_count + self._symbol_count
        ) / self._character_count

return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0

class TooManyAccentuatedPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._character_count: int = 0
        self._accentuated_count: int = 0

def eligible(self, character: str) -> bool:
        return character.isalpha()

def feed(self, character: str) -> None:
        self._character_count += 1

if is_accentuated(character):
            self._accentuated_count += 1

def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._accentuated_count = 0

@property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0
        ratio_of_accentuation: float = self._accentuated_count / self._character_count
        return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0

class UnprintablePlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._unprintable_count: int = 0
        self._character_count: int = 0

def eligible(self, character: str) -> bool:
        return True

def feed(self, character: str) -> None:
        if is_unprintable(character):
            self._unprintable_count += 1
        self._character_count += 1

def reset(self) -> None:  # pragma: no cover
        self._unprintable_count = 0

@property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

return (self._unprintable_count * 8) / self._character_count

class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._successive_count: int = 0
        self._character_count: int = 0

self._last_latin_character: Optional[str] = None

def eligible(self, character: str) -> bool:
        return character.isalpha() and is_latin(character)

def feed(self, character: str) -> None:
        self._character_count += 1
        if (
            self._last_latin_character is not None
            and is_accentuated(character)
            and is_accentuated(self._last_latin_character)
        ):
            if character.isupper() and self._last_latin_character.isupper():
                self._successive_count += 1
            # Worse if its the same char duplicated with different accent.
            if remove_accent(character) == remove_accent(self._last_latin_character):
                self._successive_count += 1
        self._last_latin_character = character

def reset(self) -> None:  # pragma: no cover
        self._successive_count = 0
        self._character_count = 0
        self._last_latin_character = None

@property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

return (self._successive_count * 2) / self._character_count

class SuspiciousRange(MessDetectorPlugin):
    def __init__(self) -> None:
        self._suspicious_successive_range_count: int = 0
        self._character_count: int = 0
        self._last_printable_seen: Optional[str] = None

def eligible(self, character: str) -> bool:
        return character.isprintable()

def feed(self, character: str) -> None:
        self._character_count += 1

if (
            character.isspace()
            or is_punctuation(character)
            or character in COMMON_SAFE_ASCII_CHARACTERS
        ):
            self._last_printable_seen = None
            return

if self._last_printable_seen is None:
            self._last_printable_seen = character
            return

unicode_range_a: Optional[str] = unicode_range(self._last_printable_seen)
        unicode_range_b: Optional[str] = unicode_range(character)

if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
            self._suspicious_successive_range_count += 1

self._last_printable_seen = character

def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._suspicious_successive_range_count = 0
        self._last_printable_seen = None

@property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

ratio_of_suspicious_range_usage: float = (
            self._suspicious_successive_range_count * 2
        ) / self._character_count

if ratio_of_suspicious_range_usage < 0.1:
            return 0.0

return ratio_of_suspicious_range_usage

class SuperWeirdWordPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._word_count: int = 0
        self._bad_word_count: int = 0
        self._foreign_long_count: int = 0

self._is_current_word_bad: bool = False
        self._foreign_long_watch: bool = False

self._character_count: int = 0
        self._bad_character_count: int = 0

self._buffer: str = ""
        self._buffer_accent_count: int = 0

def eligible(self, character: str) -> bool:
        return True

def feed(self, character: str) -> None:
        if character.isalpha():
            self._buffer += character
            if is_accentuated(character):
                self._buffer_accent_count += 1
            if (
                self._foreign_long_watch is False
                and (is_latin(character) is False or is_accentuated(character))
                and is_cjk(character) is False
                and is_hangul(character) is False
                and is_katakana(character) is False
                and is_hiragana(character) is False
                and is_thai(character) is False
            ):
                self._foreign_long_watch = True
            return
        if not self._buffer:
            return
        if (
            character.isspace() or is_punctuation(character) or is_separator(character)
        ) and self._buffer:
            self._word_count += 1
            buffer_length: int = len(self._buffer)

self._character_count += buffer_length

if buffer_length >= 4:
                if self._buffer_accent_count / buffer_length > 0.34:
                    self._is_current_word_bad = True
                # Word/Buffer ending with a upper case accentuated letter are so rare,
                # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
                if is_accentuated(self._buffer[-1]) and self._buffer[-1].isupper():
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True
            if buffer_length >= 24 and self._foreign_long_watch:
                self._foreign_long_count += 1
                self._is_current_word_bad = True

if self._is_current_word_bad:
                self._bad_word_count += 1
                self._bad_character_count += len(self._buffer)
                self._is_current_word_bad = False

self._foreign_long_watch = False
            self._buffer = ""
            self._buffer_accent_count = 0
        elif (
            character not in {"<", ">", "-", "=", "~", "|", "_"}
            and character.isdigit() is False
            and is_symbol(character)
        ):
            self._is_current_word_bad = True
            self._buffer += character

def reset(self) -> None:  # pragma: no cover
        self._buffer = ""
        self._is_current_word_bad = False
        self._foreign_long_watch = False
        self._bad_word_count = 0
        self._word_count = 0
        self._character_count = 0
        self._bad_character_count = 0
        self._foreign_long_count = 0

@property
    def ratio(self) -> float:
        if self._word_count <= 10 and self._foreign_long_count == 0:
            return 0.0

return self._bad_character_count / self._character_count

class CjkInvalidStopPlugin(MessDetectorPlugin):
    """
    GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and
    can be easily detected. Searching for the overuse of '丅' and '丄'.
    """

def __init__(self) -> None:
        self._wrong_stop_count: int = 0
        self._cjk_character_count: int = 0

def eligible(self, character: str) -> bool:
        return True

def feed(self, character: str) -> None:
        if character in {"丅", "丄"}:
            self._wrong_stop_count += 1
            return
        if is_cjk(character):
            self._cjk_character_count += 1

def reset(self) -> None:  # pragma: no cover
        self._wrong_stop_count = 0
        self._cjk_character_count = 0

@property
    def ratio(self) -> float:
        if self._cjk_character_count < 16:
            return 0.0
        return self._wrong_stop_count / self._cjk_character_count

class ArchaicUpperLowerPlugin(MessDetectorPlugin):
    def __init__(self) -> None:
        self._buf: bool = False

self._character_count_since_last_sep: int = 0

self._successive_upper_lower_count: int = 0
        self._successive_upper_lower_count_final: int = 0

self._character_count: int = 0

self._last_alpha_seen: Optional[str] = None
        self._current_ascii_only: bool = True

def eligible(self, character: str) -> bool:
        return True

def feed(self, character: str) -> None:
        is_concerned = character.isalpha() and is_case_variable(character)
        chunk_sep = is_concerned is False

if chunk_sep and self._character_count_since_last_sep > 0:
            if (
                self._character_count_since_last_sep <= 64
                and character.isdigit() is False
                and self._current_ascii_only is False
            ):
                self._successive_upper_lower_count_final += (
                    self._successive_upper_lower_count
                )

self._successive_upper_lower_count = 0
            self._character_count_since_last_sep = 0
            self._last_alpha_seen = None
            self._buf = False
            self._character_count += 1
            self._current_ascii_only = True

return

if self._current_ascii_only is True and is_ascii(character) is False:
            self._current_ascii_only = False

if self._last_alpha_seen is not None:
            if (character.isupper() and self._last_alpha_seen.islower()) or (
                character.islower() and self._last_alpha_seen.isupper()
            ):
                if self._buf is True:
                    self._successive_upper_lower_count += 2
                    self._buf = False
                else:
                    self._buf = True
            else:
                self._buf = False

self._character_count += 1
        self._character_count_since_last_sep += 1
        self._last_alpha_seen = character

def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._character_count_since_last_sep = 0
        self._successive_upper_lower_count = 0
        self._successive_upper_lower_count_final = 0
        self._last_alpha_seen = None
        self._buf = False
        self._current_ascii_only = True

@property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

return self._successive_upper_lower_count_final / self._character_count

@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
    unicode_range_a: Optional[str], unicode_range_b: Optional[str]
) -> bool:
    """
    Determine if two Unicode range seen next to each other can be considered as suspicious.
    """
    if unicode_range_a is None or unicode_range_b is None:
        return True

if unicode_range_a == unicode_range_b:
        return False

if "Latin" in unicode_range_a and "Latin" in unicode_range_b:
        return False

if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
        return False

# Latin characters can be accompanied with a combining diacritical mark
    # eg. Vietnamese.
    if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and (
        "Combining" in unicode_range_a or "Combining" in unicode_range_b
    ):
        return False

keywords_range_a, keywords_range_b = unicode_range_a.split(
        " "
    ), unicode_range_b.split(" ")

for el in keywords_range_a:
        if el in UNICODE_SECONDARY_RANGE_KEYWORD:
            continue
        if el in keywords_range_b:
            return False

# Japanese Exception
    range_a_jp_chars, range_b_jp_chars = (
        unicode_range_a
        in (
            "Hiragana",
            "Katakana",
        ),
        unicode_range_b in ("Hiragana", "Katakana"),
    )
    if (range_a_jp_chars or range_b_jp_chars) and (
        "CJK" in unicode_range_a or "CJK" in unicode_range_b
    ):
        return False
    if range_a_jp_chars and range_b_jp_chars:
        return False

if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
        if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
            return False
        if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
            return False

# Chinese/Japanese use dedicated range for punctuation and/or separators.
    if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or (
        unicode_range_a in ["Katakana", "Hiragana"]
        and unicode_range_b in ["Katakana", "Hiragana"]
    ):
        if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
            return False
        if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
            return False

return True

@lru_cache(maxsize=2048)
def mess_ratio(
    decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
    """
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
    """

detectors: List[MessDetectorPlugin] = [
        md_class() for md_class in MessDetectorPlugin.__subclasses__()
    ]

length: int = len(decoded_sequence) + 1

mean_mess_ratio: float = 0.0

if length < 512:
        intermediary_mean_mess_ratio_calc: int = 32
    elif length <= 1024:
        intermediary_mean_mess_ratio_calc = 64
    else:
        intermediary_mean_mess_ratio_calc = 128

for character, index in zip(decoded_sequence + "\n", range(length)):
        for detector in detectors:
            if detector.eligible(character):
                detector.feed(character)

if (
            index > 0 and index % intermediary_mean_mess_ratio_calc == 0
        ) or index == length - 1:
            mean_mess_ratio = sum(dt.ratio for dt in detectors)

if mean_mess_ratio >= maximum_threshold:
                break

if debug:
        for dt in detectors:  # pragma: nocover
            print(dt.__class__, dt.ratio)

return round(mean_mess_ratio, 3)

X7ROOT File Manager

Editing: md.py

Upload File

Create Folder