From 4f8a964add2da8b7d22fa96063ae8421d238890c Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Thu, 19 Oct 2023 07:52:13 +0200 Subject: [PATCH] :sparkle: Improve the detection around some cases Close #365 #357 #356 --- bin/coverage.py | 4 +++- charset_normalizer/md.py | 7 +++---- charset_normalizer/utils.py | 2 +- charset_normalizer/version.py | 2 +- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/bin/coverage.py b/bin/coverage.py index 94e058cf..e5f07bd5 100644 --- a/bin/coverage.py +++ b/bin/coverage.py @@ -5,7 +5,7 @@ from typing import List import argparse -from charset_normalizer import from_path +from charset_normalizer import from_path, __version__ from charset_normalizer.utils import iana_name from os import sep @@ -40,6 +40,8 @@ def cli_coverage(arguments: List[str]): print("This script require https://github.com/Ousret/char-dataset to be cloned on package root directory") exit(1) + print(f"> using charset-normalizer {__version__}") + success_count = 0 total_count = 0 diff --git a/charset_normalizer/md.py b/charset_normalizer/md.py index a6d9350c..37036f0a 100644 --- a/charset_normalizer/md.py +++ b/charset_normalizer/md.py @@ -233,16 +233,13 @@ def reset(self) -> None: # pragma: no cover @property def ratio(self) -> float: - if self._character_count == 0: + if self._character_count <= 32: return 0.0 ratio_of_suspicious_range_usage: float = ( self._suspicious_successive_range_count * 2 ) / self._character_count - if ratio_of_suspicious_range_usage < 0.1: - return 0.0 - return ratio_of_suspicious_range_usage @@ -521,6 +518,8 @@ def is_suspiciously_successive_range( return False if "Forms" in unicode_range_a or "Forms" in unicode_range_b: return False + if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin": + return False return True diff --git a/charset_normalizer/utils.py b/charset_normalizer/utils.py index 45a402e4..c23d25d8 100644 --- a/charset_normalizer/utils.py +++ b/charset_normalizer/utils.py @@ -96,7 +96,7 @@ def is_symbol(character: str) -> bool: if character_range is None: return False - return "Forms" in character_range + return "Forms" in character_range and character_category != "Lo" @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) diff --git a/charset_normalizer/version.py b/charset_normalizer/version.py index db1ff57a..83683f4c 100644 --- a/charset_normalizer/version.py +++ b/charset_normalizer/version.py @@ -2,5 +2,5 @@ Expose version """ -__version__ = "3.3.0" +__version__ = "3.3.1" VERSION = __version__.split(".")