❇️ Improve the detection around some cases

Closes #365, #357, #356
jawah committed on Oct 19, 2023
commit 4f8a964
1 parent 165211a
Show file tree

Hide file tree

Showing 4 changed files with 8 additions and 7 deletions.
diff --git a/bin/coverage.py b/bin/coverage.py
@@ -5,7 +5,7 @@
 from typing import List
 import argparse
 
-from charset_normalizer import from_path
+from charset_normalizer import from_path, __version__
 from charset_normalizer.utils import iana_name
 
 from os import sep
@@ -40,6 +40,8 @@ def cli_coverage(arguments: List[str]):
         print("This script require https://github.com/Ousret/char-dataset to be cloned on package root directory")
         exit(1)
 
+    print(f"> using charset-normalizer {__version__}")
+
     success_count = 0
     total_count = 0
 

diff --git a/charset_normalizer/md.py b/charset_normalizer/md.py
@@ -233,16 +233,13 @@ def reset(self) -> None:  # pragma: no cover
 
     @property
     def ratio(self) -> float:
-        if self._character_count == 0:
+        if self._character_count <= 32:
             return 0.0
 
         ratio_of_suspicious_range_usage: float = (
             self._suspicious_successive_range_count * 2
         ) / self._character_count
 
-        if ratio_of_suspicious_range_usage < 0.1:
-            return 0.0
-
         return ratio_of_suspicious_range_usage
 
 
@@ -521,6 +518,8 @@ def is_suspiciously_successive_range(
             return False
         if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
             return False
+        if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
+            return False
 
     return True
 

diff --git a/charset_normalizer/utils.py b/charset_normalizer/utils.py
@@ -96,7 +96,7 @@ def is_symbol(character: str) -> bool:
     if character_range is None:
         return False
 
-    return "Forms" in character_range
+    return "Forms" in character_range and character_category != "Lo"
 
 
 @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)

diff --git a/charset_normalizer/version.py b/charset_normalizer/version.py
@@ -2,5 +2,5 @@
 Expose version
 """
 
-__version__ = "3.3.0"
+__version__ = "3.3.1"
 VERSION = __version__.split(".")