Skip to content

Commit

Permalink
❇️ Improve the detection around some cases
Browse files Browse the repository at this point in the history
Close #365 #357 #356
  • Loading branch information
Ousret committed Oct 19, 2023
1 parent 165211a commit 4f8a964
Show file tree
Hide file tree
Showing 4 changed files with 8 additions and 7 deletions.
4 changes: 3 additions & 1 deletion bin/coverage.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from typing import List
import argparse

-from charset_normalizer import from_path
+from charset_normalizer import from_path, __version__
from charset_normalizer.utils import iana_name

from os import sep
Expand Down Expand Up @@ -40,6 +40,8 @@ def cli_coverage(arguments: List[str]):
print("This script require https://github.com/Ousret/char-dataset to be cloned on package root directory")
exit(1)

+print(f"> using charset-normalizer {__version__}")
+
success_count = 0
total_count = 0

Expand Down
7 changes: 3 additions & 4 deletions charset_normalizer/md.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,16 +233,13 @@ def reset(self) -> None: # pragma: no cover

@property
def ratio(self) -> float:
-if self._character_count == 0:
+if self._character_count <= 32:
return 0.0

ratio_of_suspicious_range_usage: float = (
self._suspicious_successive_range_count * 2
) / self._character_count

-if ratio_of_suspicious_range_usage < 0.1:
-return 0.0
-
return ratio_of_suspicious_range_usage


Expand Down Expand Up @@ -521,6 +518,8 @@ def is_suspiciously_successive_range(
return False
if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
return False
+if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
+return False

return True

Expand Down
2 changes: 1 addition & 1 deletion charset_normalizer/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def is_symbol(character: str) -> bool:
if character_range is None:
return False

-return "Forms" in character_range
+return "Forms" in character_range and character_category != "Lo"


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
Expand Down
2 changes: 1 addition & 1 deletion charset_normalizer/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@
Expose version
"""

-__version__ = "3.3.0"
+__version__ = "3.3.1"
VERSION = __version__.split(".")

0 comments on commit 4f8a964

Please sign in to comment.