From 03603646a392090b8bfdba1190fe3352ee396f01 Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Wed, 25 Sep 2024 14:18:15 +0200 Subject: [PATCH] :wrench: improve the detector general reliability Issues (#520) (#509) (#498) (#407) --- CHANGELOG.md | 6 ++---- charset_normalizer/api.py | 16 ++++++++++------ charset_normalizer/constant.py | 2 ++ charset_normalizer/md.py | 19 ++++++++++++++++--- charset_normalizer/version.py | 2 +- 5 files changed, 31 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 404f7e6e..59fe33ff 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,13 +2,11 @@ All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). -## [3.3.3](https://github.com/Ousret/charset_normalizer/compare/3.3.2...master) (2024-03-??) +## [3.3.3](https://github.com/Ousret/charset_normalizer/compare/3.3.2...master) (2024-09-??) ### Fixed - Relax the TypeError exception thrown when trying to compare a CharsetMatch with anything else than a CharsetMatch. - -### Changed -- Optional mypyc compilation upgraded to version 1.9.0 for Python >= 3.8 +- Improved the general reliability of the detector based on user feedback. 
(#520) (#509) (#498) (#407) ## [3.3.2](https://github.com/Ousret/charset_normalizer/compare/3.3.1...3.3.2) (2023-10-31) diff --git a/charset_normalizer/api.py b/charset_normalizer/api.py index 0ba08e3a..b5e4dd51 100644 --- a/charset_normalizer/api.py +++ b/charset_normalizer/api.py @@ -221,16 +221,20 @@ def from_bytes( try: if is_too_large_sequence and is_multi_byte_decoder is False: str( - sequences[: int(50e4)] - if strip_sig_or_bom is False - else sequences[len(sig_payload) : int(50e4)], + ( + sequences[: int(50e4)] + if strip_sig_or_bom is False + else sequences[len(sig_payload) : int(50e4)] + ), encoding=encoding_iana, ) else: decoded_payload = str( - sequences - if strip_sig_or_bom is False - else sequences[len(sig_payload) :], + ( + sequences + if strip_sig_or_bom is False + else sequences[len(sig_payload) :] + ), encoding=encoding_iana, ) except (UnicodeDecodeError, LookupError) as e: diff --git a/charset_normalizer/constant.py b/charset_normalizer/constant.py index 86349046..f8f2a811 100644 --- a/charset_normalizer/constant.py +++ b/charset_normalizer/constant.py @@ -544,6 +544,8 @@ "|", '"', "-", + "(", + ")", } diff --git a/charset_normalizer/md.py b/charset_normalizer/md.py index 77897aae..d834db0e 100644 --- a/charset_normalizer/md.py +++ b/charset_normalizer/md.py @@ -236,7 +236,7 @@ def reset(self) -> None: # pragma: no cover @property def ratio(self) -> float: - if self._character_count <= 24: + if self._character_count <= 13: return 0.0 ratio_of_suspicious_range_usage: float = ( @@ -260,6 +260,7 @@ def __init__(self) -> None: self._buffer: str = "" self._buffer_accent_count: int = 0 + self._buffer_glyph_count: int = 0 def eligible(self, character: str) -> bool: return True @@ -279,6 +280,14 @@ def feed(self, character: str) -> None: and is_thai(character) is False ): self._foreign_long_watch = True + if ( + is_cjk(character) + or is_hangul(character) + or is_katakana(character) + or is_hiragana(character) + or is_thai(character) + ): + 
self._buffer_glyph_count += 1 return if not self._buffer: return @@ -291,17 +300,20 @@ def feed(self, character: str) -> None: self._character_count += buffer_length if buffer_length >= 4: - if self._buffer_accent_count / buffer_length > 0.34: + if self._buffer_accent_count / buffer_length >= 0.5: self._is_current_word_bad = True # Word/Buffer ending with an upper case accentuated letter are so rare, # that we will consider them all as suspicious. Same weight as foreign_long suspicious. - if ( + elif ( is_accentuated(self._buffer[-1]) and self._buffer[-1].isupper() and all(_.isupper() for _ in self._buffer) is False ): self._foreign_long_count += 1 self._is_current_word_bad = True + elif self._buffer_glyph_count == 1: + self._is_current_word_bad = True + self._foreign_long_count += 1 if buffer_length >= 24 and self._foreign_long_watch: camel_case_dst = [ i @@ -325,6 +337,7 @@ def feed(self, character: str) -> None: self._foreign_long_watch = False self._buffer = "" self._buffer_accent_count = 0 + self._buffer_glyph_count = 0 elif ( character not in {"<", ">", "-", "=", "~", "|", "_"} and character.isdigit() is False diff --git a/charset_normalizer/version.py b/charset_normalizer/version.py index 5a4da4ff..709140e5 100644 --- a/charset_normalizer/version.py +++ b/charset_normalizer/version.py @@ -2,5 +2,5 @@ Expose version """ -__version__ = "3.3.2" +__version__ = "3.3.3" VERSION = __version__.split(".")