diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml
index 5b79b6e4..75ef10a6 100644
--- a/.github/workflows/cd.yml
+++ b/.github/workflows/cd.yml
@@ -26,10 +26,10 @@ jobs:
         uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
         with:
           python-version: '3.11'
-      - name: Update pip, setuptools, wheel, build and twine
+      - name: Update pip, install build
         run: |
           python -m pip install --upgrade pip
-          python -m pip install setuptools wheel build
+          python -m pip install build
       - name: Build Wheel
         env:
           CHARSET_NORMALIZER_USE_MYPYC: '0'
@@ -83,10 +83,9 @@ jobs:
       - name: Build wheels
         uses: pypa/cibuildwheel@7940a4c0e76eb2030e473a5f864f291f63ee879b # v2.21.3
         env:
-          CIBW_BUILD_FRONTEND: "pip; args: --no-build-isolation"
+          CIBW_BUILD_FRONTEND: build
           CIBW_ARCHS_MACOS: x86_64 arm64 universal2
           CIBW_ENVIRONMENT: CHARSET_NORMALIZER_USE_MYPYC='1'
-          CIBW_BEFORE_BUILD: pip install -r build-requirements.txt
           CIBW_TEST_REQUIRES: pytest
           CIBW_TEST_COMMAND: pytest -c {package} {package}/tests
           CIBW_SKIP: pp* cp36*
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 70d7b8fb..23433aab 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -25,18 +25,9 @@ jobs:
           python -m pip install -U pip setuptools
           python -m pip install -r dev-requirements.txt
           python -m pip uninstall -y charset-normalizer
-      - name: Type checking (Mypy)
+      - name: Pre-commit checks
         run: |
-          mypy --strict charset_normalizer
-      - name: Import sorting check (isort)
-        run: |
-          isort --check charset_normalizer
-      - name: Code format (Black)
-        run: |
-          black --check --diff --target-version=py37 charset_normalizer
-      - name: Style guide enforcement (Flake8)
-        run: |
-          flake8 charset_normalizer
+          pre-commit run --all

   tests:
     name: ✅ Tests

@@ -68,7 +59,7 @@ jobs:
           python -m pip uninstall -y charset-normalizer
       - name: Install the package
         run: |
-          python -m build --no-isolation
+          python -m build
           python -m pip install ./dist/*.whl
       - name: Run tests
         run: |
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 00000000..09fa625e
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,31 @@
+exclude: 'docs/|data/|tests/'
+
+repos:
+  - repo: https://github.com/asottile/pyupgrade
+    rev: v3.3.1
+    hooks:
+      - id: pyupgrade
+        args: ["--py37-plus"]
+
+  - repo: https://github.com/psf/black
+    rev: 23.1.0
+    hooks:
+      - id: black
+        args: ["--target-version", "py37"]
+
+  - repo: https://github.com/PyCQA/isort
+    rev: 5.12.0
+    hooks:
+      - id: isort
+
+  - repo: https://github.com/PyCQA/flake8
+    rev: 6.1.0
+    hooks:
+      - id: flake8
+        additional_dependencies: [flake8-2020]
+
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: v1.13.0
+    hooks:
+      - id: mypy
+        exclude: 'tests/|bin/'
diff --git a/CHANGELOG.md b/CHANGELOG.md
index d7cd7e1a..608567e4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,18 @@ All notable changes to charset-normalizer will be documented in this file.
 This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

+## [3.4.1](https://github.com/Ousret/charset_normalizer/compare/3.4.0...master) (2024-10-??)
+
+### Changed
+- Project metadata is now stored in `pyproject.toml` instead of `setup.cfg`, keeping setuptools as the build backend.
+- Enforce delayed annotation evaluation (`from __future__ import annotations`) for simpler, more consistent type hints across the project.
+
+### Added
+- pre-commit configuration.
+
+### Removed
+- `build-requirements.txt`, superseded by the native build configuration in `pyproject.toml`.
+
 ## [3.4.0](https://github.com/Ousret/charset_normalizer/compare/3.3.2...3.4.0) (2024-10-08)

 ### Added
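For context on the "delayed annotation evaluation" changelog entry above: with `from __future__ import annotations` (PEP 563), annotations are stored as plain strings and are never evaluated at definition time, which is what allows the files below to replace spellings such as `Optional[List[str]]` with `list[str] | None` while still supporting Python 3.7. A minimal sketch of the effect, using a hypothetical `guess` function that is not part of the project:

    # Hypothetical example; only the future import is taken from this diff.
    from __future__ import annotations  # PEP 563: postponed annotation evaluation

    def guess(payload: bytes, cp_isolation: list[str] | None = None) -> str | None:
        # Without the future import, evaluating `list[str] | None` when the
        # function is defined would raise a TypeError on Python 3.7 and 3.8.
        return "utf_8" if cp_isolation is None else cp_isolation[0]

    # The annotation is kept verbatim as a string:
    print(guess.__annotations__["cp_isolation"])  # list[str] | None

This is also why the new isort hook is configured with `add_imports = "from __future__ import annotations"` in `pyproject.toml` further down.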
diff --git a/bin/bc.py b/bin/bc.py
index df289433..4eacc1c4 100644
--- a/bin/bc.py
+++ b/bin/bc.py
@@ -1,13 +1,14 @@
 #!/bin/python
+from __future__ import annotations
+
+import argparse
 from glob import glob
 from os.path import isdir
 from sys import argv
-from typing import List
-import argparse
-from charset_normalizer import detect as tbt_detect
 from chardet import detect as chardet_detect
+from charset_normalizer import detect as tbt_detect
 from charset_normalizer.utils import iana_name
@@ -16,28 +17,35 @@ def calc_equivalence(content: bytes, cp_a: str, cp_b: str):
         str_a = content.decode(cp_a)
         str_b = content.decode(cp_b)
     except UnicodeDecodeError:
-        return 0.
+        return 0.0

     character_count = len(str_a)
-    diff_character_count = sum(
-        chr_a != chr_b for chr_a, chr_b in zip(str_a, str_b)
-    )
+    diff_character_count = sum(chr_a != chr_b for chr_a, chr_b in zip(str_a, str_b))

-    return 1. - (diff_character_count / character_count)
+    return 1.0 - (diff_character_count / character_count)


-def cli_bc(arguments: List[str]):
+def cli_bc(arguments: list[str]):
     parser = argparse.ArgumentParser(
         description="BC script checker for Charset-Normalizer with Chardet"
     )
-    parser.add_argument('-c', '--coverage', action="store", default=85, type=int, dest='coverage',
-                        help="Define the minimum acceptable coverage to succeed")
+    parser.add_argument(
+        "-c",
+        "--coverage",
+        action="store",
+        default=85,
+        type=int,
+        dest="coverage",
+        help="Define the minimum acceptable coverage to succeed",
+    )

     args = parser.parse_args(arguments)

     if not isdir("./char-dataset"):
-        print("This script require https://github.com/Ousret/char-dataset to be cloned on package root directory")
+        print(
+            "This script require https://github.com/Ousret/char-dataset to be cloned on package root directory"
+        )
         exit(1)

     success_count = 0
@@ -50,44 +58,52 @@
             content = fp.read()

         chardet_result = chardet_detect(content)
-        chardet_encoding = chardet_result['encoding']
+        chardet_encoding = chardet_result["encoding"]

         charset_normalizer_result = tbt_detect(content)
-        charset_normalizer_encoding = charset_normalizer_result['encoding']
+        charset_normalizer_encoding = charset_normalizer_result["encoding"]

         if [chardet_encoding, charset_normalizer_encoding].count(None) == 1:
-            print("⚡⚡ '{}' (BC-Break) New('{}') vs Legacy('{}')".format(tbt_path, charset_normalizer_encoding, chardet_encoding))
+            print(
+                f"⚡⚡ '{tbt_path}' (BC-Break) New('{charset_normalizer_encoding}') vs Legacy('{chardet_encoding}')"
+            )
             continue

         if charset_normalizer_encoding == chardet_encoding:
             success_count += 1
-            print("✅✅ '{}' (BC)".format(tbt_path))
+            print(f"✅✅ '{tbt_path}' (BC)")
             continue

-        if (chardet_encoding is None and charset_normalizer_encoding is None) or (iana_name(chardet_encoding, False) == iana_name(charset_normalizer_encoding, False)):
+        if (chardet_encoding is None and charset_normalizer_encoding is None) or (
+            iana_name(chardet_encoding, False)
+            == iana_name(charset_normalizer_encoding, False)
+        ):
             success_count += 1
-            print("✅✅ '{}' (BC)".format(tbt_path))
+            print(f"✅✅ '{tbt_path}' (BC)")
             continue

-        calc_eq = calc_equivalence(content, chardet_encoding, charset_normalizer_encoding)
+        calc_eq = calc_equivalence(
+            content, chardet_encoding, charset_normalizer_encoding
+        )

         if calc_eq >= 0.98:
             success_count += 1
-            print("️✅ 
️'{}' (got '{}' but eq {} WITH {} %)".format(tbt_path, charset_normalizer_encoding, chardet_encoding, round(calc_eq * 100., 3))) + print( + f"️✅ ️'{tbt_path}' (got '{charset_normalizer_encoding}' but " + f"eq {chardet_encoding} WITH {round(calc_eq * 100., 3)} %)" + ) continue - print("⚡⚡ '{}' (BC-Break) New('{}') vs Legacy('{}')".format(tbt_path, charset_normalizer_encoding, chardet_encoding)) + print( + f"⚡⚡ '{tbt_path}' (BC-Break) New('{charset_normalizer_encoding}') vs Legacy('{chardet_encoding}')" + ) - success_ratio = round(success_count / total_count, 2) * 100. + success_ratio = round(success_count / total_count, 2) * 100.0 - print("Total EST BC = {} % ({} / {} files)".format(success_ratio, success_count, total_count)) + print(f"Total EST BC = {success_ratio} % ({success_count} / {total_count} files)") return 0 if success_ratio >= args.coverage else 1 if __name__ == "__main__": - exit( - cli_bc( - argv[1:] - ) - ) + exit(cli_bc(argv[1:])) diff --git a/bin/coverage.py b/bin/coverage.py index e5f07bd5..e5ba0110 100644 --- a/bin/coverage.py +++ b/bin/coverage.py @@ -1,43 +1,55 @@ #!/bin/python +from __future__ import annotations + +import argparse from glob import glob +from os import sep from os.path import isdir from sys import argv -from typing import List -import argparse -from charset_normalizer import from_path, __version__ +from charset_normalizer import __version__, from_path from charset_normalizer.utils import iana_name -from os import sep - def calc_equivalence(content: bytes, cp_a: str, cp_b: str): str_a = content.decode(cp_a) str_b = content.decode(cp_b) character_count = len(str_a) - diff_character_count = sum( - chr_a != chr_b for chr_a, chr_b in zip(str_a, str_b) - ) - + diff_character_count = sum(chr_a != chr_b for chr_a, chr_b in zip(str_a, str_b)) - return 1. 
- (diff_character_count / character_count) + return 1.0 - (diff_character_count / character_count) -def cli_coverage(arguments: List[str]): +def cli_coverage(arguments: list[str]): parser = argparse.ArgumentParser( description="Embedded detection success coverage script checker for Charset-Normalizer" ) - parser.add_argument('-p', '--with-preemptive', action="store_true", default=False, dest='preemptive', - help='Enable the preemptive scan behaviour during coverage check') - parser.add_argument('-c', '--coverage', action="store", default=90, type=int, dest='coverage', - help="Define the minimum acceptable coverage to succeed") + parser.add_argument( + "-p", + "--with-preemptive", + action="store_true", + default=False, + dest="preemptive", + help="Enable the preemptive scan behaviour during coverage check", + ) + parser.add_argument( + "-c", + "--coverage", + action="store", + default=90, + type=int, + dest="coverage", + help="Define the minimum acceptable coverage to succeed", + ) args = parser.parse_args(arguments) if not isdir("./char-dataset"): - print("This script require https://github.com/Ousret/char-dataset to be cloned on package root directory") + print( + "This script require https://github.com/Ousret/char-dataset to be cloned on package root directory" + ) exit(1) print(f"> using charset-normalizer {__version__}") @@ -46,28 +58,27 @@ def cli_coverage(arguments: List[str]): total_count = 0 for tbt_path in sorted(glob("./char-dataset/**/*.*")): - expected_encoding = tbt_path.split(sep)[-2] total_count += 1 - results = from_path( - tbt_path, - preemptive_behaviour=args.preemptive - ) + results = from_path(tbt_path, preemptive_behaviour=args.preemptive) if expected_encoding == "None" and len(results) == 0: - print("✅✅ '{}'".format(tbt_path)) + print(f"✅✅ '{tbt_path}'") success_count += 1 continue if len(results) == 0: - print("⚡⚡ '{}' (nothing)".format(tbt_path)) + print(f"⚡⚡ '{tbt_path}' (nothing)") continue result = results.best() - if expected_encoding in result.could_be_from_charset or iana_name(expected_encoding) in result.could_be_from_charset: - print("✅✅ '{}'".format(tbt_path)) + if ( + expected_encoding in result.could_be_from_charset + or iana_name(expected_encoding) in result.could_be_from_charset + ): + print(f"✅✅ '{tbt_path}'") success_count += 1 continue @@ -75,21 +86,21 @@ def cli_coverage(arguments: List[str]): if calc_eq >= 0.98: success_count += 1 - print("️✅ ️'{}' (got '{}' but equivalence {} %)".format(tbt_path, result.encoding, round(calc_eq * 100., 3))) + print( + f"️✅ ️'{tbt_path}' (got '{result.encoding}' but equivalence {round(calc_eq * 100., 3)} %)" + ) continue - print("⚡ '{}' (got '{}')".format(tbt_path, result.encoding)) + print(f"⚡ '{tbt_path}' (got '{result.encoding}')") - success_ratio = round(success_count / total_count, 2) * 100. 
+ success_ratio = round(success_count / total_count, 2) * 100.0 - print("Total EST coverage = {} % ({} / {} files)".format(success_ratio, success_count, total_count)) + print( + f"Total EST coverage = {success_ratio} % ({success_count} / {total_count} files)" + ) return 0 if success_ratio >= args.coverage else 1 if __name__ == "__main__": - exit( - cli_coverage( - argv[1:] - ) - ) + exit(cli_coverage(argv[1:])) diff --git a/bin/integration.py b/bin/integration.py index b186313a..7313ae5e 100644 --- a/bin/integration.py +++ b/bin/integration.py @@ -1,20 +1,20 @@ -from requests import get, __version__ -from typing import List -from charset_normalizer import detect, __version__ as __version_cn__ +from __future__ import annotations -if __name__ == "__main__": +from requests import __version__, get + +from charset_normalizer import __version__ as __version_cn__ +from charset_normalizer import detect +if __name__ == "__main__": print(f"requests {__version__}") print(f"charset_normalizer {__version_cn__}") - files: List[str] = get("http://127.0.0.1:8080/").json() + files: list[str] = get("http://127.0.0.1:8080/").json() print("## Testing with actual files") for file in files: - r = get( - "http://127.0.0.1:8080/" + file - ) + r = get("http://127.0.0.1:8080/" + file) if r.ok is False: print(f"Unable to retrieve '{file}' | HTTP/{r.status_code}") @@ -23,7 +23,9 @@ expected_encoding = detect(r.content)["encoding"] if expected_encoding != r.apparent_encoding: - print(f"Integration test failed | File '{file}' | Expected '{expected_encoding}' got '{r.apparent_encoding}'") + print( + f"Integration test failed | File '{file}' | Expected '{expected_encoding}' got '{r.apparent_encoding}'" + ) exit(1) print(f"✅✅ '{file}' OK") diff --git a/bin/performance.py b/bin/performance.py index ff715fd1..41195b8f 100644 --- a/bin/performance.py +++ b/bin/performance.py @@ -1,15 +1,17 @@ #!/bin/python -from glob import glob -from time import perf_counter_ns +from __future__ import annotations + import argparse -from sys import argv +from glob import glob +from math import ceil from os.path import isdir +from statistics import mean, stdev +from sys import argv +from time import perf_counter_ns -from charset_normalizer import detect from chardet import detect as chardet_detect -from statistics import mean, stdev -from math import ceil +from charset_normalizer import detect def calc_percentile(data, percentile): @@ -66,7 +68,8 @@ def performance_compare(arguments): charset_normalizer_time = charset_normalizer_time or 0.000005 cn_faster = (chardet_time / charset_normalizer_time) * 100 - 100 print( - f"{idx+1:>3}/{total_files} {tbt_path:<82} C:{chardet_time:.5f} CN:{charset_normalizer_time:.5f} {cn_faster:.1f} %" + f"{idx + 1:>3}/{total_files} {tbt_path:<82} C:{chardet_time:.5f} " + f"CN:{charset_normalizer_time:.5f} {cn_faster:.1f} %" ) # Print the top 10 rows with the slowest execution time @@ -78,7 +81,7 @@ def performance_compare(arguments): ) for idx, time in sorted_results[:10]: tbt_path = file_list[idx] - print(f"{idx+1:>3}/{total_files} {tbt_path:<82} CN:{time:.5f}") + print(f"{idx + 1:>3}/{total_files} {tbt_path:<82} CN:{time:.5f}") # Print charset normalizer statistics min_time = min(charset_normalizer_results) diff --git a/bin/serve.py b/bin/serve.py index 0b055ef6..99d6b6ea 100644 --- a/bin/serve.py +++ b/bin/serve.py @@ -1,19 +1,27 @@ -from flask import Flask, jsonify, send_from_directory +from __future__ import annotations + from glob import glob +from flask import Flask, jsonify, send_from_directory + app = 
Flask(__name__) -@app.route('/raw/') +@app.route("/raw/") def read_file(path): - return send_from_directory('../char-dataset', path, as_attachment=True), 200, {"Content-Type": "text/plain"} + return ( + send_from_directory("../char-dataset", path, as_attachment=True), + 200, + {"Content-Type": "text/plain"}, + ) @app.route("/") def read_targets(): return jsonify( [ - el.replace("./char-dataset", "/raw").replace("\\", "/") for el in sorted(glob("./char-dataset/**/*")) + el.replace("./char-dataset", "/raw").replace("\\", "/") + for el in sorted(glob("./char-dataset/**/*")) ] ) @@ -30,7 +38,13 @@ def read_empty_response_json(): @app.route("/edge/gb18030/json") def read_gb18030_response_json(): - return '{"abc": "我没有埋怨,磋砣的只是一些时间。。今觀俗士之論也,以族舉德,以位命賢,茲可謂得論之一體矣,而未獲至論之淑真也。"}'.encode("gb18030"), 200, {"Content-Type": "application/json"} + return ( + '{"abc": "我没有埋怨,磋砣的只是一些时间。。今觀俗士之論也,以族舉德,以位命賢,茲可謂得論之一體矣,而未獲至論之淑真也。"}'.encode( + "gb18030" + ), + 200, + {"Content-Type": "application/json"}, + ) if __name__ == "__main__": diff --git a/build-requirements.txt b/build-requirements.txt deleted file mode 100644 index ce978b23..00000000 --- a/build-requirements.txt +++ /dev/null @@ -1,9 +0,0 @@ -# in the meantime we migrate to pyproject.toml -# this represent the minimum requirement to build (for the optional speedup) ---find-links https://github.com/mypyc/mypy_mypyc-wheels/releases/expanded_assets/v1.12.0+dev.b2deaaecf1a11e13bc962558992b5f2d5701f295 -mypy==1.11.2; python_version >= '3.8' and python_version < '3.13' -mypy==1.12.0; python_version >= '3.13' -mypy==1.4.1; python_version < '3.8' -build>=0.10.0,<2 -wheel==0.42.0 -setuptools>=68,<76 diff --git a/charset_normalizer/__init__.py b/charset_normalizer/__init__.py index 55991fc3..348341fb 100644 --- a/charset_normalizer/__init__.py +++ b/charset_normalizer/__init__.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """ Charset-Normalizer ~~~~~~~~~~~~~~ @@ -19,6 +18,8 @@ :copyright: (c) 2021 by Ahmed TAHRI :license: MIT, see LICENSE for more details. 
""" +from __future__ import annotations + import logging from .api import from_bytes, from_fp, from_path, is_binary diff --git a/charset_normalizer/__main__.py b/charset_normalizer/__main__.py index beae2ef7..e0e76f7b 100644 --- a/charset_normalizer/__main__.py +++ b/charset_normalizer/__main__.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from .cli import cli_detect if __name__ == "__main__": diff --git a/charset_normalizer/api.py b/charset_normalizer/api.py index e3f2283b..9ffc049d 100644 --- a/charset_normalizer/api.py +++ b/charset_normalizer/api.py @@ -1,6 +1,8 @@ +from __future__ import annotations + import logging from os import PathLike -from typing import BinaryIO, List, Optional, Set, Union +from typing import BinaryIO from .cd import ( coherence_ratio, @@ -31,12 +33,12 @@ def from_bytes( - sequences: Union[bytes, bytearray], + sequences: bytes | bytearray, steps: int = 5, chunk_size: int = 512, threshold: float = 0.2, - cp_isolation: Optional[List[str]] = None, - cp_exclusion: Optional[List[str]] = None, + cp_isolation: list[str] | None = None, + cp_exclusion: list[str] | None = None, preemptive_behaviour: bool = True, explain: bool = False, language_threshold: float = 0.1, @@ -62,7 +64,7 @@ def from_bytes( if not isinstance(sequences, (bytearray, bytes)): raise TypeError( - "Expected object of type bytes or bytearray, got: {0}".format( + "Expected object of type bytes or bytearray, got: {}".format( type(sequences) ) ) @@ -135,9 +137,9 @@ def from_bytes( ), ) - prioritized_encodings: List[str] = [] + prioritized_encodings: list[str] = [] - specified_encoding: Optional[str] = ( + specified_encoding: str | None = ( any_specified_encoding(sequences) if preemptive_behaviour else None ) @@ -149,13 +151,13 @@ def from_bytes( specified_encoding, ) - tested: Set[str] = set() - tested_but_hard_failure: List[str] = [] - tested_but_soft_failure: List[str] = [] + tested: set[str] = set() + tested_but_hard_failure: list[str] = [] + tested_but_soft_failure: list[str] = [] - fallback_ascii: Optional[CharsetMatch] = None - fallback_u8: Optional[CharsetMatch] = None - fallback_specified: Optional[CharsetMatch] = None + fallback_ascii: CharsetMatch | None = None + fallback_u8: CharsetMatch | None = None + fallback_specified: CharsetMatch | None = None results: CharsetMatches = CharsetMatches() @@ -189,7 +191,7 @@ def from_bytes( tested.add(encoding_iana) - decoded_payload: Optional[str] = None + decoded_payload: str | None = None bom_or_sig_available: bool = sig_encoding == encoding_iana strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom( encoding_iana @@ -292,7 +294,7 @@ def from_bytes( early_stop_count: int = 0 lazy_str_hard_failure = False - md_chunks: List[str] = [] + md_chunks: list[str] = [] md_ratios = [] try: @@ -397,7 +399,7 @@ def from_bytes( ) if not is_multi_byte_decoder: - target_languages: List[str] = encoding_languages(encoding_iana) + target_languages: list[str] = encoding_languages(encoding_iana) else: target_languages = mb_encoding_languages(encoding_iana) @@ -546,8 +548,8 @@ def from_fp( steps: int = 5, chunk_size: int = 512, threshold: float = 0.20, - cp_isolation: Optional[List[str]] = None, - cp_exclusion: Optional[List[str]] = None, + cp_isolation: list[str] | None = None, + cp_exclusion: list[str] | None = None, preemptive_behaviour: bool = True, explain: bool = False, language_threshold: float = 0.1, @@ -572,12 +574,12 @@ def from_fp( def from_path( - path: Union[str, bytes, PathLike], # type: ignore[type-arg] + path: str | bytes | 
PathLike, # type: ignore[type-arg] steps: int = 5, chunk_size: int = 512, threshold: float = 0.20, - cp_isolation: Optional[List[str]] = None, - cp_exclusion: Optional[List[str]] = None, + cp_isolation: list[str] | None = None, + cp_exclusion: list[str] | None = None, preemptive_behaviour: bool = True, explain: bool = False, language_threshold: float = 0.1, @@ -603,12 +605,12 @@ def from_path( def is_binary( - fp_or_path_or_payload: Union[PathLike, str, BinaryIO, bytes], # type: ignore[type-arg] + fp_or_path_or_payload: PathLike | str | BinaryIO | bytes, # type: ignore[type-arg] steps: int = 5, chunk_size: int = 512, threshold: float = 0.20, - cp_isolation: Optional[List[str]] = None, - cp_exclusion: Optional[List[str]] = None, + cp_isolation: list[str] | None = None, + cp_exclusion: list[str] | None = None, preemptive_behaviour: bool = True, explain: bool = False, language_threshold: float = 0.1, diff --git a/charset_normalizer/cd.py b/charset_normalizer/cd.py index 4ea6760c..71a3ed51 100644 --- a/charset_normalizer/cd.py +++ b/charset_normalizer/cd.py @@ -1,8 +1,10 @@ +from __future__ import annotations + import importlib from codecs import IncrementalDecoder from collections import Counter from functools import lru_cache -from typing import Counter as TypeCounter, Dict, List, Optional, Tuple +from typing import Counter as TypeCounter from .constant import ( FREQUENCIES, @@ -22,26 +24,24 @@ ) -def encoding_unicode_range(iana_name: str) -> List[str]: +def encoding_unicode_range(iana_name: str) -> list[str]: """ Return associated unicode ranges in a single byte code page. """ if is_multi_byte_encoding(iana_name): - raise IOError("Function not supported on multi-byte code page") + raise OSError("Function not supported on multi-byte code page") - decoder = importlib.import_module( - "encodings.{}".format(iana_name) - ).IncrementalDecoder + decoder = importlib.import_module(f"encodings.{iana_name}").IncrementalDecoder p: IncrementalDecoder = decoder(errors="ignore") - seen_ranges: Dict[str, int] = {} + seen_ranges: dict[str, int] = {} character_count: int = 0 for i in range(0x40, 0xFF): chunk: str = p.decode(bytes([i])) if chunk: - character_range: Optional[str] = unicode_range(chunk) + character_range: str | None = unicode_range(chunk) if character_range is None: continue @@ -61,11 +61,11 @@ def encoding_unicode_range(iana_name: str) -> List[str]: ) -def unicode_range_languages(primary_range: str) -> List[str]: +def unicode_range_languages(primary_range: str) -> list[str]: """ Return inferred languages used with a unicode range. """ - languages: List[str] = [] + languages: list[str] = [] for language, characters in FREQUENCIES.items(): for character in characters: @@ -77,13 +77,13 @@ def unicode_range_languages(primary_range: str) -> List[str]: @lru_cache() -def encoding_languages(iana_name: str) -> List[str]: +def encoding_languages(iana_name: str) -> list[str]: """ Single-byte encoding language association. Some code page are heavily linked to particular language(s). This function does the correspondence. 
""" - unicode_ranges: List[str] = encoding_unicode_range(iana_name) - primary_range: Optional[str] = None + unicode_ranges: list[str] = encoding_unicode_range(iana_name) + primary_range: str | None = None for specified_range in unicode_ranges: if "Latin" not in specified_range: @@ -97,7 +97,7 @@ def encoding_languages(iana_name: str) -> List[str]: @lru_cache() -def mb_encoding_languages(iana_name: str) -> List[str]: +def mb_encoding_languages(iana_name: str) -> list[str]: """ Multi-byte encoding language association. Some code page are heavily linked to particular language(s). This function does the correspondence. @@ -118,7 +118,7 @@ def mb_encoding_languages(iana_name: str) -> List[str]: @lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT) -def get_target_features(language: str) -> Tuple[bool, bool]: +def get_target_features(language: str) -> tuple[bool, bool]: """ Determine main aspects from a supported language if it contains accents and if is pure Latin. """ @@ -135,12 +135,12 @@ def get_target_features(language: str) -> Tuple[bool, bool]: def alphabet_languages( - characters: List[str], ignore_non_latin: bool = False -) -> List[str]: + characters: list[str], ignore_non_latin: bool = False +) -> list[str]: """ Return associated languages associated to given characters. """ - languages: List[Tuple[str, float]] = [] + languages: list[tuple[str, float]] = [] source_have_accents = any(is_accentuated(character) for character in characters) @@ -170,7 +170,7 @@ def alphabet_languages( def characters_popularity_compare( - language: str, ordered_characters: List[str] + language: str, ordered_characters: list[str] ) -> float: """ Determine if a ordered characters list (by occurrence from most appearance to rarest) match a particular language. @@ -178,7 +178,7 @@ def characters_popularity_compare( Beware that is function is not strict on the match in order to ease the detection. (Meaning close match is 1.) """ if language not in FREQUENCIES: - raise ValueError("{} not available".format(language)) + raise ValueError(f"{language} not available") character_approved_count: int = 0 FREQUENCIES_language_set = set(FREQUENCIES[language]) @@ -214,14 +214,14 @@ def characters_popularity_compare( character_approved_count += 1 continue - characters_before_source: List[str] = FREQUENCIES[language][ + characters_before_source: list[str] = FREQUENCIES[language][ 0:character_rank_in_language ] - characters_after_source: List[str] = FREQUENCIES[language][ + characters_after_source: list[str] = FREQUENCIES[language][ character_rank_in_language: ] - characters_before: List[str] = ordered_characters[0:character_rank] - characters_after: List[str] = ordered_characters[character_rank:] + characters_before: list[str] = ordered_characters[0:character_rank] + characters_after: list[str] = ordered_characters[character_rank:] before_match_count: int = len( set(characters_before) & set(characters_before_source) @@ -249,24 +249,24 @@ def characters_popularity_compare( return character_approved_count / len(ordered_characters) -def alpha_unicode_split(decoded_sequence: str) -> List[str]: +def alpha_unicode_split(decoded_sequence: str) -> list[str]: """ Given a decoded text sequence, return a list of str. Unicode range / alphabet separation. Ex. a text containing English/Latin with a bit a Hebrew will return two items in the resulting list; One containing the latin letters and the other hebrew. 
""" - layers: Dict[str, str] = {} + layers: dict[str, str] = {} for character in decoded_sequence: if character.isalpha() is False: continue - character_range: Optional[str] = unicode_range(character) + character_range: str | None = unicode_range(character) if character_range is None: continue - layer_target_range: Optional[str] = None + layer_target_range: str | None = None for discovered_range in layers: if ( @@ -288,12 +288,12 @@ def alpha_unicode_split(decoded_sequence: str) -> List[str]: return list(layers.values()) -def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches: +def merge_coherence_ratios(results: list[CoherenceMatches]) -> CoherenceMatches: """ This function merge results previously given by the function coherence_ratio. The return type is the same as coherence_ratio. """ - per_language_ratios: Dict[str, List[float]] = {} + per_language_ratios: dict[str, list[float]] = {} for result in results: for sub_result in result: language, ratio = sub_result @@ -321,7 +321,7 @@ def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches: We shall NOT return "English—" in CoherenceMatches because it is an alternative of "English". This function only keeps the best match and remove the em-dash in it. """ - index_results: Dict[str, List[float]] = dict() + index_results: dict[str, list[float]] = dict() for result in results: language, ratio = result @@ -345,14 +345,14 @@ def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches: @lru_cache(maxsize=2048) def coherence_ratio( - decoded_sequence: str, threshold: float = 0.1, lg_inclusion: Optional[str] = None + decoded_sequence: str, threshold: float = 0.1, lg_inclusion: str | None = None ) -> CoherenceMatches: """ Detect ANY language that can be identified in given sequence. The sequence will be analysed by layers. A layer = Character extraction by alphabets/ranges. 
""" - results: List[Tuple[str, float]] = [] + results: list[tuple[str, float]] = [] ignore_non_latin: bool = False sufficient_match_count: int = 0 @@ -371,7 +371,7 @@ def coherence_ratio( if character_count <= TOO_SMALL_SEQUENCE: continue - popular_character_ordered: List[str] = [c for c, o in most_common] + popular_character_ordered: list[str] = [c for c, o in most_common] for language in lg_inclusion_list or alphabet_languages( popular_character_ordered, ignore_non_latin diff --git a/charset_normalizer/cli/__init__.py b/charset_normalizer/cli/__init__.py index d95fedfe..543a5a4d 100644 --- a/charset_normalizer/cli/__init__.py +++ b/charset_normalizer/cli/__init__.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from .__main__ import cli_detect, query_yes_no __all__ = ( diff --git a/charset_normalizer/cli/__main__.py b/charset_normalizer/cli/__main__.py index e7edd0fc..b4f364be 100644 --- a/charset_normalizer/cli/__main__.py +++ b/charset_normalizer/cli/__main__.py @@ -1,9 +1,10 @@ +from __future__ import annotations + import argparse import sys from json import dumps from os.path import abspath, basename, dirname, join, realpath from platform import python_version -from typing import List, Optional from unicodedata import unidata_version import charset_normalizer.md as md_module @@ -45,7 +46,7 @@ def query_yes_no(question: str, default: str = "yes") -> bool: sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n") -def cli_detect(argv: Optional[List[str]] = None) -> int: +def cli_detect(argv: list[str] | None = None) -> int: """ CLI assistant using ARGV and ArgumentParser :param argv: @@ -259,7 +260,7 @@ def cli_detect(argv: Optional[List[str]] = None) -> int: dir_path = dirname(realpath(my_file.name)) file_name = basename(realpath(my_file.name)) - o_: List[str] = file_name.split(".") + o_: list[str] = file_name.split(".") if args.replace is False: o_.insert(-1, best_guess.encoding) @@ -284,7 +285,7 @@ def cli_detect(argv: Optional[List[str]] = None) -> int: with open(x_[0].unicode_path, "wb") as fp: fp.write(best_guess.output()) - except IOError as e: + except OSError as e: print(str(e), file=sys.stderr) if my_file.closed is False: my_file.close() diff --git a/charset_normalizer/constant.py b/charset_normalizer/constant.py index f8f2a811..1fb9508d 100644 --- a/charset_normalizer/constant.py +++ b/charset_normalizer/constant.py @@ -1,11 +1,12 @@ -# -*- coding: utf-8 -*- +from __future__ import annotations + from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE from encodings.aliases import aliases -from re import IGNORECASE, compile as re_compile -from typing import Dict, List, Set, Union +from re import IGNORECASE +from re import compile as re_compile # Contain for each eligible encoding a list of/item bytes SIG/BOM -ENCODING_MARKS: Dict[str, Union[bytes, List[bytes]]] = { +ENCODING_MARKS: dict[str, bytes | list[bytes]] = { "utf_8": BOM_UTF8, "utf_7": [ b"\x2b\x2f\x76\x38", @@ -25,7 +26,7 @@ UTF8_MAXIMAL_ALLOCATION: int = 1_112_064 # Up-to-date Unicode ucd/15.0.0 -UNICODE_RANGES_COMBINED: Dict[str, range] = { +UNICODE_RANGES_COMBINED: dict[str, range] = { "Control character": range(32), "Basic Latin": range(32, 128), "Latin-1 Supplement": range(128, 256), @@ -357,7 +358,7 @@ } -UNICODE_SECONDARY_RANGE_KEYWORD: List[str] = [ +UNICODE_SECONDARY_RANGE_KEYWORD: list[str] = [ "Supplement", "Extended", "Extensions", @@ -392,7 +393,7 @@ "koi8_u", ] -IANA_SUPPORTED: List[str] = sorted( +IANA_SUPPORTED: list[str] = sorted( filter( lambda 
x: x.endswith("_codec") is False and x not in {"rot_13", "tactis", "mbcs"}, @@ -403,7 +404,7 @@ IANA_SUPPORTED_COUNT: int = len(IANA_SUPPORTED) # pre-computed code page that are similar using the function cp_similarity. -IANA_SUPPORTED_SIMILAR: Dict[str, List[str]] = { +IANA_SUPPORTED_SIMILAR: dict[str, list[str]] = { "cp037": ["cp1026", "cp1140", "cp273", "cp500"], "cp1026": ["cp037", "cp1140", "cp273", "cp500"], "cp1125": ["cp866"], @@ -492,7 +493,7 @@ } -CHARDET_CORRESPONDENCE: Dict[str, str] = { +CHARDET_CORRESPONDENCE: dict[str, str] = { "iso2022_kr": "ISO-2022-KR", "iso2022_jp": "ISO-2022-JP", "euc_kr": "EUC-KR", @@ -528,7 +529,7 @@ } -COMMON_SAFE_ASCII_CHARACTERS: Set[str] = { +COMMON_SAFE_ASCII_CHARACTERS: set[str] = { "<", ">", "=", @@ -549,8 +550,8 @@ } -KO_NAMES: Set[str] = {"johab", "cp949", "euc_kr"} -ZH_NAMES: Set[str] = {"big5", "cp950", "big5hkscs", "hz"} +KO_NAMES: set[str] = {"johab", "cp949", "euc_kr"} +ZH_NAMES: set[str] = {"big5", "cp950", "big5hkscs", "hz"} # Logging LEVEL below DEBUG TRACE: int = 5 @@ -558,7 +559,7 @@ # Language label that contain the em dash "—" # character are to be considered alternative seq to origin -FREQUENCIES: Dict[str, List[str]] = { +FREQUENCIES: dict[str, list[str]] = { "English": [ "e", "a", diff --git a/charset_normalizer/legacy.py b/charset_normalizer/legacy.py index 3f6d4907..cfb876a1 100644 --- a/charset_normalizer/legacy.py +++ b/charset_normalizer/legacy.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, Optional +from typing import TYPE_CHECKING, Any from warnings import warn from .api import from_bytes @@ -11,9 +11,9 @@ from typing_extensions import TypedDict class ResultDict(TypedDict): - encoding: Optional[str] + encoding: str | None language: str - confidence: Optional[float] + confidence: float | None def detect( @@ -38,7 +38,7 @@ def detect( if not isinstance(byte_str, (bytearray, bytes)): raise TypeError( # pragma: nocover "Expected object of type bytes or bytearray, got: " - "{0}".format(type(byte_str)) + "{}".format(type(byte_str)) ) if isinstance(byte_str, bytearray): diff --git a/charset_normalizer/md.py b/charset_normalizer/md.py index d834db0e..d177db01 100644 --- a/charset_normalizer/md.py +++ b/charset_normalizer/md.py @@ -1,6 +1,7 @@ +from __future__ import annotations + from functools import lru_cache from logging import getLogger -from typing import List, Optional from .constant import ( COMMON_SAFE_ASCII_CHARACTERS, @@ -68,7 +69,7 @@ def __init__(self) -> None: self._symbol_count: int = 0 self._character_count: int = 0 - self._last_printable_char: Optional[str] = None + self._last_printable_char: str | None = None self._frenzy_symbol_in_word: bool = False def eligible(self, character: str) -> bool: @@ -165,7 +166,7 @@ def __init__(self) -> None: self._successive_count: int = 0 self._character_count: int = 0 - self._last_latin_character: Optional[str] = None + self._last_latin_character: str | None = None def eligible(self, character: str) -> bool: return character.isalpha() and is_latin(character) @@ -201,7 +202,7 @@ class SuspiciousRange(MessDetectorPlugin): def __init__(self) -> None: self._suspicious_successive_range_count: int = 0 self._character_count: int = 0 - self._last_printable_seen: Optional[str] = None + self._last_printable_seen: str | None = None def eligible(self, character: str) -> bool: return character.isprintable() @@ -221,8 +222,8 @@ def feed(self, character: str) -> None: self._last_printable_seen = character return - unicode_range_a: Optional[str] = 
unicode_range(self._last_printable_seen) - unicode_range_b: Optional[str] = unicode_range(character) + unicode_range_a: str | None = unicode_range(self._last_printable_seen) + unicode_range_b: str | None = unicode_range(character) if is_suspiciously_successive_range(unicode_range_a, unicode_range_b): self._suspicious_successive_range_count += 1 @@ -406,7 +407,7 @@ def __init__(self) -> None: self._character_count: int = 0 - self._last_alpha_seen: Optional[str] = None + self._last_alpha_seen: str | None = None self._current_ascii_only: bool = True def eligible(self, character: str) -> bool: @@ -501,7 +502,7 @@ def ratio(self) -> float: @lru_cache(maxsize=1024) def is_suspiciously_successive_range( - unicode_range_a: Optional[str], unicode_range_b: Optional[str] + unicode_range_a: str | None, unicode_range_b: str | None ) -> bool: """ Determine if two Unicode range seen next to each other can be considered as suspicious. @@ -580,7 +581,7 @@ def mess_ratio( Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier. """ - detectors: List[MessDetectorPlugin] = [ + detectors: list[MessDetectorPlugin] = [ md_class() for md_class in MessDetectorPlugin.__subclasses__() ] diff --git a/charset_normalizer/models.py b/charset_normalizer/models.py index 6f6b86b3..09492a9c 100644 --- a/charset_normalizer/models.py +++ b/charset_normalizer/models.py @@ -1,8 +1,10 @@ +from __future__ import annotations + from encodings.aliases import aliases from hashlib import sha256 from json import dumps from re import sub -from typing import Any, Dict, Iterator, List, Optional, Tuple, Union +from typing import Any, Iterator, List, Tuple from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE from .utils import iana_name, is_multi_byte_encoding, unicode_range @@ -15,9 +17,9 @@ def __init__( guessed_encoding: str, mean_mess_ratio: float, has_sig_or_bom: bool, - languages: "CoherenceMatches", - decoded_payload: Optional[str] = None, - preemptive_declaration: Optional[str] = None, + languages: CoherenceMatches, + decoded_payload: str | None = None, + preemptive_declaration: str | None = None, ): self._payload: bytes = payload @@ -25,17 +27,17 @@ def __init__( self._mean_mess_ratio: float = mean_mess_ratio self._languages: CoherenceMatches = languages self._has_sig_or_bom: bool = has_sig_or_bom - self._unicode_ranges: Optional[List[str]] = None + self._unicode_ranges: list[str] | None = None - self._leaves: List[CharsetMatch] = [] + self._leaves: list[CharsetMatch] = [] self._mean_coherence_ratio: float = 0.0 - self._output_payload: Optional[bytes] = None - self._output_encoding: Optional[str] = None + self._output_payload: bytes | None = None + self._output_encoding: str | None = None - self._string: Optional[str] = decoded_payload + self._string: str | None = decoded_payload - self._preemptive_declaration: Optional[str] = preemptive_declaration + self._preemptive_declaration: str | None = preemptive_declaration def __eq__(self, other: object) -> bool: if not isinstance(other, CharsetMatch): @@ -77,9 +79,9 @@ def __str__(self) -> str: return self._string def __repr__(self) -> str: - return "".format(self.encoding, self.fingerprint) + return f"" - def add_submatch(self, other: "CharsetMatch") -> None: + def add_submatch(self, other: CharsetMatch) -> None: if not isinstance(other, CharsetMatch) or other == self: raise ValueError( "Unable to add instance <{}> as a submatch of a CharsetMatch".format( @@ -95,11 +97,11 @@ def encoding(self) -> str: return 
self._encoding @property - def encoding_aliases(self) -> List[str]: + def encoding_aliases(self) -> list[str]: """ Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855. """ - also_known_as: List[str] = [] + also_known_as: list[str] = [] for u, p in aliases.items(): if self.encoding == u: also_known_as.append(p) @@ -116,7 +118,7 @@ def byte_order_mark(self) -> bool: return self._has_sig_or_bom @property - def languages(self) -> List[str]: + def languages(self) -> list[str]: """ Return the complete list of possible languages found in decoded sequence. Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'. @@ -177,7 +179,7 @@ def raw(self) -> bytes: return self._payload @property - def submatch(self) -> List["CharsetMatch"]: + def submatch(self) -> list[CharsetMatch]: return self._leaves @property @@ -185,19 +187,17 @@ def has_submatch(self) -> bool: return len(self._leaves) > 0 @property - def alphabets(self) -> List[str]: + def alphabets(self) -> list[str]: if self._unicode_ranges is not None: return self._unicode_ranges # list detected ranges - detected_ranges: List[Optional[str]] = [ - unicode_range(char) for char in str(self) - ] + detected_ranges: list[str | None] = [unicode_range(char) for char in str(self)] # filter and sort self._unicode_ranges = sorted(list({r for r in detected_ranges if r})) return self._unicode_ranges @property - def could_be_from_charset(self) -> List[str]: + def could_be_from_charset(self) -> list[str]: """ The complete list of encoding that output the exact SAME str result and therefore could be the originating encoding. @@ -247,13 +247,13 @@ class CharsetMatches: Act like a list(iterable) but does not implements all related methods. """ - def __init__(self, results: Optional[List[CharsetMatch]] = None): - self._results: List[CharsetMatch] = sorted(results) if results else [] + def __init__(self, results: list[CharsetMatch] | None = None): + self._results: list[CharsetMatch] = sorted(results) if results else [] def __iter__(self) -> Iterator[CharsetMatch]: yield from self._results - def __getitem__(self, item: Union[int, str]) -> CharsetMatch: + def __getitem__(self, item: int | str) -> CharsetMatch: """ Retrieve a single item either by its position or encoding name (alias may be used here). Raise KeyError upon invalid index or encoding not present in results. @@ -293,7 +293,7 @@ def append(self, item: CharsetMatch) -> None: self._results.append(item) self._results = sorted(self._results) - def best(self) -> Optional["CharsetMatch"]: + def best(self) -> CharsetMatch | None: """ Simply return the first match. Strict equivalent to matches[0]. """ @@ -301,7 +301,7 @@ def best(self) -> Optional["CharsetMatch"]: return None return self._results[0] - def first(self) -> Optional["CharsetMatch"]: + def first(self) -> CharsetMatch | None: """ Redundant method, call the method best(). Kept for BC reasons. 
""" @@ -316,31 +316,31 @@ class CliDetectionResult: def __init__( self, path: str, - encoding: Optional[str], - encoding_aliases: List[str], - alternative_encodings: List[str], + encoding: str | None, + encoding_aliases: list[str], + alternative_encodings: list[str], language: str, - alphabets: List[str], + alphabets: list[str], has_sig_or_bom: bool, chaos: float, coherence: float, - unicode_path: Optional[str], + unicode_path: str | None, is_preferred: bool, ): self.path: str = path - self.unicode_path: Optional[str] = unicode_path - self.encoding: Optional[str] = encoding - self.encoding_aliases: List[str] = encoding_aliases - self.alternative_encodings: List[str] = alternative_encodings + self.unicode_path: str | None = unicode_path + self.encoding: str | None = encoding + self.encoding_aliases: list[str] = encoding_aliases + self.alternative_encodings: list[str] = alternative_encodings self.language: str = language - self.alphabets: List[str] = alphabets + self.alphabets: list[str] = alphabets self.has_sig_or_bom: bool = has_sig_or_bom self.chaos: float = chaos self.coherence: float = coherence self.is_preferred: bool = is_preferred @property - def __dict__(self) -> Dict[str, Any]: # type: ignore + def __dict__(self) -> dict[str, Any]: # type: ignore return { "path": self.path, "encoding": self.encoding, diff --git a/charset_normalizer/utils.py b/charset_normalizer/utils.py index e5cbbf4c..4498d3c6 100644 --- a/charset_normalizer/utils.py +++ b/charset_normalizer/utils.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import importlib import logging import unicodedata @@ -5,9 +7,11 @@ from encodings.aliases import aliases from functools import lru_cache from re import findall -from typing import Generator, List, Optional, Set, Tuple, Union +from typing import Generator -from _multibytecodec import MultibyteIncrementalDecoder +from _multibytecodec import ( # type: ignore[import-not-found,import] + MultibyteIncrementalDecoder, +) from .constant import ( ENCODING_MARKS, @@ -43,13 +47,13 @@ def remove_accent(character: str) -> str: if not decomposed: return character - codes: List[str] = decomposed.split(" ") + codes: list[str] = decomposed.split(" ") return chr(int(codes[0], 16)) @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) -def unicode_range(character: str) -> Optional[str]: +def unicode_range(character: str) -> str | None: """ Retrieve the Unicode range official name from a single character. 
""" @@ -78,7 +82,7 @@ def is_punctuation(character: str) -> bool: if "P" in character_category: return True - character_range: Optional[str] = unicode_range(character) + character_range: str | None = unicode_range(character) if character_range is None: return False @@ -93,7 +97,7 @@ def is_symbol(character: str) -> bool: if "S" in character_category or "N" in character_category: return True - character_range: Optional[str] = unicode_range(character) + character_range: str | None = unicode_range(character) if character_range is None: return False @@ -103,7 +107,7 @@ def is_symbol(character: str) -> bool: @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION) def is_emoticon(character: str) -> bool: - character_range: Optional[str] = unicode_range(character) + character_range: str | None = unicode_range(character) if character_range is None: return False @@ -212,7 +216,7 @@ def is_unprintable(character: str) -> bool: ) -def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> Optional[str]: +def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> str | None: """ Extract using ASCII-only decoder any specified encoding in the first n-bytes. """ @@ -221,7 +225,7 @@ def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> Optional seq_len: int = len(sequence) - results: List[str] = findall( + results: list[str] = findall( RE_POSSIBLE_ENCODING_INDICATION, sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"), ) @@ -260,18 +264,18 @@ def is_multi_byte_encoding(name: str) -> bool: "utf_32_be", "utf_7", } or issubclass( - importlib.import_module("encodings.{}".format(name)).IncrementalDecoder, + importlib.import_module(f"encodings.{name}").IncrementalDecoder, MultibyteIncrementalDecoder, ) -def identify_sig_or_bom(sequence: bytes) -> Tuple[Optional[str], bytes]: +def identify_sig_or_bom(sequence: bytes) -> tuple[str | None, bytes]: """ Identify and extract SIG/BOM in given sequence. 
""" for iana_encoding in ENCODING_MARKS: - marks: Union[bytes, List[bytes]] = ENCODING_MARKS[iana_encoding] + marks: bytes | list[bytes] = ENCODING_MARKS[iana_encoding] if isinstance(marks, bytes): marks = [marks] @@ -298,16 +302,16 @@ def iana_name(cp_name: str, strict: bool = True) -> str: return encoding_iana if strict: - raise ValueError("Unable to retrieve IANA for '{}'".format(cp_name)) + raise ValueError(f"Unable to retrieve IANA for '{cp_name}'") return cp_name -def range_scan(decoded_sequence: str) -> List[str]: - ranges: Set[str] = set() +def range_scan(decoded_sequence: str) -> list[str]: + ranges: set[str] = set() for character in decoded_sequence: - character_range: Optional[str] = unicode_range(character) + character_range: str | None = unicode_range(character) if character_range is None: continue @@ -321,12 +325,8 @@ def cp_similarity(iana_name_a: str, iana_name_b: str) -> float: if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b): return 0.0 - decoder_a = importlib.import_module( - "encodings.{}".format(iana_name_a) - ).IncrementalDecoder - decoder_b = importlib.import_module( - "encodings.{}".format(iana_name_b) - ).IncrementalDecoder + decoder_a = importlib.import_module(f"encodings.{iana_name_a}").IncrementalDecoder + decoder_b = importlib.import_module(f"encodings.{iana_name_b}").IncrementalDecoder id_a: IncrementalDecoder = decoder_a(errors="ignore") id_b: IncrementalDecoder = decoder_b(errors="ignore") @@ -374,7 +374,7 @@ def cut_sequence_chunks( strip_sig_or_bom: bool, sig_payload: bytes, is_multi_byte_decoder: bool, - decoded_payload: Optional[str] = None, + decoded_payload: str | None = None, ) -> Generator[str, None, None]: if decoded_payload and is_multi_byte_decoder is False: for i in offsets: diff --git a/charset_normalizer/version.py b/charset_normalizer/version.py index 699990ee..69eaffc1 100644 --- a/charset_normalizer/version.py +++ b/charset_normalizer/version.py @@ -2,5 +2,7 @@ Expose version """ +from __future__ import annotations + __version__ = "3.4.0" VERSION = __version__.split(".") diff --git a/dev-requirements.txt b/dev-requirements.txt index 12c2ebfd..325adac2 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,15 +1,7 @@ ---find-links https://github.com/mypyc/mypy_mypyc-wheels/releases/expanded_assets/v1.12.0+dev.b2deaaecf1a11e13bc962558992b5f2d5701f295 -flake8==5.0.4 chardet==5.1.0 -isort==5.11.4 -codecov==2.1.13 pytest-cov==4.1.0 -build>=0.10.0,<2 -wheel==0.42.0 -black==23.3.0 -mypy==1.11.2; python_version >= '3.8' and python_version < '3.13' -mypy==1.12.0; python_version >= '3.13' -mypy==1.4.1; python_version < '3.8' Flask==2.2.3 pytest>=7.4.4,<=8.3.3 requests==2.31.0 +pre-commit +build diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..05f32e5a --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,81 @@ +[build-system] +requires = ["setuptools", "setuptools-scm", "mypy>=1.4.1,<=1.13.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "charset-normalizer" +description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." +license = {text = "MIT"} +keywords = ["encoding", "charset", "charset-detector", "detector", "normalization", "unicode", "chardet", "detect"] +authors = [ + {name = "Ahmed R. TAHRI", email="tahri.ahmed@proton.me"}, +] +maintainers = [ + {name = "Ahmed R. 
TAHRI", email="tahri.ahmed@proton.me"}, +] +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", + "Topic :: Text Processing :: Linguistic", + "Topic :: Utilities", + "Typing :: Typed", +] +requires-python = ">=3.7" +dynamic = ["version", "readme"] + +[project.optional-dependencies] +unicode_backport = [] + +[tool.setuptools.dynamic] +version = {attr = "charset_normalizer.__version__"} +readme = {file = ["README.md", "CHANGELOG.md", "LICENSE"]} + +[project.scripts] +normalizer = "charset_normalizer:cli.cli_detect" + +[project.urls] +"Changelog" = "https://github.com/jawah/charset_normalizer/blob/master/CHANGELOG.md" +"Documentation" = "https://charset-normalizer.readthedocs.io/" +"Code" = "https://github.com/jawah/charset_normalizer" +"Issue tracker" = "https://github.com/jawah/charset_normalizer/issues" + +[tool.setuptools.packages.find] +exclude = ["tests*"] + +[tool.pytest.ini_options] +addopts = "--cov=charset_normalizer --cov-report=term-missing -rxXs" + +[tool.isort] +profile = "black" +add_imports = "from __future__ import annotations" + +[tool.mypy] +check_untyped_defs = true +disallow_any_generics = true +disallow_incomplete_defs = true +disallow_subclassing_any = true +disallow_untyped_calls = true +disallow_untyped_decorators = true +disallow_untyped_defs = true +no_implicit_optional = true +no_implicit_reexport = true +show_error_codes = true +strict_equality = true +warn_redundant_casts = true +warn_return_any = true +warn_unused_configs = true +warn_unused_ignores = false diff --git a/setup.cfg b/setup.cfg index 3eb71fa8..cbb60249 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,71 +1,6 @@ -[metadata] -name = charset-normalizer -description = The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet. 
-long_description = file: README.md, CHANGELOG.md, LICENSE -long_description_content_type = text/markdown -keywords = encoding, charset, charset-detector, detector, normalization, unicode, chardet, detect -url = https://github.com/Ousret/charset_normalizer -license = MIT -author_email = tahri.ahmed@proton.me -author = Ahmed TAHRI -project_urls = - Bug Reports = https://github.com/Ousret/charset_normalizer/issues - Documentation = https://charset-normalizer.readthedocs.io/en/latest -classifiers = - Development Status :: 5 - Production/Stable - License :: OSI Approved :: MIT License - Intended Audience :: Developers - Topic :: Software Development :: Libraries :: Python Modules - Operating System :: OS Independent - Programming Language :: Python - Programming Language :: Python :: 3 - Programming Language :: Python :: 3.7 - Programming Language :: Python :: 3.8 - Programming Language :: Python :: 3.9 - Programming Language :: Python :: 3.10 - Programming Language :: Python :: 3.11 - Programming Language :: Python :: 3.12 - Programming Language :: Python :: 3.13 - Programming Language :: Python :: Implementation :: PyPy - Topic :: Text Processing :: Linguistic - Topic :: Utilities - Typing :: Typed - -[options.packages.find] -exclude = - tests - *.tests - *.tests.* - tests.* - docs* - data* - [options.extras_require] unicode_backport = -[options.entry_points] -console_scripts = - normalizer = charset_normalizer.cli:cli_detect - -[options] -packages = find: -include_package_data = True -python_requires = >=3.7.0 - -[options.package_data] -charset_normalizer = py.typed - -[tool:pytest] -addopts = --cov=charset_normalizer --cov-report=term-missing -rxXs - [flake8] ignore = W503, E203, B305 max-line-length = 120 - -[mypy] -disallow_untyped_defs = True -ignore_missing_imports = True - -[tool:isort] -profile = black -combine_as_imports = True diff --git a/setup.py b/setup.py index 0ccc7e91..c113acda 100644 --- a/setup.py +++ b/setup.py @@ -1,19 +1,12 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- + +from __future__ import annotations import os import sys -from re import search from setuptools import setup - -def get_version(): - with open('charset_normalizer/version.py') as version_file: - return search(r"""__version__\s+=\s+(['"])(?P.+?)\1""", - version_file.read()).group('version') - - USE_MYPYC = False if len(sys.argv) > 1 and sys.argv[1] == "--use-mypyc": @@ -25,14 +18,13 @@ def get_version(): if USE_MYPYC: from mypyc.build import mypycify - MYPYC_MODULES = mypycify([ - "charset_normalizer/md.py", - ], debug_level="0") + MYPYC_MODULES = mypycify( + [ + "charset_normalizer/md.py", + ], + debug_level="0", + ) else: MYPYC_MODULES = None -setup( - name="charset-normalizer", - version=get_version(), - ext_modules=MYPYC_MODULES -) +setup(name="charset-normalizer", ext_modules=MYPYC_MODULES) diff --git a/tests/__init__.py b/tests/__init__.py index 8b137891..e69de29b 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1 +0,0 @@ - diff --git a/tests/test_base_detection.py b/tests/test_base_detection.py index e5d774d3..e4fb5fd3 100644 --- a/tests/test_base_detection.py +++ b/tests/test_base_detection.py @@ -1,40 +1,52 @@ -from charset_normalizer.api import from_bytes -from charset_normalizer.models import CharsetMatches +from __future__ import annotations import pytest +from charset_normalizer.api import from_bytes +from charset_normalizer.models import CharsetMatches + def test_empty(): - best_guess = from_bytes(b'').best() + best_guess = from_bytes(b"").best() assert best_guess is not None, 
"Empty bytes payload SHOULD NOT return None" - assert best_guess.encoding == "utf_8", "Empty bytes payload SHOULD be guessed as UTF-8 (arbitrary)" + assert ( + best_guess.encoding == "utf_8" + ), "Empty bytes payload SHOULD be guessed as UTF-8 (arbitrary)" assert len(best_guess.alphabets) == 0, "" def test_bool_matches(): - guesses_not_empty = from_bytes(b'') + guesses_not_empty = from_bytes(b"") guesses_empty = CharsetMatches([]) - assert bool(guesses_not_empty) is True, "Bool behaviour of CharsetMatches altered, should be True" - assert bool(guesses_empty) is False, "Bool behaviour of CharsetMatches altered, should be False" + assert ( + bool(guesses_not_empty) is True + ), "Bool behaviour of CharsetMatches altered, should be True" + assert ( + bool(guesses_empty) is False + ), "Bool behaviour of CharsetMatches altered, should be False" @pytest.mark.parametrize( "payload, expected_encoding", [ - (b'\xfe\xff', 'utf_16'), - ('\uFEFF'.encode('gb18030'), 'gb18030'), - (b'\xef\xbb\xbf', 'utf_8'), - ("".encode('utf_32'), "utf_32") - ] + (b"\xfe\xff", "utf_16"), + ("\uFEFF".encode("gb18030"), "gb18030"), + (b"\xef\xbb\xbf", "utf_8"), + ("".encode("utf_32"), "utf_32"), + ], ) def test_empty_but_with_bom_or_sig(payload, expected_encoding): best_guess = from_bytes(payload).best() assert best_guess is not None, "Empty detection but with SIG/BOM has failed!" - assert best_guess.encoding == expected_encoding, "Empty detection but with SIG/BOM is wrongly detected!" - assert best_guess.raw == payload, "The RAW property should contain the original payload given for detection." + assert ( + best_guess.encoding == expected_encoding + ), "Empty detection but with SIG/BOM is wrongly detected!" + assert ( + best_guess.raw == payload + ), "The RAW property should contain the original payload given for detection." assert best_guess.byte_order_mark is True, "The BOM/SIG property should return True" assert str(best_guess) == "", "The cast to str SHOULD be empty" @@ -42,16 +54,27 @@ def test_empty_but_with_bom_or_sig(payload, expected_encoding): @pytest.mark.parametrize( "payload, expected_encoding", [ - ((u'\uFEFF' + '我没有埋怨,磋砣的只是一些时间。').encode('gb18030'), "gb18030",), - ('我没有埋怨,磋砣的只是一些时间。'.encode('utf_32'), "utf_32",), - ('我没有埋怨,磋砣的只是一些时间。'.encode('utf_8_sig'), "utf_8",), - ] + ( + ("\uFEFF" + "我没有埋怨,磋砣的只是一些时间。").encode("gb18030"), + "gb18030", + ), + ( + "我没有埋怨,磋砣的只是一些时间。".encode("utf_32"), + "utf_32", + ), + ( + "我没有埋怨,磋砣的只是一些时间。".encode("utf_8_sig"), + "utf_8", + ), + ], ) def test_content_with_bom_or_sig(payload, expected_encoding): best_guess = from_bytes(payload).best() assert best_guess is not None, "Detection but with SIG/BOM has failed!" - assert best_guess.encoding == expected_encoding, "Detection but with SIG/BOM is wrongly detected!" + assert ( + best_guess.encoding == expected_encoding + ), "Detection but with SIG/BOM is wrongly detected!" assert best_guess.byte_order_mark is True, "The BOM/SIG property should return True" @@ -63,42 +86,49 @@ def test_content_with_bom_or_sig(payload, expected_encoding): b'{"token": "g4UsPJdfzNkGW2jwmKDGDilKGKYtpF2X.mx3MaTWL1tL7CNn5U7DeCcodKX7S3lwwJPKNjBT8etY"}', b"81f4ab054b39cb0e12701e734077d84264308f5fc79494fc5f159fa2ebc07b73c8cc0e98e009664a20986706f90146e8eefcb929ce1f74a8eab21369fdc70198", b"{}", - ] + ], ) def test_obviously_ascii_content(payload): best_guess = from_bytes(payload).best() assert best_guess is not None, "Dead-simple ASCII detection has failed!" - assert best_guess.encoding == "ascii", "Dead-simple ASCII detection is wrongly detected!" 
+ assert ( + best_guess.encoding == "ascii" + ), "Dead-simple ASCII detection is wrongly detected!" @pytest.mark.parametrize( "payload", [ - '\u020d\x1b'.encode('utf-8'), - 'h\xe9llo world!\n'.encode('utf_8'), - '我没有埋怨,磋砣的只是一些时间。'.encode('utf_8'), - 'Bсеки човек има право на образование. Oбразованието трябва да бъде безплатно, поне що се отнася до началното и основното образование.'.encode('utf_8'), - 'Bсеки човек има право на образование.'.encode('utf_8'), - "(° ͜ʖ °), creepy face, smiley 😀".encode("utf_8"), - """["Financiën", "La France"]""".encode("utf_8"), - "Qu'est ce que une étoile?".encode("utf_8"), - """Financiën""".encode("utf_8"), - "😀".encode("utf_8") - ] + "\u020d\x1b".encode(), + "h\xe9llo world!\n".encode(), + "我没有埋怨,磋砣的只是一些时间。".encode(), + "Bсеки човек има право на образование. Oбразованието трябва да бъде безплатно, поне що се отнася до началното и основното образование.".encode(), + "Bсеки човек има право на образование.".encode(), + "(° ͜ʖ °), creepy face, smiley 😀".encode(), + """["Financiën", "La France"]""".encode(), + "Qu'est ce que une étoile?".encode(), + """Financiën""".encode(), + "😀".encode(), + ], ) def test_obviously_utf8_content(payload): best_guess = from_bytes(payload).best() assert best_guess is not None, "Dead-simple UTF-8 detection has failed!" - assert best_guess.encoding == "utf_8", "Dead-simple UTF-8 detection is wrongly detected!" + assert ( + best_guess.encoding == "utf_8" + ), "Dead-simple UTF-8 detection is wrongly detected!" def test_mb_cutting_chk(): # This payload should be wrongfully split and the autofix should run automatically # on chunks extraction. - payload = b"\xbf\xaa\xbb\xe7\xc0\xfb \xbf\xb9\xbc\xf6 " \ b" \xbf\xac\xb1\xb8\xc0\xda\xb5\xe9\xc0\xba \xba\xb9\xc0\xbd\xbc\xad\xb3\xaa " * 128 + payload = ( + b"\xbf\xaa\xbb\xe7\xc0\xfb \xbf\xb9\xbc\xf6 " + b" \xbf\xac\xb1\xb8\xc0\xda\xb5\xe9\xc0\xba \xba\xb9\xc0\xbd\xbc\xad\xb3\xaa " + * 128 + ) guesses = from_bytes(payload, cp_isolation=["cp949"]) best_guess = guesses.best() @@ -108,9 +138,7 @@ def test_mb_cutting_chk(): def test_alphabets_property(): - best_guess = from_bytes( - "😀 Hello World! How affairs are going? 😀".encode("utf_8") - ).best() + best_guess = from_bytes("😀 Hello World! How affairs are going? 😀".encode()).best() assert "Basic Latin" in best_guess.alphabets assert "Emoticons range(Emoji)" in best_guess.alphabets @@ -119,16 +147,14 @@ def test_doc_example_short_cp1251(): best_guess = from_bytes( - 'Bсеки човек има право на образование.'.encode('cp1251') + "Bсеки човек има право на образование.".encode("cp1251") ).best() assert best_guess.encoding == "cp1251" def test_direct_cmp_charset_match(): - best_guess = from_bytes( - "😀 Hello World! How affairs are going? 
😀".encode()).best() assert best_guess == "utf_8" assert best_guess == "utf-8" diff --git a/tests/test_cli.py b/tests/test_cli.py index b73fb613..5f2777c4 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,70 +1,46 @@ +from __future__ import annotations + import unittest -from charset_normalizer.cli import cli_detect, query_yes_no -from unittest.mock import patch +from os import pardir, path, remove from os.path import exists -from os import remove, path, pardir +from unittest.mock import patch + +from charset_normalizer.cli import cli_detect, query_yes_no -DIR_PATH = path.join( - path.dirname(path.realpath(__file__)), - pardir -) +DIR_PATH = path.join(path.dirname(path.realpath(__file__)), pardir) class TestCommandLineInterface(unittest.TestCase): - - @patch('builtins.input', lambda *args: 'y') + @patch("builtins.input", lambda *args: "y") def test_simple_yes_input(self): - self.assertTrue( - query_yes_no('Are u willing to chill a little bit ?') - ) + self.assertTrue(query_yes_no("Are u willing to chill a little bit ?")) - @patch('builtins.input', lambda *args: 'N') + @patch("builtins.input", lambda *args: "N") def test_simple_no_input(self): - self.assertFalse( - query_yes_no('Are u willing to chill a little bit ?') - ) + self.assertFalse(query_yes_no("Are u willing to chill a little bit ?")) def test_single_file(self): - - self.assertEqual( - 0, - cli_detect( - [DIR_PATH + '/data/sample-arabic-1.txt'] - ) - ) + self.assertEqual(0, cli_detect([DIR_PATH + "/data/sample-arabic-1.txt"])) def test_version_output_success(self): with self.assertRaises(SystemExit): - cli_detect( - ['--version'] - ) + cli_detect(["--version"]) def test_single_file_normalize(self): self.assertEqual( - 0, - cli_detect( - [ - DIR_PATH + '/data/sample-arabic-1.txt', - '--normalize' - ] - ) + 0, cli_detect([DIR_PATH + "/data/sample-arabic-1.txt", "--normalize"]) ) - self.assertTrue( - exists(DIR_PATH + '/data/sample-arabic-1.cp1256.txt') - ) + self.assertTrue(exists(DIR_PATH + "/data/sample-arabic-1.cp1256.txt")) try: - remove(DIR_PATH + '/data/sample-arabic-1.cp1256.txt') + remove(DIR_PATH + "/data/sample-arabic-1.cp1256.txt") except: pass def test_single_verbose_file(self): self.assertEqual( - 0, - cli_detect( - [DIR_PATH + '/data/sample-arabic-1.txt', '--verbose'] - ) + 0, cli_detect([DIR_PATH + "/data/sample-arabic-1.txt", "--verbose"]) ) def test_multiple_file(self): @@ -72,11 +48,11 @@ def test_multiple_file(self): 0, cli_detect( [ - DIR_PATH + '/data/sample-arabic-1.txt', - DIR_PATH + '/data/sample-french.txt', - DIR_PATH + '/data/sample-chinese.txt' + DIR_PATH + "/data/sample-arabic-1.txt", + DIR_PATH + "/data/sample-french.txt", + DIR_PATH + "/data/sample-chinese.txt", ] - ) + ), ) def test_with_alternative(self): @@ -84,12 +60,12 @@ def test_with_alternative(self): 0, cli_detect( [ - '-a', - DIR_PATH + '/data/sample-arabic-1.txt', - DIR_PATH + '/data/sample-french.txt', - DIR_PATH + '/data/sample-chinese.txt' + "-a", + DIR_PATH + "/data/sample-arabic-1.txt", + DIR_PATH + "/data/sample-french.txt", + DIR_PATH + "/data/sample-chinese.txt", ] - ) + ), ) def test_with_minimal_output(self): @@ -97,12 +73,12 @@ def test_with_minimal_output(self): 0, cli_detect( [ - '-m', - DIR_PATH + '/data/sample-arabic-1.txt', - DIR_PATH + '/data/sample-french.txt', - DIR_PATH + '/data/sample-chinese.txt' + "-m", + DIR_PATH + "/data/sample-arabic-1.txt", + DIR_PATH + "/data/sample-french.txt", + DIR_PATH + "/data/sample-chinese.txt", ] - ) + ), ) def test_with_minimal_and_alt(self): @@ -110,47 +86,31 @@ def 
test_with_minimal_and_alt(self): 0, cli_detect( [ - '-m', - '-a', - DIR_PATH + '/data/sample-arabic-1.txt', - DIR_PATH + '/data/sample-french.txt', - DIR_PATH + '/data/sample-chinese.txt' + "-m", + "-a", + DIR_PATH + "/data/sample-arabic-1.txt", + DIR_PATH + "/data/sample-french.txt", + DIR_PATH + "/data/sample-chinese.txt", ] - ) + ), ) def test_non_existent_file(self): - with self.assertRaises(SystemExit) as cm: - cli_detect( - [DIR_PATH + '/data/not_found_data.txt'] - ) + cli_detect([DIR_PATH + "/data/not_found_data.txt"]) self.assertEqual(cm.exception.code, 2) def test_replace_without_normalize(self): - self.assertEqual( - cli_detect( - [ - DIR_PATH + '/data/sample-arabic-1.txt', - '--replace' - ] - ), - 1 + cli_detect([DIR_PATH + "/data/sample-arabic-1.txt", "--replace"]), 1 ) def test_force_replace_without_replace(self): self.assertEqual( - cli_detect( - [ - DIR_PATH + '/data/sample-arabic-1.txt', - '--force' - ] - ), - 1 + cli_detect([DIR_PATH + "/data/sample-arabic-1.txt", "--force"]), 1 ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/tests/test_coherence_detection.py b/tests/test_coherence_detection.py index 7e399132..e5952d6c 100644 --- a/tests/test_coherence_detection.py +++ b/tests/test_coherence_detection.py @@ -1,5 +1,14 @@ +from __future__ import annotations + import pytest -from charset_normalizer.cd import encoding_languages, mb_encoding_languages, is_multi_byte_encoding, get_target_features, filter_alt_coherence_matches + +from charset_normalizer.cd import ( + encoding_languages, + filter_alt_coherence_matches, + get_target_features, + is_multi_byte_encoding, + mb_encoding_languages, +) @pytest.mark.parametrize( @@ -13,14 +22,20 @@ ("johab", ["Korean"]), ("shift_jis", ["Japanese"]), ("mac_greek", ["Greek"]), - ("iso2022_jp", ["Japanese"]) - ] + ("iso2022_jp", ["Japanese"]), + ], ) def test_infer_language_from_cp(iana_encoding, expected_languages): - languages = mb_encoding_languages(iana_encoding) if is_multi_byte_encoding(iana_encoding) else encoding_languages(iana_encoding) + languages = ( + mb_encoding_languages(iana_encoding) + if is_multi_byte_encoding(iana_encoding) + else encoding_languages(iana_encoding) + ) for expected_language in expected_languages: - assert expected_language in languages, "Wrongly detected language for given code page" + assert ( + expected_language in languages + ), "Wrongly detected language for given code page" @pytest.mark.parametrize( @@ -31,8 +46,8 @@ def test_infer_language_from_cp(iana_encoding, expected_languages): ("Hebrew", False, False), ("Arabic", False, False), ("Vietnamese", True, True), - ("Turkish", True, True) - ] + ("Turkish", True, True), + ], ) def test_target_features(language, expected_have_accents, expected_pure_latin): target_have_accents, target_pure_latin = get_target_features(language) @@ -44,11 +59,48 @@ def test_target_features(language, expected_have_accents, expected_pure_latin): @pytest.mark.parametrize( "matches, expected_return", [ - ([("English", 0.88,), ("English—", 0.99)], [("English", 0.99)]), - ([("English", 0.88,), ("English—", 0.99), ("English——", 0.999)], [("English", 0.999)]), - ([("English", 0.88,), ("English—", 0.77)], [("English", 0.88)]), - ([("English", 0.88,), ("Italian", 0.77)], [("English", 0.88), ("Italian", 0.77)]), - ] + ( + [ + ( + "English", + 0.88, + ), + ("English—", 0.99), + ], + [("English", 0.99)], + ), + ( + [ + ( + "English", + 0.88, + ), + ("English—", 0.99), + ("English——", 0.999), + ], + [("English", 0.999)], + ), + ( + [ + ( + "English", 
+ 0.88, + ), + ("English—", 0.77), + ], + [("English", 0.88)], + ), + ( + [ + ( + "English", + 0.88, + ), + ("Italian", 0.77), + ], + [("English", 0.88), ("Italian", 0.77)], + ), + ], ) def test_filter_alt_coherence_matches(matches, expected_return): results = filter_alt_coherence_matches(matches) diff --git a/tests/test_detect_legacy.py b/tests/test_detect_legacy.py index ec45aa77..bd2b0351 100644 --- a/tests/test_detect_legacy.py +++ b/tests/test_detect_legacy.py @@ -1,75 +1,43 @@ +from __future__ import annotations + import unittest + from charset_normalizer.legacy import detect class TestDetectLegacy(unittest.TestCase): - def test_detect_dict_keys(self): + r = detect(("\uFEFF" + "我没有埋怨,磋砣的只是一些时间。").encode("gb18030")) - r = detect( - (u'\uFEFF' + '我没有埋怨,磋砣的只是一些时间。').encode('gb18030') - ) + with self.subTest("encoding key present"): + self.assertIn("encoding", r.keys()) - with self.subTest('encoding key present'): - self.assertIn( - 'encoding', - r.keys() - ) + with self.subTest("language key present"): + self.assertIn("language", r.keys()) - with self.subTest('language key present'): - self.assertIn( - 'language', - r.keys() - ) - - with self.subTest('confidence key present'): - self.assertIn( - 'confidence', - r.keys() - ) + with self.subTest("confidence key present"): + self.assertIn("confidence", r.keys()) def test_detect_dict_value_type(self): + r = detect("我没有埋怨,磋砣的只是一些时间。".encode()) - r = detect( - '我没有埋怨,磋砣的只是一些时间。'.encode('utf_8') - ) - - with self.subTest('encoding instance of str'): - self.assertIsInstance( - r['encoding'], - str - ) + with self.subTest("encoding instance of str"): + self.assertIsInstance(r["encoding"], str) - with self.subTest('language instance of str'): - self.assertIsInstance( - r['language'], - str - ) + with self.subTest("language instance of str"): + self.assertIsInstance(r["language"], str) - with self.subTest('confidence instance of float'): - self.assertIsInstance( - r['confidence'], - float - ) + with self.subTest("confidence instance of float"): + self.assertIsInstance(r["confidence"], float) def test_detect_dict_value(self): - r = detect( - '我没有埋怨,磋砣的只是一些时间。'.encode('utf_32') - ) + r = detect("我没有埋怨,磋砣的只是一些时间。".encode("utf_32")) - with self.subTest('encoding is equal to utf_32'): - self.assertEqual( - r['encoding'], - 'UTF-32' - ) + with self.subTest("encoding is equal to utf_32"): + self.assertEqual(r["encoding"], "UTF-32") def test_utf8_sig_not_striped(self): - r = detect( - "Hello World".encode('utf-8-sig') - ) + r = detect("Hello World".encode("utf-8-sig")) with self.subTest("Verify that UTF-8-SIG is returned when using legacy detect"): - self.assertEqual( - r['encoding'], - "UTF-8-SIG" - ) + self.assertEqual(r["encoding"], "UTF-8-SIG") diff --git a/tests/test_edge_case.py b/tests/test_edge_case.py index 6caa1c48..5b763ba2 100644 --- a/tests/test_edge_case.py +++ b/tests/test_edge_case.py @@ -1,14 +1,25 @@ -from charset_normalizer import from_bytes -import pytest +from __future__ import annotations + import platform -@pytest.mark.xfail(platform.python_version_tuple()[0] == "3" and platform.python_version_tuple()[1] == "7", reason="Unicode database is too old for this case (Python 3.7)") +import pytest + +from charset_normalizer import from_bytes + + +@pytest.mark.xfail( + platform.python_version_tuple()[0] == "3" + and platform.python_version_tuple()[1] == "7", + reason="Unicode database is too old for this case (Python 3.7)", +) def test_unicode_edge_case(): - payload = b'\xef\xbb\xbf\xf0\x9f\xa9\xb3' + payload = 
b"\xef\xbb\xbf\xf0\x9f\xa9\xb3" best_guess = from_bytes(payload).best() - assert best_guess is not None, "Payload should have given something, detection failure" + assert ( + best_guess is not None + ), "Payload should have given something, detection failure" assert best_guess.encoding == "utf_8", "UTF-8 payload wrongly detected" @@ -18,7 +29,9 @@ def test_issue_gh520(): best_guess = from_bytes(payload).best() - assert best_guess is not None, "Payload should have given something, detection failure" + assert ( + best_guess is not None + ), "Payload should have given something, detection failure" assert "Basic Latin" in best_guess.alphabets @@ -28,15 +41,19 @@ def test_issue_gh509(): best_guess = from_bytes(payload).best() - assert best_guess is not None, "Payload should have given something, detection failure" + assert ( + best_guess is not None + ), "Payload should have given something, detection failure" assert "ascii" == best_guess.encoding def test_issue_gh498(): """This case was mistaken for utf-16-le, this should never happen again.""" - payload = b'\x84\xae\xaa\xe3\xac\xa5\xad\xe2 Microsoft Word.docx' + payload = b"\x84\xae\xaa\xe3\xac\xa5\xad\xe2 Microsoft Word.docx" best_guess = from_bytes(payload).best() - assert best_guess is not None, "Payload should have given something, detection failure" + assert ( + best_guess is not None + ), "Payload should have given something, detection failure" assert "Cyrillic" in best_guess.alphabets diff --git a/tests/test_full_detection.py b/tests/test_full_detection.py index adff8801..ff91e125 100644 --- a/tests/test_full_detection.py +++ b/tests/test_full_detection.py @@ -1,43 +1,50 @@ -from charset_normalizer.api import from_path +from __future__ import annotations + +from os import pardir, path + import pytest -from os import path, pardir -DIR_PATH = path.join( - path.dirname(path.realpath(__file__)), - pardir -) +from charset_normalizer.api import from_path + +DIR_PATH = path.join(path.dirname(path.realpath(__file__)), pardir) @pytest.mark.parametrize( "input_data_file, expected_charset, expected_language", [ - ('sample-arabic-1.txt', 'cp1256', 'Arabic'), - ('sample-french-1.txt', 'cp1252', 'French'), - ('sample-arabic.txt', 'utf_8', 'Arabic'), - ('sample-russian-3.txt', 'utf_8', 'Russian'), - ('sample-french.txt', 'utf_8', 'French'), - ('sample-chinese.txt', 'big5', 'Chinese'), - ('sample-greek.txt', 'cp1253', 'Greek'), - ('sample-greek-2.txt', 'cp1253', 'Greek'), - ('sample-hebrew-2.txt', 'cp1255', 'Hebrew'), - ('sample-hebrew-3.txt', 'cp1255', 'Hebrew'), - ('sample-bulgarian.txt', 'utf_8', 'Bulgarian'), - ('sample-english.bom.txt', 'utf_8', 'English'), - ('sample-spanish.txt', 'utf_8', 'Spanish'), - ('sample-korean.txt', 'cp949', 'Korean'), - ('sample-turkish.txt', 'cp1254', 'Turkish'), - ('sample-russian-2.txt', 'utf_8', 'Russian'), - ('sample-russian.txt', 'mac_cyrillic', 'Russian'), - ('sample-polish.txt', 'utf_8', 'Polish'), - ] + ("sample-arabic-1.txt", "cp1256", "Arabic"), + ("sample-french-1.txt", "cp1252", "French"), + ("sample-arabic.txt", "utf_8", "Arabic"), + ("sample-russian-3.txt", "utf_8", "Russian"), + ("sample-french.txt", "utf_8", "French"), + ("sample-chinese.txt", "big5", "Chinese"), + ("sample-greek.txt", "cp1253", "Greek"), + ("sample-greek-2.txt", "cp1253", "Greek"), + ("sample-hebrew-2.txt", "cp1255", "Hebrew"), + ("sample-hebrew-3.txt", "cp1255", "Hebrew"), + ("sample-bulgarian.txt", "utf_8", "Bulgarian"), + ("sample-english.bom.txt", "utf_8", "English"), + ("sample-spanish.txt", "utf_8", "Spanish"), + 
("sample-korean.txt", "cp949", "Korean"), + ("sample-turkish.txt", "cp1254", "Turkish"), + ("sample-russian-2.txt", "utf_8", "Russian"), + ("sample-russian.txt", "mac_cyrillic", "Russian"), + ("sample-polish.txt", "utf_8", "Polish"), + ], ) def test_elementary_detection( input_data_file: str, expected_charset: str, expected_language: str, ): - best_guess = from_path(DIR_PATH + "/data/{}".format(input_data_file)).best() + best_guess = from_path(DIR_PATH + f"/data/{input_data_file}").best() - assert best_guess is not None, "Elementary detection has failed upon '{}'".format(input_data_file) - assert best_guess.encoding == expected_charset, "Elementary charset detection has failed upon '{}'".format(input_data_file) - assert best_guess.language == expected_language, "Elementary language detection has failed upon '{}'".format(input_data_file) + assert ( + best_guess is not None + ), f"Elementary detection has failed upon '{input_data_file}'" + assert ( + best_guess.encoding == expected_charset + ), f"Elementary charset detection has failed upon '{input_data_file}'" + assert ( + best_guess.language == expected_language + ), f"Elementary language detection has failed upon '{input_data_file}'" diff --git a/tests/test_isbinary.py b/tests/test_isbinary.py index b134a8ac..841474f1 100644 --- a/tests/test_isbinary.py +++ b/tests/test_isbinary.py @@ -1,28 +1,29 @@ -import pytest +from __future__ import annotations + import typing -from io import BytesIO from base64 import b64decode +from io import BytesIO +from os import pardir, path + +import pytest + from charset_normalizer import is_binary -from os import path, pardir -DIR_PATH = path.join( - path.dirname(path.realpath(__file__)), - pardir -) +DIR_PATH = path.join(path.dirname(path.realpath(__file__)), pardir) @pytest.mark.parametrize( "raw, expected", [ - (b'\x00\x5f\x2f\xff'*50, True), + (b"\x00\x5f\x2f\xff" * 50, True), (b64decode("R0lGODlhAQABAAAAACw="), True), (BytesIO(b64decode("R0lGODlhAQABAAAAACw=")), True), - ('sample-polish.txt', False), - ('sample-arabic.txt', False) - ] + ("sample-polish.txt", False), + ("sample-arabic.txt", False), + ], ) -def test_isbinary(raw: typing.Union[bytes, typing.BinaryIO, str], expected: bool) -> None: +def test_isbinary(raw: bytes | typing.BinaryIO | str, expected: bool) -> None: if isinstance(raw, str): - raw = DIR_PATH + "/data/{}".format(raw) + raw = DIR_PATH + f"/data/{raw}" assert is_binary(raw) is expected diff --git a/tests/test_large_payload.py b/tests/test_large_payload.py index 04526d38..7fc28fac 100644 --- a/tests/test_large_payload.py +++ b/tests/test_large_payload.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import pytest from charset_normalizer import from_bytes @@ -5,29 +7,43 @@ def test_large_payload_u8_sig_basic_entry(): - payload = ('0' * TOO_BIG_SEQUENCE).encode("utf_8_sig") + payload = ("0" * TOO_BIG_SEQUENCE).encode("utf_8_sig") best_guess = from_bytes(payload).best() assert best_guess is not None, "Large U8 payload case detection completely failed" - assert best_guess.encoding == "utf_8", "Large U8 payload case detection wrongly detected!" + assert ( + best_guess.encoding == "utf_8" + ), "Large U8 payload case detection wrongly detected!" 
assert best_guess.bom is True, "SIG/BOM property should be True" - assert len(best_guess.raw) == len(payload), "Large payload should remain untouched when accessed through .raw" - assert best_guess._string is not None, "str should be decoded before direct access (sig available)" + assert len(best_guess.raw) == len( + payload + ), "Large payload should remain untouched when accessed through .raw" + assert ( + best_guess._string is not None + ), "str should be decoded before direct access (sig available)" def test_large_payload_ascii_basic_entry(): - payload = ('0' * TOO_BIG_SEQUENCE).encode("utf_8") + payload = ("0" * TOO_BIG_SEQUENCE).encode("utf_8") best_guess = from_bytes(payload).best() - assert best_guess is not None, "Large ASCII payload case detection completely failed" - assert best_guess.encoding == "ascii", "Large ASCII payload case detection wrongly detected!" + assert ( + best_guess is not None + ), "Large ASCII payload case detection completely failed" + assert ( + best_guess.encoding == "ascii" + ), "Large ASCII payload case detection wrongly detected!" assert best_guess.bom is False, "SIG/BOM property should be False" - assert len(best_guess.raw) == len(payload), "Large payload should remain untouched when accessed through .raw" + assert len(best_guess.raw) == len( + payload + ), "Large payload should remain untouched when accessed through .raw" assert best_guess._string is None, "str should not be decoded until direct access" def test_misleading_large_sequence(): - content = (("hello simple ascii " * TOO_BIG_SEQUENCE) + ('我没有埋怨,磋砣的只是一些时间。 磋砣的只是一些时间。')) .encode('utf_8') + content = ( + ("hello simple ascii " * TOO_BIG_SEQUENCE) + ("我没有埋怨,磋砣的只是一些时间。 磋砣的只是一些时间。") + ).encode("utf_8") guesses = from_bytes(content) @@ -35,5 +51,5 @@ def test_misleading_large_sequence(): match = guesses.best() assert match is not None assert match._string is not None, "str should be cached as only match" - assert match.encoding == 'utf_8' + assert match.encoding == "utf_8" assert str(match) is not None diff --git a/tests/test_logging.py b/tests/test_logging.py index f44820e2..ad2413e2 100644 --- a/tests/test_logging.py +++ b/tests/test_logging.py @@ -1,9 +1,12 @@ -import pytest +from __future__ import annotations + import logging -from charset_normalizer.utils import set_logging_handler -from charset_normalizer.api import from_bytes, explain_handler +import pytest + +from charset_normalizer.api import explain_handler, from_bytes from charset_normalizer.constant import TRACE +from charset_normalizer.utils import set_logging_handler class TestLogBehaviorClass: @@ -14,34 +17,32 @@ def setup_method(self): self.logger.level = logging.WARNING def test_explain_true_behavior(self, caplog): - test_sequence = b'This is a test sequence of bytes that should be sufficient' + test_sequence = b"This is a test sequence of bytes that should be sufficient" from_bytes(test_sequence, steps=1, chunk_size=50, explain=True) assert explain_handler not in self.logger.handlers for record in caplog.records: assert record.levelname in ["Level 5", "DEBUG"] def test_explain_false_handler_set_behavior(self, caplog): - test_sequence = b'This is a test sequence of bytes that should be sufficient' + test_sequence = b"This is a test sequence of bytes that should be sufficient" set_logging_handler(level=TRACE, format_string="%(message)s") from_bytes(test_sequence, steps=1, chunk_size=50, explain=False) - assert any(isinstance(hdl, logging.StreamHandler) for hdl in self.logger.handlers) + assert any( + isinstance(hdl, 
logging.StreamHandler) for hdl in self.logger.handlers + ) for record in caplog.records: assert record.levelname in ["Level 5", "DEBUG"] assert "Encoding detection: ascii is most likely the one." in caplog.text def test_set_stream_handler(self, caplog): - set_logging_handler( - "charset_normalizer", level=logging.DEBUG - ) + set_logging_handler("charset_normalizer", level=logging.DEBUG) self.logger.debug("log content should log with default format") for record in caplog.records: assert record.levelname in ["Level 5", "DEBUG"] assert "log content should log with default format" in caplog.text def test_set_stream_handler_format(self, caplog): - set_logging_handler( - "charset_normalizer", format_string="%(message)s" - ) + set_logging_handler("charset_normalizer", format_string="%(message)s") self.logger.info("log content should only be this message") assert caplog.record_tuples == [ ( diff --git a/tests/test_mess_detection.py b/tests/test_mess_detection.py index d70fee45..4089f825 100644 --- a/tests/test_mess_detection.py +++ b/tests/test_mess_detection.py @@ -1,27 +1,48 @@ +from __future__ import annotations + import pytest + from charset_normalizer.md import mess_ratio @pytest.mark.parametrize( "content, min_expected_ratio, max_expected_ratio", [ - ('典肇乎庚辰年十二月廿一,及己丑年二月十九,收各方語言二百五十,合逾七百萬目;二十大卷佔八成,單英文卷亦過二百萬。悉文乃天下有志共筆而成;有意助之,幾網路、隨纂作,大典茁焉。', 0., 0.), - ('العقلية , التنويم المغناطيسي و / أو الاقتراح', 0., 0.), - ("RadoZ تـــعــــديــل الـــتــــوقــيــــت مـــن قــبــل", 0., 0.), + ( + "典肇乎庚辰年十二月廿一,及己丑年二月十九,收各方語言二百五十,合逾七百萬目;二十大卷佔八成,單英文卷亦過二百萬。悉文乃天下有志共筆而成;有意助之,幾網路、隨纂作,大典茁焉。", + 0.0, + 0.0, + ), + ("العقلية , التنويم المغناطيسي و / أو الاقتراح", 0.0, 0.0), + ("RadoZ تـــعــــديــل الـــتــــوقــيــــت مـــن قــبــل", 0.0, 0.0), ("Cehennemin Sava■þ²s²'da kim?", 0.1, 0.5), - ("´Á¥½³ø§i -- ±i®Ìºû, ³¯·Ø©v", 0.5, 1.), - ("ïstanbul, T■rkiye'nin en kalabal»k, iktisadi ve k■lt■rel aÓ»dan en —nemli", 0.1, 0.5), - ("Parce que Óa, c'est la vÕritable histoire de la rencontre avec votre Tante Robin.", 0.01, 0.5), - ("""ØĢØŠØģاØĶŲ„ Ų„Ųˆ ØĢŲ† اŲ„Ų†Ø§Øģ ŲŠŲˆŲ… Ų…ا ØģŲˆŲŲŠØŠØģاØĶŲ„ŲˆŲ†ØŒ ØŊØđŲ†Ø§ Ų†ØģŲ…Øđ ØđŲ† (ŲØąŲˆØŊŲˆ) ŲˆØ§Ų„ØŪا؊Ų…""", 0.8, 3.0), + ("´Á¥½³ø§i -- ±i®Ìºû, ³¯·Ø©v", 0.5, 1.0), + ( + "ïstanbul, T■rkiye'nin en kalabal»k, iktisadi ve k■lt■rel aÓ»dan en —nemli", + 0.1, + 0.5, + ), + ( + "Parce que Óa, c'est la vÕritable histoire de la rencontre avec votre Tante Robin.", + 0.01, + 0.5, + ), + ( + """ØĢØŠØģاØĶŲ„ Ų„Ųˆ ØĢŲ† اŲ„Ų†Ø§Øģ ŲŠŲˆŲ… Ų…ا ØģŲˆŲŲŠØŠØģاØĶŲ„ŲˆŲ†ØŒ ØŊØđŲ†Ø§ Ų†ØģŲ…Øđ ØđŲ† (ŲØąŲˆØŊŲˆ) ŲˆØ§Ų„ØŪا؊Ų…""", + 0.8, + 3.0, + ), ("""ÇáÚŞáíÉ , ÇáÊäæíã ÇáãÛäÇØíÓí æ / Ãæ ÇáÇŞÊÑÇÍ""", 0.8, 2.5), - ("""hishamkoc@yahoo.com ุชุฑุฌู…ู€ู€ุฉ ู‡ู€ุดู€ู€ู€ุงู… ุงู„ู€ู‚ู€ู€ู€ู€ู„ุงูRadoZ ุชู€ู€ู€ุนู€ู€ู€ู€ุฏูŠู€ู€ู„ ุงู„ู€ู€ู€ุชู€ู€ู€ู€ูˆู‚ู€ู€ูŠู€ู€ู€ู€ุช ู…ู€ู€ู€ู† ู‚ู€ู€ุจู€ู€ู„""", 0.5, 2.0) - - ] + ( + """hishamkoc@yahoo.com ุชุฑุฌู…ู€ู€ุฉ ู‡ู€ุดู€ู€ู€ุงู… ุงู„ู€ู‚ู€ู€ู€ู€ู„ุงูRadoZ ุชู€ู€ู€ุนู€ู€ู€ู€ุฏูŠู€ู€ู„ ุงู„ู€ู€ู€ุชู€ู€ู€ู€ูˆู‚ู€ู€ูŠู€ู€ู€ู€ุช ู…ู€ู€ู€ู† ู‚ู€ู€ุจู€ู€ู„""", + 0.5, + 2.0, + ), + ], ) def test_mess_detection(content, min_expected_ratio, max_expected_ratio): - calculated_mess_ratio = mess_ratio( - content, - maximum_threshold=1. - ) + calculated_mess_ratio = mess_ratio(content, maximum_threshold=1.0) - assert min_expected_ratio <= calculated_mess_ratio <= max_expected_ratio, "The mess detection ratio calculated for given content is not well adjusted!" 
+ assert ( + min_expected_ratio <= calculated_mess_ratio <= max_expected_ratio + ), "The mess detection ratio calculated for given content is not well adjusted!" diff --git a/tests/test_preemptive_detection.py b/tests/test_preemptive_detection.py index 411bf45f..64b52023 100644 --- a/tests/test_preemptive_detection.py +++ b/tests/test_preemptive_detection.py @@ -1,7 +1,9 @@ +from __future__ import annotations + import pytest -from charset_normalizer.utils import any_specified_encoding from charset_normalizer import CharsetMatch +from charset_normalizer.utils import any_specified_encoding @pytest.mark.parametrize( @@ -10,52 +12,76 @@ (b'', "euc_jp"), (b'', "utf_8"), (b'', None), - (b'# coding: utf-8', "utf_8"), - (b'', 'utf_8'), - (b'', 'ascii'), - (b'', 'johab'), - (b'', 'cp037'), - (b'', "cp1252"), + (b"# coding: utf-8", "utf_8"), + (b'', "utf_8"), + (b'', "ascii"), + (b'', "johab"), + (b'', "cp037"), + (b"", "cp1252"), (b'', "cp1256"), - ] + ], ) def test_detect_most_common_body_encoding(payload, expected_encoding): - specified_encoding = any_specified_encoding( - payload - ) + specified_encoding = any_specified_encoding(payload) - assert specified_encoding == expected_encoding, "Unable to determine properly encoding from given body" + assert ( + specified_encoding == expected_encoding + ), "Unable to determine properly encoding from given body" @pytest.mark.parametrize( "payload, expected_outcome", [ - (b'', b''), - (b'', b''), - (b'', b''), - (b'# coding: utf-8', b'# coding: utf-8'), - (b'', b''), - (b'', b''), - (b'', b''), - (b'', b''), - (b'', b''), - ] + ( + b'', + b'', + ), + ( + b'', + b'', + ), + ( + b'', + b'', + ), + (b"# coding: utf-8", b"# coding: utf-8"), + ( + b'', + b'', + ), + ( + b'', + b'', + ), + ( + b'', + b'', + ), + ( + b"", + b"", + ), + ( + b'', + b'', + ), + ], ) def test_preemptive_mark_replacement(payload, expected_outcome): """ When generating (to Unicode converted) bytes, we want to change any potential declarative charset to utf-8. This tests that. """ - specified_encoding = any_specified_encoding( - payload - ) + specified_encoding = any_specified_encoding(payload) - detected_encoding = specified_encoding if specified_encoding is not None else "utf-8" + detected_encoding = ( + specified_encoding if specified_encoding is not None else "utf-8" + ) m = CharsetMatch( payload, detected_encoding, - 0., + 0.0, False, [], preemptive_declaration=specified_encoding, diff --git a/tests/test_utils.py b/tests/test_utils.py index 5c603b3c..a0cc088e 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,6 +1,10 @@ +from __future__ import annotations + import logging + import pytest -from charset_normalizer.utils import is_accentuated, cp_similarity, set_logging_handler + +from charset_normalizer.utils import cp_similarity, is_accentuated, set_logging_handler @pytest.mark.parametrize(