🎨 refresh/upgrade project structure

jawah · Oct 29, 2024 · eab2792 · eab2792
1 parent b2f1bb0
commit eab2792
Show file tree

Hide file tree

Showing 39 changed files with 836 additions and 656 deletions.
diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml
@@ -26,10 +26,10 @@ jobs:
         uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
         with:
           python-version: '3.11'
-      - name: Update pip, setuptools, wheel, build and twine
+      - name: Update pip, install build
         run: |
           python -m pip install --upgrade pip
-          python -m pip install setuptools wheel build
+          python -m pip install build
       - name: Build Wheel
         env:
           CHARSET_NORMALIZER_USE_MYPYC: '0'
@@ -83,10 +83,9 @@ jobs:
       - name: Build wheels
         uses: pypa/cibuildwheel@7940a4c0e76eb2030e473a5f864f291f63ee879b # v2.21.3
         env:
-          CIBW_BUILD_FRONTEND: "pip; args: --no-build-isolation"
+          CIBW_BUILD_FRONTEND: build
           CIBW_ARCHS_MACOS: x86_64 arm64 universal2
           CIBW_ENVIRONMENT: CHARSET_NORMALIZER_USE_MYPYC='1'
-          CIBW_BEFORE_BUILD: pip install -r build-requirements.txt
           CIBW_TEST_REQUIRES: pytest
           CIBW_TEST_COMMAND: pytest -c {package} {package}/tests
           CIBW_SKIP: pp* cp36*

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -25,18 +25,9 @@ jobs:
           python -m pip install -U pip setuptools
           python -m pip install -r dev-requirements.txt
           python -m pip uninstall -y charset-normalizer
-      - name: Type checking (Mypy)
+      - name: Pre-commit checks
         run: |
-          mypy --strict charset_normalizer
-      - name: Import sorting check (isort)
-        run: |
-          isort --check charset_normalizer
-      - name: Code format (Black)
-        run: |
-          black --check --diff --target-version=py37 charset_normalizer
-      - name: Style guide enforcement (Flake8)
-        run: |
-          flake8 charset_normalizer
+          pre-commit run --all
 
   tests:
     name: ✅ Tests
@@ -68,7 +59,7 @@ jobs:
           python -m pip uninstall -y charset-normalizer
       - name: Install the package
         run: |
-          python -m build --no-isolation
+          python -m build
           python -m pip install ./dist/*.whl
       - name: Run tests
         run: |

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,31 @@
+exclude: 'docs/|data/|tests/'
+
+repos:
+  - repo: https://github.com/asottile/pyupgrade
+    rev: v3.3.1
+    hooks:
+      - id: pyupgrade
+        args: ["--py37-plus"]
+
+  - repo: https://github.com/psf/black
+    rev: 23.1.0
+    hooks:
+      - id: black
+        args: ["--target-version", "py37"]
+
+  - repo: https://github.com/PyCQA/isort
+    rev: 5.12.0
+    hooks:
+      - id: isort
+
+  - repo: https://github.com/PyCQA/flake8
+    rev: 6.1.0
+    hooks:
+      - id: flake8
+        additional_dependencies: [flake8-2020]
+
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: v1.13.0
+    hooks:
+      - id: mypy
+        exclude: 'tests/|bin/'
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,18 @@
 All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
+## [3.4.1](https://github.com/Ousret/charset_normalizer/compare/3.4.0...master) (2024-10-??)
+
+### Changed
+- Project metadata is now stored in `pyproject.toml` instead of `setup.cfg`, with setuptools as the build backend.
+- Enforce delayed annotation evaluation (`from __future__ import annotations`) for simpler and more consistent type hints across the project.
+
+### Added
+- pre-commit configuration.
+
+### Removed
+- `build-requirements.txt`, superseded by the native build configuration in `pyproject.toml`.
+
 ## [3.4.0](https://github.com/Ousret/charset_normalizer/compare/3.3.2...3.4.0) (2024-10-08)
 
 ### Added

diff --git a/bin/bc.py b/bin/bc.py
@@ -1,13 +1,14 @@
 #!/bin/python
+from __future__ import annotations
+
+import argparse
 from glob import glob
 from os.path import isdir
 from sys import argv
-from typing import List
-import argparse
 
-from charset_normalizer import detect as tbt_detect
 from chardet import detect as chardet_detect
 
+from charset_normalizer import detect as tbt_detect
 from charset_normalizer.utils import iana_name
 
 
@@ -16,28 +17,35 @@ def calc_equivalence(content: bytes, cp_a: str, cp_b: str):
         str_a = content.decode(cp_a)
         str_b = content.decode(cp_b)
     except UnicodeDecodeError:
-        return 0.
+        return 0.0
 
     character_count = len(str_a)
-    diff_character_count = sum(
-        chr_a != chr_b for chr_a, chr_b in zip(str_a, str_b)
-    )
+    diff_character_count = sum(chr_a != chr_b for chr_a, chr_b in zip(str_a, str_b))
 
-    return 1. - (diff_character_count / character_count)
+    return 1.0 - (diff_character_count / character_count)
 
 
-def cli_bc(arguments: List[str]):
+def cli_bc(arguments: list[str]):
     parser = argparse.ArgumentParser(
         description="BC script checker for Charset-Normalizer with Chardet"
     )
 
-    parser.add_argument('-c', '--coverage', action="store", default=85, type=int, dest='coverage',
-                        help="Define the minimum acceptable coverage to succeed")
+    parser.add_argument(
+        "-c",
+        "--coverage",
+        action="store",
+        default=85,
+        type=int,
+        dest="coverage",
+        help="Define the minimum acceptable coverage to succeed",
+    )
 
     args = parser.parse_args(arguments)
 
     if not isdir("./char-dataset"):
-        print("This script require https://github.com/Ousret/char-dataset to be cloned on package root directory")
+        print(
+            "This script require https://github.com/Ousret/char-dataset to be cloned on package root directory"
+        )
         exit(1)
 
     success_count = 0
@@ -50,44 +58,52 @@ def cli_bc(arguments: List[str]):
             content = fp.read()
 
         chardet_result = chardet_detect(content)
-        chardet_encoding = chardet_result['encoding']
+        chardet_encoding = chardet_result["encoding"]
 
         charset_normalizer_result = tbt_detect(content)
-        charset_normalizer_encoding = charset_normalizer_result['encoding']
+        charset_normalizer_encoding = charset_normalizer_result["encoding"]
 
         if [chardet_encoding, charset_normalizer_encoding].count(None) == 1:
-            print("⚡⚡ '{}' (BC-Break) New('{}') vs Legacy('{}')".format(tbt_path, charset_normalizer_encoding, chardet_encoding))
+            print(
+                f"⚡⚡ '{tbt_path}' (BC-Break) New('{charset_normalizer_encoding}') vs Legacy('{chardet_encoding}')"
+            )
             continue
 
         if charset_normalizer_encoding == chardet_encoding:
             success_count += 1
-            print("✅✅ '{}' (BC)".format(tbt_path))
+            print(f"✅✅ '{tbt_path}' (BC)")
             continue
 
-        if (chardet_encoding is None and charset_normalizer_encoding is None) or (iana_name(chardet_encoding, False) == iana_name(charset_normalizer_encoding, False)):
+        if (chardet_encoding is None and charset_normalizer_encoding is None) or (
+            iana_name(chardet_encoding, False)
+            == iana_name(charset_normalizer_encoding, False)
+        ):
             success_count += 1
-            print("✅✅ '{}' (BC)".format(tbt_path))
+            print(f"✅✅ '{tbt_path}' (BC)")
             continue
 
-        calc_eq = calc_equivalence(content, chardet_encoding, charset_normalizer_encoding)
+        calc_eq = calc_equivalence(
+            content, chardet_encoding, charset_normalizer_encoding
+        )
 
         if calc_eq >= 0.98:
             success_count += 1
-            print("️✅ ️'{}' (got '{}' but eq {} WITH {} %)".format(tbt_path, charset_normalizer_encoding, chardet_encoding, round(calc_eq * 100., 3)))
+            print(
+                f"️✅ ️'{tbt_path}' (got '{charset_normalizer_encoding}' but "
+                f"eq {chardet_encoding} WITH {round(calc_eq * 100., 3)} %)"
+            )
             continue
 
-        print("⚡⚡ '{}' (BC-Break) New('{}') vs Legacy('{}')".format(tbt_path, charset_normalizer_encoding, chardet_encoding))
+        print(
+            f"⚡⚡ '{tbt_path}' (BC-Break) New('{charset_normalizer_encoding}') vs Legacy('{chardet_encoding}')"
+        )
 
-    success_ratio = round(success_count / total_count, 2) * 100.
+    success_ratio = round(success_count / total_count, 2) * 100.0
 
-    print("Total EST BC = {} % ({} / {} files)".format(success_ratio, success_count, total_count))
+    print(f"Total EST BC = {success_ratio} % ({success_count} / {total_count} files)")
 
     return 0 if success_ratio >= args.coverage else 1
 
 
 if __name__ == "__main__":
-    exit(
-        cli_bc(
-            argv[1:]
-        )
-    )
+    exit(cli_bc(argv[1:]))
diff --git a/bin/coverage.py b/bin/coverage.py
@@ -1,43 +1,55 @@
 #!/bin/python
+from __future__ import annotations
+
+import argparse
 from glob import glob
+from os import sep
 from os.path import isdir
 from sys import argv
-from typing import List
-import argparse
 
-from charset_normalizer import from_path, __version__
+from charset_normalizer import __version__, from_path
 from charset_normalizer.utils import iana_name
 
-from os import sep
-
 
 def calc_equivalence(content: bytes, cp_a: str, cp_b: str):
     str_a = content.decode(cp_a)
     str_b = content.decode(cp_b)
 
     character_count = len(str_a)
-    diff_character_count = sum(
-        chr_a != chr_b for chr_a, chr_b in zip(str_a, str_b)
-    )
-
+    diff_character_count = sum(chr_a != chr_b for chr_a, chr_b in zip(str_a, str_b))
 
-    return 1. - (diff_character_count / character_count)
+    return 1.0 - (diff_character_count / character_count)
 
 
-def cli_coverage(arguments: List[str]):
+def cli_coverage(arguments: list[str]):
     parser = argparse.ArgumentParser(
         description="Embedded detection success coverage script checker for Charset-Normalizer"
     )
 
-    parser.add_argument('-p', '--with-preemptive', action="store_true", default=False, dest='preemptive',
-                        help='Enable the preemptive scan behaviour during coverage check')
-    parser.add_argument('-c', '--coverage', action="store", default=90, type=int, dest='coverage',
-                        help="Define the minimum acceptable coverage to succeed")
+    parser.add_argument(
+        "-p",
+        "--with-preemptive",
+        action="store_true",
+        default=False,
+        dest="preemptive",
+        help="Enable the preemptive scan behaviour during coverage check",
+    )
+    parser.add_argument(
+        "-c",
+        "--coverage",
+        action="store",
+        default=90,
+        type=int,
+        dest="coverage",
+        help="Define the minimum acceptable coverage to succeed",
+    )
 
     args = parser.parse_args(arguments)
 
     if not isdir("./char-dataset"):
-        print("This script require https://github.com/Ousret/char-dataset to be cloned on package root directory")
+        print(
+            "This script require https://github.com/Ousret/char-dataset to be cloned on package root directory"
+        )
         exit(1)
 
     print(f"> using charset-normalizer {__version__}")
@@ -46,50 +58,49 @@ def cli_coverage(arguments: List[str]):
     total_count = 0
 
     for tbt_path in sorted(glob("./char-dataset/**/*.*")):
-
         expected_encoding = tbt_path.split(sep)[-2]
         total_count += 1
 
-        results = from_path(
-            tbt_path,
-            preemptive_behaviour=args.preemptive
-        )
+        results = from_path(tbt_path, preemptive_behaviour=args.preemptive)
 
         if expected_encoding == "None" and len(results) == 0:
-            print("✅✅ '{}'".format(tbt_path))
+            print(f"✅✅ '{tbt_path}'")
             success_count += 1
             continue
 
         if len(results) == 0:
-            print("⚡⚡ '{}' (nothing)".format(tbt_path))
+            print(f"⚡⚡ '{tbt_path}' (nothing)")
             continue
 
         result = results.best()
 
-        if expected_encoding in result.could_be_from_charset or iana_name(expected_encoding) in result.could_be_from_charset:
-            print("✅✅ '{}'".format(tbt_path))
+        if (
+            expected_encoding in result.could_be_from_charset
+            or iana_name(expected_encoding) in result.could_be_from_charset
+        ):
+            print(f"✅✅ '{tbt_path}'")
             success_count += 1
             continue
 
         calc_eq = calc_equivalence(result.raw, expected_encoding, result.encoding)
 
         if calc_eq >= 0.98:
             success_count += 1
-            print("️✅ ️'{}' (got '{}' but equivalence {} %)".format(tbt_path, result.encoding, round(calc_eq * 100., 3)))
+            print(
+                f"️✅ ️'{tbt_path}' (got '{result.encoding}' but equivalence {round(calc_eq * 100., 3)} %)"
+            )
             continue
 
-        print("⚡ '{}' (got '{}')".format(tbt_path, result.encoding))
+        print(f"⚡ '{tbt_path}' (got '{result.encoding}')")
 
-    success_ratio = round(success_count / total_count, 2) * 100.
+    success_ratio = round(success_count / total_count, 2) * 100.0
 
-    print("Total EST coverage = {} % ({} / {} files)".format(success_ratio, success_count, total_count))
+    print(
+        f"Total EST coverage = {success_ratio} % ({success_count} / {total_count} files)"
+    )
 
     return 0 if success_ratio >= args.coverage else 1
 
 
 if __name__ == "__main__":
-    exit(
-        cli_coverage(
-            argv[1:]
-        )
-    )
+    exit(cli_coverage(argv[1:]))