Skip to content

Commit

Permalink
🎨 refresh/upgrade project structure
Browse files Browse the repository at this point in the history
  • Loading branch information
Ousret committed Oct 29, 2024
1 parent b2f1bb0 commit fb83072
Show file tree
Hide file tree
Showing 39 changed files with 836 additions and 656 deletions.
7 changes: 3 additions & 4 deletions .github/workflows/cd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,10 @@ jobs:
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: '3.11'
- name: Update pip, setuptools, wheel, build and twine
- name: Update pip, install build
run: |
python -m pip install --upgrade pip
python -m pip install setuptools wheel build
python -m pip install build
- name: Build Wheel
env:
CHARSET_NORMALIZER_USE_MYPYC: '0'
Expand Down Expand Up @@ -83,10 +83,9 @@ jobs:
- name: Build wheels
uses: pypa/cibuildwheel@7940a4c0e76eb2030e473a5f864f291f63ee879b # v2.21.3
env:
CIBW_BUILD_FRONTEND: "pip; args: --no-build-isolation"
CIBW_BUILD_FRONTEND: build
CIBW_ARCHS_MACOS: x86_64 arm64 universal2
CIBW_ENVIRONMENT: CHARSET_NORMALIZER_USE_MYPYC='1'
CIBW_BEFORE_BUILD: pip install -r build-requirements.txt
CIBW_TEST_REQUIRES: pytest
CIBW_TEST_COMMAND: pytest -c {package} {package}/tests
CIBW_SKIP: pp* cp36*
Expand Down
15 changes: 3 additions & 12 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,18 +25,9 @@ jobs:
python -m pip install -U pip setuptools
python -m pip install -r dev-requirements.txt
python -m pip uninstall -y charset-normalizer
- name: Type checking (Mypy)
- name: Pre-commit checks
run: |
mypy --strict charset_normalizer
- name: Import sorting check (isort)
run: |
isort --check charset_normalizer
- name: Code format (Black)
run: |
black --check --diff --target-version=py37 charset_normalizer
- name: Style guide enforcement (Flake8)
run: |
flake8 charset_normalizer
pre-commit run --all
tests:
name: ✅ Tests
Expand Down Expand Up @@ -68,7 +59,7 @@ jobs:
python -m pip uninstall -y charset-normalizer
- name: Install the package
run: |
python -m build --no-isolation
python -m build
python -m pip install ./dist/*.whl
- name: Run tests
run: |
Expand Down
31 changes: 31 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
exclude: 'docs/|data/|tests/'

repos:
- repo: https://github.com/asottile/pyupgrade
rev: v3.3.1
hooks:
- id: pyupgrade
args: ["--py37-plus"]

- repo: https://github.com/psf/black
rev: 23.1.0
hooks:
- id: black
args: ["--target-version", "py37"]

- repo: https://github.com/PyCQA/isort
rev: 5.12.0
hooks:
- id: isort

- repo: https://github.com/PyCQA/flake8
rev: 6.1.0
hooks:
- id: flake8
additional_dependencies: [flake8-2020]

- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.13.0
hooks:
- id: mypy
exclude: 'tests/|bin/'
12 changes: 12 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,18 @@
All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

## [3.4.1](https://github.com/Ousret/charset_normalizer/compare/3.4.0...master) (2024-10-??)

### Changed
- Project metadata is now stored in `pyproject.toml` instead of `setup.cfg`, with setuptools as the build backend.
- Enforce delayed annotation loading (`from __future__ import annotations`) for simpler and more consistent types across the project.

### Added
- pre-commit configuration.

### Removed
- `build-requirements.txt`, superseded by the native build configuration in `pyproject.toml`.

## [3.4.0](https://github.com/Ousret/charset_normalizer/compare/3.3.2...3.4.0) (2024-10-08)

### Added
Expand Down
72 changes: 44 additions & 28 deletions bin/bc.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
#!/bin/python
from __future__ import annotations

import argparse
from glob import glob
from os.path import isdir
from sys import argv
from typing import List
import argparse

from charset_normalizer import detect as tbt_detect
from chardet import detect as chardet_detect

from charset_normalizer import detect as tbt_detect
from charset_normalizer.utils import iana_name


Expand All @@ -16,28 +17,35 @@ def calc_equivalence(content: bytes, cp_a: str, cp_b: str):
str_a = content.decode(cp_a)
str_b = content.decode(cp_b)
except UnicodeDecodeError:
return 0.
return 0.0

character_count = len(str_a)
diff_character_count = sum(
chr_a != chr_b for chr_a, chr_b in zip(str_a, str_b)
)
diff_character_count = sum(chr_a != chr_b for chr_a, chr_b in zip(str_a, str_b))

return 1. - (diff_character_count / character_count)
return 1.0 - (diff_character_count / character_count)


def cli_bc(arguments: List[str]):
def cli_bc(arguments: list[str]):
parser = argparse.ArgumentParser(
description="BC script checker for Charset-Normalizer with Chardet"
)

parser.add_argument('-c', '--coverage', action="store", default=85, type=int, dest='coverage',
help="Define the minimum acceptable coverage to succeed")
parser.add_argument(
"-c",
"--coverage",
action="store",
default=85,
type=int,
dest="coverage",
help="Define the minimum acceptable coverage to succeed",
)

args = parser.parse_args(arguments)

if not isdir("./char-dataset"):
print("This script require https://github.com/Ousret/char-dataset to be cloned on package root directory")
print(
"This script require https://github.com/Ousret/char-dataset to be cloned on package root directory"
)
exit(1)

success_count = 0
Expand All @@ -50,44 +58,52 @@ def cli_bc(arguments: List[str]):
content = fp.read()

chardet_result = chardet_detect(content)
chardet_encoding = chardet_result['encoding']
chardet_encoding = chardet_result["encoding"]

charset_normalizer_result = tbt_detect(content)
charset_normalizer_encoding = charset_normalizer_result['encoding']
charset_normalizer_encoding = charset_normalizer_result["encoding"]

if [chardet_encoding, charset_normalizer_encoding].count(None) == 1:
print("⚡⚡ '{}' (BC-Break) New('{}') vs Legacy('{}')".format(tbt_path, charset_normalizer_encoding, chardet_encoding))
print(
f"⚡⚡ '{tbt_path}' (BC-Break) New('{charset_normalizer_encoding}') vs Legacy('{chardet_encoding}')"
)
continue

if charset_normalizer_encoding == chardet_encoding:
success_count += 1
print("✅✅ '{}' (BC)".format(tbt_path))
print(f"✅✅ '{tbt_path}' (BC)")
continue

if (chardet_encoding is None and charset_normalizer_encoding is None) or (iana_name(chardet_encoding, False) == iana_name(charset_normalizer_encoding, False)):
if (chardet_encoding is None and charset_normalizer_encoding is None) or (
iana_name(chardet_encoding, False)
== iana_name(charset_normalizer_encoding, False)
):
success_count += 1
print("✅✅ '{}' (BC)".format(tbt_path))
print(f"✅✅ '{tbt_path}' (BC)")
continue

calc_eq = calc_equivalence(content, chardet_encoding, charset_normalizer_encoding)
calc_eq = calc_equivalence(
content, chardet_encoding, charset_normalizer_encoding
)

if calc_eq >= 0.98:
success_count += 1
print("️✅ ️'{}' (got '{}' but eq {} WITH {} %)".format(tbt_path, charset_normalizer_encoding, chardet_encoding, round(calc_eq * 100., 3)))
print(
f"️✅ ️'{tbt_path}' (got '{charset_normalizer_encoding}' but "
f"eq {chardet_encoding} WITH {round(calc_eq * 100., 3)} %)"
)
continue

print("⚡⚡ '{}' (BC-Break) New('{}') vs Legacy('{}')".format(tbt_path, charset_normalizer_encoding, chardet_encoding))
print(
f"⚡⚡ '{tbt_path}' (BC-Break) New('{charset_normalizer_encoding}') vs Legacy('{chardet_encoding}')"
)

success_ratio = round(success_count / total_count, 2) * 100.
success_ratio = round(success_count / total_count, 2) * 100.0

print("Total EST BC = {} % ({} / {} files)".format(success_ratio, success_count, total_count))
print(f"Total EST BC = {success_ratio} % ({success_count} / {total_count} files)")

return 0 if success_ratio >= args.coverage else 1


if __name__ == "__main__":
exit(
cli_bc(
argv[1:]
)
)
exit(cli_bc(argv[1:]))
79 changes: 45 additions & 34 deletions bin/coverage.py
Original file line number Diff line number Diff line change
@@ -1,43 +1,55 @@
#!/bin/python
from __future__ import annotations

import argparse
from glob import glob
from os import sep
from os.path import isdir
from sys import argv
from typing import List
import argparse

from charset_normalizer import from_path, __version__
from charset_normalizer import __version__, from_path
from charset_normalizer.utils import iana_name

from os import sep


def calc_equivalence(content: bytes, cp_a: str, cp_b: str):
str_a = content.decode(cp_a)
str_b = content.decode(cp_b)

character_count = len(str_a)
diff_character_count = sum(
chr_a != chr_b for chr_a, chr_b in zip(str_a, str_b)
)

diff_character_count = sum(chr_a != chr_b for chr_a, chr_b in zip(str_a, str_b))

return 1. - (diff_character_count / character_count)
return 1.0 - (diff_character_count / character_count)


def cli_coverage(arguments: List[str]):
def cli_coverage(arguments: list[str]):
parser = argparse.ArgumentParser(
description="Embedded detection success coverage script checker for Charset-Normalizer"
)

parser.add_argument('-p', '--with-preemptive', action="store_true", default=False, dest='preemptive',
help='Enable the preemptive scan behaviour during coverage check')
parser.add_argument('-c', '--coverage', action="store", default=90, type=int, dest='coverage',
help="Define the minimum acceptable coverage to succeed")
parser.add_argument(
"-p",
"--with-preemptive",
action="store_true",
default=False,
dest="preemptive",
help="Enable the preemptive scan behaviour during coverage check",
)
parser.add_argument(
"-c",
"--coverage",
action="store",
default=90,
type=int,
dest="coverage",
help="Define the minimum acceptable coverage to succeed",
)

args = parser.parse_args(arguments)

if not isdir("./char-dataset"):
print("This script require https://github.com/Ousret/char-dataset to be cloned on package root directory")
print(
"This script require https://github.com/Ousret/char-dataset to be cloned on package root directory"
)
exit(1)

print(f"> using charset-normalizer {__version__}")
Expand All @@ -46,50 +58,49 @@ def cli_coverage(arguments: List[str]):
total_count = 0

for tbt_path in sorted(glob("./char-dataset/**/*.*")):

expected_encoding = tbt_path.split(sep)[-2]
total_count += 1

results = from_path(
tbt_path,
preemptive_behaviour=args.preemptive
)
results = from_path(tbt_path, preemptive_behaviour=args.preemptive)

if expected_encoding == "None" and len(results) == 0:
print("✅✅ '{}'".format(tbt_path))
print(f"✅✅ '{tbt_path}'")
success_count += 1
continue

if len(results) == 0:
print("⚡⚡ '{}' (nothing)".format(tbt_path))
print(f"⚡⚡ '{tbt_path}' (nothing)")
continue

result = results.best()

if expected_encoding in result.could_be_from_charset or iana_name(expected_encoding) in result.could_be_from_charset:
print("✅✅ '{}'".format(tbt_path))
if (
expected_encoding in result.could_be_from_charset
or iana_name(expected_encoding) in result.could_be_from_charset
):
print(f"✅✅ '{tbt_path}'")
success_count += 1
continue

calc_eq = calc_equivalence(result.raw, expected_encoding, result.encoding)

if calc_eq >= 0.98:
success_count += 1
print("️✅ ️'{}' (got '{}' but equivalence {} %)".format(tbt_path, result.encoding, round(calc_eq * 100., 3)))
print(
f"️✅ ️'{tbt_path}' (got '{result.encoding}' but equivalence {round(calc_eq * 100., 3)} %)"
)
continue

print("⚡ '{}' (got '{}')".format(tbt_path, result.encoding))
print(f"⚡ '{tbt_path}' (got '{result.encoding}')")

success_ratio = round(success_count / total_count, 2) * 100.
success_ratio = round(success_count / total_count, 2) * 100.0

print("Total EST coverage = {} % ({} / {} files)".format(success_ratio, success_count, total_count))
print(
f"Total EST coverage = {success_ratio} % ({success_count} / {total_count} files)"
)

return 0 if success_ratio >= args.coverage else 1


if __name__ == "__main__":
exit(
cli_coverage(
argv[1:]
)
)
exit(cli_coverage(argv[1:]))
Loading

0 comments on commit fb83072

Please sign in to comment.