diff --git a/.codecov.yml b/.codecov.yml deleted file mode 100644 index f307895e..00000000 --- a/.codecov.yml +++ /dev/null @@ -1,8 +0,0 @@ -coverage: - status: - project: - default: - target: 88% - threshold: null - patch: false - changes: false diff --git a/.coveragerc b/.coveragerc index 723bfd0c..8b591311 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,2 +1,28 @@ [run] -source=charset_normalizer +source = + charset_normalizer +# Needed for Python 3.11 and lower +disable_warnings = no-sysmon + +[paths] +source = + src/charset_normalizer + */charset_normalizer + *\charset_normalizer + +[report] +omit = + src/charset_normalizer/__main__.py + +exclude_lines = + except ModuleNotFoundError: + except ImportError: + pass + import + raise NotImplementedError + .* # Platform-specific.* + .*:.* # Python \d.* + .* # Abstract + .* # Defensive: + if (?:typing.)?TYPE_CHECKING: + ^\s*?\.\.\.\s*$ diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 00000000..33e824d2 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,9 @@ +# Restrict all files related to deploying to +# require lead maintainer approval. 
+ +.github/workflows/ @Ousret +.github/CODEOWNERS @Ousret +src/charset_normalizer/ @Ousret +pyproject.toml @Ousret +tests/ @Ousret +data/ @Ousret diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml index 8e95ec11..26c9d3c1 100644 --- a/.github/workflows/cd.yml +++ b/.github/workflows/cd.yml @@ -25,7 +25,7 @@ jobs: - name: Set up Python uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 with: - python-version: '3.11' + python-version: '3' - name: Update pip, install build run: | python -m pip install --upgrade pip @@ -46,7 +46,7 @@ jobs: needs: pre_flight_check strategy: matrix: - os: [ ubuntu-latest, windows-latest, macos-13 ] + os: [ ubuntu-22.04, windows-latest, macos-13 ] qemu: [ '' ] include: # Split ubuntu job for the sake of speed-up @@ -77,18 +77,15 @@ jobs: shell: bash - name: Setup Python uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - - name: Update pip, wheel, setuptools, build, twine - run: | - python -m pip install -U pip wheel setuptools build twine - name: Build wheels uses: pypa/cibuildwheel@ee63bf16da6cddfb925f542f2c7b59ad50e93969 # v2.22.0 env: CIBW_BUILD_FRONTEND: build - CIBW_ARCHS_MACOS: x86_64 arm64 universal2 + CIBW_ARCHS_MACOS: universal2 CIBW_ENVIRONMENT: CHARSET_NORMALIZER_USE_MYPYC='1' CIBW_TEST_REQUIRES: pytest CIBW_TEST_COMMAND: pytest -c {package} {package}/tests - CIBW_SKIP: pp* cp36* + CIBW_SKIP: pp* - name: Upload artifacts uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce with: @@ -152,7 +149,7 @@ jobs: run: | tree dist - name: Publish package distributions to PyPI - uses: pypa/gh-action-pypi-publish@f7600683efdcb7656dec5b29656edb7bc586e597 # release/v1 + uses: pypa/gh-action-pypi-publish@67339c736fd9354cd4f8cb0b744f2b82a74b5c70 # release/v1 - name: Upload dists to GitHub Release env: GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}" diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 23433aab..c3f0dc59 100644 --- 
a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -19,19 +19,15 @@ jobs: - name: Set up Python uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 with: - python-version: '3.11' - - name: Install dependencies - run: | - python -m pip install -U pip setuptools - python -m pip install -r dev-requirements.txt - python -m pip uninstall -y charset-normalizer + python-version: '3' + - name: Install nox + run: python -m pip install nox - name: Pre-commit checks - run: | - pre-commit run --all + run: nox -s lint tests: name: ✅ Tests - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 strategy: fail-fast: false @@ -53,18 +49,15 @@ jobs: python-version: ${{ matrix.python-version }} allow-prereleases: true - name: Install dependencies - run: | - python -m pip install -U pip setuptools - python -m pip install -r dev-requirements.txt - python -m pip uninstall -y charset-normalizer - - name: Install the package - run: | - python -m build - python -m pip install ./dist/*.whl + run: python -m pip install nox - name: Run tests - run: | - pytest - - uses: codecov/codecov-action@4fe8c5f003fae66aa5ebb77cfd3e7bfbbda0b6b0 # v3.1.5 + run: nox -s test-${{ matrix.python-version }} + - name: "Upload artifact" + uses: "actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce" + with: + name: coverage-data + path: ".coverage.*" + if-no-files-found: error detection_coverage: @@ -79,61 +72,45 @@ jobs: - name: Set up Python uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 with: - python-version: '3.11' + python-version: '3' - name: Install dependencies - run: | - python -m pip install -U pip setuptools - python -m pip install -r dev-requirements.txt - python -m pip uninstall -y charset-normalizer - - name: Install the package - run: | - python -m build - python -m pip install ./dist/*.whl - - name: Clone the complete dataset - run: | - git clone https://github.com/Ousret/char-dataset.git + run: python -m pip install nox - name: 
Coverage WITH preemptive - run: | - python ./bin/coverage.py --coverage 97 --with-preemptive + run: nox -s coverage -- --coverage 97 --with-preemptive - name: Coverage WITHOUT preemptive - run: | - python ./bin/coverage.py --coverage 95 - -# integration_test: -# -# needs: -# - tests -# -# name: 🔗 Integration Tests -# runs-on: ubuntu-latest -# -# steps: -# - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 -# - name: Set up Python -# uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 -# with: -# python-version: '3.11' -# - name: Install dependencies -# run: | -# pip install -U pip setuptools -# pip install -r dev-requirements.txt -# - name: Remove Chardet & Charset-Normalizer -# run: | -# pip uninstall -y chardet -# pip uninstall -y charset-normalizer -# - name: Install the package -# run: | -# python -m build -# pip install ./dist/*.whl -# - name: Clone the complete dataset -# run: | -# git clone https://github.com/Ousret/char-dataset.git -# - name: Start the Flask server -# run: | -# python ./bin/serve.py & -# - name: Integration Tests with Requests -# run: | -# python ./bin/integration.py + run: nox -s coverage -- --coverage 95 + - name: "Upload artifact" + uses: "actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce" + with: + name: coverage-data + path: ".coverage.*" + if-no-files-found: error + + integration_test: + + needs: + - tests + + name: 🔗 Integration Tests + runs-on: ubuntu-latest + + strategy: + fail-fast: false + matrix: + downstream_project: + - niquests + - requests + + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - name: Set up Python + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: '3' + - name: Install dependencies + run: pip install nox + - name: Integration Tests with Requests + run: nox -s downstream_${{ matrix.downstream_project }} chardet_bc: @@ -145,22 +122,11 @@ jobs: - name: Set up 
Python uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 with: - python-version: '3.11' + python-version: '3' - name: Install dependencies - run: | - python -m pip install -U pip setuptools - python -m pip install -r dev-requirements.txt - python -m pip uninstall -y charset-normalizer - - name: Install the package - run: | - python -m build - python -m pip install ./dist/*.whl - - name: Clone the complete dataset - run: | - git clone https://github.com/Ousret/char-dataset.git + run: pip install nox - name: BC Coverage - run: | - python ./bin/bc.py --coverage 80 + run: nox -s backward_compatibility -- --coverage 80 mypyc_test: @@ -184,7 +150,7 @@ jobs: os: [ ubuntu-latest, macos-latest, windows-latest ] include: - python-version: "3.7" - os: ubuntu-latest + os: ubuntu-22.04 - python-version: "3.7" os: macos-13 - python-version: "3.7" @@ -198,55 +164,66 @@ jobs: with: python-version: ${{ matrix.python-version }} allow-prereleases: true - - name: Install dependencies - run: | - python -m pip install -U pip setuptools - python -m pip install -r dev-requirements.txt - python -m pip uninstall -y charset-normalizer - - name: Install the package - env: - CHARSET_NORMALIZER_USE_MYPYC: '1' - run: | - python -m pip install . 
- - name: Clone the complete dataset - run: | - git clone https://github.com/Ousret/char-dataset.git - - name: Coverage WITH preemptive - run: | - python ./bin/coverage.py --coverage 97 --with-preemptive - - name: Performance (Normal) + - name: Install nox + run: pip install nox + - name: Run tests with mypyc enabled + run: nox -s test_mypyc-${{ matrix.python-version }} + - name: "Upload artifact" + uses: "actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce" + with: + name: coverage-data + path: ".coverage.*" + if-no-files-found: error + + coverage: + if: always() + runs-on: "ubuntu-latest" + needs: + - tests + - mypyc_test + - detection_coverage + steps: + - name: "Checkout repository" + uses: "actions/checkout@d632683dd7b4114ad314bca15554477dd762a938" + + - name: "Setup Python" + uses: "actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3" + with: + python-version: "3.x" + + - name: "Install coverage" + run: "python -m pip install --upgrade coverage" + + - name: "Download artifact" + uses: actions/download-artifact@9bc31d5ccc31df68ecc42ccf4149144866c47d8a + with: + name: coverage-data + + - name: "Combine & check coverage" run: | - python ./bin/performance.py + python -m coverage combine + python -m coverage html --skip-covered --skip-empty + python -m coverage report --ignore-errors --show-missing --fail-under=92 + + - name: "Upload report" + uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce + with: + name: coverage-report + path: htmlcov performance: - name: ⚡ Performance Test (no MypyC) + name: ⚡ Performance Test runs-on: ubuntu-latest - needs: - - mypyc_test - - chardet_bc + needs: coverage steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Set up Python uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 with: - python-version: '3.11' + python-version: '3' - name: Install dependencies - run: | - python -m pip install -U pip setuptools - python -m pip 
install -r dev-requirements.txt - python -m pip uninstall -y charset-normalizer - - name: Install the package - run: | - python -m build - python -m pip install ./dist/*.whl - - name: Clone the complete dataset - run: | - git clone https://github.com/Ousret/char-dataset.git - - name: Performance (Normal) - run: | - python ./bin/performance.py - - name: Performance (Medium) - run: | - python ./bin/performance.py --size-increase 2 + run: pip install nox + - name: Performance Measurement + run: nox -s performance diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 54c73a6b..3f31cc31 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -45,7 +45,7 @@ jobs: uses: github/codeql-action/init@48ab28a6f5dbc2a99bf1e0131198dd8f1df78169 # v3.28.0 with: languages: ${{ matrix.language }} - + # Autobuild attempts to build any compiled languages (C/C++, C#, Go, or Java). # If this step fails, then you should remove it and run the build manually (see below) - name: Autobuild diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 09fa625e..0b051524 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,31 +1,30 @@ exclude: 'docs/|data/|tests/' repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 + hooks: + - id: check-yaml + - id: debug-statements + - id: end-of-file-fixer + - id: trailing-whitespace - repo: https://github.com/asottile/pyupgrade - rev: v3.3.1 + rev: v3.19.1 hooks: - id: pyupgrade - args: ["--py37-plus"] - - - repo: https://github.com/psf/black - rev: 23.1.0 - hooks: - - id: black - args: ["--target-version", "py37"] - - - repo: https://github.com/PyCQA/isort - rev: 5.12.0 + args: [ --py37-plus, --keep-runtime-typing ] + - repo: https://github.com/astral-sh/ruff-pre-commit + # Ruff version. 
+ rev: v0.8.4 hooks: - - id: isort - - - repo: https://github.com/PyCQA/flake8 - rev: 6.1.0 - hooks: - - id: flake8 - additional_dependencies: [flake8-2020] - + # Run the linter. + - id: ruff + args: [ --fix ] + # Run the formatter. + - id: ruff-format - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.13.0 + rev: v1.14.0 hooks: - id: mypy - exclude: 'tests/|bin/' + args: [ --check-untyped-defs ] + exclude: 'tests/|noxfile.py|setup.py|bin/' diff --git a/CHANGELOG.md b/CHANGELOG.md index 608567e4..83ceb412 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,7 @@ All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). -## [3.4.1](https://github.com/Ousret/charset_normalizer/compare/3.4.0...master) (2024-10-??) +## [3.4.1](https://github.com/Ousret/charset_normalizer/compare/3.4.0...3.4.1) (2024-12-24) ### Changed - Project metadata are now stored using `pyproject.toml` instead of `setup.cfg` using setuptools as the build backend. @@ -10,9 +10,16 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ### Added - pre-commit configuration. +- noxfile. ### Removed - `build-requirements.txt` as per using `pyproject.toml` native build configuration. +- `bin/integration.py` and `bin/serve.py` in favor of downstream integration test (see noxfile). +- `setup.cfg` in favor of `pyproject.toml` metadata configuration. +- unused `utils.unicode_range` function. + +### Fixed +- converting content to Unicode bytes may insert `utf_8` instead of preferred `utf-8`. (#572) ## [3.4.0](https://github.com/Ousret/charset_normalizer/compare/3.3.2...3.4.0) (2024-10-08) @@ -193,7 +200,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). 
## [2.0.12](https://github.com/Ousret/charset_normalizer/compare/2.0.11...2.0.12) (2022-02-12) ### Fixed -- ASCII miss-detection on rare cases (PR #170) +- ASCII miss-detection on rare cases (PR #170) ## [2.0.11](https://github.com/Ousret/charset_normalizer/compare/2.0.10...2.0.11) (2022-01-30) @@ -225,7 +232,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). - MD improvement on trailing data and long foreign (non-pure latin) data (PR #124) - Efficiency improvements in cd/alphabet_languages from [@adbar](https://github.com/adbar) (PR #122) - call sum() without an intermediary list following PEP 289 recommendations from [@adbar](https://github.com/adbar) (PR #129) -- Code style as refactored by Sourcery-AI (PR #131) +- Code style as refactored by Sourcery-AI (PR #131) - Minor adjustment on the MD around european words (PR #133) - Remove and replace SRTs from assets / tests (PR #139) - Initialize the library logger with a `NullHandler` by default from [@nmaynes](https://github.com/nmaynes) (PR #135) @@ -298,7 +305,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ## [2.0.2](https://github.com/Ousret/charset_normalizer/compare/2.0.1...2.0.2) (2021-07-15) ### Fixed -- Empty/Too small JSON payload miss-detection fixed. Report from [@tseaver](https://github.com/tseaver) (PR #59) +- Empty/Too small JSON payload miss-detection fixed. Report from [@tseaver](https://github.com/tseaver) (PR #59) ### Changed - Don't inject unicodedata2 into sys.modules from [@akx](https://github.com/akx) (PR #57) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index abee674b..40b19f0e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,12 +1,12 @@ # Contribution Guidelines -If you’re reading this, you’re probably interested in contributing to Charset Normalizer. -Thank you very much! 
Open source projects live-and-die based on the support they receive from others, +If you’re reading this, you’re probably interested in contributing to Charset Normalizer. +Thank you very much! Open source projects live-and-die based on the support they receive from others, and the fact that you’re even considering contributing to this project is very generous of you. ## Questions -The GitHub issue tracker is for *bug reports* and *feature requests*. +The GitHub issue tracker is for *bug reports* and *feature requests*. Questions are allowed only when no answer are provided in docs. ## Good Bug Reports @@ -67,6 +67,10 @@ the backward-compatibility. ## How to run tests locally? It is essential that you run, prior to any submissions the mandatory checks. -Run the script `./bin/run_checks.sh` to verify that your modification are not breaking anything. -Also, make sure to run the `./bin/run_autofix.sh` to comply with the style format and import sorting. +```shell +pip install nox +nox -s test +nox -s lint +nox -s coverage +``` diff --git a/LICENSE b/LICENSE index ad82355b..9725772c 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2019 TAHRI Ahmed R. +Copyright (c) 2025 TAHRI Ahmed R. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. \ No newline at end of file +SOFTWARE. 
diff --git a/MANIFEST.in b/MANIFEST.in index 3792f5bb..8da2cd04 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,4 @@ -include LICENSE README.md CHANGELOG.md charset_normalizer/py.typed dev-requirements.txt +include LICENSE README.md CHANGELOG.md src/charset_normalizer/py.typed dev-requirements.txt SECURITY.md noxfile.py recursive-include data *.md recursive-include data *.txt recursive-include docs * diff --git a/README.md b/README.md index 13e6e14f..ee5b2e7e 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@

Featured Packages
- Static Badge + Static Badge Static Badge @@ -55,8 +55,7 @@ This project offers you an alternative to **Universal Charset Encoding Detector* Reading Normalized TextCat Reading Text

-*\*\* : They are clearly using specific code for a specific encoding even if covering most of used one*
-Did you got there because of the logs? See [https://charset-normalizer.readthedocs.io/en/latest/user/miscellaneous.html](https://charset-normalizer.readthedocs.io/en/latest/user/miscellaneous.html) +*\*\* : They are clearly using specific code for a specific encoding even if it covers most of the encodings in use*
## ⚡ Performance @@ -64,21 +63,23 @@ This package offer better performance than its counterpart Chardet. Here are som | Package | Accuracy | Mean per file (ms) | File per sec (est) | |-----------------------------------------------|:--------:|:------------------:|:------------------:| -| [chardet](https://github.com/chardet/chardet) | 86 % | 200 ms | 5 file/sec | +| [chardet](https://github.com/chardet/chardet) | 86 % | 63 ms | 16 file/sec | | charset-normalizer | **98 %** | **10 ms** | 100 file/sec | | Package | 99th percentile | 95th percentile | 50th percentile | |-----------------------------------------------|:---------------:|:---------------:|:---------------:| -| [chardet](https://github.com/chardet/chardet) | 1200 ms | 287 ms | 23 ms | +| [chardet](https://github.com/chardet/chardet) | 265 ms | 71 ms | 7 ms | | charset-normalizer | 100 ms | 50 ms | 5 ms | +_updated as of december 2024 using CPython 3.12_ + Chardet's performance on larger file (1MB+) are very poor. Expect huge difference on large payload. > Stats are generated using 400+ files using default parameters. More details on used files, see GHA workflows. > And yes, these results might change at any time. The dataset can be updated to include more files. > The actual delays heavily depends on your CPU capabilities. The factors should remain the same. > Keep in mind that the stats are generous and that Chardet accuracy vs our is measured using Chardet initial capability -> (eg. Supported Encoding) Challenge-them if you want. +> (e.g. Supported Encoding) Challenge-them if you want. ## ✨ Installation @@ -195,11 +196,11 @@ reliable alternative using a completely different method. Also! I never back dow I **don't care** about the **originating charset** encoding, because **two different tables** can produce **two identical rendered string.** -What I want is to get readable text, the best I can. +What I want is to get readable text, the best I can. 
In a way, **I'm brute forcing text decoding.** How cool is that ? 😎 -Don't confuse package **ftfy** with charset-normalizer or chardet. ftfy goal is to repair unicode string whereas charset-normalizer to convert raw file in unknown encoding to unicode. +Don't confuse package **ftfy** with charset-normalizer or chardet. ftfy goal is to repair Unicode string whereas charset-normalizer to convert raw file in unknown encoding to unicode. ## 🍰 How @@ -211,7 +212,7 @@ Don't confuse package **ftfy** with charset-normalizer or chardet. ftfy goal is **Wait a minute**, what is noise/mess and coherence according to **YOU ?** *Noise :* I opened hundred of text files, **written by humans**, with the wrong encoding table. **I observed**, then -**I established** some ground rules about **what is obvious** when **it seems like** a mess. +**I established** some ground rules about **what is obvious** when **it seems like** a mess (aka. defining noise in rendered text). I know that my interpretation of what is noise is probably incomplete, feel free to contribute in order to improve or rewrite it. @@ -255,3 +256,5 @@ from the experts who know it best, while seamlessly integrating with existing tools. [1]: https://tidelift.com/subscription/pkg/pypi-charset-normalizer?utm_source=pypi-charset-normalizer&utm_medium=readme + +[![OpenSSF Best Practices](https://www.bestpractices.dev/projects/7297/badge)](https://www.bestpractices.dev/projects/7297) diff --git a/UPGRADE.md b/UPGRADE.md deleted file mode 100644 index 4b8f7bb1..00000000 --- a/UPGRADE.md +++ /dev/null @@ -1,31 +0,0 @@ -Guide to upgrade your code from v1 to v2 ----------------------------------------- - - * If you are using the legacy `detect` function, that is it. You have nothing to do. 
- -## Detection - -### Before - -```python -from charset_normalizer import CharsetNormalizerMatches - -results = CharsetNormalizerMatches.from_bytes( - '我没有埋怨,磋砣的只是一些时间。'.encode('utf_32') -) -``` - -### After - -```python -from charset_normalizer import from_bytes - -results = from_bytes( - '我没有埋怨,磋砣的只是一些时间。'.encode('utf_32') -) -``` - -Methods that once were staticmethods of the class `CharsetNormalizerMatches` are now basic functions. -`from_fp`, `from_bytes`, `from_fp` and `` are concerned. - -Staticmethods scheduled to be removed in version 3.0 diff --git a/bin/bc.py b/bin/bc.py index 4eacc1c4..cac23682 100644 --- a/bin/bc.py +++ b/bin/bc.py @@ -1,4 +1,3 @@ -#!/bin/python from __future__ import annotations import argparse diff --git a/bin/coverage.py b/bin/coverage.py index e5ba0110..a84bb73c 100644 --- a/bin/coverage.py +++ b/bin/coverage.py @@ -1,4 +1,3 @@ -#!/bin/python from __future__ import annotations import argparse diff --git a/bin/integration.py b/bin/integration.py deleted file mode 100644 index 7313ae5e..00000000 --- a/bin/integration.py +++ /dev/null @@ -1,56 +0,0 @@ -from __future__ import annotations - -from requests import __version__, get - -from charset_normalizer import __version__ as __version_cn__ -from charset_normalizer import detect - -if __name__ == "__main__": - print(f"requests {__version__}") - print(f"charset_normalizer {__version_cn__}") - - files: list[str] = get("http://127.0.0.1:8080/").json() - - print("## Testing with actual files") - - for file in files: - r = get("http://127.0.0.1:8080/" + file) - - if r.ok is False: - print(f"Unable to retrieve '{file}' | HTTP/{r.status_code}") - exit(1) - - expected_encoding = detect(r.content)["encoding"] - - if expected_encoding != r.apparent_encoding: - print( - f"Integration test failed | File '{file}' | Expected '{expected_encoding}' got '{r.apparent_encoding}'" - ) - exit(1) - - print(f"✅✅ '{file}' OK") - - print("## Testing with edge cases") - - # Should NOT crash - 
get("http://127.0.0.1:8080/edge/empty/json").json() - - print("✅✅ Empty JSON OK") - - if get("http://127.0.0.1:8080/edge/empty/plain").apparent_encoding != "utf-8": - print("Empty payload SHOULD not return apparent_encoding != UTF-8") - exit(1) - - print("✅✅ Empty Plain Text OK") - - r = get("http://127.0.0.1:8080/edge/gb18030/json") - - if r.apparent_encoding != "GB18030": - print("JSON Basic Detection FAILURE (/edge/gb18030/json)") - exit(1) - - r.json() - - print("✅✅ GB18030 JSON Encoded OK") - - print("Integration tests passed!") diff --git a/bin/performance.py b/bin/performance.py index 41195b8f..3a55c188 100644 --- a/bin/performance.py +++ b/bin/performance.py @@ -1,4 +1,3 @@ -#!/bin/python from __future__ import annotations import argparse diff --git a/bin/serve.py b/bin/serve.py deleted file mode 100644 index 99d6b6ea..00000000 --- a/bin/serve.py +++ /dev/null @@ -1,51 +0,0 @@ -from __future__ import annotations - -from glob import glob - -from flask import Flask, jsonify, send_from_directory - -app = Flask(__name__) - - -@app.route("/raw/") -def read_file(path): - return ( - send_from_directory("../char-dataset", path, as_attachment=True), - 200, - {"Content-Type": "text/plain"}, - ) - - -@app.route("/") -def read_targets(): - return jsonify( - [ - el.replace("./char-dataset", "/raw").replace("\\", "/") - for el in sorted(glob("./char-dataset/**/*")) - ] - ) - - -@app.route("/edge/empty/plain") -def read_empty_response_plain(): - return b"", 200, {"Content-Type": "text/plain"} - - -@app.route("/edge/empty/json") -def read_empty_response_json(): - return b"{}", 200, {"Content-Type": "application/json"} - - -@app.route("/edge/gb18030/json") -def read_gb18030_response_json(): - return ( - '{"abc": "我没有埋怨,磋砣的只是一些时间。。今觀俗士之論也,以族舉德,以位命賢,茲可謂得論之一體矣,而未獲至論之淑真也。"}'.encode( - "gb18030" - ), - 200, - {"Content-Type": "application/json"}, - ) - - -if __name__ == "__main__": - app.run(host="127.0.0.1", port=8080) diff --git a/dev-requirements.txt b/dev-requirements.txt 
index 0d0b27fb..19f1c2bb 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,7 +1,2 @@ -chardet==5.1.0 -pytest-cov==4.1.0 -Flask==2.2.3 -pytest>=7.4.4,<=8.3.4 -requests==2.31.0 -pre-commit -build +coverage>=7.2.7,<7.7 +pytest>=7.4.4,<9 diff --git a/docs/index.rst b/docs/index.rst index 19ca08a9..da8a9b6f 100755 --- a/docs/index.rst +++ b/docs/index.rst @@ -20,7 +20,7 @@ It aims to be as generic as possible. It is released under MIT license, see LICENSE for more details. Be aware that no warranty of any kind is provided with this package. -Copyright (C) 2023 Ahmed TAHRI +Copyright (C) 2025 Ahmed TAHRI Introduction ============ diff --git a/noxfile.py b/noxfile.py new file mode 100644 index 00000000..b8f0c8c6 --- /dev/null +++ b/noxfile.py @@ -0,0 +1,232 @@ +from __future__ import annotations + +import os +import shutil + +import nox + + +def test_impl( + session: nox.Session, + use_mypyc: bool = False, +): + # Install deps and the package itself. + session.install("-U", "pip", "setuptools", silent=False) + session.install("-r", "dev-requirements.txt", silent=False) + + session.install( + ".", + silent=False, + env={"CHARSET_NORMALIZER_USE_MYPYC": "1" if use_mypyc else "0"}, + ) + + # Show the pip version. + session.run("pip", "--version") + # Print the Python version and bytesize. 
+ session.run("python", "--version") + # Show charset-normalizer cli info + session.run("normalizer", "--version") + + # Inspired from https://hynek.me/articles/ditch-codecov-python/ + # We use parallel mode and then combine in a later CI step + session.run( + "python", + "-m", + "coverage", + "run", + "--parallel-mode", + "-m", + "pytest", + "-v", + "-ra", + f"--color={'yes' if 'GITHUB_ACTIONS' in os.environ else 'auto'}", + "--tb=native", + "--durations=10", + "--strict-config", + "--strict-markers", + *(session.posargs or ("tests/",)), + env={ + "PYTHONWARNINGS": "always::DeprecationWarning", + "COVERAGE_CORE": "sysmon", + }, + ) + + +@nox.session(python=["3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "3.13", "pypy"]) +def test(session: nox.Session) -> None: + test_impl(session) + + +@nox.session(python=["3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "3.13"]) +def test_mypyc(session: nox.Session) -> None: + test_impl(session, True) + + +def git_clone(session: nox.Session, git_url: str) -> None: + """We either clone the target repository or if already exist + simply reset the state and pull. + """ + expected_directory = git_url.split("/")[-1] + + if expected_directory.endswith(".git"): + expected_directory = expected_directory[:-4] + + if not os.path.isdir(expected_directory): + session.run("git", "clone", "--depth", "1", git_url, external=True) + else: + session.run( + "git", "-C", expected_directory, "reset", "--hard", "HEAD", external=True + ) + session.run("git", "-C", expected_directory, "pull", external=True) + + +@nox.session() +def backward_compatibility(session: nox.Session) -> None: + git_clone(session, "https://github.com/ousret/char-dataset") + + # Install deps and the package itself. 
+ session.install("-U", "pip", "setuptools", silent=False) + session.install("-r", "dev-requirements.txt", silent=False) + + session.install(".", silent=False) + session.install("chardet") + + session.run( + "python", + "bin/bc.py", + *(session.posargs or ("--coverage=85",)), + ) + + +@nox.session() +def coverage(session: nox.Session) -> None: + git_clone(session, "https://github.com/ousret/char-dataset") + + # Install deps and the package itself. + session.install("-U", "pip", "setuptools", silent=False) + session.install("-r", "dev-requirements.txt", silent=False) + + session.install(".", silent=False) + + # Show the pip version. + session.run("pip", "--version") + # Print the Python version and bytesize. + session.run("python", "--version") + # Show charset-normalizer cli info + session.run("normalizer", "--version") + + session.run( + "python", + "-m", + "coverage", + "run", + "--parallel-mode", + "bin/coverage.py", + *(session.posargs or ("--coverage=90", "--with-preemptive")), + ) + + +@nox.session() +def performance(session: nox.Session) -> None: + git_clone(session, "https://github.com/ousret/char-dataset") + + # Install deps and the package itself. 
+ session.install("-U", "pip", "setuptools", silent=False) + session.install("-r", "dev-requirements.txt", silent=False) + + session.install("chardet") + session.install(".", silent=False, env={"CHARSET_NORMALIZER_USE_MYPYC": "1"}) + + session.run( + "python", + "bin/performance.py", + *(session.posargs or ()), + ) + + +@nox.session() +def downstream_niquests(session: nox.Session) -> None: + root = os.getcwd() + tmp_dir = session.create_tmp() + + session.cd(tmp_dir) + git_clone(session, "https://github.com/jawah/niquests") + session.chdir("niquests") + + session.run("git", "rev-parse", "HEAD", external=True) + session.install(".[socks]", silent=False) + session.install("-r", "requirements-dev.txt", silent=False) + + session.cd(root) + session.install(".", silent=False) + session.cd(f"{tmp_dir}/niquests") + + session.run( + "python", + "-c", + "import charset_normalizer; print(charset_normalizer.__version__)", + ) + session.run( + "python", + "-m", + "pytest", + "-v", + f"--color={'yes' if 'GITHUB_ACTIONS' in os.environ else 'auto'}", + *(session.posargs or ("tests/",)), + env={"NIQUESTS_STRICT_OCSP": "1"}, + ) + + +@nox.session() +def downstream_requests(session: nox.Session) -> None: + root = os.getcwd() + tmp_dir = session.create_tmp() + + session.cd(tmp_dir) + git_clone(session, "https://github.com/psf/requests") + session.chdir("requests") + + session.run("git", "rev-parse", "HEAD", external=True) + session.install(".[socks]", silent=False) + session.install("-r", "requirements-dev.txt", silent=False) + + session.cd(root) + session.install(".", silent=False) + session.cd(f"{tmp_dir}/requests") + + session.run( + "python", + "-c", + "import charset_normalizer; print(charset_normalizer.__version__)", + ) + session.run( + "python", + "-m", + "pytest", + "-v", + f"--color={'yes' if 'GITHUB_ACTIONS' in os.environ else 'auto'}", + *(session.posargs or ("tests/",)), + ) + + +@nox.session() +def format(session: nox.Session) -> None: + """Run code formatters.""" + 
lint(session) + + +@nox.session +def lint(session: nox.Session) -> None: + session.install("pre-commit") + session.run("pre-commit", "run", "--all-files") + + +@nox.session +def docs(session: nox.Session) -> None: + session.install("-r", "docs/requirements.txt") + session.install(".") + + session.chdir("docs") + if os.path.exists("_build"): + shutil.rmtree("_build") + session.run("sphinx-build", "-b", "html", "-W", ".", "_build/html") diff --git a/pyproject.toml b/pyproject.toml index 05f32e5a..a9ad91c4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["setuptools", "setuptools-scm", "mypy>=1.4.1,<=1.13.0"] +requires = ["setuptools", "setuptools-scm", "mypy>=1.4.1,<=1.14.0"] build-backend = "setuptools.build_meta" [project] @@ -40,6 +40,10 @@ dynamic = ["version", "readme"] [project.optional-dependencies] unicode_backport = [] +[tool.setuptools] +package-dir = {"" = "src"} +packages = ["charset_normalizer", "charset_normalizer.cli", ] + [tool.setuptools.dynamic] version = {attr = "charset_normalizer.__version__"} readme = {file = ["README.md", "CHANGELOG.md", "LICENSE"]} @@ -53,11 +57,11 @@ normalizer = "charset_normalizer:cli.cli_detect" "Code" = "https://github.com/jawah/charset_normalizer" "Issue tracker" = "https://github.com/jawah/charset_normalizer/issues" -[tool.setuptools.packages.find] -exclude = ["tests*"] - [tool.pytest.ini_options] -addopts = "--cov=charset_normalizer --cov-report=term-missing -rxXs" +log_level = "DEBUG" +filterwarnings = [ + "error", +] [tool.isort] profile = "black" diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 5d23e253..00000000 --- a/setup.cfg +++ /dev/null @@ -1,3 +0,0 @@ -[flake8] -ignore = W503, E203, B305 -max-line-length = 120 diff --git a/setup.py b/setup.py index c113acda..da2a69ff 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,4 @@ #!/usr/bin/env python - from __future__ import annotations import os @@ -12,7 +11,7 @@ if len(sys.argv) > 1 and sys.argv[1] == 
"--use-mypyc": sys.argv.pop(1) USE_MYPYC = True -if os.getenv("CHARSET_NORMALIZER_USE_MYPYC", None) == "1": +elif os.getenv("CHARSET_NORMALIZER_USE_MYPYC", None) == "1": USE_MYPYC = True if USE_MYPYC: @@ -20,9 +19,10 @@ MYPYC_MODULES = mypycify( [ - "charset_normalizer/md.py", + "src/charset_normalizer/md.py", ], debug_level="0", + opt_level="3", ) else: MYPYC_MODULES = None diff --git a/charset_normalizer/__init__.py b/src/charset_normalizer/__init__.py similarity index 99% rename from charset_normalizer/__init__.py rename to src/charset_normalizer/__init__.py index 348341fb..0d3a3799 100644 --- a/charset_normalizer/__init__.py +++ b/src/charset_normalizer/__init__.py @@ -18,6 +18,7 @@ :copyright: (c) 2021 by Ahmed TAHRI :license: MIT, see LICENSE for more details. """ + from __future__ import annotations import logging diff --git a/charset_normalizer/__main__.py b/src/charset_normalizer/__main__.py similarity index 100% rename from charset_normalizer/__main__.py rename to src/charset_normalizer/__main__.py diff --git a/charset_normalizer/api.py b/src/charset_normalizer/api.py similarity index 98% rename from charset_normalizer/api.py rename to src/charset_normalizer/api.py index 9ffc049d..2c8c0618 100644 --- a/charset_normalizer/api.py +++ b/src/charset_normalizer/api.py @@ -23,8 +23,6 @@ should_strip_sig_or_bom, ) -# Will most likely be controversial -# logging.addLevelName(TRACE, "TRACE") logger = logging.getLogger("charset_normalizer") explain_handler = logging.StreamHandler() explain_handler.setFormatter( @@ -78,7 +76,7 @@ def from_bytes( if length == 0: logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.") - if explain: + if explain: # Defensive: ensure exit path clean handler logger.removeHandler(explain_handler) logger.setLevel(previous_logger_level or logging.WARNING) return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")]) @@ -464,7 +462,7 @@ def from_bytes( "Encoding detection: %s is most likely the one.", 
current_match.encoding, ) - if explain: + if explain: # Defensive: ensure exit path clean handler logger.removeHandler(explain_handler) logger.setLevel(previous_logger_level) return CharsetMatches([current_match]) @@ -482,7 +480,7 @@ def from_bytes( "Encoding detection: %s is most likely the one.", probable_result.encoding, ) - if explain: + if explain: # Defensive: ensure exit path clean handler logger.removeHandler(explain_handler) logger.setLevel(previous_logger_level) @@ -494,7 +492,7 @@ def from_bytes( "the beginning of the sequence.", encoding_iana, ) - if explain: + if explain: # Defensive: ensure exit path clean handler logger.removeHandler(explain_handler) logger.setLevel(previous_logger_level) return CharsetMatches([results[encoding_iana]]) diff --git a/charset_normalizer/cd.py b/src/charset_normalizer/cd.py similarity index 100% rename from charset_normalizer/cd.py rename to src/charset_normalizer/cd.py diff --git a/charset_normalizer/cli/__init__.py b/src/charset_normalizer/cli/__init__.py similarity index 100% rename from charset_normalizer/cli/__init__.py rename to src/charset_normalizer/cli/__init__.py diff --git a/charset_normalizer/cli/__main__.py b/src/charset_normalizer/cli/__main__.py similarity index 99% rename from charset_normalizer/cli/__main__.py rename to src/charset_normalizer/cli/__main__.py index b4f364be..64a290f2 100644 --- a/charset_normalizer/cli/__main__.py +++ b/src/charset_normalizer/cli/__main__.py @@ -125,7 +125,7 @@ def cli_detect(argv: list[str] | None = None) -> int: default=0.2, type=float, dest="threshold", - help="Define a custom maximum amount of chaos allowed in decoded content. 0. <= chaos <= 1.", + help="Define a custom maximum amount of noise allowed in decoded content. 0. 
<= noise <= 1.", ) parser.add_argument( "--version", diff --git a/charset_normalizer/constant.py b/src/charset_normalizer/constant.py similarity index 100% rename from charset_normalizer/constant.py rename to src/charset_normalizer/constant.py diff --git a/charset_normalizer/legacy.py b/src/charset_normalizer/legacy.py similarity index 94% rename from charset_normalizer/legacy.py rename to src/charset_normalizer/legacy.py index cfb876a1..a2f53451 100644 --- a/charset_normalizer/legacy.py +++ b/src/charset_normalizer/legacy.py @@ -37,8 +37,9 @@ def detect( if not isinstance(byte_str, (bytearray, bytes)): raise TypeError( # pragma: nocover - "Expected object of type bytes or bytearray, got: " - "{}".format(type(byte_str)) + "Expected object of type bytes or bytearray, got: " "{}".format( + type(byte_str) + ) ) if isinstance(byte_str, bytearray): diff --git a/charset_normalizer/md.py b/src/charset_normalizer/md.py similarity index 97% rename from charset_normalizer/md.py rename to src/charset_normalizer/md.py index d177db01..9ed59a86 100644 --- a/charset_normalizer/md.py +++ b/src/charset_normalizer/md.py @@ -93,7 +93,7 @@ def feed(self, character: str) -> None: self._last_printable_char = character - def reset(self) -> None: # pragma: no cover + def reset(self) -> None: # Abstract self._punctuation_count = 0 self._character_count = 0 self._symbol_count = 0 @@ -124,7 +124,7 @@ def feed(self, character: str) -> None: if is_accentuated(character): self._accentuated_count += 1 - def reset(self) -> None: # pragma: no cover + def reset(self) -> None: # Abstract self._character_count = 0 self._accentuated_count = 0 @@ -150,7 +150,7 @@ def feed(self, character: str) -> None: self._unprintable_count += 1 self._character_count += 1 - def reset(self) -> None: # pragma: no cover + def reset(self) -> None: # Abstract self._unprintable_count = 0 @property @@ -185,7 +185,7 @@ def feed(self, character: str) -> None: self._successive_count += 1 self._last_latin_character = character 
- def reset(self) -> None: # pragma: no cover + def reset(self) -> None: # Abstract self._successive_count = 0 self._character_count = 0 self._last_latin_character = None @@ -230,7 +230,7 @@ def feed(self, character: str) -> None: self._last_printable_seen = character - def reset(self) -> None: # pragma: no cover + def reset(self) -> None: # Abstract self._character_count = 0 self._suspicious_successive_range_count = 0 self._last_printable_seen = None @@ -347,7 +347,7 @@ def feed(self, character: str) -> None: self._is_current_word_bad = True self._buffer += character - def reset(self) -> None: # pragma: no cover + def reset(self) -> None: # Abstract self._buffer = "" self._is_current_word_bad = False self._foreign_long_watch = False @@ -385,7 +385,7 @@ def feed(self, character: str) -> None: if is_cjk(character): self._cjk_character_count += 1 - def reset(self) -> None: # pragma: no cover + def reset(self) -> None: # Abstract self._wrong_stop_count = 0 self._cjk_character_count = 0 @@ -455,7 +455,7 @@ def feed(self, character: str) -> None: self._character_count_since_last_sep += 1 self._last_alpha_seen = character - def reset(self) -> None: # pragma: no cover + def reset(self) -> None: # Abstract self._character_count = 0 self._character_count_since_last_sep = 0 self._successive_upper_lower_count = 0 @@ -477,7 +477,7 @@ def __init__(self) -> None: self._character_count: int = 0 self._isolated_form_count: int = 0 - def reset(self) -> None: # pragma: no cover + def reset(self) -> None: # Abstract self._character_count = 0 self._isolated_form_count = 0 @@ -526,9 +526,10 @@ def is_suspiciously_successive_range( ): return False - keywords_range_a, keywords_range_b = unicode_range_a.split( - " " - ), unicode_range_b.split(" ") + keywords_range_a, keywords_range_b = ( + unicode_range_a.split(" "), + unicode_range_b.split(" "), + ) for el in keywords_range_a: if el in UNICODE_SECONDARY_RANGE_KEYWORD: @@ -623,7 +624,7 @@ def mess_ratio( logger.log(TRACE, f"Starting with: 
{decoded_sequence[:16]}") logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}") - for dt in detectors: # pragma: nocover + for dt in detectors: logger.log(TRACE, f"{dt.__class__}: {dt.ratio}") return round(mean_mess_ratio, 3) diff --git a/charset_normalizer/models.py b/src/charset_normalizer/models.py similarity index 98% rename from charset_normalizer/models.py rename to src/charset_normalizer/models.py index 09492a9c..1042758f 100644 --- a/charset_normalizer/models.py +++ b/src/charset_normalizer/models.py @@ -221,10 +221,11 @@ def output(self, encoding: str = "utf_8") -> bytes: patched_header = sub( RE_POSSIBLE_ENCODING_INDICATION, lambda m: m.string[m.span()[0] : m.span()[1]].replace( - m.groups()[0], iana_name(self._output_encoding) # type: ignore[arg-type] + m.groups()[0], + iana_name(self._output_encoding).replace("_", "-"), # type: ignore[arg-type] ), decoded_string[:8192], - 1, + count=1, ) decoded_string = patched_header + decoded_string[8192:] diff --git a/charset_normalizer/py.typed b/src/charset_normalizer/py.typed similarity index 100% rename from charset_normalizer/py.typed rename to src/charset_normalizer/py.typed diff --git a/charset_normalizer/utils.py b/src/charset_normalizer/utils.py similarity index 93% rename from charset_normalizer/utils.py rename to src/charset_normalizer/utils.py index 4498d3c6..0175e0a9 100644 --- a/charset_normalizer/utils.py +++ b/src/charset_normalizer/utils.py @@ -27,7 +27,7 @@ def is_accentuated(character: str) -> bool: try: description: str = unicodedata.name(character) - except ValueError: + except ValueError: # Defensive: unicode database outdated? return False return ( "WITH GRAVE" in description @@ -70,7 +70,7 @@ def unicode_range(character: str) -> str | None: def is_latin(character: str) -> bool: try: description: str = unicodedata.name(character) - except ValueError: + except ValueError: # Defensive: unicode database outdated? 
return False return "LATIN" in description @@ -134,7 +134,7 @@ def is_case_variable(character: str) -> bool: def is_cjk(character: str) -> bool: try: character_name = unicodedata.name(character) - except ValueError: + except ValueError: # Defensive: unicode database outdated? return False return "CJK" in character_name @@ -144,7 +144,7 @@ def is_cjk(character: str) -> bool: def is_hiragana(character: str) -> bool: try: character_name = unicodedata.name(character) - except ValueError: + except ValueError: # Defensive: unicode database outdated? return False return "HIRAGANA" in character_name @@ -154,7 +154,7 @@ def is_hiragana(character: str) -> bool: def is_katakana(character: str) -> bool: try: character_name = unicodedata.name(character) - except ValueError: + except ValueError: # Defensive: unicode database outdated? return False return "KATAKANA" in character_name @@ -164,7 +164,7 @@ def is_katakana(character: str) -> bool: def is_hangul(character: str) -> bool: try: character_name = unicodedata.name(character) - except ValueError: + except ValueError: # Defensive: unicode database outdated? return False return "HANGUL" in character_name @@ -174,7 +174,7 @@ def is_hangul(character: str) -> bool: def is_thai(character: str) -> bool: try: character_name = unicodedata.name(character) - except ValueError: + except ValueError: # Defensive: unicode database outdated? return False return "THAI" in character_name @@ -184,7 +184,7 @@ def is_thai(character: str) -> bool: def is_arabic(character: str) -> bool: try: character_name = unicodedata.name(character) - except ValueError: + except ValueError: # Defensive: unicode database outdated? return False return "ARABIC" in character_name @@ -194,7 +194,7 @@ def is_arabic(character: str) -> bool: def is_arabic_isolated_form(character: str) -> bool: try: character_name = unicodedata.name(character) - except ValueError: + except ValueError: # Defensive: unicode database outdated? 
return False return "ARABIC" in character_name and "ISOLATED FORM" in character_name @@ -210,7 +210,7 @@ def is_unprintable(character: str) -> bool: return ( character.isspace() is False # includes \n \t \r \v and character.isprintable() is False - and character != "\x1A" # Why? Its the ASCII substitute character. + and character != "\x1a" # Why? Its the ASCII substitute character. and character != "\ufeff" # bug discovered in Python, # Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space. ) @@ -292,6 +292,7 @@ def should_strip_sig_or_bom(iana_encoding: str) -> bool: def iana_name(cp_name: str, strict: bool = True) -> str: + """Returns the Python normalized encoding name (Not the IANA official name).""" cp_name = cp_name.lower().replace("-", "_") encoding_alias: str @@ -307,20 +308,6 @@ def iana_name(cp_name: str, strict: bool = True) -> str: return cp_name -def range_scan(decoded_sequence: str) -> list[str]: - ranges: set[str] = set() - - for character in decoded_sequence: - character_range: str | None = unicode_range(character) - - if character_range is None: - continue - - ranges.add(character_range) - - return list(ranges) - - def cp_similarity(iana_name_a: str, iana_name_b: str) -> float: if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b): return 0.0 diff --git a/charset_normalizer/version.py b/src/charset_normalizer/version.py similarity index 100% rename from charset_normalizer/version.py rename to src/charset_normalizer/version.py diff --git a/tests/test_preemptive_detection.py b/tests/test_preemptive_detection.py index 64b52023..e56c4a16 100644 --- a/tests/test_preemptive_detection.py +++ b/tests/test_preemptive_detection.py @@ -34,7 +34,7 @@ def test_detect_most_common_body_encoding(payload, expected_encoding): [ ( b'', - b'', + b'', ), ( b'', @@ -51,19 +51,19 @@ def test_detect_most_common_body_encoding(payload, expected_encoding): ), ( b'', - b'', + b'', ), ( b'', - b'', + 
b'', ), ( b"", - b"", + b"", ), ( b'', - b'', + b'', ), ], )