#!/usr/bin/env python3

# SPDX-FileCopyrightText: 2024 Celestia Development Team
# SPDX-License-Identifier: GPL-2.0-or-later

"""Checks for Unicode normalization forms in CelestiaContent"""

# Provenance: reconstructed from the patch "Add check for Unicode
# normalization" (commit 619be5f8a13a3b65dd36c7701c9b9f979d11f179, Andrew
# Tribick, 2024-01-02), which adds this file as checks/checkunicode.py.
# The same patch renames .github/workflows/licenses.yml to checks.yml,
# renames the workflow "Licenses" -> "Checks" and its "reuse" job ->
# "licenses", and adds a "unicode" job (ubuntu-latest, actions/checkout@v4)
# that runs `python3 checks/checkunicode.py`.

import codecs
from enum import auto, Enum
import os
import sys
from typing import TextIO
import unicodedata


def eprint(*args, **kwargs) -> None:
    """Print to stderr"""
    print(*args, file=sys.stderr, **kwargs)


class _CheckState(Enum):
    """Scanner states used by check_strings while walking a single line."""

    NORMAL = auto()  # outside any quoted string
    QUOTED = auto()  # inside a double-quoted string
    ESCAPE = auto()  # inside a string, immediately after a backslash


def check_starnames(file: TextIO) -> bool:
    """Check starnames.dat for non-NFC strings

    Each line is split on ":"; the first field is skipped and every
    following field must be in NFC.  Problems are reported on stderr.

    Returns True when every checked field is normalized.
    """
    result = True
    for line_number, line in enumerate(file, 1):
        # Strip only the line terminator: slicing off the last character
        # unconditionally would eat real data on a final line that has no
        # trailing newline.
        fields = line.rstrip("\n").split(":")
        for entry in fields[1:]:
            if not unicodedata.is_normalized("NFC", entry):
                result = False
                eprint(f"Non-normalized NFC in {file.name} ({line_number})")
    return result


def _decode_escapes(text: str) -> str:
    """Expand backslash escapes in text while keeping non-ASCII characters.

    Passing a str straight to the unicode_escape decoder re-reads its UTF-8
    bytes as Latin-1, which garbles every literal non-ASCII character (and
    the garbled text is always NFC, hiding real problems).  Encoding with
    "backslashreplace" first keeps Latin-1 characters as single bytes and
    rewrites everything else as \\uXXXX / \\UXXXXXXXX escapes, so the decoder
    reproduces the original characters.

    Raises UnicodeDecodeError for malformed escape sequences.
    """
    raw = text.encode("latin-1", "backslashreplace")
    return codecs.decode(raw, "unicode_escape")


def _check_string(
    unescaped: str, file_name: str, line_number: int, split_names: bool
) -> bool:
    """Check one string constant (escapes not yet expanded) for NFC.

    When split_names is true the expanded string is split on ":" and each
    part is checked separately; otherwise the string is checked as a whole.
    Every problem is reported on stderr.

    Returns True when the string (or all of its parts) is normalized.
    """
    try:
        expanded = _decode_escapes(unescaped)
    except UnicodeDecodeError as err:
        # Report with location instead of aborting the whole run.
        eprint(
            f'Invalid escape in string constant "{unescaped}" in {file_name} ({line_number}): {err.reason}'
        )
        return False

    candidates = expanded.split(":") if split_names else [expanded]
    result = True
    for candidate in candidates:
        if not unicodedata.is_normalized("NFC", candidate):
            result = False
            eprint(
                f'Non-normalized string constant "{unescaped}" in {file_name} ({line_number})'
            )
    return result


def check_strings(file: TextIO, catalog_checks: bool) -> bool:
    """Check strings in .ssc/.stc file

    Scans the file line by line for double-quoted strings.  Outside quotes,
    "#" ends processing of the current line and "[" / "]" adjust a bracket
    depth that persists across lines.  Inside quotes, a backslash protects
    the following character from terminating the string.  The scanner state
    is reset at the start of every line, so a string that is not closed on
    the line where it opens is not checked.

    With catalog_checks enabled, strings found at bracket depth 0 are split
    on ":" and each part is checked on its own.

    Returns True when every checked string is in NFC.
    """
    result = True
    bracket_level = 0
    start_quote = 0
    for line_number, line in enumerate(file, 1):
        state = _CheckState.NORMAL
        for pos, ch in enumerate(line):
            if state == _CheckState.NORMAL:
                if ch == '"':
                    start_quote = pos + 1
                    state = _CheckState.QUOTED
                elif ch == "#":
                    break
                elif ch == "[":
                    bracket_level += 1
                elif ch == "]":
                    bracket_level -= 1
            elif state == _CheckState.QUOTED:
                if ch == "\\":
                    state = _CheckState.ESCAPE
                elif ch == '"':
                    split_names = catalog_checks and bracket_level == 0
                    result &= _check_string(
                        line[start_quote:pos], file.name, line_number, split_names
                    )
                    state = _CheckState.NORMAL
            elif state == _CheckState.ESCAPE:
                # The escaped character is consumed; back to string content.
                state = _CheckState.QUOTED
    return result


def check_directory(dirname: str, extensions: list[str], catalog_checks: bool) -> bool:
    """Checks all files in a directory

    Walks dirname recursively and runs check_strings on every file whose
    extension matches one of extensions (compared case-insensitively).
    Files are read as UTF-8.

    Returns True when every matching file passes.
    """
    result = True
    wanted = {e.casefold() for e in extensions}
    for root, _dirs, files in os.walk(dirname, topdown=True):
        for file_name in files:
            extension = os.path.splitext(file_name)[1]
            if extension.casefold() in wanted:
                path = os.path.join(root, file_name)
                with open(path, "rt", encoding="utf-8") as f:
                    result &= check_strings(f, catalog_checks)
    return result


def check_files() -> bool:
    """Checks the Unicode normalization status of CelestiaContent

    All paths are relative to the current working directory.

    Returns True when every check passes.
    """
    result = True
    with open("data/starnames.dat", "rt", encoding="utf-8") as f:
        result &= check_starnames(f)

    with open("data/asterisms.dat", "rt", encoding="utf-8") as f:
        result &= check_strings(f, catalog_checks=False)

    data_file_directories = ["data", "extras", "extras-standard"]
    data_file_extensions = [".ssc", ".stc", ".dsc"]
    for dirname in data_file_directories:
        result &= check_directory(dirname, data_file_extensions, catalog_checks=True)

    result &= check_directory("po", [".po", ".pot"], catalog_checks=False)

    return result


def main() -> int:
    """Run all checks from the parent of this script's directory.

    Returns the process exit status: 0 on success, 1 on any failure.
    """
    os.chdir(os.path.join(os.path.dirname(__file__), ".."))
    if not check_files():
        print("Unicode errors detected")
        return 1
    print("Unicode ok")
    return 0


if __name__ == "__main__":
    sys.exit(main())