From 188e712b269ad41c60d9f7c258beca76527e6dae Mon Sep 17 00:00:00 2001
From: p-goulart
Date: Tue, 30 Apr 2024 18:50:46 +0200
Subject: [PATCH] Add basic file checks

---
 .github/workflows/build.yml | 12 ++++++++++++
 scripts/check_encoding.sh   | 36 ++++++++++++++++++++++++++++++++++++
 scripts/check_newlines.sh   | 33 +++++++++++++++++++++++++++++++++
 3 files changed, 81 insertions(+)
 create mode 100644 scripts/check_encoding.sh
 create mode 100644 scripts/check_newlines.sh

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 453e01a..6d8944e 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -13,8 +13,20 @@ on:
   workflow_dispatch: {}
 
 jobs:
+  check_files:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out code
+        uses: actions/checkout@v3
+
+      - name: Check Hunspell encoding
+        run: bash ./scripts/check_encoding.sh
+
+      - name: Check tagger file newlines
+        run: bash ./scripts/check_newlines.sh
   build:
     runs-on: ubuntu-latest
+    needs: check_files
     strategy:
       matrix:
         python-version: ["3.11"]
diff --git a/scripts/check_encoding.sh b/scripts/check_encoding.sh
new file mode 100644
--- /dev/null
+++ b/scripts/check_encoding.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+# Check that all Hunspell pt*.dic and pt*.aff files are encoded in ISO-8859-1.
+# Exits 0 when every file passes; exits 1 when a file has another encoding or
+# when no matching files were found (an empty scan must not count as a pass).
+
+HUNSPELL_DIR="./data/spelling-dict/hunspell"
+
+# Print one `file` report line per matching file, .dic reports before .aff.
+get_encoding() {
+  local ext
+  for ext in dic aff; do
+    find "${HUNSPELL_DIR}" -type f -name "pt*.${ext}" -exec file {} \;
+  done
+}
+
+FILE_ENCODINGS=$(get_encoding)
+
+# Print the report lines that do not describe the file as ISO-8859 text.
+check_encoding() {
+  echo "${FILE_ENCODINGS}" | grep -v "ISO-8859 text"
+}
+
+if [[ -z "${FILE_ENCODINGS}" ]]; then
+  # Nothing was reported, so nothing was checked: fail instead of passing.
+  echo "No .dic or .aff files found in ${HUNSPELL_DIR}, nothing to check." >&2
+  exit 1
+fi
+
+if [[ -z $(check_encoding) ]]; then
+  echo "All .dic and .aff files are encoded in ISO-8859-1, we're good!"
+  exit 0
+else
+  echo "Some .dic and .aff files are not encoded in ISO-8859-1, please fix this."
+  echo "${FILE_ENCODINGS}"
+  exit 1
+fi
diff --git a/scripts/check_newlines.sh b/scripts/check_newlines.sh
new file mode 100644
--- /dev/null
+++ b/scripts/check_newlines.sh
@@ -0,0 +1,33 @@
+#!/usr/bin/env bash
+
+# Check for .txt files under ./data/src-dict that do not end with a newline.
+# Exits 0 when every file ends with a newline; exits 1 when at least one file
+# does not, or when the directory could not be scanned.
+
+SRC_DICT_DIR="./data/src-dict"
+
+# Print the path of each .txt file whose last byte is not a newline. Empty
+# files are printed too, since they have no final newline either.
+check_newlines() {
+  # `read` only succeeds when the single byte from `tail -c1` is a newline.
+  # Using `find -exec` instead of xargs avoids running the check once with no
+  # file at all, as GNU xargs does when its input is empty.
+  find "${SRC_DICT_DIR}" -name "*.txt" -type f \
+    -exec bash -c 'tail -c1 "$1" | read -r _ || echo "$1"' bash {} \;
+}
+
+# find exits non-zero when the directory is missing or unreadable; treat that
+# as a failure rather than as "no offending files".
+if ! NO_NEWLINE_FILES=$(check_newlines); then
+  echo "Could not scan ${SRC_DICT_DIR} for .txt files." >&2
+  exit 1
+fi
+
+if [[ -z "${NO_NEWLINE_FILES}" ]]; then
+  echo "All files end with a newline, which is good."
+  exit 0
+else
+  echo "Some files do not end with a newline:"
+  echo "${NO_NEWLINE_FILES}"
+  exit 1
+fi