Merge pull request #396 from DeleMike/fix/adjust-check-query-workflow
Complete workflow to check SPARQL queries
andrewtavis authored Oct 19, 2024
2 parents 8321dc3 + a975a6b commit 9d5c37c
Showing 7 changed files with 117 additions and 57 deletions.
44 changes: 23 additions & 21 deletions .github/workflows/check_query_identifiers.yaml
@@ -22,24 +22,26 @@ jobs:
    name: Run Check Query Identifiers

    steps:
      - name: Checkout
        uses: actions/checkout@v3

      # - name: Set up Python ${{ matrix.python-version }}
      #   uses: actions/setup-python@v4
      #   with:
      #     python-version: ${{ matrix.python-version }}

      # - name: Install dependencies
      #   run: |
      #     python -m pip install --upgrade uv
      #     uv venv
      #     uv pip install -r requirements.txt

      # - name: Activate virtualenv
      #   run: |
      #     . .venv/bin/activate
      #     echo PATH=$PATH >> $GITHUB_ENV

      # - name: Run Python script
      #   run: python src/scribe_data/check/check_query_identifiers.py
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}

      - name: Add project root to PYTHONPATH
        run: echo "PYTHONPATH=$(pwd)/src" >> $GITHUB_ENV

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Run check_query_identifiers.py
        working-directory: ./src/scribe_data/check
        run: python check_query_identifiers.py

      - name: Post-run status
        if: failure()
        run: echo "Project SPARQL queries check failed. Please fix the reported errors."
50 changes: 33 additions & 17 deletions src/scribe_data/check/check_query_identifiers.py
@@ -25,6 +25,7 @@
"""

import re
import sys
from pathlib import Path

from scribe_data.cli.cli_utils import (
@@ -50,6 +51,11 @@ def extract_qid_from_sparql(file_path: Path, pattern: str) -> str:
    -------
    str
        The extracted QID if found, otherwise None.

    Raises
    ------
    FileNotFoundError
        If the specified file does not exist.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as file:
@@ -63,7 +69,7 @@ def extract_qid_from_sparql(file_path: Path, pattern: str) -> str:
    return None
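
For intuition, a hypothetical illustration of this pattern-based extraction (the sample line and regex here are assumptions for the example; the actual pattern is supplied by the caller):

import re

# A language declaration of the kind these SPARQL queries contain.
sample = "?lexeme dct:language wd:Q188 ;"

# Extract the language QID from the line; prints "Q188".
match = re.search(r"dct:language wd:(Q\d+)", sample)
print(match.group(1) if match else None)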


def check_queries():
def check_queries() -> None:
"""
Validates SPARQL queries in the specified directory to check for correct language
and data type QIDs.
@@ -92,14 +98,14 @@ def check_queries():
        for file in incorrect_languages:
            print(f"- {file}")

    print("\n----------------------------------------------------------------\n")

    if incorrect_data_types:
        print("Incorrect Data Type QIDs found in the following files:")
        for file in incorrect_data_types:
            print(f"- {file}")

    print("\n----------------------------------------------------------------\n")
    # Exit with an error code if any incorrect QIDs are found.
    if incorrect_languages or incorrect_data_types:
        sys.exit(1)
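
The new sys.exit(1) is what surfaces failures in CI: GitHub Actions marks a step as failed when its process exits non-zero, which in turn triggers the failure-only "Post-run status" step in the workflow above. A minimal sketch of that contract:

import subprocess
import sys

# Run a child process that fails the same way check_queries() now does.
proc = subprocess.run([sys.executable, "-c", "import sys; sys.exit(1)"])
print(proc.returncode)  # 1 -- any non-zero exit code fails the CI step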


def is_valid_language(query_file: Path, lang_qid: str) -> bool:
@@ -117,24 +123,30 @@ def is_valid_language(query_file: Path, lang_qid: str) -> bool:
    -------
    bool
        True if the language QID is valid, otherwise False.

    Example
    -------
    >>> is_valid_language(Path("path/to/query.sparql"), "Q123456")
    True
    """
    lang_directory_name = query_file.parent.parent.name.lower()
    languages = language_metadata.get(
        "languages"
    )  # might not work since language_metadata file is not fully updated
    language_entry = next(
        (lang for lang in languages if lang["language"] == lang_directory_name), None
    )
    language_entry = language_metadata.get(lang_directory_name)

    if not language_entry:
        # Look for sub-languages
        for lang, details in language_metadata.items():
            if "sub_languages" in details:
                sub_language_entry = details["sub_languages"].get(lang_directory_name)
                if sub_language_entry:
                    language_entry = sub_language_entry
                    break

    if not language_entry:
        return False

    expected_language_qid = language_entry["qid"]

    if lang_qid != expected_language_qid:
        return False

    return True
    return lang_qid == expected_language_qid
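
The two-step lookup above assumes a metadata shape roughly like the following (illustrative entries only; the real data lives in src/scribe_data/resources/language_metadata.json):

# Top-level languages keyed by name; sub-languages nested under "sub_languages".
language_metadata = {
    "english": {"iso": "en", "qid": "Q1860"},
    "norwegian": {
        "sub_languages": {
            "bokmål": {"iso": "nb", "qid": "Q25167"},
            "nynorsk": {"iso": "nn", "qid": "Q25164"},
        }
    },
}

# Same logic as is_valid_language: direct key first, then scan sub-languages.
lang_directory_name = "nynorsk"
entry = language_metadata.get(lang_directory_name)
if not entry:
    for lang, details in language_metadata.items():
        if "sub_languages" in details:
            entry = details["sub_languages"].get(lang_directory_name)
            if entry:
                break
print(entry["qid"] if entry else None)  # prints "Q25164"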


def is_valid_data_type(query_file: Path, data_type_qid: str) -> bool:
@@ -152,13 +164,17 @@ def is_valid_data_type(query_file: Path, data_type_qid: str) -> bool:
    -------
    bool
        True if the data type QID is valid, otherwise False.

    Example
    -------
    >>> is_valid_data_type(Path("path/to/query.sparql"), "Q654321")
    True
    """
    directory_name = query_file.parent.name  # e.g., "nouns" or "verbs"
    expected_data_type_qid = data_type_metadata.get(directory_name)

    return data_type_qid == expected_data_type_qid
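
Here the expected QID comes straight from the directory the query sits in. A small sketch of that lookup (the mapping values are illustrative assumptions; the real one is scribe_data's data_type_metadata resource):

from pathlib import Path

# Illustrative subset of the data type mapping.
data_type_metadata = {"nouns": "Q1084", "verbs": "Q24905"}

query_file = Path("language_data_extraction/english/nouns/query_nouns.sparql")
directory_name = query_file.parent.name  # "nouns"
print(data_type_metadata.get(directory_name))  # prints "Q1084"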


# Run the check_queries function
# MARK: TODO: Remove Call
# check_queries()
if __name__ == "__main__":
    check_queries()
1 change: 0 additions & 1 deletion src/scribe_data/cli/cli_utils.py
@@ -54,7 +54,6 @@
except (IOError, json.JSONDecodeError) as e:
    print(f"Error reading data type metadata: {e}")


language_map = {}
language_to_qid = {}

36 changes: 18 additions & 18 deletions src/scribe_data/cli/list.py
@@ -31,6 +31,7 @@
    get_language_iso,
    get_language_qid,
    list_all_languages,
    list_languages_with_metadata_for_data_type,
)


@@ -132,28 +133,27 @@ def list_languages_for_data_type(data_type: str) -> None:
        The data type to check for.
    """
    data_type = correct_data_type(data_type=data_type)
    all_languages = list_all_languages(language_metadata)
    available_languages = []
    for lang in all_languages:
        lang = format_sublanguage_name(lang, language_metadata)
        language_dir = LANGUAGE_DATA_EXTRACTION_DIR / lang
        if language_dir.is_dir():
            dt_path = language_dir / data_type
            if dt_path.exists():
                available_languages.append(lang)

    available_languages.sort()
    table_header = f"Available languages: {data_type}"
    table_line_length = max(
        len(table_header), max(len(lang) for lang in available_languages)
    )
    all_languages = list_languages_with_metadata_for_data_type(language_metadata)

    # Set column widths for consistent formatting.
    language_col_width = max(len(lang["name"]) for lang in all_languages) + 2
    iso_col_width = max(len(lang["iso"]) for lang in all_languages) + 2
    qid_col_width = max(len(lang["qid"]) for lang in all_languages) + 2

    table_line_length = language_col_width + iso_col_width + qid_col_width

    # Print table header.
    print()
    print(table_header)
    print(
        f"{'Language':<{language_col_width}} {'ISO':<{iso_col_width}} {'QID':<{qid_col_width}}"
    )
    print("-" * table_line_length)

    for lang in available_languages:
        print(f"{lang}")
    # Iterate through the list of languages and format each row.
    for lang in all_languages:
        print(
            f"{lang['name'].capitalize():<{language_col_width}} {lang['iso']:<{iso_col_width}} {lang['qid']:<{qid_col_width}}"
        )

    print("-" * table_line_length)
    print()
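
The width logic pads each column to its longest cell plus two spaces. A self-contained demo with made-up rows (the names and QIDs are placeholders, not the real metadata):

rows = [
    {"name": "english", "iso": "en", "qid": "Q1860"},
    {"name": "norwegian/nynorsk", "iso": "nn", "qid": "Q25164"},
]
language_col_width = max(len(r["name"]) for r in rows) + 2
iso_col_width = max(len(r["iso"]) for r in rows) + 2
qid_col_width = max(len(r["qid"]) for r in rows) + 2

print(f"{'Language':<{language_col_width}} {'ISO':<{iso_col_width}} {'QID':<{qid_col_width}}")
print("-" * (language_col_width + iso_col_width + qid_col_width))
for r in rows:
    print(
        f"{r['name'].capitalize():<{language_col_width}} {r['iso']:<{iso_col_width}} {r['qid']:<{qid_col_width}}"
    )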
8 changes: 8 additions & 0 deletions src/scribe_data/resources/language_metadata.json
@@ -95,6 +95,10 @@
"iso": "ja",
"qid": "Q5287"
},
"korean": {
"iso": "ko",
"qid": "Q9176"
},
"kurmanji": {
"iso": "kmr",
"qid": "Q36163"
@@ -103,6 +107,10 @@
"iso": "la",
"qid": "Q397"
},
"latvian": {
"iso": "lv",
"qid": "Q9078"
},
"malay": {
"iso": "ms",
"qid": "Q9237"
33 changes: 33 additions & 0 deletions src/scribe_data/utils.py
@@ -546,3 +546,36 @@ def list_all_languages(language_metadata=_languages):
            current_languages.append(lang_key)

    return sorted(current_languages)


def list_languages_with_metadata_for_data_type(language_metadata=_languages):
    """
    Returns a sorted list of languages and their metadata (name, iso, qid) for a specific data type.
    The list includes sub-languages where applicable.
    """
    current_languages = []

    # Iterate through the language metadata.
    for lang_key, lang_data in language_metadata.items():
        # Check if there are sub-languages.
        if "sub_languages" in lang_data:
            # Add the sub-languages to current_languages with metadata.
            for sub_key, sub_data in lang_data["sub_languages"].items():
                current_languages.append(
                    {
                        "name": f"{lang_data.get('name', lang_key)}/{sub_data.get('name', sub_key)}",
                        "iso": sub_data.get("iso", ""),
                        "qid": sub_data.get("qid", ""),
                    }
                )
        else:
            # If no sub-languages, add the main language with metadata.
            current_languages.append(
                {
                    "name": lang_data.get("name", lang_key),
                    "iso": lang_data.get("iso", ""),
                    "qid": lang_data.get("qid", ""),
                }
            )

    return sorted(current_languages, key=lambda x: x["name"])
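
A quick usage sketch of the new helper (output depends on the loaded metadata; the rows in the comment are illustrative):

# Prints one row per language or sub-language, sorted by name, e.g.:
#   english en Q1860
#   norwegian/nynorsk nn Q25164
for lang in list_languages_with_metadata_for_data_type():
    print(lang["name"], lang["iso"], lang["qid"])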
2 changes: 2 additions & 0 deletions tests/load/test_update_utils.py
@@ -154,8 +154,10 @@ def test_list_all_languages():
"indonesian",
"italian",
"japanese",
"korean",
"kurmanji",
"latin",
"latvian",
"malay",
"malayalam",
"mandarin",
