Merge pull request #396 from DeleMike/fix/adjust-check-query-workflow
Complete workflow to check SPARQL queries
andrewtavis authored Oct 19, 2024
2 parents 8321dc3 + a975a6b commit 9d5c37c
Showing 7 changed files with 117 additions and 57 deletions.
44 changes: 23 additions & 21 deletions .github/workflows/check_query_identifiers.yaml
@@ -22,24 +22,26 @@ jobs:
    name: Run Check Query Identifiers

    steps:
      - name: Checkout
        uses: actions/checkout@v3

      # - name: Set up Python ${{ matrix.python-version }}
      #   uses: actions/setup-python@v4
      #   with:
      #     python-version: ${{ matrix.python-version }}

      # - name: Install dependencies
      #   run: |
      #     python -m pip install --upgrade uv
      #     uv venv
      #     uv pip install -r requirements.txt

      # - name: Activate virtualenv
      #   run: |
      #     . .venv/bin/activate
      #     echo PATH=$PATH >> $GITHUB_ENV

      # - name: Run Python script
      #   run: python src/scribe_data/check/check_query_identifiers.py
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}

      - name: Add project root to PYTHONPATH
        run: echo "PYTHONPATH=$(pwd)/src" >> $GITHUB_ENV

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Run check_query_identifiers.py
        working-directory: ./src/scribe_data/check
        run: python check_query_identifiers.py

      - name: Post-run status
        if: failure()
        run: echo "Project SPARQL queries check failed. Please fix the reported errors."
50 changes: 33 additions & 17 deletions src/scribe_data/check/check_query_identifiers.py
@@ -25,6 +25,7 @@
"""

import re
import sys
from pathlib import Path

from scribe_data.cli.cli_utils import (
@@ -50,6 +51,11 @@ def extract_qid_from_sparql(file_path: Path, pattern: str) -> str:
    -------
    str
        The extracted QID if found, otherwise None.

    Raises
    ------
    FileNotFoundError
        If the specified file does not exist.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as file:
@@ -63,7 +69,7 @@ def extract_qid_from_sparql(file_path: Path, pattern: str) -> str:
    return None
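
For intuition, a hypothetical illustration of this pattern-based extraction (the sample line and regex here are assumptions for the example; the actual pattern is supplied by the caller):

import re

# A language declaration of the kind these SPARQL queries contain.
sample = "?lexeme dct:language wd:Q188 ;"

# Extract the language QID from the line; prints "Q188".
match = re.search(r"dct:language wd:(Q\d+)", sample)
print(match.group(1) if match else None)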


def check_queries():
def check_queries() -> None:
"""
Validates SPARQL queries in the specified directory to check for correct language
and data type QIDs.
@@ -92,14 +98,14 @@ def check_queries():
        for file in incorrect_languages:
            print(f"- {file}")

    print("\n----------------------------------------------------------------\n")

    if incorrect_data_types:
        print("Incorrect Data Type QIDs found in the following files:")
        for file in incorrect_data_types:
            print(f"- {file}")

    print("\n----------------------------------------------------------------\n")
    # Exit with an error code if any incorrect QIDs are found.
    if incorrect_languages or incorrect_data_types:
        sys.exit(1)
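
The new sys.exit(1) is what surfaces failures in CI: GitHub Actions marks a step as failed when its process exits non-zero, which in turn triggers the failure-only "Post-run status" step in the workflow above. A minimal sketch of that contract:

import subprocess
import sys

# Run a child process that fails the same way check_queries() now does.
proc = subprocess.run([sys.executable, "-c", "import sys; sys.exit(1)"])
print(proc.returncode)  # 1 -- any non-zero exit code fails the CI step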


def is_valid_language(query_file: Path, lang_qid: str) -> bool:
@@ -117,24 +123,30 @@ def is_valid_language(query_file: Path, lang_qid: str) -> bool:
    -------
    bool
        True if the language QID is valid, otherwise False.

    Example
    -------
    >>> is_valid_language(Path("path/to/query.sparql"), "Q123456")
    True
    """
    lang_directory_name = query_file.parent.parent.name.lower()
    languages = language_metadata.get(
        "languages"
    )  # might not work since language_metadata file is not fully updated
    language_entry = next(
        (lang for lang in languages if lang["language"] == lang_directory_name), None
    )
    language_entry = language_metadata.get(lang_directory_name)

    if not language_entry:
        # Look for sub-languages
        for lang, details in language_metadata.items():
            if "sub_languages" in details:
                sub_language_entry = details["sub_languages"].get(lang_directory_name)
                if sub_language_entry:
                    language_entry = sub_language_entry
                    break

    if not language_entry:
        return False

    expected_language_qid = language_entry["qid"]

    if lang_qid != expected_language_qid:
        return False

    return True
    return lang_qid == expected_language_qid
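
The two-step lookup above assumes a metadata shape roughly like the following (illustrative entries only; the real data lives in src/scribe_data/resources/language_metadata.json):

# Top-level languages keyed by name; sub-languages nested under "sub_languages".
language_metadata = {
    "english": {"iso": "en", "qid": "Q1860"},
    "norwegian": {
        "sub_languages": {
            "bokmål": {"iso": "nb", "qid": "Q25167"},
            "nynorsk": {"iso": "nn", "qid": "Q25164"},
        }
    },
}

# Same logic as is_valid_language: direct key first, then scan sub-languages.
lang_directory_name = "nynorsk"
entry = language_metadata.get(lang_directory_name)
if not entry:
    for lang, details in language_metadata.items():
        if "sub_languages" in details:
            entry = details["sub_languages"].get(lang_directory_name)
            if entry:
                break
print(entry["qid"] if entry else None)  # prints "Q25164"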


def is_valid_data_type(query_file: Path, data_type_qid: str) -> bool:
@@ -152,13 +164,17 @@ def is_valid_data_type(query_file: Path, data_type_qid: str) -> bool:
    -------
    bool
        True if the data type QID is valid, otherwise False.

    Example
    -------
    >>> is_valid_data_type(Path("path/to/query.sparql"), "Q654321")
    True
    """
    directory_name = query_file.parent.name  # e.g., "nouns" or "verbs"
    expected_data_type_qid = data_type_metadata.get(directory_name)

    return data_type_qid == expected_data_type_qid
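
Here the expected QID comes straight from the directory the query sits in. A small sketch of that lookup (the mapping values are illustrative assumptions; the real one is scribe_data's data_type_metadata resource):

from pathlib import Path

# Illustrative subset of the data type mapping.
data_type_metadata = {"nouns": "Q1084", "verbs": "Q24905"}

query_file = Path("language_data_extraction/english/nouns/query_nouns.sparql")
directory_name = query_file.parent.name  # "nouns"
print(data_type_metadata.get(directory_name))  # prints "Q1084"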


# Run the check_queries function
# MARK: TODO: Remove Call
# check_queries()
if __name__ == "__main__":
    check_queries()
1 change: 0 additions & 1 deletion src/scribe_data/cli/cli_utils.py
@@ -54,7 +54,6 @@
except (IOError, json.JSONDecodeError) as e:
    print(f"Error reading data type metadata: {e}")


language_map = {}
language_to_qid = {}

36 changes: 18 additions & 18 deletions src/scribe_data/cli/list.py
@@ -31,6 +31,7 @@
    get_language_iso,
    get_language_qid,
    list_all_languages,
    list_languages_with_metadata_for_data_type,
)


@@ -132,28 +133,27 @@ def list_languages_for_data_type(data_type: str) -> None:
        The data type to check for.
    """
    data_type = correct_data_type(data_type=data_type)
    all_languages = list_all_languages(language_metadata)
    available_languages = []
    for lang in all_languages:
        lang = format_sublanguage_name(lang, language_metadata)
        language_dir = LANGUAGE_DATA_EXTRACTION_DIR / lang
        if language_dir.is_dir():
            dt_path = language_dir / data_type
            if dt_path.exists():
                available_languages.append(lang)

    available_languages.sort()
    table_header = f"Available languages: {data_type}"
    table_line_length = max(
        len(table_header), max(len(lang) for lang in available_languages)
    )
    all_languages = list_languages_with_metadata_for_data_type(language_metadata)

    # Set column widths for consistent formatting.
    language_col_width = max(len(lang["name"]) for lang in all_languages) + 2
    iso_col_width = max(len(lang["iso"]) for lang in all_languages) + 2
    qid_col_width = max(len(lang["qid"]) for lang in all_languages) + 2

    table_line_length = language_col_width + iso_col_width + qid_col_width

    # Print table header.
    print()
    print(table_header)
    print(
        f"{'Language':<{language_col_width}} {'ISO':<{iso_col_width}} {'QID':<{qid_col_width}}"
    )
    print("-" * table_line_length)

    for lang in available_languages:
        print(f"{lang}")
    # Iterate through the list of languages and format each row.
    for lang in all_languages:
        print(
            f"{lang['name'].capitalize():<{language_col_width}} {lang['iso']:<{iso_col_width}} {lang['qid']:<{qid_col_width}}"
        )

    print("-" * table_line_length)
    print()
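
The width logic pads each column to its longest cell plus two spaces. A self-contained demo with made-up rows (the names and QIDs are placeholders, not the real metadata):

rows = [
    {"name": "english", "iso": "en", "qid": "Q1860"},
    {"name": "norwegian/nynorsk", "iso": "nn", "qid": "Q25164"},
]
language_col_width = max(len(r["name"]) for r in rows) + 2
iso_col_width = max(len(r["iso"]) for r in rows) + 2
qid_col_width = max(len(r["qid"]) for r in rows) + 2

print(f"{'Language':<{language_col_width}} {'ISO':<{iso_col_width}} {'QID':<{qid_col_width}}")
print("-" * (language_col_width + iso_col_width + qid_col_width))
for r in rows:
    print(
        f"{r['name'].capitalize():<{language_col_width}} {r['iso']:<{iso_col_width}} {r['qid']:<{qid_col_width}}"
    )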
8 changes: 8 additions & 0 deletions src/scribe_data/resources/language_metadata.json
@@ -95,6 +95,10 @@
"iso": "ja",
"qid": "Q5287"
},
"korean": {
"iso": "ko",
"qid": "Q9176"
},
"kurmanji": {
"iso": "kmr",
"qid": "Q36163"
@@ -103,6 +107,10 @@
"iso": "la",
"qid": "Q397"
},
"latvian": {
"iso": "lv",
"qid": "Q9078"
},
"malay": {
"iso": "ms",
"qid": "Q9237"
33 changes: 33 additions & 0 deletions src/scribe_data/utils.py
@@ -546,3 +546,36 @@ def list_all_languages(language_metadata=_languages):
            current_languages.append(lang_key)

    return sorted(current_languages)


def list_languages_with_metadata_for_data_type(language_metadata=_languages):
    """
    Returns a sorted list of languages and their metadata (name, iso, qid) for a specific data type.
    The list includes sub-languages where applicable.
    """
    current_languages = []

    # Iterate through the language metadata.
    for lang_key, lang_data in language_metadata.items():
        # Check if there are sub-languages.
        if "sub_languages" in lang_data:
            # Add the sub-languages to current_languages with metadata.
            for sub_key, sub_data in lang_data["sub_languages"].items():
                current_languages.append(
                    {
                        "name": f"{lang_data.get('name', lang_key)}/{sub_data.get('name', sub_key)}",
                        "iso": sub_data.get("iso", ""),
                        "qid": sub_data.get("qid", ""),
                    }
                )
        else:
            # If no sub-languages, add the main language with metadata.
            current_languages.append(
                {
                    "name": lang_data.get("name", lang_key),
                    "iso": lang_data.get("iso", ""),
                    "qid": lang_data.get("qid", ""),
                }
            )

    return sorted(current_languages, key=lambda x: x["name"])
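
A quick usage sketch of the new helper (output depends on the loaded metadata; the rows in the comment are illustrative):

# Prints one row per language or sub-language, sorted by name, e.g.:
#   english en Q1860
#   norwegian/nynorsk nn Q25164
for lang in list_languages_with_metadata_for_data_type():
    print(lang["name"], lang["iso"], lang["qid"])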
2 changes: 2 additions & 0 deletions tests/load/test_update_utils.py
@@ -154,8 +154,10 @@ def test_list_all_languages():
"indonesian",
"italian",
"japanese",
"korean",
"kurmanji",
"latin",
"latvian",
"malay",
"malayalam",
"mandarin",
