Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/main' into simplify-cli-tests-…
Browse files Browse the repository at this point in the history
…output-check
  • Loading branch information
OmarAI2003 committed Oct 20, 2024
2 parents c44f4ae + ee2ec76 commit 1bf9be4
Show file tree
Hide file tree
Showing 28 changed files with 802 additions and 130 deletions.
23 changes: 22 additions & 1 deletion .github/workflows/check_project_structure.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,33 @@ on:

jobs:
structure-check:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
os:
- ubuntu-latest
python-version:
- "3.9"

runs-on: ${{ matrix.os }}

steps:
- name: Checkout repository
uses: actions/checkout@v4

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}

- name: Add project root to PYTHONPATH
run: echo "PYTHONPATH=$(pwd)/src" >> $GITHUB_ENV

- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
- name: Run check_project_structure.py
working-directory: ./src/scribe_data/check
run: python check_project_structure.py
Expand Down
3 changes: 1 addition & 2 deletions .github/workflows/check_query_identifiers.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@ on:
push:
branches: [main]
pull_request:
branches:
- main
branches: [main]
types: [opened, reopened, synchronize]

jobs:
Expand Down
100 changes: 21 additions & 79 deletions src/scribe_data/check/check_project_structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,83 +25,23 @@
"""

import os
from pathlib import Path

# Expected languages and data types.
LANGUAGES = {
"Arabic",
"English",
"Greek",
"Italian",
"Malayalam",
"Russian",
"Tamil",
"Basque",
"Esperanto",
"Hausa",
"Japanese",
"Norwegian",
"Slovak",
"Dagbani",
"Ukrainian",
"Bengali",
"Estonian",
"Hebrew",
"Korean",
"Pidgin",
"Spanish",
"Yoruba",
"Chinese",
"Finnish",
"Hindustani",
"Kurmanji",
"Polish",
"Swahili",
"Czech",
"French",
"Indonesian",
"Latin",
"Latvian",
"Portuguese",
"Swedish",
"Danish",
"German",
"Malay",
"Punjabi",
"Tajik",
"Igbo",
}
from scribe_data.cli.cli_utils import (
LANGUAGE_DATA_EXTRACTION_DIR,
data_type_metadata,
language_metadata,
)

DATA_TYPES = {
"adjectives",
"adverbs",
"articles",
"autosuggestions",
"conjunctions",
"emoji_keywords",
"nouns",
"personal_pronouns",
"postpositions",
"prepositions",
"pronouns",
"proper_nouns",
"verbs",
}

# Sub-subdirectories expected for specific languages.
# Expected languages and data types.
LANGUAGES = [lang.capitalize() for lang in language_metadata.keys()]
DATA_TYPES = data_type_metadata.keys()
SUB_DIRECTORIES = {
"Chinese": ["Mandarin"],
"Hindustani": ["Urdu", "Hindi"],
"Norwegian": ["Nynorsk", "Bokmål"],
"Pidgin": ["Nigerian"],
"Punjabi": ["Shahmukhi", "Gurmukhi"],
k.capitalize(): [lang.capitalize() for lang in v["sub_languages"].keys()]
for k, v in language_metadata.items()
if len(v.keys()) == 1 and "sub_languages" in v.keys()
}


# Base directory path.
BASE_DIR = Path(__file__).parent.parent / "language_data_extraction"


def check_for_sparql_files(folder_path, data_type, language, subdir, missing_queries):
"""
Check if a data-type folder contains at least one .sparql file.
Expand Down Expand Up @@ -215,19 +155,21 @@ def validate_project_structure():
missing_folders = []
missing_queries = []

if not os.path.exists(BASE_DIR):
print(f"Error: Base directory '{BASE_DIR}' does not exist.")
if not os.path.exists(LANGUAGE_DATA_EXTRACTION_DIR):
print(f"Error: Base directory '{LANGUAGE_DATA_EXTRACTION_DIR}' does not exist.")
exit(1)

# Check for unexpected files in BASE_DIR.
for item in os.listdir(BASE_DIR):
item_path = os.path.join(BASE_DIR, item)
# Check for unexpected files in LANGUAGE_DATA_EXTRACTION_DIR.
for item in os.listdir(LANGUAGE_DATA_EXTRACTION_DIR):
item_path = os.path.join(LANGUAGE_DATA_EXTRACTION_DIR, item)
if os.path.isfile(item_path) and item != "__init__.py":
errors.append(f"Unexpected file found in BASE_DIR: {item}")
errors.append(
f"Unexpected file found in the 'language_data_extraction' files: {item}"
)

# Iterate through the language directories.
for language in os.listdir(BASE_DIR):
language_path = os.path.join(BASE_DIR, language)
for language in os.listdir(LANGUAGE_DATA_EXTRACTION_DIR):
language_path = os.path.join(LANGUAGE_DATA_EXTRACTION_DIR, language)

if not os.path.isdir(language_path) or language == "__init__.py":
continue
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,5 @@ SELECT
WHERE {
?lexeme dct:language wd:Q8752 ;
wikibase:lexicalCategory wd:Q34698 ;
wikibase:lemma ?lemma .

SERVICE wikibase:label {
bd:serviceParam wikibase:language "[AUTO_LANGUAGE]".
?lemma rdfs:label ?adjective .
}
wikibase:lemma ?adjective .
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,5 @@ SELECT
WHERE {
?lexeme dct:language wd:Q9610 ;
wikibase:lexicalCategory wd:Q34698 ;
wikibase:lemma ?lemma .

SERVICE wikibase:label {
bd:serviceParam wikibase:language "[AUTO_LANGUAGE]".
?lemma rdfs:label ?adjective .
}
wikibase:lemma ?adjective .
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,5 @@ SELECT
WHERE {
?lexeme dct:language wd:Q1860 ;
wikibase:lexicalCategory wd:Q34698 ;
wikibase:lemma ?lemma .

SERVICE wikibase:label {
bd:serviceParam wikibase:language "[AUTO_LANGUAGE]".
?lemma rdfs:label ?adjective .
}
wikibase:lemma ?adjective .
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# tool: scribe-data
# All Estonian (Q9072) adverbs (Q380057) and the given forms.
# Enter this query at https://query.wikidata.org/
# Enter this query at https://query.wikidata.org/.

SELECT
(REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# tool: scribe-data
# All Estonian (Q9072) adverbs (Q380057) and the given forms.
# Enter this query at https://query.wikidata.org/
# Enter this query at https://query.wikidata.org/.

SELECT
(REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,9 @@
SELECT
(REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
?preposition
?case

WHERE {
?lexeme dct:language wd:Q9072 ;
wikibase:lexicalCategory wd:Q4833830 ;
wikibase:lemma ?preposition .

# MARK: Corresponding Case

OPTIONAL {
?lexeme wdt:P5713 ?caseForm .
}

SERVICE wikibase:label {
bd:serviceParam wikibase:language "[AUTO_LANGUAGE]".
?caseForm rdfs:label ?case .
}
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# tool: scribe-data
# All Estonian (Q9072) verbs (Q24905) and the given forms.
# Enter this query at https://query.wikidata.org/
# Enter this query at https://query.wikidata.org/.

SELECT
(REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# tool: scribe-data
# All Igbo (Q33578) adjectives (Q34698) and the given forms.
# Enter this query at https://query.wikidata.org/.

SELECT
(REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
?adjective
?singular
?plural

WHERE {
?lexeme dct:language wd:Q33578;
wikibase:lexicalCategory wd:Q34698;
wikibase:lemma ?adjective .

# MARK: Singular

OPTIONAL {
?lexeme ontolex:lexicalForm ?singularForm .
?singularForm ontolex:representation ?singular ;
wikibase:grammaticalFeature wd:Q110786 .
}

# MARK: Plural

OPTIONAL {
?lexeme ontolex:lexicalForm ?pluralForm .
?pluralForm ontolex:representation ?plural ;
wikibase:grammaticalFeature wd:Q146786 .
}
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# tool: scribe-data
# All Igbo (Q33578) adverbs and the given forms.
# All Igbo (Q33578) adverbs (Q380057) and the given forms.
# Enter this query at https://query.wikidata.org/.

SELECT
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# tool: scribe-data
# All Igbo (Q33578) nouns (Q1084) and the given forms.
# Enter this query at https://query.wikidata.org/.

SELECT
(REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
?noun

WHERE {
?lexeme dct:language wd:Q33578 ;
wikibase:lexicalCategory wd:Q1084 ;
wikibase:lemma ?noun .
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# tool: scribe-data
# All Igbo (Q33578) prepositions (Q4833830) and the given forms.
# Enter this query at https://query.wikidata.org/.

SELECT
(REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
?preposition
?contraction

WHERE {
?lexeme dct:language wd:Q33578 ;
wikibase:lexicalCategory wd:Q4833830 ;
wikibase:lemma ?preposition .

# MARK: Contraction

OPTIONAL {
?lexeme ontolex:lexicalForm ?contractionForm .
?contractionForm ontolex:representation ?contraction ;
wikibase:grammaticalFeature wd:Q126473 .
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# tool: scribe-data
# All Korean (Q9176) adjectives (Q34698) and the given forms.
# Enter this query at https://query.wikidata.org/.

SELECT
(REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
?adjective

WHERE {
?lexeme dct:language wd:Q9176 ;
wikibase:lexicalCategory wd:Q34698 ;
wikibase:lemma ?adjective .
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# tool: scribe-data
# All Latin (Q397) adverbs (Q380057) and the given forms.
# Enter this query at https://query.wikidata.org/.

SELECT
(REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
?adverb
?comparative
?superlative

WHERE {
?lexeme dct:language wd:Q397 ;
wikibase:lexicalCategory wd:Q380057 ;
wikibase:lemma ?adverb .

# MARK: Comparative

OPTIONAL {
?lexeme ontolex:lexicalForm ?comparativeForm .
?comparativeForm ontolex:representation ?comparative ;
wikibase:grammaticalFeature wd:Q14169499 .
}

# MARK: Superlative

OPTIONAL {
?lexeme ontolex:lexicalForm ?superlativeForm .
?superlativeForm ontolex:representation ?superlative ;
wikibase:grammaticalFeature wd:Q1817208 .
}
}
Loading

0 comments on commit 1bf9be4

Please sign in to comment.