Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(queries): Extend SPARQL query to extract additional Latin verb forms (issue #444) #479

Closed
wants to merge 9 commits into from
112 changes: 105 additions & 7 deletions src/scribe_data/cli/get.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
-->
"""

import json
import subprocess
from pathlib import Path

Expand All @@ -30,7 +31,67 @@
DEFAULT_TSV_EXPORT_DIR,
)
from scribe_data.wikidata.query_data import query_data
from scribe_data.wikipedia.process_wiki import gen_autosuggestions
from scribe_data.wikidata.wikidata_utils import sparql

def load_lexeme_metadata():
    """
    Load the lexeme form metadata from the JSON file.

    The metadata file is expected to live next to this module as
    ``lexeme_form.metadata.json``.

    Returns
    -------
    dict
        The parsed metadata, or an empty dict if the file is missing
        or contains invalid JSON.
    """
    metadata_path = Path(__file__).parent / "lexeme_form.metadata.json"
    try:
        with open(metadata_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"Warning: Could not find lexeme metadata file at {metadata_path}")
        return {}
    except json.JSONDecodeError as e:
        # A corrupt metadata file should degrade gracefully, not crash the CLI.
        print(f"Warning: Invalid JSON in lexeme metadata file at {metadata_path}: {e}")
        return {}

def load_text_corpus(language):
    """
    Load the text corpus for a given language from Wikidata lexeme forms.

    Parameters
    ----------
    language : str
        The language code (e.g. "ha") to load the corpus for.

    Returns
    -------
    list
        The lexeme form representations found for the language, or an
        empty list if the code is invalid or the query fails.
    """
    import re

    lang_code = language.lower()

    # Language tags are letters and hyphens only (e.g. "ha", "ha-arabic").
    # Rejecting anything else prevents malformed or malicious strings from
    # being interpolated into the SPARQL query below.
    if not re.fullmatch(r"[a-z-]+", lang_code):
        print(f"Error loading corpus for {language}: invalid language code")
        return []

    # Create SPARQL query to get relevant lexemes for the language.
    query = """
    SELECT DISTINCT ?lexeme ?form ?representation WHERE {
        ?lexeme dct:language ?language .
        ?lexeme ontolex:lexicalForm ?form .
        ?form ontolex:representation ?representation .

        # Filter for specific language
        FILTER(LANG(?representation) = "%s")
    }
    LIMIT 10000
    """ % lang_code

    sparql.setQuery(query)

    try:
        results = sparql.query().convert()

        # Keep only the string value of each form representation.
        return [
            result["representation"]["value"]
            for result in results["results"]["bindings"]
        ]

    except Exception as e:
        # Best-effort: a failed Wikidata query should not abort the CLI run.
        print(f"Error loading corpus for {language}: {str(e)}")
        return []

def get_data(
language: str = None,
Expand Down Expand Up @@ -76,7 +137,6 @@ def get_data(
The requested data saved locally given file type and location arguments.
"""
# MARK: Defaults

output_type = output_type or "json"
if output_dir is None:
if output_type == "csv":
Expand All @@ -89,18 +149,18 @@ def get_data(
output_dir = DEFAULT_TSV_EXPORT_DIR

languages = [language] if language else None

subprocess_result = False

# MARK: Get All
# Load lexeme metadata
lexeme_metadata = load_lexeme_metadata()

# MARK: Get All
if all:
print("Updating all languages and data types ...")
query_data(None, None, None, overwrite)
subprocess_result = True

# MARK: Emojis

elif data_type in {"emoji-keywords", "emoji_keywords"}:
for lang in languages:
emoji_keyword_extraction_script = (
Expand All @@ -115,11 +175,49 @@ def get_data(
["python", emoji_keyword_extraction_script]
)

# MARK: Query Data
# MARK: Autosuggestions
elif data_type in {"autosuggestions", "auto_suggestions"}:
for lang in languages:
print(f"Generating autosuggestions for {lang}...")

# Load text corpus with lexeme forms consideration
text_corpus = load_text_corpus(lang)

if text_corpus:
try:
# Generate autosuggestions using the loaded corpus
autosuggestions = gen_autosuggestions(
text_corpus,
language=lang,
num_words=500, # Default number of words
update_local_data=True,
verbose=interactive
)

# Save autosuggestions with lexeme metadata
output_path = Path(output_dir) / lang / "autosuggestions.json"
output_path.parent.mkdir(parents=True, exist_ok=True)

# Combine autosuggestions with lexeme metadata
output_data = {
"autosuggestions": autosuggestions,
"lexeme_metadata": lexeme_metadata
}

with open(output_path, 'w', encoding='utf-8') as f:
json.dump(output_data, f, ensure_ascii=False, indent=2)

subprocess_result = True
print(f"Autosuggestions for {lang} generated and saved to {output_path}")

except Exception as e:
print(f"Error generating autosuggestions for {lang}: {str(e)}")
else:
print(f"No corpus data found for {lang}")

# MARK: Query Data
elif language or data_type:
data_type = data_type[0] if isinstance(data_type, list) else data_type

data_type = [data_type] if data_type else None
print(
f"Updating data for language(s): {language}; data type(s): {', '.join(data_type)}"
Expand Down Expand Up @@ -155,4 +253,4 @@ def get_data(
)
print(
"Please check the installation guide at https://github.com/scribe-org/Scribe-Data/blob/main/src/scribe_data/unicode/UNICODE_INSTALLTION.md for more information.\n"
)
)
Original file line number Diff line number Diff line change
Expand Up @@ -7,22 +7,24 @@ SELECT
?singular
?plural
?gender
?definite
?indefinite

WHERE {
?lexeme dct:language wd:Q56475 ;
wikibase:lexicalCategory wd:Q1084 ;
wikibase:lemma ?singular .
FILTER(lang(?singular) = "ha")
# FILTER(lang(?singular) = "ha-arabic")
FILTER(lang(?singular) = "ha")
# FILTER(lang(?singular) = "ha-arabic")

# MARK: Plural

OPTIONAL {
?lexeme ontolex:lexicalForm ?pluralForm .
?pluralForm ontolex:representation ?plural ;
wikibase:grammaticalFeature wd:Q146786 .
FILTER(lang(?plural) = "ha")
# FILTER(lang(?plural) = "ha-arabic")
FILTER(lang(?plural) = "ha")
# FILTER(lang(?plural) = "ha-arabic")
}

# MARK: Gender(s)
Expand All @@ -31,6 +33,26 @@ WHERE {
?lexeme wdt:P5185 ?nounGender .
}

# MARK: Definite form

OPTIONAL {
?lexeme ontolex:lexicalForm ?definiteForm .
?definiteForm ontolex:representation ?definite ;
wikibase:grammaticalFeature wd:Q53997851 .
FILTER(lang(?definite) = "ha")
# FILTER(lang(?definite) = "ha-arabic")
}

# MARK: Indefinite form

OPTIONAL {
?lexeme ontolex:lexicalForm ?indefiniteForm .
?indefiniteForm ontolex:representation ?indefinite ;
wikibase:grammaticalFeature wd:Q53997857 .
FILTER(lang(?indefinite) = "ha")
# FILTER(lang(?indefinite) = "ha-arabic")
}

SERVICE wikibase:label {
bd:serviceParam wikibase:language "[AUTO_LANGUAGE]".
?nounGender rdfs:label ?gender .
Expand Down
Original file line number Diff line number Diff line change
@@ -1,27 +1,49 @@
# tool: scribe-data
# All Hausa (Q56475) proper nouns (Q147276) and the given forms.
# Enter this query at https://query.wikidata.org/.

SELECT
  (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
  ?singular
  ?gender
  ?definite
  ?vocative

WHERE {
  ?lexeme dct:language wd:Q56475 ;
    wikibase:lexicalCategory wd:Q147276 ;
    wikibase:lemma ?singular .
  FILTER(lang(?singular) = "ha")
  # FILTER(lang(?singular) = "ha-arabic")

  # MARK: Gender(s)

  OPTIONAL {
    ?lexeme wdt:P5185 ?nounGender .
  }

  # MARK: Definite form

  OPTIONAL {
    ?lexeme ontolex:lexicalForm ?definiteForm .
    ?definiteForm ontolex:representation ?definite ;
      wikibase:grammaticalFeature wd:Q53997851 .
    FILTER(lang(?definite) = "ha")
    # FILTER(lang(?definite) = "ha-arabic")
  }

  # MARK: Vocative form

  OPTIONAL {
    ?lexeme ontolex:lexicalForm ?vocativeForm .
    ?vocativeForm ontolex:representation ?vocative ;
      wikibase:grammaticalFeature wd:Q185077 .
    FILTER(lang(?vocative) = "ha")
    # FILTER(lang(?vocative) = "ha-arabic")
  }

  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "[AUTO_LANGUAGE]".
    ?nounGender rdfs:label ?gender .
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,67 @@

SELECT
  (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
  ?infinitive
  ?pastSimple
  ?presentContinuous
  ?futureTense
  ?imperativeSingular
  ?imperativePlural

WHERE {
  ?lexeme dct:language wd:Q56475 ;
    wikibase:lexicalCategory wd:Q24905 ;
    wikibase:lemma ?infinitive .
  FILTER(lang(?infinitive) = "ha")
  # FILTER(lang(?infinitive) = "ha-arabic")

  # MARK: Past Simple

  OPTIONAL {
    ?lexeme ontolex:lexicalForm ?pastSimpleForm .
    ?pastSimpleForm ontolex:representation ?pastSimple ;
      wikibase:grammaticalFeature wd:Q1392475, wd:Q1240211 .
    FILTER(lang(?pastSimple) = "ha")
    # FILTER(lang(?pastSimple) = "ha-arabic")
  }

  # MARK: Present Continuous

  OPTIONAL {
    ?lexeme ontolex:lexicalForm ?presentContinuousForm .
    ?presentContinuousForm ontolex:representation ?presentContinuous ;
      wikibase:grammaticalFeature wd:Q192613, wd:Q1423695 .
    FILTER(lang(?presentContinuous) = "ha")
    # FILTER(lang(?presentContinuous) = "ha-arabic")
  }

  # MARK: Future Tense

  OPTIONAL {
    ?lexeme ontolex:lexicalForm ?futureTenseForm .
    ?futureTenseForm ontolex:representation ?futureTense ;
      wikibase:grammaticalFeature wd:Q618612 .
    FILTER(lang(?futureTense) = "ha")
    # FILTER(lang(?futureTense) = "ha-arabic")
  }

  # MARK: Imperative Singular

  OPTIONAL {
    ?lexeme ontolex:lexicalForm ?imperativeSingularForm .
    ?imperativeSingularForm ontolex:representation ?imperativeSingular ;
      wikibase:grammaticalFeature wd:Q22716, wd:Q110786 .
    FILTER(lang(?imperativeSingular) = "ha")
    # FILTER(lang(?imperativeSingular) = "ha-arabic")
  }

  # MARK: Imperative Plural

  OPTIONAL {
    ?lexeme ontolex:lexicalForm ?imperativePluralForm .
    ?imperativePluralForm ontolex:representation ?imperativePlural ;
      wikibase:grammaticalFeature wd:Q22716, wd:Q146786 .
    FILTER(lang(?imperativePlural) = "ha")
    # FILTER(lang(?imperativePlural) = "ha-arabic")
  }
}
Loading
Loading