Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(queries): Extend SPARQL query to extract additional Latin verb forms (issue #444) #479

Closed
wants to merge 9 commits into from
112 changes: 105 additions & 7 deletions src/scribe_data/cli/get.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
-->
"""

import json
import subprocess
from pathlib import Path

Expand All @@ -30,7 +31,67 @@
DEFAULT_TSV_EXPORT_DIR,
)
from scribe_data.wikidata.query_data import query_data
from scribe_data.wikipedia.process_wiki import gen_autosuggestions
from scribe_data.wikidata.wikidata_utils import sparql

def load_lexeme_metadata():
    """
    Load the lexeme form metadata from the JSON file.

    The metadata file is expected to live next to this module as
    ``lexeme_form.metadata.json``.

    Returns
    -------
    dict
        The parsed metadata, or an empty dict if the file is missing
        or contains invalid JSON.
    """
    metadata_path = Path(__file__).parent / "lexeme_form.metadata.json"
    try:
        with open(metadata_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"Warning: Could not find lexeme metadata file at {metadata_path}")
        return {}
    except json.JSONDecodeError as e:
        # A corrupt metadata file should degrade gracefully, not crash the CLI.
        print(f"Warning: Invalid JSON in lexeme metadata file at {metadata_path}: {e}")
        return {}

def load_text_corpus(language):
    """
    Load the text corpus for a given language from Wikidata lexeme forms.

    Parameters
    ----------
    language : str
        The language code (e.g. "ha") to load the corpus for.

    Returns
    -------
    list
        The lexeme form representations found for the language, or an
        empty list if the code is invalid or the query fails.
    """
    import re

    lang_code = language.lower()

    # Language tags are letters and hyphens only (e.g. "ha", "ha-arabic").
    # Rejecting anything else prevents malformed or malicious strings from
    # being interpolated into the SPARQL query below.
    if not re.fullmatch(r"[a-z-]+", lang_code):
        print(f"Error loading corpus for {language}: invalid language code")
        return []

    # Create SPARQL query to get relevant lexemes for the language.
    query = """
    SELECT DISTINCT ?lexeme ?form ?representation WHERE {
        ?lexeme dct:language ?language .
        ?lexeme ontolex:lexicalForm ?form .
        ?form ontolex:representation ?representation .

        # Filter for specific language
        FILTER(LANG(?representation) = "%s")
    }
    LIMIT 10000
    """ % lang_code

    sparql.setQuery(query)

    try:
        results = sparql.query().convert()

        # Keep only the string value of each form representation.
        return [
            result["representation"]["value"]
            for result in results["results"]["bindings"]
        ]

    except Exception as e:
        # Best-effort: a failed Wikidata query should not abort the CLI run.
        print(f"Error loading corpus for {language}: {str(e)}")
        return []

def get_data(
language: str = None,
Expand Down Expand Up @@ -76,7 +137,6 @@ def get_data(
The requested data saved locally given file type and location arguments.
"""
# MARK: Defaults

output_type = output_type or "json"
if output_dir is None:
if output_type == "csv":
Expand All @@ -89,18 +149,18 @@ def get_data(
output_dir = DEFAULT_TSV_EXPORT_DIR

languages = [language] if language else None

subprocess_result = False

# MARK: Get All
# Load lexeme metadata
lexeme_metadata = load_lexeme_metadata()

# MARK: Get All
if all:
print("Updating all languages and data types ...")
query_data(None, None, None, overwrite)
subprocess_result = True

# MARK: Emojis

elif data_type in {"emoji-keywords", "emoji_keywords"}:
for lang in languages:
emoji_keyword_extraction_script = (
Expand All @@ -115,11 +175,49 @@ def get_data(
["python", emoji_keyword_extraction_script]
)

# MARK: Query Data
# MARK: Autosuggestions
elif data_type in {"autosuggestions", "auto_suggestions"}:
for lang in languages:
print(f"Generating autosuggestions for {lang}...")

# Load text corpus with lexeme forms consideration
text_corpus = load_text_corpus(lang)

if text_corpus:
try:
# Generate autosuggestions using the loaded corpus
autosuggestions = gen_autosuggestions(
text_corpus,
language=lang,
num_words=500, # Default number of words
update_local_data=True,
verbose=interactive
)

# Save autosuggestions with lexeme metadata
output_path = Path(output_dir) / lang / "autosuggestions.json"
output_path.parent.mkdir(parents=True, exist_ok=True)

# Combine autosuggestions with lexeme metadata
output_data = {
"autosuggestions": autosuggestions,
"lexeme_metadata": lexeme_metadata
}

with open(output_path, 'w', encoding='utf-8') as f:
json.dump(output_data, f, ensure_ascii=False, indent=2)

subprocess_result = True
print(f"Autosuggestions for {lang} generated and saved to {output_path}")

except Exception as e:
print(f"Error generating autosuggestions for {lang}: {str(e)}")
else:
print(f"No corpus data found for {lang}")

# MARK: Query Data
elif language or data_type:
data_type = data_type[0] if isinstance(data_type, list) else data_type

data_type = [data_type] if data_type else None
print(
f"Updating data for language(s): {language}; data type(s): {', '.join(data_type)}"
Expand Down Expand Up @@ -155,4 +253,4 @@ def get_data(
)
print(
"Please check the installation guide at https://github.com/scribe-org/Scribe-Data/blob/main/src/scribe_data/unicode/UNICODE_INSTALLTION.md for more information.\n"
)
)
Original file line number Diff line number Diff line change
Expand Up @@ -7,22 +7,24 @@ SELECT
?singular
?plural
?gender
?definite
?indefinite

WHERE {
?lexeme dct:language wd:Q56475 ;
wikibase:lexicalCategory wd:Q1084 ;
wikibase:lemma ?singular .
FILTER(lang(?singular) = "ha")
# FILTER(lang(?singular) = "ha-arabic")
FILTER(lang(?singular) = "ha")
# FILTER(lang(?singular) = "ha-arabic")

# MARK: Plural

OPTIONAL {
?lexeme ontolex:lexicalForm ?pluralForm .
?pluralForm ontolex:representation ?plural ;
wikibase:grammaticalFeature wd:Q146786 .
FILTER(lang(?plural) = "ha")
# FILTER(lang(?plural) = "ha-arabic")
FILTER(lang(?plural) = "ha")
# FILTER(lang(?plural) = "ha-arabic")
}

# MARK: Gender(s)
Expand All @@ -31,6 +33,26 @@ WHERE {
?lexeme wdt:P5185 ?nounGender .
}

# MARK: Definite form

OPTIONAL {
?lexeme ontolex:lexicalForm ?definiteForm .
?definiteForm ontolex:representation ?definite ;
wikibase:grammaticalFeature wd:Q53997851 .
FILTER(lang(?definite) = "ha")
# FILTER(lang(?definite) = "ha-arabic")
}

# MARK: Indefinite form

OPTIONAL {
?lexeme ontolex:lexicalForm ?indefiniteForm .
?indefiniteForm ontolex:representation ?indefinite ;
wikibase:grammaticalFeature wd:Q53997857 .
FILTER(lang(?indefinite) = "ha")
# FILTER(lang(?indefinite) = "ha-arabic")
}

SERVICE wikibase:label {
bd:serviceParam wikibase:language "[AUTO_LANGUAGE]".
?nounGender rdfs:label ?gender .
Expand Down
Original file line number Diff line number Diff line change
@@ -1,27 +1,49 @@
# tool: scribe-data
# All Hausa (Q56475) proper nouns (Q147276) and the given forms.
# Enter this query at https://query.wikidata.org/.

SELECT
  (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
  ?singular
  ?gender
  ?definite
  ?vocative

WHERE {
  ?lexeme dct:language wd:Q56475 ;
    wikibase:lexicalCategory wd:Q147276 ;
    wikibase:lemma ?singular .
  FILTER(lang(?singular) = "ha")
  # FILTER(lang(?singular) = "ha-arabic")

  # MARK: Gender(s)

  OPTIONAL {
    ?lexeme wdt:P5185 ?nounGender .
  }

  # MARK: Definite form

  OPTIONAL {
    ?lexeme ontolex:lexicalForm ?definiteForm .
    ?definiteForm ontolex:representation ?definite ;
      wikibase:grammaticalFeature wd:Q53997851 .
    FILTER(lang(?definite) = "ha")
    # FILTER(lang(?definite) = "ha-arabic")
  }

  # MARK: Vocative form

  OPTIONAL {
    ?lexeme ontolex:lexicalForm ?vocativeForm .
    ?vocativeForm ontolex:representation ?vocative ;
      wikibase:grammaticalFeature wd:Q185077 .
    FILTER(lang(?vocative) = "ha")
    # FILTER(lang(?vocative) = "ha-arabic")
  }

  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "[AUTO_LANGUAGE]".
    ?nounGender rdfs:label ?gender .
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,67 @@

SELECT
  (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
  ?infinitive
  ?pastSimple
  ?presentContinuous
  ?futureTense
  ?imperativeSingular
  ?imperativePlural

WHERE {
  ?lexeme dct:language wd:Q56475 ;
    wikibase:lexicalCategory wd:Q24905 ;
    wikibase:lemma ?infinitive .
  FILTER(lang(?infinitive) = "ha")
  # FILTER(lang(?infinitive) = "ha-arabic")

  # MARK: Past Simple

  OPTIONAL {
    ?lexeme ontolex:lexicalForm ?pastSimpleForm .
    ?pastSimpleForm ontolex:representation ?pastSimple ;
      wikibase:grammaticalFeature wd:Q1392475, wd:Q1240211 .
    FILTER(lang(?pastSimple) = "ha")
    # FILTER(lang(?pastSimple) = "ha-arabic")
  }

  # MARK: Present Continuous

  OPTIONAL {
    ?lexeme ontolex:lexicalForm ?presentContinuousForm .
    ?presentContinuousForm ontolex:representation ?presentContinuous ;
      wikibase:grammaticalFeature wd:Q192613, wd:Q1423695 .
    FILTER(lang(?presentContinuous) = "ha")
    # FILTER(lang(?presentContinuous) = "ha-arabic")
  }

  # MARK: Future Tense

  OPTIONAL {
    ?lexeme ontolex:lexicalForm ?futureTenseForm .
    ?futureTenseForm ontolex:representation ?futureTense ;
      wikibase:grammaticalFeature wd:Q618612 .
    FILTER(lang(?futureTense) = "ha")
    # FILTER(lang(?futureTense) = "ha-arabic")
  }

  # MARK: Imperative Singular

  OPTIONAL {
    ?lexeme ontolex:lexicalForm ?imperativeSingularForm .
    ?imperativeSingularForm ontolex:representation ?imperativeSingular ;
      wikibase:grammaticalFeature wd:Q22716, wd:Q110786 .
    FILTER(lang(?imperativeSingular) = "ha")
    # FILTER(lang(?imperativeSingular) = "ha-arabic")
  }

  # MARK: Imperative Plural

  OPTIONAL {
    ?lexeme ontolex:lexicalForm ?imperativePluralForm .
    ?imperativePluralForm ontolex:representation ?imperativePlural ;
      wikibase:grammaticalFeature wd:Q22716, wd:Q146786 .
    FILTER(lang(?imperativePlural) = "ha")
    # FILTER(lang(?imperativePlural) = "ha-arabic")
  }
}
Loading
Loading