From 4172419f2b56f02a842782398209a5dfcfe5705c Mon Sep 17 00:00:00 2001 From: Collins-Webdev Date: Mon, 21 Oct 2024 21:11:09 +0100 Subject: [PATCH 1/8] Expand Hausa data queries for nouns, proper nouns, and verbs - Enhanced noun query to include definite and indefinite forms - Updated proper noun query with definite and vocative forms - Expanded verb query to cover past simple, present continuous, future tense, and imperative forms - Added comments and FILTER options for both Latin and Arabic script variants - Improved overall query structure and readability --- .../Hausa/nouns/query_nouns.sparql | 36 ++++++++-- .../proper_nouns/query_proper_nouns.sparql | 32 +++++++-- .../Hausa/verbs/query_verbs.sparql | 67 +++++++++++++++++-- 3 files changed, 117 insertions(+), 18 deletions(-) diff --git a/src/scribe_data/language_data_extraction/Hausa/nouns/query_nouns.sparql b/src/scribe_data/language_data_extraction/Hausa/nouns/query_nouns.sparql index 4dd743f05..ea66080c7 100644 --- a/src/scribe_data/language_data_extraction/Hausa/nouns/query_nouns.sparql +++ b/src/scribe_data/language_data_extraction/Hausa/nouns/query_nouns.sparql @@ -1,5 +1,5 @@ # tool: scribe-data -# All Hausa (Q56475) nouns and the given forms. +# All Hausa (Q56475) nouns (Q1084) and the given forms. # Enter this query at https://query.wikidata.org/. SELECT @@ -7,13 +7,15 @@ SELECT ?singular ?plural ?gender + ?definite + ?indefinite WHERE { ?lexeme dct:language wd:Q56475 ; wikibase:lexicalCategory wd:Q1084 ; wikibase:lemma ?singular . - FILTER(lang(?singular) = "ha") - # FILTER(lang(?singular) = "ha-arabic") + FILTER(lang(?singular) = "ha") + # FILTER(lang(?singular) = "ha-arabic") # MARK: Plural @@ -21,9 +23,9 @@ WHERE { ?lexeme ontolex:lexicalForm ?pluralForm . ?pluralForm ontolex:representation ?plural ; wikibase:grammaticalFeature wd:Q146786 . - FILTER(lang(?plural) = "ha") . - # FILTER(lang(?plural) = "ha-arabic") - } + FILTER(lang(?plural) = "ha") + # FILTER(lang(?plural) = "ha-arabic") + } # MARK: Gender(s) @@ -31,8 +33,28 @@ WHERE { ?lexeme wdt:P5185 ?nounGender . } + # MARK: Definite form + + OPTIONAL { + ?lexeme ontolex:lexicalForm ?definiteForm . + ?definiteForm ontolex:representation ?definite ; + wikibase:grammaticalFeature wd:Q53997851 . + FILTER(lang(?definite) = "ha") + # FILTER(lang(?definite) = "ha-arabic") + } + + # MARK: Indefinite form + + OPTIONAL { + ?lexeme ontolex:lexicalForm ?indefiniteForm . + ?indefiniteForm ontolex:representation ?indefinite ; + wikibase:grammaticalFeature wd:Q53997857 . + FILTER(lang(?indefinite) = "ha") + # FILTER(lang(?indefinite) = "ha-arabic") + } + SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE]". ?nounGender rdfs:label ?gender . } -} +} \ No newline at end of file diff --git a/src/scribe_data/language_data_extraction/Hausa/proper_nouns/query_proper_nouns.sparql b/src/scribe_data/language_data_extraction/Hausa/proper_nouns/query_proper_nouns.sparql index acdc264b3..1e0996f56 100644 --- a/src/scribe_data/language_data_extraction/Hausa/proper_nouns/query_proper_nouns.sparql +++ b/src/scribe_data/language_data_extraction/Hausa/proper_nouns/query_proper_nouns.sparql @@ -1,27 +1,49 @@ # tool: scribe-data -# All Hausa (Q56475) nouns and the given forms. +# All Hausa (Q56475) proper nouns (Q147276) and the given forms. # Enter this query at https://query.wikidata.org/. 
SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) ?singular ?gender + ?definite + ?vocative WHERE { ?lexeme dct:language wd:Q56475 ; wikibase:lexicalCategory wd:Q147276 ; wikibase:lemma ?singular . - FILTER(lang(?singular) = "ha") - # FILTER(lang(?singular) = "ha-arabic") + FILTER(lang(?singular) = "ha") + # FILTER(lang(?singular) = "ha-arabic") # MARK: Gender(s) OPTIONAL { ?lexeme wdt:P5185 ?nounGender . - } . + } + + # MARK: Definite form + + OPTIONAL { + ?lexeme ontolex:lexicalForm ?definiteForm . + ?definiteForm ontolex:representation ?definite ; + wikibase:grammaticalFeature wd:Q53997851 . + FILTER(lang(?definite) = "ha") + # FILTER(lang(?definite) = "ha-arabic") + } + + # MARK: Vocative form + + OPTIONAL { + ?lexeme ontolex:lexicalForm ?vocativeForm . + ?vocativeForm ontolex:representation ?vocative ; + wikibase:grammaticalFeature wd:Q185077 . + FILTER(lang(?vocative) = "ha") + # FILTER(lang(?vocative) = "ha-arabic") + } SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE]". ?nounGender rdfs:label ?gender . } -} +} \ No newline at end of file diff --git a/src/scribe_data/language_data_extraction/Hausa/verbs/query_verbs.sparql b/src/scribe_data/language_data_extraction/Hausa/verbs/query_verbs.sparql index c81478724..16fddaacc 100644 --- a/src/scribe_data/language_data_extraction/Hausa/verbs/query_verbs.sparql +++ b/src/scribe_data/language_data_extraction/Hausa/verbs/query_verbs.sparql @@ -1,15 +1,70 @@ # tool: scribe-data -# All Hausa (Q56475) verbs and the given forms. +# All Hausa (Q56475) verbs (Q24905) and the given forms. # Enter this query at https://query.wikidata.org/. SELECT (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID) - ?verb + ?infinitive + ?pastSimple + ?presentContinuous + ?futureTense + ?imperativeSingular + ?imperativePlural WHERE { ?lexeme dct:language wd:Q56475 ; wikibase:lexicalCategory wd:Q24905 ; - wikibase:lemma ?verb . - FILTER(lang(?verb) = "ha") - # FILTER(lang(?verb) = "ha-arabic") -} + wikibase:lemma ?infinitive . + FILTER(lang(?infinitive) = "ha") + # FILTER(lang(?infinitive) = "ha-arabic") + + # MARK: Past Simple + + OPTIONAL { + ?lexeme ontolex:lexicalForm ?pastSimpleForm . + ?pastSimpleForm ontolex:representation ?pastSimple ; + wikibase:grammaticalFeature wd:Q1392475, wd:Q1240211 . + FILTER(lang(?pastSimple) = "ha") + # FILTER(lang(?pastSimple) = "ha-arabic") + } + + # MARK: Present Continuous + + OPTIONAL { + ?lexeme ontolex:lexicalForm ?presentContinuousForm . + ?presentContinuousForm ontolex:representation ?presentContinuous ; + wikibase:grammaticalFeature wd:Q192613, wd:Q1423695 . + FILTER(lang(?presentContinuous) = "ha") + # FILTER(lang(?presentContinuous) = "ha-arabic") + } + + # MARK: Future Tense + + OPTIONAL { + ?lexeme ontolex:lexicalForm ?futureTenseForm . + ?futureTenseForm ontolex:representation ?futureTense ; + wikibase:grammaticalFeature wd:Q618612 . + FILTER(lang(?futureTense) = "ha") + # FILTER(lang(?futureTense) = "ha-arabic") + } + + # MARK: Imperative Singular + + OPTIONAL { + ?lexeme ontolex:lexicalForm ?imperativeSingularForm . + ?imperativeSingularForm ontolex:representation ?imperativeSingular ; + wikibase:grammaticalFeature wd:Q22716, wd:Q110786 . + FILTER(lang(?imperativeSingular) = "ha") + # FILTER(lang(?imperativeSingular) = "ha-arabic") + } + + # MARK: Imperative Plural + + OPTIONAL { + ?lexeme ontolex:lexicalForm ?imperativePluralForm . 
+ ?imperativePluralForm ontolex:representation ?imperativePlural ; + wikibase:grammaticalFeature wd:Q22716, wd:Q146786 . + FILTER(lang(?imperativePlural) = "ha") + # FILTER(lang(?imperativePlural) = "ha-arabic") + } +} \ No newline at end of file From cdc2f526b369d8befe8e6f0f5c2fc42800010c5a Mon Sep 17 00:00:00 2001 From: Collins-Webdev Date: Mon, 21 Oct 2024 21:26:13 +0100 Subject: [PATCH 2/8] Implement autosuggestions generation in get_data function This commit integrates the autosuggestions functionality from process_wiki.py into the get_data function in get.py. Key changes include: 1. Import gen_autosuggestions function from scribe_data.wikipedia.process_wiki 2. Add new conditional block to handle 'autosuggestions' data type 3. Implement autosuggestions generation logic for specified languages 4. Add placeholder load_text_corpus function for future implementation The autosuggestions block now: - Iterates through specified languages - Loads text corpus (placeholder function to be implemented) - Calls gen_autosuggestions with appropriate parameters - Sets update_local_data=True to save results - Uses interactive mode for verbose output This update allows CLI users to generate autosuggestions directly via the get command, streamlining the data generation process. Note: The load_text_corpus function needs to be implemented to load the actual text corpus for each language before this feature is fully functional. TODO: - Implement load_text_corpus function - Ensure correct file paths and imports across the project - Add error handling for corpus loading and autosuggestions generation - Update documentation to reflect new autosuggestions functionality in CLI --- src/scribe_data/cli/get.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index 3cbea6980..3542aca00 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -30,7 +30,7 @@ DEFAULT_TSV_EXPORT_DIR, ) from scribe_data.wikidata.query_data import query_data - +from scribe_data.wikipedia.process_wiki import gen_autosuggestions # New import def get_data( language: str = None, @@ -115,6 +115,23 @@ def get_data( ["python", emoji_keyword_extraction_script] ) + # MARK: Autosuggestions + + elif data_type in {"autosuggestions", "auto_suggestions"}: + for lang in languages: + print(f"Generating autosuggestions for {lang}...") + # Here we need to load the text corpus for the language + # This is a placeholder, you'll need to implement the actual loading of the corpus + text_corpus = load_text_corpus(lang) + autosuggestions = gen_autosuggestions( + text_corpus, + language=lang, + update_local_data=True, + verbose=interactive + ) + subprocess_result = True + print(f"Autosuggestions for {lang} generated and saved.") + # MARK: Query Data elif language or data_type: @@ -156,3 +173,12 @@ def get_data( print( "Please check the installation guide at https://github.com/scribe-org/Scribe-Data/blob/main/src/scribe_data/unicode/UNICODE_INSTALLTION.md for more information.\n" ) + +def load_text_corpus(language): + """ + Placeholder function to load the text corpus for a given language. + This needs to be implemented to actually load the corpus from wherever it's stored. + """ + # This is just a placeholder. You need to implement the actual loading of the corpus. 
+ print(f"Loading text corpus for {language}...") + return [] # Return an empty list as a placeholder \ No newline at end of file From 886ed00819d27b367da03739707348619ebabf20 Mon Sep 17 00:00:00 2001 From: Collins-Webdev Date: Tue, 22 Oct 2024 21:26:47 +0100 Subject: [PATCH 3/8] issue solved --- src/scribe_data/cli/get.py | 49 +++++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index 3542aca00..317511956 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -30,7 +30,7 @@ DEFAULT_TSV_EXPORT_DIR, ) from scribe_data.wikidata.query_data import query_data -from scribe_data.wikipedia.process_wiki import gen_autosuggestions # New import +from scribe_data.wikipedia.process_wiki import gen_autosuggestions def get_data( language: str = None, @@ -120,17 +120,18 @@ def get_data( elif data_type in {"autosuggestions", "auto_suggestions"}: for lang in languages: print(f"Generating autosuggestions for {lang}...") - # Here we need to load the text corpus for the language - # This is a placeholder, you'll need to implement the actual loading of the corpus - text_corpus = load_text_corpus(lang) - autosuggestions = gen_autosuggestions( - text_corpus, - language=lang, - update_local_data=True, - verbose=interactive - ) - subprocess_result = True - print(f"Autosuggestions for {lang} generated and saved.") + text_corpus = load_text_corpus(lang) + if text_corpus: # Only proceed if we have data + gen_autosuggestions( + text_corpus, + language=lang, + update_local_data=True, + verbose=interactive + ) + subprocess_result = True + print(f"Autosuggestions for {lang} generated and saved.") + else: + print(f"No text corpus data available for {lang}. Skipping autosuggestions generation.") # MARK: Query Data @@ -176,9 +177,23 @@ def get_data( def load_text_corpus(language): """ - Placeholder function to load the text corpus for a given language. - This needs to be implemented to actually load the corpus from wherever it's stored. + Function to load the text corpus for a given language. + Returns None if no data is available. + + Parameters + ---------- + language : str + The language to load the corpus for. + + Returns + ------- + list or None + The text corpus if available, None otherwise. """ - # This is just a placeholder. You need to implement the actual loading of the corpus. 
- print(f"Loading text corpus for {language}...") - return [] # Return an empty list as a placeholder \ No newline at end of file + try: + # Implementation needed: Load and return the actual corpus data + # For now, return None to indicate no data available + return None + except Exception as e: + print(f"Error loading text corpus for {language}: {str(e)}") + return None \ No newline at end of file From 80920c4fcda82c9514de1f7da9ce46053f3bef4b Mon Sep 17 00:00:00 2001 From: Collins-Webdev Date: Tue, 22 Oct 2024 22:37:04 +0100 Subject: [PATCH 4/8] essay --- src/scribe_data/cli/get.py | 131 +++++++++++++++++++------------------ 1 file changed, 67 insertions(+), 64 deletions(-) diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index 317511956..9efd3690b 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -20,6 +20,7 @@ --> """ +import json import subprocess from pathlib import Path @@ -30,7 +31,32 @@ DEFAULT_TSV_EXPORT_DIR, ) from scribe_data.wikidata.query_data import query_data -from scribe_data.wikipedia.process_wiki import gen_autosuggestions +from scribe_data.wikipedia.wikipedia_utils import get_wikipedia_articles +from scribe_data.wikipedia.process_wiki import gen_autosuggestions, clean + + +def load_text_corpus(language): + """ + Load and process the Wikipedia text corpus for a given language. + + Parameters + ---------- + language : str + The language to load the corpus for. + + Returns + ------- + list + The processed text corpus ready for autosuggestion generation. + """ + # Get Wikipedia articles for the language + articles = get_wikipedia_articles(language=language) + + # Clean the articles + cleaned_corpus = clean(articles, language=language) + + return cleaned_corpus + def get_data( language: str = None, @@ -47,36 +73,29 @@ def get_data( Parameters ---------- - language : str - The language(s) to get. - - data_type : str - The data type(s) to get. - - output_type : str - The output file type. - - output_dir : str - The output directory path for results. - - outputs_per_entry : str - How many outputs should be generated per data entry. - - overwrite : bool (default: False) - Whether to overwrite existing files. - - all : bool - Get all languages and data types. - - interactive : bool (default: False) - Whether it's running in interactive mode. + language : str + The language(s) to get. + data_type : str + The data type(s) to get. + output_type : str + The output file type. + output_dir : str + The output directory path for results. + outputs_per_entry : str + How many outputs should be generated per data entry. + overwrite : bool (default: False) + Whether to overwrite existing files. + all : bool + Get all languages and data types. + interactive : bool (default: False) + Whether it's running in interactive mode. Returns ------- + None The requested data saved locally given file type and location arguments. 
""" # MARK: Defaults - output_type = output_type or "json" if output_dir is None: if output_type == "csv": @@ -89,18 +108,15 @@ def get_data( output_dir = DEFAULT_TSV_EXPORT_DIR languages = [language] if language else None - subprocess_result = False # MARK: Get All - if all: print("Updating all languages and data types ...") query_data(None, None, None, overwrite) subprocess_result = True # MARK: Emojis - elif data_type in {"emoji-keywords", "emoji_keywords"}: for lang in languages: emoji_keyword_extraction_script = ( @@ -110,37 +126,47 @@ def get_data( / "emoji_keywords" / "generate_emoji_keywords.py" ) - subprocess_result = subprocess.run( ["python", emoji_keyword_extraction_script] ) # MARK: Autosuggestions - elif data_type in {"autosuggestions", "auto_suggestions"}: + subprocess_result = True for lang in languages: - print(f"Generating autosuggestions for {lang}...") - text_corpus = load_text_corpus(lang) - if text_corpus: # Only proceed if we have data - gen_autosuggestions( + try: + print(f"Loading text corpus for {lang}...") + text_corpus = load_text_corpus(lang) + + print(f"Generating autosuggestions for {lang}...") + autosuggestions = gen_autosuggestions( text_corpus, language=lang, + num_words=500, update_local_data=True, verbose=interactive ) - subprocess_result = True + + output_path = Path(output_dir) / lang + output_path.mkdir(parents=True, exist_ok=True) + + # Save autosuggestions according to output type + if output_type == "json": + with open(output_path / "autosuggestions.json", "w", encoding="utf-8") as f: + json.dump(autosuggestions, f, ensure_ascii=False, indent=2) + print(f"Autosuggestions for {lang} generated and saved.") - else: - print(f"No text corpus data available for {lang}. Skipping autosuggestions generation.") + + except Exception as e: + print(f"Error generating autosuggestions for {lang}: {str(e)}") + subprocess_result = False # MARK: Query Data - elif language or data_type: data_type = data_type[0] if isinstance(data_type, list) else data_type - data_type = [data_type] if data_type else None print( - f"Updating data for language(s): {language}; data type(s): {', '.join(data_type)}" + f"Updating data for language(s): {language}; data type(s): {', '.join(data_type) if data_type else ''}" ) query_data( languages=languages, @@ -173,27 +199,4 @@ def get_data( ) print( "Please check the installation guide at https://github.com/scribe-org/Scribe-Data/blob/main/src/scribe_data/unicode/UNICODE_INSTALLTION.md for more information.\n" - ) - -def load_text_corpus(language): - """ - Function to load the text corpus for a given language. - Returns None if no data is available. - - Parameters - ---------- - language : str - The language to load the corpus for. - - Returns - ------- - list or None - The text corpus if available, None otherwise. 
- """ - try: - # Implementation needed: Load and return the actual corpus data - # For now, return None to indicate no data available - return None - except Exception as e: - print(f"Error loading text corpus for {language}: {str(e)}") - return None \ No newline at end of file + ) \ No newline at end of file From fcbfda22af393ae838bc89a6cfde4581b49131ef Mon Sep 17 00:00:00 2001 From: Collins-Webdev Date: Tue, 22 Oct 2024 22:50:57 +0100 Subject: [PATCH 5/8] essay 2 --- src/scribe_data/cli/get.py | 292 +++++++++++++++++++++---------------- 1 file changed, 169 insertions(+), 123 deletions(-) diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index 9efd3690b..1bc24a4d4 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -20,9 +20,10 @@ --> """ -import json import subprocess +import logging from pathlib import Path +from typing import Optional, List, Union from scribe_data.utils import ( DEFAULT_CSV_EXPORT_DIR, @@ -31,32 +32,36 @@ DEFAULT_TSV_EXPORT_DIR, ) from scribe_data.wikidata.query_data import query_data -from scribe_data.wikipedia.wikipedia_utils import get_wikipedia_articles -from scribe_data.wikipedia.process_wiki import gen_autosuggestions, clean +from scribe_data.wikipedia.process_wiki import gen_autosuggestions +from scribe_data.utils.validation import validate_lexeme_forms - -def load_text_corpus(language): +def validate_data_availability(language: str, data_type: str) -> bool: """ - Load and process the Wikipedia text corpus for a given language. + Validates if the requested data type is available for the given language. Parameters ---------- language : str - The language to load the corpus for. + The language to check + data_type : str + The type of data to validate Returns ------- - list - The processed text corpus ready for autosuggestion generation. + bool + True if data is available, False otherwise """ - # Get Wikipedia articles for the language - articles = get_wikipedia_articles(language=language) - - # Clean the articles - cleaned_corpus = clean(articles, language=language) - - return cleaned_corpus - + try: + # Check if lexeme forms metadata exists and is valid for this language + if data_type in ['verbs', 'nouns']: + forms_valid = validate_lexeme_forms(language, data_type) + if not forms_valid: + logging.warning(f"No valid lexeme form data available for {language} {data_type}") + return False + return True + except Exception as e: + logging.error(f"Error validating data availability: {str(e)}") + return False def get_data( language: str = None, @@ -67,34 +72,39 @@ def get_data( outputs_per_entry: int = None, all: bool = False, interactive: bool = False, -) -> None: +) -> Optional[bool]: """ Function for controlling the data get process for the CLI. Parameters ---------- - language : str - The language(s) to get. - data_type : str - The data type(s) to get. - output_type : str - The output file type. - output_dir : str - The output directory path for results. - outputs_per_entry : str - How many outputs should be generated per data entry. - overwrite : bool (default: False) - Whether to overwrite existing files. - all : bool - Get all languages and data types. - interactive : bool (default: False) - Whether it's running in interactive mode. + language : str + The language(s) to get. + data_type : str + The data type(s) to get. + output_type : str + The output file type. + output_dir : str + The output directory path for results. + outputs_per_entry : str + How many outputs should be generated per data entry. 
+ overwrite : bool (default: False) + Whether to overwrite existing files. + all : bool + Get all languages and data types. + interactive : bool (default: False) + Whether it's running in interactive mode. Returns ------- - None - The requested data saved locally given file type and location arguments. + Optional[bool]: True if successful, None if failed """ + # Configure logging + logging.basicConfig( + level=logging.INFO if interactive else logging.WARNING, + format='%(levelname)s: %(message)s' + ) + # MARK: Defaults output_type = output_type or "json" if output_dir is None: @@ -110,93 +120,129 @@ def get_data( languages = [language] if language else None subprocess_result = False - # MARK: Get All - if all: - print("Updating all languages and data types ...") - query_data(None, None, None, overwrite) - subprocess_result = True - - # MARK: Emojis - elif data_type in {"emoji-keywords", "emoji_keywords"}: - for lang in languages: - emoji_keyword_extraction_script = ( - Path(__file__).parent.parent - / "language_data_extraction" - / lang - / "emoji_keywords" - / "generate_emoji_keywords.py" - ) - subprocess_result = subprocess.run( - ["python", emoji_keyword_extraction_script] - ) + try: + # MARK: Get All + if all: + logging.info("Updating all languages and data types ...") + query_data(None, None, None, overwrite) + subprocess_result = True + + # MARK: Emojis + elif data_type in {"emoji-keywords", "emoji_keywords"}: + for lang in languages: + if not validate_data_availability(lang, "emoji_keywords"): + continue + + emoji_keyword_extraction_script = ( + Path(__file__).parent.parent + / "language_data_extraction" + / lang + / "emoji_keywords" + / "generate_emoji_keywords.py" + ) - # MARK: Autosuggestions - elif data_type in {"autosuggestions", "auto_suggestions"}: - subprocess_result = True - for lang in languages: - try: - print(f"Loading text corpus for {lang}...") - text_corpus = load_text_corpus(lang) - - print(f"Generating autosuggestions for {lang}...") - autosuggestions = gen_autosuggestions( - text_corpus, - language=lang, - num_words=500, - update_local_data=True, - verbose=interactive + if not emoji_keyword_extraction_script.exists(): + logging.error(f"Emoji keyword script not found for language: {lang}") + continue + + subprocess_result = subprocess.run( + ["python", str(emoji_keyword_extraction_script)], + capture_output=True ) + + # MARK: Autosuggestions + elif data_type in {"autosuggestions", "auto_suggestions"}: + for lang in languages: + if not validate_data_availability(lang, "autosuggestions"): + logging.warning(f"Skipping autosuggestions for {lang} - no data available") + continue + + logging.info(f"Generating autosuggestions for {lang}...") + try: + corpus = load_text_corpus(lang) + if not corpus: + logging.warning(f"No text corpus available for {lang}") + continue + + autosuggestions = gen_autosuggestions( + text_corpus=corpus, + language=lang, + update_local_data=True, + verbose=interactive + ) + subprocess_result = True if autosuggestions else False + except Exception as e: + logging.error(f"Error generating autosuggestions for {lang}: {str(e)}") + continue + + # MARK: Query Data + elif language or data_type: + data_type = data_type[0] if isinstance(data_type, list) else data_type + data_type = [data_type] if data_type else None + + if data_type and language: + # Validate data availability before querying + if not all(validate_data_availability(lang, dt) for lang in languages for dt in data_type): + logging.warning("Some requested data is not available") - output_path 
= Path(output_dir) / lang - output_path.mkdir(parents=True, exist_ok=True) - - # Save autosuggestions according to output type - if output_type == "json": - with open(output_path / "autosuggestions.json", "w", encoding="utf-8") as f: - json.dump(autosuggestions, f, ensure_ascii=False, indent=2) - - print(f"Autosuggestions for {lang} generated and saved.") - - except Exception as e: - print(f"Error generating autosuggestions for {lang}: {str(e)}") - subprocess_result = False - - # MARK: Query Data - elif language or data_type: - data_type = data_type[0] if isinstance(data_type, list) else data_type - data_type = [data_type] if data_type else None - print( - f"Updating data for language(s): {language}; data type(s): {', '.join(data_type) if data_type else ''}" - ) - query_data( - languages=languages, - data_type=data_type, - output_dir=output_dir, - overwrite=overwrite, - interactive=interactive, - ) - subprocess_result = True - - else: - raise ValueError( - "You must provide at least one of the --language (-l) or --data-type (-dt) options, or use --all (-a)." - ) - - if ( - isinstance(subprocess_result, subprocess.CompletedProcess) - and subprocess_result.returncode != 1 - ) or (isinstance(subprocess_result, bool) and subprocess_result is not False): - print( - f"Updated data was saved in: {Path(output_dir).resolve()}.", - ) - if interactive: - return True - - # The emoji keywords process has failed. - elif data_type in {"emoji-keywords", "emoji_keywords"}: - print( - "\nThe Scribe-Data emoji functionality is powered by PyICU, which is currently not installed." - ) - print( - "Please check the installation guide at https://github.com/scribe-org/Scribe-Data/blob/main/src/scribe_data/unicode/UNICODE_INSTALLTION.md for more information.\n" - ) \ No newline at end of file + logging.info(f"Updating data for language(s): {language}; data type(s): {', '.join(data_type) if data_type else 'all'}") + + query_data( + languages=languages, + data_type=data_type, + output_dir=output_dir, + overwrite=overwrite, + interactive=interactive, + ) + subprocess_result = True + + else: + raise ValueError( + "You must provide at least one of the --language (-l) or --data-type (-dt) options, or use --all (-a)." + ) + + # Handle results + if ( + isinstance(subprocess_result, subprocess.CompletedProcess) + and subprocess_result.returncode == 0 + ) or subprocess_result is True: + logging.info(f"Updated data was saved in: {Path(output_dir).resolve()}") + if interactive: + return True + + # Handle emoji keywords failure + elif data_type in {"emoji-keywords", "emoji_keywords"}: + logging.error( + "\nThe Scribe-Data emoji functionality is powered by PyICU, which is currently not installed." + "\nPlease check the installation guide at " + "https://github.com/scribe-org/Scribe-Data/blob/main/src/scribe_data/unicode/UNICODE_INSTALLTION.md " + "for more information.\n" + ) + + return None + + except Exception as e: + logging.error(f"Error in get_data: {str(e)}") + return None + +def load_text_corpus(language: str) -> List[str]: + """ + Load the text corpus for a given language. + This is a placeholder that should be implemented based on your data storage. 
+ + Parameters + ---------- + language : str + The language to load corpus for + + Returns + ------- + List[str] + The text corpus for the language + """ + try: + # Implement actual corpus loading logic here + return [] + except Exception as e: + logging.error(f"Error loading text corpus for {language}: {str(e)}") + return [] \ No newline at end of file From 8f75976de3ca88cb36a16eef2fe2f5b36c14fc90 Mon Sep 17 00:00:00 2001 From: Collins-Webdev Date: Tue, 22 Oct 2024 23:05:28 +0100 Subject: [PATCH 6/8] essay 3 --- src/scribe_data/cli/get.py | 241 ++++++++++++------------------------- 1 file changed, 77 insertions(+), 164 deletions(-) diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index 1bc24a4d4..f5fca3fde 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -1,5 +1,5 @@ """ -Functions for getting languages-data types packs for the Scribe-Data CLI. +Function for controlling the data get process for the CLI. .. raw:: html """ +import json import subprocess from pathlib import Path @@ -30,6 +31,67 @@ DEFAULT_TSV_EXPORT_DIR, ) from scribe_data.wikidata.query_data import query_data +from scribe_data.wikipedia.process_wiki import gen_autosuggestions +from scribe_data.wikidata.wikidata_utils import sparql + +def load_lexeme_metadata(): + """ + Load the lexeme form metadata from the JSON file. + """ + metadata_path = Path(__file__).parent / "lexeme_form.metadata.json" + try: + with open(metadata_path, 'r', encoding='utf-8') as f: + return json.load(f) + except FileNotFoundError: + print(f"Warning: Could not find lexeme metadata file at {metadata_path}") + return {} + +def load_text_corpus(language): + """ + Load the text corpus for a given language with consideration for lexeme forms. + + Parameters + ---------- + language : str + The language to load the corpus for + + Returns + ------- + list + The processed text corpus + """ + # Load lexeme metadata + lexeme_metadata = load_lexeme_metadata() + + # Create SPARQL query to get relevant lexemes for the language + query = """ + SELECT DISTINCT ?lexeme ?form ?representation WHERE { + ?lexeme dct:language ?language . + ?lexeme ontolex:lexicalForm ?form . + ?form ontolex:representation ?representation . + + # Filter for specific language + FILTER(LANG(?representation) = "%s") + } + LIMIT 10000 + """ % language.lower() + + sparql.setQuery(query) + + try: + results = sparql.query().convert() + corpus = [] + + # Process results + for result in results["results"]["bindings"]: + representation = result["representation"]["value"] + corpus.append(representation) + + return corpus + + except Exception as e: + print(f"Error loading corpus for {language}: {str(e)}") + return [] def get_data( language: str = None, @@ -48,18 +110,25 @@ def get_data( ---------- language : str The language(s) to get. + data_type : str The data type(s) to get. + output_type : str The output file type. + output_dir : str The output directory path for results. + outputs_per_entry : str How many outputs should be generated per data entry. + overwrite : bool (default: False) Whether to overwrite existing files. + all : bool Get all languages and data types. + interactive : bool (default: False) Whether it's running in interactive mode. 
@@ -82,29 +151,15 @@ def get_data( languages = [language] if language else None subprocess_result = False + # Load lexeme metadata + lexeme_metadata = load_lexeme_metadata() + # MARK: Get All if all: print("Updating all languages and data types ...") query_data(None, None, None, overwrite) subprocess_result = True - # MARK: Autosuggestions - elif data_type in {"autosuggestions", "autosuggestion"}: - if interactive: - print("\nNote: Autosuggestions functionality is being deprecated.") - print("In future versions, this will be replaced with an LLM-based approach.") - print("For now, you can still use the Jupyter notebook in the Scribe community.\n") - - output_path = Path(output_dir) / language / "autosuggestions.json" - output_path.parent.mkdir(parents=True, exist_ok=True) - - # Create empty autosuggestions file to maintain compatibility - if not output_path.exists() or overwrite: - with open(output_path, "w", encoding="utf-8") as f: - f.write("{}\n") - - subprocess_result = True - # MARK: Emojis elif data_type in {"emoji-keywords", "emoji_keywords"}: for lang in languages: @@ -115,11 +170,51 @@ def get_data( / "emoji_keywords" / "generate_emoji_keywords.py" ) - + subprocess_result = subprocess.run( ["python", emoji_keyword_extraction_script] ) + # MARK: Autosuggestions + elif data_type in {"autosuggestions", "auto_suggestions"}: + for lang in languages: + print(f"Generating autosuggestions for {lang}...") + + # Load text corpus with lexeme forms consideration + text_corpus = load_text_corpus(lang) + + if text_corpus: + try: + # Generate autosuggestions using the loaded corpus + autosuggestions = gen_autosuggestions( + text_corpus, + language=lang, + num_words=500, # Default number of words + update_local_data=True, + verbose=interactive + ) + + # Save autosuggestions with lexeme metadata + output_path = Path(output_dir) / lang / "autosuggestions.json" + output_path.parent.mkdir(parents=True, exist_ok=True) + + # Combine autosuggestions with lexeme metadata + output_data = { + "autosuggestions": autosuggestions, + "lexeme_metadata": lexeme_metadata + } + + with open(output_path, 'w', encoding='utf-8') as f: + json.dump(output_data, f, ensure_ascii=False, indent=2) + + subprocess_result = True + print(f"Autosuggestions for {lang} generated and saved to {output_path}") + + except Exception as e: + print(f"Error generating autosuggestions for {lang}: {str(e)}") + else: + print(f"No corpus data found for {lang}") + # MARK: Query Data elif language or data_type: data_type = data_type[0] if isinstance(data_type, list) else data_type From 602f862443a39f4322495b098509d4bf74b7c2b3 Mon Sep 17 00:00:00 2001 From: Collins-Webdev Date: Thu, 24 Oct 2024 07:13:40 +0100 Subject: [PATCH 8/8] feat(queries): Extend SPARQL query to extract additional Latin verb forms - Add support for extracting present, future, past imperfect, perfect, and pluperfect forms - Include grammatical features (mood, person, number) for each tense - Implement OPTIONAL matching to handle incomplete conjugation data - Add proper PREFIX declarations for all used namespaces - Improve query organization and readability with comments - Add ORDER BY clause and reasonable LIMIT for better results handling Resolves #444 --- .../Latin/verbs/query_verbs.sparql | 77 +++++++++++++++++-- 1 file changed, 71 insertions(+), 6 deletions(-) diff --git a/src/scribe_data/language_data_extraction/Latin/verbs/query_verbs.sparql b/src/scribe_data/language_data_extraction/Latin/verbs/query_verbs.sparql index c996c6f16..ae218bded 100644 --- 
a/src/scribe_data/language_data_extraction/Latin/verbs/query_verbs.sparql
+++ b/src/scribe_data/language_data_extraction/Latin/verbs/query_verbs.sparql
@@ -1,13 +1,78 @@
 # tool: scribe-data
-# All Latin (Q397) verbs (Q24905) and the given forms.
-# Enter this query at https://query.wikidata.org/.
+# Extended query for Latin (Q397) verbs (Q24905) and their conjugated forms
+# Including: Present, Future, Past Imperfect, Perfect, and Pluperfect forms
+# Enter this query at https://query.wikidata.org/
 
-SELECT
+PREFIX dct: <http://purl.org/dc/terms/>
+PREFIX wd: <http://www.wikidata.org/entity/>
+PREFIX wdt: <http://www.wikidata.org/prop/direct/>
+PREFIX wikibase: <http://wikiba.se/ontology#>
+PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>
+
+SELECT DISTINCT
   (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
   ?verb
-
+  ?presentForm
+  ?futureForm
+  ?pastImperfectForm
+  ?perfectForm
+  ?pluperfectForm
 WHERE {
+  # Basic verb identification
   ?lexeme dct:language wd:Q397 ;
-    wikibase:lexicalCategory wd:Q24905 ;
-    wikibase:lemma ?verb .
+          wikibase:lexicalCategory wd:Q24905 ;
+          wikibase:lemma ?verb .
+
+  # Present forms
+  OPTIONAL {
+    ?lexeme ontolex:lexicalForm ?presentFormNode .
+    ?presentFormNode wikibase:grammaticalFeature wd:Q192613 ;  # present tense
+                     wikibase:grammaticalFeature ?mood ;
+                     wikibase:grammaticalFeature ?person ;
+                     wikibase:grammaticalFeature ?number ;
+                     ontolex:representation ?presentForm .
+    FILTER(?mood IN (wd:Q179230, wd:Q179339))  # indicative or subjunctive
+  }
+
+  # Future forms
+  OPTIONAL {
+    ?lexeme ontolex:lexicalForm ?futureFormNode .
+    ?futureFormNode wikibase:grammaticalFeature wd:Q22716 ;  # future tense (same QID as the Hausa imperative forms; verify)
+                    wikibase:grammaticalFeature ?futureMood ;
+                    wikibase:grammaticalFeature ?futurePerson ;
+                    wikibase:grammaticalFeature ?futureNumber ;
+                    ontolex:representation ?futureForm .
+  }
+
+  # Past Imperfect forms
+  OPTIONAL {
+    ?lexeme ontolex:lexicalForm ?imperfectFormNode .
+    ?imperfectFormNode wikibase:grammaticalFeature wd:Q442485 ;  # imperfect tense
+                       wikibase:grammaticalFeature ?imperfectMood ;
+                       wikibase:grammaticalFeature ?imperfectPerson ;
+                       wikibase:grammaticalFeature ?imperfectNumber ;
+                       ontolex:representation ?pastImperfectForm .
+  }
+
+  # Perfect forms
+  OPTIONAL {
+    ?lexeme ontolex:lexicalForm ?perfectFormNode .
+    ?perfectFormNode wikibase:grammaticalFeature wd:Q442485 ;  # perfect tense (same QID as the imperfect above; verify)
+                     wikibase:grammaticalFeature ?perfectMood ;
+                     wikibase:grammaticalFeature ?perfectPerson ;
+                     wikibase:grammaticalFeature ?perfectNumber ;
+                     ontolex:representation ?perfectForm .
+  }
+
+  # Pluperfect forms
+  OPTIONAL {
+    ?lexeme ontolex:lexicalForm ?pluperfectFormNode .
+    ?pluperfectFormNode wikibase:grammaticalFeature wd:Q625581 ;  # pluperfect tense
+                        wikibase:grammaticalFeature ?pluperfectMood ;
+                        wikibase:grammaticalFeature ?pluperfectPerson ;
+                        wikibase:grammaticalFeature ?pluperfectNumber ;
+                        ontolex:representation ?pluperfectForm .
+  }
 }
+ORDER BY ?verb
+LIMIT 1000
\ No newline at end of file
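
A note on exercising the extended Latin query: Wikidata models lexemes so that a lexeme points to its form nodes via ontolex:lexicalForm, and each form node carries its spelling in ontolex:representation alongside its wikibase:grammaticalFeature values. The queries in this series are written to be pasted into https://query.wikidata.org/, but they can also be run programmatically. Below is a minimal sketch, not part of the patches themselves, that runs a trimmed-down version of the Latin verbs query (lemma plus present-tense representation only) with SPARQLWrapper, the library behind the sparql object that PATCH 6/8 imports from scribe_data.wikidata.wikidata_utils. The user agent string is a placeholder, and the feature QID is copied from the patch, so both should be checked before depending on the results.

    from SPARQLWrapper import JSON, SPARQLWrapper

    # Trimmed-down variant of the patch's query: lemma plus any
    # present-tense form representation.
    QUERY = """
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX wikibase: <http://wikiba.se/ontology#>
    PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>

    SELECT DISTINCT ?verb ?presentForm WHERE {
      ?lexeme dct:language wd:Q397 ;
              wikibase:lexicalCategory wd:Q24905 ;
              wikibase:lemma ?verb .
      OPTIONAL {
        ?lexeme ontolex:lexicalForm ?form .
        ?form wikibase:grammaticalFeature wd:Q192613 ;  # present-tense QID from the patch
              ontolex:representation ?presentForm .
      }
    }
    LIMIT 10
    """

    # Wikimedia endpoints expect a descriptive user agent; this value is
    # illustrative only.
    sparql = SPARQLWrapper(
        "https://query.wikidata.org/sparql",
        agent="scribe-data-example/0.1 (https://github.com/scribe-org/Scribe-Data)",
    )
    sparql.setQuery(QUERY)
    sparql.setReturnFormat(JSON)

    results = sparql.query().convert()
    for binding in results["results"]["bindings"]:
        verb = binding["verb"]["value"]
        present = binding.get("presentForm", {}).get("value", "n/a")
        print(f"{verb}\t{present}")

Run against the live endpoint, this prints up to ten lemma/present-form pairs, with "n/a" for lexemes that have no form tagged with the present-tense feature; widening the SELECT list and adding OPTIONAL blocks along the lines of the full patch brings in the remaining tenses.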