Commit

essay 4
Collins-Webdev committed Oct 22, 2024
1 parent 8f75976 commit ff56e71
Showing 1 changed file with 114 additions and 19 deletions.
133 changes: 114 additions & 19 deletions src/scribe_data/cli/get.py
@@ -1,5 +1,5 @@
"""
Function for controlling the data get process for the CLI.
Functions for getting language and data type packs for the Scribe-Data CLI.
.. raw:: html
<!--
@@ -20,6 +20,7 @@
-->
"""

import json
import subprocess
from pathlib import Path

@@ -30,6 +31,67 @@
DEFAULT_TSV_EXPORT_DIR,
)
from scribe_data.wikidata.query_data import query_data
from scribe_data.wikipedia.process_wiki import gen_autosuggestions
from scribe_data.wikidata.wikidata_utils import sparql

def load_lexeme_metadata():
"""
Load the lexeme form metadata from the JSON file.
"""
metadata_path = Path(__file__).parent / "lexeme_form.metadata.json"
try:
with open(metadata_path, "r", encoding="utf-8") as f:
return json.load(f)
except FileNotFoundError:
print(f"Warning: Could not find lexeme metadata file at {metadata_path}")
return {}
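
# A minimal usage sketch (the "nouns" key is hypothetical, for illustration):
#   metadata = load_lexeme_metadata()
#   print(metadata.get("nouns", {}))  # {} if the file is missing entirely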

def load_text_corpus(language):
"""
Load the text corpus for a given language, taking lexeme forms into account.

Parameters
----------
language : str
The language to load the corpus for.

Returns
-------
list
The processed text corpus.
"""
# Load lexeme metadata
lexeme_metadata = load_lexeme_metadata()

# Create SPARQL query to get relevant lexemes for the language
query = """
SELECT DISTINCT ?lexeme ?form ?representation WHERE {
?lexeme dct:language ?language .
?lexeme ontolex:lexicalForm ?form .
?form ontolex:representation ?representation .
# Filter for specific language
FILTER(LANG(?representation) = "%s")
}
LIMIT 10000
""" % language.lower()

sparql.setQuery(query)
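# SPARQLWrapper detail: the shared `sparql` instance must have its return
# format set to JSON for .convert() to yield the dict accessed below.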

try:
results = sparql.query().convert()
corpus = []

# Process results
for result in results["results"]["bindings"]:
representation = result["representation"]["value"]
corpus.append(representation)

return corpus

except Exception as e:
print(f"Error loading corpus for {language}: {str(e)}")
return []
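
# A minimal usage sketch, assuming the FILTER above receives an ISO code:
#   corpus = load_text_corpus("en")
#   print(corpus[:5])  # first few form representations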

def get_data(
language: str = None,
@@ -48,18 +110,25 @@ def get_data(
----------
language : str
The language(s) to get.
data_type : str
The data type(s) to get.
output_type : str
The output file type.
output_dir : str
The output directory path for results.
outputs_per_entry : str
How many outputs should be generated per data entry.
overwrite : bool (default: False)
Whether to overwrite existing files.
all : bool
Get all languages and data types.
interactive : bool (default: False)
Whether it's running in interactive mode.
@@ -82,29 +151,15 @@ def get_data(
languages = [language] if language else None
subprocess_result = False

# Load lexeme metadata
lexeme_metadata = load_lexeme_metadata()
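# Loaded once up front; the autosuggestions branch below bundles it with its output.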

# MARK: Get All
if all:
print("Updating all languages and data types ...")
query_data(None, None, None, overwrite)
subprocess_result = True

# MARK: Autosuggestions
elif data_type in {"autosuggestions", "autosuggestion"}:
if interactive:
print("\nNote: Autosuggestions functionality is being deprecated.")
print("In future versions, this will be replaced with an LLM-based approach.")
print("For now, you can still use the Jupyter notebook in the Scribe community.\n")

output_path = Path(output_dir) / language / "autosuggestions.json"
output_path.parent.mkdir(parents=True, exist_ok=True)

# Create empty autosuggestions file to maintain compatibility
if not output_path.exists() or overwrite:
with open(output_path, "w", encoding="utf-8") as f:
f.write("{}\n")

subprocess_result = True

# MARK: Emojis
elif data_type in {"emoji-keywords", "emoji_keywords"}:
for lang in languages:
@@ -115,11 +170,51 @@ def get_data(
/ "emoji_keywords"
/ "generate_emoji_keywords.py"
)

subprocess_result = subprocess.run(
["python", emoji_keyword_extraction_script]
)
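# subprocess.run returns a CompletedProcess object, which is truthy
# regardless of the script's exit code.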

# MARK: Autosuggestions
elif data_type in {"autosuggestions", "auto_suggestions"}:
for lang in languages:
print(f"Generating autosuggestions for {lang}...")

# Load text corpus with lexeme forms consideration
text_corpus = load_text_corpus(lang)

if text_corpus:
try:
# Generate autosuggestions using the loaded corpus
autosuggestions = gen_autosuggestions(
text_corpus,
language=lang,
num_words=500, # Default number of words
update_local_data=True,
verbose=interactive
)

# Save autosuggestions with lexeme metadata
output_path = Path(output_dir) / lang / "autosuggestions.json"
output_path.parent.mkdir(parents=True, exist_ok=True)

# Combine autosuggestions with lexeme metadata
output_data = {
"autosuggestions": autosuggestions,
"lexeme_metadata": lexeme_metadata
}
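# The saved file therefore has two top-level keys:
# "autosuggestions" and "lexeme_metadata".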

with open(output_path, "w", encoding="utf-8") as f:
json.dump(output_data, f, ensure_ascii=False, indent=2)

subprocess_result = True
print(f"Autosuggestions for {lang} generated and saved to {output_path}")

except Exception as e:
print(f"Error generating autosuggestions for {lang}: {str(e)}")
else:
print(f"No corpus data found for {lang}")

# MARK: Query Data
elif language or data_type:
data_type = data_type[0] if isinstance(data_type, list) else data_type
