diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index f5fca3fde..491488d10 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -1,5 +1,5 @@ """ -Function for controlling the data get process for the CLI. +Functions for getting languages-data types packs for the Scribe-Data CLI. .. raw:: html """ +import json import subprocess from pathlib import Path @@ -30,6 +31,67 @@ DEFAULT_TSV_EXPORT_DIR, ) from scribe_data.wikidata.query_data import query_data +from scribe_data.wikipedia.process_wiki import gen_autosuggestions +from scribe_data.wikidata.wikidata_utils import sparql + +def load_lexeme_metadata(): + """ + Load the lexeme form metadata from the JSON file. + """ + metadata_path = Path(__file__).parent / "lexeme_form.metadata.json" + try: + with open(metadata_path, 'r', encoding='utf-8') as f: + return json.load(f) + except FileNotFoundError: + print(f"Warning: Could not find lexeme metadata file at {metadata_path}") + return {} + +def load_text_corpus(language): + """ + Load the text corpus for a given language with consideration for lexeme forms. + + Parameters + ---------- + language : str + The language to load the corpus for + + Returns + ------- + list + The processed text corpus + """ + # Load lexeme metadata + lexeme_metadata = load_lexeme_metadata() + + # Create SPARQL query to get relevant lexemes for the language + query = """ + SELECT DISTINCT ?lexeme ?form ?representation WHERE { + ?lexeme dct:language ?language . + ?lexeme ontolex:lexicalForm ?form . + ?form ontolex:representation ?representation . + + # Filter for specific language + FILTER(LANG(?representation) = "%s") + } + LIMIT 10000 + """ % language.lower() + + sparql.setQuery(query) + + try: + results = sparql.query().convert() + corpus = [] + + # Process results + for result in results["results"]["bindings"]: + representation = result["representation"]["value"] + corpus.append(representation) + + return corpus + + except Exception as e: + print(f"Error loading corpus for {language}: {str(e)}") + return [] def get_data( language: str = None, @@ -48,18 +110,25 @@ def get_data( ---------- language : str The language(s) to get. + data_type : str The data type(s) to get. + output_type : str The output file type. + output_dir : str The output directory path for results. + outputs_per_entry : str How many outputs should be generated per data entry. + overwrite : bool (default: False) Whether to overwrite existing files. + all : bool Get all languages and data types. + interactive : bool (default: False) Whether it's running in interactive mode. @@ -82,29 +151,15 @@ def get_data( languages = [language] if language else None subprocess_result = False + # Load lexeme metadata + lexeme_metadata = load_lexeme_metadata() + # MARK: Get All if all: print("Updating all languages and data types ...") query_data(None, None, None, overwrite) subprocess_result = True - # MARK: Autosuggestions - elif data_type in {"autosuggestions", "autosuggestion"}: - if interactive: - print("\nNote: Autosuggestions functionality is being deprecated.") - print("In future versions, this will be replaced with an LLM-based approach.") - print("For now, you can still use the Jupyter notebook in the Scribe community.\n") - - output_path = Path(output_dir) / language / "autosuggestions.json" - output_path.parent.mkdir(parents=True, exist_ok=True) - - # Create empty autosuggestions file to maintain compatibility - if not output_path.exists() or overwrite: - with open(output_path, "w", encoding="utf-8") as f: - f.write("{}\n") - - subprocess_result = True - # MARK: Emojis elif data_type in {"emoji-keywords", "emoji_keywords"}: for lang in languages: @@ -115,11 +170,51 @@ def get_data( / "emoji_keywords" / "generate_emoji_keywords.py" ) - + subprocess_result = subprocess.run( ["python", emoji_keyword_extraction_script] ) + # MARK: Autosuggestions + elif data_type in {"autosuggestions", "auto_suggestions"}: + for lang in languages: + print(f"Generating autosuggestions for {lang}...") + + # Load text corpus with lexeme forms consideration + text_corpus = load_text_corpus(lang) + + if text_corpus: + try: + # Generate autosuggestions using the loaded corpus + autosuggestions = gen_autosuggestions( + text_corpus, + language=lang, + num_words=500, # Default number of words + update_local_data=True, + verbose=interactive + ) + + # Save autosuggestions with lexeme metadata + output_path = Path(output_dir) / lang / "autosuggestions.json" + output_path.parent.mkdir(parents=True, exist_ok=True) + + # Combine autosuggestions with lexeme metadata + output_data = { + "autosuggestions": autosuggestions, + "lexeme_metadata": lexeme_metadata + } + + with open(output_path, 'w', encoding='utf-8') as f: + json.dump(output_data, f, ensure_ascii=False, indent=2) + + subprocess_result = True + print(f"Autosuggestions for {lang} generated and saved to {output_path}") + + except Exception as e: + print(f"Error generating autosuggestions for {lang}: {str(e)}") + else: + print(f"No corpus data found for {lang}") + # MARK: Query Data elif language or data_type: data_type = data_type[0] if isinstance(data_type, list) else data_type