Commit

essay 4
Collins-Webdev committed Oct 22, 2024
1 parent 8f75976 commit ff56e71
Showing 1 changed file with 114 additions and 19 deletions.
133 changes: 114 additions & 19 deletions src/scribe_data/cli/get.py
@@ -1,5 +1,5 @@
"""
Function for controlling the data get process for the CLI.
Functions for getting language and data type packs for the Scribe-Data CLI.
.. raw:: html
<!--
@@ -20,6 +20,7 @@
-->
"""

import json
import subprocess
from pathlib import Path

@@ -30,6 +31,67 @@
DEFAULT_TSV_EXPORT_DIR,
)
from scribe_data.wikidata.query_data import query_data
from scribe_data.wikipedia.process_wiki import gen_autosuggestions
from scribe_data.wikidata.wikidata_utils import sparql

def load_lexeme_metadata():
"""
Load the lexeme form metadata from the JSON file.
"""
metadata_path = Path(__file__).parent / "lexeme_form.metadata.json"
try:
with open(metadata_path, "r", encoding="utf-8") as f:
return json.load(f)
except FileNotFoundError:
print(f"Warning: Could not find lexeme metadata file at {metadata_path}")
return {}
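
# A minimal usage sketch (the "nouns" key is hypothetical, for illustration):
#   metadata = load_lexeme_metadata()
#   print(metadata.get("nouns", {}))  # {} if the file is missing entirely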

def load_text_corpus(language):
"""
Load the text corpus for a given language, taking lexeme forms into account.

Parameters
----------
language : str
The language to load the corpus for.

Returns
-------
list
The processed text corpus.
"""
# Load lexeme metadata
lexeme_metadata = load_lexeme_metadata()

# Create SPARQL query to get relevant lexemes for the language
query = """
SELECT DISTINCT ?lexeme ?form ?representation WHERE {
?lexeme dct:language ?language .
?lexeme ontolex:lexicalForm ?form .
?form ontolex:representation ?representation .
# Filter for specific language
FILTER(LANG(?representation) = "%s")
}
LIMIT 10000
""" % language.lower()

sparql.setQuery(query)
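# SPARQLWrapper detail: the shared `sparql` instance must have its return
# format set to JSON for .convert() to yield the dict accessed below.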

try:
results = sparql.query().convert()
corpus = []

# Process results
for result in results["results"]["bindings"]:
representation = result["representation"]["value"]
corpus.append(representation)

return corpus

except Exception as e:
print(f"Error loading corpus for {language}: {str(e)}")
return []
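
# A minimal usage sketch, assuming the FILTER above receives an ISO code:
#   corpus = load_text_corpus("en")
#   print(corpus[:5])  # first few form representations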

def get_data(
language: str = None,
@@ -48,18 +110,25 @@ def get_data(
----------
language : str
The language(s) to get.
data_type : str
The data type(s) to get.
output_type : str
The output file type.
output_dir : str
The output directory path for results.
outputs_per_entry : str
How many outputs should be generated per data entry.
overwrite : bool (default: False)
Whether to overwrite existing files.
all : bool
Get all languages and data types.
interactive : bool (default: False)
Whether it's running in interactive mode.
@@ -82,29 +151,15 @@ def get_data(
languages = [language] if language else None
subprocess_result = False

# Load lexeme metadata
lexeme_metadata = load_lexeme_metadata()
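# Loaded once up front; the autosuggestions branch below bundles it with its output.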

# MARK: Get All
if all:
print("Updating all languages and data types ...")
query_data(None, None, None, overwrite)
subprocess_result = True

# MARK: Autosuggestions
elif data_type in {"autosuggestions", "autosuggestion"}:
if interactive:
print("\nNote: Autosuggestions functionality is being deprecated.")
print("In future versions, this will be replaced with an LLM-based approach.")
print("For now, you can still use the Jupyter notebook in the Scribe community.\n")

output_path = Path(output_dir) / language / "autosuggestions.json"
output_path.parent.mkdir(parents=True, exist_ok=True)

# Create empty autosuggestions file to maintain compatibility
if not output_path.exists() or overwrite:
with open(output_path, "w", encoding="utf-8") as f:
f.write("{}\n")

subprocess_result = True

# MARK: Emojis
elif data_type in {"emoji-keywords", "emoji_keywords"}:
for lang in languages:
@@ -115,11 +170,51 @@ def get_data(
/ "emoji_keywords"
/ "generate_emoji_keywords.py"
)

subprocess_result = subprocess.run(
["python", emoji_keyword_extraction_script]
)
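# subprocess.run returns a CompletedProcess object, which is truthy
# regardless of the script's exit code.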

# MARK: Autosuggestions
elif data_type in {"autosuggestions", "auto_suggestions"}:
for lang in languages:
print(f"Generating autosuggestions for {lang}...")

# Load text corpus with lexeme forms consideration
text_corpus = load_text_corpus(lang)

if text_corpus:
try:
# Generate autosuggestions using the loaded corpus
autosuggestions = gen_autosuggestions(
text_corpus,
language=lang,
num_words=500, # Default number of words
update_local_data=True,
verbose=interactive
)

# Save autosuggestions with lexeme metadata
output_path = Path(output_dir) / lang / "autosuggestions.json"
output_path.parent.mkdir(parents=True, exist_ok=True)

# Combine autosuggestions with lexeme metadata
output_data = {
"autosuggestions": autosuggestions,
"lexeme_metadata": lexeme_metadata
}
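# The saved file therefore has two top-level keys:
# "autosuggestions" and "lexeme_metadata".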

with open(output_path, "w", encoding="utf-8") as f:
json.dump(output_data, f, ensure_ascii=False, indent=2)

subprocess_result = True
print(f"Autosuggestions for {lang} generated and saved to {output_path}")

except Exception as e:
print(f"Error generating autosuggestions for {lang}: {str(e)}")
else:
print(f"No corpus data found for {lang}")

# MARK: Query Data
elif language or data_type:
data_type = data_type[0] if isinstance(data_type, list) else data_type
