diff --git a/.github/workflows/pr_maintainer_checklist.yaml b/.github/workflows/pr_maintainer_checklist.yaml
index ed33f7373..cd300ca5a 100644
--- a/.github/workflows/pr_maintainer_checklist.yaml
+++ b/.github/workflows/pr_maintainer_checklist.yaml
@@ -32,4 +32,6 @@ jobs:
               - The contributor's name and icon in remote commits should be the same as what appears in the PR
                 - If there's a mismatch, the contributor needs to make sure that the [email they use for GitHub](https://github.com/settings/emails) matches what they have for `git config user.email` in their local Scribe-Data repo
+            - [ ] The linting and formatting workflow within the [PR checks](https://github.com/scribe-org/Scribe-Data/pull/${{ github.event.pull_request.number }}/checks) does not indicate new errors in the files changed
+            - [ ] The [CHANGELOG](https://github.com/scribe-org/Scribe-Data/blob/main/CHANGELOG.md) has been updated with a description of the changes for the upcoming release and the corresponding issue (if necessary)
diff --git a/setup.py b/setup.py
index be6e2ae4a..d6006a57b 100644
--- a/setup.py
+++ b/setup.py
@@ -47,6 +47,11 @@
     long_description=long_description,
     long_description_content_type="text/markdown",
     url="https://github.com/scribe-org/Scribe-Data",
+    entry_points={
+        "console_scripts": [
+            "scribe-data=scribe_data.cli.main:main",
+        ],
+    }
 )

 if __name__ == "__main__":
diff --git a/src/scribe_data/cli/__init__.py b/src/scribe_data/cli/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/scribe_data/cli/cli_utils.py b/src/scribe_data/cli/cli_utils.py
new file mode 100644
index 000000000..7d3578aeb
--- /dev/null
+++ b/src/scribe_data/cli/cli_utils.py
@@ -0,0 +1,125 @@
+"""
+Utility functions for the Scribe-Data CLI.
+"""
+
+import json
+from pathlib import Path
+from typing import Dict, List, Union
+
+LANGUAGE_METADATA_FILE = (
+    Path(__file__).parent.parent / "resources" / "language_metadata.json"
+)
+WORD_TYPE_METADATA_FILE = (
+    Path(__file__).parent.parent / "resources" / "word_type_metadata.json"
+)
+DATA_DIR = Path("scribe_data_json_export")
+
+with LANGUAGE_METADATA_FILE.open("r", encoding="utf-8") as file:
+    language_metadata = json.load(file)
+
+with WORD_TYPE_METADATA_FILE.open("r", encoding="utf-8") as file:
+    word_type_metadata = json.load(file)
+
+language_map = {
+    lang["language"].lower(): lang for lang in language_metadata["languages"]
+}
+
+
+def correct_word_type(word_type: str) -> str:
+    """
+    Corrects common versions of word type arguments so users can choose between them.
+
+    Parameters
+    ----------
+    word_type : str
+        The word type to potentially correct.
+
+    Returns
+    -------
+    str
+        The word_type value or a corrected version of it.
+    """
+    all_word_types = word_type_metadata["word-types"]
+
+    if word_type in all_word_types:
+        return word_type
+
+    # Check whether a singular form was passed (e.g. noun -> nouns).
+    for wt in all_word_types:
+        if f"{word_type}s" == wt:
+            return wt
+
+
+def print_formatted_data(data: Union[Dict, List], word_type: str) -> None:
+    """
+    Prints a formatted output from the Scribe-Data CLI.
+
+    Parameters
+    ----------
+    data : Union[Dict, List]
+        The data to print.
+
+    word_type : str
+        The word type of the data.
+    """
+    if not data:
+        print(f"No data available for word type '{word_type}'.")
+        return
+
+    # Guard against list data, which has no keys to measure.
+    max_key_length = (
+        max((len(key) for key in data.keys()), default=0)
+        if isinstance(data, dict)
+        else 0
+    )
+
+    if word_type == "autosuggestions":
+        for key, value in data.items():
+            print(f"{key:<{max_key_length}} : {', '.join(value)}")
+
+    elif word_type == "emoji_keywords":
+        for key, value in data.items():
+            emojis = [item["emoji"] for item in value]
+            print(f"{key:<{max_key_length}} : {' '.join(emojis)}")
+
+    elif word_type in {"prepositions", "translations"}:
+        for key, value in data.items():
+            print(f"{key:<{max_key_length}} : {value}")
+
+    elif isinstance(data, dict):
+        for key, value in data.items():
+            if isinstance(value, dict):
+                print(f"{key:<{max_key_length}} : ")
+                max_sub_key_length = max(
+                    (len(sub_key) for sub_key in value.keys()), default=0
+                )
+                for sub_key, sub_value in value.items():
+                    print(f"  {sub_key:<{max_sub_key_length}} : {sub_value}")
+
+            elif isinstance(value, list):
+                print(f"{key:<{max_key_length}} : ")
+                for item in value:
+                    if isinstance(item, dict):
+                        for sub_key, sub_value in item.items():
+                            print(f"    {sub_key:<{max_key_length}} : {sub_value}")
+
+                    else:
+                        print(f"    {item}")
+
+            else:
+                print(f"{key:<{max_key_length}} : {value}")
+
+    elif isinstance(data, list):
+        for item in data:
+            if isinstance(item, dict):
+                for key, value in item.items():
+                    print(f"{key} : {value}")
+
+            else:
+                print(item)
+
+    else:
+        print(data)
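How `correct_word_type` behaves in practice — a minimal usage sketch, assuming Scribe-Data is installed so the resource JSONs load at import time:

```python
# Hypothetical usage; assumes the package and its resource files are installed.
from scribe_data.cli.cli_utils import correct_word_type

print(correct_word_type("nouns"))  # "nouns": already a canonical word type
print(correct_word_type("noun"))   # "nouns": singular form corrected to plural
print(correct_word_type("foo"))    # None: unknown types fall through silently
```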
+ """ + if not data: + print(f"No data available for word type '{word_type}'.") + return + + max_key_length = max((len(key) for key in data.keys()), default=0) + + if word_type == "autosuggestions": + for key, value in data.items(): + print(f"{key:<{max_key_length}} : {', '.join(value)}") + + elif word_type == "emoji_keywords": + for key, value in data.items(): + emojis = [item["emoji"] for item in value] + print(f"{key:<{max_key_length}} : {' '.join(emojis)}") + + elif word_type in {"prepositions", "translations"}: + for key, value in data.items(): + print(f"{key:<{max_key_length}} : {value}") + + elif isinstance(data, dict): + for key, value in data.items(): + if isinstance(value, dict): + print(f"{key:<{max_key_length}} : ") + max_sub_key_length = max( + (len(sub_key) for sub_key in value.keys()), default=0 + ) + for sub_key, sub_value in value.items(): + print(f" {sub_key:<{max_sub_key_length}} : {sub_value}") + + elif isinstance(value, list): + print(f"{key:<{max_key_length}} : ") + for item in value: + if isinstance(item, dict): + for sub_key, sub_value in item.items(): + print(f" {sub_key:<{max_key_length}} : {sub_value}") + + else: + print(f" {item}") + + else: + print(f"{key:<{max_key_length}} : {value}") + + elif isinstance(data, list): + for item in data: + if isinstance(item, dict): + for key, value in item.items(): + print(f"{key} : {value}") + + else: + print(item) + + else: + print(data) diff --git a/src/scribe_data/cli.py b/src/scribe_data/cli/convert.py similarity index 90% rename from src/scribe_data/cli.py rename to src/scribe_data/cli/convert.py index 3a9e570c6..cb7a3fc8a 100644 --- a/src/scribe_data/cli.py +++ b/src/scribe_data/cli/convert.py @@ -1,5 +1,5 @@ """ -Setup and commands for the Scribe-Data command line interface. +Functions to convert data returned from the Scribe-Data CLI to other file types. .. raw:: html +""" + +import json +from pathlib import Path + +from .cli_utils import correct_word_type + +# Load language metadata from JSON file. +METADATA_FILE = Path(__file__).parent.parent / "resources" / "language_metadata.json" +LANGUAGE_DATA_EXTRACTION_DIR = Path(__file__).parent.parent / "language_data_extraction" + +with METADATA_FILE.open("r", encoding="utf-8") as file: + language_metadata = json.load(file) + +language_map = { + lang["language"].lower(): lang for lang in language_metadata["languages"] +} + + +def list_languages() -> None: + """ + Generates a table of languages, their ISO-2 codes and their Wikidata QIDs. + """ + languages = list(language_metadata["languages"]) + languages.sort(key=lambda x: x["language"]) + + language_col_width = max(len(lang["language"]) for lang in languages) + 2 + iso_col_width = max(len(lang["iso"]) for lang in languages) + 2 + qid_col_width = max(len(lang["qid"]) for lang in languages) + 2 + + table_line_length = language_col_width + iso_col_width + qid_col_width + + print() + print( + f"{'Language':<{language_col_width}} {'ISO':<{iso_col_width}} {'QID':<{qid_col_width}}" + ) + print("-" * table_line_length) + + for lang in languages: + print( + f"{lang['language'].capitalize():<{language_col_width}} {lang['iso']:<{iso_col_width}} {lang['qid']:<{qid_col_width}}" + ) + + print("-" * table_line_length) + print() + + +def list_word_types(language: str = None) -> None: + """ + Lists all word types or those available for a given language. + + Parameters + ---------- + language : str + The language to potentially list word types for. 
+ """ + if language: + language_data = language_map.get(language.lower()) + language_capitalized = language.capitalize() + language_dir = LANGUAGE_DATA_EXTRACTION_DIR / language_capitalized + + if not language_data: + raise ValueError(f"Language '{language}' is not recognized.") + + word_types = [f.name for f in language_dir.iterdir() if f.is_dir()] + if not word_types: + raise ValueError( + f"No word types available for language '{language_capitalized}'." + ) + + table_header = f"Available word types: {language_capitalized}" + + else: + word_types = set() + for lang in language_metadata["languages"]: + language_dir = LANGUAGE_DATA_EXTRACTION_DIR / lang["language"].capitalize() + if language_dir.is_dir(): + word_types.update(f.name for f in language_dir.iterdir() if f.is_dir()) + + table_header = "Available word types: All languages" + + table_line_length = max(len(table_header), max(len(wt) for wt in word_types)) + + print() + print(table_header) + print("-" * table_line_length) + + word_types = sorted(word_types) + for wt in word_types: + print(wt) + + print("-" * table_line_length) + print() + + +def list_all() -> None: + """ + Lists all available languages and word types. + """ + list_languages() + list_word_types() + + +def list_languages_for_word_type(word_type: str) -> None: + """ + Lists the available languages for a given word type. + + Parameters + ---------- + word_type : str + The word type to check for. + """ + word_type = correct_word_type(word_type) + available_languages = [] + for lang in language_metadata["languages"]: + language_dir = LANGUAGE_DATA_EXTRACTION_DIR / lang["language"].capitalize() + if language_dir.is_dir(): + wt_path = language_dir / word_type + if wt_path.exists(): + available_languages.append(lang["language"]) + + available_languages.sort() + table_header = f"Available languages: {word_type}" + table_line_length = max( + len(table_header), max(len(lang) for lang in available_languages) + ) + + print() + print(table_header) + print("-" * table_line_length) + + for lang in available_languages: + print(f"{lang.capitalize()}") + + print("-" * table_line_length) + print() + + +def list_wrapper(language: str = None, word_type: str = None) -> None: + """ + Conditionally provides the full functionality of the list command. + + Parameters + ---------- + language : str + The language to potentially list word types for. + + word_type : str + The word type to check for. + """ + if not language and not word_type: + list_all() + + elif language is True and not word_type: + list_languages() + + elif not language and word_type is True: + list_word_types() + + elif language is True and word_type is True: + print("Please specify either a language or a word type.") + + elif word_type is not None: + list_languages_for_word_type(word_type) + + elif language is not None: + list_word_types(language) diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py new file mode 100644 index 000000000..c057c53de --- /dev/null +++ b/src/scribe_data/cli/main.py @@ -0,0 +1,163 @@ +""" +Setup and commands for the Scribe-Data command line interface. + +.. raw:: html + +""" + +#!/usr/bin/env python3 +import argparse + +from .list import list_wrapper +from .query import query_data + +LIST_DESCRIPTION = "List languages and word types that Scribe-Data can be used for." +QUERY_DESCRIPTION = "Query data from Wikidata for given languages and word types." +CONVERT_DESCRIPTION = "Convert data returned by Scribe-Data to different file types." 
diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py
new file mode 100644
index 000000000..c057c53de
--- /dev/null
+++ b/src/scribe_data/cli/main.py
@@ -0,0 +1,163 @@
+#!/usr/bin/env python3
+"""
+Setup and commands for the Scribe-Data command line interface.
+"""
+
+import argparse
+
+from .list import list_wrapper
+from .query import query_data
+
+LIST_DESCRIPTION = "List languages and word types that Scribe-Data can be used for."
+QUERY_DESCRIPTION = "Query data from Wikidata for given languages and word types."
+CONVERT_DESCRIPTION = "Convert data returned by Scribe-Data to different file types."
+CLI_EPILOG = "Visit the codebase at https://github.com/scribe-org/Scribe-Data and documentation at https://scribe-data.readthedocs.io/en/latest/ to learn more!"
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        prog="Scribe-Data",
+        description="The Scribe-Data CLI is a tool to query language data from Wikidata and other sources.",
+        epilog=CLI_EPILOG,
+    )
+    subparsers = parser.add_subparsers(dest="command", required=True)
+
+    parser._actions[0].help = "Show this help message and exit."
+    parser.add_argument(
+        "-v", "--verbose", help="Increase output verbosity.", action="store_true"
+    )
+    parser.add_argument("-u", "--update", help="Update the Scribe-Data CLI.")
+
+    # MARK: List
+
+    list_parser = subparsers.add_parser(
+        "list",
+        aliases=["l"],
+        help=LIST_DESCRIPTION,
+        description=LIST_DESCRIPTION,
+        epilog=CLI_EPILOG,
+    )
+    list_parser._actions[0].help = "Show this help message and exit."
+    list_parser.add_argument(
+        "--language",
+        "-lang",
+        nargs="?",
+        const=True,
+        help="Run the list command on all or given languages.",
+    )
+    list_parser.add_argument(
+        "--word-type",
+        "-wt",
+        nargs="?",
+        const=True,
+        help="Run the list command on all or given word types.",
+    )
+
+    # MARK: Query
+
+    query_parser = subparsers.add_parser(
+        "query",
+        aliases=["q"],
+        help=QUERY_DESCRIPTION,
+        description=QUERY_DESCRIPTION,
+        epilog=CLI_EPILOG,
+    )
+    query_parser._actions[0].help = "Show this help message and exit."
+    query_parser.add_argument(
+        "-lang", "--language", type=str, help="The language(s) to query."
+    )
+    query_parser.add_argument(
+        "-wt", "--word-type", type=str, help="The word type(s) to query."
+    )
+    query_parser.add_argument(
+        "-od", "--output-dir", type=str, help="The output directory path for results."
+    )
+    query_parser.add_argument(
+        "-ot",
+        "--output-type",
+        type=str,
+        choices=["json", "csv", "tsv"],
+        help="The output file type.",
+    )
+    query_parser.add_argument(
+        "-o",
+        "--overwrite",
+        action="store_true",
+        help="Whether to overwrite existing files (default: False).",
+    )
+
+    # MARK: Convert
+
+    convert_parser = subparsers.add_parser(
+        "convert",
+        aliases=["c"],
+        help=CONVERT_DESCRIPTION,
+        description=CONVERT_DESCRIPTION,
+        epilog=CLI_EPILOG,
+    )
+    convert_parser._actions[0].help = "Show this help message and exit."
+    convert_parser.add_argument(
+        "-f", "--file", type=str, help="The file to convert to a new type."
+    )
+    convert_parser.add_argument(
+        "-ko",
+        "--keep-original",
+        action="store_false",
+        help="Whether to keep the file to be converted (default: True).",
+    )
+    convert_parser.add_argument(
+        "-json", "--to-json", type=str, help="Convert the file to JSON format."
+    )
+    convert_parser.add_argument(
+        "-csv", "--to-csv", type=str, help="Convert the file to CSV format."
+    )
+    convert_parser.add_argument(
+        "-tsv", "--to-tsv", type=str, help="Convert the file to TSV format."
+    )
+    convert_parser.add_argument(
+        "-sqlite", "--to-sqlite", type=str, help="Convert the file to SQLite format."
+    )
+
+    # MARK: Setup CLI
+
+    args = parser.parse_args()
+
+    if args.command in ["list", "l"]:
+        list_wrapper(args.language, args.word_type)
+
+    elif args.command in ["query", "q"]:
+        query_data(
+            args.language,
+            args.word_type,
+            args.output_dir,
+            args.overwrite,
+            args.output_type,
+        )
+
+    elif args.command in ["convert", "c"]:
+        # Conversion is not yet wired into the CLI dispatch.
+        return
+
+    else:
+        parser.print_help()
+
+
+if __name__ == "__main__":
+    main()
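A hypothetical smoke test of the new entry point; the import path assumes the `console_scripts` hook from `setup.py` above, and patching `sys.argv` stands in for a real shell invocation:

```python
# Hypothetical check that the CLI dispatches correctly; assumes Scribe-Data is installed.
import sys

from scribe_data.cli.main import main

sys.argv = ["scribe-data", "list", "--language"]
main()  # dispatches to list_wrapper(True, None), printing the language table
```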
diff --git a/src/scribe_data/cli/query.py b/src/scribe_data/cli/query.py
new file mode 100644
index 000000000..f2b629905
--- /dev/null
+++ b/src/scribe_data/cli/query.py
@@ -0,0 +1,193 @@
+"""
+Functions for querying language and word type data packs for the Scribe-Data CLI.
+"""
+
+import csv
+import json
+from pathlib import Path
+from typing import Optional
+
+from .cli_utils import language_map
+
+DATA_DIR = Path("scribe_data_json_export")
+
+
+def query_data(
+    language: str = None,
+    word_type: str = None,
+    output_dir: Optional[str] = None,
+    overwrite: bool = False,
+    output_type: Optional[str] = None,
+) -> None:
+    """
+    Queries data for the given language and word type and exports it to the chosen file type.
+    """
+    if not (language or word_type):
+        raise ValueError(
+            "You must provide either a --language (-lang) or --word-type (-wt) option."
+        )
+
+    if output_dir:
+        output_dir = Path(output_dir)
+        if not output_dir.exists():
+            output_dir.mkdir(parents=True, exist_ok=True)
+
+        if output_type == "json" or output_type is None:
+            export_json(language, word_type, output_dir, overwrite)
+
+        elif output_type in ["csv", "tsv"]:
+            export_csv_or_tsv(language, word_type, output_dir, overwrite, output_type)
+
+        else:
+            raise ValueError(
+                "Unsupported output type. Please use 'json', 'csv', or 'tsv'."
+            )
+
+    else:
+        raise ValueError("Please specify an output directory using --output-dir (-od).")
+
+
+def export_json(
+    language: str, word_type: str, output_dir: Path, overwrite: bool
+) -> None:
+    normalized_language = language_map.get(language.lower())
+    language_capitalized = language.capitalize()
+    if not normalized_language:
+        raise ValueError(f"Language '{language_capitalized}' is not recognized.")
+
+    data_file = (
+        DATA_DIR / normalized_language["language"].capitalize() / f"{word_type}.json"
+    )
+
+    if not data_file.exists():
+        print(
+            f"No data found for language '{normalized_language['language']}' and word type '{word_type}'."
+        )
+        return
+
+    try:
+        with data_file.open("r", encoding="utf-8") as file:
+            data = json.load(file)
+
+    except (IOError, json.JSONDecodeError) as e:
+        print(f"Error reading '{data_file}': {e}")
+        return
+
+    # Adjust the output directory for JSON exports.
+    json_output_dir = (
+        output_dir
+        / "scribe_data_json_export"
+        / normalized_language["language"].capitalize()
+    )
+    json_output_dir.mkdir(parents=True, exist_ok=True)
+
+    output_file = json_output_dir / f"{word_type}.json"
+    if output_file.exists() and not overwrite:
+        user_input = input(f"File '{output_file}' already exists. Overwrite? (y/n): ")
+        if user_input.lower() != "y":
+            print(f"Skipping {normalized_language['language']} - {word_type}")
+            return
+
+    try:
+        with output_file.open("w", encoding="utf-8") as file:
+            json.dump(data, file, indent=2)
+
+    except IOError as e:
+        raise IOError(f"Error writing to '{output_file}': {e}") from e
+
+    print(
+        f"Data for language '{normalized_language['language']}' and word type '{word_type}' written to '{output_file}'"
+    )
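A hypothetical direct call to `query_data`; it assumes a `scribe_data_json_export/English/nouns.json` file exists under the working directory (the `DATA_DIR` above), and the omitted `output_type` defaults to JSON export:

```python
# Hypothetical usage; the input data file is assumed to already exist.
from scribe_data.cli.query import query_data

query_data(
    language="english",
    word_type="nouns",
    output_dir="./exports",
    overwrite=True,
)
# Writes ./exports/scribe_data_json_export/English/nouns.json
```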
+
+
+def export_csv_or_tsv(
+    language: str, word_type: str, output_dir: Path, overwrite: bool, output_type: str
+) -> None:
+    normalized_language = language_map.get(language.lower())
+    if not normalized_language:
+        print(f"Language '{language}' is not recognized.")
+        return
+
+    data_file = (
+        DATA_DIR / normalized_language["language"].capitalize() / f"{word_type}.json"
+    )
+    if not data_file.exists():
+        print(
+            f"No data found for language '{normalized_language['language']}' and word type '{word_type}'."
+        )
+        return
+
+    try:
+        with data_file.open("r", encoding="utf-8") as file:
+            data = json.load(file)
+
+    except (IOError, json.JSONDecodeError) as e:
+        print(f"Error reading '{data_file}': {e}")
+        return
+
+    if output_type == "csv":
+        delimiter = ","
+        file_extension = "csv"
+
+    elif output_type == "tsv":
+        delimiter = "\t"
+        file_extension = "tsv"
+
+    else:
+        print(f"Unsupported output type '{output_type}'.")
+        return
+
+    # Adjust the output directory for CSV exports.
+    csv_output_dir = (
+        output_dir
+        / "scribe_data_csv_export"
+        / normalized_language["language"].capitalize()
+    )
+    csv_output_dir.mkdir(parents=True, exist_ok=True)
+
+    output_file = csv_output_dir / f"{word_type}.{file_extension}"
+    if output_file.exists() and not overwrite:
+        user_input = input(f"File '{output_file}' already exists. Overwrite? (y/n): ")
+        if user_input.lower() != "y":
+            print(f"Skipping {normalized_language['language']} - {word_type}")
+            return
+
+    try:
+        with output_file.open("w", newline="", encoding="utf-8") as file:
+            writer = csv.writer(file, delimiter=delimiter)
+            if isinstance(data, dict):
+                for key, value in data.items():
+                    writer.writerow([key, value])
+
+            elif isinstance(data, list):
+                for item in data:
+                    if isinstance(item, dict):
+                        writer.writerow(item.values())
+
+                    else:
+                        writer.writerow([item])
+
+            else:
+                print(f"Unsupported data format for {output_type} export.")
+
+    except IOError as e:
+        print(f"Error writing to '{output_file}': {e}")
+        return
+
+    print(
+        f"Data for language '{normalized_language['language']}' and word type '{word_type}' written to '{output_file}'"
+    )
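A standalone sketch of the delimiter-by-type selection that `export_csv_or_tsv` applies, using made-up data and an in-memory buffer instead of a file:

```python
# Sketch of the CSV/TSV delimiter choice; data is illustrative.
import csv
import io

data = {"Haus": "house", "Baum": "tree"}

for output_type, delimiter in (("csv", ","), ("tsv", "\t")):
    buffer = io.StringIO()
    writer = csv.writer(buffer, delimiter=delimiter)
    for key, value in data.items():
        writer.writerow([key, value])
    print(f"{output_type}: {buffer.getvalue()!r}")
```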
+ "ignore-words": "words that should be removed from the autosuggestion generation process." } }, "languages": [ diff --git a/src/scribe_data/resources/word_type_metadata.json b/src/scribe_data/resources/word_type_metadata.json new file mode 100644 index 000000000..3479d94c6 --- /dev/null +++ b/src/scribe_data/resources/word_type_metadata.json @@ -0,0 +1,10 @@ +{ + "word-types": [ + "autosuggestions", + "emoji_keywords", + "nouns", + "prepositions", + "translations", + "verbs" + ] +} diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index 13a0da4c0..ca056f8cc 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -62,7 +62,7 @@ def _load_json(package_path: str, file_name: str, root: str): _languages = _load_json( package_path="scribe_data.resources", - file_name="language_meta_data.json", + file_name="language_metadata.json", root="languages", ) diff --git a/src/scribe_data/wikipedia/extract_wiki.py b/src/scribe_data/wikipedia/extract_wiki.py index ed97210c9..7c7581650 100644 --- a/src/scribe_data/wikipedia/extract_wiki.py +++ b/src/scribe_data/wikipedia/extract_wiki.py @@ -182,7 +182,7 @@ def iterate_and_parse_file(args): partitions_dir : str The path to where output file should be stored. - article_limit : int optional (default=None) + article_limit : int (default=None) An optional article_limit of the number of articles to find. verbose : bool (default=True)