From 179184b933f7c2ea8f295146c29b7304febcaff4 Mon Sep 17 00:00:00 2001
From: Mahfuza Humayra Mohona
Date: Tue, 4 Jun 2024 08:19:25 +0600
Subject: [PATCH 01/18] add script for all language list

---
 src/scribe_data/cli.py | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/src/scribe_data/cli.py b/src/scribe_data/cli.py
index 0b2872598..60385354d 100644
--- a/src/scribe_data/cli.py
+++ b/src/scribe_data/cli.py
@@ -1,3 +1,30 @@
 """
 Setup and commands for the Scribe-Data command line interface.
 """
+
+import os
+import argparse
+
+def list_languages():
+    languages = [lang for lang in os.listdir('language_data_export') if os.path.isdir(f"language_data_export/{lang}")]
+    print("Available languages:")
+    for lang in languages:
+        print(f"- {lang}")
+        word_types = [wt.replace('.json', '') for wt in os.listdir(f"language_data_export/{lang}") if wt.endswith('.json')]
+        max_word_type_length = max(len(wt) for wt in word_types)
+        for wt in word_types:
+            print(f" - {wt:<{max_word_type_length}}")
+    print("")
+
+def main():
+    parser = argparse.ArgumentParser(description='Scribe-Data CLI Tool')
+    parser.add_argument('--list-languages', '-ll', action='store_true', help='List available language codes and word types')
+    args = parser.parse_args()
+
+    if args.list_languages:
+        list_languages()
+    else:
+        parser.print_help()
+
+if __name__ == '__main__':
+    main()
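
A quick way to exercise this first patch without installing anything is to call its main() with a fabricated argv. The snippet below is a hypothetical smoke test, not part of the patch series; it assumes it is run from src/scribe_data/ with a language_data_export/ directory present, containing one subfolder per language:

    import sys

    import cli  # the module added by the patch above

    sys.argv = ["scribe-data", "--list-languages"]  # argv exactly as argparse will see it
    cli.main()  # prints each language, with its word types indented beneath it
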
""" -import os import argparse +import json +from pathlib import Path +from typing import Dict, List, Union -def list_languages(): - languages = [lang for lang in os.listdir('language_data_export') if os.path.isdir(f"language_data_export/{lang}")] +DATA_DIR = Path('language_data_export') + +def list_languages() -> None: + if not DATA_DIR.exists() or not DATA_DIR.is_dir(): + print(f"Directory '{DATA_DIR}' does not exist.") + return + + languages = [lang for lang in DATA_DIR.iterdir() if lang.is_dir()] print("Available languages:") for lang in languages: - print(f"- {lang}") - word_types = [wt.replace('.json', '') for wt in os.listdir(f"language_data_export/{lang}") if wt.endswith('.json')] + print(f"- {lang.name}") + word_types = [wt.stem for wt in lang.glob('*.json')] max_word_type_length = max(len(wt) for wt in word_types) for wt in word_types: print(f" - {wt:<{max_word_type_length}}") - print("") + print("") + +def print_formatted_data(data: Union[Dict, List]) -> None: + if isinstance(data, dict): + max_key_length = max(len(key) for key in data.keys()) + for key, value in data.items(): + if isinstance(value, dict): + print(f"{key:<{max_key_length}} : ") + max_sub_key_length = max(len(sub_key) for sub_key in value.keys()) + for sub_key, sub_value in value.items(): + print(f" {sub_key:<{max_sub_key_length}} : {sub_value}") + else: + print(f"{key:<{max_key_length}} : {value}") + elif isinstance(data, list): + max_key_length = max(len(key) for item in data for key in item.keys()) + for item in data: + if isinstance(item, dict): + for key, value in item.items(): + print(f"{key:<{max_key_length}} : ") + if isinstance(value, dict): + max_sub_key_length = max(len(sub_key) for sub_key in value.keys()) + for sub_key, sub_value in value.items(): + print(f" {sub_key:<{max_sub_key_length}} : {sub_value}") + else: + print(f" {value}") + else: + print(json.dumps(item, indent=2)) + else: + print(data) + +def query_data(language: str, word_type: str) -> None: + data_file = DATA_DIR / language / f"{word_type}.json" + if not data_file.exists(): + print(f"No data found for language '{language}' and word type '{word_type}'.") + return -def main(): + try: + with data_file.open('r') as file: + data = json.load(file) + except (IOError, json.JSONDecodeError) as e: + print(f"Error reading '{data_file}': {e}") + return + + print(f"Data for language '{language}' and word type '{word_type}':") + print_formatted_data(data) + +def main() -> None: parser = argparse.ArgumentParser(description='Scribe-Data CLI Tool') - parser.add_argument('--list-languages', '-ll', action='store_true', help='List available language codes and word types') + subparsers = parser.add_subparsers(dest='command') + + subparsers.add_parser('list-languages', help='List available language codes and word types') + + query_parser = subparsers.add_parser('query', help='Query data for a specific language and word type') + query_parser.add_argument('-l', '--language', required=True, help='Language code') + query_parser.add_argument('-wt', '--word-type', required=True, help='Word type') + args = parser.parse_args() - if args.list_languages: + if args.command == 'list-languages': list_languages() + elif args.command == 'query': + query_data(args.language, args.word_type) else: parser.print_help() From 4731c6da3527a02a8990901641287bef8f33cf46 Mon Sep 17 00:00:00 2001 From: Mahfuza Humayra Mohona Date: Thu, 6 Jun 2024 08:02:01 +0600 Subject: [PATCH 03/18] update the commands --- setup.py | 5 ++ src/scribe_data/cli.py | 122 
++++++++++++++++++++++++++++------------- 2 files changed, 90 insertions(+), 37 deletions(-) diff --git a/setup.py b/setup.py index ba2fbdd76..948e87112 100644 --- a/setup.py +++ b/setup.py @@ -47,6 +47,11 @@ long_description=long_description, long_description_content_type="text/markdown", url="https://github.com/scribe-org/Scribe-Data", + entry_points={ + "console_scripts": [ + "scribe-data=scribe_data.cli:main", + ], + } ) if __name__ == "__main__": diff --git a/src/scribe_data/cli.py b/src/scribe_data/cli.py index 05e85f756..277cca12e 100644 --- a/src/scribe_data/cli.py +++ b/src/scribe_data/cli.py @@ -20,11 +20,20 @@ --> """ +#!/usr/bin/env python3 + +import sys +import os import argparse import json from pathlib import Path from typing import Dict, List, Union +# Add the parent directory of 'src' to sys.path +current_dir = os.path.dirname(os.path.abspath(__file__)) +parent_dir = os.path.dirname(current_dir) +sys.path.append(parent_dir) + DATA_DIR = Path('language_data_export') def list_languages() -> None: @@ -32,43 +41,69 @@ def list_languages() -> None: print(f"Directory '{DATA_DIR}' does not exist.") return - languages = [lang for lang in DATA_DIR.iterdir() if lang.is_dir()] + languages = [lang.name for lang in DATA_DIR.iterdir() if lang.is_dir()] print("Available languages:") for lang in languages: - print(f"- {lang.name}") - word_types = [wt.stem for wt in lang.glob('*.json')] - max_word_type_length = max(len(wt) for wt in word_types) - for wt in word_types: - print(f" - {wt:<{max_word_type_length}}") - print("") - -def print_formatted_data(data: Union[Dict, List]) -> None: - if isinstance(data, dict): + print(f"- {lang}") + +def list_word_types(language: str) -> None: + language_dir = DATA_DIR / language + if not language_dir.exists() or not language_dir.is_dir(): + print(f"No data found for language '{language}'.") + return + + word_types = [wt.stem for wt in language_dir.glob('*.json')] + if not word_types: + print(f"No word types available for language '{language}'.") + return + + max_word_type_length = max(len(wt) for wt in word_types) + print(f"Word types for language '{language}':") + for wt in word_types: + print(f" - {wt:<{max_word_type_length}}") + +def print_formatted_data(data: Union[Dict, List], word_type: str) -> None: + if word_type == 'autosuggestions': max_key_length = max(len(key) for key in data.keys()) for key, value in data.items(): - if isinstance(value, dict): - print(f"{key:<{max_key_length}} : ") - max_sub_key_length = max(len(sub_key) for sub_key in value.keys()) - for sub_key, sub_value in value.items(): - print(f" {sub_key:<{max_sub_key_length}} : {sub_value}") - else: - print(f"{key:<{max_key_length}} : {value}") - elif isinstance(data, list): - max_key_length = max(len(key) for item in data for key in item.keys()) - for item in data: - if isinstance(item, dict): - for key, value in item.items(): - print(f"{key:<{max_key_length}} : ") - if isinstance(value, dict): - max_sub_key_length = max(len(sub_key) for sub_key in value.keys()) - for sub_key, sub_value in value.items(): - print(f" {sub_key:<{max_sub_key_length}} : {sub_value}") - else: - print(f" {value}") - else: - print(json.dumps(item, indent=2)) + print(f"{key:<{max_key_length}} : {', '.join(value)}") + elif word_type == 'emoji_keywords': + max_key_length = max(len(key) for key in data.keys()) + for key, value in data.items(): + emojis = [item['emoji'] for item in value] + print(f"{key:<{max_key_length}} : {' '.join(emojis)}") + elif word_type == 'prepositions' or word_type == 'translations': 
+ max_key_length = max(len(key) for key in data.keys()) + for key, value in data.items(): + print(f"{key:<{max_key_length}} : {value}") else: - print(data) + if isinstance(data, dict): + max_key_length = max(len(key) for key in data.keys()) + for key, value in data.items(): + if isinstance(value, dict): + print(f"{key:<{max_key_length}} : ") + max_sub_key_length = max(len(sub_key) for sub_key in value.keys()) + for sub_key, sub_value in value.items(): + print(f" {sub_key:<{max_sub_key_length}} : {sub_value}") + elif isinstance(value, list): + print(f"{key:<{max_key_length}} : ") + for item in value: + if isinstance(item, dict): + for sub_key, sub_value in item.items(): + print(f" {sub_key:<{max_key_length}} : {sub_value}") + else: + print(f" {item}") + else: + print(f"{key:<{max_key_length}} : {value}") + elif isinstance(data, list): + for item in data: + if isinstance(item, dict): + for key, value in item.items(): + print(f"{key} : {value}") + else: + print(item) + else: + print(data) def query_data(language: str, word_type: str) -> None: data_file = DATA_DIR / language / f"{word_type}.json" @@ -84,22 +119,35 @@ def query_data(language: str, word_type: str) -> None: return print(f"Data for language '{language}' and word type '{word_type}':") - print_formatted_data(data) + print_formatted_data(data, word_type) + + if word_type.lower() == 'nouns': + print("\nLegend:") + print("PL : Plural") + print("empty : Singular\n") def main() -> None: parser = argparse.ArgumentParser(description='Scribe-Data CLI Tool') - subparsers = parser.add_subparsers(dest='command') - - subparsers.add_parser('list-languages', help='List available language codes and word types') + subparsers = parser.add_subparsers(dest='command', required=True) + # Define the 'list-languages' subcommand + list_languages_parser = subparsers.add_parser('languages-list', aliases=['ll'], help='List available languages') + + # Define the 'list-word-types' subcommand + list_word_types_parser = subparsers.add_parser('list-word-types', aliases=['lwt'], help='List available word types for a specific language') + list_word_types_parser.add_argument('-l', '--language', required=True, help='Language code') + + # Define the 'query' subcommand query_parser = subparsers.add_parser('query', help='Query data for a specific language and word type') query_parser.add_argument('-l', '--language', required=True, help='Language code') query_parser.add_argument('-wt', '--word-type', required=True, help='Word type') args = parser.parse_args() - if args.command == 'list-languages': + if args.command in ['languages-list', 'll']: list_languages() + elif args.command in ['list-word-types', 'lwt']: + list_word_types(args.language) elif args.command == 'query': query_data(args.language, args.word_type) else: From fcec4e0c3168cf886166b7358e40f7191d4eed5e Mon Sep 17 00:00:00 2001 From: Mahfuza Humayra Mohona Date: Thu, 6 Jun 2024 16:47:05 +0600 Subject: [PATCH 04/18] add language code --- src/scribe_data/cli.py | 44 ++++++++++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/src/scribe_data/cli.py b/src/scribe_data/cli.py index 277cca12e..213bd548c 100644 --- a/src/scribe_data/cli.py +++ b/src/scribe_data/cli.py @@ -29,13 +29,24 @@ from pathlib import Path from typing import Dict, List, Union -# Add the parent directory of 'src' to sys.path current_dir = os.path.dirname(os.path.abspath(__file__)) parent_dir = os.path.dirname(current_dir) sys.path.append(parent_dir) DATA_DIR = Path('language_data_export') +# Mapping of 
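
The console_scripts entry added to setup.py is what turns the module into a scribe-data executable on installation; the string "scribe_data.cli:main" names the module path and the callable. The same target can be driven without installing. A minimal sketch, assuming the repository root is the working directory and the package is importable from src/:

    import sys

    sys.path.insert(0, "src")          # assumption: the package lives under src/
    from scribe_data.cli import main   # the exact target named in entry_points

    sys.argv = ["scribe-data", "languages-list"]  # same argv an installed script would pass
    main()
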
From fcec4e0c3168cf886166b7358e40f7191d4eed5e Mon Sep 17 00:00:00 2001
From: Mahfuza Humayra Mohona
Date: Thu, 6 Jun 2024 16:47:05 +0600
Subject: [PATCH 04/18] add language code

---
 src/scribe_data/cli.py | 44 ++++++++++++++++++++++++++++++------------
 1 file changed, 32 insertions(+), 12 deletions(-)

diff --git a/src/scribe_data/cli.py b/src/scribe_data/cli.py
index 277cca12e..213bd548c 100644
--- a/src/scribe_data/cli.py
+++ b/src/scribe_data/cli.py
@@ -29,13 +29,24 @@
 from pathlib import Path
 from typing import Dict, List, Union
 
-# Add the parent directory of 'src' to sys.path
 current_dir = os.path.dirname(os.path.abspath(__file__))
 parent_dir = os.path.dirname(current_dir)
 sys.path.append(parent_dir)
 
 DATA_DIR = Path('language_data_export')
 
+# Mapping of possible inputs to standardized language names
+LANGUAGE_MAP = {
+    'en': 'English', 'english': 'English',
+    'fr': 'French', 'french': 'French',
+    'de': 'German', 'german': 'German',
+    'it': 'Italian', 'italian': 'Italian',
+    'pt': 'Portuguese', 'portuguese': 'Portuguese',
+    'ru': 'Russian', 'russian': 'Russian',
+    'es': 'Spanish', 'spanish': 'Spanish',
+    'sv': 'Swedish', 'swedish': 'Swedish'
+}
+
 def list_languages() -> None:
@@ -47,18 +58,24 @@ def list_word_types(language: str) -> None:
-    language_dir = DATA_DIR / language
+    # Normalize the input language
+    normalized_language = LANGUAGE_MAP.get(language.lower())
+    if not normalized_language:
+        print(f"Language '{language}' is not recognized.")
+        return
+
+    language_dir = DATA_DIR / normalized_language
     if not language_dir.exists() or not language_dir.is_dir():
-        print(f"No data found for language '{language}'.")
+        print(f"No data found for language '{normalized_language}'.")
         return
 
     word_types = [wt.stem for wt in language_dir.glob('*.json')]
     if not word_types:
-        print(f"No word types available for language '{language}'.")
+        print(f"No word types available for language '{normalized_language}'.")
         return
 
     max_word_type_length = max(len(wt) for wt in word_types)
-    print(f"Word types for language '{language}':")
+    print(f"Word types for language '{normalized_language}':")
     for wt in word_types:
         print(f" - {wt:<{max_word_type_length}}")
@@ -106,9 +123,15 @@ def print_formatted_data(data: Union[Dict, List], word_type: str) -> None:
         print(data)
 
 def query_data(language: str, word_type: str) -> None:
-    data_file = DATA_DIR / language / f"{word_type}.json"
+    # Normalize the input language
+    normalized_language = LANGUAGE_MAP.get(language.lower())
+    if not normalized_language:
+        print(f"Language '{language}' is not recognized.")
+        return
+
+    data_file = DATA_DIR / normalized_language / f"{word_type}.json"
     if not data_file.exists():
-        print(f"No data found for language '{language}' and word type '{word_type}'.")
+        print(f"No data found for language '{normalized_language}' and word type '{word_type}'.")
         return
 
     try:
@@ -118,7 +141,7 @@ def query_data(language: str, word_type: str) -> None:
         print(f"Error reading '{data_file}': {e}")
         return
 
-    print(f"Data for language '{language}' and word type '{word_type}':")
+    print(f"Data for language '{normalized_language}' and word type '{word_type}':")
     print_formatted_data(data, word_type)
 
     if word_type.lower() == 'nouns':
@@ -130,14 +153,11 @@ def main() -> None:
     parser = argparse.ArgumentParser(description='Scribe-Data CLI Tool')
     subparsers = parser.add_subparsers(dest='command', required=True)
 
-    # Define the 'list-languages' subcommand
-    list_languages_parser = subparsers.add_parser('languages-list', aliases=['ll'], help='List available languages')
+    subparsers.add_parser('languages-list', aliases=['ll'], help='List available languages')
 
-    # Define the 'list-word-types' subcommand
     list_word_types_parser = subparsers.add_parser('list-word-types', aliases=['lwt'], help='List available word types for a specific language')
     list_word_types_parser.add_argument('-l', '--language', required=True, help='Language code')
 
-    # Define the 'query' subcommand
     query_parser = subparsers.add_parser('query', help='Query data for a specific language and word type')
     query_parser.add_argument('-l', '--language', required=True, help='Language code')
     query_parser.add_argument('-wt', '--word-type', required=True, help='Word type')
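
The LANGUAGE_MAP lookup introduced above does double duty: it validates the user's input and canonicalizes it to the capitalized directory name used under language_data_export/. A standalone illustration of that behavior (the dict is abbreviated here rather than imported, and 'klingon' is a deliberately unrecognized input):

    LANGUAGE_MAP = {'en': 'English', 'english': 'English'}

    for user_input in ('en', 'English', 'ENGLISH'):
        # .lower() makes the lookup case-insensitive; all three hit the same entry
        assert LANGUAGE_MAP.get(user_input.lower()) == 'English'

    # unrecognized inputs return None, which triggers the "is not recognized" branch
    assert LANGUAGE_MAP.get('klingon') is None
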
From e01670372713b571afc8393219c0be3134e2a44f Mon Sep 17 00:00:00 2001
From: Mahfuza Humayra Mohona
Date: Sat, 8 Jun 2024 07:10:53 +0600
Subject: [PATCH 05/18] update as per requirement in #148

---
 src/scribe_data/cli.py | 71 +++++++++++++++++++++++++++++--------------
 1 file changed, 45 insertions(+), 26 deletions(-)

diff --git a/src/scribe_data/cli.py b/src/scribe_data/cli.py
index 213bd548c..e8cba7783 100644
--- a/src/scribe_data/cli.py
+++ b/src/scribe_data/cli.py
@@ -53,31 +53,47 @@ def list_languages() -> None:
         return
 
     languages = [lang.name for lang in DATA_DIR.iterdir() if lang.is_dir()]
+    languages.sort()
     print("Available languages:")
     for lang in languages:
         print(f"- {lang}")
 
-def list_word_types(language: str) -> None:
-    # Normalize the input language
-    normalized_language = LANGUAGE_MAP.get(language.lower())
-    if not normalized_language:
-        print(f"Language '{language}' is not recognized.")
-        return
-
-    language_dir = DATA_DIR / normalized_language
-    if not language_dir.exists() or not language_dir.is_dir():
-        print(f"No data found for language '{normalized_language}'.")
-        return
+def list_word_types(language: str = None) -> None:
+    if language:
+        # Normalize the input language
+        normalized_language = LANGUAGE_MAP.get(language.lower())
+        if not normalized_language:
+            print(f"Language '{language}' is not recognized.")
+            return
+
+        language_dir = DATA_DIR / normalized_language
+        if not language_dir.exists() or not language_dir.is_dir():
+            print(f"No data found for language '{normalized_language}'.")
+            return
+
+        word_types = [wt.stem for wt in language_dir.glob('*.json')]
+        if not word_types:
+            print(f"No word types available for language '{normalized_language}'.")
+            return
+
+        max_word_type_length = max(len(wt) for wt in word_types)
+        print(f"Word types for language '{normalized_language}':")
+        for wt in word_types:
+            print(f" - {wt:<{max_word_type_length}}")
+    else:
+        word_types = set()
+        for lang_dir in DATA_DIR.iterdir():
+            if lang_dir.is_dir():
+                word_types.update(wt.stem for wt in lang_dir.glob('*.json'))
 
-    word_types = [wt.stem for wt in language_dir.glob('*.json')]
-    if not word_types:
-        print(f"No word types available for language '{normalized_language}'.")
-        return
+        if not word_types:
+            print("No word types available.")
+            return
 
-    max_word_type_length = max(len(wt) for wt in word_types)
-    print(f"Word types for language '{normalized_language}':")
-    for wt in word_types:
-        print(f" - {wt:<{max_word_type_length}}")
+        word_types = sorted(word_types)
+        print("Available word types:")
+        for wt in word_types:
+            print(f" - {wt}")
 
 def print_formatted_data(data: Union[Dict, List], word_type: str) -> None:
     if word_type == 'autosuggestions':
@@ -153,21 +169,24 @@ def main() -> None:
     parser = argparse.ArgumentParser(description='Scribe-Data CLI Tool')
     subparsers = parser.add_subparsers(dest='command', required=True)
 
-    subparsers.add_parser('languages-list', aliases=['ll'], help='List available languages')
-
-    list_word_types_parser = subparsers.add_parser('list-word-types', aliases=['lwt'], help='List available word types for a specific language')
-    list_word_types_parser.add_argument('-l', '--language', required=True, help='Language code')
-
+    subparsers.add_parser('list-languages', aliases=['ll'], help='List available languages')
+
+    list_word_types_parser = subparsers.add_parser('list-word-types', aliases=['lwt'], help='List available word types')
+    list_word_types_parser.add_argument('-l', '--language', help='Language code')
+
     query_parser = subparsers.add_parser('query', help='Query data for a specific language and word type')
     query_parser.add_argument('-l', '--language', required=True, help='Language code')
     query_parser.add_argument('-wt', '--word-type', required=True, help='Word type')
 
     args = parser.parse_args()
 
-    if args.command in ['languages-list', 'll']:
+    if args.command in ['list-languages', 'll']:
         list_languages()
     elif args.command in ['list-word-types', 'lwt']:
-        list_word_types(args.language)
+        if args.language:
+            list_word_types(args.language)
+        else:
+            list_word_types()
     elif args.command == 'query':
         query_data(args.language, args.word_type)
     else:

From 6e6da98480e808587468c48163588b8773a5322e Mon Sep 17 00:00:00 2001
From: Mahfuza Humayra Mohona
Date: Wed, 12 Jun 2024 08:49:14 +0600
Subject: [PATCH 06/18] update cli file structure

---
 setup.py                         |   6 +-
 src/scribe_data/cli.py           | 196 ------------------------------
 src/scribe_data/cli/__init__.py  |   0
 src/scribe_data/cli/cli_list.py  |  52 ++++++++
 src/scribe_data/cli/cli_main.py  |  58 +++++++++
 src/scribe_data/cli/cli_query.py |  63 ++++++++++
 src/scribe_data/cli/cli_utils.py |  53 +++++++++
 7 files changed, 229 insertions(+), 199 deletions(-)
 delete mode 100644 src/scribe_data/cli.py
 create mode 100644 src/scribe_data/cli/__init__.py
 create mode 100644 src/scribe_data/cli/cli_list.py
 create mode 100644 src/scribe_data/cli/cli_main.py
 create mode 100644 src/scribe_data/cli/cli_query.py
 create mode 100644 src/scribe_data/cli/cli_utils.py

diff --git a/setup.py b/setup.py
index 9c5b1741c..854612ba4 100644
--- a/setup.py
+++ b/setup.py
@@ -49,9 +49,9 @@
     url="https://github.com/scribe-org/Scribe-Data",
     entry_points={
         "console_scripts": [
-            "scribe-data=scribe_data.cli:main",
-        ],
-    }
+            "scribe-data=scribe_data.cli.cli_main:main",
+        ],
+    }
 )
 
 if __name__ == "__main__":

diff --git a/src/scribe_data/cli.py b/src/scribe_data/cli.py
deleted file mode 100644
index e8cba7783..000000000
--- a/src/scribe_data/cli.py
+++ /dev/null
@@ -1,196 +0,0 @@
-"""
-Setup and commands for the Scribe-Data command line interface.
-
-.. raw:: html
-
-"""
-
-#!/usr/bin/env python3
-
-import sys
-import os
-import argparse
-import json
-from pathlib import Path
-from typing import Dict, List, Union
-
-current_dir = os.path.dirname(os.path.abspath(__file__))
-parent_dir = os.path.dirname(current_dir)
-sys.path.append(parent_dir)
-
-DATA_DIR = Path('language_data_export')
-
-# Mapping of possible inputs to standardized language names
-LANGUAGE_MAP = {
-    'en': 'English', 'english': 'English',
-    'fr': 'French', 'french': 'French',
-    'de': 'German', 'german': 'German',
-    'it': 'Italian', 'italian': 'Italian',
-    'pt': 'Portuguese', 'portuguese': 'Portuguese',
-    'ru': 'Russian', 'russian': 'Russian',
-    'es': 'Spanish', 'spanish': 'Spanish',
-    'sv': 'Swedish', 'swedish': 'Swedish'
-}
-
-def list_languages() -> None:
-    if not DATA_DIR.exists() or not DATA_DIR.is_dir():
-        print(f"Directory '{DATA_DIR}' does not exist.")
-        return
-
-    languages = [lang.name for lang in DATA_DIR.iterdir() if lang.is_dir()]
-    languages.sort()
-    print("Available languages:")
-    for lang in languages:
-        print(f"- {lang}")
-
-def list_word_types(language: str = None) -> None:
-    if language:
-        # Normalize the input language
-        normalized_language = LANGUAGE_MAP.get(language.lower())
-        if not normalized_language:
-            print(f"Language '{language}' is not recognized.")
-            return
-
-        language_dir = DATA_DIR / normalized_language
-        if not language_dir.exists() or not language_dir.is_dir():
-            print(f"No data found for language '{normalized_language}'.")
-            return
-
-        word_types = [wt.stem for wt in language_dir.glob('*.json')]
-        if not word_types:
-            print(f"No word types available for language '{normalized_language}'.")
-            return
-
-        max_word_type_length = max(len(wt) for wt in word_types)
-        print(f"Word types for language '{normalized_language}':")
-        for wt in word_types:
-            print(f" - {wt:<{max_word_type_length}}")
-    else:
-        word_types = set()
-        for lang_dir in DATA_DIR.iterdir():
-            if lang_dir.is_dir():
-                word_types.update(wt.stem for wt in lang_dir.glob('*.json'))
-
-        if not word_types:
-            print("No word types available.")
-            return
-
-        word_types = sorted(word_types)
-        print("Available word types:")
-        for wt in word_types:
-            print(f" - {wt}")
-
-def print_formatted_data(data: Union[Dict, List], word_type: str) -> None:
-    if word_type == 'autosuggestions':
-        max_key_length = max(len(key) for key in data.keys())
-        for key, value in data.items():
-            print(f"{key:<{max_key_length}} : {', '.join(value)}")
-    elif word_type == 'emoji_keywords':
-        max_key_length = max(len(key) for key in data.keys())
-        for key, value in data.items():
-            emojis = [item['emoji'] for item in value]
-            print(f"{key:<{max_key_length}} : {' '.join(emojis)}")
-    elif word_type == 'prepositions' or word_type == 'translations':
-        max_key_length = max(len(key) for key in data.keys())
-        for key, value in data.items():
-            print(f"{key:<{max_key_length}} : {value}")
-    else:
-        if isinstance(data, dict):
-            max_key_length = max(len(key) for key in data.keys())
-            for key, value in data.items():
-                if isinstance(value, dict):
-                    print(f"{key:<{max_key_length}} : ")
-                    max_sub_key_length = max(len(sub_key) for sub_key in value.keys())
-                    for sub_key, sub_value in value.items():
-                        print(f" {sub_key:<{max_sub_key_length}} : {sub_value}")
-                elif isinstance(value, list):
-                    print(f"{key:<{max_key_length}} : ")
-                    for item in value:
-                        if isinstance(item, dict):
-                            for sub_key, sub_value in item.items():
-                                print(f" {sub_key:<{max_key_length}} : {sub_value}")
-                        else:
-                            print(f" {item}")
-                else:
-                    print(f"{key:<{max_key_length}} : {value}")
-        elif isinstance(data, list):
-            for item in data:
-                if isinstance(item, dict):
-                    for key, value in item.items():
-                        print(f"{key} : {value}")
-                else:
-                    print(item)
-        else:
-            print(data)
-
-def query_data(language: str, word_type: str) -> None:
-    # Normalize the input language
-    normalized_language = LANGUAGE_MAP.get(language.lower())
-    if not normalized_language:
-        print(f"Language '{language}' is not recognized.")
-        return
-
-    data_file = DATA_DIR / normalized_language / f"{word_type}.json"
-    if not data_file.exists():
-        print(f"No data found for language '{normalized_language}' and word type '{word_type}'.")
-        return
-
-    try:
-        with data_file.open('r') as file:
-            data = json.load(file)
-    except (IOError, json.JSONDecodeError) as e:
-        print(f"Error reading '{data_file}': {e}")
-        return
-
-    print(f"Data for language '{normalized_language}' and word type '{word_type}':")
-    print_formatted_data(data, word_type)
-
-    if word_type.lower() == 'nouns':
-        print("\nLegend:")
-        print("PL : Plural")
-        print("empty : Singular\n")
-
-def main() -> None:
-    parser = argparse.ArgumentParser(description='Scribe-Data CLI Tool')
-    subparsers = parser.add_subparsers(dest='command', required=True)
-
-    subparsers.add_parser('list-languages', aliases=['ll'], help='List available languages')
-
-    list_word_types_parser = subparsers.add_parser('list-word-types', aliases=['lwt'], help='List available word types')
-    list_word_types_parser.add_argument('-l', '--language', help='Language code')
-
-    query_parser = subparsers.add_parser('query', help='Query data for a specific language and word type')
-    query_parser.add_argument('-l', '--language', required=True, help='Language code')
-    query_parser.add_argument('-wt', '--word-type', required=True, help='Word type')
-
-    args = parser.parse_args()
-
-    if args.command in ['list-languages', 'll']:
-        list_languages()
-    elif args.command in ['list-word-types', 'lwt']:
-        if args.language:
-            list_word_types(args.language)
-        else:
-            list_word_types()
-    elif args.command == 'query':
-        query_data(args.language, args.word_type)
-    else:
-        parser.print_help()
-
-if __name__ == '__main__':
-    main()

diff --git a/src/scribe_data/cli/__init__.py b/src/scribe_data/cli/__init__.py
new file mode 100644
index 000000000..e69de29bb

diff --git a/src/scribe_data/cli/cli_list.py b/src/scribe_data/cli/cli_list.py
new file mode 100644
index 000000000..affadf648
--- /dev/null
+++ b/src/scribe_data/cli/cli_list.py
@@ -0,0 +1,52 @@
+# import os
+from pathlib import Path
+from .cli_utils import LANGUAGE_MAP
+
+DATA_DIR = Path('scribe_data_json_export')
+
+def list_languages() -> None:
+    if not DATA_DIR.exists() or not DATA_DIR.is_dir():
+        print(f"Directory '{DATA_DIR}' does not exist.")
+        return
+
+    languages = [lang.name for lang in DATA_DIR.iterdir() if lang.is_dir()]
+    languages.sort()
+    print("Available languages:")
+    for lang in languages:
+        print(f"- {lang}")
+
+def list_word_types(language: str = None) -> None:
+    if language:
+        normalized_language = LANGUAGE_MAP.get(language.lower())
+        if not normalized_language:
+            print(f"Language '{language}' is not recognized.")
+            return
+
+        language_dir = DATA_DIR / normalized_language
+        if not language_dir.exists() or not language_dir.is_dir():
+            print(f"No data found for language '{normalized_language}'.")
+            return
+
+        word_types = [wt.stem for wt in language_dir.glob('*.json')]
+        if not word_types:
+            print(f"No word types available for language '{normalized_language}'.")
+            return
+
+        max_word_type_length = max(len(wt) for wt in word_types)
+        print(f"Word types for language '{normalized_language}':")
+        for wt in word_types:
+            print(f" - {wt:<{max_word_type_length}}")
+    else:
+        word_types = set()
+        for lang_dir in DATA_DIR.iterdir():
+            if lang_dir.is_dir():
+                word_types.update(wt.stem for wt in lang_dir.glob('*.json'))
+
+        if not word_types:
+            print("No word types available.")
+            return
+
+        word_types = sorted(word_types)
+        print("Available word types:")
+        for wt in word_types:
+            print(f" - {wt}")

diff --git a/src/scribe_data/cli/cli_main.py b/src/scribe_data/cli/cli_main.py
new file mode 100644
index 000000000..dd857ca70
--- /dev/null
+++ b/src/scribe_data/cli/cli_main.py
@@ -0,0 +1,58 @@
+"""
+Setup and commands for the Scribe-Data command line interface.
+
+.. raw:: html
+
+"""
+
+#!/usr/bin/env python3
+
+import argparse
+from .cli_list import list_languages, list_word_types
+from .cli_query import query_data
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description='Scribe-Data CLI Tool')
+    subparsers = parser.add_subparsers(dest='command', required=True)
+
+    subparsers.add_parser('list-languages', aliases=['ll'], help='List available languages')
+
+    list_word_types_parser = subparsers.add_parser('list-word-types', aliases=['lwt'], help='List available word types')
+    list_word_types_parser.add_argument('-l', '--language', help='Language code')
+
+    query_parser = subparsers.add_parser('query', help='Query data for a specific language and word type')
+    query_parser.add_argument('--all', action='store_true', help='Query all data')
+    query_parser.add_argument('-l', '--language', help='Language code')
+    query_parser.add_argument('-wt', '--word-type', help='Word type')
+
+    args = parser.parse_args()
+
+    if args.command in ['list-languages', 'll']:
+        list_languages()
+    elif args.command in ['list-word-types', 'lwt']:
+        if args.language:
+            list_word_types(args.language)
+        else:
+            list_word_types()
+    elif args.command == 'query':
+        query_data(args.all, args.language, args.word_type)
+    else:
+        parser.print_help()
+
+if __name__ == '__main__':
+    main()

diff --git a/src/scribe_data/cli/cli_query.py b/src/scribe_data/cli/cli_query.py
new file mode 100644
index 000000000..b35ebff3d
--- /dev/null
+++ b/src/scribe_data/cli/cli_query.py
@@ -0,0 +1,63 @@
+import json
+from pathlib import Path
+from .cli_utils import print_formatted_data, LANGUAGE_MAP
+
+DATA_DIR = Path('scribe_data_json_export')
+
+def query_data(all_data: bool, language: str = None, word_type: str = None) -> None:
+    if not (all_data or language or word_type):
+        print("Error: You must provide at least one of --all, --language, or --word-type.")
+        return
+
+    if all_data:
+        for lang_dir in DATA_DIR.iterdir():
+            if lang_dir.is_dir():
+                for wt in lang_dir.glob('*.json'):
+                    query_and_print_data(lang_dir.name, wt.stem)
+    elif language and word_type:
+        query_and_print_data(language, word_type)
+    elif language:
+        normalized_language = LANGUAGE_MAP.get(language.lower())
+        if not normalized_language:
+            print(f"Language '{language}' is not recognized.")
+            return
+
+        language_dir = DATA_DIR / normalized_language
+        if not language_dir.exists() or not language_dir.is_dir():
+            print(f"No data found for language '{normalized_language}'.")
+            return
+
+        for wt in language_dir.glob('*.json'):
+            query_and_print_data(language, wt.stem)
+    elif word_type:
+        for lang_dir in DATA_DIR.iterdir():
+            if lang_dir.is_dir():
+                wt_path = lang_dir / f"{word_type}.json"
+                if wt_path.exists():
+                    query_and_print_data(lang_dir.name, word_type)
+
+def query_and_print_data(language: str, word_type: str) -> None:
+    normalized_language = LANGUAGE_MAP.get(language.lower())
+    if not normalized_language:
+        print(f"Language '{language}' is not recognized.")
+        return
+
+    data_file = DATA_DIR / normalized_language / f"{word_type}.json"
+    if not data_file.exists():
+        print(f"No data found for language '{normalized_language}' and word type '{word_type}'.")
+        return
+
+    try:
+        with data_file.open('r') as file:
+            data = json.load(file)
+    except (IOError, json.JSONDecodeError) as e:
+        print(f"Error reading '{data_file}': {e}")
+        return
+
+    print(f"Data for language '{normalized_language}' and word type '{word_type}':")
+    print_formatted_data(data, word_type)
+
+    if word_type.lower() == 'nouns':
+        print("\nLegend:")
+        print("PL : Plural")
+        print("empty : Singular\n")

diff --git a/src/scribe_data/cli/cli_utils.py b/src/scribe_data/cli/cli_utils.py
new file mode 100644
index 000000000..2531de9a2
--- /dev/null
+++ b/src/scribe_data/cli/cli_utils.py
@@ -0,0 +1,53 @@
+from typing import Dict, List, Union
+
+LANGUAGE_MAP = {
+    'en': 'English', 'english': 'English',
+    'fr': 'French', 'french': 'French',
+    'de': 'German', 'german': 'German',
+    'it': 'Italian', 'italian': 'Italian',
+    'pt': 'Portuguese', 'portuguese': 'Portuguese',
+    'ru': 'Russian', 'russian': 'Russian',
+    'es': 'Spanish', 'spanish': 'Spanish',
+    'sv': 'Swedish', 'swedish': 'Swedish'
+}
+
+def print_formatted_data(data: Union[Dict, List], word_type: str) -> None:
+    if not data:
+        print("No data available.")
+        return
+
+    if word_type == 'autosuggestions':
+        max_key_length = max(len(key) for key in data.keys())
+        for key, value in data.items():
+            print(f"{key:<{max_key_length}} : {', '.join(value)}")
+    elif word_type == 'emoji_keywords':
+        max_key_length = max(len(key) for key in data.keys())
+        for key, value in data.items():
+            emojis = [item['emoji'] for item in value]
+            print(f"{key:<{max_key_length}} : {' '.join(emojis)}")
+    elif word_type == 'prepositions' or word_type == 'translations':
+        max_key_length = max(len(key) for key in data.keys())
+        for key, value in data.items():
+            print(f"{key:<{max_key_length}} : {value}")
+    else:
+        if isinstance(data, dict):
+            max_key_length = max(len(key) for key in data.keys())
+            for key, value in data.items():
+                if isinstance(value, dict):
+                    print(f"{key:<{max_key_length}} : ")
+                    max_sub_key_length = max(len(sub_key) for sub_key in value.keys())
+                    for sub_key, sub_value in value.items():
+                        print(f" {sub_key:<{max_sub_key_length}} : {sub_value}")
+                elif isinstance(value, list):
+                    print(f"{key:<{max_key_length}} : ")
+                    for item in value:
+                        if isinstance(item, dict):
+                            for sub_key, sub_value in item.items():
+                                print(f" {sub_key:<{max_key_length}} : {sub_value}")
+                        else:
+                            print(f" {item}")
+                else:
+                    print(f"{key:<{max_key_length}} : {value}")
+        elif isinstance(data, list):
+            for item in data:
+                print(item)

From 2487d6d42a924744057beb1cc534b1ef6d1069a0 Mon Sep 17 00:00:00 2001
From: Mahfuza Humayra Mohona
Date: Wed, 12 Jun 2024 16:11:27 +0600
Subject: [PATCH 07/18] rename files, fix commands for list

---
 setup.py                                     |   6 +-
 src/scribe_data/cli/cli_list.py              |  52 ----------
 src/scribe_data/cli/list.py                  |  97 +++++++++++++++++++
 src/scribe_data/cli/{cli_main.py => main.py} |  23 ++---
 .../cli/{cli_query.py => query.py}           |   9 +-
 .../cli/{cli_utils.py => utils.py}           |  16 +--
 6 files changed, 121 insertions(+), 82 deletions(-)
 delete mode 100644 src/scribe_data/cli/cli_list.py
 create mode 100644 src/scribe_data/cli/list.py
 rename src/scribe_data/cli/{cli_main.py => main.py} (70%)
 rename src/scribe_data/cli/{cli_query.py => query.py} (91%)
 rename src/scribe_data/cli/{cli_utils.py => utils.py} (85%)

diff --git a/setup.py b/setup.py
index 854612ba4..d6006a57b 100644
--- a/setup.py
+++ b/setup.py
@@ -49,9 +49,9 @@
     url="https://github.com/scribe-org/Scribe-Data",
     entry_points={
         "console_scripts": [
-            "scribe-data=scribe_data.cli.cli_main:main",
-        ],
-    }
+            "scribe-data=scribe_data.cli.main:main",
+        ],
+    }
 )
 
 if __name__ == "__main__":

diff --git a/src/scribe_data/cli/cli_list.py b/src/scribe_data/cli/cli_list.py
deleted file mode 100644
index affadf648..000000000
--- a/src/scribe_data/cli/cli_list.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# import os
-from pathlib import Path
-from .cli_utils import LANGUAGE_MAP
-
-DATA_DIR = Path('scribe_data_json_export')
-
-def list_languages() -> None:
-    if not DATA_DIR.exists() or not DATA_DIR.is_dir():
-        print(f"Directory '{DATA_DIR}' does not exist.")
-        return
-
-    languages = [lang.name for lang in DATA_DIR.iterdir() if lang.is_dir()]
-    languages.sort()
-    print("Available languages:")
-    for lang in languages:
-        print(f"- {lang}")
-
-def list_word_types(language: str = None) -> None:
-    if language:
-        normalized_language = LANGUAGE_MAP.get(language.lower())
-        if not normalized_language:
-            print(f"Language '{language}' is not recognized.")
-            return
-
-        language_dir = DATA_DIR / normalized_language
-        if not language_dir.exists() or not language_dir.is_dir():
-            print(f"No data found for language '{normalized_language}'.")
-            return
-
-        word_types = [wt.stem for wt in language_dir.glob('*.json')]
-        if not word_types:
-            print(f"No word types available for language '{normalized_language}'.")
-            return
-
-        max_word_type_length = max(len(wt) for wt in word_types)
-        print(f"Word types for language '{normalized_language}':")
-        for wt in word_types:
-            print(f" - {wt:<{max_word_type_length}}")
-    else:
-        word_types = set()
-        for lang_dir in DATA_DIR.iterdir():
-            if lang_dir.is_dir():
-                word_types.update(wt.stem for wt in lang_dir.glob('*.json'))
-
-        if not word_types:
-            print("No word types available.")
-            return
-
-        word_types = sorted(word_types)
-        print("Available word types:")
-        for wt in word_types:
-            print(f" - {wt}")

diff --git a/src/scribe_data/cli/list.py b/src/scribe_data/cli/list.py
new file mode 100644
index 000000000..5f5abc784
--- /dev/null
+++ b/src/scribe_data/cli/list.py
@@ -0,0 +1,97 @@
+from pathlib import Path
+from.utils import LANGUAGE_MAP
+
+DATA_DIR = Path('scribe_data_json_export')
+
+def list_languages() -> None:
+    if not DATA_DIR.exists() or not DATA_DIR.is_dir():
+        print(f"Directory '{DATA_DIR}' does not exist.")
+        return
+
+    languages = [lang.name for lang in DATA_DIR.iterdir() if lang.is_dir()]
+    languages.sort()
+    print("Available languages:")
+    for lang in languages:
+        print(f"- {lang}")
+
+def list_word_types(language: str = None) -> None:
+    if language:
+        normalized_language = LANGUAGE_MAP.get(language.lower())
+        if not normalized_language:
+            print(f"Language '{language}' is not recognized.")
+            return
+
+        language_dir = DATA_DIR / normalized_language
+        if not language_dir.exists() or not language_dir.is_dir():
+            print(f"No data found for language '{normalized_language}'.")
+            return
+
+        word_types = [wt.stem for wt in language_dir.glob('*.json')]
+        if not word_types:
+            print(f"No word types available for language '{normalized_language}'.")
+            return
+
+        max_word_type_length = max(len(wt) for wt in word_types)
+        print(f"Word types for language '{normalized_language}':")
+        for wt in word_types:
+            print(f" - {wt:<{max_word_type_length}}")
+    else:
+        word_types = set()
+        for lang_dir in DATA_DIR.iterdir():
+            if lang_dir.is_dir():
+                word_types.update(wt.stem for wt in lang_dir.glob('*.json'))
+
+        if not word_types:
+            print("No word types available.")
+            return
+
+        word_types = sorted(word_types)
+        print("Available word types:")
+        for wt in word_types:
+            print(f" - {wt}")
+
+def list_all() -> None:
+    list_languages()
+    print()
+    list_word_types()
+
+def list_languages_for_word_type(word_type: str) -> None:
+    available_languages = []
+    for lang_dir in DATA_DIR.iterdir():
+        if lang_dir.is_dir():
+            wt_path = lang_dir / f"{word_type}.json"
+            if wt_path.exists():
+                available_languages.append(lang_dir.name)
+
+    if not available_languages:
+        print(f"No languages found with word type '{word_type}'.")
+        return
+
+    available_languages.sort()
+    print(f"Languages with word type '{word_type}':")
+    for lang in available_languages:
+        print(f"- {lang}")
+
+def list_wrapper(language: str = None, word_type: str = None) -> None:
+    if language is None and word_type is None:
+        list_all()
+    elif language is True and word_type is None:
+        list_languages()
+    elif language is None and word_type is True:
+        list_word_types()
+    elif language is True and word_type is True:
+        print("Please specify both a language and a word type.")
+    elif language is True and word_type is not None:
+        list_languages_for_word_type(word_type)
+    elif language is not None and word_type is True:
+        normalized_language = LANGUAGE_MAP.get(language.lower())
+        if not normalized_language:
+            print(f"Language '{language}' is not recognized.")
+            return
+        list_word_types(normalized_language)
+    elif language is not None and word_type is not None:
+        normalized_language = LANGUAGE_MAP.get(language.lower())
+        if not normalized_language:
+            print(f"Language '{language}' is not recognized.")
+            return
+        list_word_types(normalized_language)

diff --git a/src/scribe_data/cli/cli_main.py b/src/scribe_data/cli/main.py
similarity index 70%
rename from src/scribe_data/cli/cli_main.py
rename to src/scribe_data/cli/main.py
index dd857ca70..68bc86534 100644
--- a/src/scribe_data/cli/cli_main.py
+++ b/src/scribe_data/cli/main.py
@@ -21,19 +21,17 @@
 """
 
 #!/usr/bin/env python3
-
 import argparse
-from .cli_list import list_languages, list_word_types
-from .cli_query import query_data
+from .list import list_wrapper
+from .query import query_data
 
 def main() -> None:
     parser = argparse.ArgumentParser(description='Scribe-Data CLI Tool')
     subparsers = parser.add_subparsers(dest='command', required=True)
 
-    subparsers.add_parser('list-languages', aliases=['ll'], help='List available languages')
-
-    list_word_types_parser = subparsers.add_parser('list-word-types', aliases=['lwt'], help='List available word types')
-    list_word_types_parser.add_argument('-l', '--language', help='Language code')
+    list_parser = subparsers.add_parser('list', help='List languages and word types')
+    list_parser.add_argument('--language', '-l', nargs='?', const=True, help='List all languages or filter by language code')
+    list_parser.add_argument('--word-type', '-wt', nargs='?', const=True, help='List all word types or filter by word type')
 
     query_parser = subparsers.add_parser('query', help='Query data for a specific language and word type')
     query_parser.add_argument('--all', action='store_true', help='Query all data')
@@ -42,17 +40,12 @@ def main() -> None:
 
     args = parser.parse_args()
 
-    if args.command in ['list-languages', 'll']:
-        list_languages()
-    elif args.command in ['list-word-types', 'lwt']:
-        if args.language:
-            list_word_types(args.language)
-        else:
-            list_word_types()
+    if args.command == 'list':
+        list_wrapper(args.language, args.word_type)
     elif args.command == 'query':
         query_data(args.all, args.language, args.word_type)
     else:
         parser.print_help()
 
 if __name__ == '__main__':
-    main()
+    main()
\ No newline at end of file

diff --git a/src/scribe_data/cli/cli_query.py b/src/scribe_data/cli/query.py
similarity index 91%
rename from src/scribe_data/cli/cli_query.py
rename to src/scribe_data/cli/query.py
index b35ebff3d..bd4f629c1 100644
--- a/src/scribe_data/cli/cli_query.py
+++ b/src/scribe_data/cli/query.py
@@ -1,6 +1,7 @@
+# src/scribe_data/cli/query.py
 import json
 from pathlib import Path
-from .cli_utils import print_formatted_data, LANGUAGE_MAP
+from .utils import LANGUAGE_MAP, print_formatted_data
 
 DATA_DIR = Path('scribe_data_json_export')
 
@@ -56,8 +57,4 @@ def query_and_print_data(language: str, word_type: str) -> None:
 
     print(f"Data for language '{normalized_language}' and word type '{word_type}':")
     print_formatted_data(data, word_type)
-
-    if word_type.lower() == 'nouns':
-        print("\nLegend:")
-        print("PL : Plural")
-        print("empty : Singular\n")
+    
\ No newline at end of file

diff --git a/src/scribe_data/cli/cli_utils.py b/src/scribe_data/cli/utils.py
similarity index 85%
rename from src/scribe_data/cli/cli_utils.py
rename to src/scribe_data/cli/utils.py
index 2531de9a2..4ceac0f80 100644
--- a/src/scribe_data/cli/cli_utils.py
+++ b/src/scribe_data/cli/utils.py
@@ -1,5 +1,7 @@
-from typing import Dict, List, Union
+from typing import Dict, List, Union, Optional
+from difflib import SequenceMatcher
 
+# Mapping of possible inputs to standardized language names
 LANGUAGE_MAP = {
     'en': 'English', 'english': 'English',
     'fr': 'French', 'french': 'French',
@@ -12,10 +14,6 @@
 }
 
 def print_formatted_data(data: Union[Dict, List], word_type: str) -> None:
-    if not data:
-        print("No data available.")
-        return
-
     if word_type == 'autosuggestions':
         max_key_length = max(len(key) for key in data.keys())
         for key, value in data.items():
@@ -50,4 +48,10 @@ def print_formatted_data(data: Union[Dict, List], word_type: str) -> None:
             print(f"{key:<{max_key_length}} : {value}")
     elif isinstance(data, list):
         for item in data:
-            print(item)
+            if isinstance(item, dict):
+                for key, value in item.items():
+                    print(f"{key} : {value}")
+            else:
+                print(item)
+    else:
+        print(data)

From bc6c7da9c70a19d6197cfa29f9cf61b6f0e16eb8 Mon Sep 17 00:00:00 2001
From: Mahfuza Humayra Mohona
Date: Sun, 16 Jun 2024 07:43:26 +0600
Subject: [PATCH 08/18] changed alias for query into q

---
 src/scribe_data/cli/main.py  | 11 ++++++++---
 src/scribe_data/cli/utils.py | 17 ++++++++++-------
 2 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py
index 68bc86534..d6a15ee06 100644
--- a/src/scribe_data/cli/main.py
+++ b/src/scribe_data/cli/main.py
@@ -33,7 +33,10 @@ def main() -> None:
     list_parser.add_argument('--language', '-l', nargs='?', const=True, help='List all languages or filter by language code')
     list_parser.add_argument('--word-type', '-wt', nargs='?', const=True, help='List all word types or filter by word type')
 
-    query_parser = subparsers.add_parser('query', help='Query data for a specific language and word type')
+    list_word_types_parser = subparsers.add_parser('list-word-types', aliases=['lwt'], help='List available word types')
+    list_word_types_parser.add_argument('-l', '--language', help='Language code')
+
+    query_parser = subparsers.add_parser('query', aliases=['q'], help='Query data for a specific language and word type')
     query_parser.add_argument('--all', action='store_true', help='Query all data')
     query_parser.add_argument('-l', '--language', help='Language code')
     query_parser.add_argument('-wt', '--word-type', help='Word type')
@@ -42,10 +45,12 @@ def main() -> None:
 
     args = parser.parse_args()
 
     if args.command == 'list':
         list_wrapper(args.language, args.word_type)
-    elif args.command == 'query':
+    elif args.command in ['list-word-types', 'lwt']:
+        list_wrapper(None, args.language)
+    elif args.command in ['query', 'q']:
         query_data(args.all, args.language, args.word_type)
     else:
         parser.print_help()
 
 if __name__ == '__main__':
-    main()
\ No newline at end of file
+    main()

diff --git a/src/scribe_data/cli/utils.py b/src/scribe_data/cli/utils.py
index 4ceac0f80..fe5a71c5d 100644
--- a/src/scribe_data/cli/utils.py
+++ b/src/scribe_data/cli/utils.py
@@ -1,5 +1,4 @@
-from typing import Dict, List, Union, Optional
-from difflib import SequenceMatcher
+from typing import Dict, List, Union
 
 # Mapping of possible inputs to standardized language names
 LANGUAGE_MAP = {
     'en': 'English', 'english': 'English',
@@ -14,26 +13,30 @@
 }
 
 def print_formatted_data(data: Union[Dict, List], word_type: str) -> None:
+    if not data:
+        print(f"No data available for word type '{word_type}'.")
+        return
+
     if word_type == 'autosuggestions':
-        max_key_length = max(len(key) for key in data.keys())
+        max_key_length = max((len(key) for key in data.keys()), default=0)
         for key, value in data.items():
             print(f"{key:<{max_key_length}} : {', '.join(value)}")
     elif word_type == 'emoji_keywords':
-        max_key_length = max(len(key) for key in data.keys())
+        max_key_length = max((len(key) for key in data.keys()), default=0)
         for key, value in data.items():
             emojis = [item['emoji'] for item in value]
             print(f"{key:<{max_key_length}} : {' '.join(emojis)}")
     elif word_type == 'prepositions' or word_type == 'translations':
-        max_key_length = max(len(key) for key in data.keys())
+        max_key_length = max((len(key) for key in data.keys()), default=0)
         for key, value in data.items():
             print(f"{key:<{max_key_length}} : {value}")
     else:
         if isinstance(data, dict):
-            max_key_length = max(len(key) for key in data.keys())
+            max_key_length = max((len(key) for key in data.keys()), default=0)
             for key, value in data.items():
                 if isinstance(value, dict):
                     print(f"{key:<{max_key_length}} : ")
-                    max_sub_key_length = max(len(sub_key) for sub_key in value.keys())
+                    max_sub_key_length = max((len(sub_key) for sub_key in value.keys()), default=0)
                     for sub_key, sub_value in value.items():
                         print(f" {sub_key:<{max_sub_key_length}} : {sub_value}")
                 elif isinstance(value, list):
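
Besides the q alias, the commit above swaps every bare max() in utils.py for max(..., default=0). That guards the empty-collection case: max() over an empty generator raises ValueError, which is exactly what the old code did when a JSON file held an empty object. A two-line standalone illustration:

    empty = {}
    # max(len(key) for key in empty.keys())  # would raise ValueError: max() arg is an empty sequence
    print(max((len(key) for key in empty.keys()), default=0))  # prints 0, so formatting can proceed
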
From c4348569c47240834a297c7db17e1804511f8339 Mon Sep 17 00:00:00 2001
From: Mahfuza Humayra Mohona
Date: Sun, 16 Jun 2024 08:12:18 +0600
Subject: [PATCH 09/18] getting lang info from language_meta_data.json

---
 src/scribe_data/cli/list.py            | 52 ++++++++-----
 src/scribe_data/cli/query.py           | 26 +++++----
 src/scribe_data/cli/utils.py           | 24 ++++-----
 .../resources/language_meta_data.json  | 29 +++++++----
 4 files changed, 65 insertions(+), 66 deletions(-)

diff --git a/src/scribe_data/cli/list.py b/src/scribe_data/cli/list.py
index 5f5abc784..6519d50d1 100644
--- a/src/scribe_data/cli/list.py
+++ b/src/scribe_data/cli/list.py
@@ -1,18 +1,14 @@
 from pathlib import Path
-from.utils import LANGUAGE_MAP
+from .utils import LANGUAGE_METADATA, LANGUAGE_MAP
 
 DATA_DIR = Path('scribe_data_json_export')
 
 def list_languages() -> None:
-    if not DATA_DIR.exists() or not DATA_DIR.is_dir():
-        print(f"Directory '{DATA_DIR}' does not exist.")
-        return
-
-    languages = [lang.name for lang in DATA_DIR.iterdir() if lang.is_dir()]
+    languages = [lang['language'] for lang in LANGUAGE_METADATA['languages']]
     languages.sort()
     print("Available languages:")
     for lang in languages:
-        print(f"- {lang}")
+        print(f"- {lang.capitalize()}")
 
 def list_word_types(language: str = None) -> None:
     if language:
@@ -21,25 +17,26 @@ def list_word_types(language: str = None) -> None:
         print(f"Language '{language}' is not recognized.")
         return
 
-        language_dir = DATA_DIR / normalized_language
+        language_dir = DATA_DIR / normalized_language['language'].capitalize()
         if not language_dir.exists() or not language_dir.is_dir():
-            print(f"No data found for language '{normalized_language}'.")
+            print(f"No data found for language '{normalized_language['language']}'.")
             return
 
         word_types = [wt.stem for wt in language_dir.glob('*.json')]
         if not word_types:
-            print(f"No word types available for language '{normalized_language}'.")
+            print(f"No word types available for language '{normalized_language['language']}'.")
            return
 
-        max_word_type_length = max(len(wt) for wt in word_types)
-        print(f"Word types for language '{normalized_language}':")
+        word_types = sorted(word_types)
+        print(f"Word types for language '{normalized_language['language']}':")
         for wt in word_types:
-            print(f" - {wt:<{max_word_type_length}}")
+            print(f" - {wt}")
     else:
         word_types = set()
-        for lang_dir in DATA_DIR.iterdir():
-            if lang_dir.is_dir():
-                word_types.update(wt.stem for wt in lang_dir.glob('*.json'))
+        for lang in LANGUAGE_METADATA['languages']:
+            language_dir = DATA_DIR / lang['language'].capitalize()
+            if language_dir.is_dir():
+                word_types.update(wt.stem for wt in language_dir.glob('*.json'))
 
         if not word_types:
             print("No word types available.")
@@ -57,11 +54,12 @@ def list_all() -> None:
 
 def list_languages_for_word_type(word_type: str) -> None:
     available_languages = []
-    for lang_dir in DATA_DIR.iterdir():
-        if lang_dir.is_dir():
-            wt_path = lang_dir / f"{word_type}.json"
+    for lang in LANGUAGE_METADATA['languages']:
+        language_dir = DATA_DIR / lang['language'].capitalize()
+        if language_dir.is_dir():
+            wt_path = language_dir / f"{word_type}.json"
             if wt_path.exists():
-                available_languages.append(lang_dir.name)
+                available_languages.append(lang['language'])
 
     if not available_languages:
         print(f"No languages found with word type '{word_type}'.")
@@ -70,7 +68,7 @@ def list_languages_for_word_type(word_type: str) -> None:
     available_languages.sort()
     print(f"Languages with word type '{word_type}':")
     for lang in available_languages:
-        print(f"- {lang}")
+        print(f"- {lang.capitalize()}")
 
 def list_wrapper(language: str = None, word_type: str = None) -> None:
     if language is None and word_type is None:
@@ -84,14 +82,6 @@ def list_wrapper(language: str = None, word_type: str = None) -> None:
     elif language is True and word_type is not None:
         list_languages_for_word_type(word_type)
     elif language is not None and word_type is True:
-        normalized_language = LANGUAGE_MAP.get(language.lower())
-        if not normalized_language:
-            print(f"Language '{language}' is not recognized.")
-            return
-        list_word_types(normalized_language)
+        list_word_types(language)
     elif language is not None and word_type is not None:
-        normalized_language = LANGUAGE_MAP.get(language.lower())
-        if not normalized_language:
-            print(f"Language '{language}' is not recognized.")
-            return
-        list_word_types(normalized_language)
+        list_word_types(language)

diff --git a/src/scribe_data/cli/query.py b/src/scribe_data/cli/query.py
index bd4f629c1..ed415a0cb 100644
--- a/src/scribe_data/cli/query.py
+++ b/src/scribe_data/cli/query.py
@@ -1,7 +1,6 @@
-# src/scribe_data/cli/query.py
 import json
 from pathlib import Path
-from .utils import LANGUAGE_MAP, print_formatted_data
+from .utils import LANGUAGE_METADATA, LANGUAGE_MAP, print_formatted_data
 
 DATA_DIR = Path('scribe_data_json_export')
 
@@ -11,10 +10,11 @@ def query_data(all_data: bool, language: str = None, word_type: str = None) -> N
     return
 
     if all_data:
-        for lang_dir in DATA_DIR.iterdir():
+        for lang in LANGUAGE_METADATA['languages']:
+            lang_dir = DATA_DIR / lang['language'].capitalize()
             if lang_dir.is_dir():
                 for wt in lang_dir.glob('*.json'):
-                    query_and_print_data(lang_dir.name, wt.stem)
+                    query_and_print_data(lang['language'], wt.stem)
     elif language and word_type:
         query_and_print_data(language, word_type)
     elif language:
@@ -23,19 +23,20 @@ def query_data(all_data: bool, language: str = None, word_type: str = None) -> N
     print(f"Language '{language}' is not recognized.")
     return
 
-        language_dir = DATA_DIR / normalized_language
+        language_dir = DATA_DIR / normalized_language['language'].capitalize()
         if not language_dir.exists() or not language_dir.is_dir():
-            print(f"No data found for language '{normalized_language}'.")
+            print(f"No data found for language '{normalized_language['language']}'.")
             return
 
         for wt in language_dir.glob('*.json'):
-            query_and_print_data(language, wt.stem)
+            query_and_print_data(normalized_language['language'], wt.stem)
     elif word_type:
-        for lang_dir in DATA_DIR.iterdir():
+        for lang in LANGUAGE_METADATA['languages']:
+            lang_dir = DATA_DIR / lang['language'].capitalize()
             if lang_dir.is_dir():
                 wt_path = lang_dir / f"{word_type}.json"
                 if wt_path.exists():
-                    query_and_print_data(lang_dir.name, word_type)
+                    query_and_print_data(lang['language'], word_type)
 
 def query_and_print_data(language: str, word_type: str) -> None:
     normalized_language = LANGUAGE_MAP.get(language.lower())
@@ -43,9 +44,9 @@ def query_and_print_data(language: str, word_type: str) -> None:
     print(f"Language '{language}' is not recognized.")
     return
 
-    data_file = DATA_DIR / normalized_language / f"{word_type}.json"
+    data_file = DATA_DIR / normalized_language['language'].capitalize() / f"{word_type}.json"
     if not data_file.exists():
-        print(f"No data found for language '{normalized_language}' and word type '{word_type}'.")
+        print(f"No data found for language '{normalized_language['language']}' and word type '{word_type}'.")
         return
 
     try:
@@ -55,6 +56,5 @@ def query_and_print_data(language: str, word_type: str) -> None:
         print(f"Error reading '{data_file}': {e}")
         return
 
-    print(f"Data for language '{normalized_language}' and word type '{word_type}':")
+    print(f"Data for language '{normalized_language['language']}' and word type '{word_type}':")
     print_formatted_data(data, word_type)
-    
\ No newline at end of file

diff --git a/src/scribe_data/cli/utils.py b/src/scribe_data/cli/utils.py
index fe5a71c5d..f38086043 100644
--- a/src/scribe_data/cli/utils.py
+++ b/src/scribe_data/cli/utils.py
@@ -1,16 +1,16 @@
+import json
+from pathlib import Path
 from typing import Dict, List, Union
 
-# Mapping of possible inputs to standardized language names
-LANGUAGE_MAP = {
-    'en': 'English', 'english': 'English',
-    'fr': 'French', 'french': 'French',
-    'de': 'German', 'german': 'German',
-    'it': 'Italian', 'italian': 'Italian',
-    'pt': 'Portuguese', 'portuguese': 'Portuguese',
-    'ru': 'Russian', 'russian': 'Russian',
-    'es': 'Spanish', 'spanish': 'Spanish',
-    'sv': 'Swedish', 'swedish': 'Swedish'
-}
+# Load language metadata from JSON file
+METADATA_FILE = Path(__file__).parent.parent / 'resources' / 'language_meta_data.json'
+
+def load_language_metadata() -> Dict:
+    with METADATA_FILE.open('r', encoding='utf-8') as file:
+        return json.load(file)
+
+LANGUAGE_METADATA = load_language_metadata()
+LANGUAGE_MAP = {lang['language'].lower(): lang for lang in LANGUAGE_METADATA['languages']}
 
 def print_formatted_data(data: Union[Dict, List], word_type: str) -> None:
     if not data:
@@ -26,7 +26,7 @@ def print_formatted_data(data: Union[Dict, List], word_type: str) -> None:
     for key, value in data.items():
         emojis = [item['emoji'] for item in value]
         print(f"{key:<{max_key_length}} : {' '.join(emojis)}")
-    elif word_type == 'prepositions' or word_type == 'translations':
+    elif word_type in ['prepositions', 'translations']:
         max_key_length = max((len(key) for key in data.keys()), default=0)
         for key, value in data.items():
             print(f"{key:<{max_key_length}} : {value}")

diff --git a/src/scribe_data/resources/language_meta_data.json b/src/scribe_data/resources/language_meta_data.json
index 27a8110ea..88ba732e1 100755
--- a/src/scribe_data/resources/language_meta_data.json
+++ b/src/scribe_data/resources/language_meta_data.json
@@ -3,10 +3,11 @@
   "description": {
     "entry": {
       "language": "the supported language. All lowercase",
-      "iso": "the ISO 639 code for 'language'. See https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes . All lowercase",
+      "iso": "the ISO 639 code for 'language'. See https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes. All lowercase",
       "qid": "the unique identifier of 'language' on Wikidata. 'Q' followed by one or more digits. See https://www.wikidata.org/wiki/Q43649390",
       "remove-words": "words that should not be included as autosuggestions for the given language.",
-      "ignore-words": "TODO. Case sensitive."
+      "ignore-words": "TODO. Case sensitive.",
+      "word-types": "A list of word types available for the given language."
     }
   },
   "languages": [
@@ -15,56 +16,64 @@
       "iso": "en",
       "qid": "Q1860",
       "remove-words": ["of", "the", "The", "and"],
-      "ignore-words": []
+      "ignore-words": [],
+      "word-types": ["nouns", "verbs", "translated_words"]
     },
     {
       "language": "french",
       "iso": "fr",
       "qid": "Q150",
       "remove-words": ["of", "the", "The", "and"],
-      "ignore-words": ["XXe"]
+      "ignore-words": ["XXe"],
+      "word-types": ["nouns", "verbs", "translations", "emoji_keywords", "prepositions", "autosuggestions"]
     },
     {
       "language": "german",
       "iso": "de",
       "qid": "Q188",
       "remove-words": ["of", "the", "The", "and", "NeinJa", "et", "redirect"],
-      "ignore-words": ["Gemeinde", "Familienname"]
+      "ignore-words": ["Gemeinde", "Familienname"],
+      "word-types": ["nouns", "verbs", "translations", "emoji_keywords", "prepositions", "autosuggestions"]
     },
     {
       "language": "italian",
       "iso": "it",
       "qid": "Q652",
       "remove-words": ["of", "the", "The", "and", "text", "from"],
-      "ignore-words": ["The", "ATP"]
+      "ignore-words": ["The", "ATP"],
+      "word-types": ["nouns", "verbs", "translations", "emoji_keywords", "prepositions", "autosuggestions"]
     },
     {
       "language": "portuguese",
       "iso": "pt",
       "qid": "Q5146",
       "remove-words": ["of", "the", "The", "and", "jbutadptflora"],
-      "ignore-words": []
+      "ignore-words": [],
+      "word-types": ["nouns", "verbs", "translations", "emoji_keywords", "prepositions", "autosuggestions"]
     },
     {
       "language": "russian",
       "iso": "ru",
       "qid": "Q7737",
       "remove-words": ["of", "the", "The", "and"],
-      "ignore-words": []
+      "ignore-words": [],
+      "word-types": ["nouns", "verbs", "translated_words", "translations", "emoji_keywords", "prepositions", "autosuggestions"]
     },
     {
       "language": "spanish",
       "iso": "es",
       "qid": "Q1321",
       "remove-words": ["of", "the", "The", "and"],
-      "ignore-words": []
+      "ignore-words": [],
+      "word-types": ["nouns", "verbs", "translations", "emoji_keywords", "prepositions", "autosuggestions"]
    },
     {
       "language": "swedish",
       "iso": "sv",
       "qid": "Q9027",
       "remove-words": ["of", "the", "The", "and", "Checklist", "Catalogue"],
-      "ignore-words": ["databasdump"]
+      "ignore-words": ["databasdump"],
+      "word-types": ["nouns", "verbs", "translations", "emoji_keywords", "prepositions", "autosuggestions"]
     }
   ]
 }

From e1e8e68112ff19a2a24f5b0437e8c341295e780d Mon Sep 17 00:00:00 2001
From: Mahfuza Humayra Mohona
Date: Sun, 16 Jun 2024 09:15:01 +0600
Subject: [PATCH 10/18] show formatted data from meta file

---
 src/scribe_data/cli/list.py | 76 ++++++++++++++++++++++++++++++++++---
 1 file changed, 71 insertions(+), 5 deletions(-)

diff --git a/src/scribe_data/cli/list.py b/src/scribe_data/cli/list.py
index 6519d50d1..0656a067a 100644
--- a/src/scribe_data/cli/list.py
+++ b/src/scribe_data/cli/list.py
@@ -1,14 +1,80 @@
+import json
 from pathlib import Path
-from .utils import LANGUAGE_METADATA, LANGUAGE_MAP
+from typing import Dict, List, Union
+
+# Load language metadata from JSON file
+METADATA_FILE = Path(__file__).parent.parent / 'resources' / 'language_meta_data.json'
+
+def load_language_metadata() -> Dict:
+    with METADATA_FILE.open('r', encoding='utf-8') as file:
+        return json.load(file)
+
+LANGUAGE_METADATA = load_language_metadata()
+LANGUAGE_MAP = {lang['language'].lower(): lang for lang in LANGUAGE_METADATA['languages']}
 
 DATA_DIR = Path('scribe_data_json_export')
 
+def print_formatted_data(data: Union[Dict, List], word_type: str) -> None:
+    if not data:
+        print(f"No data available for word type '{word_type}'.")
+        return
+
+    if word_type == 'autosuggestions':
+        max_key_length = max((len(key) for key in data.keys()), default=0)
+        for key, value in
data.items(): + print(f"{key:<{max_key_length}} : {', '.join(value)}") + elif word_type == 'emoji_keywords': + max_key_length = max((len(key) for key in data.keys()), default=0) + for key, value in data.items(): + emojis = [item['emoji'] for item in value] + print(f"{key:<{max_key_length}} : {' '.join(emojis)}") + elif word_type in ['prepositions', 'translations']: + max_key_length = max((len(key) for key in data.keys()), default=0) + for key, value in data.items(): + print(f"{key:<{max_key_length}} : {value}") + else: + if isinstance(data, dict): + max_key_length = max((len(key) for key in data.keys()), default=0) + for key, value in data.items(): + if isinstance(value, dict): + print(f"{key:<{max_key_length}} : ") + max_sub_key_length = max((len(sub_key) for sub_key in value.keys()), default=0) + for sub_key, sub_value in value.items(): + print(f" {sub_key:<{max_sub_key_length}} : {sub_value}") + elif isinstance(value, list): + print(f"{key:<{max_key_length}} : ") + for item in value: + if isinstance(item, dict): + for sub_key, sub_value in item.items(): + print(f" {sub_key:<{max_key_length}} : {sub_value}") + else: + print(f" {item}") + else: + print(f"{key:<{max_key_length}} : {value}") + elif isinstance(data, list): + for item in data: + if isinstance(item, dict): + for key, value in item.items(): + print(f"{key} : {value}") + else: + print(item) + else: + print(data) + def list_languages() -> None: - languages = [lang['language'] for lang in LANGUAGE_METADATA['languages']] - languages.sort() - print("Available languages:") + languages = [lang for lang in LANGUAGE_METADATA['languages']] + languages.sort(key=lambda x: x['language']) + + # Define column widths + language_col_width = max(len(lang['language']) for lang in languages) + 2 + iso_col_width = 5 # Length of "ISO" column header + padding + qid_col_width = 5 # Length of "QID" column header + padding + + print(f"{'Language':<{language_col_width}} {'ISO':<{iso_col_width}} {'QID':<{qid_col_width}}") + print('-' * (language_col_width + iso_col_width + qid_col_width)) + for lang in languages: - print(f"- {lang.capitalize()}") + print(f"{lang['language'].capitalize():<{language_col_width}} {lang['iso']:<{iso_col_width}} {lang['qid']:<{qid_col_width}}") def list_word_types(language: str = None) -> None: if language: From d39dd296b531d239500e30e89ce95cf09022dfbd Mon Sep 17 00:00:00 2001 From: Mahfuza Humayra Mohona Date: Tue, 18 Jun 2024 16:16:59 +0600 Subject: [PATCH 11/18] add not implemented function --- src/scribe_data/cli/main.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py index d6a15ee06..923c6f97c 100644 --- a/src/scribe_data/cli/main.py +++ b/src/scribe_data/cli/main.py @@ -25,21 +25,39 @@ from .list import list_wrapper from .query import query_data +def not_implemented(): + print("This command is not implemented yet.") + def main() -> None: parser = argparse.ArgumentParser(description='Scribe-Data CLI Tool') subparsers = parser.add_subparsers(dest='command', required=True) + # List command list_parser = subparsers.add_parser('list', help='List languages and word types') list_parser.add_argument('--language', '-l', nargs='?', const=True, help='List all languages or filter by language code') list_parser.add_argument('--word-type', '-wt', nargs='?', const=True, help='List all word types or filter by word type') + # List word types command list_word_types_parser = subparsers.add_parser('list-word-types', aliases=['lwt'], help='List available 
word types') list_word_types_parser.add_argument('-l', '--language', help='Language code') + # Query command query_parser = subparsers.add_parser('query', aliases=['q'], help='Query data for a specific language and word type') query_parser.add_argument('--all', action='store_true', help='Query all data') query_parser.add_argument('-l', '--language', help='Language code') query_parser.add_argument('-wt', '--word-type', help='Word type') + query_parser.add_argument('-of', '--output-file', help='Output file') + query_parser.add_argument('-ot', '--output-type', help='Output type') + query_parser.add_argument('-ll', '--list-languages', action='store_true', help='List available language codes') + + # Poll command + poll_parser = subparsers.add_parser('poll', help='Check whether there is new data available') + + # Version command + version_parser = subparsers.add_parser('version', aliases=['v'], help='Show the version of the CLI tool') + + # Update command + update_parser = subparsers.add_parser('update', aliases=['u'], help='Update the CLI tool') args = parser.parse_args() @@ -49,8 +67,15 @@ def main() -> None: list_wrapper(None, args.language) elif args.command in ['query', 'q']: query_data(args.all, args.language, args.word_type) + elif args.command == 'poll': + not_implemented() + elif args.command in ['version', 'v']: + not_implemented() + elif args.command in ['update', 'u']: + not_implemented() else: parser.print_help() if __name__ == '__main__': main() + From 4f63cf0b5df32f232ef23f1d2c59ed4c54242c55 Mon Sep 17 00:00:00 2001 From: Mahfuza Humayra Mohona Date: Wed, 19 Jun 2024 07:50:51 +0600 Subject: [PATCH 12/18] added --output-dir and --overwrite - #144 --- src/scribe_data/cli/main.py | 26 +++----------------- src/scribe_data/cli/query.py | 46 +++++++++++++++++++++++++++++------- 2 files changed, 41 insertions(+), 31 deletions(-) diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py index 923c6f97c..ef05beb9b 100644 --- a/src/scribe_data/cli/main.py +++ b/src/scribe_data/cli/main.py @@ -25,9 +25,6 @@ from .list import list_wrapper from .query import query_data -def not_implemented(): - print("This command is not implemented yet.") - def main() -> None: parser = argparse.ArgumentParser(description='Scribe-Data CLI Tool') subparsers = parser.add_subparsers(dest='command', required=True) @@ -46,18 +43,8 @@ def main() -> None: query_parser.add_argument('--all', action='store_true', help='Query all data') query_parser.add_argument('-l', '--language', help='Language code') query_parser.add_argument('-wt', '--word-type', help='Word type') - query_parser.add_argument('-of', '--output-file', help='Output file') - query_parser.add_argument('-ot', '--output-type', help='Output type') - query_parser.add_argument('-ll', '--list-languages', action='store_true', help='List available language codes') - - # Poll command - poll_parser = subparsers.add_parser('poll', help='Check whether there is new data available') - - # Version command - version_parser = subparsers.add_parser('version', aliases=['v'], help='Show the version of the CLI tool') - - # Update command - update_parser = subparsers.add_parser('update', aliases=['u'], help='Update the CLI tool') + query_parser.add_argument('-od', '--output-dir', help='Output directory') + query_parser.add_argument('-o', '--overwrite', action='store_true', help='Overwrite existing files in output directory') args = parser.parse_args() @@ -66,16 +53,9 @@ def main() -> None: elif args.command in ['list-word-types', 'lwt']: list_wrapper(None, 
args.language) elif args.command in ['query', 'q']: - query_data(args.all, args.language, args.word_type) - elif args.command == 'poll': - not_implemented() - elif args.command in ['version', 'v']: - not_implemented() - elif args.command in ['update', 'u']: - not_implemented() + query_data(args.all, args.language, args.word_type, args.output_dir, args.overwrite) else: parser.print_help() if __name__ == '__main__': main() - diff --git a/src/scribe_data/cli/query.py b/src/scribe_data/cli/query.py index ed415a0cb..ddaec9ccd 100644 --- a/src/scribe_data/cli/query.py +++ b/src/scribe_data/cli/query.py @@ -4,19 +4,33 @@ DATA_DIR = Path('scribe_data_json_export') -def query_data(all_data: bool, language: str = None, word_type: str = None) -> None: +def query_data(all_data: bool, language: str = None, word_type: str = None, output_dir: str = None, overwrite: bool = False) -> None: if not (all_data or language or word_type): print("Error: You must provide at least one of --all, --language, or --word-type.") return + if output_dir: + output_dir = Path(output_dir).expanduser() # Ensure it's a Path object and expand user (~) + if output_dir.suffix: + print("Error: The output path should be a directory, not a file.") + return + + if not output_dir.exists(): + output_dir.mkdir(parents=True, exist_ok=True) # Create directories if they do not exist + elif not output_dir.is_dir(): + print(f"Error: {output_dir} is not a directory.") + return + else: + output_dir = None + if all_data: for lang in LANGUAGE_METADATA['languages']: lang_dir = DATA_DIR / lang['language'].capitalize() if lang_dir.is_dir(): for wt in lang_dir.glob('*.json'): - query_and_print_data(lang['language'], wt.stem) + query_and_print_data(lang['language'], wt.stem, output_dir, overwrite) elif language and word_type: - query_and_print_data(language, word_type) + query_and_print_data(language, word_type, output_dir, overwrite) elif language: normalized_language = LANGUAGE_MAP.get(language.lower()) if not normalized_language: @@ -29,16 +43,16 @@ def query_data(all_data: bool, language: str = None, word_type: str = None) -> N return for wt in language_dir.glob('*.json'): - query_and_print_data(normalized_language['language'], wt.stem) + query_and_print_data(normalized_language['language'], wt.stem, output_dir, overwrite) elif word_type: for lang in LANGUAGE_METADATA['languages']: lang_dir = DATA_DIR / lang['language'].capitalize() if lang_dir.is_dir(): wt_path = lang_dir / f"{word_type}.json" if wt_path.exists(): - query_and_print_data(lang['language'], word_type) + query_and_print_data(lang['language'], word_type, output_dir, overwrite) -def query_and_print_data(language: str, word_type: str) -> None: +def query_and_print_data(language: str, word_type: str, output_dir: Path, overwrite: bool) -> None: normalized_language = LANGUAGE_MAP.get(language.lower()) if not normalized_language: print(f"Language '{language}' is not recognized.") @@ -56,5 +70,21 @@ def query_and_print_data(language: str, word_type: str) -> None: print(f"Error reading '{data_file}': {e}") return - print(f"Data for language '{normalized_language['language']}' and word type '{word_type}':") - print_formatted_data(data, word_type) + if output_dir: + output_file = output_dir / f"{normalized_language['language']}_{word_type}.json" + if output_file.exists() and not overwrite: + user_input = input(f"File '{output_file}' already exists. Overwrite? 
(y/n): ") + if user_input.lower() != 'y': + print(f"Skipping {normalized_language['language']} - {word_type}") + return + + try: + with output_file.open('w') as file: + json.dump(data, file, indent=2) + except IOError as e: + print(f"Error writing to '{output_file}': {e}") + return + print(f"Data for language '{normalized_language['language']}' and word type '{word_type}' written to '{output_file}'") + else: + print(f"Data for language '{normalized_language['language']}' and word type '{word_type}':") + print_formatted_data(data, word_type) From afa4eefcaae2a6b0a4bbeb005107e6083550a5c5 Mon Sep 17 00:00:00 2001 From: Mahfuza Humayra Mohona Date: Thu, 20 Jun 2024 08:13:51 +0600 Subject: [PATCH 13/18] implementation of #146 --- src/scribe_data/cli/main.py | 14 ++-- src/scribe_data/cli/query.py | 150 +++++++++++++++++++++-------------- 2 files changed, 98 insertions(+), 66 deletions(-) diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py index ef05beb9b..d012d860c 100644 --- a/src/scribe_data/cli/main.py +++ b/src/scribe_data/cli/main.py @@ -39,12 +39,12 @@ def main() -> None: list_word_types_parser.add_argument('-l', '--language', help='Language code') # Query command - query_parser = subparsers.add_parser('query', aliases=['q'], help='Query data for a specific language and word type') - query_parser.add_argument('--all', action='store_true', help='Query all data') - query_parser.add_argument('-l', '--language', help='Language code') - query_parser.add_argument('-wt', '--word-type', help='Word type') - query_parser.add_argument('-od', '--output-dir', help='Output directory') - query_parser.add_argument('-o', '--overwrite', action='store_true', help='Overwrite existing files in output directory') + query_parser = subparsers.add_parser('query', help='Query data') + query_parser.add_argument('-l', '--language', type=str, help='Language for query') + query_parser.add_argument('-wt', '--word-type', type=str, help='Word type for query') + query_parser.add_argument('-od', '--output-dir', type=str, help='Output directory') + query_parser.add_argument('-o', '--overwrite', action='store_true', help='Overwrite existing files') + query_parser.add_argument('--output-type', type=str, choices=['json', 'csv', 'tsv'], help='Output file type') args = parser.parse_args() @@ -53,7 +53,7 @@ def main() -> None: elif args.command in ['list-word-types', 'lwt']: list_wrapper(None, args.language) elif args.command in ['query', 'q']: - query_data(args.all, args.language, args.word_type, args.output_dir, args.overwrite) + query_data(args.language, args.word_type, args.output_dir, args.overwrite, args.output_type) else: parser.print_help() diff --git a/src/scribe_data/cli/query.py b/src/scribe_data/cli/query.py index ddaec9ccd..d993eff15 100644 --- a/src/scribe_data/cli/query.py +++ b/src/scribe_data/cli/query.py @@ -1,58 +1,68 @@ import json +import csv from pathlib import Path -from .utils import LANGUAGE_METADATA, LANGUAGE_MAP, print_formatted_data +from typing import Optional +from .utils import LANGUAGE_METADATA, LANGUAGE_MAP DATA_DIR = Path('scribe_data_json_export') -def query_data(all_data: bool, language: str = None, word_type: str = None, output_dir: str = None, overwrite: bool = False) -> None: - if not (all_data or language or word_type): - print("Error: You must provide at least one of --all, --language, or --word-type.") +def query_data(language: str = None, word_type: str = None, output_dir: Optional[str] = None, overwrite: bool = False, output_type: Optional[str] = None) -> None: + if 
not (language and word_type): + print("Error: You must provide both --language (-l) and --word-type (-wt) options.") return if output_dir: - output_dir = Path(output_dir).expanduser() # Ensure it's a Path object and expand user (~) - if output_dir.suffix: - print("Error: The output path should be a directory, not a file.") - return - + output_dir = Path(output_dir) if not output_dir.exists(): - output_dir.mkdir(parents=True, exist_ok=True) # Create directories if they do not exist - elif not output_dir.is_dir(): - print(f"Error: {output_dir} is not a directory.") - return + output_dir.mkdir(parents=True, exist_ok=True) + + if output_type == 'json' or output_type is None: + export_json(language, word_type, output_dir, overwrite) + elif output_type in ['csv', 'tsv']: + export_csv_or_tsv(language, word_type, output_dir, overwrite, output_type) + else: + print("Error: Unsupported output type. Please use 'json', 'csv', or 'tsv'.") else: - output_dir = None - - if all_data: - for lang in LANGUAGE_METADATA['languages']: - lang_dir = DATA_DIR / lang['language'].capitalize() - if lang_dir.is_dir(): - for wt in lang_dir.glob('*.json'): - query_and_print_data(lang['language'], wt.stem, output_dir, overwrite) - elif language and word_type: - query_and_print_data(language, word_type, output_dir, overwrite) - elif language: - normalized_language = LANGUAGE_MAP.get(language.lower()) - if not normalized_language: - print(f"Language '{language}' is not recognized.") - return + print("Error: Please specify an output directory using --output-dir (-od).") + +def export_json(language: str, word_type: str, output_dir: Path, overwrite: bool) -> None: + normalized_language = LANGUAGE_MAP.get(language.lower()) + if not normalized_language: + print(f"Language '{language}' is not recognized.") + return + + data_file = DATA_DIR / normalized_language['language'].capitalize() / f"{word_type}.json" + if not data_file.exists(): + print(f"No data found for language '{normalized_language['language']}' and word type '{word_type}'.") + return + + try: + with data_file.open('r') as file: + data = json.load(file) + except (IOError, json.JSONDecodeError) as e: + print(f"Error reading '{data_file}': {e}") + return + + # Adjust the output directory for JSON exports + json_output_dir = output_dir / "scribe_data_json_export" / normalized_language['language'].capitalize() + json_output_dir.mkdir(parents=True, exist_ok=True) - language_dir = DATA_DIR / normalized_language['language'].capitalize() - if not language_dir.exists() or not language_dir.is_dir(): - print(f"No data found for language '{normalized_language['language']}'.") + output_file = json_output_dir / f"{word_type}.json" + if output_file.exists() and not overwrite: + user_input = input(f"File '{output_file}' already exists. Overwrite? 
(y/n): ") + if user_input.lower()!= 'y': + print(f"Skipping {normalized_language['language']} - {word_type}") return - for wt in language_dir.glob('*.json'): - query_and_print_data(normalized_language['language'], wt.stem, output_dir, overwrite) - elif word_type: - for lang in LANGUAGE_METADATA['languages']: - lang_dir = DATA_DIR / lang['language'].capitalize() - if lang_dir.is_dir(): - wt_path = lang_dir / f"{word_type}.json" - if wt_path.exists(): - query_and_print_data(lang['language'], word_type, output_dir, overwrite) - -def query_and_print_data(language: str, word_type: str, output_dir: Path, overwrite: bool) -> None: + try: + with output_file.open('w') as file: + json.dump(data, file, indent=2) + except IOError as e: + print(f"Error writing to '{output_file}': {e}") + return + print(f"Data for language '{normalized_language['language']}' and word type '{word_type}' written to '{output_file}'") + +def export_csv_or_tsv(language: str, word_type: str, output_dir: Path, overwrite: bool, output_type: str) -> None: normalized_language = LANGUAGE_MAP.get(language.lower()) if not normalized_language: print(f"Language '{language}' is not recognized.") @@ -70,21 +80,43 @@ def query_and_print_data(language: str, word_type: str, output_dir: Path, overwr print(f"Error reading '{data_file}': {e}") return - if output_dir: - output_file = output_dir / f"{normalized_language['language']}_{word_type}.json" - if output_file.exists() and not overwrite: - user_input = input(f"File '{output_file}' already exists. Overwrite? (y/n): ") - if user_input.lower() != 'y': - print(f"Skipping {normalized_language['language']} - {word_type}") - return - - try: - with output_file.open('w') as file: - json.dump(data, file, indent=2) - except IOError as e: - print(f"Error writing to '{output_file}': {e}") - return - print(f"Data for language '{normalized_language['language']}' and word type '{word_type}' written to '{output_file}'") + if output_type == 'csv': + delimiter = ',' + file_extension = 'csv' + elif output_type == 'tsv': + delimiter = '\t' + file_extension = 'tsv' else: - print(f"Data for language '{normalized_language['language']}' and word type '{word_type}':") - print_formatted_data(data, word_type) + print(f"Unsupported output type '{output_type}'.") + return + + # Adjust the output directory for CSV exports + csv_output_dir = output_dir / "scribe_data_csv_export" / normalized_language['language'].capitalize() + csv_output_dir.mkdir(parents=True, exist_ok=True) + + output_file = csv_output_dir / f"{word_type}.{file_extension}" + if output_file.exists() and not overwrite: + user_input = input(f"File '{output_file}' already exists. Overwrite? 
(y/n): ") + if user_input.lower()!= 'y': + print(f"Skipping {normalized_language['language']} - {word_type}") + return + + try: + with output_file.open('w', newline='', encoding='utf-8') as file: + writer = csv.writer(file, delimiter=delimiter) + if isinstance(data, dict): + for key, value in data.items(): + writer.writerow([key, value]) + elif isinstance(data, list): + for item in data: + if isinstance(item, dict): + writer.writerow(item.values()) + else: + writer.writerow([item]) + else: + print(f"Unsupported data format for {output_type} export.") + except IOError as e: + print(f"Error writing to '{output_file}': {e}") + return + + print(f"Data for language '{normalized_language['language']}' and word type '{word_type}' written to '{output_file}'") From 6958366327e07aac4bd60b6f4f4ce788e2d854b9 Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Sat, 22 Jun 2024 19:52:26 +0200 Subject: [PATCH 14/18] Update CLI structure + refactoring --- .../workflows/pr_maintainer_checklist.yaml | 2 + src/scribe_data/cli/cli_utils.py | 121 ++++++++ src/scribe_data/cli/convert.py | 21 ++ src/scribe_data/cli/list.py | 260 ++++++++++-------- src/scribe_data/cli/main.py | 149 ++++++++-- src/scribe_data/cli/query.py | 149 +++++++--- src/scribe_data/cli/utils.py | 60 ---- .../resources/language_meta_data.json | 60 +++- src/scribe_data/wikipedia/extract_wiki.py | 2 +- 9 files changed, 578 insertions(+), 246 deletions(-) create mode 100644 src/scribe_data/cli/cli_utils.py create mode 100644 src/scribe_data/cli/convert.py delete mode 100644 src/scribe_data/cli/utils.py diff --git a/.github/workflows/pr_maintainer_checklist.yaml b/.github/workflows/pr_maintainer_checklist.yaml index ed33f7373..cd300ca5a 100644 --- a/.github/workflows/pr_maintainer_checklist.yaml +++ b/.github/workflows/pr_maintainer_checklist.yaml @@ -32,4 +32,6 @@ jobs: - The contributor's name and icon in remote commits should be the same as what appears in the PR - If there's a mismatch, the contributor needs to make sure that the [email they use for GitHub](https://github.com/settings/emails) matches what they have for `git config user.email` in their local Scribe-Data repo + - [ ] The linting and formatting workflow within the [PR checks](https://github.com/scribe-org/Scribe-Data/pull/${{ github.event.pull_request.number }}/checks) do not indicate new errors in the files changed + - [ ] The [CHANGELOG](https://github.com/scribe-org/Scribe-Data/blob/main/CHANGELOG.md) has been updated with a description of the changes for the upcoming release and the corresponding issue (if necessary) diff --git a/src/scribe_data/cli/cli_utils.py b/src/scribe_data/cli/cli_utils.py new file mode 100644 index 000000000..18812015b --- /dev/null +++ b/src/scribe_data/cli/cli_utils.py @@ -0,0 +1,121 @@ +""" +Utility functions for the Scribe-Data CLI. + +.. raw:: html + +""" + +import json +from pathlib import Path +from typing import Dict, List, Union + +METADATA_FILE = Path(__file__).parent.parent / "resources" / "language_meta_data.json" +DATA_DIR = Path("scribe_data_json_export") + +with METADATA_FILE.open("r", encoding="utf-8") as file: + language_metadata = json.load(file) + +language_map = { + lang["language"].lower(): lang for lang in language_metadata["languages"] +} + + +def correct_word_type(word_type: str) -> str: + """ + Corrects common versions of word type arguments so users can choose between them. + + Parameters + ---------- + word_type : str + The word type to potentially correct. 
+
+
+def print_formatted_data(data: Union[Dict, List], word_type: str) -> None:
+    """
+    Prints a formatted output from the Scribe-Data CLI.
+    """
+    if not data:
+        print(f"No data available for word type '{word_type}'.")
+        return
+
+    max_key_length = max((len(key) for key in data.keys()), default=0)
+
+    if word_type == "autosuggestions":
+        for key, value in data.items():
+            print(f"{key:<{max_key_length}} : {', '.join(value)}")
+
+    elif word_type == "emoji_keywords":
+        for key, value in data.items():
+            emojis = [item["emoji"] for item in value]
+            print(f"{key:<{max_key_length}} : {' '.join(emojis)}")
+
+    elif word_type in {"prepositions", "translations"}:
+        for key, value in data.items():
+            print(f"{key:<{max_key_length}} : {value}")
+
+    elif isinstance(data, dict):
+        for key, value in data.items():
+            if isinstance(value, dict):
+                print(f"{key:<{max_key_length}} : ")
+                max_sub_key_length = max(
+                    (len(sub_key) for sub_key in value.keys()), default=0
+                )
+                for sub_key, sub_value in value.items():
+                    print(f"  {sub_key:<{max_sub_key_length}} : {sub_value}")
+
+            elif isinstance(value, list):
+                print(f"{key:<{max_key_length}} : ")
+                for item in value:
+                    if isinstance(item, dict):
+                        for sub_key, sub_value in item.items():
+                            print(f"  {sub_key:<{max_key_length}} : {sub_value}")
+
+                    else:
+                        print(f"  {item}")
+
+            else:
+                print(f"{key:<{max_key_length}} : {value}")
+
+    elif isinstance(data, list):
+        for item in data:
+            if isinstance(item, dict):
+                for key, value in item.items():
+                    print(f"{key} : {value}")
+
+            else:
+                print(item)
+
+    else:
+        print(data)
diff --git a/src/scribe_data/cli/convert.py b/src/scribe_data/cli/convert.py
new file mode 100644
index 000000000..cb7a3fc8a
--- /dev/null
+++ b/src/scribe_data/cli/convert.py
@@ -0,0 +1,21 @@
+"""
+Functions to convert data returned from the Scribe-Data CLI to other file types.
+
+.. raw:: html
+
+"""
diff --git a/src/scribe_data/cli/list.py b/src/scribe_data/cli/list.py
index 0656a067a..8f78a84db 100644
--- a/src/scribe_data/cli/list.py
+++ b/src/scribe_data/cli/list.py
@@ -1,153 +1,187 @@
+"""
+Functions for listing languages and word types for the Scribe-Data CLI.
+
+.. raw:: html
+
+"""
+
 import json
 from pathlib import Path
-from typing import Dict, List, Union
-
-# Load language metadata from JSON file
-METADATA_FILE = Path(__file__).parent.parent / 'resources' / 'language_meta_data.json'
-
-def load_language_metadata() -> Dict:
-    with METADATA_FILE.open('r', encoding='utf-8') as file:
-        return json.load(file)
-
-LANGUAGE_METADATA = load_language_metadata()
-LANGUAGE_MAP = {lang['language'].lower(): lang for lang in LANGUAGE_METADATA['languages']}
-
-DATA_DIR = Path('scribe_data_json_export')
-
-def print_formatted_data(data: Union[Dict, List], word_type: str) -> None:
-    if not data:
-        print(f"No data available for word type '{word_type}'.")
-        return
-
-    if word_type == 'autosuggestions':
-        max_key_length = max((len(key) for key in data.keys()), default=0)
-        for key, value in data.items():
-            print(f"{key:<{max_key_length}} : {', '.join(value)}")
-    elif word_type == 'emoji_keywords':
-        max_key_length = max((len(key) for key in data.keys()), default=0)
-        for key, value in data.items():
-            emojis = [item['emoji'] for item in value]
-            print(f"{key:<{max_key_length}} : {' '.join(emojis)}")
-    elif word_type in ['prepositions', 'translations']:
-        max_key_length = max((len(key) for key in data.keys()), default=0)
-        for key, value in data.items():
-            print(f"{key:<{max_key_length}} : {value}")
-    else:
-        if isinstance(data, dict):
-            max_key_length = max((len(key) for key in data.keys()), default=0)
-            for key, value in data.items():
-                if isinstance(value, dict):
-                    print(f"{key:<{max_key_length}} : ")
-                    max_sub_key_length = max((len(sub_key) for sub_key in value.keys()), default=0)
-                    for sub_key, sub_value in value.items():
-                        print(f"  {sub_key:<{max_sub_key_length}} : {sub_value}")
-                elif isinstance(value, list):
-                    print(f"{key:<{max_key_length}} : ")
-                    for item in value:
-                        if isinstance(item, dict):
-                            for sub_key, sub_value in item.items():
-                                print(f"  {sub_key:<{max_key_length}} : {sub_value}")
-                        else:
-                            print(f"  {item}")
-                else:
-                    print(f"{key:<{max_key_length}} : {value}")
-        elif isinstance(data, list):
-            for item in data:
-                if isinstance(item, dict):
-                    for key, value in item.items():
-                        print(f"{key} : {value}")
-                else:
-                    print(item)
-        else:
-            print(data)
+
+from .cli_utils import correct_word_type
+
+# Load language metadata from JSON file.
+METADATA_FILE = Path(__file__).parent.parent / "resources" / "language_meta_data.json"
+LANGUAGE_DATA_EXTRACTION_DIR = Path(__file__).parent.parent / "language_data_extraction"
+
+with METADATA_FILE.open("r", encoding="utf-8") as file:
+    language_metadata = json.load(file)
+
+language_map = {
+    lang["language"].lower(): lang for lang in language_metadata["languages"]
+}
+
 
 def list_languages() -> None:
+    """
+    Generates a table of languages, their ISO-2 codes and their Wikidata QIDs.
+    """
+    languages = list(language_metadata["languages"])
+    languages.sort(key=lambda x: x["language"])
-    languages = [lang for lang in LANGUAGE_METADATA['languages']]
-    languages.sort(key=lambda x: x['language'])
 
-    # Define column widths
-    language_col_width = max(len(lang['language']) for lang in languages) + 2
-    iso_col_width = 5  # Length of "ISO" column header + padding
-    qid_col_width = 5  # Length of "QID" column header + padding
+    language_col_width = max(len(lang["language"]) for lang in languages) + 2
+    iso_col_width = max(len(lang["iso"]) for lang in languages) + 2
+    qid_col_width = max(len(lang["qid"]) for lang in languages) + 2
 
-    print(f"{'Language':<{language_col_width}} {'ISO':<{iso_col_width}} {'QID':<{qid_col_width}}")
-    print('-' * (language_col_width + iso_col_width + qid_col_width))
+    table_line_length = language_col_width + iso_col_width + qid_col_width
+
+    print()
+    print(
+        f"{'Language':<{language_col_width}} {'ISO':<{iso_col_width}} {'QID':<{qid_col_width}}"
+    )
+    print("-" * table_line_length)
 
     for lang in languages:
-        print(f"{lang['language'].capitalize():<{language_col_width}} {lang['iso']:<{iso_col_width}} {lang['qid']:<{qid_col_width}}")
+        print(
+            f"{lang['language'].capitalize():<{language_col_width}} {lang['iso']:<{iso_col_width}} {lang['qid']:<{qid_col_width}}"
+        )
+
+    print("-" * table_line_length)
+    print()
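
With the eight languages currently in the metadata file, the table this prints comes out roughly as below; the column widths are computed from the data, so the exact spacing can differ:

    Language     ISO  QID
    -----------------------
    English      en   Q1860
    French       fr   Q150
    German       de   Q188
    ...
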
+ """ + languages = list(language_metadata["languages"]) + languages.sort(key=lambda x: x["language"]) - # Define column widths - language_col_width = max(len(lang['language']) for lang in languages) + 2 - iso_col_width = 5 # Length of "ISO" column header + padding - qid_col_width = 5 # Length of "QID" column header + padding + language_col_width = max(len(lang["language"]) for lang in languages) + 2 + iso_col_width = max(len(lang["iso"]) for lang in languages) + 2 + qid_col_width = max(len(lang["qid"]) for lang in languages) + 2 - print(f"{'Language':<{language_col_width}} {'ISO':<{iso_col_width}} {'QID':<{qid_col_width}}") - print('-' * (language_col_width + iso_col_width + qid_col_width)) + table_line_length = language_col_width + iso_col_width + qid_col_width + + print() + print( + f"{'Language':<{language_col_width}} {'ISO':<{iso_col_width}} {'QID':<{qid_col_width}}" + ) + print("-" * table_line_length) for lang in languages: - print(f"{lang['language'].capitalize():<{language_col_width}} {lang['iso']:<{iso_col_width}} {lang['qid']:<{qid_col_width}}") + print( + f"{lang['language'].capitalize():<{language_col_width}} {lang['iso']:<{iso_col_width}} {lang['qid']:<{qid_col_width}}" + ) + + print("-" * table_line_length) + print() + def list_word_types(language: str = None) -> None: + """ + Lists all word types or those available for a given language. + + Parameters + ---------- + language : str + The language to potentially list word types for. + """ if language: - normalized_language = LANGUAGE_MAP.get(language.lower()) - if not normalized_language: - print(f"Language '{language}' is not recognized.") - return + language_data = language_map.get(language.lower()) + language_capitalized = language.capitalize() + language_dir = LANGUAGE_DATA_EXTRACTION_DIR / language_capitalized - language_dir = DATA_DIR / normalized_language['language'].capitalize() - if not language_dir.exists() or not language_dir.is_dir(): - print(f"No data found for language '{normalized_language['language']}'.") - return + if not language_data: + raise ValueError(f"Language '{language}' is not recognized.") - word_types = [wt.stem for wt in language_dir.glob('*.json')] + word_types = [f.name for f in language_dir.iterdir() if f.is_dir()] if not word_types: - print(f"No word types available for language '{normalized_language['language']}'.") - return + raise ValueError( + f"No word types available for language '{language_capitalized}'." 
+ ) + + table_header = f"Available word types: {language_capitalized}" - word_types = sorted(word_types) - print(f"Word types for language '{normalized_language['language']}':") - for wt in word_types: - print(f" - {wt}") else: word_types = set() - for lang in LANGUAGE_METADATA['languages']: - language_dir = DATA_DIR / lang['language'].capitalize() + for lang in language_metadata["languages"]: + language_dir = LANGUAGE_DATA_EXTRACTION_DIR / lang["language"].capitalize() if language_dir.is_dir(): - word_types.update(wt.stem for wt in language_dir.glob('*.json')) + word_types.update(f.name for f in language_dir.iterdir() if f.is_dir()) - if not word_types: - print("No word types available.") - return + table_header = "Available word types: All languages" + + table_line_length = max(len(table_header), max(len(wt) for wt in word_types)) + + print() + print(table_header) + print("-" * table_line_length) + + word_types = sorted(word_types) + for wt in word_types: + print(wt) + + print("-" * table_line_length) + print() - word_types = sorted(word_types) - print("Available word types:") - for wt in word_types: - print(f" - {wt}") def list_all() -> None: + """ + Lists all available languages and word types. + """ list_languages() - print() list_word_types() + def list_languages_for_word_type(word_type: str) -> None: + """ + Lists the available languages for a given word type. + + Parameters + ---------- + word_type : str + The word type to check for. + """ + word_type = correct_word_type(word_type) available_languages = [] - for lang in LANGUAGE_METADATA['languages']: - language_dir = DATA_DIR / lang['language'].capitalize() + for lang in language_metadata["languages"]: + language_dir = LANGUAGE_DATA_EXTRACTION_DIR / lang["language"].capitalize() if language_dir.is_dir(): - wt_path = language_dir / f"{word_type}.json" + wt_path = language_dir / word_type if wt_path.exists(): - available_languages.append(lang['language']) - - if not available_languages: - print(f"No languages found with word type '{word_type}'.") - return + available_languages.append(lang["language"]) available_languages.sort() - print(f"Languages with word type '{word_type}':") + table_header = f"Available languages: {word_type}" + table_line_length = max( + len(table_header), max(len(lang) for lang in available_languages) + ) + + print() + print(table_header) + print("-" * table_line_length) + for lang in available_languages: - print(f"- {lang.capitalize()}") + print(f"{lang.capitalize()}") + + print("-" * table_line_length) + print() + def list_wrapper(language: str = None, word_type: str = None) -> None: - if language is None and word_type is None: + """ + Conditionally provides the full functionality of the list command. + + Parameters + ---------- + language : str + The language to potentially list word types for. + + word_type : str + The word type to check for. 
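
In effect, list_wrapper maps the list command's flag combinations onto the helpers above. Assuming the installed console script is named scribe-data (the entry point name itself is not shown in these patches), the dispatch works out roughly to:

    scribe-data list                      # list_all()
    scribe-data list --language           # list_languages()
    scribe-data list --word-type          # list_word_types()
    scribe-data list --language english   # list_word_types("english")
    scribe-data list --word-type nouns    # list_languages_for_word_type("nouns")
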
+ """ + if not language and not word_type: list_all() - elif language is True and word_type is None: + + elif language is True and not word_type: list_languages() - elif language is None and word_type is True: + + elif not language and word_type is True: list_word_types() + elif language is True and word_type is True: - print("Please specify both a language and a word type.") - elif language is True and word_type is not None: + print("Please specify either a language or a word type.") + + elif word_type is not None: list_languages_for_word_type(word_type) - elif language is not None and word_type is True: - list_word_types(language) - elif language is not None and word_type is not None: + + elif language is not None: list_word_types(language) diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py index d012d860c..32bd40a6d 100644 --- a/src/scribe_data/cli/main.py +++ b/src/scribe_data/cli/main.py @@ -22,40 +22,139 @@ #!/usr/bin/env python3 import argparse + from .list import list_wrapper from .query import query_data +LIST_DESCRIPTION = "List languages and word types that Scribe-Data can be used for." +QUERY_DESCRIPTION = "Query data from Wikidata for given languages and word types." +CONVERT_DESCRIPTION = "Convert data returned by Scribe-Data to different file types." +CLI_EPILOG = "Visit the codebase at https://github.com/scribe-org/Scribe-Data and documentation at https://scribe-data.readthedocs.io/en/latest/ to learn more!" + + def main() -> None: - parser = argparse.ArgumentParser(description='Scribe-Data CLI Tool') - subparsers = parser.add_subparsers(dest='command', required=True) - - # List command - list_parser = subparsers.add_parser('list', help='List languages and word types') - list_parser.add_argument('--language', '-l', nargs='?', const=True, help='List all languages or filter by language code') - list_parser.add_argument('--word-type', '-wt', nargs='?', const=True, help='List all word types or filter by word type') - - # List word types command - list_word_types_parser = subparsers.add_parser('list-word-types', aliases=['lwt'], help='List available word types') - list_word_types_parser.add_argument('-l', '--language', help='Language code') - - # Query command - query_parser = subparsers.add_parser('query', help='Query data') - query_parser.add_argument('-l', '--language', type=str, help='Language for query') - query_parser.add_argument('-wt', '--word-type', type=str, help='Word type for query') - query_parser.add_argument('-od', '--output-dir', type=str, help='Output directory') - query_parser.add_argument('-o', '--overwrite', action='store_true', help='Overwrite existing files') - query_parser.add_argument('--output-type', type=str, choices=['json', 'csv', 'tsv'], help='Output file type') + parser = argparse.ArgumentParser( + prog="Scribe-Data", + description="The Scribe-Data CLI is a tool to query language data from Wikidata and other sources.", + epilog=CLI_EPILOG, + ) + subparsers = parser.add_subparsers(dest="command", required=True) + + parser._actions[0].help = "Show this help message and exit." + parser.add_argument( + "-v", "--verbose", help="Increase output verbosity.", action="store_true" + ) + parser.add_argument("-u", "--update", help="Update the Scribe-Data CLI.") + + # MARK: List + list_parser = subparsers.add_parser( + "list", + aliases=["l"], + help=LIST_DESCRIPTION, + description=LIST_DESCRIPTION, + epilog=CLI_EPILOG, + ) + list_parser._actions[0].help = "Show this help message and exit." 
+ list_parser.add_argument( + "--language", + "-lang", + nargs="?", + const=True, + help="Run list command on all or given languages.", + ) + list_parser.add_argument( + "--word-type", + "-wt", + nargs="?", + const=True, + help="Run list command on all or given word types.", + ) + + # MARK: Query + query_parser = subparsers.add_parser( + "query", + aliases=["q"], + help=QUERY_DESCRIPTION, + description=QUERY_DESCRIPTION, + epilog=CLI_EPILOG, + ) + query_parser._actions[0].help = "Show this help message and exit." + query_parser.add_argument( + "-lang", "--language", type=str, help="The language(s) to query." + ) + query_parser.add_argument( + "-wt", "--word-type", type=str, help="The word type(s) to query." + ) + query_parser.add_argument( + "-od", "--output-dir", type=str, help="The output directory path for results." + ) + query_parser.add_argument( + "-ot", + "--output-type", + type=str, + choices=["json", "csv", "tsv"], + help="The output file type.", + ) + query_parser.add_argument( + "-o", + "--overwrite", + action="store_true", + help="Whether to overwrite existing files (default: False).", + ) + + # MARK: Convert + convert_parser = subparsers.add_parser( + "convert", + aliases=["c"], + help=CONVERT_DESCRIPTION, + description=CONVERT_DESCRIPTION, + epilog=CLI_EPILOG, + ) + convert_parser._actions[0].help = "Show this help message and exit." + convert_parser.add_argument( + "-f", "--file", type=str, help="The file to convert to a new type." + ) + convert_parser.add_argument( + "-ko", + "--keep-original", + action="store_false", + help="Whether to keep the file to be converted (default: True).", + ) + convert_parser.add_argument( + "-json", "--to-json", type=str, help="Convert the file to JSON format." + ) + convert_parser.add_argument( + "-csv", "--to-csv", type=str, help="Convert the file to CSV format." + ) + convert_parser.add_argument( + "-tsv", "--to-tsv", type=str, help="Convert the file to TSV format." + ) + convert_parser.add_argument( + "-sqlite", "--to-sqlite", type=str, help="Convert the file to SQLite format." + ) + + # MARK: Setup CLI args = parser.parse_args() - if args.command == 'list': + if args.command in ["list", "l"]: list_wrapper(args.language, args.word_type) - elif args.command in ['list-word-types', 'lwt']: - list_wrapper(None, args.language) - elif args.command in ['query', 'q']: - query_data(args.language, args.word_type, args.output_dir, args.overwrite, args.output_type) + + elif args.command in ["query", "q"]: + query_data( + args.language, + args.word_type, + args.output_dir, + args.overwrite, + args.output_type, + ) + + elif args.command in ["convert", "c"]: + return + else: parser.print_help() -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/src/scribe_data/cli/query.py b/src/scribe_data/cli/query.py index d993eff15..b0c8334a0 100644 --- a/src/scribe_data/cli/query.py +++ b/src/scribe_data/cli/query.py @@ -1,122 +1,193 @@ -import json +""" +Functions for querying languages-word types packs for the Scribe-Data CLI. + +.. 
raw:: html + +""" + import csv +import json from pathlib import Path from typing import Optional -from .utils import LANGUAGE_METADATA, LANGUAGE_MAP -DATA_DIR = Path('scribe_data_json_export') +from .cli_utils import language_map -def query_data(language: str = None, word_type: str = None, output_dir: Optional[str] = None, overwrite: bool = False, output_type: Optional[str] = None) -> None: - if not (language and word_type): - print("Error: You must provide both --language (-l) and --word-type (-wt) options.") - return +DATA_DIR = Path("scribe_data_json_export") + + +def query_data( + language: str = None, + word_type: str = None, + output_dir: Optional[str] = None, + overwrite: bool = False, + output_type: Optional[str] = None, +) -> None: + if not (language or word_type): + raise ValueError( + "You must provide either a --language (-l) or --word-type (-wt) option." + ) if output_dir: output_dir = Path(output_dir) if not output_dir.exists(): output_dir.mkdir(parents=True, exist_ok=True) - if output_type == 'json' or output_type is None: + if output_type == "json" or output_type is None: export_json(language, word_type, output_dir, overwrite) - elif output_type in ['csv', 'tsv']: + + elif output_type in ["csv", "tsv"]: export_csv_or_tsv(language, word_type, output_dir, overwrite, output_type) + else: - print("Error: Unsupported output type. Please use 'json', 'csv', or 'tsv'.") + raise ValueError( + "Unsupported output type. Please use 'json', 'csv', or 'tsv'." + ) + else: - print("Error: Please specify an output directory using --output-dir (-od).") + raise ValueError("Please specify an output directory using --output-dir (-od).") + -def export_json(language: str, word_type: str, output_dir: Path, overwrite: bool) -> None: - normalized_language = LANGUAGE_MAP.get(language.lower()) +def export_json( + language: str, word_type: str, output_dir: Path, overwrite: bool +) -> None: + normalized_language = language_map.get(language.lower()) + language_capitalized = language.capitalize() if not normalized_language: - print(f"Language '{language}' is not recognized.") - return + raise ValueError(f"Language '{language_capitalized}' is not recognized.") + + data_file = ( + DATA_DIR / normalized_language["language"].capitalize() / f"{word_type}.json" + ) - data_file = DATA_DIR / normalized_language['language'].capitalize() / f"{word_type}.json" if not data_file.exists(): - print(f"No data found for language '{normalized_language['language']}' and word type '{word_type}'.") + print( + f"No data found for language '{normalized_language['language']}' and word type '{word_type}'." + ) return try: - with data_file.open('r') as file: + with data_file.open("r") as file: data = json.load(file) + except (IOError, json.JSONDecodeError) as e: print(f"Error reading '{data_file}': {e}") return # Adjust the output directory for JSON exports - json_output_dir = output_dir / "scribe_data_json_export" / normalized_language['language'].capitalize() + json_output_dir = ( + output_dir + / "scribe_data_json_export" + / normalized_language["language"].capitalize() + ) json_output_dir.mkdir(parents=True, exist_ok=True) output_file = json_output_dir / f"{word_type}.json" if output_file.exists() and not overwrite: user_input = input(f"File '{output_file}' already exists. Overwrite? 
(y/n): ") - if user_input.lower()!= 'y': + if user_input.lower() != "y": print(f"Skipping {normalized_language['language']} - {word_type}") return try: - with output_file.open('w') as file: + with output_file.open("w") as file: json.dump(data, file, indent=2) except IOError as e: - print(f"Error writing to '{output_file}': {e}") - return - print(f"Data for language '{normalized_language['language']}' and word type '{word_type}' written to '{output_file}'") + raise IOError(f"Error writing to '{output_file}': {e}") from e -def export_csv_or_tsv(language: str, word_type: str, output_dir: Path, overwrite: bool, output_type: str) -> None: - normalized_language = LANGUAGE_MAP.get(language.lower()) + print( + f"Data for language '{normalized_language['language']}' and word type '{word_type}' written to '{output_file}'" + ) + + +def export_csv_or_tsv( + language: str, word_type: str, output_dir: Path, overwrite: bool, output_type: str +) -> None: + normalized_language = language_map.get(language.lower()) if not normalized_language: print(f"Language '{language}' is not recognized.") return - data_file = DATA_DIR / normalized_language['language'].capitalize() / f"{word_type}.json" + data_file = ( + DATA_DIR / normalized_language["language"].capitalize() / f"{word_type}.json" + ) if not data_file.exists(): - print(f"No data found for language '{normalized_language['language']}' and word type '{word_type}'.") + print( + f"No data found for language '{normalized_language['language']}' and word type '{word_type}'." + ) return try: - with data_file.open('r') as file: + with data_file.open("r") as file: data = json.load(file) + except (IOError, json.JSONDecodeError) as e: print(f"Error reading '{data_file}': {e}") return - if output_type == 'csv': - delimiter = ',' - file_extension = 'csv' - elif output_type == 'tsv': - delimiter = '\t' - file_extension = 'tsv' + if output_type == "csv": + delimiter = "," + file_extension = "csv" + + elif output_type == "tsv": + delimiter = "\t" + file_extension = "tsv" + else: print(f"Unsupported output type '{output_type}'.") return # Adjust the output directory for CSV exports - csv_output_dir = output_dir / "scribe_data_csv_export" / normalized_language['language'].capitalize() + csv_output_dir = ( + output_dir + / "scribe_data_csv_export" + / normalized_language["language"].capitalize() + ) csv_output_dir.mkdir(parents=True, exist_ok=True) output_file = csv_output_dir / f"{word_type}.{file_extension}" if output_file.exists() and not overwrite: user_input = input(f"File '{output_file}' already exists. Overwrite? 
(y/n): ") - if user_input.lower()!= 'y': + if user_input.lower() != "y": print(f"Skipping {normalized_language['language']} - {word_type}") return try: - with output_file.open('w', newline='', encoding='utf-8') as file: + with output_file.open("w", newline="", encoding="utf-8") as file: writer = csv.writer(file, delimiter=delimiter) if isinstance(data, dict): for key, value in data.items(): writer.writerow([key, value]) + elif isinstance(data, list): for item in data: if isinstance(item, dict): writer.writerow(item.values()) + else: writer.writerow([item]) + else: print(f"Unsupported data format for {output_type} export.") + except IOError as e: print(f"Error writing to '{output_file}': {e}") return - print(f"Data for language '{normalized_language['language']}' and word type '{word_type}' written to '{output_file}'") + print( + f"Data for language '{normalized_language['language']}' and word type '{word_type}' written to '{output_file}'" + ) diff --git a/src/scribe_data/cli/utils.py b/src/scribe_data/cli/utils.py deleted file mode 100644 index f38086043..000000000 --- a/src/scribe_data/cli/utils.py +++ /dev/null @@ -1,60 +0,0 @@ -import json -from pathlib import Path -from typing import Dict, List, Union - -# Load language metadata from JSON file -METADATA_FILE = Path(__file__).parent.parent / 'resources' / 'language_meta_data.json' - -def load_language_metadata() -> Dict: - with METADATA_FILE.open('r', encoding='utf-8') as file: - return json.load(file) - -LANGUAGE_METADATA = load_language_metadata() -LANGUAGE_MAP = {lang['language'].lower(): lang for lang in LANGUAGE_METADATA['languages']} - -def print_formatted_data(data: Union[Dict, List], word_type: str) -> None: - if not data: - print(f"No data available for word type '{word_type}'.") - return - - if word_type == 'autosuggestions': - max_key_length = max((len(key) for key in data.keys()), default=0) - for key, value in data.items(): - print(f"{key:<{max_key_length}} : {', '.join(value)}") - elif word_type == 'emoji_keywords': - max_key_length = max((len(key) for key in data.keys()), default=0) - for key, value in data.items(): - emojis = [item['emoji'] for item in value] - print(f"{key:<{max_key_length}} : {' '.join(emojis)}") - elif word_type in ['prepositions', 'translations']: - max_key_length = max((len(key) for key in data.keys()), default=0) - for key, value in data.items(): - print(f"{key:<{max_key_length}} : {value}") - else: - if isinstance(data, dict): - max_key_length = max((len(key) for key in data.keys()), default=0) - for key, value in data.items(): - if isinstance(value, dict): - print(f"{key:<{max_key_length}} : ") - max_sub_key_length = max((len(sub_key) for sub_key in value.keys()), default=0) - for sub_key, sub_value in value.items(): - print(f" {sub_key:<{max_sub_key_length}} : {sub_value}") - elif isinstance(value, list): - print(f"{key:<{max_key_length}} : ") - for item in value: - if isinstance(item, dict): - for sub_key, sub_value in item.items(): - print(f" {sub_key:<{max_key_length}} : {sub_value}") - else: - print(f" {item}") - else: - print(f"{key:<{max_key_length}} : {value}") - elif isinstance(data, list): - for item in data: - if isinstance(item, dict): - for key, value in item.items(): - print(f"{key} : {value}") - else: - print(item) - else: - print(data) diff --git a/src/scribe_data/resources/language_meta_data.json b/src/scribe_data/resources/language_meta_data.json index 88ba732e1..7ef4faf60 100755 --- a/src/scribe_data/resources/language_meta_data.json +++ 
b/src/scribe_data/resources/language_meta_data.json @@ -17,7 +17,7 @@ "qid": "Q1860", "remove-words": ["of", "the", "The", "and"], "ignore-words": [], - "word-types": ["nouns", "verbs", "translated_words"] + "word-types": ["nouns", "verbs", "translations"] }, { "language": "french", @@ -25,7 +25,13 @@ "qid": "Q150", "remove-words": ["of", "the", "The", "and"], "ignore-words": ["XXe"], - "word-types": ["nouns", "verbs", "translations", "emoji_keywords", "prepositions", "autosuggestions"] + "word-types": [ + "nouns", + "verbs", + "translations", + "emoji_keywords", + "autosuggestions" + ] }, { "language": "german", @@ -33,7 +39,14 @@ "qid": "Q188", "remove-words": ["of", "the", "The", "and", "NeinJa", "et", "redirect"], "ignore-words": ["Gemeinde", "Familienname"], - "word-types": ["nouns", "verbs", "translations", "emoji_keywords", "prepositions", "autosuggestions"] + "word-types": [ + "nouns", + "verbs", + "translations", + "emoji_keywords", + "prepositions", + "autosuggestions" + ] }, { "language": "italian", @@ -41,7 +54,13 @@ "qid": "Q652", "remove-words": ["of", "the", "The", "and", "text", "from"], "ignore-words": ["The", "ATP"], - "word-types": ["nouns", "verbs", "translations", "emoji_keywords", "prepositions", "autosuggestions"] + "word-types": [ + "nouns", + "verbs", + "translations", + "emoji_keywords", + "autosuggestions" + ] }, { "language": "portuguese", @@ -49,7 +68,13 @@ "qid": "Q5146", "remove-words": ["of", "the", "The", "and", "jbutadptflora"], "ignore-words": [], - "word-types": ["nouns", "verbs", "translations", "emoji_keywords", "prepositions", "autosuggestions"] + "word-types": [ + "nouns", + "verbs", + "translations", + "emoji_keywords", + "autosuggestions" + ] }, { "language": "russian", @@ -57,7 +82,14 @@ "qid": "Q7737", "remove-words": ["of", "the", "The", "and"], "ignore-words": [], - "word-types": ["nouns", "verbs", "translated_words", "translations", "emoji_keywords", "prepositions", "autosuggestions"] + "word-types": [ + "nouns", + "verbs", + "translations", + "emoji_keywords", + "prepositions", + "autosuggestions" + ] }, { "language": "spanish", @@ -65,7 +97,13 @@ "qid": "Q1321", "remove-words": ["of", "the", "The", "and"], "ignore-words": [], - "word-types": ["nouns", "verbs", "translations", "emoji_keywords", "prepositions", "autosuggestions"] + "word-types": [ + "nouns", + "verbs", + "translations", + "emoji_keywords", + "autosuggestions" + ] }, { "language": "swedish", @@ -73,7 +111,13 @@ "qid": "Q9027", "remove-words": ["of", "the", "The", "and", "Checklist", "Catalogue"], "ignore-words": ["databasdump"], - "word-types": ["nouns", "verbs", "translations", "emoji_keywords", "prepositions", "autosuggestions"] + "word-types": [ + "nouns", + "verbs", + "translations", + "emoji_keywords", + "autosuggestions" + ] } ] } diff --git a/src/scribe_data/wikipedia/extract_wiki.py b/src/scribe_data/wikipedia/extract_wiki.py index ed97210c9..7c7581650 100644 --- a/src/scribe_data/wikipedia/extract_wiki.py +++ b/src/scribe_data/wikipedia/extract_wiki.py @@ -182,7 +182,7 @@ def iterate_and_parse_file(args): partitions_dir : str The path to where output file should be stored. - article_limit : int optional (default=None) + article_limit : int (default=None) An optional article_limit of the number of articles to find. 
    verbose : bool (default=True)

From 10511aea070cecdb665bc196237b20916975e298 Mon Sep 17 00:00:00 2001
From: Andrew Tavis McAllister
Date: Sat, 22 Jun 2024 20:16:56 +0200
Subject: [PATCH 15/18] Switch over word type correction + file rename

---
 src/scribe_data/cli/cli_utils.py               | 18 +++++++++++-------
 src/scribe_data/cli/list.py                    |  2 +-
 ...e_meta_data.json => language_metadata.json} |  0
 .../resources/word_type_metadata.json          | 10 ++++++++++
 src/scribe_data/utils.py                       |  2 +-
 5 files changed, 23 insertions(+), 9 deletions(-)
 rename src/scribe_data/resources/{language_meta_data.json => language_metadata.json} (100%)
 create mode 100644 src/scribe_data/resources/word_type_metadata.json

diff --git a/src/scribe_data/cli/cli_utils.py b/src/scribe_data/cli/cli_utils.py
index 18812015b..7d3578aeb 100644
--- a/src/scribe_data/cli/cli_utils.py
+++ b/src/scribe_data/cli/cli_utils.py
@@ -24,12 +24,20 @@
 from pathlib import Path
 from typing import Dict, List, Union
 
-METADATA_FILE = Path(__file__).parent.parent / "resources" / "language_meta_data.json"
+LANGUAGE_METADATA_FILE = (
+    Path(__file__).parent.parent / "resources" / "language_metadata.json"
+)
+WORD_TYPE_METADATA_FILE = (
+    Path(__file__).parent.parent / "resources" / "word_type_metadata.json"
+)
 DATA_DIR = Path("scribe_data_json_export")
 
-with METADATA_FILE.open("r", encoding="utf-8") as file:
+with LANGUAGE_METADATA_FILE.open("r", encoding="utf-8") as file:
     language_metadata = json.load(file)
 
+with WORD_TYPE_METADATA_FILE.open("r", encoding="utf-8") as file:
+    word_type_metadata = json.load(file)
+
 language_map = {
     lang["language"].lower(): lang for lang in language_metadata["languages"]
 }
@@ -48,11 +56,7 @@ def correct_word_type(word_type: str) -> str:
     -------
     The word_type value or a corrected version of it.
     """
-    all_word_types = set()
-    for language in language_metadata["languages"]:
-        all_word_types.update(language["word-types"])
-
-    all_word_types = list(all_word_types)
+    all_word_types = word_type_metadata["word-types"]
 
     if word_type in all_word_types:
         return word_type
diff --git a/src/scribe_data/cli/list.py b/src/scribe_data/cli/list.py
index 8f78a84db..f79230ecd 100644
--- a/src/scribe_data/cli/list.py
+++ b/src/scribe_data/cli/list.py
@@ -26,7 +26,7 @@
 from .cli_utils import correct_word_type
 
 # Load language metadata from JSON file.
-METADATA_FILE = Path(__file__).parent.parent / "resources" / "language_meta_data.json"
+METADATA_FILE = Path(__file__).parent.parent / "resources" / "language_metadata.json"
 LANGUAGE_DATA_EXTRACTION_DIR = Path(__file__).parent.parent / "language_data_extraction"
 
 with METADATA_FILE.open("r", encoding="utf-8") as file:
diff --git a/src/scribe_data/resources/language_meta_data.json b/src/scribe_data/resources/language_metadata.json
similarity index 100%
rename from src/scribe_data/resources/language_meta_data.json
rename to src/scribe_data/resources/language_metadata.json
diff --git a/src/scribe_data/resources/word_type_metadata.json b/src/scribe_data/resources/word_type_metadata.json
new file mode 100644
index 000000000..3479d94c6
--- /dev/null
+++ b/src/scribe_data/resources/word_type_metadata.json
@@ -0,0 +1,10 @@
+{
+  "word-types": [
+    "autosuggestions",
+    "emoji_keywords",
+    "nouns",
+    "prepositions",
+    "translations",
+    "verbs"
+  ]
+}
diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py
index 13a0da4c0..ca056f8cc 100644
--- a/src/scribe_data/utils.py
+++ b/src/scribe_data/utils.py
@@ -62,7 +62,7 @@ def _load_json(package_path: str, file_name: str, root: str):
 
 _languages = _load_json(
     package_path="scribe_data.resources",
-    file_name="language_meta_data.json",
+    file_name="language_metadata.json",
     root="languages",
 )

From 78ae17ef73973ce70a0a2aa7c77675a189b34182 Mon Sep 17 00:00:00 2001
From: Andrew Tavis McAllister
Date: Sat, 22 Jun 2024 20:19:48 +0200
Subject: [PATCH 16/18] Remove word-type keys from language metadata

---
 .../resources/language_metadata.json | 68 +++----------------
 1 file changed, 8 insertions(+), 60 deletions(-)

diff --git a/src/scribe_data/resources/language_metadata.json b/src/scribe_data/resources/language_metadata.json
index 7ef4faf60..794ef4009 100755
--- a/src/scribe_data/resources/language_metadata.json
+++ b/src/scribe_data/resources/language_metadata.json
@@ -16,108 +16,56 @@
       "iso": "en",
       "qid": "Q1860",
       "remove-words": ["of", "the", "The", "and"],
-      "ignore-words": [],
-      "word-types": ["nouns", "verbs", "translations"]
+      "ignore-words": []
     },
     {
       "language": "french",
       "iso": "fr",
       "qid": "Q150",
       "remove-words": ["of", "the", "The", "and"],
-      "ignore-words": ["XXe"],
-      "word-types": [
-        "nouns",
-        "verbs",
-        "translations",
-        "emoji_keywords",
-        "autosuggestions"
-      ]
+      "ignore-words": ["XXe"]
     },
     {
       "language": "german",
       "iso": "de",
       "qid": "Q188",
       "remove-words": ["of", "the", "The", "and", "NeinJa", "et", "redirect"],
-      "ignore-words": ["Gemeinde", "Familienname"],
-      "word-types": [
-        "nouns",
-        "verbs",
-        "translations",
-        "emoji_keywords",
-        "prepositions",
-        "autosuggestions"
-      ]
+      "ignore-words": ["Gemeinde", "Familienname"]
     },
     {
       "language": "italian",
       "iso": "it",
       "qid": "Q652",
       "remove-words": ["of", "the", "The", "and", "text", "from"],
-      "ignore-words": ["The", "ATP"],
-      "word-types": [
-        "nouns",
-        "verbs",
-        "translations",
-        "emoji_keywords",
-        "autosuggestions"
-      ]
+      "ignore-words": ["The", "ATP"]
     },
     {
       "language": "portuguese",
       "iso": "pt",
       "qid": "Q5146",
       "remove-words": ["of", "the", "The", "and", "jbutadptflora"],
-      "ignore-words": [],
-      "word-types": [
-        "nouns",
-        "verbs",
-        "translations",
-        "emoji_keywords",
-        "autosuggestions"
-      ]
+      "ignore-words": []
     },
     {
       "language": "russian",
       "iso": "ru",
       "qid": "Q7737",
       "remove-words": ["of", "the", "The", "and"],
-      "ignore-words": [],
-      "word-types": [
-        "nouns",
-        "verbs",
-        "translations",
-        "emoji_keywords",
-        "prepositions",
-        "autosuggestions"
-      ]
+      "ignore-words": []
     },
     {
       "language": "spanish",
"spanish", "iso": "es", "qid": "Q1321", "remove-words": ["of", "the", "The", "and"], - "ignore-words": [], - "word-types": [ - "nouns", - "verbs", - "translations", - "emoji_keywords", - "autosuggestions" - ] + "ignore-words": [] }, { "language": "swedish", "iso": "sv", "qid": "Q9027", "remove-words": ["of", "the", "The", "and", "Checklist", "Catalogue"], - "ignore-words": ["databasdump"], - "word-types": [ - "nouns", - "verbs", - "translations", - "emoji_keywords", - "autosuggestions" - ] + "ignore-words": ["databasdump"] } ] } From eb74dff058d0300eb39c4476d1e14df696a9b9b9 Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Sat, 22 Jun 2024 20:22:16 +0200 Subject: [PATCH 17/18] Remove word-type description from language metadata --- src/scribe_data/resources/language_metadata.json | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/scribe_data/resources/language_metadata.json b/src/scribe_data/resources/language_metadata.json index 794ef4009..e6d7de8a6 100755 --- a/src/scribe_data/resources/language_metadata.json +++ b/src/scribe_data/resources/language_metadata.json @@ -6,8 +6,7 @@ "iso": "the ISO 639 code for 'language'. See https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes. All lowercase", "qid": "the unique identifier of 'language' on Wikidata. 'Q' followed by one or more digits. See https://www.wikidata.org/wiki/Q43649390", "remove-words": "words that should not be included as autosuggestions for the given language.", - "ignore-words": "TODO. Case sensitive.", - "word-types": "A list of word types available for the given language." + "ignore-words": "words that should be removed from the autosuggestion generation process." } }, "languages": [ From a212390cacf4d03ff8c4ff788c6d13a0a98e58be Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Sat, 22 Jun 2024 20:24:10 +0200 Subject: [PATCH 18/18] File spacing and comment formatting --- src/scribe_data/cli/main.py | 3 +++ src/scribe_data/cli/query.py | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py index 32bd40a6d..c057c53de 100644 --- a/src/scribe_data/cli/main.py +++ b/src/scribe_data/cli/main.py @@ -47,6 +47,7 @@ def main() -> None: parser.add_argument("-u", "--update", help="Update the Scribe-Data CLI.") # MARK: List + list_parser = subparsers.add_parser( "list", aliases=["l"], @@ -71,6 +72,7 @@ def main() -> None: ) # MARK: Query + query_parser = subparsers.add_parser( "query", aliases=["q"], @@ -103,6 +105,7 @@ def main() -> None: ) # MARK: Convert + convert_parser = subparsers.add_parser( "convert", aliases=["c"], diff --git a/src/scribe_data/cli/query.py b/src/scribe_data/cli/query.py index b0c8334a0..f2b629905 100644 --- a/src/scribe_data/cli/query.py +++ b/src/scribe_data/cli/query.py @@ -88,7 +88,7 @@ def export_json( print(f"Error reading '{data_file}': {e}") return - # Adjust the output directory for JSON exports + # Adjust the output directory for JSON exports. json_output_dir = ( output_dir / "scribe_data_json_export" @@ -151,7 +151,7 @@ def export_csv_or_tsv( print(f"Unsupported output type '{output_type}'.") return - # Adjust the output directory for CSV exports + # Adjust the output directory for CSV exports. csv_output_dir = ( output_dir / "scribe_data_csv_export"