diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index bab97a1a8..17c07e1c1 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -7,6 +7,7 @@ Thank you for your pull request! 🚀 - [] This pull request is on a [separate branch](https://docs.github.com/en/get-started/quickstart/github-flow) and not the main branch +- [] I have tested my code with the `pytest` command as directed in the [testing section of the contributing guide](https://github.com/scribe-org/Scribe-Data/blob/main/CONTRIBUTING.md#testing) --- diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 376a954a7..2e44c618e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -15,6 +15,7 @@ If you have questions or would like to communicate with the team, please [join u - [First steps as a contributor](#first-steps) - [Learning the tech stack](#learning-the-tech) - [Development environment](#dev-env) +- [Testing](#testing) - [Issues and projects](#issues-projects) - [Bug reports](#bug-reports) - [Feature requests](#feature-requests) @@ -171,6 +172,16 @@ pip install -e . > [!NOTE] > Feel free to contact the team in the [Data room on Matrix](https://matrix.to/#/#ScribeData:matrix.org) if you're having problems getting your environment setup! + + +## Testing [`⇧`](#contents) + +In addition to the [pre-commit](https://pre-commit.com/) hooks that are set up during the [development environment section](#dev-env), Scribe-Data also includes a testing suite that should be run before all pull requests and subsequent commits. Please run the following in the project root: + +```bash +pytest +``` + ## Issues and projects [`⇧`](#contents) diff --git a/src/scribe_data/cli/cli_utils.py b/src/scribe_data/cli/cli_utils.py index 4f59a65ef..e39e1621d 100644 --- a/src/scribe_data/cli/cli_utils.py +++ b/src/scribe_data/cli/cli_utils.py @@ -27,6 +27,8 @@ from scribe_data.utils import DEFAULT_JSON_EXPORT_DIR +# MARK: CLI Variables + LANGUAGE_DATA_EXTRACTION_DIR = Path(__file__).parent.parent / "language_data_extraction" LANGUAGE_METADATA_FILE = ( @@ -53,14 +55,24 @@ print(f"Error reading data type metadata: {e}") -language_map = { - lang["language"].lower(): lang for lang in language_metadata["languages"] -} +language_map = {} +language_to_qid = {} + +# Process each language and its potential sub-languages in one pass. +for lang, lang_data in language_metadata.items(): + lang_lower = lang.lower() -# Create language_to_qid dictionary. -language_to_qid = { - lang["language"].lower(): lang["qid"] for lang in language_metadata["languages"] -} + # Handle sub-languages if they exist. + if "sub_languages" in lang_data: + for sub_lang, sub_lang_data in lang_data["sub_languages"].items(): + sub_lang_lower = sub_lang.lower() + language_map[sub_lang_lower] = sub_lang_data + language_to_qid[sub_lang_lower] = sub_lang_data["qid"] + + else: + # Handle the main language directly. 
+ language_map[lang_lower] = lang_data + language_to_qid[lang_lower] = lang_data["qid"] # MARK: Correct Inputs @@ -103,41 +115,37 @@ def print_formatted_data(data: Union[dict, list], data_type: str) -> None: if isinstance(data, dict): max_key_length = max((len(key) for key in data.keys()), default=0) - if data_type == "autosuggestions": - for key, value in data.items(): + for key, value in data.items(): + if data_type == "autosuggestions": print(f"{key:<{max_key_length}} : {', '.join(value)}") - elif data_type == "emoji_keywords": - for key, value in data.items(): + elif data_type == "emoji_keywords": emojis = [item["emoji"] for item in value] print(f"{key:<{max_key_length}} : {' '.join(emojis)}") - elif data_type in {"prepositions"}: - for key, value in data.items(): + elif data_type in {"prepositions"}: print(f"{key:<{max_key_length}} : {value}") - else: - for key, value in data.items(): - if isinstance(value, dict): - print(f"{key:<{max_key_length}} : ") - max_sub_key_length = max( - (len(sub_key) for sub_key in value.keys()), default=0 - ) - for sub_key, sub_value in value.items(): - print(f" {sub_key:<{max_sub_key_length}} : {sub_value}") - - elif isinstance(value, list): - print(f"{key:<{max_key_length}} : ") - for item in value: - if isinstance(item, dict): - for sub_key, sub_value in item.items(): - print(f" {sub_key:<{max_key_length}} : {sub_value}") - - else: - print(f" {item}") - - else: - print(f"{key:<{max_key_length}} : {value}") + elif isinstance(value, dict): + print(f"{key:<{max_key_length}} : ") + max_sub_key_length = max( + (len(sub_key) for sub_key in value.keys()), default=0 + ) + for sub_key, sub_value in value.items(): + print(f" {sub_key:<{max_sub_key_length}} : {sub_value}") + + elif isinstance(value, list): + print(f"{key:<{max_key_length}} : ") + for item in value: + if isinstance(item, dict): + for sub_key, sub_value in item.items(): + print(f" {sub_key:<{max_key_length}} : {sub_value}") + + else: + print(f" {item}") + + else: + print(f"{key:<{max_key_length}} : {value}") elif isinstance(data, list): for item in data: @@ -202,12 +210,12 @@ def validate_single_item(item, valid_options, item_type): ): closest_match = difflib.get_close_matches(item, valid_options, n=1) closest_match_str = ( - f" The closest matching {item_type} is {closest_match[0]}." + f" The closest matching {item_type} is '{closest_match[0]}'." 
if closest_match else "" ) - return f"Invalid {item_type} {item}.{closest_match_str}" + return f"Invalid {item_type} '{item}'.{closest_match_str}" return None diff --git a/src/scribe_data/cli/interactive.py b/src/scribe_data/cli/interactive.py index 4e95f34b0..6ba7a1f55 100644 --- a/src/scribe_data/cli/interactive.py +++ b/src/scribe_data/cli/interactive.py @@ -35,7 +35,7 @@ from scribe_data.cli.cli_utils import data_type_metadata, language_metadata from scribe_data.cli.get import get_data from scribe_data.cli.version import get_version_message -from scribe_data.utils import DEFAULT_JSON_EXPORT_DIR +from scribe_data.utils import DEFAULT_JSON_EXPORT_DIR, list_all_languages # MARK: Config Setup @@ -51,9 +51,7 @@ class ScribeDataConfig: def __init__(self): - self.languages = [ - lang["language"].capitalize() for lang in language_metadata["languages"] - ] + self.languages = list_all_languages(language_metadata) self.data_types = list(data_type_metadata.keys()) self.selected_languages: List[str] = [] self.selected_data_types: List[str] = [] diff --git a/src/scribe_data/cli/list.py b/src/scribe_data/cli/list.py index 5d16b4413..762d3bfca 100644 --- a/src/scribe_data/cli/list.py +++ b/src/scribe_data/cli/list.py @@ -21,10 +21,16 @@ """ from scribe_data.cli.cli_utils import ( + LANGUAGE_DATA_EXTRACTION_DIR, correct_data_type, - language_metadata, language_map, - LANGUAGE_DATA_EXTRACTION_DIR, + language_metadata, +) +from scribe_data.utils import ( + format_sublanguage_name, + get_language_iso, + get_language_qid, + list_all_languages, ) @@ -32,12 +38,11 @@ def list_languages() -> None: """ Generates a table of languages, their ISO-2 codes and their Wikidata QIDs. """ - languages = list(language_metadata["languages"]) - languages.sort(key=lambda x: x["language"]) + languages = list_all_languages(language_metadata) - language_col_width = max(len(lang["language"]) for lang in languages) + 2 - iso_col_width = max(len(lang["iso"]) for lang in languages) + 2 - qid_col_width = max(len(lang["qid"]) for lang in languages) + 2 + language_col_width = max(len(lang) for lang in languages) + 2 + iso_col_width = max(len(get_language_iso(lang)) for lang in languages) + 2 + qid_col_width = max(len(get_language_qid(lang)) for lang in languages) + 2 table_line_length = language_col_width + iso_col_width + qid_col_width @@ -49,7 +54,7 @@ def list_languages() -> None: for lang in languages: print( - f"{lang['language'].capitalize():<{language_col_width}} {lang['iso']:<{iso_col_width}} {lang['qid']:<{qid_col_width}}" + f"{lang.capitalize():<{language_col_width}} {get_language_iso(lang):<{iso_col_width}} {get_language_qid(lang):<{qid_col_width}}" ) print("-" * table_line_length) @@ -65,7 +70,9 @@ def list_data_types(language: str = None) -> None: language : str The language to potentially list data types for. 
""" + languages = list_all_languages(language_metadata) if language: + language = format_sublanguage_name(language, language_metadata) language_data = language_map.get(language.lower()) language_capitalized = language.capitalize() language_dir = LANGUAGE_DATA_EXTRACTION_DIR / language_capitalized @@ -83,8 +90,11 @@ def list_data_types(language: str = None) -> None: else: data_types = set() - for lang in language_metadata["languages"]: - language_dir = LANGUAGE_DATA_EXTRACTION_DIR / lang["language"].capitalize() + for lang in languages: + language_dir = ( + LANGUAGE_DATA_EXTRACTION_DIR + / format_sublanguage_name(lang, language_metadata).capitalize() + ) if language_dir.is_dir(): data_types.update(f.name for f in language_dir.iterdir() if f.is_dir()) @@ -122,13 +132,15 @@ def list_languages_for_data_type(data_type: str) -> None: The data type to check for. """ data_type = correct_data_type(data_type=data_type) + all_languages = list_all_languages(language_metadata) available_languages = [] - for lang in language_metadata["languages"]: - language_dir = LANGUAGE_DATA_EXTRACTION_DIR / lang["language"].capitalize() + for lang in all_languages: + lang = format_sublanguage_name(lang, language_metadata) + language_dir = LANGUAGE_DATA_EXTRACTION_DIR / lang if language_dir.is_dir(): dt_path = language_dir / data_type if dt_path.exists(): - available_languages.append(lang["language"]) + available_languages.append(lang) available_languages.sort() table_header = f"Available languages: {data_type}" @@ -141,7 +153,7 @@ def list_languages_for_data_type(data_type: str) -> None: print("-" * table_line_length) for lang in available_languages: - print(f"{lang.capitalize()}") + print(f"{lang}") print("-" * table_line_length) print() diff --git a/src/scribe_data/cli/total.py b/src/scribe_data/cli/total.py index fe1382707..885d9b3e9 100644 --- a/src/scribe_data/cli/total.py +++ b/src/scribe_data/cli/total.py @@ -29,6 +29,7 @@ language_metadata, language_to_qid, ) +from scribe_data.utils import format_sublanguage_name, list_all_languages from scribe_data.wikidata.wikidata_utils import sparql @@ -71,12 +72,13 @@ def get_datatype_list(language): data_types : list[str] or None A list of the corresponding data types. 
""" - languages = list(language_metadata["languages"]) - language_list = [lang["language"] for lang in languages] + languages = list_all_languages(language_metadata) - if language.lower() in language_list: + if language.lower() in languages: language_data = language_map.get(language.lower()) - language_capitalized = language.capitalize() + language_capitalized = format_sublanguage_name( + language, language_metadata + ).capitalize() language_dir = LANGUAGE_DATA_EXTRACTION_DIR / language_capitalized if not language_data: @@ -131,11 +133,9 @@ def print_total_lexemes(language: str = None): print("=" * 64) if language is None: # all languages - languages = list(language_metadata["languages"]) - languages.sort(key=lambda x: x["language"]) - language_list = [lang["language"] for lang in languages] + languages = list_all_languages(language_metadata) - for lang in language_list: + for lang in languages: data_types = get_datatype_list(lang) first_row = True diff --git a/src/scribe_data/load/data_to_sqlite.py b/src/scribe_data/load/data_to_sqlite.py index 79d19e39b..aec1f9560 100644 --- a/src/scribe_data/load/data_to_sqlite.py +++ b/src/scribe_data/load/data_to_sqlite.py @@ -35,6 +35,7 @@ DEFAULT_SQLITE_EXPORT_DIR, get_language_iso, ) +from scribe_data.utils import list_all_languages def data_to_sqlite( @@ -52,8 +53,7 @@ def data_to_sqlite( current_language_data = json.load(f_languages) data_types = json.load(f_data_types).keys() - current_languages = [d["language"] for d in current_language_data["languages"]] - + current_languages = list_all_languages(current_language_data) if not languages: languages = current_languages diff --git a/src/scribe_data/resources/language_metadata.json b/src/scribe_data/resources/language_metadata.json index e6d7de8a6..7ab2145bf 100755 --- a/src/scribe_data/resources/language_metadata.json +++ b/src/scribe_data/resources/language_metadata.json @@ -1,70 +1,182 @@ { - "used by": "Scribe-Data/src/scribe_data/utils.py", - "description": { - "entry": { - "language": "the supported language. All lowercase", - "iso": "the ISO 639 code for 'language'. See https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes. All lowercase", - "qid": "the unique identifier of 'language' on Wikidata. 'Q' followed by one or more digits. See https://www.wikidata.org/wiki/Q43649390", - "remove-words": "words that should not be included as autosuggestions for the given language.", - "ignore-words": "words that should be removed from the autosuggestion generation process." 
+ "arabic": { + "iso": "ar", + "qid": "Q13955" + }, + "basque": { + "iso": "eu", + "qid": "Q8752" + }, + "bengali": { + "iso": "bn", + "qid": "Q9610" + }, + "chinese": { + "sub_languages": { + "mandarin": { + "iso": "zh", + "qid": "Q727694" + } + } + }, + "czech": { + "iso": "cs", + "qid": "Q9056" + }, + "danish": { + "iso": "da", + "qid": "Q9035" + }, + "english": { + "iso": "en", + "qid": "Q1860" + }, + "esperanto": { + "iso": "eo", + "qid": "Q143" + }, + "estonian": { + "iso": "et", + "qid": "Q9072" + }, + "finnish": { + "iso": "fi", + "qid": "Q1412" + }, + "french": { + "iso": "fr", + "qid": "Q150" + }, + "german": { + "iso": "de", + "qid": "Q188" + }, + "greek": { + "iso": "el", + "qid": "Q36510" + }, + "hausa": { + "iso": "ha", + "qid": "Q56475" + }, + "hebrew": { + "iso": "he", + "qid": "Q9288" + }, + "hindustani": { + "sub_languages": { + "hindi": { + "iso": "hi", + "qid": "Q11051" + }, + "urdu": { + "iso": "ur", + "qid": "Q11051" + } + } + }, + "indonesian": { + "iso": "id", + "qid": "Q9240" + }, + "italian": { + "iso": "it", + "qid": "Q652" + }, + "japanese": { + "iso": "ja", + "qid": "Q5287" + }, + "kurmanji": { + "iso": "kmr", + "qid": "Q36163" + }, + "latin": { + "iso": "la", + "qid": "Q397" + }, + "malay": { + "iso": "ms", + "qid": "Q9237" + }, + "malayalam": { + "iso": "ml", + "qid": "Q36236" + }, + "norwegian": { + "sub_languages": { + "bokmål": { + "iso": "nb", + "qid": "Q25167" + }, + "nynorsk": { + "iso": "nn", + "qid": "Q25164" + } + } + }, + "pidgin": { + "sub_languages": { + "nigerian": { + "iso": "pi", + "qid": "Q33655" + } } }, - "languages": [ - { - "language": "english", - "iso": "en", - "qid": "Q1860", - "remove-words": ["of", "the", "The", "and"], - "ignore-words": [] - }, - { - "language": "french", - "iso": "fr", - "qid": "Q150", - "remove-words": ["of", "the", "The", "and"], - "ignore-words": ["XXe"] - }, - { - "language": "german", - "iso": "de", - "qid": "Q188", - "remove-words": ["of", "the", "The", "and", "NeinJa", "et", "redirect"], - "ignore-words": ["Gemeinde", "Familienname"] - }, - { - "language": "italian", - "iso": "it", - "qid": "Q652", - "remove-words": ["of", "the", "The", "and", "text", "from"], - "ignore-words": ["The", "ATP"] - }, - { - "language": "portuguese", - "iso": "pt", - "qid": "Q5146", - "remove-words": ["of", "the", "The", "and", "jbutadptflora"], - "ignore-words": [] - }, - { - "language": "russian", - "iso": "ru", - "qid": "Q7737", - "remove-words": ["of", "the", "The", "and"], - "ignore-words": [] - }, - { - "language": "spanish", - "iso": "es", - "qid": "Q1321", - "remove-words": ["of", "the", "The", "and"], - "ignore-words": [] - }, - { - "language": "swedish", - "iso": "sv", - "qid": "Q9027", - "remove-words": ["of", "the", "The", "and", "Checklist", "Catalogue"], - "ignore-words": ["databasdump"] + "polish": { + "iso": "pl", + "qid": "Q809" + }, + "portuguese": { + "iso": "pt", + "qid": "Q5146" + }, + "punjabi": { + "sub_languages": { + "gurmukhi": { + "iso": "pa", + "qid": "Q58635" + }, + "shahmukhi": { + "iso": "pnb", + "qid": "Q58635" + } } - ] + }, + "russian": { + "iso": "ru", + "qid": "Q7737" + }, + "slovak": { + "iso": "sk", + "qid": "Q9058" + }, + "spanish": { + "iso": "es", + "qid": "Q1321" + }, + "swahili": { + "iso": "sw", + "qid": "Q7838" + }, + "swedish": { + "iso": "sv", + "qid": "Q9027" + }, + "tajik": { + "iso": "tg", + "qid": "Q9260" + }, + "tamil": { + "iso": "ta", + "qid": "Q5885" + }, + "ukrainian": { + "iso": "ua", + "qid": "Q8798" + }, + "yoruba": { + "iso": "yo", + "qid": "Q34311" + } } diff --git 
a/src/scribe_data/utils.py b/src/scribe_data/utils.py index 9d94485ab..3c2007640 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -26,9 +26,6 @@ from pathlib import Path from typing import Any, Optional -from iso639 import Lang -from iso639.exceptions import DeprecatedLanguageValue, InvalidLanguageValue - PROJECT_ROOT = "Scribe-Data" DEFAULT_JSON_EXPORT_DIR = "scribe_data_json_export" DEFAULT_CSV_EXPORT_DIR = "scribe_data_csv_export" @@ -36,7 +33,7 @@ DEFAULT_SQLITE_EXPORT_DIR = "scribe_data_sqlite_export" -def _load_json(package_path: str, file_name: str, root: str) -> Any: +def _load_json(package_path: str, file_name: str) -> Any: """ Loads a JSON resource from a package into a python entity. @@ -48,52 +45,37 @@ file_name : str The name of the file (resource) that contains the JSON data. - root : str - The root node of the JSON document. - Returns ------- - A python entity starting at 'root'. + A Python entity representing the JSON content. """ - with resources.files(package_path).joinpath(file_name).open( encoding="utf-8" ) as in_stream: - contents = json.load(in_stream) - return contents[root] + return json.load(in_stream) _languages = _load_json( - package_path="scribe_data.resources", - file_name="language_metadata.json", - root="languages", + package_path="scribe_data.resources", file_name="language_metadata.json" ) def _find(source_key: str, source_value: str, target_key: str, error_msg: str) -> Any: """ - Each 'language', (english, german,..., etc) is a dictionary of key/value pairs: + Finds a target value based on a source key/value pair from the language metadata. - entry = { - "language": "english", - "iso": "en", - "qid": "Q1860", - "remove-words": [...], - "ignore-words": [...] - } - - Given a key/value pair, the 'source' and the 'target' key get the 'target' value. + Handles both regular languages and those with sub-languages (e.g., Norwegian). Parameters ---------- source_value : str - The source value to find equivalents for (e.g. 'english'). + The source value to find equivalents for (e.g., 'english', 'nynorsk'). source_key : str - The source key to reference (e.g. 'language'). + The source key to reference (e.g., 'language'). target_key : str - The key to target (e.g. 'iso'). + The key to target (e.g., 'qid'). error_msg : str The message displayed when a value cannot be found. @@ -104,28 +86,33 @@ def _find(source_key: str, source_value: str, target_key: str, error_msg: str) - Raises ------ - ValueError : when a source_value is not supported. - """ - norm_source_value = source_value.lower() - - if target_value := [ - entry[target_key] - for entry in _languages - if entry[source_key] == norm_source_value - ]: - assert len(target_value) == 1, f"More than one entry for '{norm_source_value}'" - return target_value[0] - + ValueError : when a source_value is not supported or the language only has sub-languages. + """ + # Check if we're searching by language name. + if source_key == "language": + norm_source_value = source_value.lower() + + # First, check the main language entries (e.g., mandarin, french, etc.). + for language, entry in _languages.items(): + # If the language name matches the top-level key, return the target value. + if language.lower() == norm_source_value: + if "sub_languages" in entry: + sub_languages = ", ".join(entry["sub_languages"].keys()) + raise ValueError( + f"'{language}' has sub-languages, but is not queryable directly. 
Available sub-languages: {sub_languages}" + ) + return entry.get(target_key) + + # If there are sub-languages, check them too. + if "sub_languages" in entry: + for sub_language, sub_entry in entry["sub_languages"].items(): + if sub_language.lower() == norm_source_value: + return sub_entry.get(target_key) + + # If no match was found, raise an error. raise ValueError(error_msg) -def get_scribe_languages() -> list[str]: - """ - Returns the list of currently implemented Scribe languages. - """ - return sorted(entry["language"].capitalize() for entry in _languages) - - def get_language_qid(language: str) -> str: """ Returns the QID of the given language. @@ -162,13 +149,13 @@ def get_language_iso(language: str) -> str: str The ISO code for the language. """ - try: - iso_code = str(Lang(language.capitalize()).pt1) - except InvalidLanguageValue: - raise ValueError( - f"{language.capitalize()} is currently not a supported language for ISO conversion." - ) from None - return iso_code + + return _find( + "language", + language, + "iso", + f"{language.upper()} is currently not a supported language for ISO conversion.", + ) def get_language_from_iso(iso: str) -> str: @@ -185,57 +172,20 @@ def get_language_from_iso(iso: str) -> str: str The name for the language which has an ISO value of iso. """ - try: - language_name = str(Lang(iso.lower()).name) - except DeprecatedLanguageValue as e: - raise ValueError( - f"{iso.upper()} is currently not a supported ISO language." - ) from e - return language_name - - -def get_language_words_to_remove(language: str) -> list[str]: - """ - Returns the words that should be removed during the data cleaning process for the given language. - - Parameters - ---------- - language : str - The language the words should be returned for. - - Returns - ------- - list[str] - The words that that be removed during the data cleaning process for the given language. - """ - return _find( - "language", - language, - "remove-words", - f"{language.capitalize()} is currently not a supported language.", - ) - - -def get_language_words_to_ignore(language: str) -> list[str]: - """ - Returns the words that should not be included as autosuggestions for the given language. + # Iterate over the languages and their properties. + for language, properties in _languages.items(): + # Check if the current language's ISO matches the provided ISO. + if properties.get("iso") == iso: + return language.capitalize() - Parameters - ---------- - language : str - The language the words should be returned for. + # If there are sub-languages, check those as well. + if "sub_languages" in properties: + for sub_lang, sub_properties in properties["sub_languages"].items(): + if sub_properties.get("iso") == iso: + return sub_lang.capitalize() - Returns - ------- - list[str] - The words that should not be included as autosuggestions for the given language. - """ - return _find( - "language", - language, - "ignore-words", - f"{language.capitalize()} is currently not a supported language.", - ) + # If no match is found, raise a ValueError. + raise ValueError(f"{iso.upper()} is currently not a supported ISO language.") def load_queried_data( @@ -459,20 +409,25 @@ def map_genders(wikidata_gender: str) -> str: ---------- wikidata_gender : str The gender of the noun that was queried from WikiData. + + Returns + ------- + The standardized gender value, whether a label or a Wikidata QID was queried. 
""" gender_map = { - "masculine": "M", - "Q499327": "M", - "feminine": "F", - "Q1775415": "F", - "common gender": "C", - "Q1305037": "C", - "neuter": "N", - "Q1775461": "N", + "masculine": "masculine", + "Q499327": "masculine", + "feminine": "feminine", + "Q1775415": "feminine", + "common": "common", + "common gender": "common", + "Q1305037": "common", + "neuter": "neuter", + "Q1775461": "neuter", } return gender_map.get( - wikidata_gender, "" + wikidata_gender.lower(), "" ) # nouns could have a gender that is not a valid attribute @@ -484,20 +439,24 @@ def map_cases(wikidata_case: str) -> str: ---------- wikidata_case : str The case of the noun that was queried from WikiData. + + Returns + ------- + The case value corrected in case the Wikidata ID was queried. """ case_map = { - "accusative": "Acc", - "Q146078": "Acc", - "dative": "Dat", - "Q145599": "Dat", - "genitive": "Gen", - "Q146233": "Gen", - "instrumental": "Ins", - "Q192997": "Ins", - "prepositional": "Pre", - "Q2114906": "Pre", - "locative": "Loc", - "Q202142": "Loc", + "accusative": "accusative", + "Q146078": "accusative", + "dative": "dative", + "Q145599": "dative", + "genitive": "genitive", + "Q146233": "genitive", + "instrumental": "instrumental", + "Q192997": "instrumental", + "prepositional": "prepositional", + "Q2114906": "prepositional", + "locative": "locative", + "Q202142": "locative", } case = wikidata_case.split(" case")[0] return case_map.get(case, "") @@ -519,3 +478,71 @@ def order_annotations(annotation: str) -> str: annotation_split = sorted(list(set(filter(None, annotation.split("/"))))) return "/".join(annotation_split) + + +def format_sublanguage_name(lang, language_metadata=_languages): + """ + Formats the name of a sub-language by appending its main language + in the format 'MAIN_LANG/SUB_LANG'. If the language is not a sub-language, + the original language name is returned as-is. + + Parameters + ---------- + lang : str + The name of the language or sub-language to format. + + language_metadata : dict + The metadata containing information about main languages and their sub-languages. + + Returns + ------- + str + The formatted language name if it's a sub-language (e.g., 'Norwegian/Nynorsk'). + Otherwise the original name. + + Raises + ------ + ValueError: If the provided language or sub-language is not found. + + Example + ------- + > format_sublanguage_name("nynorsk", language_metadata) + 'Norwegian/Nynorsk' + + > format_sublanguage_name("english", language_metadata) + 'English' + """ + for main_lang, lang_data in language_metadata.items(): + # If it's not a sub-language, return the original name. + if main_lang == lang.lower(): + return lang.capitalize() + + # Check if the main language has sub-languages. + if "sub_languages" in lang_data: + # Check if the provided language is a sub-language. + for sub_lang in lang_data["sub_languages"]: + if lang.lower() == sub_lang.lower(): + # Return the formatted name MAIN_LANG/SUB_LANG. + return f"{main_lang.capitalize()}/{sub_lang.capitalize()}" + + # Raise ValueError if no match is found. + raise ValueError(f"{lang.upper()} is not a valid language or sub-language.") + + +def list_all_languages(language_metadata=_languages): + """ + Returns a sorted list of all languages from the provided metadata dictionary, including sub-languages. + """ + current_languages = [] + + # Iterate through the language metadata. + for lang_key, lang_data in language_metadata.items(): + # Check if there are sub-languages. 
+ if "sub_languages" in lang_data: + # Add the sub-languages to current_languages. + current_languages.extend(lang_data["sub_languages"].keys()) + else: + # If no sub-languages, add the main language. + current_languages.append(lang_key) + + return sorted(current_languages) diff --git a/src/scribe_data/wikidata/query_data.py b/src/scribe_data/wikidata/query_data.py index 4da51b4f6..a9dba0b9f 100644 --- a/src/scribe_data/wikidata/query_data.py +++ b/src/scribe_data/wikidata/query_data.py @@ -33,6 +33,7 @@ from scribe_data.cli.cli_utils import ( language_metadata, ) +from scribe_data.utils import format_sublanguage_name, list_all_languages from scribe_data.wikidata.wikidata_utils import sparql @@ -103,7 +104,7 @@ def query_data( SCRIBE_DATA_SRC_PATH / "language_data_extraction" ) languages = [lang.capitalize() for lang in languages] - current_languages = list(language_metadata["languages"]) + current_languages = list_all_languages(language_metadata) current_data_type = ["nouns", "verbs", "prepositions"] # Assign current_languages and current_data_type if no arguments have been passed. @@ -147,7 +148,7 @@ def query_data( disable=interactive, colour="MAGENTA", ): - lang = q.parent.parent.name + lang = format_sublanguage_name(q.parent.parent.name, language_metadata) target_type = q.parent.name updated_path = output_dir[2:] if output_dir.startswith("./") else output_dir diff --git a/tests/cli/test_list.py b/tests/cli/test_list.py index 03172e077..6fb4bf791 100644 --- a/tests/cli/test_list.py +++ b/tests/cli/test_list.py @@ -39,17 +39,49 @@ def test_list_languages(self, mock_print): list_languages() expected_calls = [ call(), - call("Language ISO QID "), - call("-----------------------"), - call("English en Q1860 "), - call("French fr Q150 "), - call("German de Q188 "), - call("Italian it Q652 "), - call("Portuguese pt Q5146 "), - call("Russian ru Q7737 "), - call("Spanish es Q1321 "), - call("Swedish sv Q9027 "), - call("-----------------------"), + call("Language ISO QID "), + call("--------------------------"), + call("Arabic ar Q13955 "), + call("Basque eu Q8752 "), + call("Bengali bn Q9610 "), + call("Bokmål nb Q25167 "), + call("Czech cs Q9056 "), + call("Danish da Q9035 "), + call("English en Q1860 "), + call("Esperanto eo Q143 "), + call("Estonian et Q9072 "), + call("Finnish fi Q1412 "), + call("French fr Q150 "), + call("German de Q188 "), + call("Greek el Q36510 "), + call("Gurmukhi pa Q58635 "), + call("Hausa ha Q56475 "), + call("Hebrew he Q9288 "), + call("Hindi hi Q11051 "), + call("Indonesian id Q9240 "), + call("Italian it Q652 "), + call("Japanese ja Q5287 "), + call("Kurmanji kmr Q36163 "), + call("Latin la Q397 "), + call("Malay ms Q9237 "), + call("Malayalam ml Q36236 "), + call("Mandarin zh Q727694 "), + call("Nigerian pi Q33655 "), + call("Nynorsk nn Q25164 "), + call("Polish pl Q809 "), + call("Portuguese pt Q5146 "), + call("Russian ru Q7737 "), + call("Shahmukhi pnb Q58635 "), + call("Slovak sk Q9058 "), + call("Spanish es Q1321 "), + call("Swahili sw Q7838 "), + call("Swedish sv Q9027 "), + call("Tajik tg Q9260 "), + call("Tamil ta Q5885 "), + call("Ukrainian ua Q8798 "), + call("Urdu ur Q11051 "), + call("Yoruba yo Q34311 "), + call("--------------------------"), call(), ] mock_print.assert_has_calls(expected_calls) @@ -66,6 +98,8 @@ def test_list_data_types_all_languages(self, mock_print): call("adverbs"), call("emoji-keywords"), call("nouns"), + call("personal-pronouns"), + call("postpositions"), call("prepositions"), call("proper-nouns"), call("verbs"), @@ -149,14 
+183,46 @@ def test_list_languages_for_data_type_valid(self, mock_print): call(), call("Available languages: nouns"), call("--------------------------"), + call("Arabic"), + call("Basque"), + call("Bengali"), + call("Chinese/Mandarin"), + call("Czech"), + call("Danish"), call("English"), + call("Esperanto"), + call("Estonian"), + call("Finnish"), call("French"), call("German"), + call("Greek"), + call("Hausa"), + call("Hebrew"), + call("Hindustani/Hindi"), + call("Hindustani/Urdu"), + call("Indonesian"), call("Italian"), + call("Japanese"), + call("Kurmanji"), + call("Latin"), + call("Malay"), + call("Malayalam"), + call("Norwegian/Bokmål"), + call("Norwegian/Nynorsk"), + call("Pidgin/Nigerian"), + call("Polish"), call("Portuguese"), + call("Punjabi/Gurmukhi"), + call("Punjabi/Shahmukhi"), call("Russian"), + call("Slovak"), call("Spanish"), + call("Swahili"), call("Swedish"), + call("Tajik"), + call("Tamil"), + call("Ukrainian"), + call("Yoruba"), call("--------------------------"), call(), ] diff --git a/tests/cli/test_utils.py b/tests/cli/test_utils.py index a827666a2..333c3b7d7 100644 --- a/tests/cli/test_utils.py +++ b/tests/cli/test_utils.py @@ -187,7 +187,7 @@ def test_validate_language_and_data_type_invalid_language(self, mock_get_qid): language=language_qid, data_type=data_type_qid ) - self.assertEqual(str(context.exception), "Invalid language InvalidLanguage.") + self.assertEqual(str(context.exception), "Invalid language 'InvalidLanguage'.") @patch("scribe_data.cli.total.get_qid_by_input") def test_validate_language_and_data_type_invalid_data_type(self, mock_get_qid): @@ -201,7 +201,7 @@ def test_validate_language_and_data_type_invalid_data_type(self, mock_get_qid): language=language_qid, data_type=data_type_qid ) - self.assertEqual(str(context.exception), "Invalid data-type InvalidDataType.") + self.assertEqual(str(context.exception), "Invalid data-type 'InvalidDataType'.") @patch("scribe_data.cli.total.get_qid_by_input") def test_validate_language_and_data_type_both_invalid(self, mock_get_qid): @@ -217,7 +217,7 @@ def test_validate_language_and_data_type_both_invalid(self, mock_get_qid): self.assertEqual( str(context.exception), - "Invalid language InvalidLanguage.\nInvalid data-type InvalidDataType.", + "Invalid language 'InvalidLanguage'.\nInvalid data-type 'InvalidDataType'.", ) def test_validate_language_and_data_type_with_list(self): @@ -248,5 +248,5 @@ def test_validate_language_and_data_type_mixed_validity_in_lists(self): data_types = ["nouns", "InvalidDataType"] with self.assertRaises(ValueError) as context: validate_language_and_data_type(languages, data_types) - self.assertIn("Invalid language InvalidLanguage", str(context.exception)) - self.assertIn("Invalid data-type InvalidDataType", str(context.exception)) + self.assertIn("Invalid language 'InvalidLanguage'", str(context.exception)) + self.assertIn("Invalid data-type 'InvalidDataType'", str(context.exception)) diff --git a/tests/load/test_update_utils.py b/tests/load/test_update_utils.py index 638ee09dd..43eaa2038 100644 --- a/tests/load/test_update_utils.py +++ b/tests/load/test_update_utils.py @@ -21,7 +21,6 @@ """ import sys -import unittest from pathlib import Path import pytest @@ -31,25 +30,6 @@ from scribe_data import utils -def test_get_scribe_languages(): - test_case = unittest.TestCase() - - # test for content, not order - test_case.assertCountEqual( - utils.get_scribe_languages(), - [ - "English", - "French", - "German", - "Italian", - "Portuguese", - "Russian", - "Spanish", - "Swedish", - ], - ) - - 
@pytest.mark.parametrize( "language, qid_code", [ @@ -61,6 +41,7 @@ def test_get_scribe_languages(): ("russian", "Q7737"), ("spanish", "Q1321"), ("swedish", "Q9027"), + ("bokmål", "Q25167"), ], ) def test_get_language_qid_positive(language, qid_code): @@ -88,6 +69,7 @@ def test_get_language_qid_negative(): ("russian", "ru"), ("spanish", "es"), ("SwedisH", "sv"), + ("bokmål", "nb"), ], ) def test_get_language_iso_positive(language, iso_code): @@ -100,7 +82,7 @@ def test_get_language_iso_negative(): assert ( str(excp.value) - == "Gibberish is currently not a supported language for ISO conversion." + == "GIBBERISH is currently not a supported language for ISO conversion." ) @@ -115,6 +97,7 @@ def test_get_language_iso_negative(): ("ru", "Russian"), ("es", "Spanish"), ("sv", "Swedish"), + ("nb", "Bokmål"), ], ) def test_get_language_from_iso_positive(iso_code, language): @@ -129,89 +112,69 @@ def test_get_language_from_iso_negative(): @pytest.mark.parametrize( - "language, remove_words", - [ - ( - "english", - [ - "of", - "the", - "The", - "and", - ], - ), - ( - "french", - [ - "of", - "the", - "The", - "and", - ], - ), - ("german", ["of", "the", "The", "and", "NeinJa", "et", "redirect"]), - ("italian", ["of", "the", "The", "and", "text", "from"]), - ("portuguese", ["of", "the", "The", "and", "jbutadptflora"]), - ( - "russian", - [ - "of", - "the", - "The", - "and", - ], - ), - ("spanish", ["of", "the", "The", "and"]), - ("swedish", ["of", "the", "The", "and", "Checklist", "Catalogue"]), - ], -) -def test_get_language_words_to_remove(language, remove_words): - test_case = unittest.TestCase() - - # ignore order, only content matters - test_case.assertCountEqual( - utils.get_language_words_to_remove(language), remove_words - ) - - -def test_get_language_words_to_remove_negative(): - with pytest.raises(ValueError) as excp: - _ = utils.get_language_words_to_remove("python") - - assert str(excp.value) == "Python is currently not a supported language." - - -@pytest.mark.parametrize( - "language, ignore_words", + "lang, expected_output", [ - ( - "french", - [ - "XXe", - ], - ), - ("german", ["Gemeinde", "Familienname"]), - ("italian", ["The", "ATP"]), - ("portuguese", []), - ("russian", []), - ("spanish", []), - ("swedish", ["databasdump"]), + ("nynorsk", "Norwegian/Nynorsk"), + ("bokmål", "Norwegian/Bokmål"), + ("english", "English"), ], ) -def test_get_language_words_to_ignore(language, ignore_words): - test_case = unittest.TestCase() - - # ignore order, only content matters - test_case.assertCountEqual( - utils.get_language_words_to_ignore(language), ignore_words - ) +def test_format_sublanguage_name_positive(lang, expected_output): + assert utils.format_sublanguage_name(lang) == expected_output -def test_get_language_words_to_ignore_negative(): +def test_format_sublanguage_name_negative(): with pytest.raises(ValueError) as excp: - _ = utils.get_language_words_to_ignore("JAVA") - - assert str(excp.value) == "Java is currently not a supported language." + _ = utils.format_sublanguage_name("soccer") + + assert str(excp.value) == "SOCCER is not a valid language or sub-language." 
+ + +def test_list_all_languages(): + expected_languages = [ + "arabic", + "basque", + "bengali", + "bokmål", + "czech", + "danish", + "english", + "esperanto", + "estonian", + "finnish", + "french", + "german", + "greek", + "gurmukhi", + "hausa", + "hebrew", + "hindi", + "indonesian", + "italian", + "japanese", + "kurmanji", + "latin", + "malay", + "malayalam", + "mandarin", + "nigerian", + "nynorsk", + "polish", + "portuguese", + "russian", + "shahmukhi", + "slovak", + "spanish", + "swahili", + "swedish", + "tajik", + "tamil", + "ukrainian", + "urdu", + "yoruba", + ] + + assert utils.list_all_languages() == expected_languages def test_get_ios_data_path():