From f7a38b571495a08a870db8278f13608abe7a4572 Mon Sep 17 00:00:00 2001
From: Michael Charlton <m.charlton@mac.com>
Date: Wed, 18 Oct 2023 11:27:37 +0100
Subject: [PATCH] refactor(utils.py): move language data to JSON file (resolves
 #52)

---
 src/scribe_data/resources/__init__.py         |   0
 .../resources/language_meta_data.json         | 128 ++++++++
 src/scribe_data/utils.py                      | 293 +++++++++---------
 tests/load/test_update_utils.py               |   1 -
 4 files changed, 282 insertions(+), 140 deletions(-)
 create mode 100644 src/scribe_data/resources/__init__.py
 create mode 100644 src/scribe_data/resources/language_meta_data.json

diff --git a/src/scribe_data/resources/__init__.py b/src/scribe_data/resources/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/scribe_data/resources/language_meta_data.json b/src/scribe_data/resources/language_meta_data.json
new file mode 100644
index 000000000..ec0891e20
--- /dev/null
+++ b/src/scribe_data/resources/language_meta_data.json
@@ -0,0 +1,128 @@
+{
+    "used by": "Scribe-Data/src/scribe_data/utils.py",
+    "description": {
+        "entry": {
+            "language": "the supported language. All lowercase",
+            "iso": "the ISO 639 code for 'language'. See https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes . All lowercase",
+            "qid": "the unique identifier of 'language' on Wikidata. 'Q' followed by one or more digits. See https://www.wikidata.org/wiki/Q43649390",
+            "remove-words": "words that should not be included as autosuggestions for the given language.",
+            "ignore-words": "TODO. Case sensitive."
+        }
+    },
+    "languages": [
+        {
+            "language": "english",
+            "iso": "en",
+            "qid": "Q1860",
+            "remove-words": [
+                "of",
+                "the",
+                "The",
+                "and"
+            ],
+            "ignore-words": []
+        },
+        {
+            "language": "french",
+            "iso": "fr",
+            "qid": "Q150",
+            "remove-words": [
+                "of",
+                "the",
+                "The",
+                "and"
+            ],
+            "ignore-words": [
+                "XXe"
+            ]
+        },
+        {
+            "language": "german",
+            "iso": "de",
+            "qid": "Q188",
+            "remove-words": [
+                "of",
+                "the",
+                "The",
+                "and",
+                "NeinJa",
+                "et",
+                "redirect"
+            ],
+            "ignore-words": [
+                "Gemeinde",
+                "Familienname"
+            ]
+        },
+        {
+            "language": "italian",
+            "iso": "it",
+            "qid": "Q652",
+            "remove-words": [
+                "of",
+                "the",
+                "The",
+                "and",
+                "text",
+                "from"
+            ],
+            "ignore-words": [
+                "The",
+                "ATP"
+            ]
+        },
+        {
+            "language": "portuguese",
+            "iso": "pt",
+            "qid": "Q5146",
+            "remove-words": [
+                "of",
+                "the",
+                "The",
+                "and",
+                "jbutadptflora"
+            ],
+            "ignore-words": []
+        },
+        {
+            "language": "russian",
+            "iso": "ru",
+            "qid": "Q7737",
+            "remove-words": [
+                "of",
+                "the",
+                "The",
+                "and"
+            ],
+            "ignore-words": []
+        },
+        {
+            "language": "spanish",
+            "iso": "es",
+            "qid": "Q1321",
+            "remove-words": [
+                "of",
+                "the",
+                "The",
+                "and"
+            ],
+            "ignore-words": []
+        },
+        {
+            "language": "swedish",
+            "iso": "sv",
+            "qid": "Q9027",
+            "remove-words": [
+                "of",
+                "the",
+                "The",
+                "and",
+                "Checklist",
+                "Catalogue"
+            ],
+            "ignore-words": [
+                "databasdump"
+            ]
+        }
+    ]
+}
diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py
index da7d48f35..ffa769491 100644
--- a/src/scribe_data/utils.py
+++ b/src/scribe_data/utils.py
@@ -22,23 +22,102 @@
 """
 
 import ast
+import json
+import sys
+from importlib import resources
+from pathlib import Path
 from typing import Any
 
+PROJECT_ROOT = "Scribe-Data"
+
+
+def _load_json(package_path: str, file_name: str, root: str) -> Any:
+    """Loads a JSON resource from a package into a python entity.
+
+    Parameters
+    ----------
+        package_path : str
+            The fully qualified package that contains the resource.
+
+        file_name : str
+            The name of the file (resource) that contains the JSON data.
+
+        root : str
+            The root node of the JSON document.
+
+    Returns
+    -------
+        A python entity starting at 'root'.
+    """
+    # Make the directory that holds the 'scribe_data' package importable so that
+    # resources.files() can find 'package_path'. It is derived from this file's
+    # location rather than from a hard-coded checkout directory name.
+    package_root = str(Path(__file__).resolve().parent.parent)
+
+    if package_root not in sys.path:
+        sys.path.insert(0, package_root)
+
+    with resources.files(package_path).joinpath(file_name).open(
+        encoding="utf-8"
+    ) as in_stream:
+        contents = json.load(in_stream)
+
+    return contents[root]
+
+
+# Entries for every supported language, read from the JSON resource that is
+# bundled with the package. Loaded once, when this module is first imported.
+_languages = _load_json(
+    "scribe_data.resources", "language_meta_data.json", root="languages"
+)
+
+
+def _find(source_key: str, source_value: str, target_key: str, error_msg: str):
+    """
+    Each 'language', (english, german,..., etc) is a dictionary of key/value pairs:
+
+        entry = {
+            "language": "english",
+            "iso": "en",
+            "qid": "Q1860",
+            "remove-words": [...],
+            "ignore-words": [...]
+        }
+
+    Given a key/value pair, the 'source', and the 'target' key, get the 'target' value.
+
+    Args:
+        source_key (str): e.g. 'language'.
+        source_value (str): e.g. 'english'. Lowercased before it is compared.
+        target_key (str): e.g. 'iso'.
+        error_msg (str): for when a value cannot be found.
+
+    Raises:
+        ValueError: when a source_value is not supported, or matches several entries.
+
+    Returns:
+        the 'target' value; list values are copied so callers cannot alter the data.
+    """
+    norm_source_value = source_value.lower()
+    matches = [e[target_key] for e in _languages if e[source_key] == norm_source_value]
+
+    if not matches:
+        raise ValueError(error_msg)
+
+    # A plain 'assert' is stripped under 'python -O', so raise explicitly.
+    if len(matches) > 1:
+        raise ValueError(f"More than one entry for '{norm_source_value}'")
+
+    target_value = matches[0]
+    # Hand out a copy of list values so callers cannot mutate the shared data.
+    return list(target_value) if isinstance(target_value, list) else target_value
+
 
 def get_scribe_languages() -> list[str]:
     """
     Returns the list of currently implemented Scribe languages.
     """
-    return [
-        "English",
-        "French",
-        "German",
-        "Italian",
-        "Portuguese",
-        "Russian",
-        "Spanish",
-        "Swedish",
-    ]
+    return sorted(entry["language"].capitalize() for entry in _languages)
 
 
 def get_language_qid(language: str) -> str:
@@ -52,27 +131,15 @@ def get_language_qid(language: str) -> str:
 
     Returns
     -------
-        The Wikidata QID for the language as a value of a dictionary.
+        str
+            The Wikidata QID for the language.
     """
-    language = language.lower()
-
-    language_qid_dict = {
-        "english": "Q1860",
-        "french": "Q150",
-        "german": "Q188",
-        "italian": "Q652",
-        "portuguese": "Q5146",
-        "russian": "Q7737",
-        "spanish": "Q1321",
-        "swedish": "Q9027",
-    }
-
-    if language not in language_qid_dict:
-        raise ValueError(
-            f"{language.upper()} is currently not a supported language for QID conversion."
-        )
-
-    return language_qid_dict[language]
+    return _find(
+        "language",
+        language,
+        "qid",
+        f"{language.upper()} is currently not a supported language for QID conversion.",
+    )
 
 
 def get_language_iso(language: str) -> str:
@@ -86,27 +153,15 @@ def get_language_iso(language: str) -> str:
 
     Returns
     -------
-        The ISO code for the language as a value of a dictionary.
+        str
+            The ISO code for the language.
     """
-    language = language.lower()
-
-    language_iso_dict = {
-        "english": "en",
-        "french": "fr",
-        "german": "de",
-        "italian": "it",
-        "portuguese": "pt",
-        "russian": "ru",
-        "spanish": "es",
-        "swedish": "sv",
-    }
-
-    if language not in language_iso_dict:
-        raise ValueError(
-            f"{language.capitalize()} is currently not a supported language for ISO conversion."
-        )
-
-    return language_iso_dict[language]
+    return _find(
+        "language",
+        language,
+        "iso",
+        f"{language.capitalize()} is currently not a supported language for ISO conversion.",
+    )
 
 
 def get_language_from_iso(iso: str) -> str:
@@ -120,30 +175,21 @@ def get_language_from_iso(iso: str) -> str:
 
     Returns
     -------
-        The name for the language as a value of a dictionary.
+        str
+            The name for the language which has an ISO value of iso.
     """
-    iso = iso.lower()
-
-    iso_language_dict = {
-        "en": "English",
-        "fr": "French",
-        "de": "German",
-        "it": "Italian",
-        "pt": "Portuguese",
-        "ru": "Russian",
-        "es": "Spanish",
-        "sv": "Swedish",
-    }
-
-    if iso not in iso_language_dict:
-        raise ValueError(f"{iso.upper()} is currently not a supported ISO language.")
-
-    return iso_language_dict[iso]
+    return _find(
+        "iso",
+        iso,
+        "language",
+        f"{iso.upper()} is currently not a supported ISO language.",
+    ).capitalize()
 
 
 def get_language_words_to_remove(language: str) -> list[str]:
     """
-    Returns the words that should not be included as autosuggestions for the given language.
+    Returns the words that should not be included as autosuggestions for the given
+    language.
 
     Parameters
     ----------
@@ -152,46 +198,22 @@ def get_language_words_to_remove(language: str) -> list[str]:
 
     Returns
     -------
-        The words that should not be included as autosuggestions for the given language as values of a dictionary.
+        list[str]
+            The words that should not be included as autosuggestions for the given
+            language.
     """
-    language = language.lower()
-    words_to_remove: dict[str, list[str]] = {
-        "english": [
-            "of",
-            "the",
-            "The",
-            "and",
-        ],
-        "french": [
-            "of",
-            "the",
-            "The",
-            "and",
-        ],
-        "german": ["of", "the", "The", "and", "NeinJa", "et", "redirect"],
-        "italian": ["of", "the", "The", "and", "text", "from"],
-        "portuguese": ["of", "the", "The", "and", "jbutadptflora"],
-        "russian": [
-            "of",
-            "the",
-            "The",
-            "and",
-        ],  # and all non-Cyrillic characters
-        "spanish": ["of", "the", "The", "and"],
-        "swedish": ["of", "the", "The", "and", "Checklist", "Catalogue"],
-    }
-
-    if language not in words_to_remove:
-        raise ValueError(
-            f"{language.capitalize()} is currently not a supported language."
-        )
-
-    return words_to_remove[language]
+    return _find(
+        "language",
+        language,
+        "remove-words",
+        f"{language.capitalize()} is currently not a supported language.",
+    )
 
 
 def get_language_words_to_ignore(language: str) -> list[str]:
     """
-    Returns the words that should not be included as autosuggestions for the given language.
+    Returns the words that should not be included as autosuggestions for the given
+    language.
 
     Parameters
     ----------
@@ -200,27 +222,16 @@ def get_language_words_to_ignore(language: str) -> list[str]:
 
     Returns
     -------
-        The words that should not be included as autosuggestions for the given language as values of a dictionary.
+        list[str]
+            The words that should not be included as autosuggestions for the given
+            language.
     """
-    language = language.lower()
-    words_to_ignore: dict[str, list[str]] = {
-        "french": [
-            "XXe",
-        ],
-        "german": ["Gemeinde", "Familienname"],
-        "italian": ["The", "ATP"],
-        "portuguese": [],
-        "russian": [],
-        "spanish": [],
-        "swedish": ["databasdump"],
-    }
-
-    if language not in words_to_ignore:
-        raise ValueError(
-            f"{language.capitalize()} is currently not a supported language."
-        )
-
-    return words_to_ignore[language]
+    return _find(
+        "language",
+        language,
+        "ignore-words",
+        f"{language.capitalize()} is currently not a supported language.",
+    )
 
 
 def get_path_from_format_file() -> str:
@@ -230,7 +241,7 @@ def get_path_from_format_file() -> str:
     return "../../../../../.."
 
 
-def get_path_from_load_dir() -> str:
+def get_path_from_load_dir() -> str:
     """
     Returns the directory path from the load directory to scribe-org.
     """
@@ -255,7 +266,8 @@ def get_ios_data_path(language: str) -> str:
 
     Returns
     -------
-        The path to the data json for the given language.
+        str
+            The path to the data JSON for the given language.
     """
     return f"/Scribe-iOS/Keyboards/LanguageKeyboards/{language}"
 
@@ -271,7 +283,8 @@ def get_android_data_path(language: str) -> str:
 
     Returns
     -------
-        The path to the data json for the given language.
+        str
+            The path to the data JSON for the given language.
     """
     return f"/Scribe-Android/app/src/main/LanguageKeyboards/{language}"
 
@@ -287,7 +300,8 @@ def get_desktop_data_path(language: str) -> str:
 
     Returns
     -------
-        The path to the data json for the given language.
+        str
+            The path to the data JSON for the given language.
     """
     return f"/Scribe-Desktop/scribe/language_guis/{language}"
 
@@ -349,7 +363,8 @@ def check_and_return_command_line_args(
     all_args, first_args_check=None, second_args_check=None
 ):
     """
-    Checks command line arguments passed to Scribe-Data files and returns them if correct.
+    Checks command line arguments passed to Scribe-Data files and returns them if
+    correct.
 
     Parameters
     ----------
@@ -365,7 +380,8 @@ def check_and_return_command_line_args(
     Returns
     -------
         first_args, second_args: list(str)
-            The subset of possible first and second arguments that have been verified as being valid.
+            The subset of possible first and second arguments that have been verified
+            as being valid.
     """
     if len(all_args) == 1:
         return None, None
@@ -378,7 +394,7 @@ def check_and_return_command_line_args(
 
         return first_args, None
 
-    elif len(all_args) == 3:
+    if len(all_args) == 3:
         arg_1 = all_args[1]
         arg_2 = all_args[2]
 
@@ -393,11 +409,10 @@ def check_and_return_command_line_args(
 
         return first_args, second_args
 
-    else:
-        raise ValueError(
-            f"""An invalid number of arguments were specified.
-            At this time only two sets of values can be passed.
-            Pass argument sets via the following:
-            python {all_args[0]} '["comma_separated_sets_in_quotes"]'
-            """
-        )
+    raise ValueError(
+        f"""An invalid number of arguments were specified.
+        At this time only two sets of values can be passed.
+        Pass argument sets via the following:
+        python {all_args[0]} '["comma_separated_sets_in_quotes"]'
+        """
+    )
diff --git a/tests/load/test_update_utils.py b/tests/load/test_update_utils.py
index b8722ac79..6aeaef415 100644
--- a/tests/load/test_update_utils.py
+++ b/tests/load/test_update_utils.py
@@ -196,7 +196,6 @@ def test_get_path_from_load_dir():
 
 
 def test_get_path_from_et_dir():
-    # TODO: file path is same as above. Is this correct?
     assert utils.get_path_from_et_dir() == "../../../.."