From 2b72e6408611bd4da2521c052049b1221a49a2db Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Mon, 25 Mar 2024 00:16:48 +0100 Subject: [PATCH] #75 Italian translation process and reorder directory structure --- CHANGELOG.md | 7 +- README.md | 10 +- .../English/translations/translate_words.py | 4 +- .../French/translations/translate_words.py | 4 +- .../German/translations/translate_words.py | 4 +- .../Italian/translations/translate_words.py | 43 +++++++ .../translations/translate_words.py | 4 +- .../Russian/translations/translate_words.py | 4 +- .../Spanish/translations/translate_words.py | 4 +- .../Swedish/translations/translate_words.py | 4 +- .../translation/translation_utils.py | 111 ++++++++++++++++++ .../update_words_to_translate.py | 2 +- .../{ => unicode}/emoji_utils.py | 0 .../unicode/gen_emoji_lexicon.ipynb | 4 +- .../{ => unicode}/process_unicode.py | 4 +- .../{ => wikidata}/query_profanity.sparql | 0 .../query_words_to_translate.sparql | 0 .../{ => wikidata}/update_data.py | 2 +- .../{ => wikipedia}/extract_wiki.py | 0 .../wikipedia/gen_autosuggestions.ipynb | 34 ++++-- .../{ => wikipedia}/process_wiki.py | 0 src/scribe_data/utils.py | 89 -------------- 22 files changed, 213 insertions(+), 121 deletions(-) create mode 100644 src/scribe_data/extract_transform/languages/Italian/translations/translate_words.py create mode 100644 src/scribe_data/extract_transform/translation/translation_utils.py rename src/scribe_data/extract_transform/{ => translation}/update_words_to_translate.py (96%) rename src/scribe_data/extract_transform/{ => unicode}/emoji_utils.py (100%) rename src/scribe_data/extract_transform/{ => unicode}/process_unicode.py (98%) rename src/scribe_data/extract_transform/{ => wikidata}/query_profanity.sparql (100%) rename src/scribe_data/extract_transform/{ => wikidata}/query_words_to_translate.sparql (100%) rename src/scribe_data/extract_transform/{ => wikidata}/update_data.py (99%) rename src/scribe_data/extract_transform/{ => wikipedia}/extract_wiki.py (100%) rename src/scribe_data/extract_transform/{ => wikipedia}/process_wiki.py (100%) diff --git a/CHANGELOG.md b/CHANGELOG.md index f2bce29c7..932bbd67e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,10 +12,7 @@ Emojis for the following are chosen based on [gitmoji](https://gitmoji.dev/). ## [Upcoming] Scribe-Data 3.3.0 - - - +- The translation process has been updated to allow for translations from non-English languages ([#72](https://github.com/scribe-org/Scribe-Data/issues/72), [#73](https://github.com/scribe-org/Scribe-Data/issues/73), [#74](https://github.com/scribe-org/Scribe-Data/issues/74), [#75](https://github.com/scribe-org/Scribe-Data/issues/75), [#76](https://github.com/scribe-org/Scribe-Data/issues/76), [#77](https://github.com/scribe-org/Scribe-Data/issues/77), [#78](https://github.com/scribe-org/Scribe-Data/issues/78), [#79](https://github.com/scribe-org/Scribe-Data/issues/79)). - The documentation has been given a new layout with the logo in the top left ([#90](https://github.com/scribe-org/Scribe-Data/issues/90)). - The documentation now has links to the code at the top of each page ([#91](https://github.com/scribe-org/Scribe-Data/issues/91)). @@ -25,6 +22,8 @@ Emojis for the following are chosen based on [gitmoji](https://gitmoji.dev/). - A Ruff based GitHub workflow was added to check the code formatting and lint the codebase on each pull request ([#109](https://github.com/scribe-org/Scribe-Data/issues/109)).
- The `_update_files` directory was renamed `update_files` as these files are used in non-internal manners now ([#57](https://github.com/scribe-org/Scribe-Data/issues/57)). - A common function has been created to map Wikidata ids to noun genders ([#69](https://github.com/scribe-org/Scribe-Data/issues/69)). +- Files in the `extract_transform` directory were moved based on whether they access Wikidata, Wikipedia or Unicode. + - Translation files were further moved to their own directory. ## Scribe-Data 3.2.2 diff --git a/README.md b/README.md index 8de609ff1..feda2101f 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ ## Wikidata and Wikipedia language data extraction -**Scribe-Data** contains the scripts for extracting and formatting data from [Wikidata](https://www.wikidata.org/) and [Wikipedia](https://www.wikipedia.org/) for Scribe applications. Updates to the language keyboard and interface data can be done using [scribe_data/load/update_data.py](https://github.com/scribe-org/Scribe-Data/tree/main/src/scribe_data/load/update_data.py) and the notebooks within the [scribe_data/load](https://github.com/scribe-org/Scribe-Data/tree/main/src/scribe_data/load) directory. +**Scribe-Data** contains the scripts for extracting and formatting data from [Wikidata](https://www.wikidata.org/) and [Wikipedia](https://www.wikipedia.org/) for Scribe applications. Updates to the language keyboard and interface data can be done using [scribe_data/extract_transform/wikidata/update_data.py](https://github.com/scribe-org/Scribe-Data/tree/main/src/scribe_data/extract_transform/wikidata/update_data.py) and the notebooks within the [scribe_data/load](https://github.com/scribe-org/Scribe-Data/tree/main/src/scribe_data/load) directory. > [!NOTE]\ > The [contributing](#contributing) section has information for those interested, with the articles and presentations in [featured by](#featured-by) also being good resources for learning more about Scribe. @@ -38,14 +38,14 @@ Check out Scribe's [architecture diagrams](https://github.com/scribe-org/Organiz # Process [`⇧`](#contents) -[scribe_data/extract_transform/update_data.py](https://github.com/scribe-org/Scribe-Data/blob/main/src/scribe_data/extract_transform/update_data.py) and the notebooks within the [scribe_data/extract_transform](https://github.com/scribe-org/Scribe-Data/tree/main/src/scribe_data/extract_transform) directory are used to update all data for [Scribe-iOS](https://github.com/scribe-org/Scribe-iOS), with this functionality later being expanded to update [Scribe-Android](https://github.com/scribe-org/Scribe-Android) and [Scribe-Desktop](https://github.com/scribe-org/Scribe-Desktop) when they're active. +[scribe_data/extract_transform/wikidata/update_data.py](https://github.com/scribe-org/Scribe-Data/blob/main/src/scribe_data/extract_transform/wikidata/update_data.py) and the notebooks within the [scribe_data/extract_transform](https://github.com/scribe-org/Scribe-Data/tree/main/src/scribe_data/extract_transform) directory are used to update all data for [Scribe-iOS](https://github.com/scribe-org/Scribe-iOS), with this functionality later being expanded to update [Scribe-Android](https://github.com/scribe-org/Scribe-Android) and [Scribe-Desktop](https://github.com/scribe-org/Scribe-Desktop) when they're active.
-The main data update process in [update_data.py](https://github.com/scribe-org/Scribe-Data/blob/main/src/scribe_data/extract_transform/update_data.py) triggers [SPARQL queries](https://github.com/scribe-org/Scribe-Data/tree/main/src/scribe_data/extract_transform/languages) to query language data from [Wikidata](https://www.wikidata.org/) using [SPARQLWrapper](https://github.com/RDFLib/sparqlwrapper) as a URI. The autosuggestion process derives popular words from [Wikipedia](https://www.wikipedia.org/) as well as those words that normally follow them for an effective baseline feature until natural language processing methods are employed. Functions to generate autosuggestions are ran in [gen_autosuggestions.ipynb](https://github.com/scribe-org/Scribe-Data/blob/main/src/scribe_data/extract_transform/wikipedia/gen_autosuggestions.ipynb). Emojis are further sourced from [Unicode CLDR](https://github.com/unicode-org/cldr), with this process being ran in [gen_emoji_lexicon.ipynb](https://github.com/scribe-org/Scribe-Data/blob/main/src/scribe_data/extract_transform/unicode/gen_emoji_lexicon.ipynb). +The main data update process in [update_data.py](https://github.com/scribe-org/Scribe-Data/blob/main/src/scribe_data/extract_transform/wikidata/update_data.py) triggers [SPARQL queries](https://github.com/scribe-org/Scribe-Data/tree/main/src/scribe_data/extract_transform/languages) to query language data from [Wikidata](https://www.wikidata.org/) using [SPARQLWrapper](https://github.com/RDFLib/sparqlwrapper) as a URI. The autosuggestion process derives popular words from [Wikipedia](https://www.wikipedia.org/) as well as those words that normally follow them for an effective baseline feature until natural language processing methods are employed. Functions to generate autosuggestions are run in [gen_autosuggestions.ipynb](https://github.com/scribe-org/Scribe-Data/blob/main/src/scribe_data/extract_transform/wikipedia/gen_autosuggestions.ipynb). Emojis are further sourced from [Unicode CLDR](https://github.com/unicode-org/cldr), with this process being run in [gen_emoji_lexicon.ipynb](https://github.com/scribe-org/Scribe-Data/blob/main/src/scribe_data/extract_transform/unicode/gen_emoji_lexicon.ipynb).

-Running [update_data.py](https://github.com/scribe-org/Scribe-Data/blob/main/src/scribe_data/extract_transform/update_data.py) is done via the following CLI command: +Running [update_data.py](https://github.com/scribe-org/Scribe-Data/blob/main/src/scribe_data/extract_transform/wikidata/update_data.py) is done via the following CLI command:

```bash
-python3 src/scribe_data/extract_transform/update_data.py
+python3 src/scribe_data/extract_transform/wikidata/update_data.py
```

The ultimate goal is that this repository will house language packs that are periodically updated with new [Wikidata](https://www.wikidata.org/) lexicographical data and data from other sources. These packs would then be available to download by users of Scribe applications.
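As the updated docstring of update_data.py later in this patch shows, the script also accepts optional arguments to limit an update to given languages and word types; a minimal sketch of such a targeted invocation, with the argument format taken from that docstring:

```bash
# Update only French and German nouns and verbs; the path reflects
# the new wikidata directory introduced by this patch.
python3 src/scribe_data/extract_transform/wikidata/update_data.py '["French", "German"]' '["nouns", "verbs"]'
```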
diff --git a/src/scribe_data/extract_transform/languages/English/translations/translate_words.py b/src/scribe_data/extract_transform/languages/English/translations/translate_words.py index fad6e9ec4..27b2ac1c4 100644 --- a/src/scribe_data/extract_transform/languages/English/translations/translate_words.py +++ b/src/scribe_data/extract_transform/languages/English/translations/translate_words.py @@ -14,7 +14,9 @@ PATH_TO_SCRIBE_DATA_SRC = f"{PATH_TO_SCRIBE_ORG}Scribe-Data/src" sys.path.insert(0, PATH_TO_SCRIBE_DATA_SRC) -from scribe_data.utils import translate_to_other_languages # noqa: E402 +from scribe_data.extract_transform.translation.translation_utils import ( # noqa: E402 + translate_to_other_languages, +) SRC_LANG = "English" translate_script_dir = os.path.dirname(os.path.abspath(__file__)) diff --git a/src/scribe_data/extract_transform/languages/French/translations/translate_words.py b/src/scribe_data/extract_transform/languages/French/translations/translate_words.py index a8177bdb7..48db29482 100644 --- a/src/scribe_data/extract_transform/languages/French/translations/translate_words.py +++ b/src/scribe_data/extract_transform/languages/French/translations/translate_words.py @@ -14,7 +14,9 @@ PATH_TO_SCRIBE_DATA_SRC = f"{PATH_TO_SCRIBE_ORG}Scribe-Data/src" sys.path.insert(0, PATH_TO_SCRIBE_DATA_SRC) -from scribe_data.utils import translate_to_other_languages # noqa: E402 +from scribe_data.extract_transform.translation.translation_utils import ( # noqa: E402 + translate_to_other_languages, +) SRC_LANG = "French" translate_script_dir = os.path.dirname(os.path.abspath(__file__)) diff --git a/src/scribe_data/extract_transform/languages/German/translations/translate_words.py b/src/scribe_data/extract_transform/languages/German/translations/translate_words.py index e838530e5..35fcebe36 100644 --- a/src/scribe_data/extract_transform/languages/German/translations/translate_words.py +++ b/src/scribe_data/extract_transform/languages/German/translations/translate_words.py @@ -14,7 +14,9 @@ PATH_TO_SCRIBE_DATA_SRC = f"{PATH_TO_SCRIBE_ORG}Scribe-Data/src" sys.path.insert(0, PATH_TO_SCRIBE_DATA_SRC) -from scribe_data.utils import translate_to_other_languages # noqa: E402 +from scribe_data.extract_transform.translation.translation_utils import ( # noqa: E402 + translate_to_other_languages, +) SRC_LANG = "German" translate_script_dir = os.path.dirname(os.path.abspath(__file__)) diff --git a/src/scribe_data/extract_transform/languages/Italian/translations/translate_words.py b/src/scribe_data/extract_transform/languages/Italian/translations/translate_words.py new file mode 100644 index 000000000..832db0263 --- /dev/null +++ b/src/scribe_data/extract_transform/languages/Italian/translations/translate_words.py @@ -0,0 +1,43 @@ +""" +Translates the Italian words queried from Wikidata to all other Scribe languages. 
+ +Example +------- + python3 src/scribe_data/extract_transform/languages/Italian/translations/translate_words.py +""" + +import json +import os +import sys + +PATH_TO_SCRIBE_ORG = os.path.dirname(sys.path[0]).split("Scribe-Data")[0] +PATH_TO_SCRIBE_DATA_SRC = f"{PATH_TO_SCRIBE_ORG}Scribe-Data/src" +sys.path.insert(0, PATH_TO_SCRIBE_DATA_SRC) + +from scribe_data.extract_transform.translation.translation_utils import ( # noqa: E402 + translate_to_other_languages, +) + +SRC_LANG = "Italian" +translate_script_dir = os.path.dirname(os.path.abspath(__file__)) +words_to_translate_path = os.path.join(translate_script_dir, "words_to_translate.json") + +with open(words_to_translate_path, "r", encoding="utf-8") as file: + json_data = json.load(file) + +word_list = [item["word"] for item in json_data] + +translations = {} +translated_words_path = os.path.join( + translate_script_dir, "../formatted_data/translated_words.json" +) +if os.path.exists(translated_words_path): + with open(translated_words_path, "r", encoding="utf-8") as file: + translations = json.load(file) + +translate_to_other_languages( + source_language=SRC_LANG, + word_list=word_list, + translations=translations, + batch_size=100, +) diff --git a/src/scribe_data/extract_transform/languages/Portuguese/translations/translate_words.py b/src/scribe_data/extract_transform/languages/Portuguese/translations/translate_words.py index 884083d8f..23256f718 100644 --- a/src/scribe_data/extract_transform/languages/Portuguese/translations/translate_words.py +++ b/src/scribe_data/extract_transform/languages/Portuguese/translations/translate_words.py @@ -14,7 +14,9 @@ PATH_TO_SCRIBE_DATA_SRC = f"{PATH_TO_SCRIBE_ORG}Scribe-Data/src" sys.path.insert(0, PATH_TO_SCRIBE_DATA_SRC) -from scribe_data.utils import translate_to_other_languages # noqa: E402 +from scribe_data.extract_transform.translation.translation_utils import ( # noqa: E402 + translate_to_other_languages, +) SRC_LANG = "Portuguese" translate_script_dir = os.path.dirname(os.path.abspath(__file__)) diff --git a/src/scribe_data/extract_transform/languages/Russian/translations/translate_words.py b/src/scribe_data/extract_transform/languages/Russian/translations/translate_words.py index a2b78e8f1..1de4f75c2 100644 --- a/src/scribe_data/extract_transform/languages/Russian/translations/translate_words.py +++ b/src/scribe_data/extract_transform/languages/Russian/translations/translate_words.py @@ -14,7 +14,9 @@ PATH_TO_SCRIBE_DATA_SRC = f"{PATH_TO_SCRIBE_ORG}Scribe-Data/src" sys.path.insert(0, PATH_TO_SCRIBE_DATA_SRC) -from scribe_data.utils import translate_to_other_languages # noqa: E402 +from scribe_data.extract_transform.translation.translation_utils import ( # noqa: E402 + translate_to_other_languages, +) SRC_LANG = "Russian" translate_script_dir = os.path.dirname(os.path.abspath(__file__)) diff --git a/src/scribe_data/extract_transform/languages/Spanish/translations/translate_words.py b/src/scribe_data/extract_transform/languages/Spanish/translations/translate_words.py index b5151b151..2d6c172f2 100644 --- a/src/scribe_data/extract_transform/languages/Spanish/translations/translate_words.py +++ b/src/scribe_data/extract_transform/languages/Spanish/translations/translate_words.py @@ -14,7 +14,9 @@ PATH_TO_SCRIBE_DATA_SRC = f"{PATH_TO_SCRIBE_ORG}Scribe-Data/src" sys.path.insert(0, PATH_TO_SCRIBE_DATA_SRC) -from scribe_data.utils import translate_to_other_languages # noqa: E402 +from scribe_data.extract_transform.translation.translation_utils import ( # noqa: E402 + translate_to_other_languages, 
+) SRC_LANG = "Spanish" translate_script_dir = os.path.dirname(os.path.abspath(__file__)) diff --git a/src/scribe_data/extract_transform/languages/Swedish/translations/translate_words.py b/src/scribe_data/extract_transform/languages/Swedish/translations/translate_words.py index f960b5650..f753bbedc 100644 --- a/src/scribe_data/extract_transform/languages/Swedish/translations/translate_words.py +++ b/src/scribe_data/extract_transform/languages/Swedish/translations/translate_words.py @@ -14,7 +14,9 @@ PATH_TO_SCRIBE_DATA_SRC = f"{PATH_TO_SCRIBE_ORG}Scribe-Data/src" sys.path.insert(0, PATH_TO_SCRIBE_DATA_SRC) -from scribe_data.utils import translate_to_other_languages # noqa: E402 +from scribe_data.extract_transform.translation.translation_utils import ( # noqa: E402 + translate_to_other_languages, +) SRC_LANG = "Swedish" translate_script_dir = os.path.dirname(os.path.abspath(__file__)) diff --git a/src/scribe_data/extract_transform/translation/translation_utils.py b/src/scribe_data/extract_transform/translation/translation_utils.py new file mode 100644 index 000000000..380474795 --- /dev/null +++ b/src/scribe_data/extract_transform/translation/translation_utils.py @@ -0,0 +1,111 @@ +""" +Utility functions for the machine translation process. + +Contents: + translation_interrupt_handler, + translate_to_other_languages +""" + +import json +import os +import signal +import sys + +from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer + +PATH_TO_SCRIBE_ORG = os.path.dirname(sys.path[0]).split("Scribe-Data")[0] +PATH_TO_SCRIBE_DATA_SRC = f"{PATH_TO_SCRIBE_ORG}Scribe-Data/src" +sys.path.insert(0, PATH_TO_SCRIBE_DATA_SRC) + +from scribe_data.utils import ( # noqa: E402 + get_language_dir_path, + get_language_iso, + get_target_langcodes, +) + + +def translation_interrupt_handler(source_language, translations): + """ + Handles interrupt signals and saves the current translation progress. + + Parameters + ---------- + source_language : str + The source language being translated from. + + translations : dict + The current dictionary of translations. + """ + print( + "\nThe interrupt signal has been caught and the current progress is being saved..." + ) + + with open( + f"{get_language_dir_path(source_language)}/formatted_data/translated_words.json", + "w", + encoding="utf-8", + ) as file: + json.dump(translations, file, ensure_ascii=False, indent=4) + + print("The current progress is saved to the translated_words.json file.") + exit() + + +def translate_to_other_languages(source_language, word_list, translations, batch_size): + """ + Translates a list of words from the source language to other target languages using batch processing. + + Parameters + ---------- + source_language : str + The source language being translated from. + + word_list : list[str] + The list of words to translate. + + translations : dict + The current dictionary of translations. + + batch_size : int + The number of words to translate in each batch.
+ """ + model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M") + tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M") + + signal.signal( + signal.SIGINT, + lambda sig, frame: translation_interrupt_handler(source_language, translations), + ) + + for i in range(0, len(word_list), batch_size): + batch_words = word_list[i : i + batch_size] + print(f"Translating batch {i//batch_size + 1}: {batch_words}") + + for lang_code in get_target_langcodes(source_language): + tokenizer.src_lang = get_language_iso(source_language) + encoded_words = tokenizer(batch_words, return_tensors="pt", padding=True) + generated_tokens = model.generate( + **encoded_words, forced_bos_token_id=tokenizer.get_lang_id(lang_code) + ) + translated_words = tokenizer.batch_decode( + generated_tokens, skip_special_tokens=True + ) + + for word, translation in zip(batch_words, translated_words): + if word not in translations: + translations[word] = {} + + translations[word][lang_code] = translation + + print(f"Batch {i//batch_size + 1} translation completed.") + + with open( + f"{get_language_dir_path(source_language)}/formatted_data/translated_words.json", + "w", + encoding="utf-8", + ) as file: + json.dump(translations, file, ensure_ascii=False, indent=4) + + print( + "Translation results for all words are saved to the translated_words.json file." + ) diff --git a/src/scribe_data/extract_transform/update_words_to_translate.py b/src/scribe_data/extract_transform/translation/update_words_to_translate.py similarity index 96% rename from src/scribe_data/extract_transform/update_words_to_translate.py rename to src/scribe_data/extract_transform/translation/update_words_to_translate.py index ab12a44d5..7979b5d23 100644 --- a/src/scribe_data/extract_transform/update_words_to_translate.py +++ b/src/scribe_data/extract_transform/translation/update_words_to_translate.py @@ -8,7 +8,7 @@ Example ------- - python update_words_to_translate.py '["French", "German"]' + python3 src/scribe_data/extract_transform/translation/update_words_to_translate.py '["French", "German"]' """ import json diff --git a/src/scribe_data/extract_transform/emoji_utils.py b/src/scribe_data/extract_transform/unicode/emoji_utils.py similarity index 100% rename from src/scribe_data/extract_transform/emoji_utils.py rename to src/scribe_data/extract_transform/unicode/emoji_utils.py diff --git a/src/scribe_data/extract_transform/unicode/gen_emoji_lexicon.ipynb b/src/scribe_data/extract_transform/unicode/gen_emoji_lexicon.ipynb index a1dcd577b..62d4b7a72 100644 --- a/src/scribe_data/extract_transform/unicode/gen_emoji_lexicon.ipynb +++ b/src/scribe_data/extract_transform/unicode/gen_emoji_lexicon.ipynb @@ -35,9 +35,7 @@ "source": [ "import os\n", "import sys\n", - "import json\n", "\n", - "from tqdm.auto import tqdm\n", "from IPython.display import display, HTML\n", "display(HTML(\"\"))" ] @@ -71,7 +69,7 @@ }, "outputs": [], "source": [ - "from scribe_data.extract_transform.process_unicode import gen_emoji_lexicon" + "from scribe_data.extract_transform.unicode.process_unicode import gen_emoji_lexicon" ] }, { diff --git a/src/scribe_data/extract_transform/process_unicode.py b/src/scribe_data/extract_transform/unicode/process_unicode.py similarity index 98% rename from src/scribe_data/extract_transform/process_unicode.py rename to src/scribe_data/extract_transform/unicode/process_unicode.py index 4297f56dc..8dc88f1f4 100644 --- a/src/scribe_data/extract_transform/process_unicode.py +++ 
b/src/scribe_data/extract_transform/unicode/process_unicode.py @@ -14,13 +14,13 @@ from icu import Char, UProperty from tqdm.auto import tqdm -from scribe_data.extract_transform.emoji_utils import get_emoji_codes_to_ignore +from scribe_data.extract_transform.unicode.emoji_utils import get_emoji_codes_to_ignore from scribe_data.utils import ( get_language_iso, get_path_from_et_dir, ) -from . import _resources +from .. import _resources emoji_codes_to_ignore = get_emoji_codes_to_ignore() diff --git a/src/scribe_data/extract_transform/query_profanity.sparql b/src/scribe_data/extract_transform/wikidata/query_profanity.sparql similarity index 100% rename from src/scribe_data/extract_transform/query_profanity.sparql rename to src/scribe_data/extract_transform/wikidata/query_profanity.sparql diff --git a/src/scribe_data/extract_transform/query_words_to_translate.sparql b/src/scribe_data/extract_transform/wikidata/query_words_to_translate.sparql similarity index 100% rename from src/scribe_data/extract_transform/query_words_to_translate.sparql rename to src/scribe_data/extract_transform/wikidata/query_words_to_translate.sparql diff --git a/src/scribe_data/extract_transform/update_data.py b/src/scribe_data/extract_transform/wikidata/update_data.py similarity index 99% rename from src/scribe_data/extract_transform/update_data.py rename to src/scribe_data/extract_transform/wikidata/update_data.py index 23d274459..0a9fc7152 100644 --- a/src/scribe_data/extract_transform/update_data.py +++ b/src/scribe_data/extract_transform/wikidata/update_data.py @@ -11,7 +11,7 @@ Example ------- - python update_data.py '["French", "German"]' '["nouns", "verbs"]' + python3 src/scribe_data/extract_transform/wikidata/update_data.py '["French", "German"]' '["nouns", "verbs"]' """ import itertools diff --git a/src/scribe_data/extract_transform/extract_wiki.py b/src/scribe_data/extract_transform/wikipedia/extract_wiki.py similarity index 100% rename from src/scribe_data/extract_transform/extract_wiki.py rename to src/scribe_data/extract_transform/wikipedia/extract_wiki.py diff --git a/src/scribe_data/extract_transform/wikipedia/gen_autosuggestions.ipynb b/src/scribe_data/extract_transform/wikipedia/gen_autosuggestions.ipynb index ed98a9d30..77cc413fb 100644 --- a/src/scribe_data/extract_transform/wikipedia/gen_autosuggestions.ipynb +++ b/src/scribe_data/extract_transform/wikipedia/gen_autosuggestions.ipynb @@ -21,6 +21,18 @@ "This notebook is used to run the functions found in Scribe-Data to extract, clean and load autosuggestion files into Scribe apps." 
] }, + { + "cell_type": "code", + "execution_count": null, + "id": "bec5ff38", + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "\n", + "warnings.filterwarnings(\"ignore\", message=r\"Passing\", category=FutureWarning)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -36,8 +48,6 @@ "import os\n", "import sys\n", "import json\n", - "import warnings\n", - "warnings.filterwarnings(\"ignore\", message=r\"Passing\", category=FutureWarning)\n", "\n", "from tqdm.auto import tqdm\n", "from IPython.core.display import display, HTML\n", @@ -73,8 +83,14 @@ }, "outputs": [], "source": [ - "from scribe_data.extract_transform.extract_wiki import download_wiki, parse_to_ndjson\n", - "from scribe_data.extract_transform.process_wiki import clean, gen_autosuggestions\n", + "from scribe_data.extract_transform.wikipedia.extract_wiki import (\n", + " download_wiki,\n", + " parse_to_ndjson,\n", + ")\n", + "from scribe_data.extract_transform.wikipedia.process_wiki import (\n", + " clean,\n", + " gen_autosuggestions,\n", + ")\n", "from scribe_data.utils import get_language_iso" ] }, @@ -116,9 +132,9 @@ "outputs": [], "source": [ "files = download_wiki(\n", - " language=language, \n", - " target_dir=f\"./{language_abbr}wiki_dump\", \n", - " file_limit=None, # None is all files \n", + " language=language,\n", + " target_dir=f\"./{language_abbr}wiki_dump\",\n", + " file_limit=None, # None is all files\n", " dump_id=\"20220920\"\n", ")\n", "print(f\"Number of files: {len(files)}\")" @@ -207,7 +223,7 @@ " texts=article_texts,\n", " language=language,\n", " remove_words=None,\n", - " sample_size=sample_size, \n", + " sample_size=sample_size,\n", " verbose=True,\n", ")" ] @@ -232,7 +248,7 @@ "outputs": [], "source": [ "autosuggest_dict = gen_autosuggestions(\n", - " text_corpus, \n", + " text_corpus,\n", " language=language,\n", " num_words=1000,\n", " ignore_words=None,\n", diff --git a/src/scribe_data/extract_transform/process_wiki.py b/src/scribe_data/extract_transform/wikipedia/process_wiki.py similarity index 100% rename from src/scribe_data/extract_transform/process_wiki.py rename to src/scribe_data/extract_transform/wikipedia/process_wiki.py diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index 303cb1afb..881d662fa 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -30,7 +30,6 @@ import ast import json import os -import signal import sys from importlib import resources from pathlib import Path @@ -38,7 +37,6 @@ import langcodes from langcodes import Language -from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer PROJECT_ROOT = "Scribe-Data" @@ -532,93 +530,6 @@ def get_target_langcodes(source_lang) -> list[str]: ] -def translation_interrupt_handler(source_language, translations): - """ - Handles interrupt signals and saves the current translation progress. - - Parameters - ---------- - source_language : str - The source language being translated from. - - translations : list[dict] - The current list of translations. - """ - print( - "\nThe interrupt signal has been caught and the current progress is being saved..." 
- ) - - with open( - f"{get_language_dir_path(source_language)}/formatted_data/translated_words.json", - "w", - encoding="utf-8", - ) as file: - json.dump(translations, file, ensure_ascii=False, indent=4) - - print("The current progress is saved to the translated_words.json file.") - exit() - - -def translate_to_other_languages(source_language, word_list, translations, batch_size): - """ - Translates a list of words from the source language to other target languages using batch processing. - - Parameters - ---------- - source_language : str - The source language being translated from. - - word_list : list[str] - The list of words to translate. - - translations : dict - The current dictionary of translations. - - batch_size : int - The number of words to translate in each batch. - """ - model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M") - tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M") - - signal.signal( - signal.SIGINT, - lambda sig, frame: translation_interrupt_handler(source_language, translations), - ) - - for i in range(0, len(word_list), batch_size): - batch_words = word_list[i : i + batch_size] - print(f"Translating batch {i//batch_size + 1}: {batch_words}") - - for lang_code in get_target_langcodes(source_language): - tokenizer.src_lang = get_language_iso(source_language) - encoded_words = tokenizer(batch_words, return_tensors="pt", padding=True) - generated_tokens = model.generate( - **encoded_words, forced_bos_token_id=tokenizer.get_lang_id(lang_code) - ) - translated_words = tokenizer.batch_decode( - generated_tokens, skip_special_tokens=True - ) - - for word, translation in zip(batch_words, translated_words): - if word not in translations: - translations[word] = {} - - translations[word][lang_code] = translation - - print(f"Batch {i//batch_size + 1} translation completed.") - - with open( - f"{get_language_dir_path(source_language)}/formatted_data/translated_words.json", - "w", - encoding="utf-8", - ) as file: - json.dump(translations, file, ensure_ascii=False, indent=4) - - print( - "Translation results for all words are saved to the translated_words.json file." - ) - - def map_genders(wikidata_gender): """ Maps those genders from Wikidata to succinct versions.