From 1fa59ea4330230a62f370b71f525d60e41fbe325 Mon Sep 17 00:00:00 2001
From: Linfye <3158203624@qq.com>
Date: Tue, 27 Feb 2024 14:53:36 +0800
Subject: [PATCH 1/6] English translations finished

---
 .../English/translations/__init__.py          | 49 +++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/src/scribe_data/extract_transform/languages/English/translations/__init__.py b/src/scribe_data/extract_transform/languages/English/translations/__init__.py
index e69de29bb..b040d1fbe 100644
--- a/src/scribe_data/extract_transform/languages/English/translations/__init__.py
+++ b/src/scribe_data/extract_transform/languages/English/translations/__init__.py
@@ -0,0 +1,49 @@
+from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
+import json
+import signal
+import os
+
+with open('words_to_translate.json', 'r', encoding='utf-8') as file:
+    json_data = json.load(file)
+
+word_list = []
+
+for item in json_data:
+    word_list.append(item["word"])
+
+#print(word_list[0])
+
+model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
+tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
+
+target_languages = ["fr", "de", "it", "pt", "ru", "es", "sv"]
+
+translations = []
+
+if os.path.exists('translations.json'):
+    with open('translations.json', 'r', encoding='utf-8') as file:
+        translations = json.load(file)
+
+def signal_handler(sig, frame):
+    print("\nThe interrupt signal has been caught and the current progress is being saved...")
+    with open('translations.json', 'w', encoding='utf-8') as file:
+        json.dump(translations, file, ensure_ascii=False, indent=4)
+    print("The current progress is saved to the translations.json file.")
+    exit()
+
+signal.signal(signal.SIGINT, signal_handler)
+
+for word in word_list[len(translations):]:
+    word_translations = {word: {}}
+    for lang_code in target_languages:
+        tokenizer.src_lang = "en"
+        encoded_word = tokenizer(word, return_tensors="pt")
+        generated_tokens = model.generate(**encoded_word, forced_bos_token_id=tokenizer.get_lang_id(lang_code))
+        translated_word = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
+        word_translations[word][lang_code] = translated_word
+    translations.append(word_translations)
+    with open('translations.json', 'w', encoding='utf-8') as file:
+        json.dump(translations, file, ensure_ascii=False, indent=4)
+    print(f"Translation results for the word '{word}' have been saved.")
+
+print("Translation results for all words are saved to the translations.json file.")
\ No newline at end of file

From 6e9d5e87080b17a3cbffacce4896d6bca53c9127 Mon Sep 17 00:00:00 2001
From: Linfye <3158203624@qq.com>
Date: Tue, 27 Feb 2024 14:56:06 +0800
Subject: [PATCH 2/6] change output directory

---
 .../languages/English/translations/__init__.py                  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/scribe_data/extract_transform/languages/English/translations/__init__.py b/src/scribe_data/extract_transform/languages/English/translations/__init__.py
index b040d1fbe..2c59a0285 100644
--- a/src/scribe_data/extract_transform/languages/English/translations/__init__.py
+++ b/src/scribe_data/extract_transform/languages/English/translations/__init__.py
@@ -42,7 +42,7 @@ def signal_handler(sig, frame):
         translated_word = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
         word_translations[word][lang_code] = translated_word
     translations.append(word_translations)
-    with open('translations.json', 'w', encoding='utf-8') as file:
+    with open('../formatted_data/translations.json', 'w', encoding='utf-8') as file:
         json.dump(translations, file, ensure_ascii=False, indent=4)
     print(f"Translation results for the word '{word}' have been saved.")
 

From 8311358e62ee3455dbc4712869fa2041a5616b8f Mon Sep 17 00:00:00 2001
From: Linfye <3158203624@qq.com>
Date: Tue, 27 Feb 2024 15:11:25 +0800
Subject: [PATCH 3/6] change file name

---
 .../formatted_data/translated_words.json      | 849 ++++++++++++++++++
 1 file changed, 849 insertions(+)
 create mode 100644 src/scribe_data/extract_transform/languages/English/formatted_data/translated_words.json

diff --git a/src/scribe_data/extract_transform/languages/English/formatted_data/translated_words.json b/src/scribe_data/extract_transform/languages/English/formatted_data/translated_words.json
new file mode 100644
index 000000000..f97905aa1
--- /dev/null
+++ b/src/scribe_data/extract_transform/languages/English/formatted_data/translated_words.json
@@ -0,0 +1,849 @@
+[
+    {
+        "after": {
+            "fr": "Après",
+            "de": "Nach dem",
+            "it": "Dopo",
+            "pt": "Depois de",
+            "ru": "После",
+            "es": "Después",
+            "sv": "Efter"
+        }
+    },
+    {
+        "around": {
+            "fr": "autour",
+            "de": "Umgeben",
+            "it": "Intorno",
+            "pt": "ao redor",
+            "ru": "вокруг",
+            "es": "alrededor",
+            "sv": "Omkring"
+        }
+    },
+    {
+        "beside": {
+            "fr": "à côté",
+            "de": "Neben",
+            "it": "accanto",
+            "pt": "ao lado",
+            "ru": "рядом",
+            "es": "Al lado",
+            "sv": "Bredvid"
+        }
+    },
+    {
+        "through": {
+            "fr": "à travers",
+            "de": "durch",
+            "it": "attraverso",
+            "pt": "Por meio",
+            "ru": "через",
+            "es": "A través",
+            "sv": "genom"
+        }
+    },
+    {
+        "past": {
+            "fr": "Le passé",
+            "de": "Vergangenheit",
+            "it": "Il passato",
+            "pt": "passado",
+            "ru": "прошлое",
+            "es": "pasado",
+            "sv": "förflutna"
+        }
+    },
+    {
+        "despite": {
+            "fr": "Malgré",
+            "de": "Trotz",
+            "it": "Nonostante",
+            "pt": "Apesar de",
+            "ru": "Несмотря",
+            "es": "A pesar de",
+            "sv": "Trots att"
+        }
+    },
+    {
+        "always": {
+            "fr": "toujours",
+            "de": "immer",
+            "it": "sempre",
+            "pt": "sempre",
+            "ru": "Всегда",
+            "es": "Siempre",
+            "sv": "Alltid"
+        }
+    },
+    {
+        "orange": {
+            "fr": "Orange",
+            "de": "Orange",
+            "it": "L’arancia",
+            "pt": "Laranja",
+            "ru": "Оранжевый",
+            "es": "Orange",
+            "sv": "orange"
+        }
+    },
+    {
+        "beautiful": {
+            "fr": "Beaucoup",
+            "de": "Schönes",
+            "it": "bella",
+            "pt": "bonita",
+            "ru": "Красивый",
+            "es": "hermosa",
+            "sv": "vacker"
+        }
+    },
+    {
+        "large": {
+            "fr": "grand grand",
+            "de": "Große",
+            "it": "Grande",
+            "pt": "Grande",
+            "ru": "Большой",
+            "es": "Grandes",
+            "sv": "Stora"
+        }
+    },
+    {
+        "serious": {
+            "fr": "sérieux",
+            "de": "ernst",
+            "it": "serio",
+            "pt": "sério",
+            "ru": "Серьезный",
+            "es": "serio",
+            "sv": "allvarligt"
+        }
+    },
+    {
+        "bright": {
+            "fr": "Brille",
+            "de": "Brille",
+            "it": "Il brillo",
+            "pt": "brilhante",
+            "ru": "Яркий",
+            "es": "brillo",
+            "sv": "ljusa"
+        }
+    },
+    {
+        "strong": {
+            "fr": "Forte",
+            "de": "starke",
+            "it": "forte",
+            "pt": "forte",
+            "ru": "Сильный",
+            "es": "fuerte",
+            "sv": "starka"
+        }
+    },
+    {
+        "sweet": {
+            "fr": "doux",
+            "de": "Süßes",
+            "it": "dolce",
+            "pt": "doce",
+            "ru": "Сладкий",
+            "es": "dulce",
+            "sv": "söta"
+        }
+    },
+    {
+        "clear": {
+            "fr": "clair",
+            "de": "klar",
+            "it": "chiaro",
+            "pt": "Claro",
+            "ru": "Яркий",
+            "es": "claramente",
+            "sv": "tydligt"
+        }
+    },
+    {
+        "deep": {
+            "fr": "profondeur",
+            "de": "tief",
+            "it": "profondo",
+            "pt": "profundidade",
+            "ru": "глубокий",
+            "es": "profundidad",
+            "sv": "djupt"
+        }
+    },
+    {
+        "different": {
+            "fr": "Différents",
+            "de": "unterschiedlich",
+            "it": "Differenza",
+            "pt": "Diferentes",
+            "ru": "Разное",
+            "es": "diferentes",
+            "sv": "annorlunda"
+        }
+    },
+    {
+        "difficult": {
+            "fr": "Difficile",
+            "de": "Schwierig",
+            "it": "Difficile",
+            "pt": "Difícil",
+            "ru": "Трудное",
+            "es": "difícil",
+            "sv": "svåra"
+        }
+    },
+    {
+        "come": {
+            "fr": "Venez",
+            "de": "Kommen",
+            "it": "Vieni",
+            "pt": "Venha",
+            "ru": "Приходите",
+            "es": "Venga",
+            "sv": "Kom och"
+        }
+    },
+    {
+        "wait": {
+            "fr": "Attendre",
+            "de": "Warten",
+            "it": "Aspetta",
+            "pt": "Aguardando",
+            "ru": "ждать",
+            "es": "Espera",
+            "sv": "Väntar"
+        }
+    },
+    {
+        "except": {
+            "fr": "Sauf",
+            "de": "Ausgenommen",
+            "it": "tranne che",
+            "pt": "Excepção",
+            "ru": "за исключением",
+            "es": "excepto",
+            "sv": "Förutom"
+        }
+    },
+    {
+        "purpose": {
+            "fr": "Objectif",
+            "de": "Zweck",
+            "it": "Obiettivo",
+            "pt": "Objetivo",
+            "ru": "Цель",
+            "es": "Objetivo",
+            "sv": "syftet"
+        }
+    },
+    {
+        "begin": {
+            "fr": "Début",
+            "de": "beginnt",
+            "it": "Iniziamo",
+            "pt": "Começando",
+            "ru": "Начало",
+            "es": "Inicio",
+            "sv": "Börja"
+        }
+    },
+    {
+        "throw": {
+            "fr": "Jouer",
+            "de": "Schießen",
+            "it": "lancio",
+            "pt": "lança",
+            "ru": "Стрельба",
+            "es": "lanzar",
+            "sv": "kastar"
+        }
+    },
+    {
+        "teach": {
+            "fr": "enseignant",
+            "de": "Lehre",
+            "it": "insegnare",
+            "pt": "Ensino",
+            "ru": "Учитель",
+            "es": "enseñar",
+            "sv": "Lär dig"
+        }
+    },
+    {
+        "slope": {
+            "fr": "Le Slope",
+            "de": "Schlange",
+            "it": "di Slope",
+            "pt": "Limpeza",
+            "ru": "Слайд",
+            "es": "El Slope",
+            "sv": "Slippa"
+        }
+    },
+    {
+        "smash": {
+            "fr": "Smoothie",
+            "de": "Schmutz",
+            "it": "di smash",
+            "pt": "Mãe Smash",
+            "ru": "Смаш",
+            "es": "El Smash",
+            "sv": "Smash"
+        }
+    },
+    {
+        "smile": {
+            "fr": "sourire",
+            "de": "Lächeln",
+            "it": "Il sorriso",
+            "pt": "sorriso",
+            "ru": "улыбка",
+            "es": "sonrisas",
+            "sv": "Ett leende"
+        }
+    },
+    {
+        "liberate": {
+            "fr": "libérée",
+            "de": "Befreiung",
+            "it": "Liberato",
+            "pt": "Libertação",
+            "ru": "освобождать",
+            "es": "Liberación",
+            "sv": "befriade"
+        }
+    },
+    {
+        "rate": {
+            "fr": "taux",
+            "de": "Rate",
+            "it": "Tasso",
+            "pt": "Taxa",
+            "ru": "Уровень",
+            "es": "La tasa",
+            "sv": "Rättigheter"
+        }
+    },
+    {
+        "point": {
+            "fr": "point",
+            "de": "Punkt",
+            "it": "Il punto",
+            "pt": "ponto",
+            "ru": "Точка",
+            "es": "punto",
+            "sv": "Poäng"
+        }
+    },
+    {
+        "print": {
+            "fr": "Printé",
+            "de": "Druck",
+            "it": "stampa",
+            "pt": "Impressão",
+            "ru": "Принтер",
+            "es": "impresión",
+            "sv": "tryck"
+        }
+    },
+    {
+        "bar": {
+            "fr": "Bar",
+            "de": "Bar",
+            "it": "Bar",
+            "pt": "Bar",
+            "ru": "Бар",
+            "es": "bar",
+            "sv": "Bar"
+        }
+    },
+    {
+        "break": {
+            "fr": "La pause",
+            "de": "Pause",
+            "it": "pausa",
+            "pt": "Pausa",
+            "ru": "Перерыв",
+            "es": "La pausa",
+            "sv": "Avbrott"
+        }
+    },
+    {
+        "call": {
+            "fr": "Appel",
+            "de": "Anrufe",
+            "it": "Chiamate",
+            "pt": "Chamado",
+            "ru": "Звонок",
+            "es": "llamadas",
+            "sv": "ringer"
+        }
+    },
+    {
+        "initiate": {
+            "fr": "Initiation",
+            "de": "Initiieren",
+            "it": "iniziare",
+            "pt": "Iniciação",
+            "ru": "Инициативы",
+            "es": "Inicio",
+            "sv": "initiera"
+        }
+    },
+    {
+        "contribute": {
+            "fr": "Contribuer",
+            "de": "Beiträge",
+            "it": "contributi",
+            "pt": "Contribuição",
+            "ru": "Вклад",
+            "es": "Contribución",
+            "sv": "Bidrag"
+        }
+    },
+    {
+        "test": {
+            "fr": "Tests",
+            "de": "Test",
+            "it": "Il test",
+            "pt": "Testes",
+            "ru": "Тест",
+            "es": "Testes",
+            "sv": "Testning"
+        }
+    },
+    {
+        "deal": {
+            "fr": "Accord",
+            "de": "Vereinbarung",
+            "it": "Accordo",
+            "pt": "acordo",
+            "ru": "Договор",
+            "es": "Acuerdo",
+            "sv": "Avtal"
+        }
+    },
+    {
+        "dine": {
+            "fr": "Dîne",
+            "de": "Ihre",
+            "it": "Il tuo",
+            "pt": "Dinheiro",
+            "ru": "Тёни",
+            "es": "Tiene",
+            "sv": "Dina"
+        }
+    },
+    {
+        "meet": {
+            "fr": "Rencontre",
+            "de": "Treffen",
+            "it": "Incontrare",
+            "pt": "Encontro",
+            "ru": "Встреча",
+            "es": "Encuentro",
+            "sv": "möter"
+        }
+    },
+    {
+        "area": {
+            "fr": "La zone",
+            "de": "Region",
+            "it": "Regione",
+            "pt": "Área",
+            "ru": "Область",
+            "es": "Área",
+            "sv": "Område"
+        }
+    },
+    {
+        "today": {
+            "fr": "Aujourd’hui",
+            "de": "Heute",
+            "it": "Oggi",
+            "pt": "Hoje em dia",
+            "ru": "Сегодня",
+            "es": "Hoy hoy",
+            "sv": "i dag"
+        }
+    },
+    {
+        "wear": {
+            "fr": "Porter",
+            "de": "Kleidung",
+            "it": "indossare",
+            "pt": "Usando",
+            "ru": "носить",
+            "es": "Usar",
+            "sv": "bära"
+        }
+    },
+    {
+        "wave": {
+            "fr": "La vague",
+            "de": "Wellen",
+            "it": "Il Wave",
+            "pt": "A onda",
+            "ru": "Волна",
+            "es": "Las ondas",
+            "sv": "våg"
+        }
+    },
+    {
+        "wind": {
+            "fr": "Le vent",
+            "de": "Wind",
+            "it": "Il vento",
+            "pt": "Vento",
+            "ru": "Ветер",
+            "es": "El viento",
+            "sv": "Vinden"
+        }
+    },
+    {
+        "floor": {
+            "fr": "Le sol",
+            "de": "Boden",
+            "it": "Il pavimento",
+            "pt": "piso",
+            "ru": "Поверхность",
+            "es": "El piso",
+            "sv": "golv"
+        }
+    },
+    {
+        "man": {
+            "fr": "L’homme",
+            "de": "Mann",
+            "it": "uomo",
+            "pt": "Homem",
+            "ru": "Человек",
+            "es": "El hombre",
+            "sv": "Människa"
+        }
+    },
+    {
+        "word": {
+            "fr": "Paroles",
+            "de": "Wort",
+            "it": "Parola",
+            "pt": "Palavra",
+            "ru": "Слово",
+            "es": "Palabras",
+            "sv": "ord"
+        }
+    },
+    {
+        "bath": {
+            "fr": "baignade",
+            "de": "Bad",
+            "it": "Il bagno",
+            "pt": "banho",
+            "ru": "ванны",
+            "es": "baño",
+            "sv": "Badrum"
+        }
+    },
+    {
+        "bear": {
+            "fr": "Les ours",
+            "de": "Bären",
+            "it": "Il miele",
+            "pt": "A Beira",
+            "ru": "Медведь",
+            "es": "El Bear",
+            "sv": "Björn"
+        }
+    },
+    {
+        "bell": {
+            "fr": "Bélon",
+            "de": "Bellen",
+            "it": "di Bell",
+            "pt": "Bela",
+            "ru": "Белл",
+            "es": "Bellas",
+            "sv": "Klocka"
+        }
+    },
+    {
+        "tooth": {
+            "fr": "Les dents",
+            "de": "Zähne",
+            "it": "Il dente",
+            "pt": "Dentes",
+            "ru": "Зуб",
+            "es": "Dientes",
+            "sv": "tänder"
+        }
+    },
+    {
+        "thumb": {
+            "fr": "Tumeur",
+            "de": "Dumm",
+            "it": "Peccato",
+            "pt": "Duma",
+            "ru": "Дюм",
+            "es": "Tumba",
+            "sv": "Dumma"
+        }
+    },
+    {
+        "lightning": {
+            "fr": "éclairage",
+            "de": "Leuchten",
+            "it": "illuminazione",
+            "pt": "Iluminação",
+            "ru": "светильник",
+            "es": "Iluminación",
+            "sv": "Ljuset"
+        }
+    },
+    {
+        "thunder": {
+            "fr": "Thunder",
+            "de": "Thunder",
+            "it": "di Thunder",
+            "pt": "Tandem",
+            "ru": "Тондер",
+            "es": "El Thunder",
+            "sv": "Thunder"
+        }
+    },
+    {
+        "ticket": {
+            "fr": "Les billets",
+            "de": "Tickets",
+            "it": "biglietto",
+            "pt": "Bilhete",
+            "ru": "Билеты",
+            "es": "Título",
+            "sv": "Biljett"
+        }
+    },
+    {
+        "tray": {
+            "fr": "Trois",
+            "de": "Dreie",
+            "it": "Trai",
+            "pt": "Três",
+            "ru": "Трей",
+            "es": "Trio",
+            "sv": "Tray"
+        }
+    },
+    {
+        "tree": {
+            "fr": "Arbre",
+            "de": "Bäume",
+            "it": "albero",
+            "pt": "Árvore",
+            "ru": "Деревья",
+            "es": "árboles",
+            "sv": "Träd"
+        }
+    },
+    {
+        "salt": {
+            "fr": "Le sel",
+            "de": "Salz",
+            "it": "Il sale",
+            "pt": "Sal",
+            "ru": "Соль",
+            "es": "Sal",
+            "sv": "salt"
+        }
+    },
+    {
+        "secretary": {
+            "fr": "Secrétaire",
+            "de": "Sekretär",
+            "it": "Segretario",
+            "pt": "Secretário",
+            "ru": "Секретарь",
+            "es": "Secretario",
+            "sv": "Sekreterare"
+        }
+    },
+    {
+        "shelf": {
+            "fr": "Shelleau",
+            "de": "Schiff",
+            "it": "Il Shell",
+            "pt": "Shelby",
+            "ru": "Шелф",
+            "es": "El shelf",
+            "sv": "Shelby"
+        }
+    },
+    {
+        "hat": {
+            "fr": "Chapeau",
+            "de": "Hatten",
+            "it": "Il cappello",
+            "pt": "Cabeça",
+            "ru": "Шапка",
+            "es": "Caballero",
+            "sv": "Hatten"
+        }
+    },
+    {
+        "dress": {
+            "fr": "Vêtements",
+            "de": "Kleidung",
+            "it": "vestito",
+            "pt": "vestido",
+            "ru": "Одежда",
+            "es": "El vestido",
+            "sv": "Klänning"
+        }
+    },
+    {
+        "daughter": {
+            "fr": "Fille",
+            "de": "Tochter",
+            "it": "figlia",
+            "pt": "Filha",
+            "ru": "дочь",
+            "es": "hija",
+            "sv": "dotter"
+        }
+    },
+    {
+        "son": {
+            "fr": "Fils",
+            "de": "Sohn",
+            "it": "Figlio",
+            "pt": "Filho",
+            "ru": "Сын",
+            "es": "hijo",
+            "sv": "Sonen"
+        }
+    },
+    {
+        "soup": {
+            "fr": "soupe",
+            "de": "Suppe",
+            "it": "La zuppa",
+            "pt": "Sopa",
+            "ru": "Суп",
+            "es": "Sopa",
+            "sv": "Sopp"
+        }
+    },
+    {
+        "space": {
+            "fr": "Espace",
+            "de": "Raum",
+            "it": "Spazio",
+            "pt": "Espaço",
+            "ru": "пространство",
+            "es": "Espacio",
+            "sv": "utrymme"
+        }
+    },
+    {
+        "car": {
+            "fr": "voiture",
+            "de": "Autos",
+            "it": "auto",
+            "pt": "carro",
+            "ru": "Автомобили",
+            "es": "El coche",
+            "sv": "Bilen"
+        }
+    },
+    {
+        "circle": {
+            "fr": "Cirque",
+            "de": "Kreis",
+            "it": "Circolo",
+            "pt": "Círculo",
+            "ru": "Круг",
+            "es": "Círculo",
+            "sv": "cirkel"
+        }
+    },
+    {
+        "sphere": {
+            "fr": "sphère",
+            "de": "Sphäre",
+            "it": "La sfera",
+            "pt": "Espécie",
+            "ru": "сфера",
+            "es": "Esfera",
+            "sv": "Sfera"
+        }
+    },
+    {
+        "steel": {
+            "fr": "Acier",
+            "de": "Stahl",
+            "it": "Acciaio",
+            "pt": "Aço",
+            "ru": "сталь",
+            "es": "El acero",
+            "sv": "stål"
+        }
+    },
+    {
+        "stomach": {
+            "fr": "Le ventre",
+            "de": "Magen",
+            "it": "dello stomaco",
+            "pt": "O estômago",
+            "ru": "желудок",
+            "es": "El estómago",
+            "sv": "magen"
+        }
+    },
+    {
+        "store": {
+            "fr": "Boutique",
+            "de": "Geschäfte",
+            "it": "negozio",
+            "pt": "Loja",
+            "ru": "магазин",
+            "es": "La tienda",
+            "sv": "Butiken"
+        }
+    },
+    {
+        "range": {
+            "fr": "Range",
+            "de": "Range",
+            "it": "Rango",
+            "pt": "Rango",
+            "ru": "Ранги",
+            "es": "Rango",
+            "sv": "Range"
+        }
+    },
+    {
+        "pig": {
+            "fr": "Le porc",
+            "de": "Schweine",
+            "it": "Il maiale",
+            "pt": "Porco",
+            "ru": "Свинья",
+            "es": "El cerdo",
+            "sv": "Svin"
+        }
+    },
+    {
+        "rice": {
+            "fr": "Le riz",
+            "de": "Riesen",
+            "it": "Il riso",
+            "pt": "O arroz",
+            "ru": "Рис",
+            "es": "El arroz",
+            "sv": "Rice"
+        }
+    }
+]
\ No newline at end of file

From 960c4f67b259f6300680a7c59db50025392f60e8 Mon Sep 17 00:00:00 2001
From: Linfye <3158203624@qq.com>
Date: Tue, 27 Feb 2024 15:30:33 +0800
Subject: [PATCH 4/6] file name changed

---
 .../languages/English/translations/__init__.py       | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/scribe_data/extract_transform/languages/English/translations/__init__.py b/src/scribe_data/extract_transform/languages/English/translations/__init__.py
index 2c59a0285..edb5b8fde 100644
--- a/src/scribe_data/extract_transform/languages/English/translations/__init__.py
+++ b/src/scribe_data/extract_transform/languages/English/translations/__init__.py
@@ -20,15 +20,15 @@
 
 translations = []
 
-if os.path.exists('translations.json'):
-    with open('translations.json', 'r', encoding='utf-8') as file:
+if os.path.exists('../formatted_data/translated_words.json'):
+    with open('../formatted_data/translated_words.json', 'r', encoding='utf-8') as file:
         translations = json.load(file)
 
 def signal_handler(sig, frame):
     print("\nThe interrupt signal has been caught and the current progress is being saved...")
-    with open('translations.json', 'w', encoding='utf-8') as file:
+    with open('../formatted_data/translated_words.json', 'w', encoding='utf-8') as file:
         json.dump(translations, file, ensure_ascii=False, indent=4)
-    print("The current progress is saved to the translations.json file.")
+    print("The current progress is saved to the translated_words.json file.")
     exit()
 
 signal.signal(signal.SIGINT, signal_handler)
@@ -42,8 +42,8 @@ def signal_handler(sig, frame):
         translated_word = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
         word_translations[word][lang_code] = translated_word
     translations.append(word_translations)
-    with open('../formatted_data/translations.json', 'w', encoding='utf-8') as file:
+    with open('../formatted_data/translated_words.json', 'w', encoding='utf-8') as file:
         json.dump(translations, file, ensure_ascii=False, indent=4)
     print(f"Translation results for the word '{word}' have been saved.")
 
-print("Translation results for all words are saved to the translations.json file.")
\ No newline at end of file
+print("Translation results for all words are saved to the translated_words.json file.")
\ No newline at end of file

From ef8be672d5b2be802b8960dfb7cca100bda9cf51 Mon Sep 17 00:00:00 2001
From: Linfye <3158203624@qq.com>
Date: Mon, 4 Mar 2024 23:04:03 +0800
Subject: [PATCH 5/6] minor fixes

---
 .../formatted_data/translated_words.json      |  2 +-
 .../English/translations/__init__.py          | 49 ----------------
 .../English/translations/translate_words.py   | 57 +++++++++++++++++++
 3 files changed, 58 insertions(+), 50 deletions(-)
 create mode 100644 src/scribe_data/extract_transform/languages/English/translations/translate_words.py

diff --git a/src/scribe_data/extract_transform/languages/English/formatted_data/translated_words.json b/src/scribe_data/extract_transform/languages/English/formatted_data/translated_words.json
index f97905aa1..deb33e9e3 100644
--- a/src/scribe_data/extract_transform/languages/English/formatted_data/translated_words.json
+++ b/src/scribe_data/extract_transform/languages/English/formatted_data/translated_words.json
@@ -846,4 +846,4 @@
             "sv": "Rice"
         }
     }
-]
\ No newline at end of file
+]
diff --git a/src/scribe_data/extract_transform/languages/English/translations/__init__.py b/src/scribe_data/extract_transform/languages/English/translations/__init__.py
index edb5b8fde..e69de29bb 100644
--- a/src/scribe_data/extract_transform/languages/English/translations/__init__.py
+++ b/src/scribe_data/extract_transform/languages/English/translations/__init__.py
@@ -1,49 +0,0 @@
-from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
-import json
-import signal
-import os
-
-with open('words_to_translate.json', 'r', encoding='utf-8') as file:
-    json_data = json.load(file)
-
-word_list = []
-
-for item in json_data:
-    word_list.append(item["word"])
-
-#print(word_list[0])
-
-model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
-tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
-
-target_languages = ["fr", "de", "it", "pt", "ru", "es", "sv"]
-
-translations = []
-
-if os.path.exists('../formatted_data/translated_words.json'):
-    with open('../formatted_data/translated_words.json', 'r', encoding='utf-8') as file:
-        translations = json.load(file)
-
-def signal_handler(sig, frame):
-    print("\nThe interrupt signal has been caught and the current progress is being saved...")
-    with open('../formatted_data/translated_words.json', 'w', encoding='utf-8') as file:
-        json.dump(translations, file, ensure_ascii=False, indent=4)
-    print("The current progress is saved to the translated_words.json file.")
-    exit()
-
-signal.signal(signal.SIGINT, signal_handler)
-
-for word in word_list[len(translations):]:
-    word_translations = {word: {}}
-    for lang_code in target_languages:
-        tokenizer.src_lang = "en"
-        encoded_word = tokenizer(word, return_tensors="pt")
-        generated_tokens = model.generate(**encoded_word, forced_bos_token_id=tokenizer.get_lang_id(lang_code))
-        translated_word = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
-        word_translations[word][lang_code] = translated_word
-    translations.append(word_translations)
-    with open('../formatted_data/translated_words.json', 'w', encoding='utf-8') as file:
-        json.dump(translations, file, ensure_ascii=False, indent=4)
-    print(f"Translation results for the word '{word}' have been saved.")
-
-print("Translation results for all words are saved to the translated_words.json file.")
\ No newline at end of file
diff --git a/src/scribe_data/extract_transform/languages/English/translations/translate_words.py b/src/scribe_data/extract_transform/languages/English/translations/translate_words.py
new file mode 100644
index 000000000..cfb66b34a
--- /dev/null
+++ b/src/scribe_data/extract_transform/languages/English/translations/translate_words.py
@@ -0,0 +1,57 @@
+from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
+import json
+import signal
+import os
+
+def translate_words(words_path: str):
+    with open(words_path, 'r', encoding='utf-8') as file:
+        words_json_data = json.load(file)
+
+    word_list = []
+
+    for item in words_json_data:
+        word_list.append(item["word"])
+
+    model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
+    tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
+
+    with open('../../../../../scribe_data/resources/language_meta_data.json', 'r', encoding='utf-8') as file:
+        lang_json_data = json.load(file)
+    iso_list = [lang['iso'] for lang in lang_json_data['languages']]
+    
+    target_languages = iso_list
+
+    translations = []
+
+    if os.path.exists('../formatted_data/translated_words.json'):
+        with open('../formatted_data/translated_words.json', 'r', encoding='utf-8') as file:
+            translations = json.load(file)
+
+    def signal_handler(sig, frame):
+        print("\nThe interrupt signal has been caught and the current progress is being saved...")
+        with open('../formatted_data/translated_words.json', 'w', encoding='utf-8') as file:
+            json.dump(translations, file, ensure_ascii=False, indent=4)
+            file.write('\n')
+        print("The current progress is saved to the translated_words.json file.")
+        exit()
+
+    signal.signal(signal.SIGINT, signal_handler)
+
+    for word in word_list[len(translations):]:
+        word_translations = {word: {}}
+        for lang_code in target_languages:
+            tokenizer.src_lang = "en"
+            encoded_word = tokenizer(word, return_tensors="pt")
+            generated_tokens = model.generate(**encoded_word, forced_bos_token_id=tokenizer.get_lang_id(lang_code))
+            translated_word = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
+            word_translations[word][lang_code] = translated_word
+        translations.append(word_translations)
+        with open('../formatted_data/translated_words.json', 'w', encoding='utf-8') as file:
+            json.dump(translations, file, ensure_ascii=False, indent=4)
+            file.write('\n')
+        print(f"Translation results for the word '{word}' have been saved.")
+
+    print("Translation results for all words are saved to the translated_words.json file.")
+    
+if __name__ == "__main__":
+    translate_words('words_to_translate.json')
\ No newline at end of file

From 24605846752a4977f155f4f193eeb96b4a8a5c91 Mon Sep 17 00:00:00 2001
From: Andrew Tavis McAllister <andrew.t.mcallister@gmail.com>
Date: Sun, 17 Mar 2024 14:00:40 +0100
Subject: [PATCH 6/6] #72 formatting for translation file and adding docstring

---
 .../English/translations/translate_words.py   | 69 +++++++++++++------
 1 file changed, 49 insertions(+), 20 deletions(-)

diff --git a/src/scribe_data/extract_transform/languages/English/translations/translate_words.py b/src/scribe_data/extract_transform/languages/English/translations/translate_words.py
index cfb66b34a..1efff8aac 100644
--- a/src/scribe_data/extract_transform/languages/English/translations/translate_words.py
+++ b/src/scribe_data/extract_transform/languages/English/translations/translate_words.py
@@ -1,10 +1,16 @@
-from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
+"""
+Translates the English words queried from Wikidata to all other Scribe languages.
+"""
+
 import json
-import signal
 import os
+import signal
+
+from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
+
 
 def translate_words(words_path: str):
-    with open(words_path, 'r', encoding='utf-8') as file:
+    with open(words_path, "r", encoding="utf-8") as file:
         words_json_data = json.load(file)
 
     word_list = []
@@ -15,43 +21,66 @@ def translate_words(words_path: str):
     model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
     tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
 
-    with open('../../../../../scribe_data/resources/language_meta_data.json', 'r', encoding='utf-8') as file:
+    with open(
+        "../../../../../scribe_data/resources/language_meta_data.json",
+        "r",
+        encoding="utf-8",
+    ) as file:
         lang_json_data = json.load(file)
-    iso_list = [lang['iso'] for lang in lang_json_data['languages']]
-    
+    iso_list = [lang["iso"] for lang in lang_json_data["languages"]]
+
     target_languages = iso_list
 
     translations = []
 
-    if os.path.exists('../formatted_data/translated_words.json'):
-        with open('../formatted_data/translated_words.json', 'r', encoding='utf-8') as file:
+    if os.path.exists("../formatted_data/translated_words.json"):
+        with open(
+            "../formatted_data/translated_words.json", "r", encoding="utf-8"
+        ) as file:
             translations = json.load(file)
 
     def signal_handler(sig, frame):
-        print("\nThe interrupt signal has been caught and the current progress is being saved...")
-        with open('../formatted_data/translated_words.json', 'w', encoding='utf-8') as file:
+        print(
+            "\nThe interrupt signal has been caught and the current progress is being saved..."
+        )
+        with open(
+            "../formatted_data/translated_words.json", "w", encoding="utf-8"
+        ) as file:
             json.dump(translations, file, ensure_ascii=False, indent=4)
-            file.write('\n')
-        print("The current progress is saved to the translated_words.json file.")
+            file.write("\n")
+
+        print("The current progress has been saved to the translated_words.json file.")
         exit()
 
     signal.signal(signal.SIGINT, signal_handler)
 
-    for word in word_list[len(translations):]:
+    for word in word_list[len(translations) :]:
         word_translations = {word: {}}
         for lang_code in target_languages:
             tokenizer.src_lang = "en"
             encoded_word = tokenizer(word, return_tensors="pt")
-            generated_tokens = model.generate(**encoded_word, forced_bos_token_id=tokenizer.get_lang_id(lang_code))
-            translated_word = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
+            generated_tokens = model.generate(
+                **encoded_word, forced_bos_token_id=tokenizer.get_lang_id(lang_code)
+            )
+            translated_word = tokenizer.batch_decode(
+                generated_tokens, skip_special_tokens=True
+            )[0]
             word_translations[word][lang_code] = translated_word
+
         translations.append(word_translations)
-        with open('../formatted_data/translated_words.json', 'w', encoding='utf-8') as file:
+
+        with open(
+            "../formatted_data/translated_words.json", "w", encoding="utf-8"
+        ) as file:
             json.dump(translations, file, ensure_ascii=False, indent=4)
-            file.write('\n')
+            file.write("\n")
+
         print(f"Translation results for the word '{word}' have been saved.")
 
-    print("Translation results for all words are saved to the translated_words.json file.")
-    
+    print(
+        "Translation results for all words are saved to the translated_words.json file."
+    )
+
+
 if __name__ == "__main__":
-    translate_words('words_to_translate.json')
\ No newline at end of file
+    translate_words("words_to_translate.json")