From 1fa59ea4330230a62f370b71f525d60e41fbe325 Mon Sep 17 00:00:00 2001 From: Linfye <3158203624@qq.com> Date: Tue, 27 Feb 2024 14:53:36 +0800 Subject: [PATCH 1/6] English trans finished --- .../English/translations/__init__.py | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/src/scribe_data/extract_transform/languages/English/translations/__init__.py b/src/scribe_data/extract_transform/languages/English/translations/__init__.py index e69de29bb..b040d1fbe 100644 --- a/src/scribe_data/extract_transform/languages/English/translations/__init__.py +++ b/src/scribe_data/extract_transform/languages/English/translations/__init__.py @@ -0,0 +1,49 @@ +from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer +import json +import signal +import os + +with open('words_to_translate.json', 'r', encoding='utf-8') as file: + json_data = json.load(file) + +word_list = [] + +for item in json_data: + word_list.append(item["word"]) + +#print(word_list[0]) + +model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M") +tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M") + +target_languages = ["fr", "de", "it", "pt", "ru", "es", "sv"] + +translations = [] + +if os.path.exists('translations.json'): + with open('translations.json', 'r', encoding='utf-8') as file: + translations = json.load(file) + +def signal_handler(sig, frame): + print("\nThe interrupt signal has been caught and the current progress is being saved...") + with open('translations.json', 'w', encoding='utf-8') as file: + json.dump(translations, file, ensure_ascii=False, indent=4) + print("The current progress is saved to the translations.json file.") + exit() + +signal.signal(signal.SIGINT, signal_handler) + +for word in word_list[len(translations):]: + word_translations = {word: {}} + for lang_code in target_languages: + tokenizer.src_lang = "en" + encoded_word = tokenizer(word, return_tensors="pt") + generated_tokens = model.generate(**encoded_word, forced_bos_token_id=tokenizer.get_lang_id(lang_code)) + translated_word = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0] + word_translations[word][lang_code] = translated_word + translations.append(word_translations) + with open('translations.json', 'w', encoding='utf-8') as file: + json.dump(translations, file, ensure_ascii=False, indent=4) + print(f"Translation results for the word '{word}' have been saved.") + +print("Translation results for all words are saved to the translations.json file.") \ No newline at end of file From 6e9d5e87080b17a3cbffacce4896d6bca53c9127 Mon Sep 17 00:00:00 2001 From: Linfye <3158203624@qq.com> Date: Tue, 27 Feb 2024 14:56:06 +0800 Subject: [PATCH 2/6] change dic --- .../languages/English/translations/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scribe_data/extract_transform/languages/English/translations/__init__.py b/src/scribe_data/extract_transform/languages/English/translations/__init__.py index b040d1fbe..2c59a0285 100644 --- a/src/scribe_data/extract_transform/languages/English/translations/__init__.py +++ b/src/scribe_data/extract_transform/languages/English/translations/__init__.py @@ -42,7 +42,7 @@ def signal_handler(sig, frame): translated_word = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0] word_translations[word][lang_code] = translated_word translations.append(word_translations) - with open('translations.json', 'w', encoding='utf-8') as file: + with open('../formatted_data/translations.json', 'w', encoding='utf-8') as file: json.dump(translations, file, ensure_ascii=False, indent=4) print(f"Translation results for the word '{word}' have been saved.") From 8311358e62ee3455dbc4712869fa2041a5616b8f Mon Sep 17 00:00:00 2001 From: Linfye <3158203624@qq.com> Date: Tue, 27 Feb 2024 15:11:25 +0800 Subject: [PATCH 3/6] change file name --- .../formatted_data/translated_words.json | 849 ++++++++++++++++++ 1 file changed, 849 insertions(+) create mode 100644 src/scribe_data/extract_transform/languages/English/formatted_data/translated_words.json diff --git a/src/scribe_data/extract_transform/languages/English/formatted_data/translated_words.json b/src/scribe_data/extract_transform/languages/English/formatted_data/translated_words.json new file mode 100644 index 000000000..f97905aa1 --- /dev/null +++ b/src/scribe_data/extract_transform/languages/English/formatted_data/translated_words.json @@ -0,0 +1,849 @@ +[ + { + "after": { + "fr": "Après", + "de": "Nach dem", + "it": "Dopo", + "pt": "Depois de", + "ru": "После", + "es": "Después", + "sv": "Efter" + } + }, + { + "around": { + "fr": "autour", + "de": "Umgeben", + "it": "Intorno", + "pt": "ao redor", + "ru": "вокруг", + "es": "alrededor", + "sv": "Omkring" + } + }, + { + "beside": { + "fr": "à côté", + "de": "Neben", + "it": "accanto", + "pt": "ao lado", + "ru": "рядом", + "es": "Al lado", + "sv": "Bredvid" + } + }, + { + "through": { + "fr": "à travers", + "de": "durch", + "it": "attraverso", + "pt": "Por meio", + "ru": "через", + "es": "A través", + "sv": "genom" + } + }, + { + "past": { + "fr": "Le passé", + "de": "Vergangenheit", + "it": "Il passato", + "pt": "passado", + "ru": "прошлое", + "es": "pasado", + "sv": "förflutna" + } + }, + { + "despite": { + "fr": "Malgré", + "de": "Trotz", + "it": "Nonostante", + "pt": "Apesar de", + "ru": "Несмотря", + "es": "A pesar de", + "sv": "Trots att" + } + }, + { + "always": { + "fr": "toujours", + "de": "immer", + "it": "sempre", + "pt": "sempre", + "ru": "Всегда", + "es": "Siempre", + "sv": "Alltid" + } + }, + { + "orange": { + "fr": "Orange", + "de": "Orange", + "it": "L’arancia", + "pt": "Laranja", + "ru": "Оранжевый", + "es": "Orange", + "sv": "orange" + } + }, + { + "beautiful": { + "fr": "Beaucoup", + "de": "Schönes", + "it": "bella", + "pt": "bonita", + "ru": "Красивый", + "es": "hermosa", + "sv": "vacker" + } + }, + { + "large": { + "fr": "grand grand", + "de": "Große", + "it": "Grande", + "pt": "Grande", + "ru": "Большой", + "es": "Grandes", + "sv": "Stora" + } + }, + { + "serious": { + "fr": "sérieux", + "de": "ernst", + "it": "serio", + "pt": "sério", + "ru": "Серьезный", + "es": "serio", + "sv": "allvarligt" + } + }, + { + "bright": { + "fr": "Brille", + "de": "Brille", + "it": "Il brillo", + "pt": "brilhante", + "ru": "Яркий", + "es": "brillo", + "sv": "ljusa" + } + }, + { + "strong": { + "fr": "Forte", + "de": "starke", + "it": "forte", + "pt": "forte", + "ru": "Сильный", + "es": "fuerte", + "sv": "starka" + } + }, + { + "sweet": { + "fr": "doux", + "de": "Süßes", + "it": "dolce", + "pt": "doce", + "ru": "Сладкий", + "es": "dulce", + "sv": "söta" + } + }, + { + "clear": { + "fr": "clair", + "de": "klar", + "it": "chiaro", + "pt": "Claro", + "ru": "Яркий", + "es": "claramente", + "sv": "tydligt" + } + }, + { + "deep": { + "fr": "profondeur", + "de": "tief", + "it": "profondo", + "pt": "profundidade", + "ru": "глубокий", + "es": "profundidad", + "sv": "djupt" + } + }, + { + "different": { + "fr": "Différents", + "de": "unterschiedlich", + "it": "Differenza", + "pt": "Diferentes", + "ru": "Разное", + "es": "diferentes", + "sv": "annorlunda" + } + }, + { + "difficult": { + "fr": "Difficile", + "de": "Schwierig", + "it": "Difficile", + "pt": "Difícil", + "ru": "Трудное", + "es": "difícil", + "sv": "svåra" + } + }, + { + "come": { + "fr": "Venez", + "de": "Kommen", + "it": "Vieni", + "pt": "Venha", + "ru": "Приходите", + "es": "Venga", + "sv": "Kom och" + } + }, + { + "wait": { + "fr": "Attendre", + "de": "Warten", + "it": "Aspetta", + "pt": "Aguardando", + "ru": "ждать", + "es": "Espera", + "sv": "Väntar" + } + }, + { + "except": { + "fr": "Sauf", + "de": "Ausgenommen", + "it": "tranne che", + "pt": "Excepção", + "ru": "за исключением", + "es": "excepto", + "sv": "Förutom" + } + }, + { + "purpose": { + "fr": "Objectif", + "de": "Zweck", + "it": "Obiettivo", + "pt": "Objetivo", + "ru": "Цель", + "es": "Objetivo", + "sv": "syftet" + } + }, + { + "begin": { + "fr": "Début", + "de": "beginnt", + "it": "Iniziamo", + "pt": "Começando", + "ru": "Начало", + "es": "Inicio", + "sv": "Börja" + } + }, + { + "throw": { + "fr": "Jouer", + "de": "Schießen", + "it": "lancio", + "pt": "lança", + "ru": "Стрельба", + "es": "lanzar", + "sv": "kastar" + } + }, + { + "teach": { + "fr": "enseignant", + "de": "Lehre", + "it": "insegnare", + "pt": "Ensino", + "ru": "Учитель", + "es": "enseñar", + "sv": "Lär dig" + } + }, + { + "slope": { + "fr": "Le Slope", + "de": "Schlange", + "it": "di Slope", + "pt": "Limpeza", + "ru": "Слайд", + "es": "El Slope", + "sv": "Slippa" + } + }, + { + "smash": { + "fr": "Smoothie", + "de": "Schmutz", + "it": "di smash", + "pt": "Mãe Smash", + "ru": "Смаш", + "es": "El Smash", + "sv": "Smash" + } + }, + { + "smile": { + "fr": "sourire", + "de": "Lächeln", + "it": "Il sorriso", + "pt": "sorriso", + "ru": "улыбка", + "es": "sonrisas", + "sv": "Ett leende" + } + }, + { + "liberate": { + "fr": "libérée", + "de": "Befreiung", + "it": "Liberato", + "pt": "Libertação", + "ru": "освобождать", + "es": "Liberación", + "sv": "befriade" + } + }, + { + "rate": { + "fr": "taux", + "de": "Rate", + "it": "Tasso", + "pt": "Taxa", + "ru": "Уровень", + "es": "La tasa", + "sv": "Rättigheter" + } + }, + { + "point": { + "fr": "point", + "de": "Punkt", + "it": "Il punto", + "pt": "ponto", + "ru": "Точка", + "es": "punto", + "sv": "Poäng" + } + }, + { + "print": { + "fr": "Printé", + "de": "Druck", + "it": "stampa", + "pt": "Impressão", + "ru": "Принтер", + "es": "impresión", + "sv": "tryck" + } + }, + { + "bar": { + "fr": "Bar", + "de": "Bar", + "it": "Bar", + "pt": "Bar", + "ru": "Бар", + "es": "bar", + "sv": "Bar" + } + }, + { + "break": { + "fr": "La pause", + "de": "Pause", + "it": "pausa", + "pt": "Pausa", + "ru": "Перерыв", + "es": "La pausa", + "sv": "Avbrott" + } + }, + { + "call": { + "fr": "Appel", + "de": "Anrufe", + "it": "Chiamate", + "pt": "Chamado", + "ru": "Звонок", + "es": "llamadas", + "sv": "ringer" + } + }, + { + "initiate": { + "fr": "Initiation", + "de": "Initiieren", + "it": "iniziare", + "pt": "Iniciação", + "ru": "Инициативы", + "es": "Inicio", + "sv": "initiera" + } + }, + { + "contribute": { + "fr": "Contribuer", + "de": "Beiträge", + "it": "contributi", + "pt": "Contribuição", + "ru": "Вклад", + "es": "Contribución", + "sv": "Bidrag" + } + }, + { + "test": { + "fr": "Tests", + "de": "Test", + "it": "Il test", + "pt": "Testes", + "ru": "Тест", + "es": "Testes", + "sv": "Testning" + } + }, + { + "deal": { + "fr": "Accord", + "de": "Vereinbarung", + "it": "Accordo", + "pt": "acordo", + "ru": "Договор", + "es": "Acuerdo", + "sv": "Avtal" + } + }, + { + "dine": { + "fr": "Dîne", + "de": "Ihre", + "it": "Il tuo", + "pt": "Dinheiro", + "ru": "Тёни", + "es": "Tiene", + "sv": "Dina" + } + }, + { + "meet": { + "fr": "Rencontre", + "de": "Treffen", + "it": "Incontrare", + "pt": "Encontro", + "ru": "Встреча", + "es": "Encuentro", + "sv": "möter" + } + }, + { + "area": { + "fr": "La zone", + "de": "Region", + "it": "Regione", + "pt": "Área", + "ru": "Область", + "es": "Área", + "sv": "Område" + } + }, + { + "today": { + "fr": "Aujourd’hui", + "de": "Heute", + "it": "Oggi", + "pt": "Hoje em dia", + "ru": "Сегодня", + "es": "Hoy hoy", + "sv": "i dag" + } + }, + { + "wear": { + "fr": "Porter", + "de": "Kleidung", + "it": "indossare", + "pt": "Usando", + "ru": "носить", + "es": "Usar", + "sv": "bära" + } + }, + { + "wave": { + "fr": "La vague", + "de": "Wellen", + "it": "Il Wave", + "pt": "A onda", + "ru": "Волна", + "es": "Las ondas", + "sv": "våg" + } + }, + { + "wind": { + "fr": "Le vent", + "de": "Wind", + "it": "Il vento", + "pt": "Vento", + "ru": "Ветер", + "es": "El viento", + "sv": "Vinden" + } + }, + { + "floor": { + "fr": "Le sol", + "de": "Boden", + "it": "Il pavimento", + "pt": "piso", + "ru": "Поверхность", + "es": "El piso", + "sv": "golv" + } + }, + { + "man": { + "fr": "L’homme", + "de": "Mann", + "it": "uomo", + "pt": "Homem", + "ru": "Человек", + "es": "El hombre", + "sv": "Människa" + } + }, + { + "word": { + "fr": "Paroles", + "de": "Wort", + "it": "Parola", + "pt": "Palavra", + "ru": "Слово", + "es": "Palabras", + "sv": "ord" + } + }, + { + "bath": { + "fr": "baignade", + "de": "Bad", + "it": "Il bagno", + "pt": "banho", + "ru": "ванны", + "es": "baño", + "sv": "Badrum" + } + }, + { + "bear": { + "fr": "Les ours", + "de": "Bären", + "it": "Il miele", + "pt": "A Beira", + "ru": "Медведь", + "es": "El Bear", + "sv": "Björn" + } + }, + { + "bell": { + "fr": "Bélon", + "de": "Bellen", + "it": "di Bell", + "pt": "Bela", + "ru": "Белл", + "es": "Bellas", + "sv": "Klocka" + } + }, + { + "tooth": { + "fr": "Les dents", + "de": "Zähne", + "it": "Il dente", + "pt": "Dentes", + "ru": "Зуб", + "es": "Dientes", + "sv": "tänder" + } + }, + { + "thumb": { + "fr": "Tumeur", + "de": "Dumm", + "it": "Peccato", + "pt": "Duma", + "ru": "Дюм", + "es": "Tumba", + "sv": "Dumma" + } + }, + { + "lightning": { + "fr": "éclairage", + "de": "Leuchten", + "it": "illuminazione", + "pt": "Iluminação", + "ru": "светильник", + "es": "Iluminación", + "sv": "Ljuset" + } + }, + { + "thunder": { + "fr": "Thunder", + "de": "Thunder", + "it": "di Thunder", + "pt": "Tandem", + "ru": "Тондер", + "es": "El Thunder", + "sv": "Thunder" + } + }, + { + "ticket": { + "fr": "Les billets", + "de": "Tickets", + "it": "biglietto", + "pt": "Bilhete", + "ru": "Билеты", + "es": "Título", + "sv": "Biljett" + } + }, + { + "tray": { + "fr": "Trois", + "de": "Dreie", + "it": "Trai", + "pt": "Três", + "ru": "Трей", + "es": "Trio", + "sv": "Tray" + } + }, + { + "tree": { + "fr": "Arbre", + "de": "Bäume", + "it": "albero", + "pt": "Árvore", + "ru": "Деревья", + "es": "árboles", + "sv": "Träd" + } + }, + { + "salt": { + "fr": "Le sel", + "de": "Salz", + "it": "Il sale", + "pt": "Sal", + "ru": "Соль", + "es": "Sal", + "sv": "salt" + } + }, + { + "secretary": { + "fr": "Secrétaire", + "de": "Sekretär", + "it": "Segretario", + "pt": "Secretário", + "ru": "Секретарь", + "es": "Secretario", + "sv": "Sekreterare" + } + }, + { + "shelf": { + "fr": "Shelleau", + "de": "Schiff", + "it": "Il Shell", + "pt": "Shelby", + "ru": "Шелф", + "es": "El shelf", + "sv": "Shelby" + } + }, + { + "hat": { + "fr": "Chapeau", + "de": "Hatten", + "it": "Il cappello", + "pt": "Cabeça", + "ru": "Шапка", + "es": "Caballero", + "sv": "Hatten" + } + }, + { + "dress": { + "fr": "Vêtements", + "de": "Kleidung", + "it": "vestito", + "pt": "vestido", + "ru": "Одежда", + "es": "El vestido", + "sv": "Klänning" + } + }, + { + "daughter": { + "fr": "Fille", + "de": "Tochter", + "it": "figlia", + "pt": "Filha", + "ru": "дочь", + "es": "hija", + "sv": "dotter" + } + }, + { + "son": { + "fr": "Fils", + "de": "Sohn", + "it": "Figlio", + "pt": "Filho", + "ru": "Сын", + "es": "hijo", + "sv": "Sonen" + } + }, + { + "soup": { + "fr": "soupe", + "de": "Suppe", + "it": "La zuppa", + "pt": "Sopa", + "ru": "Суп", + "es": "Sopa", + "sv": "Sopp" + } + }, + { + "space": { + "fr": "Espace", + "de": "Raum", + "it": "Spazio", + "pt": "Espaço", + "ru": "пространство", + "es": "Espacio", + "sv": "utrymme" + } + }, + { + "car": { + "fr": "voiture", + "de": "Autos", + "it": "auto", + "pt": "carro", + "ru": "Автомобили", + "es": "El coche", + "sv": "Bilen" + } + }, + { + "circle": { + "fr": "Cirque", + "de": "Kreis", + "it": "Circolo", + "pt": "Círculo", + "ru": "Круг", + "es": "Círculo", + "sv": "cirkel" + } + }, + { + "sphere": { + "fr": "sphère", + "de": "Sphäre", + "it": "La sfera", + "pt": "Espécie", + "ru": "сфера", + "es": "Esfera", + "sv": "Sfera" + } + }, + { + "steel": { + "fr": "Acier", + "de": "Stahl", + "it": "Acciaio", + "pt": "Aço", + "ru": "сталь", + "es": "El acero", + "sv": "stål" + } + }, + { + "stomach": { + "fr": "Le ventre", + "de": "Magen", + "it": "dello stomaco", + "pt": "O estômago", + "ru": "желудок", + "es": "El estómago", + "sv": "magen" + } + }, + { + "store": { + "fr": "Boutique", + "de": "Geschäfte", + "it": "negozio", + "pt": "Loja", + "ru": "магазин", + "es": "La tienda", + "sv": "Butiken" + } + }, + { + "range": { + "fr": "Range", + "de": "Range", + "it": "Rango", + "pt": "Rango", + "ru": "Ранги", + "es": "Rango", + "sv": "Range" + } + }, + { + "pig": { + "fr": "Le porc", + "de": "Schweine", + "it": "Il maiale", + "pt": "Porco", + "ru": "Свинья", + "es": "El cerdo", + "sv": "Svin" + } + }, + { + "rice": { + "fr": "Le riz", + "de": "Riesen", + "it": "Il riso", + "pt": "O arroz", + "ru": "Рис", + "es": "El arroz", + "sv": "Rice" + } + } +] \ No newline at end of file From 960c4f67b259f6300680a7c59db50025392f60e8 Mon Sep 17 00:00:00 2001 From: Linfye <3158203624@qq.com> Date: Tue, 27 Feb 2024 15:30:33 +0800 Subject: [PATCH 4/6] file name changed --- .../languages/English/translations/__init__.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/scribe_data/extract_transform/languages/English/translations/__init__.py b/src/scribe_data/extract_transform/languages/English/translations/__init__.py index 2c59a0285..edb5b8fde 100644 --- a/src/scribe_data/extract_transform/languages/English/translations/__init__.py +++ b/src/scribe_data/extract_transform/languages/English/translations/__init__.py @@ -20,15 +20,15 @@ translations = [] -if os.path.exists('translations.json'): - with open('translations.json', 'r', encoding='utf-8') as file: +if os.path.exists('../formatted_data/translated_words.json'): + with open('../formatted_data/translated_words.json', 'r', encoding='utf-8') as file: translations = json.load(file) def signal_handler(sig, frame): print("\nThe interrupt signal has been caught and the current progress is being saved...") - with open('translations.json', 'w', encoding='utf-8') as file: + with open('../formatted_data/translated_words.json', 'w', encoding='utf-8') as file: json.dump(translations, file, ensure_ascii=False, indent=4) - print("The current progress is saved to the translations.json file.") + print("The current progress is saved to the translated_words.json file.") exit() signal.signal(signal.SIGINT, signal_handler) @@ -42,8 +42,8 @@ def signal_handler(sig, frame): translated_word = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0] word_translations[word][lang_code] = translated_word translations.append(word_translations) - with open('../formatted_data/translations.json', 'w', encoding='utf-8') as file: + with open('../formatted_data/translated_words.json', 'w', encoding='utf-8') as file: json.dump(translations, file, ensure_ascii=False, indent=4) print(f"Translation results for the word '{word}' have been saved.") -print("Translation results for all words are saved to the translations.json file.") \ No newline at end of file +print("Translation results for all words are saved to the translated_words.json file.") \ No newline at end of file From ef8be672d5b2be802b8960dfb7cca100bda9cf51 Mon Sep 17 00:00:00 2001 From: Linfye <3158203624@qq.com> Date: Mon, 4 Mar 2024 23:04:03 +0800 Subject: [PATCH 5/6] minus fixed --- .../formatted_data/translated_words.json | 2 +- .../English/translations/__init__.py | 49 ---------------- .../English/translations/translate_words.py | 57 +++++++++++++++++++ 3 files changed, 58 insertions(+), 50 deletions(-) create mode 100644 src/scribe_data/extract_transform/languages/English/translations/translate_words.py diff --git a/src/scribe_data/extract_transform/languages/English/formatted_data/translated_words.json b/src/scribe_data/extract_transform/languages/English/formatted_data/translated_words.json index f97905aa1..deb33e9e3 100644 --- a/src/scribe_data/extract_transform/languages/English/formatted_data/translated_words.json +++ b/src/scribe_data/extract_transform/languages/English/formatted_data/translated_words.json @@ -846,4 +846,4 @@ "sv": "Rice" } } -] \ No newline at end of file +] diff --git a/src/scribe_data/extract_transform/languages/English/translations/__init__.py b/src/scribe_data/extract_transform/languages/English/translations/__init__.py index edb5b8fde..e69de29bb 100644 --- a/src/scribe_data/extract_transform/languages/English/translations/__init__.py +++ b/src/scribe_data/extract_transform/languages/English/translations/__init__.py @@ -1,49 +0,0 @@ -from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer -import json -import signal -import os - -with open('words_to_translate.json', 'r', encoding='utf-8') as file: - json_data = json.load(file) - -word_list = [] - -for item in json_data: - word_list.append(item["word"]) - -#print(word_list[0]) - -model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M") -tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M") - -target_languages = ["fr", "de", "it", "pt", "ru", "es", "sv"] - -translations = [] - -if os.path.exists('../formatted_data/translated_words.json'): - with open('../formatted_data/translated_words.json', 'r', encoding='utf-8') as file: - translations = json.load(file) - -def signal_handler(sig, frame): - print("\nThe interrupt signal has been caught and the current progress is being saved...") - with open('../formatted_data/translated_words.json', 'w', encoding='utf-8') as file: - json.dump(translations, file, ensure_ascii=False, indent=4) - print("The current progress is saved to the translated_words.json file.") - exit() - -signal.signal(signal.SIGINT, signal_handler) - -for word in word_list[len(translations):]: - word_translations = {word: {}} - for lang_code in target_languages: - tokenizer.src_lang = "en" - encoded_word = tokenizer(word, return_tensors="pt") - generated_tokens = model.generate(**encoded_word, forced_bos_token_id=tokenizer.get_lang_id(lang_code)) - translated_word = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0] - word_translations[word][lang_code] = translated_word - translations.append(word_translations) - with open('../formatted_data/translated_words.json', 'w', encoding='utf-8') as file: - json.dump(translations, file, ensure_ascii=False, indent=4) - print(f"Translation results for the word '{word}' have been saved.") - -print("Translation results for all words are saved to the translated_words.json file.") \ No newline at end of file diff --git a/src/scribe_data/extract_transform/languages/English/translations/translate_words.py b/src/scribe_data/extract_transform/languages/English/translations/translate_words.py new file mode 100644 index 000000000..cfb66b34a --- /dev/null +++ b/src/scribe_data/extract_transform/languages/English/translations/translate_words.py @@ -0,0 +1,57 @@ +from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer +import json +import signal +import os + +def translate_words(words_path: str): + with open(words_path, 'r', encoding='utf-8') as file: + words_json_data = json.load(file) + + word_list = [] + + for item in words_json_data: + word_list.append(item["word"]) + + model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M") + tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M") + + with open('../../../../../scribe_data/resources/language_meta_data.json', 'r', encoding='utf-8') as file: + lang_json_data = json.load(file) + iso_list = [lang['iso'] for lang in lang_json_data['languages']] + + target_languages = iso_list + + translations = [] + + if os.path.exists('../formatted_data/translated_words.json'): + with open('../formatted_data/translated_words.json', 'r', encoding='utf-8') as file: + translations = json.load(file) + + def signal_handler(sig, frame): + print("\nThe interrupt signal has been caught and the current progress is being saved...") + with open('../formatted_data/translated_words.json', 'w', encoding='utf-8') as file: + json.dump(translations, file, ensure_ascii=False, indent=4) + file.write('\n') + print("The current progress is saved to the translated_words.json file.") + exit() + + signal.signal(signal.SIGINT, signal_handler) + + for word in word_list[len(translations):]: + word_translations = {word: {}} + for lang_code in target_languages: + tokenizer.src_lang = "en" + encoded_word = tokenizer(word, return_tensors="pt") + generated_tokens = model.generate(**encoded_word, forced_bos_token_id=tokenizer.get_lang_id(lang_code)) + translated_word = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0] + word_translations[word][lang_code] = translated_word + translations.append(word_translations) + with open('../formatted_data/translated_words.json', 'w', encoding='utf-8') as file: + json.dump(translations, file, ensure_ascii=False, indent=4) + file.write('\n') + print(f"Translation results for the word '{word}' have been saved.") + + print("Translation results for all words are saved to the translated_words.json file.") + +if __name__ == "__main__": + translate_words('words_to_translate.json') \ No newline at end of file From 24605846752a4977f155f4f193eeb96b4a8a5c91 Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Sun, 17 Mar 2024 14:00:40 +0100 Subject: [PATCH 6/6] #72 formatting for translation file and adding docstring --- .../English/translations/translate_words.py | 69 +++++++++++++------ 1 file changed, 49 insertions(+), 20 deletions(-) diff --git a/src/scribe_data/extract_transform/languages/English/translations/translate_words.py b/src/scribe_data/extract_transform/languages/English/translations/translate_words.py index cfb66b34a..1efff8aac 100644 --- a/src/scribe_data/extract_transform/languages/English/translations/translate_words.py +++ b/src/scribe_data/extract_transform/languages/English/translations/translate_words.py @@ -1,10 +1,16 @@ -from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer +""" +Translates the English words queried from Wikidata to all other Scribe languages. +""" + import json -import signal import os +import signal + +from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer + def translate_words(words_path: str): - with open(words_path, 'r', encoding='utf-8') as file: + with open(words_path, "r", encoding="utf-8") as file: words_json_data = json.load(file) word_list = [] @@ -15,43 +21,66 @@ def translate_words(words_path: str): model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M") tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M") - with open('../../../../../scribe_data/resources/language_meta_data.json', 'r', encoding='utf-8') as file: + with open( + "../../../../../scribe_data/resources/language_meta_data.json", + "r", + encoding="utf-8", + ) as file: lang_json_data = json.load(file) - iso_list = [lang['iso'] for lang in lang_json_data['languages']] - + iso_list = [lang["iso"] for lang in lang_json_data["languages"]] + target_languages = iso_list translations = [] - if os.path.exists('../formatted_data/translated_words.json'): - with open('../formatted_data/translated_words.json', 'r', encoding='utf-8') as file: + if os.path.exists("../formatted_data/translated_words.json"): + with open( + "../formatted_data/translated_words.json", "r", encoding="utf-8" + ) as file: translations = json.load(file) def signal_handler(sig, frame): - print("\nThe interrupt signal has been caught and the current progress is being saved...") - with open('../formatted_data/translated_words.json', 'w', encoding='utf-8') as file: + print( + "\nThe interrupt signal has been caught and the current progress is being saved..." + ) + with open( + "../formatted_data/translated_words.json", "w", encoding="utf-8" + ) as file: json.dump(translations, file, ensure_ascii=False, indent=4) - file.write('\n') - print("The current progress is saved to the translated_words.json file.") + file.write("\n") + + print("The current progress has been saved to the translated_words.json file.") exit() signal.signal(signal.SIGINT, signal_handler) - for word in word_list[len(translations):]: + for word in word_list[len(translations) :]: word_translations = {word: {}} for lang_code in target_languages: tokenizer.src_lang = "en" encoded_word = tokenizer(word, return_tensors="pt") - generated_tokens = model.generate(**encoded_word, forced_bos_token_id=tokenizer.get_lang_id(lang_code)) - translated_word = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0] + generated_tokens = model.generate( + **encoded_word, forced_bos_token_id=tokenizer.get_lang_id(lang_code) + ) + translated_word = tokenizer.batch_decode( + generated_tokens, skip_special_tokens=True + )[0] word_translations[word][lang_code] = translated_word + translations.append(word_translations) - with open('../formatted_data/translated_words.json', 'w', encoding='utf-8') as file: + + with open( + "../formatted_data/translated_words.json", "w", encoding="utf-8" + ) as file: json.dump(translations, file, ensure_ascii=False, indent=4) - file.write('\n') + file.write("\n") + print(f"Translation results for the word '{word}' have been saved.") - print("Translation results for all words are saved to the translated_words.json file.") - + print( + "Translation results for all words are saved to the translated_words.json file." + ) + + if __name__ == "__main__": - translate_words('words_to_translate.json') \ No newline at end of file + translate_words("words_to_translate.json")