-
Notifications
You must be signed in to change notification settings - Fork 67
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
#75 Italian translation process and reorder directory structure
- Loading branch information
1 parent
02f220e
commit 2b72e64
Showing
22 changed files
with
213 additions
and
121 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
43 changes: 43 additions & 0 deletions
43
src/scribe_data/extract_transform/languages/Italian/translations/translate_words.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
""" | ||
Translates the Italian words queried from Wikidata to all other Scribe languages. | ||
Example | ||
------- | ||
python3 src/scribe_data/extract_transform/languages/Italian/translations/translate_words.py | ||
""" | ||
|
||
import json | ||
import os | ||
import sys | ||
|
||
PATH_TO_SCRIBE_ORG = os.path.dirname(sys.path[0]).split("Scribe-Data")[0] | ||
PATH_TO_SCRIBE_DATA_SRC = f"{PATH_TO_SCRIBE_ORG}Scribe-Data/src" | ||
sys.path.insert(0, PATH_TO_SCRIBE_DATA_SRC) | ||
|
||
from scribe_data.extract_transform.translation.translation_utils import ( # noqa: E402 | ||
translate_to_other_languages, | ||
) | ||
|
||
# The language whose queried words are translated by this script.
SRC_LANG = "Italian"

# Resolve every path relative to this script so it can be run from any
# working directory.
translate_script_dir = os.path.dirname(os.path.abspath(__file__))
words_to_translate_path = os.path.join(translate_script_dir, "words_to_translate.json")
translated_words_path = os.path.join(
    translate_script_dir, "../formatted_data/translated_words.json"
)

# Load the words to translate; each entry is read through its "word" key.
with open(words_to_translate_path, "r", encoding="utf-8") as file:
    json_data = json.load(file)

word_list = [entry["word"] for entry in json_data]

# Start from the translations saved by a previous run when that file exists,
# otherwise from an empty mapping.
if os.path.exists(translated_words_path):
    with open(translated_words_path, "r", encoding="utf-8") as file:
        translations = json.load(file)
else:
    translations = {}

translate_to_other_languages(
    source_language=SRC_LANG,
    word_list=word_list,
    translations=translations,
    batch_size=100,
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
111 changes: 111 additions & 0 deletions
111
src/scribe_data/extract_transform/translation/translation_utils.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
""" | ||
Utility functions for the machine translation process. | ||
Contents: | ||
translation_interrupt_handler, | ||
translate_to_other_languages | ||
""" | ||
|
||
import json | ||
import os | ||
import signal | ||
import sys | ||
|
||
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer | ||
|
||
PATH_TO_SCRIBE_ORG = os.path.dirname(sys.path[0]).split("Scribe-Data")[0] | ||
PATH_TO_SCRIBE_DATA_SRC = f"{PATH_TO_SCRIBE_ORG}Scribe-Data/src" | ||
sys.path.insert(0, PATH_TO_SCRIBE_DATA_SRC) | ||
|
||
from scribe_data.utils import ( # noqa: E402 | ||
get_language_dir_path, | ||
get_language_iso, | ||
get_target_langcodes, | ||
) | ||
|
||
|
||
def translation_interrupt_handler(source_language, translations):
    """
    Handles interrupt signals and saves the current translation progress.

    Parameters
    ----------
        source_language : str
            The source language being translated from.

        translations : dict
            The translations gathered so far, keyed by source word.

    Raises
    ------
        SystemExit
            Always raised once the progress has been written to disk.
    """
    print(
        "\nThe interrupt signal has been caught and the current progress is being saved..."
    )

    with open(
        f"{get_language_dir_path(source_language)}/formatted_data/translated_words.json",
        "w",
        encoding="utf-8",
    ) as file:
        json.dump(translations, file, ensure_ascii=False, indent=4)

    print("The current progress is saved to the translated_words.json file.")

    # Use sys.exit() rather than the exit() helper: the latter is injected by
    # the site module for interactive sessions and is not guaranteed to exist
    # (e.g. under ``python -S`` or in frozen/embedded interpreters).
    sys.exit()
|
||
|
||
def translate_to_other_languages(source_language, word_list, translations, batch_size):
    """
    Translates a list of words from the source language to other target languages using batch processing.

    The ``translations`` dictionary is updated in place and written to the
    source language's formatted_data/translated_words.json file once all
    batches are done. While the translation is running, an interrupt signal
    (SIGINT) saves the progress made so far via translation_interrupt_handler.

    Parameters
    ----------
        source_language : str
            The source language being translated from.

        word_list : list[str]
            The list of words to translate.

        translations : dict
            The current dictionary of translations, keyed by source word and
            then by target language code. It is mutated in place.

        batch_size : int
            The number of words to translate in each batch.

    Returns
    -------
        None
    """
    model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
    tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")

    # Neither the source language nor the set of target languages changes
    # between batches, so resolve them once instead of on every iteration.
    # list() guards against the helper returning a one-shot iterator.
    tokenizer.src_lang = get_language_iso(source_language)
    target_lang_codes = list(get_target_langcodes(source_language))

    previous_sigint_handler = signal.signal(
        signal.SIGINT,
        lambda sig, frame: translation_interrupt_handler(source_language, translations),
    )

    try:
        for batch_number, start in enumerate(
            range(0, len(word_list), batch_size), start=1
        ):
            batch_words = word_list[start : start + batch_size]
            print(f"Translating batch {batch_number}: {batch_words}")

            for lang_code in target_lang_codes:
                encoded_words = tokenizer(
                    batch_words, return_tensors="pt", padding=True
                )
                generated_tokens = model.generate(
                    **encoded_words,
                    forced_bos_token_id=tokenizer.get_lang_id(lang_code),
                )
                translated_words = tokenizer.batch_decode(
                    generated_tokens, skip_special_tokens=True
                )

                for word, translation in zip(batch_words, translated_words):
                    if word not in translations:
                        translations[word] = {}

                    translations[word][lang_code] = translation

            print(f"Batch {batch_number} translation completed.")

        with open(
            f"{get_language_dir_path(source_language)}/formatted_data/translated_words.json",
            "w",
            encoding="utf-8",
        ) as file:
            json.dump(translations, file, ensure_ascii=False, indent=4)

    finally:
        # Do not leak the progress-saving handler past this call. A previous
        # handler of None means it was not installed from Python and cannot
        # be passed back to signal.signal().
        if previous_sigint_handler is not None:
            signal.signal(signal.SIGINT, previous_sigint_handler)

    print(
        "Translation results for all words are saved to the translated_words.json file."
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
Oops, something went wrong.