From 2b72e6408611bd4da2521c052049b1221a49a2db Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Mon, 25 Mar 2024 00:16:48 +0100 Subject: [PATCH] #75 Italian translation process and reorder directory structure --- CHANGELOG.md | 7 +- README.md | 10 +- .../English/translations/translate_words.py | 4 +- .../French/translations/translate_words.py | 4 +- .../German/translations/translate_words.py | 4 +- .../Italian/translations/translate_words.py | 43 +++++++ .../translations/translate_words.py | 4 +- .../Russian/translations/translate_words.py | 4 +- .../Spanish/translations/translate_words.py | 4 +- .../Swedish/translations/translate_words.py | 4 +- .../translation/translation_utils.py | 111 ++++++++++++++++++ .../update_words_to_translate.py | 2 +- .../{ => unicode}/emoji_utils.py | 0 .../unicode/gen_emoji_lexicon.ipynb | 4 +- .../{ => unicode}/process_unicode.py | 4 +- .../{ => wikidata}/query_profanity.sparql | 0 .../query_words_to_translate.sparql | 0 .../{ => wikidata}/update_data.py | 2 +- .../{ => wikipedia}/extract_wiki.py | 0 .../wikipedia/gen_autosuggestions.ipynb | 34 ++++-- .../{ => wikipedia}/process_wiki.py | 0 src/scribe_data/utils.py | 89 -------------- 22 files changed, 213 insertions(+), 121 deletions(-) create mode 100644 src/scribe_data/extract_transform/languages/Italian/translations/translate_words.py create mode 100644 src/scribe_data/extract_transform/translation/translation_utils.py rename src/scribe_data/extract_transform/{ => translation}/update_words_to_translate.py (96%) rename src/scribe_data/extract_transform/{ => unicode}/emoji_utils.py (100%) rename src/scribe_data/extract_transform/{ => unicode}/process_unicode.py (98%) rename src/scribe_data/extract_transform/{ => wikidata}/query_profanity.sparql (100%) rename src/scribe_data/extract_transform/{ => wikidata}/query_words_to_translate.sparql (100%) rename src/scribe_data/extract_transform/{ => wikidata}/update_data.py (99%) rename src/scribe_data/extract_transform/{ => wikipedia}/extract_wiki.py (100%) rename src/scribe_data/extract_transform/{ => wikipedia}/process_wiki.py (100%) diff --git a/CHANGELOG.md b/CHANGELOG.md index f2bce29c7..932bbd67e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,10 +12,7 @@ Emojis for the following are chosen based on [gitmoji](https://gitmoji.dev/). ## [Upcoming] Scribe-Data 3.3.0 - - - +- The translation process has been updated to allow for translations from non-English languages ([#72](https://github.com/scribe-org/Scribe-Data/issues/72), [#73](https://github.com/scribe-org/Scribe-Data/issues/73), [#74](https://github.com/scribe-org/Scribe-Data/issues/74), [#75](https://github.com/scribe-org/Scribe-Data/issues/75), [#76](https://github.com/scribe-org/Scribe-Data/issues/76), [#77](https://github.com/scribe-org/Scribe-Data/issues/77), [#78](https://github.com/scribe-org/Scribe-Data/issues/78), [#79](https://github.com/scribe-org/Scribe-Data/issues/79)). - The documentation has been given a new layout with the logo in the top left ([#90](https://github.com/scribe-org/Scribe-Data/issues/90)). - The documentation now has links to the code at the top of each page ([#91](https://github.com/scribe-org/Scribe-Data/issues/91)). @@ -25,6 +22,8 @@ Emojis for the following are chosen based on [gitmoji](https://gitmoji.dev/). - A Ruff based GitHub workflow was added to check the code formatting and lint the codebase on each pull request ([#109](https://github.com/scribe-org/Scribe-Data/issues/109)).
- The `_update_files` directory was renamed `update_files` as these files are used in non-internal manners now ([#57](https://github.com/scribe-org/Scribe-Data/issues/57)). - A common function has been created to map Wikidata ids to noun genders ([#69](https://github.com/scribe-org/Scribe-Data/issues/69)). +- Files in the `extract_transform` directory were moved based on whether they access Wikidata, Wikipedia or Unicode. + - Translation files were further moved to their own directory. ## Scribe-Data 3.2.2 diff --git a/README.md b/README.md index 8de609ff1..feda2101f 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ ## Wikidata and Wikipedia language data extraction -**Scribe-Data** contains the scripts for extracting and formatting data from [Wikidata](https://www.wikidata.org/) and [Wikipedia](https://www.wikipedia.org/) for Scribe applications. Updates to the language keyboard and interface data can be done using [scribe_data/load/update_data.py](https://github.com/scribe-org/Scribe-Data/tree/main/src/scribe_data/load/update_data.py) and the notebooks within the [scribe_data/load](https://github.com/scribe-org/Scribe-Data/tree/main/src/scribe_data/load) directory. +**Scribe-Data** contains the scripts for extracting and formatting data from [Wikidata](https://www.wikidata.org/) and [Wikipedia](https://www.wikipedia.org/) for Scribe applications. Updates to the language keyboard and interface data can be done using [scribe_data/extract_transform/wikidata/update_data.py](https://github.com/scribe-org/Scribe-Data/tree/main/src/scribe_data/extract_transform/wikidata/update_data.py) and the notebooks within the [scribe_data/load](https://github.com/scribe-org/Scribe-Data/tree/main/src/scribe_data/load) directory. > [!NOTE]\ > The [contributing](#contributing) section has information for those interested, with the articles and presentations in [featured by](#featured-by) also being good resources for learning more about Scribe. @@ -38,14 +38,14 @@ Check out Scribe's [architecture diagrams](https://github.com/scribe-org/Organiz # Process [`⇧`](#contents) -[scribe_data/extract_transform/update_data.py](https://github.com/scribe-org/Scribe-Data/blob/main/src/scribe_data/extract_transform/update_data.py) and the notebooks within the [scribe_data/extract_transform](https://github.com/scribe-org/Scribe-Data/tree/main/src/scribe_data/extract_transform) directory are used to update all data for [Scribe-iOS](https://github.com/scribe-org/Scribe-iOS), with this functionality later being expanded to update [Scribe-Android](https://github.com/scribe-org/Scribe-Android) and [Scribe-Desktop](https://github.com/scribe-org/Scribe-Desktop) when they're active. +[scribe_data/extract_transform/wikidata/update_data.py](https://github.com/scribe-org/Scribe-Data/blob/main/src/scribe_data/extract_transform/wikidata/update_data.py) and the notebooks within the [scribe_data/extract_transform](https://github.com/scribe-org/Scribe-Data/tree/main/src/scribe_data/extract_transform) directory are used to update all data for [Scribe-iOS](https://github.com/scribe-org/Scribe-iOS), with this functionality later being expanded to update [Scribe-Android](https://github.com/scribe-org/Scribe-Android) and [Scribe-Desktop](https://github.com/scribe-org/Scribe-Desktop) when they're active.
-The main data update process in [update_data.py](https://github.com/scribe-org/Scribe-Data/blob/main/src/scribe_data/extract_transform/update_data.py) triggers [SPARQL queries](https://github.com/scribe-org/Scribe-Data/tree/main/src/scribe_data/extract_transform/languages) to query language data from [Wikidata](https://www.wikidata.org/) using [SPARQLWrapper](https://github.com/RDFLib/sparqlwrapper) as a URI. The autosuggestion process derives popular words from [Wikipedia](https://www.wikipedia.org/) as well as those words that normally follow them for an effective baseline feature until natural language processing methods are employed. Functions to generate autosuggestions are ran in [gen_autosuggestions.ipynb](https://github.com/scribe-org/Scribe-Data/blob/main/src/scribe_data/extract_transform/wikipedia/gen_autosuggestions.ipynb). Emojis are further sourced from [Unicode CLDR](https://github.com/unicode-org/cldr), with this process being ran in [gen_emoji_lexicon.ipynb](https://github.com/scribe-org/Scribe-Data/blob/main/src/scribe_data/extract_transform/unicode/gen_emoji_lexicon.ipynb). +The main data update process in [update_data.py](https://github.com/scribe-org/Scribe-Data/blob/main/src/scribe_data/extract_transform/wikidata/update_data.py) triggers [SPARQL queries](https://github.com/scribe-org/Scribe-Data/tree/main/src/scribe_data/extract_transform/languages) to query language data from [Wikidata](https://www.wikidata.org/) using [SPARQLWrapper](https://github.com/RDFLib/sparqlwrapper) as a URI. The autosuggestion process derives popular words from [Wikipedia](https://www.wikipedia.org/) as well as those words that normally follow them for an effective baseline feature until natural language processing methods are employed. Functions to generate autosuggestions are run in [gen_autosuggestions.ipynb](https://github.com/scribe-org/Scribe-Data/blob/main/src/scribe_data/extract_transform/wikipedia/gen_autosuggestions.ipynb). Emojis are further sourced from [Unicode CLDR](https://github.com/unicode-org/cldr), with this process being run in [gen_emoji_lexicon.ipynb](https://github.com/scribe-org/Scribe-Data/blob/main/src/scribe_data/extract_transform/unicode/gen_emoji_lexicon.ipynb).

-Running [update_data.py](https://github.com/scribe-org/Scribe-Data/blob/main/src/scribe_data/extract_transform/update_data.py) is done via the following CLI command: +Running [update_data.py](https://github.com/scribe-org/Scribe-Data/blob/main/src/scribe_data/extract_transform/wikidata/update_data.py) is done via the following CLI command:

```bash
-python3 src/scribe_data/extract_transform/update_data.py
+python3 src/scribe_data/extract_transform/wikidata/update_data.py
```

The ultimate goal is that this repository will house language packs that are periodically updated with new [Wikidata](https://www.wikidata.org/) lexicographical data and data from other sources. These packs would then be available to download by users of Scribe applications.
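As the updated docstring of update_data.py later in this patch shows, the script also accepts optional arguments to limit an update to given languages and word types; a minimal sketch of such a targeted invocation, with the argument format taken from that docstring:

```bash
# Update only French and German nouns and verbs; the path reflects
# the new wikidata directory introduced by this patch.
python3 src/scribe_data/extract_transform/wikidata/update_data.py '["French", "German"]' '["nouns", "verbs"]'
```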
diff --git a/src/scribe_data/extract_transform/languages/English/translations/translate_words.py b/src/scribe_data/extract_transform/languages/English/translations/translate_words.py index fad6e9ec4..27b2ac1c4 100644 --- a/src/scribe_data/extract_transform/languages/English/translations/translate_words.py +++ b/src/scribe_data/extract_transform/languages/English/translations/translate_words.py @@ -14,7 +14,9 @@ PATH_TO_SCRIBE_DATA_SRC = f"{PATH_TO_SCRIBE_ORG}Scribe-Data/src" sys.path.insert(0, PATH_TO_SCRIBE_DATA_SRC) -from scribe_data.utils import translate_to_other_languages # noqa: E402 +from scribe_data.extract_transform.translation.translation_utils import ( # noqa: E402 + translate_to_other_languages, +) SRC_LANG = "English" translate_script_dir = os.path.dirname(os.path.abspath(__file__)) diff --git a/src/scribe_data/extract_transform/languages/French/translations/translate_words.py b/src/scribe_data/extract_transform/languages/French/translations/translate_words.py index a8177bdb7..48db29482 100644 --- a/src/scribe_data/extract_transform/languages/French/translations/translate_words.py +++ b/src/scribe_data/extract_transform/languages/French/translations/translate_words.py @@ -14,7 +14,9 @@ PATH_TO_SCRIBE_DATA_SRC = f"{PATH_TO_SCRIBE_ORG}Scribe-Data/src" sys.path.insert(0, PATH_TO_SCRIBE_DATA_SRC) -from scribe_data.utils import translate_to_other_languages # noqa: E402 +from scribe_data.extract_transform.translation.translation_utils import ( # noqa: E402 + translate_to_other_languages, +) SRC_LANG = "French" translate_script_dir = os.path.dirname(os.path.abspath(__file__)) diff --git a/src/scribe_data/extract_transform/languages/German/translations/translate_words.py b/src/scribe_data/extract_transform/languages/German/translations/translate_words.py index e838530e5..35fcebe36 100644 --- a/src/scribe_data/extract_transform/languages/German/translations/translate_words.py +++ b/src/scribe_data/extract_transform/languages/German/translations/translate_words.py @@ -14,7 +14,9 @@ PATH_TO_SCRIBE_DATA_SRC = f"{PATH_TO_SCRIBE_ORG}Scribe-Data/src" sys.path.insert(0, PATH_TO_SCRIBE_DATA_SRC) -from scribe_data.utils import translate_to_other_languages # noqa: E402 +from scribe_data.extract_transform.translation.translation_utils import ( # noqa: E402 + translate_to_other_languages, +) SRC_LANG = "German" translate_script_dir = os.path.dirname(os.path.abspath(__file__)) diff --git a/src/scribe_data/extract_transform/languages/Italian/translations/translate_words.py b/src/scribe_data/extract_transform/languages/Italian/translations/translate_words.py new file mode 100644 index 000000000..832db0263 --- /dev/null +++ b/src/scribe_data/extract_transform/languages/Italian/translations/translate_words.py @@ -0,0 +1,43 @@ +""" +Translates the Italian words queried from Wikidata to all other Scribe languages. 
+ +Example +------- + python3 src/scribe_data/extract_transform/languages/Italian/translations/translate_words.py +""" + +import json +import os +import sys + +PATH_TO_SCRIBE_ORG = os.path.dirname(sys.path[0]).split("Scribe-Data")[0] +PATH_TO_SCRIBE_DATA_SRC = f"{PATH_TO_SCRIBE_ORG}Scribe-Data/src" +sys.path.insert(0, PATH_TO_SCRIBE_DATA_SRC) + +from scribe_data.extract_transform.translation.translation_utils import ( # noqa: E402 + translate_to_other_languages, +) + +SRC_LANG = "Italian" +translate_script_dir = os.path.dirname(os.path.abspath(__file__)) +words_to_translate_path = os.path.join(translate_script_dir, "words_to_translate.json") + +with open(words_to_translate_path, "r", encoding="utf-8") as file: + json_data = json.load(file) + +word_list = [item["word"] for item in json_data] + +translations = {} +translated_words_path = os.path.join( + translate_script_dir, "../formatted_data/translated_words.json" +) +if os.path.exists(translated_words_path): + with open(translated_words_path, "r", encoding="utf-8") as file: + translations = json.load(file) + +translate_to_other_languages( + source_language=SRC_LANG, + word_list=word_list, + translations=translations, + batch_size=100, +) diff --git a/src/scribe_data/extract_transform/languages/Portuguese/translations/translate_words.py b/src/scribe_data/extract_transform/languages/Portuguese/translations/translate_words.py index 884083d8f..23256f718 100644 --- a/src/scribe_data/extract_transform/languages/Portuguese/translations/translate_words.py +++ b/src/scribe_data/extract_transform/languages/Portuguese/translations/translate_words.py @@ -14,7 +14,9 @@ PATH_TO_SCRIBE_DATA_SRC = f"{PATH_TO_SCRIBE_ORG}Scribe-Data/src" sys.path.insert(0, PATH_TO_SCRIBE_DATA_SRC) -from scribe_data.utils import translate_to_other_languages # noqa: E402 +from scribe_data.extract_transform.translation.translation_utils import ( # noqa: E402 + translate_to_other_languages, +) SRC_LANG = "Portuguese" translate_script_dir = os.path.dirname(os.path.abspath(__file__)) diff --git a/src/scribe_data/extract_transform/languages/Russian/translations/translate_words.py b/src/scribe_data/extract_transform/languages/Russian/translations/translate_words.py index a2b78e8f1..1de4f75c2 100644 --- a/src/scribe_data/extract_transform/languages/Russian/translations/translate_words.py +++ b/src/scribe_data/extract_transform/languages/Russian/translations/translate_words.py @@ -14,7 +14,9 @@ PATH_TO_SCRIBE_DATA_SRC = f"{PATH_TO_SCRIBE_ORG}Scribe-Data/src" sys.path.insert(0, PATH_TO_SCRIBE_DATA_SRC) -from scribe_data.utils import translate_to_other_languages # noqa: E402 +from scribe_data.extract_transform.translation.translation_utils import ( # noqa: E402 + translate_to_other_languages, +) SRC_LANG = "Russian" translate_script_dir = os.path.dirname(os.path.abspath(__file__)) diff --git a/src/scribe_data/extract_transform/languages/Spanish/translations/translate_words.py b/src/scribe_data/extract_transform/languages/Spanish/translations/translate_words.py index b5151b151..2d6c172f2 100644 --- a/src/scribe_data/extract_transform/languages/Spanish/translations/translate_words.py +++ b/src/scribe_data/extract_transform/languages/Spanish/translations/translate_words.py @@ -14,7 +14,9 @@ PATH_TO_SCRIBE_DATA_SRC = f"{PATH_TO_SCRIBE_ORG}Scribe-Data/src" sys.path.insert(0, PATH_TO_SCRIBE_DATA_SRC) -from scribe_data.utils import translate_to_other_languages # noqa: E402 +from scribe_data.extract_transform.translation.translation_utils import ( # noqa: E402 + translate_to_other_languages, 
+) SRC_LANG = "Spanish" translate_script_dir = os.path.dirname(os.path.abspath(__file__)) diff --git a/src/scribe_data/extract_transform/languages/Swedish/translations/translate_words.py b/src/scribe_data/extract_transform/languages/Swedish/translations/translate_words.py index f960b5650..f753bbedc 100644 --- a/src/scribe_data/extract_transform/languages/Swedish/translations/translate_words.py +++ b/src/scribe_data/extract_transform/languages/Swedish/translations/translate_words.py @@ -14,7 +14,9 @@ PATH_TO_SCRIBE_DATA_SRC = f"{PATH_TO_SCRIBE_ORG}Scribe-Data/src" sys.path.insert(0, PATH_TO_SCRIBE_DATA_SRC) -from scribe_data.utils import translate_to_other_languages # noqa: E402 +from scribe_data.extract_transform.translation.translation_utils import ( # noqa: E402 + translate_to_other_languages, +) SRC_LANG = "Swedish" translate_script_dir = os.path.dirname(os.path.abspath(__file__)) diff --git a/src/scribe_data/extract_transform/translation/translation_utils.py b/src/scribe_data/extract_transform/translation/translation_utils.py new file mode 100644 index 000000000..380474795 --- /dev/null +++ b/src/scribe_data/extract_transform/translation/translation_utils.py @@ -0,0 +1,111 @@ +""" +Utility functions for the machine translation process. + +Contents: + translation_interrupt_handler, + translate_to_other_languages +""" + +import json +import os +import signal +import sys + +from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer + +PATH_TO_SCRIBE_ORG = os.path.dirname(sys.path[0]).split("Scribe-Data")[0] +PATH_TO_SCRIBE_DATA_SRC = f"{PATH_TO_SCRIBE_ORG}Scribe-Data/src" +sys.path.insert(0, PATH_TO_SCRIBE_DATA_SRC) + +from scribe_data.utils import ( # noqa: E402 + get_language_dir_path, + get_language_iso, + get_target_langcodes, +) + + +def translation_interrupt_handler(source_language, translations): + """ + Handles interrupt signals and saves the current translation progress. + + Parameters + ---------- + source_language : str + The source language being translated from. + + translations : dict + The current dictionary of translations. + """ + print( + "\nThe interrupt signal has been caught and the current progress is being saved..." + ) + + with open( + f"{get_language_dir_path(source_language)}/formatted_data/translated_words.json", + "w", + encoding="utf-8", + ) as file: + json.dump(translations, file, ensure_ascii=False, indent=4) + + print("The current progress is saved to the translated_words.json file.") + exit() + + +def translate_to_other_languages(source_language, word_list, translations, batch_size): + """ + Translates a list of words from the source language to other target languages using batch processing. + + Parameters + ---------- + source_language : str + The source language being translated from. + + word_list : list[str] + The list of words to translate. + + translations : dict + The current dictionary of translations. + + batch_size : int + The number of words to translate in each batch.
+ """ + model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M") + tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M") + + signal.signal( + signal.SIGINT, + lambda sig, frame: translation_interrupt_handler(source_language, translations), + ) + + for i in range(0, len(word_list), batch_size): + batch_words = word_list[i : i + batch_size] + print(f"Translating batch {i//batch_size + 1}: {batch_words}") + + for lang_code in get_target_langcodes(source_language): + tokenizer.src_lang = get_language_iso(source_language) + encoded_words = tokenizer(batch_words, return_tensors="pt", padding=True) + generated_tokens = model.generate( + **encoded_words, forced_bos_token_id=tokenizer.get_lang_id(lang_code) + ) + translated_words = tokenizer.batch_decode( + generated_tokens, skip_special_tokens=True + ) + + for word, translation in zip(batch_words, translated_words): + if word not in translations: + translations[word] = {} + + translations[word][lang_code] = translation + + print(f"Batch {i//batch_size + 1} translation completed.") + + with open( + f"{get_language_dir_path(source_language)}/formatted_data/translated_words.json", + "w", + encoding="utf-8", + ) as file: + json.dump(translations, file, ensure_ascii=False, indent=4) + + print( + "Translation results for all words are saved to the translated_words.json file." + ) diff --git a/src/scribe_data/extract_transform/update_words_to_translate.py b/src/scribe_data/extract_transform/translation/update_words_to_translate.py similarity index 96% rename from src/scribe_data/extract_transform/update_words_to_translate.py rename to src/scribe_data/extract_transform/translation/update_words_to_translate.py index ab12a44d5..7979b5d23 100644 --- a/src/scribe_data/extract_transform/update_words_to_translate.py +++ b/src/scribe_data/extract_transform/translation/update_words_to_translate.py @@ -8,7 +8,7 @@ Example ------- - python update_words_to_translate.py '["French", "German"]' + python3 src/scribe_data/extract_transform/translation/update_words_to_translate.py '["French", "German"]' """ import json diff --git a/src/scribe_data/extract_transform/emoji_utils.py b/src/scribe_data/extract_transform/unicode/emoji_utils.py similarity index 100% rename from src/scribe_data/extract_transform/emoji_utils.py rename to src/scribe_data/extract_transform/unicode/emoji_utils.py diff --git a/src/scribe_data/extract_transform/unicode/gen_emoji_lexicon.ipynb b/src/scribe_data/extract_transform/unicode/gen_emoji_lexicon.ipynb index a1dcd577b..62d4b7a72 100644 --- a/src/scribe_data/extract_transform/unicode/gen_emoji_lexicon.ipynb +++ b/src/scribe_data/extract_transform/unicode/gen_emoji_lexicon.ipynb @@ -35,9 +35,7 @@ "source": [ "import os\n", "import sys\n", - "import json\n", "\n", - "from tqdm.auto import tqdm\n", "from IPython.display import display, HTML\n", "display(HTML(\"\"))" ] @@ -71,7 +69,7 @@ }, "outputs": [], "source": [ - "from scribe_data.extract_transform.process_unicode import gen_emoji_lexicon" + "from scribe_data.extract_transform.unicode.process_unicode import gen_emoji_lexicon" ] }, { diff --git a/src/scribe_data/extract_transform/process_unicode.py b/src/scribe_data/extract_transform/unicode/process_unicode.py similarity index 98% rename from src/scribe_data/extract_transform/process_unicode.py rename to src/scribe_data/extract_transform/unicode/process_unicode.py index 4297f56dc..8dc88f1f4 100644 --- a/src/scribe_data/extract_transform/process_unicode.py +++ 
b/src/scribe_data/extract_transform/unicode/process_unicode.py @@ -14,13 +14,13 @@ from icu import Char, UProperty from tqdm.auto import tqdm -from scribe_data.extract_transform.emoji_utils import get_emoji_codes_to_ignore +from scribe_data.extract_transform.unicode.emoji_utils import get_emoji_codes_to_ignore from scribe_data.utils import ( get_language_iso, get_path_from_et_dir, ) -from . import _resources +from .. import _resources emoji_codes_to_ignore = get_emoji_codes_to_ignore() diff --git a/src/scribe_data/extract_transform/query_profanity.sparql b/src/scribe_data/extract_transform/wikidata/query_profanity.sparql similarity index 100% rename from src/scribe_data/extract_transform/query_profanity.sparql rename to src/scribe_data/extract_transform/wikidata/query_profanity.sparql diff --git a/src/scribe_data/extract_transform/query_words_to_translate.sparql b/src/scribe_data/extract_transform/wikidata/query_words_to_translate.sparql similarity index 100% rename from src/scribe_data/extract_transform/query_words_to_translate.sparql rename to src/scribe_data/extract_transform/wikidata/query_words_to_translate.sparql diff --git a/src/scribe_data/extract_transform/update_data.py b/src/scribe_data/extract_transform/wikidata/update_data.py similarity index 99% rename from src/scribe_data/extract_transform/update_data.py rename to src/scribe_data/extract_transform/wikidata/update_data.py index 23d274459..0a9fc7152 100644 --- a/src/scribe_data/extract_transform/update_data.py +++ b/src/scribe_data/extract_transform/wikidata/update_data.py @@ -11,7 +11,7 @@ Example ------- - python update_data.py '["French", "German"]' '["nouns", "verbs"]' + python3 src/scribe_data/extract_transform/wikidata/update_data.py '["French", "German"]' '["nouns", "verbs"]' """ import itertools diff --git a/src/scribe_data/extract_transform/extract_wiki.py b/src/scribe_data/extract_transform/wikipedia/extract_wiki.py similarity index 100% rename from src/scribe_data/extract_transform/extract_wiki.py rename to src/scribe_data/extract_transform/wikipedia/extract_wiki.py diff --git a/src/scribe_data/extract_transform/wikipedia/gen_autosuggestions.ipynb b/src/scribe_data/extract_transform/wikipedia/gen_autosuggestions.ipynb index ed98a9d30..77cc413fb 100644 --- a/src/scribe_data/extract_transform/wikipedia/gen_autosuggestions.ipynb +++ b/src/scribe_data/extract_transform/wikipedia/gen_autosuggestions.ipynb @@ -21,6 +21,18 @@ "This notebook is used to run the functions found in Scribe-Data to extract, clean and load autosuggestion files into Scribe apps." 
] }, + { + "cell_type": "code", + "execution_count": null, + "id": "bec5ff38", + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "\n", + "warnings.filterwarnings(\"ignore\", message=r\"Passing\", category=FutureWarning)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -36,8 +48,6 @@ "import os\n", "import sys\n", "import json\n", - "import warnings\n", - "warnings.filterwarnings(\"ignore\", message=r\"Passing\", category=FutureWarning)\n", "\n", "from tqdm.auto import tqdm\n", "from IPython.core.display import display, HTML\n", @@ -73,8 +83,14 @@ }, "outputs": [], "source": [ - "from scribe_data.extract_transform.extract_wiki import download_wiki, parse_to_ndjson\n", - "from scribe_data.extract_transform.process_wiki import clean, gen_autosuggestions\n", + "from scribe_data.extract_transform.wikipedia.extract_wiki import (\n", + " download_wiki,\n", + " parse_to_ndjson,\n", + ")\n", + "from scribe_data.extract_transform.wikipedia.process_wiki import (\n", + " clean,\n", + " gen_autosuggestions,\n", + ")\n", "from scribe_data.utils import get_language_iso" ] }, @@ -116,9 +132,9 @@ "outputs": [], "source": [ "files = download_wiki(\n", - " language=language, \n", - " target_dir=f\"./{language_abbr}wiki_dump\", \n", - " file_limit=None, # None is all files \n", + " language=language,\n", + " target_dir=f\"./{language_abbr}wiki_dump\",\n", + " file_limit=None, # None is all files\n", " dump_id=\"20220920\"\n", ")\n", "print(f\"Number of files: {len(files)}\")" @@ -207,7 +223,7 @@ " texts=article_texts,\n", " language=language,\n", " remove_words=None,\n", - " sample_size=sample_size, \n", + " sample_size=sample_size,\n", " verbose=True,\n", ")" ] @@ -232,7 +248,7 @@ "outputs": [], "source": [ "autosuggest_dict = gen_autosuggestions(\n", - " text_corpus, \n", + " text_corpus,\n", " language=language,\n", " num_words=1000,\n", " ignore_words=None,\n", diff --git a/src/scribe_data/extract_transform/process_wiki.py b/src/scribe_data/extract_transform/wikipedia/process_wiki.py similarity index 100% rename from src/scribe_data/extract_transform/process_wiki.py rename to src/scribe_data/extract_transform/wikipedia/process_wiki.py diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index 303cb1afb..881d662fa 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -30,7 +30,6 @@ import ast import json import os -import signal import sys from importlib import resources from pathlib import Path @@ -38,7 +37,6 @@ import langcodes from langcodes import Language -from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer PROJECT_ROOT = "Scribe-Data" @@ -532,93 +530,6 @@ def get_target_langcodes(source_lang) -> list[str]: ] -def translation_interrupt_handler(source_language, translations): - """ - Handles interrupt signals and saves the current translation progress. - - Parameters - ---------- - source_language : str - The source language being translated from. - - translations : list[dict] - The current list of translations. - """ - print( - "\nThe interrupt signal has been caught and the current progress is being saved..." 
- ) - - with open( - f"{get_language_dir_path(source_language)}/formatted_data/translated_words.json", - "w", - encoding="utf-8", - ) as file: - json.dump(translations, file, ensure_ascii=False, indent=4) - - print("The current progress is saved to the translated_words.json file.") - exit() - - -def translate_to_other_languages(source_language, word_list, translations, batch_size): - """ - Translates a list of words from the source language to other target languages using batch processing. - - Parameters - ---------- - source_language : str - The source language being translated from. - - word_list : list[str] - The list of words to translate. - - translations : dict - The current dictionary of translations. - - batch_size : int - The number of words to translate in each batch. - """ - model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M") - tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M") - - signal.signal( - signal.SIGINT, - lambda sig, frame: translation_interrupt_handler(source_language, translations), - ) - - for i in range(0, len(word_list), batch_size): - batch_words = word_list[i : i + batch_size] - print(f"Translating batch {i//batch_size + 1}: {batch_words}") - - for lang_code in get_target_langcodes(source_language): - tokenizer.src_lang = get_language_iso(source_language) - encoded_words = tokenizer(batch_words, return_tensors="pt", padding=True) - generated_tokens = model.generate( - **encoded_words, forced_bos_token_id=tokenizer.get_lang_id(lang_code) - ) - translated_words = tokenizer.batch_decode( - generated_tokens, skip_special_tokens=True - ) - - for word, translation in zip(batch_words, translated_words): - if word not in translations: - translations[word] = {} - - translations[word][lang_code] = translation - - print(f"Batch {i//batch_size + 1} translation completed.") - - with open( - f"{get_language_dir_path(source_language)}/formatted_data/translated_words.json", - "w", - encoding="utf-8", - ) as file: - json.dump(translations, file, ensure_ascii=False, indent=4) - - print( - "Translation results for all words are saved to the translated_words.json file." - ) - - def map_genders(wikidata_gender): """ Maps those genders from Wikidata to succinct versions.