From 06e60a6013478098e002dca48bc18985c0a26abe Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Mon, 3 Jun 2024 01:52:33 +0200 Subject: [PATCH] Retarget all processes to data export directory in root --- CHANGELOG.md | 6 ++-- .../language_data_extraction/index.rst | 2 +- .../English}/nouns.json | 0 .../English}/translated_words.json | 0 .../English}/verbs.json | 0 .../French}/autosuggestions.json | 0 .../French}/emoji_keywords.json | 0 .../French}/nouns.json | 0 .../French}/prepositions.json | 0 .../French}/translations.json | 0 .../French}/verbs.json | 0 .../German}/autosuggestions.json | 0 .../German}/emoji_keywords.json | 0 .../German}/nouns.json | 0 .../German}/prepositions.json | 0 .../German}/translations.json | 0 .../German}/verbs.json | 0 .../Italian}/autosuggestions.json | 0 .../Italian}/emoji_keywords.json | 0 .../Italian}/nouns.json | 0 .../Italian}/prepositions.json | 0 .../Italian}/translations.json | 0 .../Italian}/verbs.json | 0 .../Portuguese}/autosuggestions.json | 0 .../Portuguese}/emoji_keywords.json | 0 .../Portuguese}/nouns.json | 0 .../Portuguese}/prepositions.json | 0 .../Portuguese}/translations.json | 0 .../Portuguese}/verbs.json | 0 .../Russian}/autosuggestions.json | 0 .../Russian}/emoji_keywords.json | 0 .../Russian}/nouns.json | 0 .../Russian}/prepositions.json | 0 .../Russian}/translated_words.json | 0 .../Russian}/translations.json | 0 .../Russian}/verbs.json | 0 .../Spanish}/autosuggestions.json | 0 .../Spanish}/emoji_keywords.json | 0 .../Spanish}/nouns.json | 0 .../Spanish}/prepositions.json | 0 .../Spanish}/translations.json | 0 .../Spanish}/verbs.json | 0 .../Swedish}/autosuggestions.json | 0 .../Swedish}/emoji_keywords.json | 0 .../Swedish}/nouns.json | 0 .../Swedish}/prepositions.json | 0 .../Swedish}/translations.json | 0 .../Swedish}/verbs.json | 0 .../English/translations/translate_words.py | 5 +++- .../French/nouns/format_nouns.py | 1 - .../French/translations/translate_words.py | 4 ++- 
.../German/nouns/format_nouns.py | 1 - .../German/translations/translate_words.py | 4 ++- .../Italian/nouns/format_nouns.py | 1 - .../Italian/translations/translate_words.py | 4 ++- .../Portuguese/nouns/format_nouns.py | 1 - .../translations/translate_words.py | 4 ++- .../Russian/translations/translate_words.py | 4 ++- .../Spanish/translations/translate_words.py | 4 ++- .../Swedish/nouns/format_nouns.py | 1 - .../Swedish/translations/translate_words.py | 4 ++- src/scribe_data/load/data_to_sqlite.py | 8 +++-- .../translation/translation_utils.py | 7 +++-- src/scribe_data/unicode/process_unicode.py | 11 +++++-- src/scribe_data/utils.py | 29 +++++-------------- src/scribe_data/wikidata/update_data.py | 4 +-- src/scribe_data/wikipedia/process_wiki.py | 6 ++-- 67 files changed, 61 insertions(+), 50 deletions(-) rename {src/scribe_data/language_data_extraction/English/formatted_data => language_data_export/English}/nouns.json (100%) rename {src/scribe_data/language_data_extraction/English/formatted_data => language_data_export/English}/translated_words.json (100%) rename {src/scribe_data/language_data_extraction/English/formatted_data => language_data_export/English}/verbs.json (100%) rename {src/scribe_data/language_data_extraction/French/formatted_data => language_data_export/French}/autosuggestions.json (100%) rename {src/scribe_data/language_data_extraction/French/formatted_data => language_data_export/French}/emoji_keywords.json (100%) rename {src/scribe_data/language_data_extraction/French/formatted_data => language_data_export/French}/nouns.json (100%) rename {src/scribe_data/language_data_extraction/French/formatted_data => language_data_export/French}/prepositions.json (100%) rename {src/scribe_data/language_data_extraction/French/formatted_data => language_data_export/French}/translations.json (100%) rename {src/scribe_data/language_data_extraction/French/formatted_data => language_data_export/French}/verbs.json (100%) rename 
{src/scribe_data/language_data_extraction/German/formatted_data => language_data_export/German}/autosuggestions.json (100%) rename {src/scribe_data/language_data_extraction/German/formatted_data => language_data_export/German}/emoji_keywords.json (100%) rename {src/scribe_data/language_data_extraction/German/formatted_data => language_data_export/German}/nouns.json (100%) rename {src/scribe_data/language_data_extraction/German/formatted_data => language_data_export/German}/prepositions.json (100%) rename {src/scribe_data/language_data_extraction/German/formatted_data => language_data_export/German}/translations.json (100%) rename {src/scribe_data/language_data_extraction/German/formatted_data => language_data_export/German}/verbs.json (100%) rename {src/scribe_data/language_data_extraction/Italian/formatted_data => language_data_export/Italian}/autosuggestions.json (100%) rename {src/scribe_data/language_data_extraction/Italian/formatted_data => language_data_export/Italian}/emoji_keywords.json (100%) rename {src/scribe_data/language_data_extraction/Italian/formatted_data => language_data_export/Italian}/nouns.json (100%) rename {src/scribe_data/language_data_extraction/Italian/formatted_data => language_data_export/Italian}/prepositions.json (100%) rename {src/scribe_data/language_data_extraction/Italian/formatted_data => language_data_export/Italian}/translations.json (100%) rename {src/scribe_data/language_data_extraction/Italian/formatted_data => language_data_export/Italian}/verbs.json (100%) rename {src/scribe_data/language_data_extraction/Portuguese/formatted_data => language_data_export/Portuguese}/autosuggestions.json (100%) rename {src/scribe_data/language_data_extraction/Portuguese/formatted_data => language_data_export/Portuguese}/emoji_keywords.json (100%) rename {src/scribe_data/language_data_extraction/Portuguese/formatted_data => language_data_export/Portuguese}/nouns.json (100%) rename 
{src/scribe_data/language_data_extraction/Portuguese/formatted_data => language_data_export/Portuguese}/prepositions.json (100%) rename {src/scribe_data/language_data_extraction/Portuguese/formatted_data => language_data_export/Portuguese}/translations.json (100%) rename {src/scribe_data/language_data_extraction/Portuguese/formatted_data => language_data_export/Portuguese}/verbs.json (100%) rename {src/scribe_data/language_data_extraction/Russian/formatted_data => language_data_export/Russian}/autosuggestions.json (100%) rename {src/scribe_data/language_data_extraction/Russian/formatted_data => language_data_export/Russian}/emoji_keywords.json (100%) rename {src/scribe_data/language_data_extraction/Russian/formatted_data => language_data_export/Russian}/nouns.json (100%) rename {src/scribe_data/language_data_extraction/Russian/formatted_data => language_data_export/Russian}/prepositions.json (100%) rename {src/scribe_data/language_data_extraction/Russian/formatted_data => language_data_export/Russian}/translated_words.json (100%) rename {src/scribe_data/language_data_extraction/Russian/formatted_data => language_data_export/Russian}/translations.json (100%) rename {src/scribe_data/language_data_extraction/Russian/formatted_data => language_data_export/Russian}/verbs.json (100%) rename {src/scribe_data/language_data_extraction/Spanish/formatted_data => language_data_export/Spanish}/autosuggestions.json (100%) rename {src/scribe_data/language_data_extraction/Spanish/formatted_data => language_data_export/Spanish}/emoji_keywords.json (100%) rename {src/scribe_data/language_data_extraction/Spanish/formatted_data => language_data_export/Spanish}/nouns.json (100%) rename {src/scribe_data/language_data_extraction/Spanish/formatted_data => language_data_export/Spanish}/prepositions.json (100%) rename {src/scribe_data/language_data_extraction/Spanish/formatted_data => language_data_export/Spanish}/translations.json (100%) rename 
{src/scribe_data/language_data_extraction/Spanish/formatted_data => language_data_export/Spanish}/verbs.json (100%) rename {src/scribe_data/language_data_extraction/Swedish/formatted_data => language_data_export/Swedish}/autosuggestions.json (100%) rename {src/scribe_data/language_data_extraction/Swedish/formatted_data => language_data_export/Swedish}/emoji_keywords.json (100%) rename {src/scribe_data/language_data_extraction/Swedish/formatted_data => language_data_export/Swedish}/nouns.json (100%) rename {src/scribe_data/language_data_extraction/Swedish/formatted_data => language_data_export/Swedish}/prepositions.json (100%) rename {src/scribe_data/language_data_extraction/Swedish/formatted_data => language_data_export/Swedish}/translations.json (100%) rename {src/scribe_data/language_data_extraction/Swedish/formatted_data => language_data_export/Swedish}/verbs.json (100%) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8646e9d3f..303e3e76d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,7 @@ Scribe-Data tries to follow [semantic versioning](https://semver.org/), a MAJOR. Emojis for the following are chosen based on [gitmoji](https://gitmoji.dev/). -## [Upcoming] Scribe-Data 3.3.0 +## [Upcoming] Scribe-Data 4.0.0 ### ✨ Features @@ -29,11 +29,13 @@ Emojis for the following are chosen based on [gitmoji](https://gitmoji.dev/). - The `_update_files` directory was renamed `update_files` as these files are used in non-internal manners now ([#57](https://github.com/scribe-org/Scribe-Data/issues/57)). - A common function has been created to map Wikidata ids to noun genders ([#69](https://github.com/scribe-org/Scribe-Data/issues/69)). - The project now is installed locally for development and command line usage, so usages of `sys.path` have been removed from files ([#122](https://github.com/scribe-org/Scribe-Data/issues/122)). 
-- The directory structure has been dramatically streamlined and includes folders for future projects where language data could come from (Wiktionary). +- The directory structure has been dramatically streamlined and includes folders for future projects where language data could come from other sources like Wiktionary ([#139](https://github.com/scribe-org/Scribe-Data/issues/139)). - Translation files are moved to their own directory. - The `extract_transform` directory has been removed and all files within it have been moved one level up. - The `languages` directory has been renamed `language_data_extraction`. - All files within `wikidata/_resources` have been moved to the `resources` directory. + - The gender and case annotations for data formatting have now been commonly defined. + - All language directory `formatted_data` files have now been moved to the `language_data_export` directory to prepare for outputs being required to be directed to a directory outside of the package. ## Scribe-Data 3.2.2 diff --git a/docs/source/scribe_data/language_data_extraction/index.rst b/docs/source/scribe_data/language_data_extraction/index.rst index f44c2013b..77a41e6f5 100644 --- a/docs/source/scribe_data/language_data_extraction/index.rst +++ b/docs/source/scribe_data/language_data_extraction/index.rst @@ -3,6 +3,6 @@ language_data_extraction `View code on Github `_ -This directory contains all language extraction and formatting code for Scribe-Data. The structure is broken down by language, with each language sub-directory then including directories for nouns, prepositions, translations and verbs if needed. Within these word type directories are :code:`query_WORD_TYPE.sparql` SPARQL files that are ran to query Wikidata and then formatted with the given :code:`format_WORD_TYPE.py` Python files. Included in each language sub-directory is also a :code:`formatted_data` directory that includes the outputs of all word type query and formatting processes. 
+This directory contains all language extraction and formatting code for Scribe-Data. The structure is broken down by language, with each language sub-directory then including directories for nouns, prepositions, translations and verbs if needed. Within these word type directories are :code:`query_WORD_TYPE.sparql` SPARQL files that are ran to query Wikidata and then formatted with the given :code:`format_WORD_TYPE.py` Python files. Use the :code:`View code on GitHub` link above to view the directory and explore the process! diff --git a/src/scribe_data/language_data_extraction/English/formatted_data/nouns.json b/language_data_export/English/nouns.json similarity index 100% rename from src/scribe_data/language_data_extraction/English/formatted_data/nouns.json rename to language_data_export/English/nouns.json diff --git a/src/scribe_data/language_data_extraction/English/formatted_data/translated_words.json b/language_data_export/English/translated_words.json similarity index 100% rename from src/scribe_data/language_data_extraction/English/formatted_data/translated_words.json rename to language_data_export/English/translated_words.json diff --git a/src/scribe_data/language_data_extraction/English/formatted_data/verbs.json b/language_data_export/English/verbs.json similarity index 100% rename from src/scribe_data/language_data_extraction/English/formatted_data/verbs.json rename to language_data_export/English/verbs.json diff --git a/src/scribe_data/language_data_extraction/French/formatted_data/autosuggestions.json b/language_data_export/French/autosuggestions.json similarity index 100% rename from src/scribe_data/language_data_extraction/French/formatted_data/autosuggestions.json rename to language_data_export/French/autosuggestions.json diff --git a/src/scribe_data/language_data_extraction/French/formatted_data/emoji_keywords.json b/language_data_export/French/emoji_keywords.json similarity index 100% rename from 
src/scribe_data/language_data_extraction/French/formatted_data/emoji_keywords.json rename to language_data_export/French/emoji_keywords.json diff --git a/src/scribe_data/language_data_extraction/French/formatted_data/nouns.json b/language_data_export/French/nouns.json similarity index 100% rename from src/scribe_data/language_data_extraction/French/formatted_data/nouns.json rename to language_data_export/French/nouns.json diff --git a/src/scribe_data/language_data_extraction/French/formatted_data/prepositions.json b/language_data_export/French/prepositions.json similarity index 100% rename from src/scribe_data/language_data_extraction/French/formatted_data/prepositions.json rename to language_data_export/French/prepositions.json diff --git a/src/scribe_data/language_data_extraction/French/formatted_data/translations.json b/language_data_export/French/translations.json similarity index 100% rename from src/scribe_data/language_data_extraction/French/formatted_data/translations.json rename to language_data_export/French/translations.json diff --git a/src/scribe_data/language_data_extraction/French/formatted_data/verbs.json b/language_data_export/French/verbs.json similarity index 100% rename from src/scribe_data/language_data_extraction/French/formatted_data/verbs.json rename to language_data_export/French/verbs.json diff --git a/src/scribe_data/language_data_extraction/German/formatted_data/autosuggestions.json b/language_data_export/German/autosuggestions.json similarity index 100% rename from src/scribe_data/language_data_extraction/German/formatted_data/autosuggestions.json rename to language_data_export/German/autosuggestions.json diff --git a/src/scribe_data/language_data_extraction/German/formatted_data/emoji_keywords.json b/language_data_export/German/emoji_keywords.json similarity index 100% rename from src/scribe_data/language_data_extraction/German/formatted_data/emoji_keywords.json rename to language_data_export/German/emoji_keywords.json diff --git 
a/src/scribe_data/language_data_extraction/German/formatted_data/nouns.json b/language_data_export/German/nouns.json similarity index 100% rename from src/scribe_data/language_data_extraction/German/formatted_data/nouns.json rename to language_data_export/German/nouns.json diff --git a/src/scribe_data/language_data_extraction/German/formatted_data/prepositions.json b/language_data_export/German/prepositions.json similarity index 100% rename from src/scribe_data/language_data_extraction/German/formatted_data/prepositions.json rename to language_data_export/German/prepositions.json diff --git a/src/scribe_data/language_data_extraction/German/formatted_data/translations.json b/language_data_export/German/translations.json similarity index 100% rename from src/scribe_data/language_data_extraction/German/formatted_data/translations.json rename to language_data_export/German/translations.json diff --git a/src/scribe_data/language_data_extraction/German/formatted_data/verbs.json b/language_data_export/German/verbs.json similarity index 100% rename from src/scribe_data/language_data_extraction/German/formatted_data/verbs.json rename to language_data_export/German/verbs.json diff --git a/src/scribe_data/language_data_extraction/Italian/formatted_data/autosuggestions.json b/language_data_export/Italian/autosuggestions.json similarity index 100% rename from src/scribe_data/language_data_extraction/Italian/formatted_data/autosuggestions.json rename to language_data_export/Italian/autosuggestions.json diff --git a/src/scribe_data/language_data_extraction/Italian/formatted_data/emoji_keywords.json b/language_data_export/Italian/emoji_keywords.json similarity index 100% rename from src/scribe_data/language_data_extraction/Italian/formatted_data/emoji_keywords.json rename to language_data_export/Italian/emoji_keywords.json diff --git a/src/scribe_data/language_data_extraction/Italian/formatted_data/nouns.json b/language_data_export/Italian/nouns.json similarity index 100% rename 
from src/scribe_data/language_data_extraction/Italian/formatted_data/nouns.json rename to language_data_export/Italian/nouns.json diff --git a/src/scribe_data/language_data_extraction/Italian/formatted_data/prepositions.json b/language_data_export/Italian/prepositions.json similarity index 100% rename from src/scribe_data/language_data_extraction/Italian/formatted_data/prepositions.json rename to language_data_export/Italian/prepositions.json diff --git a/src/scribe_data/language_data_extraction/Italian/formatted_data/translations.json b/language_data_export/Italian/translations.json similarity index 100% rename from src/scribe_data/language_data_extraction/Italian/formatted_data/translations.json rename to language_data_export/Italian/translations.json diff --git a/src/scribe_data/language_data_extraction/Italian/formatted_data/verbs.json b/language_data_export/Italian/verbs.json similarity index 100% rename from src/scribe_data/language_data_extraction/Italian/formatted_data/verbs.json rename to language_data_export/Italian/verbs.json diff --git a/src/scribe_data/language_data_extraction/Portuguese/formatted_data/autosuggestions.json b/language_data_export/Portuguese/autosuggestions.json similarity index 100% rename from src/scribe_data/language_data_extraction/Portuguese/formatted_data/autosuggestions.json rename to language_data_export/Portuguese/autosuggestions.json diff --git a/src/scribe_data/language_data_extraction/Portuguese/formatted_data/emoji_keywords.json b/language_data_export/Portuguese/emoji_keywords.json similarity index 100% rename from src/scribe_data/language_data_extraction/Portuguese/formatted_data/emoji_keywords.json rename to language_data_export/Portuguese/emoji_keywords.json diff --git a/src/scribe_data/language_data_extraction/Portuguese/formatted_data/nouns.json b/language_data_export/Portuguese/nouns.json similarity index 100% rename from src/scribe_data/language_data_extraction/Portuguese/formatted_data/nouns.json rename to 
language_data_export/Portuguese/nouns.json diff --git a/src/scribe_data/language_data_extraction/Portuguese/formatted_data/prepositions.json b/language_data_export/Portuguese/prepositions.json similarity index 100% rename from src/scribe_data/language_data_extraction/Portuguese/formatted_data/prepositions.json rename to language_data_export/Portuguese/prepositions.json diff --git a/src/scribe_data/language_data_extraction/Portuguese/formatted_data/translations.json b/language_data_export/Portuguese/translations.json similarity index 100% rename from src/scribe_data/language_data_extraction/Portuguese/formatted_data/translations.json rename to language_data_export/Portuguese/translations.json diff --git a/src/scribe_data/language_data_extraction/Portuguese/formatted_data/verbs.json b/language_data_export/Portuguese/verbs.json similarity index 100% rename from src/scribe_data/language_data_extraction/Portuguese/formatted_data/verbs.json rename to language_data_export/Portuguese/verbs.json diff --git a/src/scribe_data/language_data_extraction/Russian/formatted_data/autosuggestions.json b/language_data_export/Russian/autosuggestions.json similarity index 100% rename from src/scribe_data/language_data_extraction/Russian/formatted_data/autosuggestions.json rename to language_data_export/Russian/autosuggestions.json diff --git a/src/scribe_data/language_data_extraction/Russian/formatted_data/emoji_keywords.json b/language_data_export/Russian/emoji_keywords.json similarity index 100% rename from src/scribe_data/language_data_extraction/Russian/formatted_data/emoji_keywords.json rename to language_data_export/Russian/emoji_keywords.json diff --git a/src/scribe_data/language_data_extraction/Russian/formatted_data/nouns.json b/language_data_export/Russian/nouns.json similarity index 100% rename from src/scribe_data/language_data_extraction/Russian/formatted_data/nouns.json rename to language_data_export/Russian/nouns.json diff --git 
a/src/scribe_data/language_data_extraction/Russian/formatted_data/prepositions.json b/language_data_export/Russian/prepositions.json similarity index 100% rename from src/scribe_data/language_data_extraction/Russian/formatted_data/prepositions.json rename to language_data_export/Russian/prepositions.json diff --git a/src/scribe_data/language_data_extraction/Russian/formatted_data/translated_words.json b/language_data_export/Russian/translated_words.json similarity index 100% rename from src/scribe_data/language_data_extraction/Russian/formatted_data/translated_words.json rename to language_data_export/Russian/translated_words.json diff --git a/src/scribe_data/language_data_extraction/Russian/formatted_data/translations.json b/language_data_export/Russian/translations.json similarity index 100% rename from src/scribe_data/language_data_extraction/Russian/formatted_data/translations.json rename to language_data_export/Russian/translations.json diff --git a/src/scribe_data/language_data_extraction/Russian/formatted_data/verbs.json b/language_data_export/Russian/verbs.json similarity index 100% rename from src/scribe_data/language_data_extraction/Russian/formatted_data/verbs.json rename to language_data_export/Russian/verbs.json diff --git a/src/scribe_data/language_data_extraction/Spanish/formatted_data/autosuggestions.json b/language_data_export/Spanish/autosuggestions.json similarity index 100% rename from src/scribe_data/language_data_extraction/Spanish/formatted_data/autosuggestions.json rename to language_data_export/Spanish/autosuggestions.json diff --git a/src/scribe_data/language_data_extraction/Spanish/formatted_data/emoji_keywords.json b/language_data_export/Spanish/emoji_keywords.json similarity index 100% rename from src/scribe_data/language_data_extraction/Spanish/formatted_data/emoji_keywords.json rename to language_data_export/Spanish/emoji_keywords.json diff --git a/src/scribe_data/language_data_extraction/Spanish/formatted_data/nouns.json 
b/language_data_export/Spanish/nouns.json similarity index 100% rename from src/scribe_data/language_data_extraction/Spanish/formatted_data/nouns.json rename to language_data_export/Spanish/nouns.json diff --git a/src/scribe_data/language_data_extraction/Spanish/formatted_data/prepositions.json b/language_data_export/Spanish/prepositions.json similarity index 100% rename from src/scribe_data/language_data_extraction/Spanish/formatted_data/prepositions.json rename to language_data_export/Spanish/prepositions.json diff --git a/src/scribe_data/language_data_extraction/Spanish/formatted_data/translations.json b/language_data_export/Spanish/translations.json similarity index 100% rename from src/scribe_data/language_data_extraction/Spanish/formatted_data/translations.json rename to language_data_export/Spanish/translations.json diff --git a/src/scribe_data/language_data_extraction/Spanish/formatted_data/verbs.json b/language_data_export/Spanish/verbs.json similarity index 100% rename from src/scribe_data/language_data_extraction/Spanish/formatted_data/verbs.json rename to language_data_export/Spanish/verbs.json diff --git a/src/scribe_data/language_data_extraction/Swedish/formatted_data/autosuggestions.json b/language_data_export/Swedish/autosuggestions.json similarity index 100% rename from src/scribe_data/language_data_extraction/Swedish/formatted_data/autosuggestions.json rename to language_data_export/Swedish/autosuggestions.json diff --git a/src/scribe_data/language_data_extraction/Swedish/formatted_data/emoji_keywords.json b/language_data_export/Swedish/emoji_keywords.json similarity index 100% rename from src/scribe_data/language_data_extraction/Swedish/formatted_data/emoji_keywords.json rename to language_data_export/Swedish/emoji_keywords.json diff --git a/src/scribe_data/language_data_extraction/Swedish/formatted_data/nouns.json b/language_data_export/Swedish/nouns.json similarity index 100% rename from 
src/scribe_data/language_data_extraction/Swedish/formatted_data/nouns.json rename to language_data_export/Swedish/nouns.json diff --git a/src/scribe_data/language_data_extraction/Swedish/formatted_data/prepositions.json b/language_data_export/Swedish/prepositions.json similarity index 100% rename from src/scribe_data/language_data_extraction/Swedish/formatted_data/prepositions.json rename to language_data_export/Swedish/prepositions.json diff --git a/src/scribe_data/language_data_extraction/Swedish/formatted_data/translations.json b/language_data_export/Swedish/translations.json similarity index 100% rename from src/scribe_data/language_data_extraction/Swedish/formatted_data/translations.json rename to language_data_export/Swedish/translations.json diff --git a/src/scribe_data/language_data_extraction/Swedish/formatted_data/verbs.json b/language_data_export/Swedish/verbs.json similarity index 100% rename from src/scribe_data/language_data_extraction/Swedish/formatted_data/verbs.json rename to language_data_export/Swedish/verbs.json diff --git a/src/scribe_data/language_data_extraction/English/translations/translate_words.py b/src/scribe_data/language_data_extraction/English/translations/translate_words.py index 8f4b7bcda..73ec7a362 100644 --- a/src/scribe_data/language_data_extraction/English/translations/translate_words.py +++ b/src/scribe_data/language_data_extraction/English/translations/translate_words.py @@ -8,6 +8,7 @@ import json import os +import sys from scribe_data.translation.translation_utils import ( translate_to_other_languages, @@ -24,8 +25,10 @@ translations = {} translated_words_path = os.path.join( - translate_script_dir, "../formatted_data/translated_words.json" + translate_script_dir, + f"{os.path.dirname(sys.path[0]).split('scribe_data')[0]}/../language_data_export/{SRC_LANG}/translated_words.json", ) + if os.path.exists(translated_words_path): with open(translated_words_path, "r", encoding="utf-8") as file: translations = json.load(file) diff 
--git a/src/scribe_data/language_data_extraction/French/nouns/format_nouns.py b/src/scribe_data/language_data_extraction/French/nouns/format_nouns.py index 1e3a85491..a376a3c6c 100644 --- a/src/scribe_data/language_data_extraction/French/nouns/format_nouns.py +++ b/src/scribe_data/language_data_extraction/French/nouns/format_nouns.py @@ -21,7 +21,6 @@ file_path=file_path, language=LANGUAGE, data_type=DATA_TYPE ) - nouns_formatted = {} for noun_vals in nouns_list: diff --git a/src/scribe_data/language_data_extraction/French/translations/translate_words.py b/src/scribe_data/language_data_extraction/French/translations/translate_words.py index d38add3f2..c48adf90d 100644 --- a/src/scribe_data/language_data_extraction/French/translations/translate_words.py +++ b/src/scribe_data/language_data_extraction/French/translations/translate_words.py @@ -8,6 +8,7 @@ import json import os +import sys from scribe_data.translation.translation_utils import ( translate_to_other_languages, @@ -24,7 +25,8 @@ translations = {} translated_words_path = os.path.join( - translate_script_dir, "../formatted_data/translated_words.json" + translate_script_dir, + f"{os.path.dirname(sys.path[0]).split('scribe_data')[0]}/../language_data_export/{SRC_LANG}/translated_words.json", ) if os.path.exists(translated_words_path): with open(translated_words_path, "r", encoding="utf-8") as file: diff --git a/src/scribe_data/language_data_extraction/German/nouns/format_nouns.py b/src/scribe_data/language_data_extraction/German/nouns/format_nouns.py index a943e39a3..1386277ae 100644 --- a/src/scribe_data/language_data_extraction/German/nouns/format_nouns.py +++ b/src/scribe_data/language_data_extraction/German/nouns/format_nouns.py @@ -21,7 +21,6 @@ file_path=file_path, language=LANGUAGE, data_type=DATA_TYPE ) - nouns_formatted = {} for noun_vals in nouns_list: diff --git a/src/scribe_data/language_data_extraction/German/translations/translate_words.py 
b/src/scribe_data/language_data_extraction/German/translations/translate_words.py index 05062b092..ef638965d 100644 --- a/src/scribe_data/language_data_extraction/German/translations/translate_words.py +++ b/src/scribe_data/language_data_extraction/German/translations/translate_words.py @@ -8,6 +8,7 @@ import json import os +import sys from scribe_data.translation.translation_utils import ( translate_to_other_languages, @@ -24,7 +25,8 @@ translations = {} translated_words_path = os.path.join( - translate_script_dir, "../formatted_data/translated_words.json" + translate_script_dir, + f"{os.path.dirname(sys.path[0]).split('scribe_data')[0]}/../language_data_export/{SRC_LANG}/translated_words.json", ) if os.path.exists(translated_words_path): with open(translated_words_path, "r", encoding="utf-8") as file: diff --git a/src/scribe_data/language_data_extraction/Italian/nouns/format_nouns.py b/src/scribe_data/language_data_extraction/Italian/nouns/format_nouns.py index 9c844ffb9..9f48b3c00 100644 --- a/src/scribe_data/language_data_extraction/Italian/nouns/format_nouns.py +++ b/src/scribe_data/language_data_extraction/Italian/nouns/format_nouns.py @@ -21,7 +21,6 @@ file_path=file_path, language=LANGUAGE, data_type=DATA_TYPE ) - nouns_formatted = {} for noun_vals in nouns_list: diff --git a/src/scribe_data/language_data_extraction/Italian/translations/translate_words.py b/src/scribe_data/language_data_extraction/Italian/translations/translate_words.py index c501d5e6f..75f3e7cc2 100644 --- a/src/scribe_data/language_data_extraction/Italian/translations/translate_words.py +++ b/src/scribe_data/language_data_extraction/Italian/translations/translate_words.py @@ -8,6 +8,7 @@ import json import os +import sys from scribe_data.translation.translation_utils import ( translate_to_other_languages, @@ -24,7 +25,8 @@ translations = {} translated_words_path = os.path.join( - translate_script_dir, "../formatted_data/translated_words.json" + translate_script_dir, + 
f"{os.path.dirname(sys.path[0]).split('scribe_data')[0]}/../language_data_export/{SRC_LANG}/translated_words.json", ) if os.path.exists(translated_words_path): with open(translated_words_path, "r", encoding="utf-8") as file: diff --git a/src/scribe_data/language_data_extraction/Portuguese/nouns/format_nouns.py b/src/scribe_data/language_data_extraction/Portuguese/nouns/format_nouns.py index 606b4a46e..5e99d788b 100644 --- a/src/scribe_data/language_data_extraction/Portuguese/nouns/format_nouns.py +++ b/src/scribe_data/language_data_extraction/Portuguese/nouns/format_nouns.py @@ -21,7 +21,6 @@ file_path=file_path, language=LANGUAGE, data_type=DATA_TYPE ) - nouns_formatted = {} for noun_vals in nouns_list: diff --git a/src/scribe_data/language_data_extraction/Portuguese/translations/translate_words.py b/src/scribe_data/language_data_extraction/Portuguese/translations/translate_words.py index 4985460b4..f59f71596 100644 --- a/src/scribe_data/language_data_extraction/Portuguese/translations/translate_words.py +++ b/src/scribe_data/language_data_extraction/Portuguese/translations/translate_words.py @@ -8,6 +8,7 @@ import json import os +import sys from scribe_data.translation.translation_utils import ( translate_to_other_languages, @@ -24,7 +25,8 @@ translations = {} translated_words_path = os.path.join( - translate_script_dir, "../formatted_data/translated_words.json" + translate_script_dir, + f"{os.path.dirname(sys.path[0]).split('scribe_data')[0]}/../language_data_export/{SRC_LANG}/translated_words.json", ) if os.path.exists(translated_words_path): with open(translated_words_path, "r", encoding="utf-8") as file: diff --git a/src/scribe_data/language_data_extraction/Russian/translations/translate_words.py b/src/scribe_data/language_data_extraction/Russian/translations/translate_words.py index 5e8c337d4..68906bead 100644 --- a/src/scribe_data/language_data_extraction/Russian/translations/translate_words.py +++ 
b/src/scribe_data/language_data_extraction/Russian/translations/translate_words.py @@ -8,6 +8,7 @@ import json import os +import sys from scribe_data.translation.translation_utils import ( translate_to_other_languages, @@ -24,7 +25,8 @@ translations = {} translated_words_path = os.path.join( - translate_script_dir, "../formatted_data/translated_words.json" + translate_script_dir, + f"{os.path.dirname(sys.path[0]).split('scribe_data')[0]}/../language_data_export/{SRC_LANG}/translated_words.json", ) if os.path.exists(translated_words_path): with open(translated_words_path, "r", encoding="utf-8") as file: diff --git a/src/scribe_data/language_data_extraction/Spanish/translations/translate_words.py b/src/scribe_data/language_data_extraction/Spanish/translations/translate_words.py index 2adb0a18d..6d4e0a06b 100644 --- a/src/scribe_data/language_data_extraction/Spanish/translations/translate_words.py +++ b/src/scribe_data/language_data_extraction/Spanish/translations/translate_words.py @@ -8,6 +8,7 @@ import json import os +import sys from scribe_data.translation.translation_utils import ( translate_to_other_languages, @@ -24,7 +25,8 @@ translations = {} translated_words_path = os.path.join( - translate_script_dir, "../formatted_data/translated_words.json" + translate_script_dir, + f"{os.path.dirname(sys.path[0]).split('scribe_data')[0]}/../language_data_export/{SRC_LANG}/translated_words.json", ) if os.path.exists(translated_words_path): with open(translated_words_path, "r", encoding="utf-8") as file: diff --git a/src/scribe_data/language_data_extraction/Swedish/nouns/format_nouns.py b/src/scribe_data/language_data_extraction/Swedish/nouns/format_nouns.py index 712623025..de82ad836 100644 --- a/src/scribe_data/language_data_extraction/Swedish/nouns/format_nouns.py +++ b/src/scribe_data/language_data_extraction/Swedish/nouns/format_nouns.py @@ -21,7 +21,6 @@ file_path=file_path, language=LANGUAGE, data_type=DATA_TYPE ) - nouns_formatted = {} for noun_vals in nouns_list: 
diff --git a/src/scribe_data/language_data_extraction/Swedish/translations/translate_words.py b/src/scribe_data/language_data_extraction/Swedish/translations/translate_words.py index 202aaafd2..7c97d96e9 100644 --- a/src/scribe_data/language_data_extraction/Swedish/translations/translate_words.py +++ b/src/scribe_data/language_data_extraction/Swedish/translations/translate_words.py @@ -8,6 +8,7 @@ import json import os +import sys from scribe_data.translation.translation_utils import ( translate_to_other_languages, @@ -24,7 +25,8 @@ translations = {} translated_words_path = os.path.join( - translate_script_dir, "../formatted_data/translated_words.json" + translate_script_dir, + f"{os.path.dirname(sys.path[0]).split('scribe_data')[0]}/../language_data_export/{SRC_LANG}/translated_words.json", ) if os.path.exists(translated_words_path): with open(translated_words_path, "r", encoding="utf-8") as file: diff --git a/src/scribe_data/load/data_to_sqlite.py b/src/scribe_data/load/data_to_sqlite.py index 16ba67d4a..a6158337b 100644 --- a/src/scribe_data/load/data_to_sqlite.py +++ b/src/scribe_data/load/data_to_sqlite.py @@ -74,7 +74,9 @@ language_word_type_dict = { lang: [ f.split(".json")[0] - for f in os.listdir(f"{PATH_TO_LANGUAGE_DIRS}{lang}/formatted_data") + for f in os.listdir( + f"{os.path.dirname(sys.path[0]).split('scribe_data')[0]}/../language_data_export/{lang}" + ) if f.split(".json")[0] in word_types ] for lang in languages_update @@ -139,7 +141,9 @@ def table_insert(word_type, keys): for wt in language_word_type_dict[lang]: print(f"Creating {lang} {wt} table...") json_data = json.load( - open(f"{PATH_TO_LANGUAGE_DIRS}{lang}/formatted_data/{wt}.json") + open( + f"{os.path.dirname(sys.path[0]).split('scribe_data')[0]}/../language_data_export/{lang}/{wt}.json" + ) ) if wt == "nouns": diff --git a/src/scribe_data/translation/translation_utils.py b/src/scribe_data/translation/translation_utils.py index b70f2da37..2c303b0dc 100644 --- 
a/src/scribe_data/translation/translation_utils.py +++ b/src/scribe_data/translation/translation_utils.py @@ -3,12 +3,13 @@ """ import json +import os import signal +import sys from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer from scribe_data.utils import ( - get_language_dir_path, get_language_iso, get_target_langcodes, ) @@ -31,7 +32,7 @@ def translation_interrupt_handler(source_language, translations): ) with open( - f"{get_language_dir_path(source_language)}/formatted_data/translated_words.json", + f"{os.path.dirname(sys.path[0]).split('scribe_data')[0]}/../language_data_export/{source_language}/translated_words.json", "w", encoding="utf-8", ) as file: @@ -90,7 +91,7 @@ def translate_to_other_languages(source_language, word_list, translations, batch print(f"Batch {i//batch_size + 1} translation completed.") with open( - f"{get_language_dir_path(source_language)}/formatted_data/translated_words.json", + f"{os.path.dirname(sys.path[0]).split('scribe_data')[0]}/../language_data_export/{source_language}/translated_words.json", "w", encoding="utf-8", ) as file: diff --git a/src/scribe_data/unicode/process_unicode.py b/src/scribe_data/unicode/process_unicode.py index 751aaf8c4..8d94b5791 100644 --- a/src/scribe_data/unicode/process_unicode.py +++ b/src/scribe_data/unicode/process_unicode.py @@ -5,6 +5,8 @@ import csv import fileinput import json +import os +import sys from importlib.resources import files import emoji @@ -54,7 +56,7 @@ def gen_emoji_lexicon( Whether to export whether the emojis is a base character as well as its rank. update_local_data : bool (default=False) - Saves the created dictionaries as JSONs in the local formatted_data directories. + Saves the created dictionaries as JSONs in the target directories. verbose : bool (default=True) Whether to show a tqdm progress bar for the process. 
@@ -167,7 +169,10 @@ def gen_emoji_lexicon( ) # Check nouns files for plurals and update their data with the emojis for their singular forms. - with open(f"./{language}/formatted_data/nouns.json", encoding="utf-8") as f: + with open( + f"{os.path.dirname(sys.path[0]).split('scribe_data')[0]}/../language_data_export/{language}/nouns.json", + encoding="utf-8", + ) as f: noun_data = json.load(f) plurals_to_singulars_dict = { @@ -209,7 +214,7 @@ def gen_emoji_lexicon( if update_local_data: path_to_formatted_data = ( get_path_from_wikidata_dir() - + f"/Scribe-Data/src/scribe_data/language_data_extraction/{language.capitalize()}/formatted_data/emoji_keywords.json" + + f"{os.path.dirname(sys.path[0]).split('scribe_data')[0]}/../language_data_export/{language}/emoji_keywords.json" ) with open(path_to_formatted_data, "w", encoding="utf-8") as file: diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index 82d980c9c..037359807 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -217,24 +217,6 @@ def get_language_words_to_ignore(language: str) -> list[str]: ) -def get_language_dir_path(language): - """ - Returns the directory path for a specific language within the Scribe-Data project. - - Parameters - ---------- - language : str - The language for which the directory path is needed. - - Returns - ------- - str - The directory path for the specified language. - """ - PATH_TO_SCRIBE_ORG = os.path.dirname(sys.path[0]).split("Scribe-Data")[0] - return f"{PATH_TO_SCRIBE_ORG}/Scribe-Data/src/scribe_data/language_data_extraction/{language}" - - def load_queried_data(file_path, language, data_type): """ Loads queried data from a JSON file for a specific language and data type. 
@@ -261,7 +243,9 @@ def load_queried_data(file_path, language, data_type): data_path = queried_data_file else: update_data_in_use = True - data_path = f"{get_language_dir_path(language)}/{data_type}/{queried_data_file}" + PATH_TO_SCRIBE_ORG = os.path.dirname(sys.path[0]).split("Scribe-Data")[0] + LANG_DIR_PATH = f"{PATH_TO_SCRIBE_ORG}/Scribe-Data/src/scribe_data/language_data_extraction/{language}" + data_path = f"{LANG_DIR_PATH}/{data_type}/{queried_data_file}" with open(data_path, encoding="utf-8") as f: return json.load(f), update_data_in_use, data_path @@ -287,14 +271,15 @@ def export_formatted_data(formatted_data, update_data_in_use, language, data_typ None """ if update_data_in_use: - export_path = ( - f"{get_language_dir_path(language)}/formatted_data/{data_type}.json" - ) + PATH_TO_SCRIBE_ORG = os.path.dirname(sys.path[0]).split("Scribe-Data")[0] + export_path = f"{PATH_TO_SCRIBE_ORG}/Scribe-Data/src/language_data_export/{language}/{data_type}.json" + else: export_path = f"{data_type}.json" with open(export_path, "w", encoding="utf-8") as file: json.dump(formatted_data, file, ensure_ascii=False, indent=0) + print(f"Wrote file {data_type}.json with {len(formatted_data):,} {data_type}.") diff --git a/src/scribe_data/wikidata/update_data.py b/src/scribe_data/wikidata/update_data.py index d1cbd07b3..4b1374b1c 100644 --- a/src/scribe_data/wikidata/update_data.py +++ b/src/scribe_data/wikidata/update_data.py @@ -215,9 +215,9 @@ f"python {PATH_TO_LANGUAGE_EXTRACTION_FILES}/{lang}/{target_type}/format_{target_type}.py" ) - # Check current data within for formatted_data directories. + # Check current data within formatted data directories. 
with open( - f"{PATH_TO_LANGUAGE_EXTRACTION_FILES}/{lang.capitalize()}/formatted_data/{target_type}.json", + f"{os.path.dirname(sys.path[0]).split('scribe_data')[0]}/../language_data_export/{lang.capitalize()}/{target_type}.json", encoding="utf-8", ) as json_file: new_keyboard_data = json.load(json_file) diff --git a/src/scribe_data/wikipedia/process_wiki.py b/src/scribe_data/wikipedia/process_wiki.py index 4957680d5..c517b9c28 100644 --- a/src/scribe_data/wikipedia/process_wiki.py +++ b/src/scribe_data/wikipedia/process_wiki.py @@ -3,7 +3,9 @@ """ import json +import os import re +import sys import warnings from collections import Counter from itertools import chain @@ -331,7 +333,7 @@ def gen_autosuggestions( Strings that should be removed from the text body. update_local_data : bool (default=False) - Saves the created dictionaries as JSONs in the local formatted_data directories. + Saves the created dictionaries as JSONs in the target directories. verbose : bool (default=True) Whether to show a tqdm progress bar for the process. @@ -418,7 +420,7 @@ def gen_autosuggestions( if update_local_data: path_to_formatted_data = ( get_path_from_wikidata_dir() - + f"/Scribe-Data/src/scribe_data/language_data_extraction/{language.capitalize()}/formatted_data/autosuggestions.json" + + f"{os.path.dirname(sys.path[0]).split('scribe_data')[0]}/../language_data_export/{language}/autosuggestions.json" ) with open(