diff --git a/qanary-component-MT-Python-HelsinkiNLP/Dockerfile b/qanary-component-MT-Python-HelsinkiNLP/Dockerfile index 61eb53cca..b1adee742 100644 --- a/qanary-component-MT-Python-HelsinkiNLP/Dockerfile +++ b/qanary-component-MT-Python-HelsinkiNLP/Dockerfile @@ -1,14 +1,21 @@ -FROM python:3.7 +FROM python:3.10 COPY requirements.txt ./ RUN pip install --upgrade pip -RUN pip install -r requirements.txt; exit 0 -RUN pip install gunicorn +RUN pip install -r requirements.txt COPY component component +COPY utils utils COPY run.py boot.sh ./ +# to allow preconfigured images +ARG SOURCE_LANGUAGE +ARG TARGET_LANGUAGE + +ENV SOURCE_LANGUAGE=$SOURCE_LANGUAGE +ENV TARGET_LANGUAGE=$TARGET_LANGUAGE + RUN chmod +x boot.sh ENTRYPOINT ["./boot.sh"] diff --git a/qanary-component-MT-Python-HelsinkiNLP/README.md b/qanary-component-MT-Python-HelsinkiNLP/README.md index ad079b96b..112ec186a 100644 --- a/qanary-component-MT-Python-HelsinkiNLP/README.md +++ b/qanary-component-MT-Python-HelsinkiNLP/README.md @@ -54,8 +54,9 @@ SPRING_BOOT_ADMIN_CLIENT_INSTANCE_SERVICE-BASE-URL=http://public-component-host: SPRING_BOOT_ADMIN_USERNAME=admin SPRING_BOOT_ADMIN_PASSWORD=admin SERVICE_NAME_COMPONENT=MT-Helsinki-NLP -SERVICE_DESCRIPTION_COMPONENT=Translates question to English +SERVICE_DESCRIPTION_COMPONENT=Translates questions SOURCE_LANGUAGE=de +TARGET_LANGUAGE=en ``` The parameters description: @@ -68,7 +69,8 @@ The parameters description: * `SPRING_BOOT_ADMIN_CLIENT_INSTANCE_SERVICE-BASE-URL` -- the URL of your Qanary component (has to be visible to the Qanary pipeline) * `SERVICE_NAME_COMPONENT` -- the name of your Qanary component (for better identification) * `SERVICE_DESCRIPTION_COMPONENT` -- the description of your Qanary component -* `SOURCE_LANGUAGE` -- (optional) the source language of the text (the component will use langdetect if no source language is given) +* `SOURCE_LANGUAGE` -- (optional) the default source language of the translation +* `TARGET_LANGUAGE` -- (optional) the 
default target language of the translation 4. Build the Docker image: @@ -82,18 +84,43 @@ docker-compose build . docker-compose up ``` -After execution, component creates Qanary annotation in the Qanary triplestore: +After successful execution, component creates Qanary annotation in the Qanary triplestore: ``` GRAPH { - ?a a qa:AnnotationOfQuestionLanguage . - ?a qa:translationResult "translation result" . - ?a qa:sourceLanguage "ISO_639-1 language code" . - ?a oa:annotatedBy . - ?a oa:annotatedAt ?time . - } + ?a a qa:AnnotationOfQuestionTranslation . + ?a oa:hasTarget . + ?a oa:hasBody "translation_result"@ISO_639-1 language code + ?a oa:annotatedBy . + ?a oa:annotatedAt ?time . } ``` +### Support for multiple Source and Target Languages + +This component relies on the presence of one or more existing annotations that associate a question text with a language. +This can be in the form of an `AnnotationOfQuestionLanguage`, as created by LD components, or an `AnnotationOfQuestionTranslation` as created by MT components. + +It supports multiple combinations of source and target languages. +You can specify a desired source and target language independently, or simply use all available language pairings. + +If a `SOURCE_LANGUAGE` is set, then only texts with this specific language are considered for translation. +If none is set, then all configured source languages will be used to find candidates for translation. + +Similarly, if a `TARGET_LANGUAGE` is set, then texts are only translated into that language. +If none is set, then the texts are translated into all target languages that are supported for their respective source language. + +Note that while configured source languages naturally determine the possible target languages, +the configured target languages also determine which source languages can be supported! 
+ +### Pre-configured Docker Images + +You may use the included file `docker-compose-pairs.yml` to build a list of images that are preconfigured for specific language pairs. +Note that if you intend to use these containers at the same time, you need to assign different `SERVER_PORT` values for each image. + +```bash +docker-compose -f docker-compose-pairs.yml build +``` + ## How To Test This Component This component uses the [pytest](https://docs.pytest.org/). diff --git a/qanary-component-MT-Python-HelsinkiNLP/boot.sh b/qanary-component-MT-Python-HelsinkiNLP/boot.sh index 64a688048..43630043a 100755 --- a/qanary-component-MT-Python-HelsinkiNLP/boot.sh +++ b/qanary-component-MT-Python-HelsinkiNLP/boot.sh @@ -1,16 +1,33 @@ -#!/bin/sh +#!/bin/bash +export $(grep -v "^#" < .env) +# check required parameters +declare -a required_vars=( +"SPRING_BOOT_ADMIN_URL" +"SERVER_HOST" +"SERVER_PORT" +"SPRING_BOOT_ADMIN_USERNAME" +"SPRING_BOOT_ADMIN_PASSWORD" +"SERVICE_NAME_COMPONENT" +"SERVICE_DESCRIPTION_COMPONENT" +) -export $(grep -v '^#' .env | xargs) +for param in ${required_vars[@]}; +do + if [[ -z ${!param} ]]; then + echo "Required variable \"$param\" is not set!" 
+ echo "The required variables are: ${required_vars[@]}" + exit 4 + fi +done -echo Downloading the model -python -c "from transformers.models.marian.modeling_marian import MarianMTModel; from transformers.models.marian.tokenization_marian import MarianTokenizer; supported_langs = ['ru', 'es', 'de', 'fr']; models = {lang: MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-{lang}-en'.format(lang=lang)) for lang in supported_langs}; tokenizers = {lang: MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-{lang}-en'.format(lang=lang)) for lang in supported_langs}" -echo Downloading the model finished +echo Downloading the models + +python -c "from utils.model_utils import load_models_and_tokenizers; SUPPORTED_LANGS = { 'en': ['de', 'fr', 'ru', 'es'], 'de': ['en', 'fr', 'es'], 'fr': ['en', 'de', 'ru', 'es'], 'ru': ['en', 'fr', 'es'], 'es': ['en', 'de', 'fr', 'es'], }; load_models_and_tokenizers(SUPPORTED_LANGS); " +echo Downloading the model finished echo The port number is: $SERVER_PORT +echo The host is: $SERVER_HOST echo The Qanary pipeline URL is: $SPRING_BOOT_ADMIN_URL -if [ -n $SERVER_PORT ] -then - exec gunicorn -b :$SERVER_PORT --access-logfile - --error-logfile - run:app # refer to the gunicorn documentation for more options -fi +exec uvicorn run:app --host 0.0.0.0 --port $SERVER_PORT --log-level warning diff --git a/qanary-component-MT-Python-HelsinkiNLP/component/__init__.py b/qanary-component-MT-Python-HelsinkiNLP/component/__init__.py index 75d0e3807..f84862e10 100644 --- a/qanary-component-MT-Python-HelsinkiNLP/component/__init__.py +++ b/qanary-component-MT-Python-HelsinkiNLP/component/__init__.py @@ -1,27 +1,33 @@ -from component.mt_helsinki_nlp import mt_helsinki_nlp_bp -from flask import Flask +from component import mt_helsinki_nlp +from fastapi import FastAPI +from fastapi.responses import RedirectResponse, Response -version = "0.1.2" +version = "0.2.0" # default config file (use -c parameter on command line specify a custom config file) configfile 
= "app.conf" # endpoint for health information of the service required for Spring Boot Admin server callback -healthendpoint = "/health" - -aboutendpoint = "/about" +HEALTHENDPOINT = "/health" +ABOUTENDPOINT = "/about" +# TODO: add languages endpoint? # initialize Flask app and add the externalized service information -app = Flask(__name__) -app.register_blueprint(mt_helsinki_nlp_bp) +app = FastAPI(docs_url="/swagger-ui.html") +app.include_router(mt_helsinki_nlp.router) + + +@app.get("/") +async def main(): + return RedirectResponse("/about") -@app.route(healthendpoint, methods=['GET']) +@app.get(HEALTHENDPOINT, description="Shows the status of the component") def health(): """required health endpoint for callback of Spring Boot Admin server""" - return "alive" + return Response("alive", media_type="text/plain") -@app.route(aboutendpoint, methods=['GET']) +@app.get(ABOUTENDPOINT, description="Shows a description of the component") def about(): - """required about endpoint for callback of Spring Boot Admin server""" - return "about" + """required about endpoint for callback of Srping Boot Admin server""" + return Response("Translates questions into English", media_type="text/plain") diff --git a/qanary-component-MT-Python-HelsinkiNLP/component/mt_helsinki_nlp.py b/qanary-component-MT-Python-HelsinkiNLP/component/mt_helsinki_nlp.py index 16e379b0b..17e36b2dc 100644 --- a/qanary-component-MT-Python-HelsinkiNLP/component/mt_helsinki_nlp.py +++ b/qanary-component-MT-Python-HelsinkiNLP/component/mt_helsinki_nlp.py @@ -1,126 +1,179 @@ -import langid import logging import os -from flask import Blueprint, jsonify, request from qanary_helpers.qanary_queries import get_text_question_in_graph, insert_into_triplestore -from transformers.models.marian.modeling_marian import MarianMTModel -from transformers.models.marian.tokenization_marian import MarianTokenizer +from qanary_helpers.language_queries import get_translated_texts_in_triplestore, 
get_texts_with_detected_language_in_triplestore, QuestionTextWithLanguage, create_annotation_of_question_translation +from utils.model_utils import load_models_and_tokenizers +from utils.lang_utils import translation_options +from fastapi import APIRouter, Request +from fastapi.responses import JSONResponse + logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO) -mt_helsinki_nlp_bp = Blueprint('mt_helsinki_nlp_bp', __name__, template_folder='templates') +router = APIRouter() SERVICE_NAME_COMPONENT = os.environ['SERVICE_NAME_COMPONENT'] -SOURCE_LANG = os.environ["SOURCE_LANGUAGE"] -TARGET_LANG = "en" # currently only used for annotation -# TODO: no target language is set, because only 'en' is supported -# TODO: determine supported target langs and download models for that - -supported_langs = ['ru', 'es', 'de', 'fr'] -langid.set_languages(supported_langs) -models = {lang: MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-{lang}-en'.format(lang=lang)) for lang in supported_langs} -tokenizers = {lang: MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-{lang}-en'.format(lang=lang)) for lang in supported_langs} - - -@mt_helsinki_nlp_bp.route("/annotatequestion", methods=['POST']) -def qanary_service(): - """the POST endpoint required for a Qanary service""" - - triplestore_endpoint = request.json["values"]["urn:qanary#endpoint"] - triplestore_ingraph = request.json["values"]["urn:qanary#inGraph"] - triplestore_outgraph = request.json["values"]["urn:qanary#outGraph"] - logging.info("endpoint: %s, inGraph: %s, outGraph: %s" % (triplestore_endpoint, triplestore_ingraph, triplestore_outgraph)) +TRANSLATEENDPOINT = "/translate" - text = get_text_question_in_graph(triplestore_endpoint=triplestore_endpoint, graph=triplestore_ingraph)[0]['text'] - question_uri = get_text_question_in_graph(triplestore_endpoint=triplestore_endpoint, graph=triplestore_ingraph)[0]['uri'] - logging.info(f'Question Text: {text}') +models, tokenizers = 
load_models_and_tokenizers(translation_options) - if SOURCE_LANG != None and len(SOURCE_LANG.strip()) > 0: - lang = SOURCE_LANG - logging.info("Using custom SOURCE_LANGUAGE") - else: - lang, prob = langid.classify(text) - logging.info("No SOURCE_LANGUAGE specified, using langid!") - logging.info(f"source language: {lang}") - if lang not in supported_langs: - raise RuntimeError(f"source language {lang} is not supported!") +def translate_input(text: str, source_lang: str, target_lang: str) -> str: + """Translates text from a source language into a target language. + Parameters: + text (str): Text to be translated + source_lang (str): Language of the text + target_lang (str): Language of the translation - batch = tokenizers[lang]([text], return_tensors="pt", padding=True) + Returns: + str: The translated text + """ + logging.info(f"translating \"{text}\" from \"{source_lang}\" to \"{target_lang}\"") + batch = tokenizers[source_lang][target_lang]([text], return_tensors="pt", padding=True) # Make sure that the tokenized text does not exceed the maximum # allowed size of 512 batch["input_ids"] = batch["input_ids"][:, :512] batch["attention_mask"] = batch["attention_mask"][:, :512] # Perform the translation and decode the output - translation = models[lang].generate(**batch) - result = tokenizers[lang].batch_decode(translation, skip_special_tokens=True)[0] - - # building SPARQL query TODO: verify this annotation AnnotationOfQuestionTranslation ?? - SPARQLqueryAnnotationOfQuestionTranslation = """ - PREFIX qa: - PREFIX oa: - PREFIX xsd: - - INSERT {{ - GRAPH <{uuid}> {{ - ?a a qa:AnnotationOfQuestionTranslation ; - oa:hasTarget <{qanary_question_uri}> ; - oa:hasBody "{translation_result}"@{target_lang} ; - oa:annotatedBy ; - oa:annotatedAt ?time . - - }} - }} - WHERE {{ - BIND (IRI(str(RAND())) AS ?a) . 
- BIND (now() as ?time) - }} - """.format( - uuid=triplestore_ingraph, - qanary_question_uri=question_uri, - translation_result=result.replace("\"", "\\\""), #keep quotation marks that are part of the translation - target_lang=TARGET_LANG, - app_name=SERVICE_NAME_COMPONENT - ) - - SPARQLqueryAnnotationOfQuestionLanguage = """ - PREFIX qa: - PREFIX oa: - PREFIX xsd: - - INSERT {{ - GRAPH <{uuid}> {{ - ?b a qa:AnnotationOfQuestionLanguage ; - oa:hasTarget <{qanary_question_uri}> ; - oa:hasBody "{src_lang}"^^xsd:string ; - oa:annotatedBy ; - oa:annotatedAt ?time . - }} - }} - WHERE {{ - BIND (IRI(str(RAND())) AS ?b) . - BIND (now() as ?time) - }} - """.format( - uuid=triplestore_ingraph, - qanary_question_uri=question_uri, - src_lang=lang, - app_name=SERVICE_NAME_COMPONENT - ) - - logging.info(f'SPARQL: {SPARQLqueryAnnotationOfQuestionTranslation}') - logging.info(f'SPARQL: {SPARQLqueryAnnotationOfQuestionLanguage}') - # inserting new data to the triplestore - insert_into_triplestore(triplestore_endpoint, SPARQLqueryAnnotationOfQuestionTranslation) - insert_into_triplestore(triplestore_endpoint, SPARQLqueryAnnotationOfQuestionLanguage) - - return jsonify(request.get_json()) - - -@mt_helsinki_nlp_bp.route("/", methods=['GET']) -def index(): - """an examplary GET endpoint returning "hello world (String)""" - - logging.info("host_url: %s" % (request.host_url,)) - return "Hi! 
\n This is Python MT Helsinki NLP component" + translation = models[source_lang][target_lang].generate(**batch) + result = tokenizers[source_lang][target_lang].batch_decode(translation, skip_special_tokens=True)[0] + logging.info(f"result: \"{result}\"") + translation = result.replace("\"", "\\\"") #keep quotation marks that are part of the translation + return translation + + +@router.get("/translate_to_one", description="Translate a text from a given source language into one target language.", tags=["Translate"]) +def translate_to_one(text: str, source_lang: str, target_lang: str) -> dict[str, str]: + """Translates a text from a given source language into one target language. + + Parameters: + text (str): Text to be translated + source_lang (str): Language of the text + target_lang (str): Language of the translation + + Returns: + dict: A dictionary of translations identified by their target language (only one in this case) + """ + + if (source_lang in translation_options.keys()) and (target_lang in translation_options.get(source_lang, [])): + translation = translate_input(text, source_lang, target_lang) + return {target_lang: translation} + else: + raise RuntimeError("Unsupported source and/or target language! Valid options: {to}".format(to=translation_options)) + + +@router.get("/translate_to_all", description="Translate a text from a given source language into all configured target languages for that source language.", tags=["Translate"]) +def translate_to_all(text: str, source_lang: str) -> dict[str, str]: + """Translates a text from a given source language into all target configured languages for that source language. 
+ + Parameters: + text (str): Text to be translated + source_lang (str): Language of the text + target_lang (str): Language of the translation + + Returns: + dict: A dictionary of translations identified by their target language + """ + + if source_lang in translation_options.keys(): + translations = dict() + for target_lang in translation_options[source_lang]: + translation = translate_input(text, source_lang, target_lang) + translations.update({ + target_lang: translation + }) + return translations + else: + raise RuntimeError("Unsupported source language! Valid options: {to}".format(to=translation_options)) + + +def find_source_texts_in_triplestore(triplestore_endpoint: str, graph_uri: str, lang: str) -> list[QuestionTextWithLanguage]: + """Retrieves questions of a specific language from the triplestore. + + Parameters: + triplestore_endpoint (str): URL of the triplestore endpoint + graph_uri (str): URI of the graph to query inside of the triplestore + lang (str): Expected language + + Returns: + list: A list of appropriate QuestionTextWithLanguage objects with information from the triplestore. + """ + + source_texts = [] + + # check if supported languages have been determined already (LD) + # (use filters) + # if so, use the target uris to find the question text to translate + ld_source_texts = get_texts_with_detected_language_in_triplestore(triplestore_endpoint, graph_uri, lang) + source_texts.extend(ld_source_texts) + + # check if there are translations into the relevant language (MT) + # (use filters) + # if so, use the translation texts + mt_source_texts = get_translated_texts_in_triplestore(triplestore_endpoint, graph_uri, lang) + source_texts.extend(mt_source_texts) + + # TODO: what if nothing found? 
+ if len(source_texts) == 0: + logging.warning(f"No source texts with language {lang} could be found In the triplestore!") + + return source_texts + + +@router.post("/annotatequestion", description="Standard process method for Qanary components", tags=["Qanary"]) +async def qanary_service(request: Request): + """the POST endpoint required for a Qanary service""" + + # Retrieve basic information about the current question process + request_json = await request.json() + + triplestore_endpoint = request_json["values"]["urn:qanary#endpoint"] + triplestore_ingraph = request_json["values"]["urn:qanary#inGraph"] + triplestore_outgraph = request_json["values"]["urn:qanary#outGraph"] + logging.info("endpoint: %s, inGraph: %s, outGraph: %s" % (triplestore_endpoint, triplestore_ingraph, triplestore_outgraph)) + + + text_question_in_graph = get_text_question_in_graph(triplestore_endpoint=triplestore_endpoint, graph=triplestore_ingraph) + question_text = text_question_in_graph[0]['text'] + logging.info(f'Original question text: {question_text}') + + + # Collect texts to be translated (group by source language) + + source_texts_per_language = dict() + # keep a list of annotations to insert + insert_annotations = list() + + for source_lang in translation_options.keys(): + source_texts = find_source_texts_in_triplestore( + triplestore_endpoint=triplestore_endpoint, + graph_uri=triplestore_ingraph, + lang=source_lang + ) + source_texts_per_language.update({source_lang: source_texts}) + + # for every source language that has associated texts + for source_lang in source_texts_per_language.keys(): + # translate each found text + for source_text in source_texts_per_language[source_lang]: + # into every target language that is supported for this source language + for target_lang in translation_options[source_lang]: + translation = translate_input(source_text.get_text(), source_lang, target_lang) + if len(translation.strip()) > 0: + SPARQLqueryAnnotationOfQuestionTranslation = 
create_annotation_of_question_translation( + graph_uri=triplestore_ingraph, + question_uri=source_text.get_uri(), + translation=translation, + translation_language=target_lang, + app_name=SERVICE_NAME_COMPONENT + ) + insert_annotations.append(SPARQLqueryAnnotationOfQuestionTranslation) + else: + logging.error(f"result is empty string!") + + # insert the created annotations into the triplestore + for insert_annotation in insert_annotations: + insert_into_triplestore(triplestore_endpoint, insert_annotation) + + return JSONResponse(request_json) diff --git a/qanary-component-MT-Python-HelsinkiNLP/docker-compose-pairs.yml b/qanary-component-MT-Python-HelsinkiNLP/docker-compose-pairs.yml new file mode 100644 index 000000000..aedb587c3 --- /dev/null +++ b/qanary-component-MT-Python-HelsinkiNLP/docker-compose-pairs.yml @@ -0,0 +1,108 @@ +version: '3' +services: + + component-en-de: + # for building from source + image: qanary/qanary-component-mt-python-helsinkinlp-en-de:latest + build: + context: . + dockerfile: Dockerfile + args: + - SOURCE_LANGUAGE=en + - TARGET_LANGUAGE=de + env_file: + - .env + environment: + - SERVICE_NAME_COMPONENT=MT-helsinkinlp-en-de + volumes: + - ~/.cache/huggingface:/root/.cache/huggingface/ + network_mode: host + + component-en-ru: + # for building from source + image: qanary/qanary-component-mt-python-helsinkinlp-en-ru:latest + build: + context: . + dockerfile: Dockerfile + args: + - SOURCE_LANGUAGE=en + - TARGET_LANGUAGE=ru + env_file: + - .env + environment: + - SERVICE_NAME_COMPONENT=MT-helsinkinlp-en-ru + volumes: + - ~/.cache/huggingface:/root/.cache/huggingface/ + network_mode: host + + + component-en-es: + # for building from source + image: qanary/qanary-component-mt-python-helsinkinlp-en-es:latest + build: + context: . 
+ dockerfile: Dockerfile + args: + - SOURCE_LANGUAGE=en + - TARGET_LANGUAGE=es + env_file: + - .env + environment: + - SERVICE_NAME_COMPONENT=MT-helsinkinlp-en-es + volumes: + - ~/.cache/huggingface:/root/.cache/huggingface/ + network_mode: host + + + component-de-en: + # for building from source + image: qanary/qanary-component-mt-python-helsinkinlp-de-en:latest + build: + context: . + dockerfile: Dockerfile + args: + - SOURCE_LANGUAGE=de + - TARGET_LANGUAGE=en + env_file: + - .env + environment: + - SERVICE_NAME_COMPONENT=MT-helsinkinlp-de-en + volumes: + - ~/.cache/huggingface:/root/.cache/huggingface/ + network_mode: host + + + component-ru-en: + # for building from source + image: qanary/qanary-component-mt-python-helsinkinlp-ru-en:latest + build: + context: . + dockerfile: Dockerfile + args: + - SOURCE_LANGUAGE=ru + - TARGET_LANGUAGE=en + env_file: + - .env + environment: + - SERVICE_NAME_COMPONENT=MT-helsinkinlp-ru-en + volumes: + - ~/.cache/huggingface:/root/.cache/huggingface/ + network_mode: host + + + component-es-en: + # for building from source + image: qanary/qanary-component-mt-python-helsinkinlp-es-en:latest + build: + context: . + dockerfile: Dockerfile + args: + - SOURCE_LANGUAGE=es + - TARGET_LANGUAGE=en + env_file: + - .env + environment: + - SERVICE_NAME_COMPONENT=MT-helsinkinlp-es-en + volumes: + - ~/.cache/huggingface:/root/.cache/huggingface/ + network_mode: host diff --git a/qanary-component-MT-Python-HelsinkiNLP/docker-compose.yml b/qanary-component-MT-Python-HelsinkiNLP/docker-compose.yml index 1df13709f..3e40c41f9 100644 --- a/qanary-component-MT-Python-HelsinkiNLP/docker-compose.yml +++ b/qanary-component-MT-Python-HelsinkiNLP/docker-compose.yml @@ -2,12 +2,14 @@ version: '3' services: component: # for building from source - image: qanary-component-mt-python-helsinki:latest + image: qanary/qanary-component-mt-python-helsinkinlp:0.2.0 build: context: . 
dockerfile: Dockerfile - network_mode: host env_file: - .env + environment: + - SERVICE_NAME_COMPONENT=MT-Helsinki-all volumes: - ~/.cache/huggingface:/root/.cache/huggingface/ + network_mode: host diff --git a/qanary-component-MT-Python-HelsinkiNLP/pytest.ini b/qanary-component-MT-Python-HelsinkiNLP/pytest.ini index 93627edc6..e3075bac1 100644 --- a/qanary-component-MT-Python-HelsinkiNLP/pytest.ini +++ b/qanary-component-MT-Python-HelsinkiNLP/pytest.ini @@ -1,14 +1,13 @@ [pytest] -log_cli = 0 +log_cli = 1 log_cli_level = INFO log_cli_format = %(asctime)s [%(levelname)8s] [%(filename)s:%(lineno)s] %(message)s log_cli_date_format=%Y-%m-%d %H:%M:%S env = - SPRING_BOOT_ADMIN_URL=https://localhost:43740 + SPRING_BOOT_ADMIN_URL=https://localhost:8080 SPRING_BOOT_ADMIN_USERNAME=admin SPRING_BOOT_ADMIN_PASSWORD=admin - SERVICE_HOST=http://localhost: - SERVICE_PORT=41062 + SERVER_HOST=http://localhost: + SERVER_PORT=8088 SERVICE_NAME_COMPONENT=MT-Helsinki-NLP-Component SERVICE_DESCRIPTION_COMPONENT=MT tool that uses pre-trained models by Helsinki NLP implemented in transformers library - SOURCE_LANGUAGE=de diff --git a/qanary-component-MT-Python-HelsinkiNLP/requirements.txt b/qanary-component-MT-Python-HelsinkiNLP/requirements.txt index 10fe627f3..a604a81e3 100644 --- a/qanary-component-MT-Python-HelsinkiNLP/requirements.txt +++ b/qanary-component-MT-Python-HelsinkiNLP/requirements.txt @@ -1,9 +1,10 @@ -transformers -torch -SentencePiece -qanary-helpers -SPARQLWrapper -Flask -langid -pytest -pytest-env +fastapi==0.109.1 +pytest==8.3.2 +pytest-env==1.1.3 +SentencePiece==0.2.0 +SPARQLWrapper==2.0.0 +torch==2.4.0 +transformers==4.44.0 +qanary-helpers==0.3.2 +uvicorn==0.30.1 +httpx==0.27.0 diff --git a/qanary-component-MT-Python-HelsinkiNLP/run.py b/qanary-component-MT-Python-HelsinkiNLP/run.py index e0fd511e5..49d8a9cc6 100644 --- a/qanary-component-MT-Python-HelsinkiNLP/run.py +++ b/qanary-component-MT-Python-HelsinkiNLP/run.py @@ -4,32 +4,35 @@ from 
qanary_helpers.registration import Registration from qanary_helpers.registrator import Registrator -from component import app, healthendpoint, aboutendpoint +from component import app, HEALTHENDPOINT, ABOUTENDPOINT -logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO) +logging.basicConfig(level=logging.ERROR) +# TODO: get logger from module +logger = logging.getLogger(__name__) +logger.setLevel(logging.WARNING) SPRING_BOOT_ADMIN_URL = os.getenv('SPRING_BOOT_ADMIN_URL') SPRING_BOOT_ADMIN_USERNAME = os.getenv('SPRING_BOOT_ADMIN_USERNAME') SPRING_BOOT_ADMIN_PASSWORD = os.getenv('SPRING_BOOT_ADMIN_PASSWORD') -SERVICE_HOST = os.getenv('SERVER_HOST') -SERVICE_PORT = os.getenv('SERVER_PORT') +SERVER_HOST = os.getenv('SERVER_HOST') +SERVER_PORT = os.getenv('SERVER_PORT') SERVICE_NAME_COMPONENT = os.getenv('SERVICE_NAME_COMPONENT') SERVICE_DESCRIPTION_COMPONENT = os.getenv('SERVICE_DESCRIPTION_COMPONENT') -URL_COMPONENT = f"{SERVICE_HOST}:{SERVICE_PORT}" +URL_COMPONENT = f"{SERVER_HOST}:{SERVER_PORT}" # define metadata that will be shown in the Spring Boot Admin server UI metadata = { "start": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "description": SERVICE_DESCRIPTION_COMPONENT, - "about": f"{SERVICE_HOST}:{SERVICE_PORT}{aboutendpoint}", + "about": f"{SERVER_HOST}:{SERVER_PORT}{ABOUTENDPOINT}", "written in": "Python" } # initialize the registration object, to be send to the Spring Boot Admin server registration = Registration( name=SERVICE_NAME_COMPONENT, - serviceUrl=f"{SERVICE_HOST}:{SERVICE_PORT}", - healthUrl=f"{SERVICE_HOST}:{SERVICE_PORT}{healthendpoint}", + serviceUrl=f"{SERVER_HOST}:{SERVER_PORT}", + healthUrl=f"{SERVER_HOST}:{SERVER_PORT}{HEALTHENDPOINT}", metadata=metadata ) @@ -47,4 +50,5 @@ if __name__ == "__main__": # start the web service - app.run(debug=True, port=SERVICE_PORT) \ No newline at end of file + if SERVER_PORT == None: + raise RuntimeError("SERVER_PORT must not be empty!") diff --git 
a/qanary-component-MT-Python-HelsinkiNLP/tests/plugins/env_vars.py b/qanary-component-MT-Python-HelsinkiNLP/tests/plugins/env_vars.py index 295510e00..acf244f6c 100644 --- a/qanary-component-MT-Python-HelsinkiNLP/tests/plugins/env_vars.py +++ b/qanary-component-MT-Python-HelsinkiNLP/tests/plugins/env_vars.py @@ -7,6 +7,6 @@ def pytest_load_initial_conftests(args, early_config, parser): os.environ["SPRING_BOOT_ADMIN_USERNAME"]="admin" os.environ["SPRING_BOOT_ADMIN_PASSWORD"]="admin" os.environ["SERVICE_HOST"]="http://webengineering.ins.hs-anhalt.de" - os.environ["SERVICE_PORT"]="41062" + os.environ["SERVER_PORT"]="41062" os.environ["SERVICE_NAME_COMPONENT"]="MT-Helsinki-NLP-Component" os.environ["SERVICE_DESCRIPTION_COMPONENT"]="MT tool that uses pre-trained models by Helsinki NLP implemented in transformers library" diff --git a/qanary-component-MT-Python-HelsinkiNLP/tests/test_lang_utils.py b/qanary-component-MT-Python-HelsinkiNLP/tests/test_lang_utils.py new file mode 100644 index 000000000..e2b2d14d6 --- /dev/null +++ b/qanary-component-MT-Python-HelsinkiNLP/tests/test_lang_utils.py @@ -0,0 +1,66 @@ +import logging +from unittest import mock +from unittest import TestCase +import os +import importlib + +class TestLangUtils(TestCase): + + logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO) + + @mock.patch.dict(os.environ, {'SOURCE_LANGUAGE': 'fr'}) + def test_only_one_source_language(self): + import utils.lang_utils + importlib.reload(utils.lang_utils) + from utils.lang_utils import translation_options + assert 'fr' in translation_options.keys() + assert len(translation_options.keys()) == 1 + + + @mock.patch.dict(os.environ, {'TARGET_LANGUAGE': 'ru'}) + def test_only_one_target_language(self): + import utils.lang_utils + importlib.reload(utils.lang_utils) + from utils.lang_utils import translation_options + # currently, there are only two source languages that support target language 'ru' + assert len(translation_options.items()) == 2 + 
assert ('en', ['ru']) in translation_options.items() + assert ('fr', ['ru']) in translation_options.items() + + + @mock.patch.dict(os.environ, {'SOURCE_LANGUAGE': 'en', 'TARGET_LANGUAGE': 'es'}) + def test_specific_source_and_target_language(self): + import utils.lang_utils + importlib.reload(utils.lang_utils) + from utils.lang_utils import translation_options + assert translation_options == {'en': ['es']} + + + @mock.patch.dict(os.environ, {'SOURCE_LANGUAGE': 'zh'}) + def test_unsupported_source_language_raises_error(self): + try: + import utils.lang_utils + importlib.reload(utils.lang_utils) + except ValueError as ve: + logging.error(ve) + pass + + + @mock.patch.dict(os.environ, {'SOURCE_LANGUAGE': 'en', 'TARGET_LANGUAGE': 'zh'}) + def test_unsupported_target_for_source_language_raises_error(self): + try: + import utils.lang_utils + importlib.reload(utils.lang_utils) + except ValueError as ve: + logging.error(ve) + pass + + + @mock.patch.dict(os.environ, {'TARGET_LANGUAGE': 'zh'}) + def test_unsupported_target_language_raises_error(self): + try: + import utils.lang_utils + importlib.reload(utils.lang_utils) + except ValueError as ve: + logging.error(ve) + pass diff --git a/qanary-component-MT-Python-HelsinkiNLP/tests/test_mt_helsinky_nlp.py b/qanary-component-MT-Python-HelsinkiNLP/tests/test_mt_helsinky_nlp.py index 371bdd940..99add4a45 100644 --- a/qanary-component-MT-Python-HelsinkiNLP/tests/test_mt_helsinky_nlp.py +++ b/qanary-component-MT-Python-HelsinkiNLP/tests/test_mt_helsinky_nlp.py @@ -1,14 +1,19 @@ -from component.mt_helsinki_nlp import * +import logging from component import app +from fastapi.testclient import TestClient from unittest.mock import patch +from unittest import mock import re from unittest import TestCase +from qanary_helpers.language_queries import QuestionTextWithLanguage +import os +import importlib class TestComponent(TestCase): logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO) - questions = list([{"uri": 
"urn:test-uri", "text": "was ist ein Test?"}]) + questions = list([{"uri": "urn:test-uri", "text": "Was ist die Hauptstadt von Deutschland?"}]) endpoint = "urn:qanary#test-endpoint" in_graph = "urn:qanary#test-inGraph" out_graph = "urn:qanary#test-outGraph" @@ -16,6 +21,10 @@ class TestComponent(TestCase): source_language = "de" target_language = "en" + source_texts = [ + QuestionTextWithLanguage("uri", "Was ist die Hauptstadt von Deutschland?", "de") + ] + request_data = '''{ "values": { "urn:qanary#endpoint": "urn:qanary#test-endpoint", @@ -31,45 +40,51 @@ class TestComponent(TestCase): "Content-Type": "application/json" } - def test_qanary_service(self): + client = TestClient(app) + @mock.patch.dict(os.environ, {'SOURCE_LANGUAGE': 'de', 'TARGET_LANGUAGE': 'en'}) + def test_qanary_service(self): + import utils.lang_utils + importlib.reload(utils.lang_utils) + import component.mt_helsinki_nlp + importlib.reload(component.mt_helsinki_nlp) + from component import app - logging.info("port: %s" % (os.environ["SERVICE_PORT"])) + logging.info("port: %s" % (os.environ["SERVER_PORT"])) assert os.environ["SERVICE_NAME_COMPONENT"] == "MT-Helsinki-NLP-Component" + assert os.environ["SOURCE_LANGUAGE"] == self.source_language + assert os.environ["TARGET_LANGUAGE"] == self.target_language - with app.test_client() as client, \ - patch('component.mt_helsinki_nlp.get_text_question_in_graph') as mocked_get_text_question_in_graph, \ - patch('component.mt_helsinki_nlp.insert_into_triplestore') as mocked_insert_into_triplestore: + with patch('component.mt_helsinki_nlp.get_text_question_in_graph') as mocked_get_text_question_in_graph, \ + patch('component.mt_helsinki_nlp.find_source_texts_in_triplestore') as mocked_find_source_texts_in_triplestore, \ + patch('component.mt_helsinki_nlp.insert_into_triplestore') as mocked_insert_into_triplestore: # given a non-english question is present in the current graph mocked_get_text_question_in_graph.return_value = self.questions + 
mocked_find_source_texts_in_triplestore.return_value = self.source_texts mocked_insert_into_triplestore.return_value = None # when a call to /annotatequestion is made - response_json = client.post("/annotatequestion", headers = self.headers, data = self.request_data) + response_json = self.client.post("/annotatequestion", headers = self.headers, data = self.request_data) # then the text question is retrieved from the triplestore mocked_get_text_question_in_graph.assert_called_with(triplestore_endpoint=self.endpoint, graph=self.in_graph) + mocked_find_source_texts_in_triplestore.assert_called_with(triplestore_endpoint=self.endpoint, graph_uri=self.in_graph, lang=self.source_language) + assert mocked_find_source_texts_in_triplestore.call_count == 1 + # get arguments of the (2) separate insert calls arg_list = mocked_insert_into_triplestore.call_args_list # get the call arguments for question translation call_args_translation = [a.args for a in arg_list if "AnnotationOfQuestionTranslation" in a.args[1]] assert len(call_args_translation) == 1 - # get the call arguments for question language - call_args_language = [a.args for a in arg_list if "AnnotationOfQuestionLanguage" in a.args[1]] - assert len(call_args_language) == 1 # clean query strings query_translation = re.sub(r"(\\n\W*|\n\W*)", " ", call_args_translation[0][1]) - query_language = re.sub(r"(\\n\W*|\n\W*)", " ", call_args_language[0][1]) # then the triplestore is updated twice # (question language and translation) - assert mocked_insert_into_triplestore.call_count == 2 - - # then the source language is correctly identified and annotated - self.assertRegex(query_language, r".*AnnotationOfQuestionLanguage(.*;\W?)*oa:hasBody \""+self.source_language+r"\".*\.") + assert mocked_insert_into_triplestore.call_count == 1 # then the question is translated and the result is annotated self.assertRegex(query_translation, r".*AnnotationOfQuestionTranslation(.*;\W?)*oa:hasBody \".*\"@" + self.target_language + r".*\.") @@ 
-77,3 +92,28 @@ def test_qanary_service(self): # then the response is not empty assert response_json != None + + + # test with all supported languages enabled + def test_translate_input(self): + import component.mt_helsinki_nlp + from component.mt_helsinki_nlp import translate_input + import utils.lang_utils + importlib.reload(utils.lang_utils) + importlib.reload(component.mt_helsinki_nlp) + translations = [ + {"text": "Was ist die Hauptstadt von Deutschland?", + "translation": "What is the capital of Germany?", + "source_lang": "de", "target_lang": "en"}, + {"text": "What is the capital of Germany?", + "translation": "Quelle est la capitale de l'Allemagne?", + "source_lang": "en", "target_lang": "fr"}, + {"text": "What is the capital of Germany?", + "translation": "Какая столица Германии?", + "source_lang": "en", "target_lang": "ru"}, + ] + + for translation in translations: + expected = translation["translation"] + actual = translate_input(translation["text"], translation["source_lang"], translation["target_lang"]) + assert expected == actual diff --git a/qanary-component-MT-Python-HelsinkiNLP/utils/__init__.py b/qanary-component-MT-Python-HelsinkiNLP/utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/qanary-component-MT-Python-HelsinkiNLP/utils/lang_utils.py b/qanary-component-MT-Python-HelsinkiNLP/utils/lang_utils.py new file mode 100644 index 000000000..da87a2a48 --- /dev/null +++ b/qanary-component-MT-Python-HelsinkiNLP/utils/lang_utils.py @@ -0,0 +1,63 @@ +import os +import logging + + +logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO) + +SOURCE_LANGUAGE = os.getenv("SOURCE_LANGUAGE") +TARGET_LANGUAGE = os.getenv("TARGET_LANGUAGE") +SUPPORTED_LANGS = { +# source: targets + 'en': ['de', 'fr', 'ru', 'es'], + 'de': ['en', 'fr', 'es'], + 'fr': ['en', 'de', 'ru', 'es'], + 'ru': ['en', 'fr', 'es'], + 'es': ['en', 'de', 'fr'], +} + + +def setup_translation_options() -> dict: + """Creates a dictionary of possible 
source and target languages, based on SUPPORTED_LANGS and configured languages.""" + + logging.info("SETTING UP TRANSLATION OPTIONS") + translation_options = dict() # init empty + + # check if a source language is specified + if SOURCE_LANGUAGE != None and len(SOURCE_LANGUAGE.strip()) > 0: + # pre-select appropriate translation options from the list of supported source languages + try: + translation_options[SOURCE_LANGUAGE] = SUPPORTED_LANGS[SOURCE_LANGUAGE] + # this will fail for invalid keys! + except KeyError: + raise ValueError(f"The source language \"{SOURCE_LANGUAGE}\" is not supported!") + # if no source language is specified, use all source languages that are supported by the models + else: + translation_options = SUPPORTED_LANGS + + # check if a target language is specified + if TARGET_LANGUAGE != None and len(TARGET_LANGUAGE.strip()) > 0: + discard_keys = list() + # remove instances where source == target + translation_options.pop(TARGET_LANGUAGE, None) + for source_lang in translation_options.keys(): + if TARGET_LANGUAGE in translation_options[source_lang]: + translation_options[source_lang] = [TARGET_LANGUAGE] + else: + discard_keys.append(source_lang) + # cleanup keys + translation_options = {sl:tl for sl,tl in translation_options.items() if sl not in discard_keys} + # check for empty translation options, if all keys dropped + if len(translation_options.keys()) == 0: + raise ValueError("The target language \"{tl}\" is not supported for any configured source languages! \nValid language pairs (source: [targets]) are: \n{slk}!" + .format(tl=TARGET_LANGUAGE, slk=SUPPORTED_LANGS)) + # check if only some keys dropped + elif len(discard_keys) > 0: + logging.warning("Specific target language \"{tl}\" is not supported for these source languages: {dk}!. \nThese language pairs will be ignored." 
+ .format(tl=TARGET_LANGUAGE, dk=discard_keys)) + # else do nothing, the lists are already complete + + logging.info(translation_options) + return translation_options + + +translation_options = setup_translation_options() diff --git a/qanary-component-MT-Python-HelsinkiNLP/utils/model_utils.py b/qanary-component-MT-Python-HelsinkiNLP/utils/model_utils.py new file mode 100644 index 000000000..aeca3c368 --- /dev/null +++ b/qanary-component-MT-Python-HelsinkiNLP/utils/model_utils.py @@ -0,0 +1,19 @@ +from transformers.models.marian.modeling_marian import MarianMTModel +from transformers.models.marian.tokenization_marian import MarianTokenizer + + +def load_models_and_tokenizers(translation_options: dict): + """Loads models and tokenizers based on configured translation language pairs. + + Parameters: + translation_options (dict): Key is the source language, value is a list of configured target languages + """ + + models = {} + tokenizers = {} + for s_lang in translation_options.keys(): + lang_models = {t_lang: MarianMTModel.from_pretrained(f'Helsinki-NLP/opus-mt-{s_lang}-{t_lang}') for t_lang in translation_options[s_lang]} + lang_tokenizers = {t_lang: MarianTokenizer.from_pretrained(f'Helsinki-NLP/opus-mt-{s_lang}-{t_lang}') for t_lang in translation_options[s_lang]} + models[s_lang] = lang_models + tokenizers[s_lang] = lang_tokenizers + return models, tokenizers diff --git a/qanary-component-MT-Python-LibreTranslate/Dockerfile b/qanary-component-MT-Python-LibreTranslate/Dockerfile index 77e878f42..b1adee742 100644 --- a/qanary-component-MT-Python-LibreTranslate/Dockerfile +++ b/qanary-component-MT-Python-LibreTranslate/Dockerfile @@ -1,13 +1,20 @@ -FROM python:3.10.8 +FROM python:3.10 COPY requirements.txt ./ RUN pip install --upgrade pip -RUN pip install -r requirements.txt; exit 0 -RUN pip install gunicorn +RUN pip install -r requirements.txt COPY component component -COPY run.py boot.sh ./ +COPY utils utils +COPY run.py boot.sh ./ + +# to allow preconfigured 
images +ARG SOURCE_LANGUAGE +ARG TARGET_LANGUAGE + +ENV SOURCE_LANGUAGE=$SOURCE_LANGUAGE +ENV TARGET_LANGUAGE=$TARGET_LANGUAGE RUN chmod +x boot.sh diff --git a/qanary-component-MT-Python-LibreTranslate/README.md b/qanary-component-MT-Python-LibreTranslate/README.md index 1d6f87897..ee84a710d 100644 --- a/qanary-component-MT-Python-LibreTranslate/README.md +++ b/qanary-component-MT-Python-LibreTranslate/README.md @@ -22,12 +22,6 @@ Not applicable as the textual question is a default parameter. oa:hasBody "translation_result"@en ; oa:annotatedBy ; oa:annotatedAt "2001-10-26T21:32:52"^^xsd:dateTime . - - a qa:AnnotationOfQuestionLanguage . - oa:hasTarget ; - oa:hasBody "lang-id"^^xsd:string ; - oa:annotatedBy ; - oa:annotatedAt "2001-10-26T21:32:52"^^xsd:dateTime . ``` ## Usage @@ -55,6 +49,8 @@ SPRING_BOOT_ADMIN_PASSWORD=admin SERVICE_NAME_COMPONENT=LibreTranslate TRANSLATE_ENDPOINT=http://localhost:5000/translate LANGUAGES_ENDPOINT=http://localhost:5000/languages +SOURCE_LANGUAGE=de +TARGET_LANGUAGE=en ``` The parameters description: @@ -67,6 +63,8 @@ The parameters description: * `SERVICE_NAME_COMPONENT` -- the name of your Qanary component (for better identification) * `TRANSLATE_ENDPOINT` -- the LibreTranslate endpoint to be used for translation * `LANGUAGES_ENDPOINT` -- the LibreTranslate endpoint returning a list of supported languages +* `SOURCE_LANGUAGE` -- (optional) the default source language of the translation +* `TARGET_LANGUAGE` -- (optional) the default target language of the translation 4. pull the LibreTranslate image: @@ -93,18 +91,43 @@ docker-compose build docker-compose up latest ``` -After execution, component creates Qanary annotation in the Qanary triplestore: +After successful execution, component creates Qanary annotation in the Qanary triplestore: ``` GRAPH { - ?a a qa:AnnotationOfQuestionLanguage . - ?a qa:translationResult "translation result" . - ?a qa:sourceLanguage "ISO_639-1 language code" . - ?a oa:annotatedBy . 
- ?a oa:annotatedAt ?time . - } + ?a a qa:AnnotationOfQuestionTranslation . + ?a oa:hasTarget . + ?a oa:hasBody "translation_result"@ISO_639-1 language code + ?a oa:annotatedBy . + ?a oa:annotatedAt ?time . } ``` +### Support for multiple Source and Target Languages + +This component relies on the presence of one or more existing annotations that associate a question text with a language. +This can be in the form of an `AnnotationOfQuestionLanguage`, as created by LD components, or an `AnnotationOfQuestionTranslation` as created by MT components. + +It supports multiple combinations of source and target languages. +You can specify a desired source and target language independently, or simply use all available language pairings. + +If a `SOURCE_LANGUAGE` is set, then only texts with this specific language are considered for translation. +If none is set, then all configured source languages will be used to find candidates for translation. + +Similarly, if a `TARGET_LANGUAGE` is set, then texts are only translated into that language. +If none is set, then the texts are translated into all target languages that are supported for their respective source language. + +Note that while configured source languages naturally determine the possible target languages, +the configured target languages also determine which source languages can be supported! + +### Pre-configured Docker Images + +You may use the included file `docker-compose-pairs.yml` to build a list of images that are preconfigured for specific language pairs. +Note that if you intend to use these containers at the same time, you need to assign different `SERVER_PORT` values for each image. 
+ +```bash +docker-compose -f docker-compose-pairs.yml build +``` + ## Endpoints * `/about` -- (GET) a short service description diff --git a/qanary-component-MT-Python-LibreTranslate/boot.sh b/qanary-component-MT-Python-LibreTranslate/boot.sh index 01f029178..275018758 100755 --- a/qanary-component-MT-Python-LibreTranslate/boot.sh +++ b/qanary-component-MT-Python-LibreTranslate/boot.sh @@ -1,12 +1,29 @@ -#!/bin/sh +#!/bin/bash +export $(grep -v "^#" < .env) -export $(grep -v '^#' .env | xargs) +# check required parameters +declare -a required_vars=( +"SPRING_BOOT_ADMIN_URL" +"SERVER_HOST" +"SERVER_PORT" +"SPRING_BOOT_ADMIN_USERNAME" +"SPRING_BOOT_ADMIN_PASSWORD" +"SERVICE_NAME_COMPONENT" +"SERVICE_DESCRIPTION_COMPONENT" +"TRANSLATE_ENDPOINT" +"LANGUAGES_ENDPOINT" +) + +for param in ${required_vars[@]}; +do + if [[ -z ${!param} ]]; then + echo "Required variable \"$param\" is not set!" + echo "The required variables are: ${required_vars[@]}" + exit 4 + fi +done echo The port number is: $SERVER_PORT +echo The host is: $SERVER_HOST echo The Qanary pipeline URL is: $SPRING_BOOT_ADMIN_URL -if [ -n $SERVER_PORT ] -then - exec gunicorn -b :$SERVER_PORT --access-logfile - --error-logfile - run:app # refer to the gunicorn documentation for more options -fi - - +exec uvicorn run:app --host 0.0.0.0 --port $SERVER_PORT --log-level warning diff --git a/qanary-component-MT-Python-LibreTranslate/component/__init__.py b/qanary-component-MT-Python-LibreTranslate/component/__init__.py index 1de4ddf5b..2b2c27f76 100644 --- a/qanary-component-MT-Python-LibreTranslate/component/__init__.py +++ b/qanary-component-MT-Python-LibreTranslate/component/__init__.py @@ -1,33 +1,37 @@ -from component.mt_libretranslate import mt_libretranslate_bp -from component.mt_libretranslate import check_connection +from fastapi import FastAPI +from fastapi.responses import RedirectResponse, Response, JSONResponse +from component import mt_libretranslate from component.mt_libretranslate import 
get_languages -from flask import Flask -version = "0.1.2" +version = "0.2.0" # default config file configfile = "app.conf" # service status information -healthendpoint = "/health" -aboutendpoint = "/about" -languagesendpoint = "/languages" +HEALTHENDPOINT = "/health" +ABOUTENDPOINT = "/about" +LANGUAGESENDPOINT = "/languages" # init Flask app and add externalized service information -app = Flask(__name__) -app.register_blueprint(mt_libretranslate_bp) +app = FastAPI(docs_url="/swagger-ui.html") +app.include_router(mt_libretranslate.router) -@app.route(healthendpoint, methods=["GET"]) +@app.get("/") +async def main(): + return RedirectResponse("/about") + +@app.get(HEALTHENDPOINT, description="Shows the status of the component") def health(): """required health endpoint for callback of Spring Boot Admin server""" - status, message = check_connection() - return f"{'ALIVE' if status else 'DOWN'} - {message}" + return Response("alive", media_type="text/plain") -@app.route(aboutendpoint, methods=["GET"]) +@app.get(ABOUTENDPOINT, description="Shows a description of the component") def about(): """required about endpoint for callback of Srping Boot Admin server""" - return "Translates questions into English. \nSee /languages for a list of supported source languages!" 
+ return Response("Translates questions into English", media_type="text/plain") -@app.route(languagesendpoint, methods=["GET"]) +@app.get(LANGUAGESENDPOINT) def languages(): - return get_languages() + return JSONResponse(get_languages()) + diff --git a/qanary-component-MT-Python-LibreTranslate/component/mt_libretranslate.py b/qanary-component-MT-Python-LibreTranslate/component/mt_libretranslate.py index 419ac06b0..9ddd1b85d 100644 --- a/qanary-component-MT-Python-LibreTranslate/component/mt_libretranslate.py +++ b/qanary-component-MT-Python-LibreTranslate/component/mt_libretranslate.py @@ -1,126 +1,35 @@ -from langdetect import detect import logging import os import requests from flask import Blueprint, jsonify, request from qanary_helpers.qanary_queries import get_text_question_in_graph, insert_into_triplestore +from qanary_helpers.language_queries import get_translated_texts_in_triplestore, get_texts_with_detected_language_in_triplestore, QuestionTextWithLanguage, create_annotation_of_question_translation +from utils.lang_utils import translation_options +from fastapi import APIRouter, Request +from fastapi.responses import JSONResponse logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.INFO) - -mt_libretranslate_bp = Blueprint("mt_libretranslate_bp", __name__, template_folder="templates") +router = APIRouter() SERVICE_NAME_COMPONENT = os.environ["SERVICE_NAME_COMPONENT"] +TRANSLATIONENDPOINT = "/translate" -SOURCE_LANG = os.environ["SOURCE_LANGUAGE"] -#TARGET_LANG = os.environ["TARGET_LANGUAGE"] -TARGET_LANG = "en" # currently only supports English TRANSLATE_ENDPOINT = os.environ["TRANSLATE_ENDPOINT"] LANGUAGES_ENDPOINT = os.environ["LANGUAGES_ENDPOINT"] -@mt_libretranslate_bp.route("/annotatequestion", methods=["POST"]) -def qanary_service(): - """the POST endpoint required for a Qanary service""" - - triplestore_endpoint = request.json["values"]["urn:qanary#endpoint"] - triplestore_ingraph = request.json["values"]["urn:qanary#inGraph"] - 
triplestore_outgraph = request.json["values"]["urn:qanary#outGraph"] - logging.info("endpoint: %s, inGraph: %s, outGraph: %s" % \ - (triplestore_endpoint, triplestore_ingraph, triplestore_outgraph)) +def translate_input(text: str, source_lang: str, target_lang: str): + """Translates text from a source language into a target language. - text = get_text_question_in_graph(triplestore_endpoint=triplestore_endpoint, - graph=triplestore_ingraph)[0]["text"] - question_uri = get_text_question_in_graph(triplestore_endpoint=triplestore_endpoint, - graph=triplestore_ingraph)[0]["uri"] - logging.info(f"Question text: {text}") + Parameters: + text (str): Text to be translated + source_lang (str): Language of the text + target_lang (str): Language of the translation - if SOURCE_LANG != None and len(SOURCE_LANG.strip()) > 0: - lang = SOURCE_LANG - logging.info("Using custom SOURCE_LANGUAGE") - else: - lang = detect(text) - logging.info("No SOURCE_LANGUAGE specified, using langdetect!") - logging.info(f"source language: {lang}") - - #lang, prob = langid.classify(text) - lang = detect(text) - logging.info(f"source language: {lang}") - - ## TODO: MAIN FUNCTIONALITY - result, _ = translate_input(text, lang, TARGET_LANG) - - # building SPARQL query TODO: verify this annotation AnnotationOfQuestionTranslation ?? - SPARQLqueryAnnotationOfQuestionTranslation = """ - PREFIX qa: - PREFIX oa: - PREFIX xsd: - - INSERT {{ - GRAPH <{uuid}> {{ - ?a a qa:AnnotationOfQuestionTranslation ; - oa:hasTarget <{qanary_question_uri}> ; - oa:hasBody "{translation_result}"@{target_lang} ; - oa:annotatedBy ; - oa:annotatedAt ?time . - - }} - }} - WHERE {{ - BIND (IRI(str(RAND())) AS ?a) . 
- BIND (now() as ?time) - }} - """.format( - uuid=triplestore_ingraph, - qanary_question_uri=question_uri, - translation_result=result.replace("\"", "\\\""), #keep quotation marks that are part of the translation - target_lang=TARGET_LANG, - app_name=SERVICE_NAME_COMPONENT - ) - - SPARQLqueryAnnotationOfQuestionLanguage = """ - PREFIX qa: - PREFIX oa: - PREFIX xsd: - - INSERT {{ - GRAPH <{uuid}> {{ - ?b a qa:AnnotationOfQuestionLanguage ; - oa:hasTarget <{qanary_question_uri}> ; - oa:hasBody "{src_lang}"^^xsd:string ; - oa:annotatedBy ; - oa:annotatedAt ?time . - }} - }} - WHERE {{ - BIND (IRI(str(RAND())) AS ?b) . - BIND (now() as ?time) - }} - """.format( - uuid=triplestore_ingraph, - qanary_question_uri=question_uri, - src_lang=lang, - app_name=SERVICE_NAME_COMPONENT - ) - - logging.info(f'SPARQL: {SPARQLqueryAnnotationOfQuestionTranslation}') - logging.info(f'SPARQL: {SPARQLqueryAnnotationOfQuestionLanguage}') - # inserting new data to the triplestore - insert_into_triplestore(triplestore_endpoint, SPARQLqueryAnnotationOfQuestionTranslation) - insert_into_triplestore(triplestore_endpoint, SPARQLqueryAnnotationOfQuestionLanguage) - - return jsonify(request.get_json()) - - -@mt_libretranslate_bp.route("/", methods=["GET"]) -def index(): - """examplary GET endpoint""" - - logging.info("host_url: %s" % (request.host_url)) - return "Python MT LibreTranslate Qanary component" - - -def translate_input(text, source_lang, target_lang): + Returns: + str: The translated text + """ req_json = { 'q': text, @@ -132,9 +41,148 @@ def translate_input(text, source_lang, target_lang): } response = requests.request("POST", TRANSLATE_ENDPOINT, headers=headers, data=req_json) logging.info(f"got response json: {response.json()}") - translation = response.json().get('translatedText') + result = response.json().get('translatedText') error = response.json().get('error') - return translation, error + logging.info(f"result: \"{result}\"") + translation = result.replace("\"", "\\\"") #keep 
quotation marks that are part of the translation + if error: + return "" + else: + return translation + + +def find_source_texts_in_triplestore(triplestore_endpoint: str, graph_uri: str, lang: str) -> list[QuestionTextWithLanguage]: + """Retrieves questions of a specific language from the triplestore. + + Parameters: + triplestore_endpoint (str): URL of the triplestore endpoint + graph_uri (str): URI of the graph to query inside of the triplestore + lang (str): Expected language + + Returns: + list: A list of appropriate QuestionTextWithLanguage objects with information from the triplestore. + """ + + source_texts = [] + + # check if supported languages have been determined already (LD) + # (use filters) + # if so, use the target uris to find the question text to translate + ld_source_texts = get_texts_with_detected_language_in_triplestore(triplestore_endpoint, graph_uri, lang) + source_texts.extend(ld_source_texts) + + # check if there are translations into the relevant language (MT) + # (use filters) + # if so, use the translation texts + mt_source_texts = get_translated_texts_in_triplestore(triplestore_endpoint, graph_uri, lang) + source_texts.extend(mt_source_texts) + + # TODO: what if nothing found? + if len(source_texts) == 0: + logging.warning(f"No source texts with language {lang} could be found in the triplestore!") + + return source_texts + + +@router.get("/translate_to_one", description="Translate a text from a given source language into one target language.", tags=["Translate"]) +def translate_to_one(text: str, source_lang: str, target_lang: str): + """Translates a text from a given source language into one target language. 
+ + Parameters: + text (str): Text to be translated + source_lang (str): Language of the text + target_lang (str): Language of the translation + + Returns: + dict: A dictionary of translations identified by their target language (only one in this case) + """ + + if (source_lang in translation_options.keys()) and (target_lang in translation_options.get(source_lang, [])): + translation = translate_input(text, source_lang, target_lang) + return {target_lang: translation} + else: + raise RuntimeError("Unsupported source and/or target language! Valid options: {to}".format(to=translation_options)) + + +@router.get("/translate_to_all", description="Translate a text from a given source language into all configured target languages for that source language.", tags=["Translate"]) +def translate_to_all(text: str, source_lang: str): + """Translates a text from a given source language into all configured target languages for that source language. + + Parameters: + text (str): Text to be translated + source_lang (str): Language of the text + target_lang (str): Language of the translation + + Returns: + list: A list of dictionaries, each mapping a target language to its translation + """ + + if source_lang in translation_options.keys(): + translations = list() + for target_lang in translation_options[source_lang]: + translation = translate_input(text, source_lang, target_lang) + translations.append({ + target_lang: translation + }) + return translations + else: + raise RuntimeError("Unsupported source language! 
Valid options: {to}".format(to=translation_options)) + + +@router.post("/annotatequestion", description="Standard process method for Qanary components", tags=["Qanary"]) +async def qanary_service(request: Request): + """the POST endpoint required for a Qanary service""" + + request_json = await request.json() + + triplestore_endpoint = request_json["values"]["urn:qanary#endpoint"] + triplestore_ingraph = request_json["values"]["urn:qanary#inGraph"] + triplestore_outgraph = request_json["values"]["urn:qanary#outGraph"] + logging.info("endpoint: %s, inGraph: %s, outGraph: %s" % \ + (triplestore_endpoint, triplestore_ingraph, triplestore_outgraph)) + + text_question_in_graph = get_text_question_in_graph(triplestore_endpoint=triplestore_endpoint, graph=triplestore_ingraph) + question_text = text_question_in_graph[0]['text'] + logging.info(f'Original question text: {question_text}') + + # Collect texts to be translated (group by source language) + + source_texts_per_language = dict() + # keep a list of annotations to insert + insert_annotations = list() + + for source_lang in translation_options.keys(): + source_texts = find_source_texts_in_triplestore( + triplestore_endpoint=triplestore_endpoint, + graph_uri=triplestore_ingraph, + lang=source_lang + ) + source_texts_per_language.update({source_lang: source_texts}) + + # for every source language that has associated texts + for source_lang in source_texts_per_language.keys(): + # translate each found text + for source_text in source_texts_per_language[source_lang]: + # into every target language that is supported for this source language + for target_lang in translation_options[source_lang]: + translation = translate_input(source_text.get_text(), source_lang, target_lang) + if len(translation.strip()) > 0: + SPARQLqueryAnnotationOfQuestionTranslation = create_annotation_of_question_translation( + graph_uri=triplestore_ingraph, + question_uri=source_text.get_uri(), + translation=translation, + 
translation_language=target_lang, + app_name=SERVICE_NAME_COMPONENT + ) + insert_annotations.append(SPARQLqueryAnnotationOfQuestionTranslation) + else: + logging.error(f"result is empty string!") + + # insert the created annotations into the triplestore + for insert_annotation in insert_annotations: + insert_into_triplestore(triplestore_endpoint, insert_annotation) + + return JSONResponse(request_json) def check_connection(): diff --git a/qanary-component-MT-Python-LibreTranslate/docker-compose-pairs.yml b/qanary-component-MT-Python-LibreTranslate/docker-compose-pairs.yml new file mode 100644 index 000000000..db9133a85 --- /dev/null +++ b/qanary-component-MT-Python-LibreTranslate/docker-compose-pairs.yml @@ -0,0 +1,108 @@ +version: '3' +services: + + component-en-de: + # for building from source + image: qanary/qanary-component-mt-python-libretranslate-en-de:latest + build: + context: . + dockerfile: Dockerfile + args: + - SOURCE_LANGUAGE=en + - TARGET_LANGUAGE=de + env_file: + - .env + environment: + - SERVICE_NAME_COMPONENT=MT-LibreTranslate-en-de + volumes: + - ~/.cache/huggingface:/root/.cache/huggingface/ + network_mode: host + + component-en-ru: + # for building from source + image: qanary/qanary-component-mt-python-libretranslate-en-ru:latest + build: + context: . + dockerfile: Dockerfile + args: + - SOURCE_LANGUAGE=en + - TARGET_LANGUAGE=ru + env_file: + - .env + environment: + - SERVICE_NAME_COMPONENT=MT-LibreTranslate-en-ru + volumes: + - ~/.cache/huggingface:/root/.cache/huggingface/ + network_mode: host + + + component-en-es: + # for building from source + image: qanary/qanary-component-mt-python-libretranslate-en-es:latest + build: + context: . 
+ dockerfile: Dockerfile + args: + - SOURCE_LANGUAGE=en + - TARGET_LANGUAGE=es + env_file: + - .env + environment: + - SERVICE_NAME_COMPONENT=MT-LibreTranslate-en-es + volumes: + - ~/.cache/huggingface:/root/.cache/huggingface/ + network_mode: host + + + component-de-en: + # for building from source + image: qanary/qanary-component-mt-python-libretranslate-de-en:latest + build: + context: . + dockerfile: Dockerfile + args: + - SOURCE_LANGUAGE=de + - TARGET_LANGUAGE=en + env_file: + - .env + environment: + - SERVICE_NAME_COMPONENT=MT-LibreTranslate-de-en + volumes: + - ~/.cache/huggingface:/root/.cache/huggingface/ + network_mode: host + + + component-ru-en: + # for building from source + image: qanary/qanary-component-mt-python-libretranslate-ru-en:latest + build: + context: . + dockerfile: Dockerfile + args: + - SOURCE_LANGUAGE=ru + - TARGET_LANGUAGE=en + env_file: + - .env + environment: + - SERVICE_NAME_COMPONENT=MT-LibreTranslate-ru-en + volumes: + - ~/.cache/huggingface:/root/.cache/huggingface/ + network_mode: host + + + component-es-en: + # for building from source + image: qanary/qanary-component-mt-python-libretranslate-es-en:latest + build: + context: . 
+ dockerfile: Dockerfile + args: + - SOURCE_LANGUAGE=es + - TARGET_LANGUAGE=en + env_file: + - .env + environment: + - SERVICE_NAME_COMPONENT=MT-LibreTranslate-es-en + volumes: + - ~/.cache/huggingface:/root/.cache/huggingface/ + network_mode: host diff --git a/qanary-component-MT-Python-LibreTranslate/docker-compose.yml b/qanary-component-MT-Python-LibreTranslate/docker-compose.yml index 04b43e8c7..5ddb0ee98 100644 --- a/qanary-component-MT-Python-LibreTranslate/docker-compose.yml +++ b/qanary-component-MT-Python-LibreTranslate/docker-compose.yml @@ -1,4 +1,3 @@ -version: "3.5" services: # local libretranslate server @@ -13,10 +12,11 @@ services: # component component: # for building from source - image: qanary-component-mt-python-libretranslate:latest - build: + image: qanary/qanary-component-mt-python-libretranslate:0.2.0 + build: context: . dockerfile: Dockerfile - network_mode: host env_file: - .env + environment: + - SERVICE_NAME_COMPONENT=MT-LibreTranslate diff --git a/qanary-component-MT-Python-LibreTranslate/pytest.ini b/qanary-component-MT-Python-LibreTranslate/pytest.ini index 49dd1c4f3..737bf5b2f 100644 --- a/qanary-component-MT-Python-LibreTranslate/pytest.ini +++ b/qanary-component-MT-Python-LibreTranslate/pytest.ini @@ -1,15 +1,17 @@ [pytest] -log_cli = 0 +log_cli = 1 log_cli_level = INFO log_cli_format = %(asctime)s [%(levelname)8s] [%(filename)s:%(lineno)s] %(message)s log_cli_date_format=%Y-%m-%d %H:%M:%S env = - SERVER_PORT=40120 - SPRING_BOOT_ADMIN_URL=http://qanary-pipeline-host:40111 - SERVER_HOST=http://public-component-host + SERVER_PORT=8081 + SERVER_HOST=http://localhost + SPRING_BOOT_ADMIN_URL=http://localhost:40111 + SPRING_BOOT_ADMIN_CLIENT_INSTANCE_SERVICE-BASE-URL=http://localhost:8081 SPRING_BOOT_ADMIN_USERNAME=admin SPRING_BOOT_ADMIN_PASSWORD=admin SERVICE_NAME_COMPONENT=LibreTranslate TRANSLATE_ENDPOINT=http://localhost:5000/translate LANGUAGES_ENDPOINT=http://localhost:5000/languages - SOURCE_LANGUAGE= + LIVE_TESTS_ENABLED=False + 
SERVICE_DESCRIPTION_COMPONENT=Translates question to English diff --git a/qanary-component-MT-Python-LibreTranslate/requirements.txt b/qanary-component-MT-Python-LibreTranslate/requirements.txt index 80eaccc2d..a604a81e3 100644 --- a/qanary-component-MT-Python-LibreTranslate/requirements.txt +++ b/qanary-component-MT-Python-LibreTranslate/requirements.txt @@ -1,9 +1,10 @@ -Flask -langdetect==1.0.9 -langid==1.1.6 -mock==3.0.5 -python-dotenv==0.21.1 -qanary_helpers==0.2.2 -gunicorn==20.1.0 -pytest -pytest-env +fastapi==0.109.1 +pytest==8.3.2 +pytest-env==1.1.3 +SentencePiece==0.2.0 +SPARQLWrapper==2.0.0 +torch==2.4.0 +transformers==4.44.0 +qanary-helpers==0.3.2 +uvicorn==0.30.1 +httpx==0.27.0 diff --git a/qanary-component-MT-Python-LibreTranslate/run.py b/qanary-component-MT-Python-LibreTranslate/run.py index 339ec7e1e..16177a12c 100644 --- a/qanary-component-MT-Python-LibreTranslate/run.py +++ b/qanary-component-MT-Python-LibreTranslate/run.py @@ -4,32 +4,35 @@ from qanary_helpers.registration import Registration from qanary_helpers.registrator import Registrator -from component import app, healthendpoint, aboutendpoint +from component import app, HEALTHENDPOINT, ABOUTENDPOINT -logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO) +logging.basicConfig(level=logging.ERROR) +# TODO: get logger from module +logger = logging.getLogger(__name__) +logger.setLevel(logging.WARNING) SPRING_BOOT_ADMIN_URL = os.getenv('SPRING_BOOT_ADMIN_URL') SPRING_BOOT_ADMIN_USERNAME = os.getenv('SPRING_BOOT_ADMIN_USERNAME') SPRING_BOOT_ADMIN_PASSWORD = os.getenv('SPRING_BOOT_ADMIN_PASSWORD') -SERVICE_HOST = os.getenv('SERVER_HOST') -SERVICE_PORT = os.getenv('SERVER_PORT') +SERVER_HOST = os.getenv('SERVER_HOST') +SERVER_PORT = os.getenv('SERVER_PORT') SERVICE_NAME_COMPONENT = os.getenv('SERVICE_NAME_COMPONENT') SERVICE_DESCRIPTION_COMPONENT = os.getenv('SERVICE_DESCRIPTION_COMPONENT') -URL_COMPONENT = f"http://{SERVICE_HOST}:{SERVICE_PORT}" +URL_COMPONENT = 
f"http://{SERVER_HOST}:{SERVER_PORT}" # define metadata that will be shown in the Spring Boot Admin server UI metadata = { "start": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "description": SERVICE_DESCRIPTION_COMPONENT, - "about": f"{SERVICE_HOST}:{SERVICE_PORT}{aboutendpoint}", + "about": f"{SERVER_HOST}:{SERVER_PORT}{ABOUTENDPOINT}", "written in": "Python" } # initialize the registration object, to be send to the Spring Boot Admin server registration = Registration( name=SERVICE_NAME_COMPONENT, - serviceUrl=f"{SERVICE_HOST}:{SERVICE_PORT}", - healthUrl=f"{SERVICE_HOST}:{SERVICE_PORT}{healthendpoint}", + serviceUrl=f"{SERVER_HOST}:{SERVER_PORT}", + healthUrl=f"{SERVER_HOST}:{SERVER_PORT}{HEALTHENDPOINT}", metadata=metadata ) @@ -47,7 +50,5 @@ if __name__ == "__main__": # start the web service - if SERVICE_PORT == None: - raise RuntimeError("SERVICE_PORT must not be empty!") - else: - app.run(debug=True, port=SERVICE_PORT) + if SERVER_PORT == None: + raise RuntimeError("SERVER_PORT must not be empty!") diff --git a/qanary-component-MT-Python-LibreTranslate/tests/test_lang_utils.py b/qanary-component-MT-Python-LibreTranslate/tests/test_lang_utils.py new file mode 100644 index 000000000..7e34cdf44 --- /dev/null +++ b/qanary-component-MT-Python-LibreTranslate/tests/test_lang_utils.py @@ -0,0 +1,68 @@ +import logging +from unittest import mock +from unittest import TestCase +import os +import importlib + +class TestLangUtils(TestCase): + + logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO) + + @mock.patch.dict(os.environ, {'SOURCE_LANGUAGE': 'fr'}) + def test_only_one_source_language(self): + import utils.lang_utils + importlib.reload(utils.lang_utils) + from utils.lang_utils import translation_options + assert 'fr' in translation_options.keys() + assert len(translation_options.keys()) == 1 + + + @mock.patch.dict(os.environ, {'TARGET_LANGUAGE': 'ru'}) + def test_only_one_target_language(self): + import utils.lang_utils + 
importlib.reload(utils.lang_utils) + from utils.lang_utils import translation_options + # currently, there are only two source languages that support target language 'ru' + assert len(translation_options.items()) == 4 + assert ('en', ['ru']) in translation_options.items() + assert ('de', ['ru']) in translation_options.items() + assert ('es', ['ru']) in translation_options.items() + assert ('fr', ['ru']) in translation_options.items() + + + @mock.patch.dict(os.environ, {'SOURCE_LANGUAGE': 'en', 'TARGET_LANGUAGE': 'es'}) + def test_specific_source_and_target_language(self): + import utils.lang_utils + importlib.reload(utils.lang_utils) + from utils.lang_utils import translation_options + assert translation_options == {'en': ['es']} + + + @mock.patch.dict(os.environ, {'SOURCE_LANGUAGE': 'zh'}) + def test_unsupported_source_language_raises_error(self): + try: + import utils.lang_utils + importlib.reload(utils.lang_utils) + except ValueError as ve: + logging.error(ve) + pass + + + @mock.patch.dict(os.environ, {'SOURCE_LANGUAGE': 'en', 'TARGET_LANGUAGE': 'zh'}) + def test_unsupported_target_for_source_language_raises_error(self): + try: + import utils.lang_utils + importlib.reload(utils.lang_utils) + except ValueError as ve: + logging.error(ve) + pass + + + @mock.patch.dict(os.environ, {'TARGET_LANGUAGE': 'zh'}) + def test_unsupported_target_language_raises_error(self): + try: + import utils.lang_utils + importlib.reload(utils.lang_utils) + except ValueError as ve: + logging.error(ve) + pass diff --git a/qanary-component-MT-Python-LibreTranslate/tests/test_mt_libretranslate.py b/qanary-component-MT-Python-LibreTranslate/tests/test_mt_libretranslate.py index 2bb6dcfbf..a858fcfc3 100644 --- a/qanary-component-MT-Python-LibreTranslate/tests/test_mt_libretranslate.py +++ b/qanary-component-MT-Python-LibreTranslate/tests/test_mt_libretranslate.py @@ -1,13 +1,23 @@ -from component.mt_libretranslate import * +import logging from component import app +from fastapi.testclient 
import TestClient +import pytest from unittest.mock import patch +from unittest import mock import re from unittest import TestCase +from qanary_helpers.language_queries import QuestionTextWithLanguage +import os +import importlib + + +LIVE_TESTS_ENABLED = os.getenv("LIVE_TESTS_ENABLED", "False").lower() in ('true', '1', 't') class TestComponent(TestCase): logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO) + logging.info(LIVE_TESTS_ENABLED) questions = list([{"uri": "urn:test-uri", "text": "was ist ein Test?"}]) endpoint = "urn:qanary#test-endpoint" @@ -19,6 +29,10 @@ class TestComponent(TestCase): test_translation_placeholder = "test_translation" + source_texts = [ + QuestionTextWithLanguage("uri", "Was ist die Hauptstadt von Deutschland?", "de") + ] + request_data = '''{ "values": { "urn:qanary#endpoint": "urn:qanary#test-endpoint", @@ -34,44 +48,96 @@ class TestComponent(TestCase): "Content-Type": "application/json" } + client = TestClient(app) - def test_qanary_service(self): + @pytest.mark.skipif(not LIVE_TESTS_ENABLED, reason="Live tests using a LibreTranslate endpoint are disabled.") + @mock.patch.dict(os.environ, {'SOURCE_LANGUAGE': 'de', 'TARGET_LANGUAGE': 'en'}) + def test_qanary_service_live(self): + import utils.lang_utils + importlib.reload(utils.lang_utils) + import component.mt_libretranslate + importlib.reload(component.mt_libretranslate) + from component import app - with app.test_client() as client, \ - patch('component.mt_libretranslate.get_text_question_in_graph') as mocked_get_text_question_in_graph, \ - patch('component.mt_libretranslate.insert_into_triplestore') as mocked_insert_into_triplestore, \ - patch('component.mt_libretranslate.translate_input') as mocked_translate_input: + with patch('component.mt_libretranslate.get_text_question_in_graph') as mocked_get_text_question_in_graph, \ + patch('component.mt_libretranslate.find_source_texts_in_triplestore') as mocked_find_source_texts_in_triplestore, \ + 
patch('component.mt_libretranslate.insert_into_triplestore') as mocked_insert_into_triplestore: # given a non-english question is present in the current graph mocked_get_text_question_in_graph.return_value = self.questions + mocked_find_source_texts_in_triplestore.return_value = self.source_texts mocked_insert_into_triplestore.return_value = None - mocked_translate_input.return_value = self.test_translation_placeholder, None # when a call to /annotatequestion is made - response_json = client.post("/annotatequestion", headers = self.headers, data = self.request_data) + response_json = self.client.post("/annotatequestion", headers = self.headers, data = self.request_data) # then the text question is retrieved from the triplestore mocked_get_text_question_in_graph.assert_called_with(triplestore_endpoint=self.endpoint, graph=self.in_graph) + mocked_find_source_texts_in_triplestore.assert_called_with(triplestore_endpoint=self.endpoint, graph_uri=self.in_graph, lang=self.source_language) + assert mocked_find_source_texts_in_triplestore.call_count == 1 + # get arguments of the (2) separate insert calls arg_list = mocked_insert_into_triplestore.call_args_list # get the call arguments for question translation call_args_translation = [a.args for a in arg_list if "AnnotationOfQuestionTranslation" in a.args[1]] assert len(call_args_translation) == 1 - # get the call arguments for question language - call_args_language = [a.args for a in arg_list if "AnnotationOfQuestionLanguage" in a.args[1]] - assert len(call_args_language) == 1 # clean query strings query_translation = re.sub(r"(\\n\W*|\n\W*)", " ", call_args_translation[0][1]) - query_language = re.sub(r"(\\n\W*|\n\W*)", " ", call_args_language[0][1]) # then the triplestore is updated twice # (question language and translation) - assert mocked_insert_into_triplestore.call_count == 2 + assert mocked_insert_into_triplestore.call_count == 1 + + # then the question is translated and the result is annotated + 
self.assertRegex(query_translation, r".*AnnotationOfQuestionTranslation(.*;\W?)*oa:hasBody \".*\"@" + self.target_language + r".*\.") + assert "@"+self.target_language in query_translation.lower() + + # then the response is not empty + assert response_json != None + + + @mock.patch.dict(os.environ, {'SOURCE_LANGUAGE': 'de', 'TARGET_LANGUAGE': 'en'}) + def test_qanary_service(self): + import utils.lang_utils + importlib.reload(utils.lang_utils) + import component.mt_libretranslate + importlib.reload(component.mt_libretranslate) + from component import app - # then the source language is correctly identified and annotated - self.assertRegex(query_language, r".*AnnotationOfQuestionLanguage(.*;\W?)*oa:hasBody \""+self.source_language+r"\".*\.") + with patch('component.mt_libretranslate.translate_input') as mocked_translate_input, \ + patch('component.mt_libretranslate.get_text_question_in_graph') as mocked_get_text_question_in_graph, \ + patch('component.mt_libretranslate.find_source_texts_in_triplestore') as mocked_find_source_texts_in_triplestore, \ + patch('component.mt_libretranslate.insert_into_triplestore') as mocked_insert_into_triplestore: + + # given a non-english question is present in the current graph + mocked_get_text_question_in_graph.return_value = self.questions + mocked_find_source_texts_in_triplestore.return_value = self.source_texts + mocked_insert_into_triplestore.return_value = None + mocked_translate_input.return_value = self.test_translation_placeholder + + # when a call to /annotatequestion is made + response_json = self.client.post("/annotatequestion", headers = self.headers, data = self.request_data) + + # then the text question is retrieved from the triplestore + mocked_get_text_question_in_graph.assert_called_with(triplestore_endpoint=self.endpoint, graph=self.in_graph) + + mocked_find_source_texts_in_triplestore.assert_called_with(triplestore_endpoint=self.endpoint, graph_uri=self.in_graph, lang=self.source_language) + assert 
mocked_find_source_texts_in_triplestore.call_count == 1 + + # get arguments of the (2) separate insert calls + arg_list = mocked_insert_into_triplestore.call_args_list + # get the call arguments for question translation + call_args_translation = [a.args for a in arg_list if "AnnotationOfQuestionTranslation" in a.args[1]] + assert len(call_args_translation) == 1 + + # clean query strings + query_translation = re.sub(r"(\\n\W*|\n\W*)", " ", call_args_translation[0][1]) + + # then the triplestore is updated twice + # (question language and translation) + assert mocked_insert_into_triplestore.call_count == 1 # then the question is translated and the result is annotated self.assertRegex(query_translation, r".*AnnotationOfQuestionTranslation(.*;\W?)*oa:hasBody \".*\"@" + self.target_language + r".*\.") @@ -79,3 +145,29 @@ def test_qanary_service(self): # then the response is not empty assert response_json != None + + + # test with all supported languages enabled + @pytest.mark.skipif(not LIVE_TESTS_ENABLED, reason="Live tests using a LibreTranslate endpoint are disabled.") + def test_translate_input(self): + import component.mt_libretranslate + from component.mt_libretranslate import translate_input + import utils.lang_utils + importlib.reload(utils.lang_utils) + importlib.reload(component.mt_libretranslate) + translations = [ + {"text": "Was ist die Hauptstadt von Deutschland?", + "translation": "What is the capital of Germany?", + "source_lang": "de", "target_lang": "en"}, + {"text": "What is the capital of Germany?", + "translation": "Quelle est la capitale de l'Allemagne ?", + "source_lang": "en", "target_lang": "fr"}, + {"text": "What is the capital of Germany?", + "translation": "Какая столица Германии?", + "source_lang": "en", "target_lang": "ru"}, + ] + + for translation in translations: + expected = translation["translation"] + actual = translate_input(translation["text"], translation["source_lang"], translation["target_lang"]) + assert expected == actual diff 
--git a/qanary-component-MT-Python-LibreTranslate/utils/__init__.py b/qanary-component-MT-Python-LibreTranslate/utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/qanary-component-MT-Python-LibreTranslate/utils/lang_utils.py b/qanary-component-MT-Python-LibreTranslate/utils/lang_utils.py new file mode 100644 index 000000000..3e71a605d --- /dev/null +++ b/qanary-component-MT-Python-LibreTranslate/utils/lang_utils.py @@ -0,0 +1,63 @@ +import os +import logging + + +logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO) + +SOURCE_LANGUAGE = os.getenv("SOURCE_LANGUAGE") +TARGET_LANGUAGE = os.getenv("TARGET_LANGUAGE") +SUPPORTED_LANGS = { +# source: targets + 'en': ['de', 'fr', 'ru', 'es'], + 'de': ['en', 'fr', 'ru', 'es'], + 'fr': ['en', 'de', 'ru', 'es'], + 'ru': ['en', 'de', 'fr', 'es'], + 'es': ['en', 'de', 'fr', 'ru'] +} + + +def setup_translation_options() -> dict: + """Creates a dictionary of possible source and target languages, based on SUPPORTED_LANGS and configured languages.""" + + logging.info("SETTING UP TRANSLATION OPTIONS") + translation_options = dict() # init emtpy + + # check if a source language is specified + if SOURCE_LANGUAGE != None and len(SOURCE_LANGUAGE.strip()) > 0: + # pre-select appropriate translation options from the list of supported source languages + try: + translation_options[SOURCE_LANGUAGE] = SUPPORTED_LANGS[SOURCE_LANGUAGE] + # this will fail for invalid keys! 
+ except KeyError: + raise ValueError(f"The source language \"{SOURCE_LANGUAGE}\" is not supported!") + # if no source language is specified, use all source languages that are supported by the models + else: + translation_options = SUPPORTED_LANGS + + # check if a target language is specified + if TARGET_LANGUAGE != None and len(TARGET_LANGUAGE.strip()) > 0: + discard_keys = list() + # remove instances where source == target + translation_options.pop(TARGET_LANGUAGE, None) + for source_lang in translation_options.keys(): + if TARGET_LANGUAGE in translation_options[source_lang]: + translation_options[source_lang] = [TARGET_LANGUAGE] + else: + discard_keys.append(source_lang) + # cleanup keys + translation_options = {sl:tl for sl,tl in translation_options.items() if sl not in discard_keys} + # check for empty translation options, if all keys dropped + if len(translation_options.keys()) == 0: + raise ValueError("The target language \"{tl}\" is not supported for any configured source languages! \nValid language pairs (source: [targets]) are: \n{slk}!" + .format(tl=TARGET_LANGUAGE, slk=SUPPORTED_LANGS)) + # check if only some keys dropped + elif len(discard_keys) > 0: + logging.warning("Specific target language \"{tl}\" is not supported for these source languages: {dk}!. \nThese language pairs will be ignored." 
+ .format(tl=TARGET_LANGUAGE, dk=discard_keys)) + # else do nothing, the lists are already complete + + logging.info(translation_options) + return translation_options + + +translation_options = setup_translation_options() diff --git a/qanary-component-MT-Python-MBart/Dockerfile b/qanary-component-MT-Python-MBart/Dockerfile index 61eb53cca..b1adee742 100644 --- a/qanary-component-MT-Python-MBart/Dockerfile +++ b/qanary-component-MT-Python-MBart/Dockerfile @@ -1,14 +1,21 @@ -FROM python:3.7 +FROM python:3.10 COPY requirements.txt ./ RUN pip install --upgrade pip -RUN pip install -r requirements.txt; exit 0 -RUN pip install gunicorn +RUN pip install -r requirements.txt COPY component component +COPY utils utils COPY run.py boot.sh ./ +# to allow preconfigured images +ARG SOURCE_LANGUAGE +ARG TARGET_LANGUAGE + +ENV SOURCE_LANGUAGE=$SOURCE_LANGUAGE +ENV TARGET_LANGUAGE=$TARGET_LANGUAGE + RUN chmod +x boot.sh ENTRYPOINT ["./boot.sh"] diff --git a/qanary-component-MT-Python-MBart/README.md b/qanary-component-MT-Python-MBart/README.md index a1a1bd366..63dc67e19 100644 --- a/qanary-component-MT-Python-MBart/README.md +++ b/qanary-component-MT-Python-MBart/README.md @@ -22,12 +22,6 @@ Not applicable as the textual question is a default parameter oa:hasBody "translation_result"@en ; oa:annotatedBy ; oa:annotatedAt "2001-10-26T21:32:52"^^xsd:dateTime . - - a qa:AnnotationOfQuestionLanguage . - oa:hasTarget ; - oa:hasBody "lang-id"^^xsd:string ; - oa:annotatedBy ; - oa:annotatedAt "2001-10-26T21:32:52"^^xsd:dateTime . 
``` ## Usage @@ -69,8 +63,8 @@ The parameters description: * `SPRING_BOOT_ADMIN_CLIENT_INSTANCE_SERVICE-BASE-URL` -- the URL of your Qanary component (has to be visible to the Qanary pipeline) * `SERVICE_NAME_COMPONENT` -- the name of your Qanary component (for better identification) * `SERVICE_DESCRIPTION_COMPONENT` -- the description of your Qanary component -* `SOURCE_LANGUAGE` -- (optional) the source language of the text (the component will use langdetect if no source language is given) -* `TARGET_LANGUAGE` -- the language that the text should be translated to +* `SOURCE_LANGUAGE` -- (optional) the default source language of the translation +* `TARGET_LANGUAGE` -- (optional) the default target language of the translation 4. Build the Docker image: @@ -84,18 +78,43 @@ docker-compose build docker-compose up ``` -After execution, component creates Qanary annotation in the Qanary triplestore: +After successful execution, component creates Qanary annotation in the Qanary triplestore: ``` GRAPH { - ?a a qa:AnnotationOfQuestionLanguage . - ?a qa:translationResult "translation result" . - ?a qa:sourceLanguage "ISO_639-1 language code" . - ?a oa:annotatedBy . - ?a oa:annotatedAt ?time . - } + ?a a qa:AnnotationOfQuestionTranslation . + ?a oa:hasTarget . + ?a oa:hasBody "translation_result"@ISO_639-1 language code + ?a oa:annotatedBy . + ?a oa:annotatedAt ?time . } ``` +### Support for multiple Source and Target Languages + +This component relies on the presence of one or more existing annotations that associate a question text with a language. +This can be in the form of an `AnnotationOfQuestionLanguage`, as created by LD components, or an `AnnotationOfQuestionTranslation` as created by MT components. + +It supports multiple combinations of source and target languages. +You can specify a desired source and target language independently, or simply use all available language pairings. 
+ +If a `SOURCE_LANGUAGE` is set, then only texts with this specific language are considered for translation. +If none is set, then all configured source languages will be used to find candidates for translation. + +Similarly, if a `TARGET_LANGUAGE` is set, then texts are only translated into that language. +If none is set, then the texts are translated into all target languages that are supported for their respective source language. + +Note that while configured source languages naturally determine the possible target languages, +the configured target languages also determine which source languages can be supported! + +### Pre-configured Docker Images + +You may use the included file `docker-compose-pairs.yml` to build a list of images that are preconfigured for specific language pairs. +Note that if you intend to use these containers at the same time, you need to assign different `SERVER_PORT` values for each image. + +```bash +docker-compose -f docker-compose-pairs.yml build +``` + +## How To Test This Component This component uses the [pytest](https://docs.pytest.org/). 
diff --git a/qanary-component-MT-Python-MBart/boot.sh b/qanary-component-MT-Python-MBart/boot.sh index 8ef76b030..65fe4e9b8 100755 --- a/qanary-component-MT-Python-MBart/boot.sh +++ b/qanary-component-MT-Python-MBart/boot.sh @@ -1,15 +1,33 @@ -#!/bin/sh +#!/bin/bash +export $(grep -v "^#" < .env) -export $(grep -v '^#' .env | xargs) +# check required parameters +declare -a required_vars=( +"SPRING_BOOT_ADMIN_URL" +"SERVER_HOST" +"SERVER_PORT" +"SPRING_BOOT_ADMIN_USERNAME" +"SPRING_BOOT_ADMIN_PASSWORD" +"SERVICE_NAME_COMPONENT" +"SERVICE_DESCRIPTION_COMPONENT" +) -echo Downloading the model -python -c 'from transformers import MBartForConditionalGeneration, MBart50TokenizerFast; model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt"); tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")' -echo Downloading the model finished +for param in ${required_vars[@]}; +do + if [[ -z ${!param} ]]; then + echo "Required variable \"$param\" is not set!" 
+ echo "The required variables are: ${required_vars[@]}" + exit 4 + fi +done + +echo Downloading the models -echo SERVER_PORT: $SERVER_PORT -echo Qanary pipeline at SPRING_BOOT_ADMIN_URL: $SPRING_BOOT_ADMIN_URL +python -c "from utils.model_utils import load_models_and_tokenizers; load_models_and_tokenizers(); " + +echo Downloading the model finished -if [ -n $SERVER_PORT ] -then - exec gunicorn -b :$SERVER_PORT --access-logfile - --error-logfile - run:app # refer to the gunicorn documentation for more options -fi +echo The port number is: $SERVER_PORT +echo The host is: $SERVER_HOST +echo The Qanary pipeline URL is: $SPRING_BOOT_ADMIN_URL +exec uvicorn run:app --host 0.0.0.0 --port $SERVER_PORT --log-level warning diff --git a/qanary-component-MT-Python-MBart/component/__init__.py b/qanary-component-MT-Python-MBart/component/__init__.py index 40da0f2f5..679a96e9c 100644 --- a/qanary-component-MT-Python-MBart/component/__init__.py +++ b/qanary-component-MT-Python-MBart/component/__init__.py @@ -1,26 +1,32 @@ -from component.mt_mbart_nlp import mt_mbart_nlp_bp -from flask import Flask +from component import mt_mbart_nlp +from fastapi import FastAPI +from fastapi.responses import RedirectResponse, Response -version = "0.1.2" +version = "0.2.0" # default config file configfile = "app.conf" # service status information -healthendpoint = "/health" - -aboutendpoint = "/about" +HEALTHENDPOINT = "/health" +ABOUTENDPOINT = "/about" +# TODO: add languages endpoint? 
# init Flask app and add externalized service information -app = Flask(__name__) -app.register_blueprint(mt_mbart_nlp_bp) +app = FastAPI(docs_url="/swagger-ui.html") +app.include_router(mt_mbart_nlp.router) + + +@app.get("/") +async def main(): + return RedirectResponse("/about") -@app.route(healthendpoint, methods=["GET"]) +@app.get(HEALTHENDPOINT, description="Shows the status of the component") def health(): """required health endpoint for callback of Spring Boot Admin server""" - return "alive" + return Response("alive", media_type="text/plain") -@app.route(aboutendpoint, methods=["GET"]) +@app.get(ABOUTENDPOINT, description="Shows a description of the component") def about(): """required about endpoint for callback of Srping Boot Admin server""" - return "about" # TODO: replace this with a service description from configuration + return Response("Translates questions into English", media_type="text/plain") diff --git a/qanary-component-MT-Python-MBart/component/mt_mbart_nlp.py b/qanary-component-MT-Python-MBart/component/mt_mbart_nlp.py index 9e4ad5520..846c5bd86 100644 --- a/qanary-component-MT-Python-MBart/component/mt_mbart_nlp.py +++ b/qanary-component-MT-Python-MBart/component/mt_mbart_nlp.py @@ -1,60 +1,38 @@ -from langdetect import detect import logging import os -from flask import Blueprint, jsonify, request from qanary_helpers.qanary_queries import get_text_question_in_graph, insert_into_triplestore +from qanary_helpers.language_queries import get_translated_texts_in_triplestore, get_texts_with_detected_language_in_triplestore, QuestionTextWithLanguage, create_annotation_of_question_translation +from utils.model_utils import load_models_and_tokenizers +from utils.lang_utils import translation_options, LANG_CODE_MAP +from fastapi import APIRouter, Request +from fastapi.responses import JSONResponse -from transformers import MBartForConditionalGeneration, MBart50TokenizerFast logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.INFO) - 
-mt_mbart_nlp_bp = Blueprint("mt_mbart_nlp_bp", __name__, template_folder="templates") +router = APIRouter() SERVICE_NAME_COMPONENT = os.environ["SERVICE_NAME_COMPONENT"] -SOURCE_LANG = os.environ["SOURCE_LANGUAGE"] -TARGET_LANG = os.environ["TARGET_LANGUAGE"] - -model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt") -tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt") -lang_code_map = { - "en": "en_XX", - "de": "de_DE", - "ru": "ru_RU", - "fr": "fr_XX", - "es": "ex_XX", - "pt": "pr_XX" -} -target_lang = TARGET_LANG - -supported_langs = lang_code_map.keys() # TODO: check supported languages for LibreTranslate - - -@mt_mbart_nlp_bp.route("/annotatequestion", methods=["POST"]) -def qanary_service(): - """the POST endpoint required for a Qanary service""" +TRANSLATEENDPOINT = "/translate" - triplestore_endpoint = request.json["values"]["urn:qanary#endpoint"] - triplestore_ingraph = request.json["values"]["urn:qanary#inGraph"] - triplestore_outgraph = request.json["values"]["urn:qanary#outGraph"] - logging.info("endpoint: %s, inGraph: %s, outGraph: %s" % \ - (triplestore_endpoint, triplestore_ingraph, triplestore_outgraph)) +model, tokenizer = load_models_and_tokenizers() - text = get_text_question_in_graph(triplestore_endpoint=triplestore_endpoint, - graph=triplestore_ingraph)[0]["text"] - question_uri = get_text_question_in_graph(triplestore_endpoint=triplestore_endpoint, - graph=triplestore_ingraph)[0]["uri"] - logging.info(f"Question text: {text}") - if SOURCE_LANG != None and len(SOURCE_LANG.strip()) > 0: - lang = SOURCE_LANG - logging.info("Using custom SOURCE_LANGUAGE") - else: - lang = detect(text) - logging.info("No SOURCE_LANGUAGE specified, using langdetect!") - logging.info(f"source language: {lang}") +def translate_input(text:str, source_lang: str, target_lang: str) -> str: + """Translates text from a source language into a target language. 
+ + Parameters: + text (str): Text to be translated + source_lang (str): Language of the text + target_lang (str): Language of the translation + + Returns: + str: The translated text + """ + + logging.info(f"translating \"{text}\" from \"{source_lang}\" to \"{target_lang}\"") ## MAIN FUNCTIONALITY - tokenizer.src_lang = lang_code_map[lang] # TODO: do formats match? + tokenizer.src_lang = LANG_CODE_MAP[source_lang] # TODO: do formats match? logging.info(f"source language mapped code: {tokenizer.src_lang}") batch = tokenizer(text, return_tensors="pt") @@ -66,76 +44,142 @@ def qanary_service(): # Perform the translation and decode the output generated_tokens = model.generate( **batch, - forced_bos_token_id=tokenizer.lang_code_to_id[lang_code_map[target_lang]]) # TODO: defined target lang + forced_bos_token_id=tokenizer.convert_tokens_to_ids(LANG_CODE_MAP[target_lang])) result = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0] + translation = result.replace("\"", "\\\"") #keep quotation marks that are part of the translation + logging.info(f"result: \"{translation}\"") + return translation + + +@router.get("/translate_to_one", description="Translate a text from a given source language into one target language.", tags=["Translate"]) +def translate_to_one(text: str, source_lang: str, target_lang: str): + """Translates a text from a given source language into one target language. + Parameters: + text (str): Text to be translated + source_lang (str): Language of the text + target_lang (str): Language of the translation + Returns: + dict: A dictionary of translations identified by their target language (only one in this case) + """ + + if (source_lang in translation_options.keys()) and (target_lang in translation_options.get(source_lang, [])): + translation = translate_input(text, source_lang, target_lang) + return {target_lang: translation} + else: + raise RuntimeError("Unsupported source and/or target language! 
Valid options: {to}".format(to=translation_options)) + + +@router.get("/translate_to_all", description="Translate a text from a given source language into all configured target languages for that source language.", tags=["Translate"]) +def translate_to_all(text: str, source_lang: str): + """Translates a text from a given source language into all target configured languages for that source language. + + Parameters: + text (str): Text to be translated + source_lang (str): Language of the text + target_lang (str): Language of the translation + + Returns: + dict: A dictionary of translations identified by their target language + """ + + if source_lang in translation_options.keys(): + translations = list() + for target_lang in translation_options[source_lang]: + translation = translate_input(text, source_lang, target_lang) + translations.append({ + target_lang: translation + }) + return translations + else: + raise RuntimeError("Unsupported source language! Valid options: {to}".format(to=translation_options)) + + +@router.post("/annotatequestion", description="Standard process method for Qanary components", tags=["Qanary"]) +async def qanary_service(request: Request): + """the POST endpoint required for a Qanary service""" + + request_json = await request.json() + + triplestore_endpoint = request_json["values"]["urn:qanary#endpoint"] + triplestore_ingraph = request_json["values"]["urn:qanary#inGraph"] + triplestore_outgraph = request_json["values"]["urn:qanary#outGraph"] + logging.info("endpoint: %s, inGraph: %s, outGraph: %s" % \ + (triplestore_endpoint, triplestore_ingraph, triplestore_outgraph)) - # building SPARQL query TODO: verify this annotation AnnotationOfQuestionTranslation ?? 
- SPARQLqueryAnnotationOfQuestionTranslation = """ - PREFIX qa: - PREFIX oa: - PREFIX xsd: - - INSERT {{ - GRAPH <{uuid}> {{ - ?a a qa:AnnotationOfQuestionTranslation ; - oa:hasTarget <{qanary_question_uri}> ; - oa:hasBody "{translation_result}"@{target_lang} ; - oa:annotatedBy ; - oa:annotatedAt ?time . - - }} - }} - WHERE {{ - BIND (IRI(str(RAND())) AS ?a) . - BIND (now() as ?time) - }} - """.format( - uuid=triplestore_ingraph, - qanary_question_uri=question_uri, - translation_result=result.replace("\"", "\\\""), #keep quotation marks that are part of the translation - target_lang=TARGET_LANG, - app_name=SERVICE_NAME_COMPONENT - ) - - SPARQLqueryAnnotationOfQuestionLanguage = """ - PREFIX qa: - PREFIX oa: - PREFIX xsd: - - INSERT {{ - GRAPH <{uuid}> {{ - ?b a qa:AnnotationOfQuestionLanguage ; - oa:hasTarget <{qanary_question_uri}> ; - oa:hasBody "{src_lang}"^^xsd:string ; - oa:annotatedBy ; - oa:annotatedAt ?time . - }} - }} - WHERE {{ - BIND (IRI(str(RAND())) AS ?b) . - BIND (now() as ?time) - }} - """.format( - uuid=triplestore_ingraph, - qanary_question_uri=question_uri, - src_lang=lang, - app_name=SERVICE_NAME_COMPONENT - ) - - logging.info(f'SPARQL: {SPARQLqueryAnnotationOfQuestionTranslation}') - logging.info(f'SPARQL: {SPARQLqueryAnnotationOfQuestionLanguage}') - # inserting new data to the triplestore - insert_into_triplestore(triplestore_endpoint, SPARQLqueryAnnotationOfQuestionTranslation) - insert_into_triplestore(triplestore_endpoint, SPARQLqueryAnnotationOfQuestionLanguage) - - return jsonify(request.get_json()) - - -@mt_mbart_nlp_bp.route("/", methods=["GET"]) -def index(): - """examplary GET endpoint""" - - logging.info("host_url: %s" % (request.host_url)) - return "Python MT MBart Qanary component" + text_question_in_graph = get_text_question_in_graph(triplestore_endpoint=triplestore_endpoint, graph=triplestore_ingraph) + question_text = text_question_in_graph[0]['text'] + logging.info(f'Original question text: {question_text}') + + # Collect 
texts to be translated (group by source language) + + source_texts_per_language = dict() + # keep a list of annotations to insert + insert_annotations = list() + + for source_lang in translation_options.keys(): + source_texts = find_source_texts_in_triplestore( + triplestore_endpoint=triplestore_endpoint, + graph_uri=triplestore_ingraph, + lang=source_lang + ) + source_texts_per_language.update({source_lang: source_texts}) + + # for every source language that has associated texts + for source_lang in source_texts_per_language.keys(): + # translate each found text + for source_text in source_texts_per_language[source_lang]: + # into every target language that is supported for this source language + for target_lang in translation_options[source_lang]: + translation = translate_input(source_text.get_text(), source_lang, target_lang) + if len(translation.strip()) > 0: + SPARQLqueryAnnotationOfQuestionTranslation = create_annotation_of_question_translation( + graph_uri=triplestore_ingraph, + question_uri=source_text.get_uri(), + translation=translation, + translation_language=target_lang, + app_name=SERVICE_NAME_COMPONENT + ) + insert_annotations.append(SPARQLqueryAnnotationOfQuestionTranslation) + else: + logging.error(f"result is empty string!") + + # insert the created annotations into the triplestore + for insert_annotation in insert_annotations: + insert_into_triplestore(triplestore_endpoint, insert_annotation) + + return JSONResponse(request_json) + + +def find_source_texts_in_triplestore(triplestore_endpoint: str, graph_uri: str, lang: str) -> list[QuestionTextWithLanguage]: + """Retrieves questions of a specific language from the triplestore. + + Parameters: + triplestore_endpoint (str): URL of the triplestore endpoint + graph_uri (str): URI of the graph to query inside of the triplestore + lang (str): Expected language + + Returns: + list: A list of appropriate QuestionTextWithLanguage objects with information from the triplestore. 
+ """ + + source_texts = [] + + # check if supported languages have been determined already (LD) + # (use filters) + # if so, use the target uris to find the question text to translate + ld_source_texts = get_texts_with_detected_language_in_triplestore(triplestore_endpoint, graph_uri, lang) + source_texts.extend(ld_source_texts) + + # check if there are translations into the relevant language (MT) + # (use filters) + # if so, use the translation texts + mt_source_texts = get_translated_texts_in_triplestore(triplestore_endpoint, graph_uri, lang) + source_texts.extend(mt_source_texts) + + # TODO: what if nothing found? + if len(source_texts) == 0: + logging.warning(f"No source texts with language {lang} could be found In the triplestore!") + + return source_texts diff --git a/qanary-component-MT-Python-MBart/docker-compose-pairs.yml b/qanary-component-MT-Python-MBart/docker-compose-pairs.yml new file mode 100644 index 000000000..42f423de8 --- /dev/null +++ b/qanary-component-MT-Python-MBart/docker-compose-pairs.yml @@ -0,0 +1,108 @@ +version: '3' +services: + + component-en-de: + # for building from source + image: qanary/qanary-component-mt-python-mbart-en-de:latest + build: + context: . + dockerfile: Dockerfile + args: + - SOURCE_LANGUAGE=en + - TARGET_LANGUAGE=de + env_file: + - .env + environment: + - SERVICE_NAME_COMPONENT=MT-Mbart-en-de + volumes: + - ~/.cache/huggingface:/root/.cache/huggingface/ + network_mode: host + + component-en-ru: + # for building from source + image: qanary/qanary-component-mt-python-mbart-en-ru:latest + build: + context: . + dockerfile: Dockerfile + args: + - SOURCE_LANGUAGE=en + - TARGET_LANGUAGE=ru + env_file: + - .env + environment: + - SERVICE_NAME_COMPONENT=MT-Mbart-en-ru + volumes: + - ~/.cache/huggingface:/root/.cache/huggingface/ + network_mode: host + + + component-en-es: + # for building from source + image: qanary/qanary-component-mt-python-mbart-en-es:latest + build: + context: . 
+ dockerfile: Dockerfile + args: + - SOURCE_LANGUAGE=en + - TARGET_LANGUAGE=es + env_file: + - .env + environment: + - SERVICE_NAME_COMPONENT=MT-Mbart-en-es + volumes: + - ~/.cache/huggingface:/root/.cache/huggingface/ + network_mode: host + + + component-de-en: + # for building from source + image: qanary/qanary-component-mt-python-mbart-de-en:latest + build: + context: . + dockerfile: Dockerfile + args: + - SOURCE_LANGUAGE=de + - TARGET_LANGUAGE=en + env_file: + - .env + environment: + - SERVICE_NAME_COMPONENT=MT-Mbart-de-en + volumes: + - ~/.cache/huggingface:/root/.cache/huggingface/ + network_mode: host + + + component-ru-en: + # for building from source + image: qanary/qanary-component-mt-python-mbart-ru-en:latest + build: + context: . + dockerfile: Dockerfile + args: + - SOURCE_LANGUAGE=ru + - TARGET_LANGUAGE=en + env_file: + - .env + environment: + - SERVICE_NAME_COMPONENT=MT-Mbart-ru-en + volumes: + - ~/.cache/huggingface:/root/.cache/huggingface/ + network_mode: host + + + component-es-en: + # for building from source + image: qanary/qanary-component-mt-python-mbart-es-en:latest + build: + context: . + dockerfile: Dockerfile + args: + - SOURCE_LANGUAGE=es + - TARGET_LANGUAGE=en + env_file: + - .env + environment: + - SERVICE_NAME_COMPONENT=MT-Mbart-es-en + volumes: + - ~/.cache/huggingface:/root/.cache/huggingface/ + network_mode: host diff --git a/qanary-component-MT-Python-MBart/docker-compose.yml b/qanary-component-MT-Python-MBart/docker-compose.yml index 0c5345d01..b03ad6391 100644 --- a/qanary-component-MT-Python-MBart/docker-compose.yml +++ b/qanary-component-MT-Python-MBart/docker-compose.yml @@ -2,12 +2,14 @@ version: '3' services: component: # for building from source - image: qanary-component-mt-python-mbart:latest + image: qanary/qanary-component-mt-python-mbart:0.2.0 build: context: . 
dockerfile: Dockerfile - network_mode: host env_file: - .env + environment: + - SERVICE_NAME_COMPONENT=MT-MBart volumes: - ~/.cache/huggingface:/root/.cache/huggingface/ + network_mode: host diff --git a/qanary-component-MT-Python-MBart/pytest.ini b/qanary-component-MT-Python-MBart/pytest.ini index a51bc97cf..9a13e811f 100644 --- a/qanary-component-MT-Python-MBart/pytest.ini +++ b/qanary-component-MT-Python-MBart/pytest.ini @@ -1,16 +1,14 @@ [pytest] -log_cli = 0 +log_cli = 1 log_cli_level = INFO log_cli_format = %(asctime)s [%(levelname)8s] [%(filename)s:%(lineno)s] %(message)s log_cli_date_format=%Y-%m-%d %H:%M:%S env = - SERVER_PORT=40120 - SPRING_BOOT_ADMIN_URL=http://qanary-pipeline-host:40111 - SERVER_HOST=http://public-component-host - SPRING_BOOT_ADMIN_CLIENT_INSTANCE_SERVICE-BASE-URL=http://public-component-host:40120 + SERVER_PORT=8081 + SERVER_HOST=http://localhost + SPRING_BOOT_ADMIN_URL=http://localhost:40111 + SPRING_BOOT_ADMIN_CLIENT_INSTANCE_SERVICE-BASE-URL=http://localhost:8081 SPRING_BOOT_ADMIN_USERNAME=admin SPRING_BOOT_ADMIN_PASSWORD=admin - SERVICE_NAME_COMPONENT=MT-MBart + SERVICE_NAME_COMPONENT=MT-MBart-Component SERVICE_DESCRIPTION_COMPONENT=Translates question to English - SOURCE_LANGUAGE=de - TARGET_LANGUAGE=en diff --git a/qanary-component-MT-Python-MBart/requirements.txt b/qanary-component-MT-Python-MBart/requirements.txt index 7e37256a2..a604a81e3 100644 --- a/qanary-component-MT-Python-MBart/requirements.txt +++ b/qanary-component-MT-Python-MBart/requirements.txt @@ -1,13 +1,10 @@ -Flask -langdetect==1.0.9 -langid==1.1.6 -mock==3.0.5 -python-dotenv==0.21.1 -qanary_helpers==0.2.2 -transformers==4.41.0 -sentencepiece==0.1.97 -torch==2.3.0 -gunicorn==20.1.0 -protobuf==3.20.* -pytest -pytest-env +fastapi==0.109.1 +pytest==8.3.2 +pytest-env==1.1.3 +SentencePiece==0.2.0 +SPARQLWrapper==2.0.0 +torch==2.4.0 +transformers==4.44.0 +qanary-helpers==0.3.2 +uvicorn==0.30.1 +httpx==0.27.0 diff --git a/qanary-component-MT-Python-MBart/run.py 
b/qanary-component-MT-Python-MBart/run.py index 339ec7e1e..16177a12c 100644 --- a/qanary-component-MT-Python-MBart/run.py +++ b/qanary-component-MT-Python-MBart/run.py @@ -4,32 +4,35 @@ from qanary_helpers.registration import Registration from qanary_helpers.registrator import Registrator -from component import app, healthendpoint, aboutendpoint +from component import app, HEALTHENDPOINT, ABOUTENDPOINT -logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO) +logging.basicConfig(level=logging.ERROR) +# TODO: get logger from module +logger = logging.getLogger(__name__) +logger.setLevel(logging.WARNING) SPRING_BOOT_ADMIN_URL = os.getenv('SPRING_BOOT_ADMIN_URL') SPRING_BOOT_ADMIN_USERNAME = os.getenv('SPRING_BOOT_ADMIN_USERNAME') SPRING_BOOT_ADMIN_PASSWORD = os.getenv('SPRING_BOOT_ADMIN_PASSWORD') -SERVICE_HOST = os.getenv('SERVER_HOST') -SERVICE_PORT = os.getenv('SERVER_PORT') +SERVER_HOST = os.getenv('SERVER_HOST') +SERVER_PORT = os.getenv('SERVER_PORT') SERVICE_NAME_COMPONENT = os.getenv('SERVICE_NAME_COMPONENT') SERVICE_DESCRIPTION_COMPONENT = os.getenv('SERVICE_DESCRIPTION_COMPONENT') -URL_COMPONENT = f"http://{SERVICE_HOST}:{SERVICE_PORT}" +URL_COMPONENT = f"http://{SERVER_HOST}:{SERVER_PORT}" # define metadata that will be shown in the Spring Boot Admin server UI metadata = { "start": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "description": SERVICE_DESCRIPTION_COMPONENT, - "about": f"{SERVICE_HOST}:{SERVICE_PORT}{aboutendpoint}", + "about": f"{SERVER_HOST}:{SERVER_PORT}{ABOUTENDPOINT}", "written in": "Python" } # initialize the registration object, to be send to the Spring Boot Admin server registration = Registration( name=SERVICE_NAME_COMPONENT, - serviceUrl=f"{SERVICE_HOST}:{SERVICE_PORT}", - healthUrl=f"{SERVICE_HOST}:{SERVICE_PORT}{healthendpoint}", + serviceUrl=f"{SERVER_HOST}:{SERVER_PORT}", + healthUrl=f"{SERVER_HOST}:{SERVER_PORT}{HEALTHENDPOINT}", metadata=metadata ) @@ -47,7 +50,5 @@ if __name__ == "__main__": # start the web 
service - if SERVICE_PORT == None: - raise RuntimeError("SERVICE_PORT must not be empty!") - else: - app.run(debug=True, port=SERVICE_PORT) + if SERVER_PORT == None: + raise RuntimeError("SERVER_PORT must not be empty!") diff --git a/qanary-component-MT-Python-MBart/tests/test_lang_utils.py b/qanary-component-MT-Python-MBart/tests/test_lang_utils.py new file mode 100644 index 000000000..6f81e7117 --- /dev/null +++ b/qanary-component-MT-Python-MBart/tests/test_lang_utils.py @@ -0,0 +1,70 @@ +import logging +from unittest import mock +from unittest import TestCase +import os +import importlib + +class TestLangUtils(TestCase): + + logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO) + + @mock.patch.dict(os.environ, {'SOURCE_LANGUAGE': 'fr'}) + def test_only_one_source_language(self): + import utils.lang_utils + importlib.reload(utils.lang_utils) + from utils.lang_utils import translation_options + assert 'fr' in translation_options.keys() + assert len(translation_options.keys()) == 1 + + + @mock.patch.dict(os.environ, {'TARGET_LANGUAGE': 'ru'}) + def test_only_one_target_language(self): + import utils.lang_utils + importlib.reload(utils.lang_utils) + from utils.lang_utils import translation_options + # all 5 non-russian source languages should support 'ru' + assert len(translation_options.items()) == 5 + # but each item should only contain the one target language! 
+ assert ('en', ['ru']) in translation_options.items() + assert ('de', ['ru']) in translation_options.items() + assert ('es', ['ru']) in translation_options.items() + assert ('fr', ['ru']) in translation_options.items() + assert ('pt', ['ru']) in translation_options.items() + + + @mock.patch.dict(os.environ, {'SOURCE_LANGUAGE': 'en', 'TARGET_LANGUAGE': 'es'}) + def test_specific_source_and_target_language(self): + import utils.lang_utils + importlib.reload(utils.lang_utils) + from utils.lang_utils import translation_options + assert translation_options == {'en': ['es']} + + + @mock.patch.dict(os.environ, {'SOURCE_LANGUAGE': 'zh'}) + def test_unsupported_source_language_raises_error(self): + try: + import utils.lang_utils + importlib.reload(utils.lang_utils) + except ValueError as ve: + logging.error(ve) + pass + + + @mock.patch.dict(os.environ, {'SOURCE_LANGUAGE': 'en', 'TARGET_LANGUAGE': 'zh'}) + def test_unsupported_target_for_source_language_raises_error(self): + try: + import utils.lang_utils + importlib.reload(utils.lang_utils) + except ValueError as ve: + logging.error(ve) + pass + + + @mock.patch.dict(os.environ, {'TARGET_LANGUAGE': 'zh'}) + def test_unsupported_target_language_raises_error(self): + try: + import utils.lang_utils + importlib.reload(utils.lang_utils) + except ValueError as ve: + logging.error(ve) + pass diff --git a/qanary-component-MT-Python-MBart/tests/test_mt_mbart_nlp.py b/qanary-component-MT-Python-MBart/tests/test_mt_mbart_nlp.py index eedfce342..143991d68 100644 --- a/qanary-component-MT-Python-MBart/tests/test_mt_mbart_nlp.py +++ b/qanary-component-MT-Python-MBart/tests/test_mt_mbart_nlp.py @@ -1,15 +1,20 @@ -from component.mt_mbart_nlp import * +import logging from component import app +from fastapi.testclient import TestClient from unittest.mock import patch +from unittest import mock import re from unittest import TestCase +from qanary_helpers.language_queries import QuestionTextWithLanguage +import os +import importlib class 
TestComponent(TestCase): logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO) - questions = list([{"uri": "urn:test-uri", "text": "was ist ein Test?"}]) + questions = list([{"uri": "urn:test-uri", "text": "Was ist die Hauptstadt von Deutschland?"}]) endpoint = "urn:qanary#test-endpoint" in_graph = "urn:qanary#test-inGraph" out_graph = "urn:qanary#test-outGraph" @@ -17,6 +22,10 @@ class TestComponent(TestCase): source_language = "de" target_language = "en" + source_texts = [ + QuestionTextWithLanguage("uri", "Was ist die Hauptstadt von Deutschland?", "de") + ] + request_data = '''{ "values": { "urn:qanary#endpoint": "urn:qanary#test-endpoint", @@ -32,42 +41,51 @@ class TestComponent(TestCase): "Content-Type": "application/json" } + client = TestClient(app) + @mock.patch.dict(os.environ, {'SOURCE_LANGUAGE': 'de', 'TARGET_LANGUAGE': 'en'}) def test_qanary_service(self): + import utils.lang_utils + importlib.reload(utils.lang_utils) + import component.mt_mbart_nlp + importlib.reload(component.mt_mbart_nlp) + from component import app + + logging.info("port: %s" % (os.environ["SERVER_PORT"])) + assert os.environ["SERVICE_NAME_COMPONENT"] == "MT-MBart-Component" + assert os.environ["SOURCE_LANGUAGE"] == self.source_language + assert os.environ["TARGET_LANGUAGE"] == self.target_language - with app.test_client() as client, \ - patch('component.mt_mbart_nlp.get_text_question_in_graph') as mocked_get_text_question_in_graph, \ - patch('component.mt_mbart_nlp.insert_into_triplestore') as mocked_insert_into_triplestore: + with patch('component.mt_mbart_nlp.get_text_question_in_graph') as mocked_get_text_question_in_graph, \ + patch('component.mt_mbart_nlp.find_source_texts_in_triplestore') as mocked_find_source_texts_in_triplestore, \ + patch('component.mt_mbart_nlp.insert_into_triplestore') as mocked_insert_into_triplestore: # given a non-english question is present in the current graph mocked_get_text_question_in_graph.return_value = self.questions + 
mocked_find_source_texts_in_triplestore.return_value = self.source_texts mocked_insert_into_triplestore.return_value = None # when a call to /annotatequestion is made - response_json = client.post("/annotatequestion", headers = self.headers, data = self.request_data) + response_json = self.client.post("/annotatequestion", headers = self.headers, data = self.request_data) # then the text question is retrieved from the triplestore mocked_get_text_question_in_graph.assert_called_with(triplestore_endpoint=self.endpoint, graph=self.in_graph) + mocked_find_source_texts_in_triplestore.assert_called_with(triplestore_endpoint=self.endpoint, graph_uri=self.in_graph, lang=self.source_language) + assert mocked_find_source_texts_in_triplestore.call_count == 1 + # get arguments of the (2) separate insert calls arg_list = mocked_insert_into_triplestore.call_args_list # get the call arguments for question translation call_args_translation = [a.args for a in arg_list if "AnnotationOfQuestionTranslation" in a.args[1]] assert len(call_args_translation) == 1 - # get the call arguments for question language - call_args_language = [a.args for a in arg_list if "AnnotationOfQuestionLanguage" in a.args[1]] - assert len(call_args_language) == 1 # clean query strings query_translation = re.sub(r"(\\n\W*|\n\W*)", " ", call_args_translation[0][1]) - query_language = re.sub(r"(\\n\W*|\n\W*)", " ", call_args_language[0][1]) # then the triplestore is updated twice # (question language and translation) - assert mocked_insert_into_triplestore.call_count == 2 - - # then the source language is correctly identified and annotated - self.assertRegex(query_language, r".*AnnotationOfQuestionLanguage(.*;\W?)*oa:hasBody \""+self.source_language+r"\".*\.") + assert mocked_insert_into_triplestore.call_count == 1 # then the question is translated and the result is annotated self.assertRegex(query_translation, r".*AnnotationOfQuestionTranslation(.*;\W?)*oa:hasBody \".*\"@" + self.target_language + r".*\.") @@ 
-75,3 +93,28 @@ def test_qanary_service(self): # then the response is not empty assert response_json != None + + + # test with all supported languages enabled + def test_translate_input(self): + import component.mt_mbart_nlp + from component.mt_mbart_nlp import translate_input + import utils.lang_utils + importlib.reload(utils.lang_utils) + importlib.reload(component.mt_mbart_nlp) + translations = [ + {"text": "Was ist die Hauptstadt von Deutschland?", + "translation": "What is the capital of Germany?", + "source_lang": "de", "target_lang": "en"}, + {"text": "What is the capital of Germany?", + "translation": "Quelle est la capitale de l'Allemagne?", + "source_lang": "en", "target_lang": "fr"}, +# {"text": "What is the capital of Germany?", TODO: MBart answers: "Что такое столица Германии?" +# "translation": "Какая столица Германии?", +# "source_lang": "en", "target_lang": "ru"}, + ] + + for translation in translations: + expected = translation["translation"] + actual = translate_input(translation["text"], translation["source_lang"], translation["target_lang"]) + assert expected == actual diff --git a/qanary-component-MT-Python-MBart/utils/__init__.py b/qanary-component-MT-Python-MBart/utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/qanary-component-MT-Python-MBart/utils/lang_utils.py b/qanary-component-MT-Python-MBart/utils/lang_utils.py new file mode 100644 index 000000000..94733cb8b --- /dev/null +++ b/qanary-component-MT-Python-MBart/utils/lang_utils.py @@ -0,0 +1,72 @@ +import os +import logging + + +logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO) + +SOURCE_LANGUAGE = os.getenv("SOURCE_LANGUAGE") +TARGET_LANGUAGE = os.getenv("TARGET_LANGUAGE") +SUPPORTED_LANGS = { +# source: targets + 'en': ['de', 'ru', 'fr', 'es', 'pt'], + 'de': ['en', 'ru', 'fr', 'es', 'pt'], + 'ru': ['en', 'de', 'fr', 'es', 'pt'], + 'fr': ['en', 'de', 'ru', 'es', 'pt'], + 'es': ['en', 'de', 'ru', 'fr', 'pt'], + 'pt': ['en', 'de', 
'ru', 'fr', 'es']
+}
+
+LANG_CODE_MAP = {
+    "en": "en_XX",
+    "de": "de_DE",
+    "ru": "ru_RU",
+    "fr": "fr_XX",
+    "es": "es_XX",
+    "pt": "pt_XX"
+}
+
+def setup_translation_options() -> dict:
+    """Creates a dictionary of possible source and target languages, based on SUPPORTED_LANGS and configured languages."""
+
+    logging.info("SETTING UP TRANSLATION OPTIONS")
+    translation_options = dict() # init empty
+
+    # check if a source language is specified
+    if SOURCE_LANGUAGE != None and len(SOURCE_LANGUAGE.strip()) > 0:
+        # pre-select appropriate translation options from the list of supported source languages
+        try:
+            translation_options[SOURCE_LANGUAGE] = SUPPORTED_LANGS[SOURCE_LANGUAGE]
+        # this will fail for invalid keys!
+        except KeyError:
+            raise ValueError(f"The source language \"{SOURCE_LANGUAGE}\" is not supported!")
+    # if no source language is specified, use all source languages that are supported by the models
+    else:
+        translation_options = SUPPORTED_LANGS
+
+    # check if a target language is specified
+    if TARGET_LANGUAGE != None and len(TARGET_LANGUAGE.strip()) > 0:
+        discard_keys = list()
+        # remove instances where source == target
+        translation_options.pop(TARGET_LANGUAGE, None)
+        for source_lang in translation_options.keys():
+            if TARGET_LANGUAGE in translation_options[source_lang]:
+                translation_options[source_lang] = [TARGET_LANGUAGE]
+            else:
+                discard_keys.append(source_lang)
+        # cleanup keys
+        translation_options = {sl:tl for sl,tl in translation_options.items() if sl not in discard_keys}
+        # check for empty translation options, if all keys dropped
+        if len(translation_options.keys()) == 0:
+            raise ValueError("The target language \"{tl}\" is not supported for any configured source languages! \nValid language pairs (source: [targets]) are: \n{slk}!"
+ .format(tl=TARGET_LANGUAGE, slk=SUPPORTED_LANGS)) + # check if only some keys dropped + elif len(discard_keys) > 0: + logging.warning("Specific target language \"{tl}\" is not supported for these source languages: {dk}!. \nThese language pairs will be ignored." + .format(tl=TARGET_LANGUAGE, dk=discard_keys)) + # else do nothing, the lists are already complete + + logging.info(translation_options) + return translation_options + + +translation_options = setup_translation_options() diff --git a/qanary-component-MT-Python-MBart/utils/model_utils.py b/qanary-component-MT-Python-MBart/utils/model_utils.py new file mode 100644 index 000000000..f4c0ef5b7 --- /dev/null +++ b/qanary-component-MT-Python-MBart/utils/model_utils.py @@ -0,0 +1,13 @@ +from transformers import MBartForConditionalGeneration, MBart50TokenizerFast + + +def load_models_and_tokenizers(): + """Loads models and tokenizers based on configured translation language pairs. + + Parameters: + translation_options (dict): Key is the source language, value is a list of configured target languages + """ + + model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt") + tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt") + return model, tokenizer diff --git a/qanary-component-MT-Python-NLLB/Dockerfile b/qanary-component-MT-Python-NLLB/Dockerfile index 61eb53cca..b1adee742 100644 --- a/qanary-component-MT-Python-NLLB/Dockerfile +++ b/qanary-component-MT-Python-NLLB/Dockerfile @@ -1,14 +1,21 @@ -FROM python:3.7 +FROM python:3.10 COPY requirements.txt ./ RUN pip install --upgrade pip -RUN pip install -r requirements.txt; exit 0 -RUN pip install gunicorn +RUN pip install -r requirements.txt COPY component component +COPY utils utils COPY run.py boot.sh ./ +# to allow preconfigured images +ARG SOURCE_LANGUAGE +ARG TARGET_LANGUAGE + +ENV SOURCE_LANGUAGE=$SOURCE_LANGUAGE +ENV TARGET_LANGUAGE=$TARGET_LANGUAGE + RUN chmod +x boot.sh ENTRYPOINT 
["./boot.sh"] diff --git a/qanary-component-MT-Python-NLLB/README.md b/qanary-component-MT-Python-NLLB/README.md index 42a352e87..a0d11a4a4 100644 --- a/qanary-component-MT-Python-NLLB/README.md +++ b/qanary-component-MT-Python-NLLB/README.md @@ -22,12 +22,6 @@ Not applicable as the textual question is a default parameter oa:hasBody "translation_result"@en ; oa:annotatedBy ; oa:annotatedAt "2001-10-26T21:32:52"^^xsd:dateTime . - - a qa:AnnotationOfQuestionLanguage . - oa:hasTarget ; - oa:hasBody "lang-id"^^xsd:string ; - oa:annotatedBy ; - oa:annotatedAt "2001-10-26T21:32:52"^^xsd:dateTime . ``` ## Usage @@ -69,8 +63,8 @@ The parameters description: * `SPRING_BOOT_ADMIN_CLIENT_INSTANCE_SERVICE-BASE-URL` -- the URL of your Qanary component (has to be visible to the Qanary pipeline) * `SERVICE_NAME_COMPONENT` -- the name of your Qanary component (for better identification) * `SERVICE_DESCRIPTION_COMPONENT` -- the description of your Qanary component -* `SOURCE_LANGUAGE` -- (optional) the source language of the text (the component will use langdetect if no source language is given) -* `TARGET_LANGUAGE` -- the language that the text should be translated to +* `SOURCE_LANGUAGE` -- (optional) the default source language of the translation +* `TARGET_LANGUAGE` -- (optional) the default target language of the translation 4. Build the Docker image: @@ -84,18 +78,43 @@ docker-compose build docker-compose up ``` -After execution, component creates Qanary annotation in the Qanary triplestore: +After successful execution, component creates Qanary annotation in the Qanary triplestore: ``` GRAPH { - ?a a qa:AnnotationOfQuestionLanguage . - ?a qa:translationResult "translation result" . - ?a qa:sourceLanguage "ISO_639-1 language code" . - ?a oa:annotatedBy . - ?a oa:annotatedAt ?time . - } + ?a a qa:AnnotationOfQuestionTranslation . + ?a oa:hasTarget . + ?a oa:hasBody "translation_result"@ISO_639-1 language code + ?a oa:annotatedBy . + ?a oa:annotatedAt ?time . 
} ``` +### Support for multiple Source and Target Languages + +This component relies on the presence of one of more existing annotations that associate a question text with a language. +This can be in the form of an `AnnotationOfQuestionLanguage`, as created by LD components, or an `AnnotationOfQuestionTranslation` as created by MT components. + +It supports multiple combinations of source and target languages. +You can specify a desired source and target language independently, or simply use all available language pairings. + +If a `SOURCE_LANGUAGE` is set, then only texts with this specific language are considered for translation. +If none is set, then all configured source languages will be used to find candidates for translation. + +Similarily, if a `TARGET_LANGUAGE` is set, then texts are only translated into that language. +If none is set, then the texts are translated into all target languages that are supported for their respective source language. + +Note that while configured source languages naturally determine the possible target languages, +the configured target languages also determine which source languages can be supported! + +### Pre-configured Docker Images + +You may use the included file `docker-compose-pairs.yml` to build a list of images that are preconfigured for specific language pairs. +Note that if you intend to use these containers at the same time, you need to assign different `SERVER_PORT` values for each image. + +```bash +docker-compose -f docker-compose-pairs.yml build +``` + ## How To Test This Component This component uses the [pytest](https://docs.pytest.org/). 
diff --git a/qanary-component-MT-Python-NLLB/boot.sh b/qanary-component-MT-Python-NLLB/boot.sh index 42301c685..65fe4e9b8 100755 --- a/qanary-component-MT-Python-NLLB/boot.sh +++ b/qanary-component-MT-Python-NLLB/boot.sh @@ -1,14 +1,33 @@ -#!/bin/sh +#!/bin/bash +export $(grep -v "^#" < .env) -export $(grep -v '^#' .env | xargs) +# check required parameters +declare -a required_vars=( +"SPRING_BOOT_ADMIN_URL" +"SERVER_HOST" +"SERVER_PORT" +"SPRING_BOOT_ADMIN_USERNAME" +"SPRING_BOOT_ADMIN_PASSWORD" +"SERVICE_NAME_COMPONENT" +"SERVICE_DESCRIPTION_COMPONENT" +) + +for param in ${required_vars[@]}; +do + if [[ -z ${!param} ]]; then + echo "Required variable \"$param\" is not set!" + echo "The required variables are: ${required_vars[@]}" + exit 4 + fi +done + +echo Downloading the models + +python -c "from utils.model_utils import load_models_and_tokenizers; load_models_and_tokenizers(); " -echo Downloading the model -python -c 'from transformers import AutoModelForSeq2SeqLM, AutoTokenizer; model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M") ; tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")' echo Downloading the model finished echo The port number is: $SERVER_PORT +echo The host is: $SERVER_HOST echo The Qanary pipeline URL is: $SPRING_BOOT_ADMIN_URL -if [ -n $SERVER_PORT ] -then - exec gunicorn -b :$SERVER_PORT --access-logfile - --error-logfile - run:app # refer to the gunicorn documentation for more options -fi +exec uvicorn run:app --host 0.0.0.0 --port $SERVER_PORT --log-level warning diff --git a/qanary-component-MT-Python-NLLB/component/__init__.py b/qanary-component-MT-Python-NLLB/component/__init__.py index 239105a22..5efe8f36f 100644 --- a/qanary-component-MT-Python-NLLB/component/__init__.py +++ b/qanary-component-MT-Python-NLLB/component/__init__.py @@ -1,26 +1,31 @@ -from component.mt_nllb import mt_nllb_bp -from flask import Flask +from component import mt_nllb +from fastapi import FastAPI +from 
fastapi.responses import RedirectResponse, Response -version = "0.1.3" +version = "0.2.0" # default config file configfile = "app.conf" # service status information -healthendpoint = "/health" - -aboutendpoint = "/about" +HEALTHENDPOINT = "/health" +ABOUTENDPOINT = "/about" +# TODO: add languages endpoint? # init Flask app and add externalized service information -app = Flask(__name__) -app.register_blueprint(mt_nllb_bp) +app = FastAPI(docs_url="/swagger-ui.html") +app.include_router(mt_nllb.router) + +@app.get("/") +async def main(): + return RedirectResponse("/about") -@app.route(healthendpoint, methods=["GET"]) +@app.get(HEALTHENDPOINT, description="Shows the status of the component") def health(): """required health endpoint for callback of Spring Boot Admin server""" - return "alive" + return Response("alive", media_type="text/plain") -@app.route(aboutendpoint, methods=["GET"]) +@app.get(ABOUTENDPOINT, description="Shows a description of the component") def about(): """required about endpoint for callback of Srping Boot Admin server""" - return "about" # TODO: replace this with a service description from configuration + return Response("Translates questions into English", media_type="text/plain") diff --git a/qanary-component-MT-Python-NLLB/component/mt_nllb.py b/qanary-component-MT-Python-NLLB/component/mt_nllb.py index acdf979b1..5606fabc0 100644 --- a/qanary-component-MT-Python-NLLB/component/mt_nllb.py +++ b/qanary-component-MT-Python-NLLB/component/mt_nllb.py @@ -1,58 +1,36 @@ -from langdetect import detect import logging import os -from flask import Blueprint, jsonify, request from qanary_helpers.qanary_queries import get_text_question_in_graph, insert_into_triplestore +from qanary_helpers.language_queries import get_translated_texts_in_triplestore, get_texts_with_detected_language_in_triplestore, QuestionTextWithLanguage, create_annotation_of_question_translation +from utils.model_utils import load_models_and_tokenizers +from utils.lang_utils import 
translation_options, LANG_CODE_MAP +from fastapi import APIRouter, Request +from fastapi.responses import JSONResponse -from transformers import AutoModelForSeq2SeqLM, AutoTokenizer logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.INFO) - -mt_nllb_bp = Blueprint("mt_nllb_bp", __name__, template_folder="templates") +router = APIRouter() SERVICE_NAME_COMPONENT = os.environ["SERVICE_NAME_COMPONENT"] -SOURCE_LANG = os.environ["SOURCE_LANGUAGE"] -TARGET_LANG = os.environ["TARGET_LANGUAGE"] - -model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M") -tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M") -lang_code_map = { - 'en': 'eng_Latn', - 'de': 'deu_Latn', - 'ru': 'rus_Cyrl', - 'fr': 'fra_Latn', - 'es': 'spa_Latn', - 'pt': 'por_Latn' -} - - -@mt_nllb_bp.route("/annotatequestion", methods=["POST"]) -def qanary_service(): - """the POST endpoint required for a Qanary service""" +TRANSLATEENDPOINT = "/translate" - triplestore_endpoint = request.json["values"]["urn:qanary#endpoint"] - triplestore_ingraph = request.json["values"]["urn:qanary#inGraph"] - triplestore_outgraph = request.json["values"]["urn:qanary#outGraph"] - logging.info("endpoint: %s, inGraph: %s, outGraph: %s" % \ - (triplestore_endpoint, triplestore_ingraph, triplestore_outgraph)) +model, tokenizer = load_models_and_tokenizers() - text = get_text_question_in_graph(triplestore_endpoint=triplestore_endpoint, - graph=triplestore_ingraph)[0]["text"] - question_uri = get_text_question_in_graph(triplestore_endpoint=triplestore_endpoint, - graph=triplestore_ingraph)[0]["uri"] - logging.info(f"Question text: {text}") - if SOURCE_LANG != None and len(SOURCE_LANG.strip()) > 0: - lang = SOURCE_LANG - logging.info("Using custom SOURCE_LANGUAGE") - else: - lang = detect(text) - logging.info("No SOURCE_LANGUAGE specified, using langdetect!") - logging.info(f"source language: {lang}") +def translate_input(text: str, source_lang: str, target_lang: 
str) -> str: + """Translates text from a source language into a target language. + + Parameters: + text (str): Text to be translated + source_lang (str): Language of the text + target_lang (str): Language of the translation + Returns: + str: The translated text + """ - ## MAIN FUNCTIONALITY - tokenizer.src_lang = lang_code_map[lang] + logging.info(f"translating \"{text}\" from \"{source_lang}\" to \"{target_lang}\"") + tokenizer.src_lang = LANG_CODE_MAP[source_lang] logging.info(f"source language mapped code: {tokenizer.src_lang}") batch = tokenizer(text, return_tensors="pt") @@ -64,73 +42,142 @@ def qanary_service(): # Perform the translation and decode the output generated_tokens = model.generate( **batch, - forced_bos_token_id=tokenizer.lang_code_to_id[lang_code_map[TARGET_LANG]]) + forced_bos_token_id=tokenizer.convert_tokens_to_ids(LANG_CODE_MAP[target_lang])) result = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0] + translation = result.replace("\"", "\\\"") #keep quotation marks that are part of the translation + logging.info(f"result: \"{translation}\"") + return translation + + +@router.get("/translate_to_one", description="Translate a text from a given source language into one target language.", tags=["Translate"]) +def translate_to_one(text: str, source_lang: str, target_lang: str): + """Translates a text from a given source language into one target language. + + Parameters: + text (str): Text to be translated + source_lang (str): Language of the text + target_lang (str): Language of the translation + + Returns: + dict: A dictionary of translations identified by their target language (only one in this case) + """ + + if (source_lang in translation_options.keys()) and (target_lang in translation_options.get(source_lang, [])): + translation = translate_input(text, source_lang, target_lang) + return {target_lang: translation} + else: + raise RuntimeError("Unsupported source and/or target language! 
Valid options: {to}".format(to=translation_options)) + + +@router.get("/translate_to_all", description="Translate a text from a given source language into all configured target languages for that source language.", tags=["Translate"]) +def translate_to_all(text: str, source_lang: str): + """Translates a text from a given source language into all configured target languages for that source language. + + Parameters: + text (str): Text to be translated + source_lang (str): Language of the text + + Returns: + list: A list of dictionaries, each mapping a target language to its translation + """ + + if source_lang in translation_options.keys(): + translations = list() + for target_lang in translation_options[source_lang]: + translation = translate_input(text, source_lang, target_lang) + translations.append({ + target_lang: translation + }) + return translations + else: + raise RuntimeError("Unsupported source language! Valid options: {to}".format(to=translation_options)) + +def find_source_texts_in_triplestore(triplestore_endpoint: str, graph_uri: str, lang: str) -> list[QuestionTextWithLanguage]: + """Retrieves questions of a specific language from the triplestore. + + Parameters: + triplestore_endpoint (str): URL of the triplestore endpoint + graph_uri (str): URI of the graph to query inside of the triplestore + lang (str): Expected language + + Returns: + list: A list of appropriate QuestionTextWithLanguage objects with information from the triplestore. 
+ """ + + source_texts = [] + + # check if supported languages have been determined already (LD) + # (use filters) + # if so, use the target uris to find the question text to translate + ld_source_texts = get_texts_with_detected_language_in_triplestore(triplestore_endpoint, graph_uri, lang) + source_texts.extend(ld_source_texts) + + # check if there are translations into the relevant language (MT) + # (use filters) + # if so, use the translation texts + mt_source_texts = get_translated_texts_in_triplestore(triplestore_endpoint, graph_uri, lang) + source_texts.extend(mt_source_texts) + + # TODO: what if nothing found? + if len(source_texts) == 0: + logging.warning(f"No source texts with language {lang} could be found in the triplestore!") + + return source_texts + + +@router.post("/annotatequestion", description="Standard process method for Qanary components", tags=["Qanary"]) +async def qanary_service(request: Request): + """the POST endpoint required for a Qanary service""" + + request_json = await request.json() + + triplestore_endpoint = request_json["values"]["urn:qanary#endpoint"] + triplestore_ingraph = request_json["values"]["urn:qanary#inGraph"] + triplestore_outgraph = request_json["values"]["urn:qanary#outGraph"] + logging.info("endpoint: %s, inGraph: %s, outGraph: %s" % \ + (triplestore_endpoint, triplestore_ingraph, triplestore_outgraph)) - # building SPARQL query TODO: verify this annotation AnnotationOfQuestionTranslation ?? - SPARQLqueryAnnotationOfQuestionTranslation = """ - PREFIX qa: - PREFIX oa: - PREFIX xsd: - - INSERT {{ - GRAPH <{uuid}> {{ - ?a a qa:AnnotationOfQuestionTranslation ; - oa:hasTarget <{qanary_question_uri}> ; - oa:hasBody "{translation_result}"@{target_lang} ; - oa:annotatedBy ; - oa:annotatedAt ?time . - - }} - }} - WHERE {{ - BIND (IRI(str(RAND())) AS ?a) . 
- BIND (now() as ?time) - }}""".format( - uuid=triplestore_ingraph, - qanary_question_uri=question_uri, - translation_result=result.replace("\"", "\\\""), #keep quotation marks that are part of the translation - target_lang=TARGET_LANG, - app_name=SERVICE_NAME_COMPONENT - ) - - SPARQLqueryAnnotationOfQuestionLanguage = """ - PREFIX qa: - PREFIX oa: - PREFIX xsd: - - INSERT {{ - GRAPH <{uuid}> {{ - ?b a qa:AnnotationOfQuestionLanguage ; - oa:hasTarget <{qanary_question_uri}> ; - oa:hasBody "{src_lang}"^^xsd:string ; - oa:annotatedBy ; - oa:annotatedAt ?time . - }} - }} - WHERE {{ - BIND (IRI(str(RAND())) AS ?b) . - BIND (now() as ?time) - }}""".format( - uuid=triplestore_ingraph, - qanary_question_uri=question_uri, - src_lang=lang, - app_name=SERVICE_NAME_COMPONENT - ) - - logging.info(f'SPARQL: {SPARQLqueryAnnotationOfQuestionTranslation}') - logging.info(f'SPARQL: {SPARQLqueryAnnotationOfQuestionLanguage}') - # inserting new data to the triplestore - insert_into_triplestore(triplestore_endpoint, SPARQLqueryAnnotationOfQuestionTranslation) - insert_into_triplestore(triplestore_endpoint, SPARQLqueryAnnotationOfQuestionLanguage) - - return jsonify(request.get_json()) - - -@mt_nllb_bp.route("/", methods=["GET"]) -def index(): - """examplary GET endpoint""" - - logging.info("host_url: %s" % (request.host_url)) - return "Python MT NLLB Qanary component" + text_question_in_graph = get_text_question_in_graph(triplestore_endpoint=triplestore_endpoint, graph=triplestore_ingraph) + question_text = text_question_in_graph[0]['text'] + logging.info(f'Original question text: {question_text}') + + # Collect texts to be translated (group by source language) + + source_texts_per_language = dict() + # keep a list of annotations to insert + insert_annotations = list() + + for source_lang in translation_options.keys(): + source_texts = find_source_texts_in_triplestore( + triplestore_endpoint=triplestore_endpoint, + graph_uri=triplestore_ingraph, + lang=source_lang + ) + 
source_texts_per_language.update({source_lang: source_texts}) + + # for every source language that has associated texts + for source_lang in source_texts_per_language.keys(): + # translate each found text + for source_text in source_texts_per_language[source_lang]: + # into every target language that is supported for this source language + for target_lang in translation_options[source_lang]: + translation = translate_input(source_text.get_text(), source_lang, target_lang) + if len(translation.strip()) > 0: + SPARQLqueryAnnotationOfQuestionTranslation = create_annotation_of_question_translation( + graph_uri=triplestore_ingraph, + question_uri=source_text.get_uri(), + translation=translation, + translation_language=target_lang, + app_name=SERVICE_NAME_COMPONENT + ) + insert_annotations.append(SPARQLqueryAnnotationOfQuestionTranslation) + else: + logging.error(f"result is empty string!") + + # insert the created annotations into the triplestore + for insert_annotation in insert_annotations: + insert_into_triplestore(triplestore_endpoint, insert_annotation) + + return JSONResponse(request_json) diff --git a/qanary-component-MT-Python-NLLB/docker-compose-pairs.yml b/qanary-component-MT-Python-NLLB/docker-compose-pairs.yml new file mode 100644 index 000000000..ac9469f87 --- /dev/null +++ b/qanary-component-MT-Python-NLLB/docker-compose-pairs.yml @@ -0,0 +1,108 @@ +version: '3' +services: + + component-en-de: + # for building from source + image: qanary/qanary-component-mt-python-nllb-en-de:latest + build: + context: . + dockerfile: Dockerfile + args: + - SOURCE_LANGUAGE=en + - TARGET_LANGUAGE=de + env_file: + - .env + environment: + - SERVICE_NAME_COMPONENT=MT-NLLB-en-de + volumes: + - ~/.cache/huggingface:/root/.cache/huggingface/ + network_mode: host + + component-en-ru: + # for building from source + image: qanary/qanary-component-mt-python-nllb-en-ru:latest + build: + context: . 
+ dockerfile: Dockerfile + args: + - SOURCE_LANGUAGE=en + - TARGET_LANGUAGE=ru + env_file: + - .env + environment: + - SERVICE_NAME_COMPONENT=MT-NLLB-en-ru + volumes: + - ~/.cache/huggingface:/root/.cache/huggingface/ + network_mode: host + + + component-en-es: + # for building from source + image: qanary/qanary-component-mt-python-nllb-en-es:latest + build: + context: . + dockerfile: Dockerfile + args: + - SOURCE_LANGUAGE=en + - TARGET_LANGUAGE=es + env_file: + - .env + environment: + - SERVICE_NAME_COMPONENT=MT-NLLB-en-es + volumes: + - ~/.cache/huggingface:/root/.cache/huggingface/ + network_mode: host + + + component-de-en: + # for building from source + image: qanary/qanary-component-mt-python-nllb-de-en:latest + build: + context: . + dockerfile: Dockerfile + args: + - SOURCE_LANGUAGE=de + - TARGET_LANGUAGE=en + env_file: + - .env + environment: + - SERVICE_NAME_COMPONENT=MT-NLLB-de-en + volumes: + - ~/.cache/huggingface:/root/.cache/huggingface/ + network_mode: host + + + component-ru-en: + # for building from source + image: qanary/qanary-component-mt-python-nllb-ru-en:latest + build: + context: . + dockerfile: Dockerfile + args: + - SOURCE_LANGUAGE=ru + - TARGET_LANGUAGE=en + env_file: + - .env + environment: + - SERVICE_NAME_COMPONENT=MT-NLLB-ru-en + volumes: + - ~/.cache/huggingface:/root/.cache/huggingface/ + network_mode: host + + + component-es-en: + # for building from source + image: qanary/qanary-component-mt-python-nllb-es-en:latest + build: + context: . 
+ dockerfile: Dockerfile + args: + - SOURCE_LANGUAGE=es + - TARGET_LANGUAGE=en + env_file: + - .env + environment: + - SERVICE_NAME_COMPONENT=MT-NLLB-es-en + volumes: + - ~/.cache/huggingface:/root/.cache/huggingface/ + network_mode: host diff --git a/qanary-component-MT-Python-NLLB/docker-compose.yml b/qanary-component-MT-Python-NLLB/docker-compose.yml index 67ccfcb12..c49d4838a 100644 --- a/qanary-component-MT-Python-NLLB/docker-compose.yml +++ b/qanary-component-MT-Python-NLLB/docker-compose.yml @@ -2,12 +2,14 @@ version: '3' services: component: # for building from source - image: qanary-component-mt-python-nllb:latest + image: qanary/qanary-component-mt-python-nllb:0.2.0 build: context: . dockerfile: Dockerfile - network_mode: host env_file: - .env + environment: + - SERVICE_NAME_COMPONENT=MT-NLLB volumes: - ~/.cache/huggingface:/root/.cache/huggingface/ + network_mode: host diff --git a/qanary-component-MT-Python-NLLB/pytest.ini b/qanary-component-MT-Python-NLLB/pytest.ini index 4c87d9ab0..3a623b2e4 100644 --- a/qanary-component-MT-Python-NLLB/pytest.ini +++ b/qanary-component-MT-Python-NLLB/pytest.ini @@ -1,16 +1,14 @@ [pytest] -log_cli = 0 +log_cli = 1 log_cli_level = INFO log_cli_format = %(asctime)s [%(levelname)8s] [%(filename)s:%(lineno)s] %(message)s log_cli_date_format=%Y-%m-%d %H:%M:%S env = - SERVER_PORT=40120 - SPRING_BOOT_ADMIN_URL=http://qanary-pipeline-host:40111 - SERVER_HOST=http://public-component-host - SPRING_BOOT_ADMIN_CLIENT_INSTANCE_SERVICE-BASE-URL=http://public-component-host:40120 + SERVER_PORT=8081 + SERVER_HOST=http://localhost + SPRING_BOOT_ADMIN_URL=http://localhost:40111 + SPRING_BOOT_ADMIN_CLIENT_INSTANCE_SERVICE-BASE-URL=http://localhost:8081 SPRING_BOOT_ADMIN_USERNAME=admin SPRING_BOOT_ADMIN_PASSWORD=admin - SERVICE_NAME_COMPONENT=MT-NLLB + SERVICE_NAME_COMPONENT=MT-NLLB-Component SERVICE_DESCRIPTION_COMPONENT=Translates question to English - SOURCE_LANGUAGE=de - TARGET_LANGUAGE=en diff --git 
a/qanary-component-MT-Python-NLLB/requirements.txt b/qanary-component-MT-Python-NLLB/requirements.txt index ec4a61a82..a604a81e3 100644 --- a/qanary-component-MT-Python-NLLB/requirements.txt +++ b/qanary-component-MT-Python-NLLB/requirements.txt @@ -1,12 +1,10 @@ -Flask -langdetect==1.0.9 -mock==3.0.5 -python-dotenv==0.21.1 -qanary_helpers==0.2.2 -transformers==4.41.0 -sentencepiece==0.1.97 -torch==2.3.0 -gunicorn==20.1.0 -protobuf==3.20.* -pytest -pytest-env +fastapi==0.109.1 +pytest==8.3.2 +pytest-env==1.1.3 +SentencePiece==0.2.0 +SPARQLWrapper==2.0.0 +torch==2.4.0 +transformers==4.44.0 +qanary-helpers==0.3.2 +uvicorn==0.30.1 +httpx==0.27.0 diff --git a/qanary-component-MT-Python-NLLB/run.py b/qanary-component-MT-Python-NLLB/run.py index 339ec7e1e..16177a12c 100644 --- a/qanary-component-MT-Python-NLLB/run.py +++ b/qanary-component-MT-Python-NLLB/run.py @@ -4,32 +4,35 @@ from qanary_helpers.registration import Registration from qanary_helpers.registrator import Registrator -from component import app, healthendpoint, aboutendpoint +from component import app, HEALTHENDPOINT, ABOUTENDPOINT -logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO) +logging.basicConfig(level=logging.ERROR) +# TODO: get logger from module +logger = logging.getLogger(__name__) +logger.setLevel(logging.WARNING) SPRING_BOOT_ADMIN_URL = os.getenv('SPRING_BOOT_ADMIN_URL') SPRING_BOOT_ADMIN_USERNAME = os.getenv('SPRING_BOOT_ADMIN_USERNAME') SPRING_BOOT_ADMIN_PASSWORD = os.getenv('SPRING_BOOT_ADMIN_PASSWORD') -SERVICE_HOST = os.getenv('SERVER_HOST') -SERVICE_PORT = os.getenv('SERVER_PORT') +SERVER_HOST = os.getenv('SERVER_HOST') +SERVER_PORT = os.getenv('SERVER_PORT') SERVICE_NAME_COMPONENT = os.getenv('SERVICE_NAME_COMPONENT') SERVICE_DESCRIPTION_COMPONENT = os.getenv('SERVICE_DESCRIPTION_COMPONENT') -URL_COMPONENT = f"http://{SERVICE_HOST}:{SERVICE_PORT}" +URL_COMPONENT = f"http://{SERVER_HOST}:{SERVER_PORT}" # define metadata that will be shown in the Spring Boot Admin 
server UI metadata = { "start": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "description": SERVICE_DESCRIPTION_COMPONENT, - "about": f"{SERVICE_HOST}:{SERVICE_PORT}{aboutendpoint}", + "about": f"{SERVER_HOST}:{SERVER_PORT}{ABOUTENDPOINT}", "written in": "Python" } # initialize the registration object, to be send to the Spring Boot Admin server registration = Registration( name=SERVICE_NAME_COMPONENT, - serviceUrl=f"{SERVICE_HOST}:{SERVICE_PORT}", - healthUrl=f"{SERVICE_HOST}:{SERVICE_PORT}{healthendpoint}", + serviceUrl=f"{SERVER_HOST}:{SERVER_PORT}", + healthUrl=f"{SERVER_HOST}:{SERVER_PORT}{HEALTHENDPOINT}", metadata=metadata ) @@ -47,7 +50,5 @@ if __name__ == "__main__": # start the web service - if SERVICE_PORT == None: - raise RuntimeError("SERVICE_PORT must not be empty!") - else: - app.run(debug=True, port=SERVICE_PORT) + if SERVER_PORT == None: + raise RuntimeError("SERVER_PORT must not be empty!") diff --git a/qanary-component-MT-Python-NLLB/tests/test_lang_utils.py b/qanary-component-MT-Python-NLLB/tests/test_lang_utils.py new file mode 100644 index 000000000..6f81e7117 --- /dev/null +++ b/qanary-component-MT-Python-NLLB/tests/test_lang_utils.py @@ -0,0 +1,70 @@ +import logging +from unittest import mock +from unittest import TestCase +import os +import importlib + +class TestLangUtils(TestCase): + + logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO) + + @mock.patch.dict(os.environ, {'SOURCE_LANGUAGE': 'fr'}) + def test_only_one_source_language(self): + import utils.lang_utils + importlib.reload(utils.lang_utils) + from utils.lang_utils import translation_options + assert 'fr' in translation_options.keys() + assert len(translation_options.keys()) == 1 + + + @mock.patch.dict(os.environ, {'TARGET_LANGUAGE': 'ru'}) + def test_only_one_target_language(self): + import utils.lang_utils + importlib.reload(utils.lang_utils) + from utils.lang_utils import translation_options + # all 5 non-russian source languages should support 'ru' + 
assert len(translation_options.items()) == 5 + # but each item should only contain the one target language! + assert ('en', ['ru']) in translation_options.items() + assert ('de', ['ru']) in translation_options.items() + assert ('es', ['ru']) in translation_options.items() + assert ('fr', ['ru']) in translation_options.items() + assert ('pt', ['ru']) in translation_options.items() + + + @mock.patch.dict(os.environ, {'SOURCE_LANGUAGE': 'en', 'TARGET_LANGUAGE': 'es'}) + def test_specific_source_and_target_language(self): + import utils.lang_utils + importlib.reload(utils.lang_utils) + from utils.lang_utils import translation_options + assert translation_options == {'en': ['es']} + + + @mock.patch.dict(os.environ, {'SOURCE_LANGUAGE': 'zh'}) + def test_unsupported_source_language_raises_error(self): + try: + import utils.lang_utils + importlib.reload(utils.lang_utils) + except ValueError as ve: + logging.error(ve) + pass + + + @mock.patch.dict(os.environ, {'SOURCE_LANGUAGE': 'en', 'TARGET_LANGUAGE': 'zh'}) + def test_unsupported_target_for_source_language_raises_error(self): + try: + import utils.lang_utils + importlib.reload(utils.lang_utils) + except ValueError as ve: + logging.error(ve) + pass + + + @mock.patch.dict(os.environ, {'TARGET_LANGUAGE': 'zh'}) + def test_unsupported_target_language_raises_error(self): + try: + import utils.lang_utils + importlib.reload(utils.lang_utils) + except ValueError as ve: + logging.error(ve) + pass diff --git a/qanary-component-MT-Python-NLLB/tests/test_mt_nllb.py b/qanary-component-MT-Python-NLLB/tests/test_mt_nllb.py index ff81038ef..e0249f40f 100644 --- a/qanary-component-MT-Python-NLLB/tests/test_mt_nllb.py +++ b/qanary-component-MT-Python-NLLB/tests/test_mt_nllb.py @@ -1,15 +1,20 @@ -from component.mt_nllb import * +import logging from component import app +from fastapi.testclient import TestClient from unittest.mock import patch +from unittest import mock import re from unittest import TestCase +from 
qanary_helpers.language_queries import QuestionTextWithLanguage +import os +import importlib class TestComponent(TestCase): logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO) - questions = list([{"uri": "urn:test-uri", "text": "was ist ein Test?"}]) + questions = list([{"uri": "urn:test-uri", "text": "Was ist die Hauptstadt von Deutschland?"}]) endpoint = "urn:qanary#test-endpoint" in_graph = "urn:qanary#test-inGraph" out_graph = "urn:qanary#test-outGraph" @@ -17,6 +22,10 @@ class TestComponent(TestCase): source_language = "de" target_language = "en" + source_texts = [ + QuestionTextWithLanguage("uri", "Was ist die Hauptstadt von Deutschland?", "de") + ] + request_data = '''{ "values": { "urn:qanary#endpoint": "urn:qanary#test-endpoint", @@ -32,42 +41,51 @@ class TestComponent(TestCase): "Content-Type": "application/json" } + client = TestClient(app) + @mock.patch.dict(os.environ, {'SOURCE_LANGUAGE': 'de', 'TARGET_LANGUAGE': 'en'}) def test_qanary_service(self): + import utils.lang_utils + importlib.reload(utils.lang_utils) + import component.mt_nllb + importlib.reload(component.mt_nllb) + from component import app + + logging.info("port: %s" % (os.environ["SERVER_PORT"])) + assert os.environ["SERVICE_NAME_COMPONENT"] == "MT-NLLB-Component" + assert os.environ["SOURCE_LANGUAGE"] == self.source_language + assert os.environ["TARGET_LANGUAGE"] == self.target_language - with app.test_client() as client, \ - patch('component.mt_nllb.get_text_question_in_graph') as mocked_get_text_question_in_graph, \ - patch('component.mt_nllb.insert_into_triplestore') as mocked_insert_into_triplestore: + with patch('component.mt_nllb.get_text_question_in_graph') as mocked_get_text_question_in_graph, \ + patch('component.mt_nllb.find_source_texts_in_triplestore') as mocked_find_source_texts_in_triplestore, \ + patch('component.mt_nllb.insert_into_triplestore') as mocked_insert_into_triplestore: # given a non-english question is present in the current graph 
mocked_get_text_question_in_graph.return_value = self.questions + mocked_find_source_texts_in_triplestore.return_value = self.source_texts mocked_insert_into_triplestore.return_value = None # when a call to /annotatequestion is made - response_json = client.post("/annotatequestion", headers = self.headers, data = self.request_data) + response_json = self.client.post("/annotatequestion", headers = self.headers, data = self.request_data) # then the text question is retrieved from the triplestore mocked_get_text_question_in_graph.assert_called_with(triplestore_endpoint=self.endpoint, graph=self.in_graph) + mocked_find_source_texts_in_triplestore.assert_called_with(triplestore_endpoint=self.endpoint, graph_uri=self.in_graph, lang=self.source_language) + assert mocked_find_source_texts_in_triplestore.call_count == 1 + # get arguments of the (2) separate insert calls arg_list = mocked_insert_into_triplestore.call_args_list # get the call arguments for question translation call_args_translation = [a.args for a in arg_list if "AnnotationOfQuestionTranslation" in a.args[1]] assert len(call_args_translation) == 1 - # get the call arguments for question language - call_args_language = [a.args for a in arg_list if "AnnotationOfQuestionLanguage" in a.args[1]] - assert len(call_args_language) == 1 # clean query strings query_translation = re.sub(r"(\\n\W*|\n\W*)", " ", call_args_translation[0][1]) - query_language = re.sub(r"(\\n\W*|\n\W*)", " ", call_args_language[0][1]) # then the triplestore is updated twice # (question language and translation) - assert mocked_insert_into_triplestore.call_count == 2 - - # then the source language is correctly identified and annotated - self.assertRegex(query_language, r".*AnnotationOfQuestionLanguage(.*;\W?)*oa:hasBody \""+self.source_language+r"\".*\.") + assert mocked_insert_into_triplestore.call_count == 1 # then the question is translated and the result is annotated self.assertRegex(query_translation, 
r".*AnnotationOfQuestionTranslation(.*;\W?)*oa:hasBody \".*\"@" + self.target_language + r".*\.") @@ -75,3 +93,28 @@ def test_qanary_service(self): # then the response is not empty assert response_json != None + + + # test with all supported languages enabled + def test_translate_input(self): + import component.mt_nllb + from component.mt_nllb import translate_input + import utils.lang_utils + importlib.reload(utils.lang_utils) + importlib.reload(component.mt_nllb) + translations = [ + {"text": "Was ist die Hauptstadt von Deutschland?", + "translation": "What is the capital of Germany?", + "source_lang": "de", "target_lang": "en"}, + {"text": "What is the capital of Germany?", + "translation": "Quelle est la capitale de l'Allemagne?", + "source_lang": "en", "target_lang": "fr"}, +# {"text": "What is the capital of Germany?", TODO: result from NLLB: Какова столица Германии? +# "translation": "Какая столица Германии?", +# "source_lang": "en", "target_lang": "ru"}, + ] + + for translation in translations: + expected = translation["translation"] + actual = translate_input(translation["text"], translation["source_lang"], translation["target_lang"]) + assert expected == actual diff --git a/qanary-component-MT-Python-NLLB/utils/__init__.py b/qanary-component-MT-Python-NLLB/utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/qanary-component-MT-Python-NLLB/utils/lang_utils.py b/qanary-component-MT-Python-NLLB/utils/lang_utils.py new file mode 100644 index 000000000..62193c7f5 --- /dev/null +++ b/qanary-component-MT-Python-NLLB/utils/lang_utils.py @@ -0,0 +1,73 @@ +import os +import logging + + +logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO) + +SOURCE_LANGUAGE = os.getenv("SOURCE_LANGUAGE") +TARGET_LANGUAGE = os.getenv("TARGET_LANGUAGE") +SUPPORTED_LANGS = { +# source: targets + 'en': ['de', 'ru', 'fr', 'es', 'pt'], + 'de': ['en', 'ru', 'fr', 'es', 'pt'], + 'ru': ['en', 'de', 'fr', 'es', 'pt'], + 'fr': ['en', 'de', 
'ru', 'es', 'pt'], + 'es': ['en', 'de', 'ru', 'fr', 'pt'], + 'pt': ['en', 'de', 'ru', 'fr', 'es'] +} + +LANG_CODE_MAP = { + 'en': 'eng_Latn', + 'de': 'deu_Latn', + 'ru': 'rus_Cyrl', + 'fr': 'fra_Latn', + 'es': 'spa_Latn', + 'pt': 'por_Latn' +} + + +def setup_translation_options() -> dict: + """Creates a dictionary of possible source and target languages, based on SUPPORTED_LANGS and configured languages.""" + + logging.info("SETTING UP TRANSLATION OPTIONS") + translation_options = dict() # init empty + + # check if a source language is specified + if SOURCE_LANGUAGE != None and len(SOURCE_LANGUAGE.strip()) > 0: + # pre-select appropriate translation options from the list of supported source languages + try: + translation_options[SOURCE_LANGUAGE] = SUPPORTED_LANGS[SOURCE_LANGUAGE] + # this will fail for invalid keys! + except KeyError: + raise ValueError(f"The source language \"{SOURCE_LANGUAGE}\" is not supported!") + # if no source language is specified, use all source languages that are supported by the models + else: + translation_options = SUPPORTED_LANGS + + # check if a target language is specified + if TARGET_LANGUAGE != None and len(TARGET_LANGUAGE.strip()) > 0: + discard_keys = list() + # remove instances where source == target + translation_options.pop(TARGET_LANGUAGE, None) + for source_lang in translation_options.keys(): + if TARGET_LANGUAGE in translation_options[source_lang]: + translation_options[source_lang] = [TARGET_LANGUAGE] + else: + discard_keys.append(source_lang) + # cleanup keys + translation_options = {sl:tl for sl,tl in translation_options.items() if sl not in discard_keys} + # check for empty translation options, if all keys dropped + if len(translation_options.keys()) == 0: + raise ValueError("The target language \"{tl}\" is not supported for any configured source languages! \nValid language pairs (source: [targets]) are: \n{slk}!" 
+ .format(tl=TARGET_LANGUAGE, slk=SUPPORTED_LANGS)) + # check if only some keys dropped + elif len(discard_keys) > 0: + logging.warning("Specific target language \"{tl}\" is not supported for these source languages: {dk}!. \nThese language pairs will be ignored." + .format(tl=TARGET_LANGUAGE, dk=discard_keys)) + # else do nothing, the lists are already complete + + logging.info(translation_options) + return translation_options + + +translation_options = setup_translation_options() diff --git a/qanary-component-MT-Python-NLLB/utils/model_utils.py b/qanary-component-MT-Python-NLLB/utils/model_utils.py new file mode 100644 index 000000000..e52e5865b --- /dev/null +++ b/qanary-component-MT-Python-NLLB/utils/model_utils.py @@ -0,0 +1,12 @@ +from transformers import AutoModelForSeq2SeqLM, AutoTokenizer + +def load_models_and_tokenizers(): + """Loads models and tokenizers based on configured translation language pairs. + + Parameters: + translation_options (dict): Key is the source language, value is a list of configured target languages + """ + + model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M") + tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M") + return model, tokenizer diff --git a/service_config/files/mt-helsinki b/service_config/files/mt-helsinki new file mode 100644 index 000000000..dbd684a5d --- /dev/null +++ b/service_config/files/mt-helsinki @@ -0,0 +1,4 @@ +SPRING_BOOT_ADMIN_URL=http://demos.swe.htwk-leipzig.de:40111/ +SERVER_HOST=http://demos.swe.htwk-leipzig.de +SERVER_PORT=40193 +SERVICE_DESCRIPTION_COMPONENT="Translates text using Helsinki OpusMT" diff --git a/service_config/files/mt-mbart b/service_config/files/mt-mbart new file mode 100644 index 000000000..2dcbea51b --- /dev/null +++ b/service_config/files/mt-mbart @@ -0,0 +1,4 @@ +SPRING_BOOT_ADMIN_URL=http://demos.swe.htwk-leipzig.de:40111/ +SERVER_HOST=http://demos.swe.htwk-leipzig.de +SERVER_PORT=40196 
+SERVICE_DESCRIPTION_COMPONENT="Translates text using MBart" diff --git a/service_config/files/mt-nllb b/service_config/files/mt-nllb new file mode 100644 index 000000000..612f21886 --- /dev/null +++ b/service_config/files/mt-nllb @@ -0,0 +1,4 @@ +SPRING_BOOT_ADMIN_URL=http://demos.swe.htwk-leipzig.de:40111/ +SERVER_HOST=http://demos.swe.htwk-leipzig.de +SERVER_PORT=40194 +SERVICE_DESCRIPTION_COMPONENT="Translates text using NLLB" diff --git a/service_config/service_config.json b/service_config/service_config.json index b8175481f..9bf727131 100644 --- a/service_config/service_config.json +++ b/service_config/service_config.json @@ -143,8 +143,34 @@ "files": { ".env": "ned-openai-gpt" } + }, + { + "mode": "dockerfile", + "port": "40193:40193", + "image": "qanary/qanary-component-mt-python-helsinkinlp", + "tag": "latest", + "files": { + ".env": "mt-helsinki" + } + }, + { + "mode": "dockerfile", + "port": "40194:40194", + "image": "qanary/qanary-component-mt-python-nllb", + "tag": "latest", + "files": { + ".env": "mt-nllb" + } + }, + { + "mode": "dockerfile", + "port": "40196:40196", + "image": "qanary/qanary-component-mt-python-mbart", + "tag": "latest", + "files": { + ".env": "mt-mbart" + } } - ], "ids": { "0": "qanary-qanary-component-ld-shuyo", @@ -162,6 +188,9 @@ "12": "qanary-qanary-component-tqa-chatgptwrapper", "13": "qanary-qanary-component-qb-dateofdeathdpbedia", "14": "qanary-qanary-component-kg2kg-translateannotationsofinstance", - "15": "qanary-qanary-component-ned-openai-gpt" + "15": "qanary-qanary-component-ned-openai-gpt", + "16": "qanary-qanary-component-mt-python-helsinkinlp", + "17": "qanary-qanary-component-mt-python-nllb", + "18": "qanary-qanary-component-mt-python-mbart" } }