Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implementing text language Translation #12

Merged
merged 48 commits into from
Sep 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
58e2d05
Connecting the hugging face inference model
ConradKash Aug 31, 2023
c26537f
Defined english to multiple function
ConradKash Aug 31, 2023
dd9ba60
Translating Texts
ConradKash Aug 31, 2023
a481aa3
Exponential Backoff
ConradKash Aug 31, 2023
45c939c
Mult - Mult Translation implemented
ConradKash Sep 1, 2023
b0f1dec
Language-ID is active where source language == None
ConradKash Sep 1, 2023
07b6429
Validation and Batch Translation Implemented
ConradKash Sep 2, 2023
e7c5353
FastApi Incoperation
ConradKash Sep 3, 2023
0e2436b
Connecting the hugging face inference model
ConradKash Aug 31, 2023
2d2c823
Defined english to multiple function
ConradKash Aug 31, 2023
db88236
Translating Texts
ConradKash Aug 31, 2023
1755c9b
Exponential Backoff
ConradKash Aug 31, 2023
3f84d2a
Mult - Mult Translation implemented
ConradKash Sep 1, 2023
26a3707
Language-ID is active where source language == None
ConradKash Sep 1, 2023
cf90a14
Validation and Batch Translation Implemented
ConradKash Sep 2, 2023
323314a
FastApi Incoperation
ConradKash Sep 3, 2023
53ddf78
fast Api implementation
ConradKash Sep 4, 2023
5cacdfe
Joblib add
ConradKash Sep 4, 2023
7415bc8
sklearn add
ConradKash Sep 4, 2023
ca62a5f
Model Versions corrected
ConradKash Sep 4, 2023
53de3d8
Merge remote-tracking branch 'origin/imp-text-translation' into imp-t…
ConradKash Sep 4, 2023
91bc556
Merge remote-tracking branch 'origin/imp-text-translation' into imp-t…
ConradKash Sep 4, 2023
cbc45bc
Unknown Source Language Implemented
ConradKash Sep 4, 2023
8cdff6e
Integrated mul-mul into api
ConradKash Sep 4, 2023
6715e20
Integrated long_text > 200 into api
ConradKash Sep 4, 2023
8f19268
Integrated long_text > 200 into api
ConradKash Sep 4, 2023
5494eae
Language ID incoporated
ConradKash Sep 5, 2023
f4d89bc
Done now exponential back off
ConradKash Sep 5, 2023
19a433d
Exponential Backoff
ConradKash Sep 5, 2023
3de6f57
Exponnential Backoff successfully implemented
ConradKash Sep 5, 2023
b523aee
Exponnential Backoff successfully implemente
ConradKash Sep 5, 2023
784fb53
Exponnential Backoff successfully implemente
ConradKash Sep 5, 2023
78efc98
Hugging Face
ConradKash Sep 5, 2023
b4f3554
Mul-mul test
ConradKash Sep 5, 2023
0beacc8
Mul-eng test
ConradKash Sep 5, 2023
b94fc8b
eng - mul test
ConradKash Sep 5, 2023
a4ba829
Source language detect
ConradKash Sep 5, 2023
a04a804
Source language detect
ConradKash Sep 5, 2023
89611c8
Source language detect
ConradKash Sep 5, 2023
3ee9742
Source language detect
ConradKash Sep 5, 2023
5e37193
Source language detect
ConradKash Sep 5, 2023
6e03ce4
Source language detect
ConradKash Sep 5, 2023
ab670e1
pytest mocker
ConradKash Sep 7, 2023
c5b1c35
pytest mocker
ConradKash Sep 7, 2023
b7b8b88
No module Error
ConradKash Sep 7, 2023
46b2ef9
No module Error
ConradKash Sep 7, 2023
42969d0
Fix mock tests
cmplx-xyttmt Sep 8, 2023
221937c
Remove unused variables
cmplx-xyttmt Sep 8, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"python.analysis.extraPaths": ["${sunbird-ai-api-experimental}/"]
}
Empty file added app/__init__.py
Empty file.
23 changes: 23 additions & 0 deletions app/api.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
from fastapi import FastAPI
from app.schemas.tasks import TranslationRequest, TranslationResponse
from app.inference_services.translate import (translate_text,
long_text_translation,
predicted_language)


app = FastAPI()
Expand All @@ -7,3 +11,22 @@
@app.get("/")
def read_root():
    """Return the static greeting served at the API root."""
    greeting = {"Hello": "World"}
    return greeting


@app.post("/translate", response_model=TranslationResponse)
def translate(translation_request: TranslationRequest):
    """Translate the request text, detecting the source language if needed.

    When the request carries no source language (missing or an empty
    string) the language is predicted from the text. That prediction is
    used for the translation itself and is echoed back in the response.
    Texts shorter than 200 characters are translated in one call; longer
    texts go through the chunked translation path.

    Returns:
        A ``TranslationResponse`` holding the translated text and, when
        detection ran, the detected source language (``None`` otherwise).
    """
    text = translation_request.text
    target_language = translation_request.target_language

    # ``detected_language`` stays ``None`` when the caller supplied the
    # source language, so the response only reports what was detected here.
    detected_language = None
    source_language = translation_request.source_language
    if not source_language:
        detected_language = predicted_language(text)
        source_language = detected_language

    # Always pass the effective source language on; forwarding the raw
    # request value would hand an empty string to the translators.
    if len(text) < 200:
        response = translate_text(text, source_language, target_language)
    else:
        response = long_text_translation(text,
                                         source_language,
                                         target_language)

    return TranslationResponse(text=response,
                               source_language=detected_language)
Empty file.
48 changes: 48 additions & 0 deletions app/inference_services/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import requests
import os
import time
from dotenv import load_dotenv
from tenacity import retry, wait_exponential

load_dotenv()


@retry(
    wait=wait_exponential(multiplier=3, min=98, max=120)  # Exponential backoff
)
def inference_request_en_mul(payload, timeout=120):
    """POST ``payload`` to the Sunbird English-to-multilingual model.

    Args:
        payload: JSON-serialisable request body sent to the inference API.
        timeout: Seconds to wait for each HTTP request. Without a timeout
            a stalled connection would block the caller indefinitely.

    Returns:
        The raw response body as text.

    A 503 answer is retried once after sleeping for the ``estimated_time``
    reported in the response body. If that hint is missing or malformed
    the retry happens immediately instead of raising.
    """
    url = 'https://api-inference.huggingface.co/m'\
          'odels/Sunbird/sunbird-en-mul'
    headers_en_mul = {"Authorization": os.getenv("HEADER_HUGGING_FACE_TOKEN")}

    def send_request():
        return requests.post(url, headers=headers_en_mul, json=payload,
                             timeout=timeout)

    response = send_request()
    if response.status_code == 503:
        try:
            estimated_time = max(
                0.0, float(response.json()['estimated_time']))
        except (KeyError, TypeError, ValueError):
            # No usable wait hint in the body: retry straight away.
            estimated_time = 0.0
        time.sleep(estimated_time)
        print(estimated_time)
        response = send_request()
    return response.text


@retry(
    wait=wait_exponential(multiplier=3, min=98, max=120)  # Exponential backoff
)
def inference_request_mul_en(payload, timeout=120):
    """POST ``payload`` to the Sunbird multilingual-to-English model.

    Args:
        payload: JSON-serialisable request body sent to the inference API.
        timeout: Seconds to wait for each HTTP request. Without a timeout
            a stalled connection would block the caller indefinitely.

    Returns:
        The raw response body as text.

    A 503 answer is retried once after sleeping for the ``estimated_time``
    reported in the response body. If that hint is missing or malformed
    the retry happens immediately instead of raising.
    """
    url = 'https://api-inference.huggingface.co/m'\
          'odels/Sunbird/mbart-mul-en'
    headers_mul_en = {"Authorization": os.getenv("HEADER_HUGGING_FACE_TOKEN")}

    def send_request():
        return requests.post(url, headers=headers_mul_en, json=payload,
                             timeout=timeout)

    response = send_request()
    if response.status_code == 503:
        try:
            estimated_time = max(
                0.0, float(response.json()['estimated_time']))
        except (KeyError, TypeError, ValueError):
            # No usable wait hint in the body: retry straight away.
            estimated_time = 0.0
        time.sleep(estimated_time)
        print(estimated_time)
        response = send_request()
    return response.text
14 changes: 14 additions & 0 deletions app/inference_services/model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import joblib
from pathlib import Path

# Directory that holds this module; the pickled pipeline sits next to it.
BASE_DIR = Path(__file__).resolve(strict=True).parent

# Load the pipeline once at import time so predictions can reuse it.
with open(f'{BASE_DIR}/pipeline_MultinomialNB.pkl', mode="rb") as f:
    model = joblib.load(f)


def predicted_language(sentence):
    """Return the label the loaded model predicts for ``sentence``.

    The model is asked to classify a one-element batch and the single
    prediction is returned as a plain string. Indexing the result avoids
    depending on how the prediction array happens to be printed.
    """
    predictions = model.predict([sentence])
    return str(predictions[0])
Binary file not shown.
78 changes: 78 additions & 0 deletions app/inference_services/translate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
from app.inference_services.base import (inference_request_en_mul,
inference_request_mul_en)
from app.inference_services.model import predicted_language


def create_payload_en_mul(text, target_language):
    """Build the request body for the English-to-multilingual model.

    The target language is wrapped in ``>>`` and ``<<`` and placed in
    front of the text under the ``inputs`` key.
    """
    return {"inputs": f">>{target_language!s}<<{text!s}"}


def create_payload_mul_en(text):
    """Build the request body for the multilingual-to-English model.

    The text is converted to a string and stored under the ``inputs`` key.
    """
    return dict(inputs=str(text))


def _extract_generated_text(raw_response):
    """Return ``generated_text`` of the first item in a raw API reply."""
    import json  # local import keeps this block self-contained

    try:
        return json.loads(raw_response)[0]["generated_text"]
    except (ValueError, TypeError, LookupError) as err:
        # Error bodies (for example ``{"error": ...}``) end up here rather
        # than being sliced into a meaningless "translation".
        raise ValueError(
            f"Unexpected translation response: {raw_response!r}"
        ) from err


def translate_text(text, source_language=None, target_language=None):
    """Translate ``text`` from ``source_language`` to ``target_language``.

    Args:
        text: The text to translate.
        source_language: Language code of ``text``. When missing or empty
            it is predicted from the text.
        target_language: Language code to translate into.

    Returns:
        The translated text taken from the model's JSON reply.

    Raises:
        ValueError: If a model reply is not the expected JSON list holding
            a ``generated_text`` entry.

    English sources go straight to the English-to-multilingual model and
    English targets straight to the multilingual-to-English model. Any
    other pair is translated to English first and from English to the
    target language afterwards.
    """
    if not source_language:
        source_language = predicted_language(text)

    if source_language == 'eng':
        payload = create_payload_en_mul(text, target_language)
        raw_translation = inference_request_en_mul(payload)
    elif target_language == 'eng':
        payload = create_payload_mul_en(text)
        raw_translation = inference_request_mul_en(payload)
    else:
        # Neither side is English: pivot through English.
        payload = create_payload_mul_en(text)
        english_text = _extract_generated_text(
            inference_request_mul_en(payload))
        payload = create_payload_en_mul(english_text, target_language)
        raw_translation = inference_request_en_mul(payload)

    return _extract_generated_text(raw_translation)


def create_chunks(text: str, chunk_size: int):
    """Split ``text`` into chunks of roughly ``chunk_size`` characters.

    Chunks end at the last space found at or before the size limit, and
    that space is dropped. When the current window holds no space at all
    the text is cut exactly at the limit and no character is dropped.
    Empty chunks are never returned.

    Args:
        text: The text to split.
        chunk_size: Maximum distance, in characters, between two cuts.

    Returns:
        The list of chunks in order; an empty list for empty ``text``.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    chunks = []
    last_char_index = len(text)
    chunk_start = 0
    chunk_stop = 0

    while chunk_stop != last_char_index:
        chunk_stop = min(chunk_stop + chunk_size, last_char_index)

        skipped = 0
        if chunk_stop != last_char_index:
            # Look for a space only inside the current window. Scanning
            # further back would revisit already emitted text and could
            # loop forever or run off the start of the string.
            split_at = text.rfind(" ", chunk_start, chunk_stop + 1)
            if split_at != -1:
                chunk_stop = split_at
                skipped = 1  # the separating space is not part of a chunk

        chunk = text[chunk_start:chunk_stop]
        if chunk:
            chunks.append(chunk)

        chunk_start = chunk_stop + skipped

    return chunks


def long_text_translation(src_text: str, src_lang: str, trans_lang: str):
    """Translate a long text chunk by chunk and join the results.

    The text is split into chunks with a size of 200, each chunk is
    translated on its own, and the translations are joined with single
    spaces.
    """
    pieces = create_chunks(src_text, chunk_size=200)
    translated_pieces = [
        translate_text(text=piece,
                       target_language=trans_lang,
                       source_language=src_lang)
        for piece in pieces
    ]
    return " ".join(translated_pieces)
Empty file added app/schemas/__init__.py
Empty file.
22 changes: 22 additions & 0 deletions app/schemas/tasks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from pydantic import BaseModel, Field
# from enum import Enum

# TODO: Include Enum
# class Language(str, Enum):
# Acholi = "ach"
# Ateso = "teo"
# English = "eng"
# Luganda = "lug"
# Lugbara = "Lgg"
# Runyankole = "nyn"


class TranslationResponse(BaseModel):
    """Body returned to the client after a translation."""

    # The translated text.
    text: str
    # Source language reported back to the client; defaults to None.
    source_language: str = None


class TranslationRequest(BaseModel):
    """Body a client sends to request a translation."""

    # Language of ``text``; optional, defaults to None.
    source_language: str = None
    # Language to translate into (required).
    target_language: str
    # Text to translate, between 3 and 5000 characters long.
    text: str = Field(min_length=3, max_length=5000)
Binary file not shown.
31 changes: 30 additions & 1 deletion app/tests/test_api.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,39 @@
from fastapi.testclient import TestClient
from app.api import app

from app.inference_services.translate import translate_text, predicted_language
client = TestClient(app)


def test_root_api():
    """The root endpoint answers 200 with the greeting payload."""
    root_response = client.get('/')
    expected_body = {"Hello": "World"}
    assert root_response.status_code == 200
    assert root_response.json() == expected_body


def test_mul_mul(mocker):
    """A non-English pair is translated through English in two requests."""
    fake_response_eng = '[{"generated_text":"Where are we heading?"}]'
    fake_response_mul = '[{"generated_text":"Nituza nkahi?"}]'

    # Queue one reply per request: the first call gets the English pivot
    # and the second gets the final translation. Patching ``requests.post``
    # twice would only keep the last fake reply.
    mocked_post = mocker.patch(
        'requests.post',
        side_effect=[
            mocker.Mock(status_code=200, text=fake_response_eng),
            mocker.Mock(status_code=200, text=fake_response_mul),
        ])

    assert translate_text('Tuli wa', 'lug', 'nyn') == 'Nituza nkahi?'
    assert mocked_post.call_count == 2
    # The second request must carry the English pivot text.
    assert mocked_post.call_args_list[1].kwargs['json'] == {
        "inputs": ">>nyn<<Where are we heading?"}


def test_mul_eng(mocker):
    """A Luganda text with an English target yields the mocked English."""
    canned_reply = '[{"generated_text":"Where are we heading?"}]'
    post_mock = mocker.patch('requests.post')
    post_mock.return_value.text = canned_reply

    translated = translate_text('Tuli wa', 'lug', 'eng')
    assert translated == "Where are we heading?"


def test_eng_mul(mocker):
    """An English text with a Runyankole target yields the mocked reply."""
    canned_reply = '[{"generated_text":"Nituza nkahi?"}]'
    post_mock = mocker.patch('requests.post')
    post_mock.return_value.text = canned_reply

    translated = translate_text('Where are we heading?', 'eng', 'nyn')
    assert translated == "Nituza nkahi?"


def test_language_detect():
    """The bundled model labels the sample sentence as ``nyn``."""
    detected = predicted_language('Ninyisha omuka')
    assert detected == "nyn"
2 changes: 0 additions & 2 deletions pyproject.toml

This file was deleted.

12 changes: 9 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
pre-commit==3.3.3
pydantic<2.0.0
fastapi
uvicorn
pytest
httpx
flake8

# Task 2 - add github actions and badges
coveralls
tenacity==8.2.3
python-dotenv==1.0.0
joblib==1.3.2
# Task 2 - add gitHub actions and badges
coveralls
scikit-learn==1.3.0
requests
pytest-mock~=3.10.0