-
-
Notifications
You must be signed in to change notification settings - Fork 191
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
🔶 Add asyncIO feature for optimization of batch_translate #202
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
from typing import Optional | ||
|
||
import aiohttp | ||
|
||
from deep_translator.exceptions import RequestError, TooManyRequests | ||
|
||
|
||
async def async_get_request(
    session: aiohttp.ClientSession,
    url: str,
    params: Optional[dict] = None,
    proxies: Optional[dict] = None,
):
    """
    send an asynchronous GET request and return the response body as text
    @param session: aiohttp ClientSession used to perform the request
    @param url: url to request
    @param params: optional query parameters sent with the request
    @param proxies: optional requests-style mapping of url scheme to proxy
        url, e.g. {"https": "http://host:port"}
    @return: the response body decoded as text
    @raise TooManyRequests: if the server answers with status 429
    @raise RequestError: if the server answers with any other non-200 status
    """
    # Function-scope import: the module did not import urllib before.
    from urllib.parse import urlsplit

    # aiohttp expects a single proxy url per request instead of a
    # requests-style mapping, so pick the entry matching the url scheme
    # (falling back to an "all" entry when one is configured).
    proxy = None
    if proxies:
        proxy = proxies.get(urlsplit(url).scheme) or proxies.get("all")

    async with session.get(url=url, params=params, proxy=proxy) as response:
        if response.status == 429:
            raise TooManyRequests()

        if response.status != 200:
            raise RequestError()

        # Read the body while the response is still open.
        return await response.text()
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,10 +2,14 @@ | |
|
||
__copyright__ = "Copyright (C) 2020 Nidhal Baccouri" | ||
|
||
import asyncio | ||
from abc import ABC, abstractmethod | ||
from functools import lru_cache | ||
from pathlib import Path | ||
from typing import List, Optional, Union | ||
|
||
import aiohttp | ||
|
||
from deep_translator.constants import GOOGLE_LANGUAGES_TO_CODES | ||
from deep_translator.exceptions import ( | ||
InvalidSourceOrTargetLanguage, | ||
|
@@ -128,6 +132,23 @@ def translate(self, text: str, **kwargs) -> str: | |
""" | ||
return NotImplemented("You need to implement the translate method!") | ||
|
||
@abstractmethod
async def _async_translate(
    self, text: str, session: aiohttp.ClientSession, **kwargs
) -> str:
    """
    translate a text asynchronously and return the translated text;
    concrete translators are expected to override this method
    @param text: text to translate
    @param session: aiohttp ClientSession used for the network request
    @param kwargs: additional arguments
    @return: str
    @raise NotImplementedError: when this base implementation is awaited
    """
    # NOTE: intentionally not wrapped in functools.lru_cache. On an
    # ``async def`` the cache would store the coroutine object (which can
    # be awaited only once) instead of the translation, and it would keep
    # ``self`` and ``session`` alive as part of the cache key.
    #
    # NotImplemented is a comparison sentinel and is not callable, so the
    # proper signal for a missing override is NotImplementedError.
    raise NotImplementedError(
        "You need to implement the _async_translate method!"
    )
|
||
def _read_docx(self, f: str): | ||
import docx2txt | ||
|
||
|
@@ -181,3 +202,14 @@ def _translate_batch(self, batch: List[str], **kwargs) -> List[str]: | |
translated = self.translate(text, **kwargs) | ||
arr.append(translated) | ||
return arr | ||
|
||
async def async_translate_batch(
    self, batch: List[str], **kwargs
) -> List[str]:
    """
    translate a list of texts concurrently over one shared client session
    @param batch: list of texts to translate
    @param kwargs: additional arguments forwarded to _async_translate
    @return: list of translations, in the same order as the input batch
    """
    if not batch:
        # TODO(review): raise a dedicated exception defined in
        # exceptions.py (e.g. NotValidInputBatch) instead of the generic
        # Exception, to stay consistent with the other custom errors.
        raise Exception("Enter your text list that you want to translate")

    # A single session is opened for the whole batch and closed once every
    # translation has finished.
    async with aiohttp.ClientSession() as session:
        translation_tasks = [
            # Forward the caller's keyword arguments to every translation.
            self._async_translate(text, session, **kwargs)
            for text in batch
        ]
        # gather returns the results in the order the awaitables were given.
        return await asyncio.gather(*translation_tasks)
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think you can get a ValueError here if translation_tasks is empty. You may want to add a check for that. |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,11 +4,14 @@ | |
|
||
__copyright__ = "Copyright (C) 2020 Nidhal Baccouri" | ||
|
||
from functools import lru_cache | ||
from typing import List, Optional | ||
|
||
import aiohttp | ||
import requests | ||
from bs4 import BeautifulSoup | ||
|
||
from deep_translator.async_requests import async_get_request | ||
from deep_translator.base import BaseTranslator | ||
from deep_translator.constants import BASE_URLS | ||
from deep_translator.exceptions import ( | ||
|
@@ -120,3 +123,53 @@ def translate_batch(self, batch: List[str], **kwargs) -> List[str]: | |
@return: list of translations | ||
""" | ||
return self._translate_batch(batch, **kwargs) | ||
|
||
async def _async_translate(
    self, text: str, session: aiohttp.ClientSession, **kwargs
):
    """
    asynchronously translate a text: request the translation page through
    the given session and extract the translated text from the html
    @param text: desired text to translate
    @param session: aiohttp ClientSession used to perform the request
    @param kwargs: additional arguments
    @return: the translated text, or None when is_input_valid rejects the
        text without raising
    @raise TranslationNotFound: if no result element is found in the page
    """
    # NOTE: intentionally not wrapped in functools.lru_cache. On an
    # ``async def`` the cache stores coroutine objects (awaitable only
    # once) rather than translations, and an unbounded cache keyed on
    # ``self`` and ``session`` keeps both alive indefinitely.
    # TODO(review): share the parsing logic below with the synchronous
    # translate method instead of duplicating it.
    if not is_input_valid(text):
        return None

    text = text.strip()
    if self._same_source_target() or is_empty(text):
        return text

    self._url_params["tl"] = self._target
    self._url_params["sl"] = self._source
    if self.payload_key:
        self._url_params[self.payload_key] = text

    response_text = await async_get_request(
        session,
        url=self._base_url,
        # Pass a snapshot: sibling coroutines of the same batch write their
        # own text into the shared self._url_params dict.
        params=dict(self._url_params),
        proxies=self.proxies,
    )

    soup = BeautifulSoup(response_text, "html.parser")
    element = soup.find(self._element_tag, self._element_query)
    if not element:
        # Fall back to the alternative query before giving up.
        element = soup.find(self._element_tag, self._alt_element_query)
        if not element:
            raise TranslationNotFound(text)

    translated = element.get_text(strip=True)
    if translated != text:
        return translated

    # The page echoed the input back unchanged.
    if not any(ch.isalnum() for ch in text):
        # No letters or digits to translate; return the text instead of
        # falling through and implicitly returning None.
        return translated

    self._url_params["tl"] = self._target
    if "hl" not in self._url_params:
        return text

    # Retry once without the "hl" parameter. Awaiting the async path keeps
    # the event loop free; the synchronous translate would block it.
    del self._url_params["hl"]
    return await self._async_translate(text, session, **kwargs)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You will need a try/except here when you make this dependency optional, or maybe just add the import inside the function where it will be used. For an example, check out the docx or pypdf dependencies.