Add --mask-dilation-offset
thatDudo committed Oct 23, 2023
1 parent 10fc4d8 commit 49d6aff
Showing 10 changed files with 51 additions and 42 deletions.
18 changes: 10 additions & 8 deletions README.md
@@ -332,6 +332,14 @@ THA: Thai
-v, --verbose Print debug info and save intermediate images in result
folder
-f, --format {png,webp,jpg,xcf,psd,pdf} Output format of the translation.
--attempts ATTEMPTS Retry attempts on encountered error. -1 means infinite
times.
--ignore-errors Skip image on encountered error.
--overwrite Overwrite already translated images in batch mode.
--skip-no-text Skip image without text (Will not be saved).
--model-dir MODEL_DIR Model directory (by default ./models in project root)
--use-cuda Turn on/off cuda
--use-cuda-limited Turn on/off cuda (excluding offline translator)
--detector {default,ctd,craft,none} Text detector used for creating a text mask from an
image, DO NOT use craft for manga, it's not designed
for it
@@ -352,14 +360,6 @@ THA: Thai
image. Note the first translation service acts as
default if the language isnt defined. Example:
--translator-chain "google:JPN;sugoi:ENG".
--use-cuda Turn on/off cuda
--use-cuda-limited Turn on/off cuda (excluding offline translator)
--model-dir MODEL_DIR Model directory (by default ./models in project root)
--attempts ATTEMPTS Retry attempts on encountered error. -1 means infinite
times.
--ignore-errors Skip image on encountered error.
--overwrite Overwrite already translated images in batch mode.
--skip-no-text Skip image without text (Will not be saved).
--revert-upscaling Downscales the previously upscaled image after
translation back to original size (Use with --upscale-
ratio).
@@ -382,6 +382,8 @@ THA: Thai
full image size
--denoise-sigma DENOISE_SIGMA Used by colorizer and affects color strength, range
from 0 to 255 (default 30). -1 turns it off.
--mask-dilation-offset MASK_DILATION_OFFSET By how much to extend the text mask to remove left-over
text pixels of the original image.
--font-size FONT_SIZE Use fixed font size for rendering
--font-size-offset FONT_SIZE_OFFSET Offset font size by a given amount, positive number
increase font size and vice versa
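For reference, a possible invocation using the new flag (a hedged example: the entry point and -i input flag follow the usage shown earlier in this README; the path and the offset value are placeholders):

    python -m manga_translator -v --translator=google -l ENG --use-cuda --mask-dilation-offset 20 -i <path_to_image_folder>

A larger offset grows the refined text mask outward, which helps when thin halos of the original glyphs survive inpainting; 0 (the default) leaves the current behaviour unchanged.
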
18 changes: 10 additions & 8 deletions README_CN.md
@@ -119,6 +119,14 @@ THA: Thai
-v, --verbose Print debug info and save intermediate images in result
folder
-f, --format {png,webp,jpg,xcf,psd,pdf} Output format of the translation.
--attempts ATTEMPTS Retry attempts on encountered error. -1 means infinite
times.
--ignore-errors Skip image on encountered error.
--overwrite Overwrite already translated images in batch mode.
--skip-no-text Skip image without text (Will not be saved).
--model-dir MODEL_DIR Model directory (by default ./models in project root)
--use-cuda Turn on/off cuda
--use-cuda-limited Turn on/off cuda (excluding offline translator)
--detector {default,ctd,craft,none} Text detector used for creating a text mask from an
image, DO NOT use craft for manga, it's not designed
for it
@@ -139,14 +147,6 @@ THA: Thai
image. Note the first translation service acts as
default if the language isnt defined. Example:
--translator-chain "google:JPN;sugoi:ENG".
--use-cuda Turn on/off cuda
--use-cuda-limited Turn on/off cuda (excluding offline translator)
--model-dir MODEL_DIR Model directory (by default ./models in project root)
--attempts ATTEMPTS Retry attempts on encountered error. -1 means infinite
times.
--ignore-errors Skip image on encountered error.
--overwrite Overwrite already translated images in batch mode.
--skip-no-text Skip image without text (Will not be saved).
--revert-upscaling Downscales the previously upscaled image after
translation back to original size (Use with --upscale-
ratio).
@@ -169,6 +169,8 @@ THA: Thai
full image size
--denoise-sigma DENOISE_SIGMA Used by colorizer and affects color strength, range
from 0 to 255 (default 30). -1 turns it off.
--mask-dilation-offset MASK_DILATION_OFFSET By how much to extend the text mask to remove left-over
text pixels of the original image.
--font-size FONT_SIZE Use fixed font size for rendering
--font-size-offset FONT_SIZE_OFFSET Offset font size by a given amount, positive number
increase font size and vice versa
21 changes: 12 additions & 9 deletions manga_translator/args.py
@@ -93,6 +93,16 @@ def _format_action_invocation(self, action: argparse.Action) -> str:
parser.add_argument('-l', '--target-lang', default='CHS', type=str, choices=VALID_LANGUAGES, help='Destination language')
parser.add_argument('-v', '--verbose', action='store_true', help='Print debug info and save intermediate images in result folder')
parser.add_argument('-f', '--format', default=None, choices=OUTPUT_FORMATS, help='Output format of the translation.')
parser.add_argument('--attempts', default=0, type=int, help='Retry attempts on encountered error. -1 means infinite times.')
parser.add_argument('--ignore-errors', action='store_true', help='Skip image on encountered error.')
parser.add_argument('--overwrite', action='store_true', help='Overwrite already translated images in batch mode.')
parser.add_argument('--skip-no-text', action='store_true', help='Skip image without text (Will not be saved).')
parser.add_argument('--model-dir', default=None, type=dir_path, help='Model directory (by default ./models in project root)')

g = parser.add_mutually_exclusive_group()
g.add_argument('--use-cuda', action='store_true', help='Turn on/off cuda')
g.add_argument('--use-cuda-limited', action='store_true', help='Turn on/off cuda (excluding offline translator)')

parser.add_argument('--detector', default='default', type=str, choices=DETECTORS, help='Text detector used for creating a text mask from an image, DO NOT use craft for manga, it\'s not designed for it')
parser.add_argument('--ocr', default='48px_ctc', type=str, choices=OCRS, help='Optical character recognition (OCR) model to use')
parser.add_argument('--inpainter', default='lama_mpe', type=str, choices=INPAINTERS, help='Inpainting model to use')
@@ -105,15 +115,6 @@ def _format_action_invocation(self, action: argparse.Action) -> str:
g.add_argument('--translator-chain', default=None, type=translator_chain, help='Output of one translator goes in another. Example: --translator-chain "google:JPN;sugoi:ENG".')
g.add_argument('--selective-translation', default=None, type=translator_chain, help='Select a translator based on detected language in image. Note the first translation service acts as default if the language isnt defined. Example: --translator-chain "google:JPN;sugoi:ENG".')

g = parser.add_mutually_exclusive_group()
g.add_argument('--use-cuda', action='store_true', help='Turn on/off cuda')
g.add_argument('--use-cuda-limited', action='store_true', help='Turn on/off cuda (excluding offline translator)')

parser.add_argument('--model-dir', default=None, type=dir_path, help='Model directory (by default ./models in project root)')
parser.add_argument('--attempts', default=0, type=int, help='Retry attempts on encountered error. -1 means infinite times.')
parser.add_argument('--ignore-errors', action='store_true', help='Skip image on encountered error.')
parser.add_argument('--overwrite', action='store_true', help='Overwrite already translated images in batch mode.')
parser.add_argument('--skip-no-text', action='store_true', help='Skip image without text (Will not be saved).')
parser.add_argument('--revert-upscaling', action='store_true', help='Downscales the previously upscaled image after translation back to original size (Use with --upscale-ratio).')
parser.add_argument('--detection-size', default=1536, type=int, help='Size of image used for detection')
parser.add_argument('--det-rotate', action='store_true', help='Rotate the image for detection. Might improve detection.')
@@ -127,6 +128,8 @@ def _format_action_invocation(self, action: argparse.Action) -> str:
parser.add_argument('--inpainting-size', default=2048, type=int, help='Size of image used for inpainting (too large will result in OOM)')
parser.add_argument('--colorization-size', default=576, type=int, help='Size of image used for colorization. Set to -1 to use full image size')
parser.add_argument('--denoise-sigma', default=30, type=int, help='Used by colorizer and affects color strength, range from 0 to 255 (default 30). -1 turns it off.')
parser.add_argument('--mask-dilation-offset', default=0, type=int, help='By how much to extend the text mask to remove left-over text pixels of the original image.')

parser.add_argument('--font-size', default=None, type=int, help='Use fixed font size for rendering')
parser.add_argument('--font-size-offset', default=0, type=int, help='Offset font size by a given amount, positive number increase font size and vice versa')
parser.add_argument('--font-size-minimum', default=-1, type=int, help='Minimum output font size. Default is image_sides_sum/200')
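Note that --use-cuda and --use-cuda-limited remain in a mutually exclusive argparse group after the reorder, so passing both still aborts with a parser error. A standalone sketch of that behaviour (flag names and defaults taken from the diff above, everything else illustrative):

    import argparse

    parser = argparse.ArgumentParser()
    g = parser.add_mutually_exclusive_group()
    g.add_argument('--use-cuda', action='store_true')
    g.add_argument('--use-cuda-limited', action='store_true')
    parser.add_argument('--mask-dilation-offset', default=0, type=int)

    print(parser.parse_args(['--use-cuda', '--mask-dilation-offset', '20']))
    # Namespace(mask_dilation_offset=20, use_cuda=True, use_cuda_limited=False)
    # Passing both cuda flags instead would exit with:
    # "argument --use-cuda-limited: not allowed with argument --use-cuda"
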
5 changes: 2 additions & 3 deletions manga_translator/manga_translator.py
@@ -90,7 +90,6 @@ def __init__(self, params: dict = None):
torch.backends.cudnn.allow_tf32 = True

def parse_init_params(self, params: dict):
self.ignore_bubble=int(params.get('ignore_bubble', 0))
self.verbose = params.get('verbose', False)
self.ignore_errors = params.get('ignore_errors', False)

@@ -456,7 +455,7 @@ async def _run_detection(self, ctx: Context):
self.device, self.verbose)

async def _run_ocr(self, ctx: Context):
textlines = await dispatch_ocr(ctx.ocr, ctx.img_rgb, ctx.textlines, self.device, self.verbose, self.ignore_bubble)
textlines = await dispatch_ocr(ctx.ocr, ctx.img_rgb, ctx.textlines, ctx, self.device, self.verbose)

# Filter out regions by original text
new_textlines = []
@@ -517,7 +516,7 @@ async def _run_text_translation(self, ctx: Context):
return new_text_regions

async def _run_mask_refinement(self, ctx: Context):
return await dispatch_mask_refinement(ctx.text_regions, ctx.img_rgb, ctx.mask_raw, 'fit_text', self.verbose,self.ignore_bubble)
return await dispatch_mask_refinement(ctx.text_regions, ctx.img_rgb, ctx.mask_raw, 'fit_text', ctx.mask_dilation_offset, ctx.ignore_bubble, self.verbose)

async def _run_inpainting(self, ctx: Context):
return await dispatch_inpainting(ctx.inpainter, ctx.img_rgb, ctx.mask, ctx.inpainting_size, self.using_cuda, self.verbose)
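Both updated call sites read their settings from ctx rather than from instance attributes, which is why ignore_bubble no longer needs to be parsed in parse_init_params. In this codebase Context behaves like a dict with attribute access, so ctx.mask_dilation_offset resolves to the parsed CLI value; a rough stand-in for illustration only (not the repo's implementation):

    class Context(dict):
        # Attribute reads fall through to dict keys.
        def __getattr__(self, key):
            try:
                return self[key]
            except KeyError:
                raise AttributeError(key)

    ctx = Context(mask_dilation_offset=20, ignore_bubble=0)
    print(ctx.mask_dilation_offset, ctx.ignore_bubble)  # 20 0
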
4 changes: 2 additions & 2 deletions manga_translator/mask_refinement/__init__.py
@@ -6,7 +6,7 @@
from ..utils import TextBlock, Quadrilateral
from ..utils.bubble import is_ignore

async def dispatch(text_regions: List[TextBlock], raw_image: np.ndarray, raw_mask: np.ndarray, method: str = 'fit_text', verbose: bool = False, ignore_bubble: int = 0) -> np.ndarray:
async def dispatch(text_regions: List[TextBlock], raw_image: np.ndarray, raw_mask: np.ndarray, method: str = 'fit_text', dilation_offset: int = 0, ignore_bubble: int = 0, verbose: bool = False) -> np.ndarray:
# Larger sized mask images will probably have crisper and thinner mask segments due to being able to fit the text pixels better
# so we dont want to size them down as much to not loose information
scale_factor = max(min((raw_mask.shape[0] - raw_image.shape[0] / 3) / raw_mask.shape[0], 1), 0.5)
@@ -21,7 +21,7 @@ async def dispatch(text_regions: List[TextBlock], raw_image: np.ndarray, raw_mas
q = Quadrilateral(l * scale_factor, '', 0)
textlines.append(q)

final_mask = complete_mask(img_resized, mask_resized, textlines) if method == 'fit_text' else complete_mask_fill([txtln.aabb.xywh for txtln in textlines])
final_mask = complete_mask(img_resized, mask_resized, textlines, dilation_offset=dilation_offset) if method == 'fit_text' else complete_mask_fill([txtln.aabb.xywh for txtln in textlines])
if final_mask is None:
final_mask = np.zeros((raw_image.shape[0], raw_image.shape[1]), dtype = np.uint8)
else:
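The scale_factor heuristic in the surrounding context (larger raw masks are downsized less, and the factor is clamped to [0.5, 1]) can be sanity-checked with a few made-up heights:

    def scale_factor(mask_h, img_h):
        return max(min((mask_h - img_h / 3) / mask_h, 1), 0.5)

    print(scale_factor(mask_h=1024, img_h=1024))  # ~0.667
    print(scale_factor(mask_h=512,  img_h=1024))  # 0.5 (clamped)
    print(scale_factor(mask_h=4096, img_h=1024))  # ~0.917

dilation_offset itself is simply forwarded to complete_mask() as a keyword argument with a default of 0, so callers that omit it keep the previous behaviour.
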
4 changes: 2 additions & 2 deletions manga_translator/mask_refinement/text_mask_utils.py
@@ -93,7 +93,7 @@ def refine_mask(rgbimg, rawmask):
crf_mask = np.array(res * 255, dtype=np.uint8)
return crf_mask

def complete_mask(img: np.ndarray, mask: np.ndarray, textlines: List[Quadrilateral], keep_threshold = 1e-2):
def complete_mask(img: np.ndarray, mask: np.ndarray, textlines: List[Quadrilateral], keep_threshold = 1e-2, dilation_offset = 0):
bboxes = [txtln.aabb.xywh for txtln in textlines]
polys = [Polygon(txtln.pts) for txtln in textlines]
for (x, y, w, h) in bboxes:
@@ -166,7 +166,7 @@ def complete_mask(img: np.ndarray, mask: np.ndarray, textlines: List[Quadrilater
text_size = min(w1, h1, textlines[i].font_size)
x1, y1, w1, h1 = extend_rect(x1, y1, w1, h1, img.shape[1], img.shape[0], int(text_size * 0.1))
# TODO: Was text_size * 0.3 before. Need to think of better way to determine dilate_size.
dilate_size = max((int(text_size * 0.1) // 2) * 2 + 1, 3)
dilate_size = max((int((text_size + dilation_offset) * 0.1) // 2) * 2 + 1, 3)
# print(textlines[i].font_size, min(w1, h1), dilate_size)
kern = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (dilate_size, dilate_size))
cc_region = np.ascontiguousarray(cc[y1: y1 + h1, x1: x1 + w1])
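To make the effect of dilation_offset concrete, the kernel-size formula can be evaluated on its own (the formula is copied from the diff above; the text sizes below are made-up examples):

    def dilate_size(text_size, dilation_offset=0):
        # Kernel side length must be odd and at least 3.
        return max((int((text_size + dilation_offset) * 0.1) // 2) * 2 + 1, 3)

    for offset in (0, 10, 30):
        print(offset, [dilate_size(ts, offset) for ts in (20, 40, 80)])
    # 0  [3, 5, 9]
    # 10 [3, 5, 9]
    # 30 [5, 7, 11]

Because the offset is scaled by the same 0.1 factor as text_size, small offsets may not change the (odd-rounded) kernel at all, while larger ones grow it by a few pixels across every text region.
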
5 changes: 3 additions & 2 deletions manga_translator/ocr/__init__.py
@@ -26,8 +26,9 @@ async def prepare(ocr_key: str, device: str = 'cpu'):
await ocr.download()
await ocr.load(device)

async def dispatch(ocr_key: str, image: np.ndarray, regions: List[Quadrilateral], device: str = 'cpu', verbose: bool = False,ignore_bubble: int = 0) -> List[Quadrilateral]:
async def dispatch(ocr_key: str, image: np.ndarray, regions: List[Quadrilateral], args = None, device: str = 'cpu', verbose: bool = False) -> List[Quadrilateral]:
ocr = get_ocr(ocr_key)
if isinstance(ocr, OfflineOCR):
await ocr.load(device)
return await ocr.recognize(image, regions, verbose, ignore_bubble)
args = args or {}
return await ocr.recognize(image, regions, args, verbose)
8 changes: 4 additions & 4 deletions manga_translator/ocr/common.py
@@ -37,15 +37,15 @@ def _generate_text_direction(self, bboxes: List[Union[Quadrilateral, TextBlock]]
for node in nodes:
yield bboxes[node], majority_dir

async def recognize(self, image: np.ndarray, textlines: List[Quadrilateral], verbose: bool = False, ignore_bubble: int = 0) -> List[Quadrilateral]:
async def recognize(self, image: np.ndarray, textlines: List[Quadrilateral], args: dict, verbose: bool = False) -> List[Quadrilateral]:
'''
Performs the optical character recognition, using the `textlines` as areas of interests.
Returns a `textlines` list with the `textline.text` property set to the detected text string.
'''
return await self._recognize(image, textlines, verbose, ignore_bubble)
return await self._recognize(image, textlines, args, verbose)

@abstractmethod
async def _recognize(self, image: np.ndarray, textlines: List[Quadrilateral], verbose: bool = False, ingore_bubble: int = 0) -> List[Quadrilateral]:
async def _recognize(self, image: np.ndarray, textlines: List[Quadrilateral], args: dict, verbose: bool = False) -> List[Quadrilateral]:
pass


@@ -56,5 +56,5 @@ async def _recognize(self, *args, **kwargs):
return await self.infer(*args, **kwargs)

@abstractmethod
async def _infer(self, image: np.ndarray, textlines: List[Quadrilateral], verbose: bool = False, ignore_bubble: int = 0) -> List[Quadrilateral]:
async def _infer(self, image: np.ndarray, textlines: List[Quadrilateral], args: dict, verbose: bool = False) -> List[Quadrilateral]:
pass
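
The args parameter threaded through recognize()/_recognize() is a plain dict-like object (in practice the Context built from the parsed CLI arguments, as passed by dispatch_ocr above); implementations read optional settings from it with a default, and dispatch() substitutes an empty dict when nothing is passed so older call sites keep working. A minimal sketch of the pattern (names and values illustrative):

    def read_ocr_args(args=None):
        args = args or {}
        return args.get('ignore_bubble', 0)

    print(read_ocr_args())                       # 0 (default)
    print(read_ocr_args({'ignore_bubble': 25}))  # 25
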
5 changes: 3 additions & 2 deletions manga_translator/ocr/model_32px.py
@@ -49,10 +49,11 @@ async def _load(self, device: str):

async def _unload(self):
del self.model
async def _infer(self, image: np.ndarray, textlines: List[Quadrilateral], verbose: bool = False, ignore_bubble: int = 0) -> List[TextBlock]:

async def _infer(self, image: np.ndarray, textlines: List[Quadrilateral], args: dict, verbose: bool = False) -> List[TextBlock]:
text_height = 32
max_chunk_size = 16
ignore_bubble = args.get('ignore_bubble', 0)

quadrilaterals = list(self._generate_text_direction(textlines))
region_imgs = [q.get_transformed_region(image, d, text_height) for q, d in quadrilaterals]
5 changes: 3 additions & 2 deletions manga_translator/ocr/model_48px_ctc.py
@@ -53,9 +53,10 @@ async def _load(self, device: str):
async def _unload(self):
del self.model

async def _infer(self, image: np.ndarray, textlines: List[Quadrilateral], verbose: bool = False, ignore_bubble: int = 0) -> List[TextBlock]:
async def _infer(self, image: np.ndarray, textlines: List[Quadrilateral], args: dict, verbose: bool = False) -> List[TextBlock]:
text_height = 48
max_chunk_size = 16
ignore_bubble = args.get('ignore_bubble', 0)

quadrilaterals = list(self._generate_text_direction(textlines))
region_imgs = [q.get_transformed_region(image, d, text_height) for q, d in quadrilaterals]
@@ -79,7 +80,7 @@ async def _infer(self, image: np.ndarray, textlines: List[Quadrilateral], verbos
W = region_imgs[idx].shape[1]
tmp = region_imgs[idx]
# Determine whether to skip the text block, and return True to skip.
if ignore_bubble >=1 and ignore_bubble <=50 and is_ignore(region_imgs[idx], ignore_bubble):
if ignore_bubble >=1 and ignore_bubble <=50 and is_ignore(region_imgs[idx], ignore_bubble):
ix+=1
continue
region[i, :, : W, :]=tmp
