From 49d6affdb6e9ef531b4948200ede455a0308e167 Mon Sep 17 00:00:00 2001
From: thatDudo
Date: Mon, 23 Oct 2023 04:52:30 +0200
Subject: [PATCH] Add --mask-dilation-offset

---
 README.md                                    | 18 +++++++++-------
 README_CN.md                                 | 18 +++++++++-------
 manga_translator/args.py                     | 21 +++++++++++--------
 manga_translator/manga_translator.py         |  5 ++---
 manga_translator/mask_refinement/__init__.py |  4 ++--
 .../mask_refinement/text_mask_utils.py       |  4 ++--
 manga_translator/ocr/__init__.py             |  5 +++--
 manga_translator/ocr/common.py               |  8 +++----
 manga_translator/ocr/model_32px.py           |  5 +++--
 manga_translator/ocr/model_48px_ctc.py       |  5 +++--
 10 files changed, 51 insertions(+), 42 deletions(-)

diff --git a/README.md b/README.md
index b6489457c..b22fd7834 100644
--- a/README.md
+++ b/README.md
@@ -332,6 +332,14 @@ THA: Thai
 -v, --verbose         Print debug info and save intermediate images in result folder
 -f, --format {png,webp,jpg,xcf,psd,pdf} Output format of the translation.
+--attempts ATTEMPTS   Retry attempts on encountered error. -1 means infinite
+                      times.
+--ignore-errors       Skip image on encountered error.
+--overwrite           Overwrite already translated images in batch mode.
+--skip-no-text        Skip image without text (Will not be saved).
+--model-dir MODEL_DIR Model directory (by default ./models in project root)
+--use-cuda            Turn on/off cuda
+--use-cuda-limited    Turn on/off cuda (excluding offline translator)
 --detector {default,ctd,craft,none}
                       Text detector used for creating a text mask from an
                       image, DO NOT use craft for manga, it's not designed
                       for it
@@ -352,14 +360,6 @@ THA: Thai
                       image. Note the first translation service acts as
                       default if the language isnt defined. Example:
                       --translator-chain "google:JPN;sugoi:ENG".
---use-cuda            Turn on/off cuda
---use-cuda-limited    Turn on/off cuda (excluding offline translator)
---model-dir MODEL_DIR Model directory (by default ./models in project root)
---attempts ATTEMPTS   Retry attempts on encountered error. -1 means infinite
-                      times.
---ignore-errors       Skip image on encountered error.
---overwrite           Overwrite already translated images in batch mode.
---skip-no-text        Skip image without text (Will not be saved).
 --revert-upscaling    Downscales the previously upscaled image after
                       translation back to original size (Use with --upscale-
                       ratio).
@@ -382,6 +382,8 @@ THA: Thai
                       full image size
 --denoise-sigma DENOISE_SIGMA Used by colorizer and affects color strength, range
                       from 0 to 255 (default 30). -1 turns it off.
+--mask-dilation-offset MASK_DILATION_OFFSET By how much to extend the text mask to remove left-over
+                      text pixels of the original image.
 --font-size FONT_SIZE Use fixed font size for rendering
 --font-size-offset FONT_SIZE_OFFSET Offset font size by a given amount, positive number
                       increase font size and vice versa
diff --git a/README_CN.md b/README_CN.md
index 90de5c67a..e83fd761f 100644
--- a/README_CN.md
+++ b/README_CN.md
@@ -119,6 +119,14 @@ THA: Thai
 -v, --verbose         Print debug info and save intermediate images in result folder
 -f, --format {png,webp,jpg,xcf,psd,pdf} Output format of the translation.
+--attempts ATTEMPTS   Retry attempts on encountered error. -1 means infinite
+                      times.
+--ignore-errors       Skip image on encountered error.
+--overwrite           Overwrite already translated images in batch mode.
+--skip-no-text        Skip image without text (Will not be saved).
+--model-dir MODEL_DIR Model directory (by default ./models in project root)
+--use-cuda            Turn on/off cuda
+--use-cuda-limited    Turn on/off cuda (excluding offline translator)
 --detector {default,ctd,craft,none}
                       Text detector used for creating a text mask from an
                       image, DO NOT use craft for manga, it's not designed
                       for it
@@ -139,14 +147,6 @@ THA: Thai
                       image. Note the first translation service acts as
                       default if the language isnt defined. Example:
                       --translator-chain "google:JPN;sugoi:ENG".
---use-cuda            Turn on/off cuda
---use-cuda-limited    Turn on/off cuda (excluding offline translator)
---model-dir MODEL_DIR Model directory (by default ./models in project root)
---attempts ATTEMPTS   Retry attempts on encountered error. -1 means infinite
-                      times.
---ignore-errors       Skip image on encountered error.
---overwrite           Overwrite already translated images in batch mode.
---skip-no-text        Skip image without text (Will not be saved).
 --revert-upscaling    Downscales the previously upscaled image after
                       translation back to original size (Use with --upscale-
                       ratio).
@@ -169,6 +169,8 @@ THA: Thai
                       full image size
 --denoise-sigma DENOISE_SIGMA Used by colorizer and affects color strength,
                       range from 0 to 255 (default 30).
+--mask-dilation-offset MASK_DILATION_OFFSET By how much to extend the text mask to remove left-over
+                      text pixels of the original image.
 --font-size FONT_SIZE Use fixed font size for rendering
 --font-size-offset FONT_SIZE_OFFSET Offset font size by a given amount, positive number
                       increase font size and vice versa
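Usage note: the new flag defaults to 0 (no extra dilation) and accepts any integer, so leftover glyph edges after inpainting can be handled by raising it gradually. A hedged invocation sketch — the module entry point and every flag other than --mask-dilation-offset follow the project's existing README examples and are not introduced by this patch:

    python -m manga_translator -l ENG --mask-dilation-offset 20 [plus the usual input/translator options]
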
diff --git a/manga_translator/args.py b/manga_translator/args.py
index 2b5035b5e..2b37554af 100644
--- a/manga_translator/args.py
+++ b/manga_translator/args.py
@@ -93,6 +93,16 @@ def _format_action_invocation(self, action: argparse.Action) -> str:
 parser.add_argument('-l', '--target-lang', default='CHS', type=str, choices=VALID_LANGUAGES, help='Destination language')
 parser.add_argument('-v', '--verbose', action='store_true', help='Print debug info and save intermediate images in result folder')
 parser.add_argument('-f', '--format', default=None, choices=OUTPUT_FORMATS, help='Output format of the translation.')
+parser.add_argument('--attempts', default=0, type=int, help='Retry attempts on encountered error. -1 means infinite times.')
+parser.add_argument('--ignore-errors', action='store_true', help='Skip image on encountered error.')
+parser.add_argument('--overwrite', action='store_true', help='Overwrite already translated images in batch mode.')
+parser.add_argument('--skip-no-text', action='store_true', help='Skip image without text (Will not be saved).')
+parser.add_argument('--model-dir', default=None, type=dir_path, help='Model directory (by default ./models in project root)')
+
+g = parser.add_mutually_exclusive_group()
+g.add_argument('--use-cuda', action='store_true', help='Turn on/off cuda')
+g.add_argument('--use-cuda-limited', action='store_true', help='Turn on/off cuda (excluding offline translator)')
+
 parser.add_argument('--detector', default='default', type=str, choices=DETECTORS, help='Text detector used for creating a text mask from an image, DO NOT use craft for manga, it\'s not designed for it')
 parser.add_argument('--ocr', default='48px_ctc', type=str, choices=OCRS, help='Optical character recognition (OCR) model to use')
 parser.add_argument('--inpainter', default='lama_mpe', type=str, choices=INPAINTERS, help='Inpainting model to use')
@@ -105,15 +115,6 @@ def _format_action_invocation(self, action: argparse.Action) -> str:
 g.add_argument('--translator-chain', default=None, type=translator_chain, help='Output of one translator goes in another. Example: --translator-chain "google:JPN;sugoi:ENG".')
 g.add_argument('--selective-translation', default=None, type=translator_chain, help='Select a translator based on detected language in image. Note the first translation service acts as default if the language isnt defined. Example: --translator-chain "google:JPN;sugoi:ENG".')
 
-g = parser.add_mutually_exclusive_group()
-g.add_argument('--use-cuda', action='store_true', help='Turn on/off cuda')
-g.add_argument('--use-cuda-limited', action='store_true', help='Turn on/off cuda (excluding offline translator)')
-
-parser.add_argument('--model-dir', default=None, type=dir_path, help='Model directory (by default ./models in project root)')
-parser.add_argument('--attempts', default=0, type=int, help='Retry attempts on encountered error. -1 means infinite times.')
-parser.add_argument('--ignore-errors', action='store_true', help='Skip image on encountered error.')
-parser.add_argument('--overwrite', action='store_true', help='Overwrite already translated images in batch mode.')
-parser.add_argument('--skip-no-text', action='store_true', help='Skip image without text (Will not be saved).')
 parser.add_argument('--revert-upscaling', action='store_true', help='Downscales the previously upscaled image after translation back to original size (Use with --upscale-ratio).')
 parser.add_argument('--detection-size', default=1536, type=int, help='Size of image used for detection')
 parser.add_argument('--det-rotate', action='store_true', help='Rotate the image for detection. Might improve detection.')
@@ -127,6 +128,8 @@ def _format_action_invocation(self, action: argparse.Action) -> str:
 parser.add_argument('--inpainting-size', default=2048, type=int, help='Size of image used for inpainting (too large will result in OOM)')
 parser.add_argument('--colorization-size', default=576, type=int, help='Size of image used for colorization. Set to -1 to use full image size')
 parser.add_argument('--denoise-sigma', default=30, type=int, help='Used by colorizer and affects color strength, range from 0 to 255 (default 30). -1 turns it off.')
+parser.add_argument('--mask-dilation-offset', default=0, type=int, help='By how much to extend the text mask to remove left-over text pixels of the original image.')
+
 parser.add_argument('--font-size', default=None, type=int, help='Use fixed font size for rendering')
 parser.add_argument('--font-size-offset', default=0, type=int, help='Offset font size by a given amount, positive number increase font size and vice versa')
 parser.add_argument('--font-size-minimum', default=-1, type=int, help='Minimum output font size. Default is image_sides_sum/200')
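Note: the relocated block above relies on two argparse behaviors worth spelling out: --use-cuda and --use-cuda-limited are mutually exclusive, and --mask-dilation-offset parses as an int defaulting to 0. A minimal standalone sketch (not the project's parser):

    import argparse

    parser = argparse.ArgumentParser()
    g = parser.add_mutually_exclusive_group()
    g.add_argument('--use-cuda', action='store_true')
    g.add_argument('--use-cuda-limited', action='store_true')
    parser.add_argument('--mask-dilation-offset', default=0, type=int)

    args = parser.parse_args(['--use-cuda', '--mask-dilation-offset', '20'])
    assert args.use_cuda and args.mask_dilation_offset == 20

    # Passing both CUDA flags raises SystemExit via argparse's
    # "not allowed with argument" error:
    # parser.parse_args(['--use-cuda', '--use-cuda-limited'])
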
diff --git a/manga_translator/manga_translator.py b/manga_translator/manga_translator.py
index b3f0095d6..81b4b659c 100644
--- a/manga_translator/manga_translator.py
+++ b/manga_translator/manga_translator.py
@@ -90,7 +90,6 @@ def __init__(self, params: dict = None):
         torch.backends.cudnn.allow_tf32 = True
 
     def parse_init_params(self, params: dict):
-        self.ignore_bubble=int(params.get('ignore_bubble', 0))
         self.verbose = params.get('verbose', False)
         self.ignore_errors = params.get('ignore_errors', False)
@@ -456,7 +455,7 @@ async def _run_detection(self, ctx: Context):
                                        self.device, self.verbose)
 
     async def _run_ocr(self, ctx: Context):
-        textlines = await dispatch_ocr(ctx.ocr, ctx.img_rgb, ctx.textlines, self.device, self.verbose, self.ignore_bubble)
+        textlines = await dispatch_ocr(ctx.ocr, ctx.img_rgb, ctx.textlines, ctx, self.device, self.verbose)
 
         # Filter out regions by original text
         new_textlines = []
@@ -517,7 +516,7 @@ async def _run_text_translation(self, ctx: Context):
         return new_text_regions
 
     async def _run_mask_refinement(self, ctx: Context):
-        return await dispatch_mask_refinement(ctx.text_regions, ctx.img_rgb, ctx.mask_raw, 'fit_text', self.verbose,self.ignore_bubble)
+        return await dispatch_mask_refinement(ctx.text_regions, ctx.img_rgb, ctx.mask_raw, 'fit_text', ctx.mask_dilation_offset, ctx.ignore_bubble, self.verbose)
 
     async def _run_inpainting(self, ctx: Context):
         return await dispatch_inpainting(ctx.inpainter, ctx.img_rgb, ctx.mask, ctx.inpainting_size, self.using_cuda, self.verbose)
diff --git a/manga_translator/mask_refinement/__init__.py b/manga_translator/mask_refinement/__init__.py
index 811abceb0..12bb7917b 100644
--- a/manga_translator/mask_refinement/__init__.py
+++ b/manga_translator/mask_refinement/__init__.py
@@ -6,7 +6,7 @@
 from ..utils import TextBlock, Quadrilateral
 from ..utils.bubble import is_ignore
 
-async def dispatch(text_regions: List[TextBlock], raw_image: np.ndarray, raw_mask: np.ndarray, method: str = 'fit_text', verbose: bool = False, ignore_bubble: int = 0) -> np.ndarray:
+async def dispatch(text_regions: List[TextBlock], raw_image: np.ndarray, raw_mask: np.ndarray, method: str = 'fit_text', dilation_offset: int = 0, ignore_bubble: int = 0, verbose: bool = False) -> np.ndarray:
     # Larger sized mask images will probably have crisper and thinner mask segments due to being able to fit the text pixels better
     # so we dont want to size them down as much to not loose information
     scale_factor = max(min((raw_mask.shape[0] - raw_image.shape[0] / 3) / raw_mask.shape[0], 1), 0.5)
@@ -21,7 +21,7 @@ async def dispatch(text_regions: List[TextBlock], raw_image: np.ndarray, raw_mas
             q = Quadrilateral(l * scale_factor, '', 0)
             textlines.append(q)
 
-    final_mask = complete_mask(img_resized, mask_resized, textlines) if method == 'fit_text' else complete_mask_fill([txtln.aabb.xywh for txtln in textlines])
+    final_mask = complete_mask(img_resized, mask_resized, textlines, dilation_offset=dilation_offset) if method == 'fit_text' else complete_mask_fill([txtln.aabb.xywh for txtln in textlines])
     if final_mask is None:
         final_mask = np.zeros((raw_image.shape[0], raw_image.shape[1]), dtype = np.uint8)
     else:
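Note: the dispatch() signature above inserts dilation_offset and ignore_bubble ahead of verbose, so call sites passing positionally had to be updated in lockstep (as _run_mask_refinement is, earlier in this patch). A self-contained sketch of the pitfall, using a simplified stand-in rather than the project's function:

    import asyncio

    # Old order: (..., method, verbose, ignore_bubble)
    # New order: (..., method, dilation_offset, ignore_bubble, verbose)
    async def dispatch(method='fit_text', dilation_offset=0, ignore_bubble=0, verbose=False):
        return dilation_offset, ignore_bubble, verbose

    # A stale caller still passing verbose in the old slot would silently feed
    # a bool into dilation_offset (True == 1) rather than raise an error:
    assert asyncio.run(dispatch('fit_text', True)) == (True, 0, False)

    # The updated call site threads the new ctx values through explicitly:
    assert asyncio.run(dispatch('fit_text', 16, 0, False)) == (16, 0, False)
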
diff --git a/manga_translator/mask_refinement/text_mask_utils.py b/manga_translator/mask_refinement/text_mask_utils.py
index 9f9350778..a556d6ef7 100644
--- a/manga_translator/mask_refinement/text_mask_utils.py
+++ b/manga_translator/mask_refinement/text_mask_utils.py
@@ -93,7 +93,7 @@ def refine_mask(rgbimg, rawmask):
     crf_mask = np.array(res * 255, dtype=np.uint8)
     return crf_mask
 
-def complete_mask(img: np.ndarray, mask: np.ndarray, textlines: List[Quadrilateral], keep_threshold = 1e-2):
+def complete_mask(img: np.ndarray, mask: np.ndarray, textlines: List[Quadrilateral], keep_threshold = 1e-2, dilation_offset = 0):
     bboxes = [txtln.aabb.xywh for txtln in textlines]
     polys = [Polygon(txtln.pts) for txtln in textlines]
     for (x, y, w, h) in bboxes:
@@ -166,7 +166,7 @@ def complete_mask(img: np.ndarray, mask: np.ndarray, textlines: List[Quadrilater
         text_size = min(w1, h1, textlines[i].font_size)
         x1, y1, w1, h1 = extend_rect(x1, y1, w1, h1, img.shape[1], img.shape[0], int(text_size * 0.1))
         # TODO: Was text_size * 0.3 before. Need to think of better way to determine dilate_size.
-        dilate_size = max((int(text_size * 0.1) // 2) * 2 + 1, 3)
+        dilate_size = max((int((text_size + dilation_offset) * 0.1) // 2) * 2 + 1, 3)
         # print(textlines[i].font_size, min(w1, h1), dilate_size)
         kern = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (dilate_size, dilate_size))
         cc_region = np.ascontiguousarray(cc[y1: y1 + h1, x1: x1 + w1])
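Note on the updated kernel-size formula: the offset is added to text_size before the same 0.1 scaling, so --mask-dilation-offset 10 enlarges the pre-rounding size by roughly one pixel (and a negative offset shrinks it); the result is then bumped to the nearest odd value (cv2 structuring elements want odd sizes) with a floor of 3. A standalone restatement of the one-liner above, with worked values:

    def dilate_size(text_size: int, dilation_offset: int = 0) -> int:
        # Same expression as the + line above, factored out for illustration.
        return max((int((text_size + dilation_offset) * 0.1) // 2) * 2 + 1, 3)

    assert dilate_size(30) == 3        # int(3.0) = 3, already odd, at the floor
    assert dilate_size(60) == 7        # int(6.0) = 6 -> bumped to odd 7
    assert dilate_size(60, 40) == 11   # offset 40: int(10.0) = 10 -> 11
    assert dilate_size(5) == 3         # tiny text clamps to the minimum of 3
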
diff --git a/manga_translator/ocr/__init__.py b/manga_translator/ocr/__init__.py
index 29948237b..ba5fa6082 100644
--- a/manga_translator/ocr/__init__.py
+++ b/manga_translator/ocr/__init__.py
@@ -26,8 +26,9 @@ async def prepare(ocr_key: str, device: str = 'cpu'):
         await ocr.download()
         await ocr.load(device)
 
-async def dispatch(ocr_key: str, image: np.ndarray, regions: List[Quadrilateral], device: str = 'cpu', verbose: bool = False,ignore_bubble: int = 0) -> List[Quadrilateral]:
+async def dispatch(ocr_key: str, image: np.ndarray, regions: List[Quadrilateral], args = None, device: str = 'cpu', verbose: bool = False) -> List[Quadrilateral]:
     ocr = get_ocr(ocr_key)
     if isinstance(ocr, OfflineOCR):
         await ocr.load(device)
-    return await ocr.recognize(image, regions, verbose, ignore_bubble)
+    args = args or {}
+    return await ocr.recognize(image, regions, args, verbose)
diff --git a/manga_translator/ocr/common.py b/manga_translator/ocr/common.py
index 609484454..1c056195f 100644
--- a/manga_translator/ocr/common.py
+++ b/manga_translator/ocr/common.py
@@ -37,15 +37,15 @@ def _generate_text_direction(self, bboxes: List[Union[Quadrilateral, TextBlock]]
             for node in nodes:
                 yield bboxes[node], majority_dir
 
-    async def recognize(self, image: np.ndarray, textlines: List[Quadrilateral], verbose: bool = False, ignore_bubble: int = 0) -> List[Quadrilateral]:
+    async def recognize(self, image: np.ndarray, textlines: List[Quadrilateral], args: dict, verbose: bool = False) -> List[Quadrilateral]:
         '''
         Performs the optical character recognition, using the `textlines`
         as areas of interests.
         Returns a `textlines` list with the `textline.text` property set to
         the detected text string.
         '''
-        return await self._recognize(image, textlines, verbose, ignore_bubble)
+        return await self._recognize(image, textlines, args, verbose)
 
     @abstractmethod
-    async def _recognize(self, image: np.ndarray, textlines: List[Quadrilateral], verbose: bool = False, ingore_bubble: int = 0) -> List[Quadrilateral]:
+    async def _recognize(self, image: np.ndarray, textlines: List[Quadrilateral], args: dict, verbose: bool = False) -> List[Quadrilateral]:
         pass
@@ -56,5 +56,5 @@ async def _recognize(self, *args, **kwargs):
         return await self.infer(*args, **kwargs)
 
     @abstractmethod
-    async def _infer(self, image: np.ndarray, textlines: List[Quadrilateral], verbose: bool = False, ignore_bubble: int = 0) -> List[Quadrilateral]:
+    async def _infer(self, image: np.ndarray, textlines: List[Quadrilateral], args: dict, verbose: bool = False) -> List[Quadrilateral]:
         pass
diff --git a/manga_translator/ocr/model_32px.py b/manga_translator/ocr/model_32px.py
index 79a4561fa..f0530a6ef 100644
--- a/manga_translator/ocr/model_32px.py
+++ b/manga_translator/ocr/model_32px.py
@@ -49,10 +49,11 @@ async def _load(self, device: str):
 
     async def _unload(self):
         del self.model
-
-    async def _infer(self, image: np.ndarray, textlines: List[Quadrilateral], verbose: bool = False, ignore_bubble: int = 0) -> List[TextBlock]:
+
+    async def _infer(self, image: np.ndarray, textlines: List[Quadrilateral], args: dict, verbose: bool = False) -> List[TextBlock]:
         text_height = 32
         max_chunk_size = 16
+        ignore_bubble = args.get('ignore_bubble', 0)
 
         quadrilaterals = list(self._generate_text_direction(textlines))
         region_imgs = [q.get_transformed_region(image, d, text_height) for q, d in quadrilaterals]
diff --git a/manga_translator/ocr/model_48px_ctc.py b/manga_translator/ocr/model_48px_ctc.py
index 41a4795a1..9cf8b96bf 100644
--- a/manga_translator/ocr/model_48px_ctc.py
+++ b/manga_translator/ocr/model_48px_ctc.py
@@ -53,9 +53,10 @@ async def _load(self, device: str):
     async def _unload(self):
         del self.model
 
-    async def _infer(self, image: np.ndarray, textlines: List[Quadrilateral], verbose: bool = False, ignore_bubble: int = 0) -> List[TextBlock]:
+    async def _infer(self, image: np.ndarray, textlines: List[Quadrilateral], args: dict, verbose: bool = False) -> List[TextBlock]:
         text_height = 48
         max_chunk_size = 16
+        ignore_bubble = args.get('ignore_bubble', 0)
 
         quadrilaterals = list(self._generate_text_direction(textlines))
         region_imgs = [q.get_transformed_region(image, d, text_height) for q, d in quadrilaterals]
@@ -79,7 +80,7 @@ async def _infer(self, image: np.ndarray, textlines: List[Quadrilateral], verbos
                 W = region_imgs[idx].shape[1]
                 tmp = region_imgs[idx]
                 # Determine whether to skip the text block, and return True to skip.
-                if ignore_bubble >=1 and ignore_bubble <=50 and is_ignore(region_imgs[idx], ignore_bubble): 
+                if ignore_bubble >=1 and ignore_bubble <=50 and is_ignore(region_imgs[idx], ignore_bubble):
                     ix+=1
                     continue
                 region[i, :, : W, :]=tmp
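Note: end to end, the OCR path now threads a dict-like args object (the call site passes ctx) instead of a bare ignore_bubble int: dispatch() normalizes a missing value with `args = args or {}`, and each model reads its own options via .get(). A minimal sketch of that flow, which assumes (as the patch does) that the Context object passed in supports dict-style .get():

    import asyncio

    async def _infer(image, textlines, args: dict, verbose: bool = False):
        # Mirrors model_32px/model_48px_ctc: option lookup with a safe default.
        return args.get('ignore_bubble', 0)

    async def dispatch(image, textlines, args=None, verbose=False):
        args = args or {}  # None -> {} so _infer can call .get() unconditionally
        return await _infer(image, textlines, args, verbose)

    assert asyncio.run(dispatch(None, [])) == 0                          # default
    assert asyncio.run(dispatch(None, [], {'ignore_bubble': 25})) == 25  # from ctx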