Add --mask-dilation-offset
thatDudo committed Oct 23, 2023
1 parent 10fc4d8 commit 49d6aff
Showing 10 changed files with 51 additions and 42 deletions.
18 changes: 10 additions & 8 deletions README.md
@@ -332,6 +332,14 @@ THA: Thai
-v, --verbose Print debug info and save intermediate images in result
folder
-f, --format {png,webp,jpg,xcf,psd,pdf} Output format of the translation.
--attempts ATTEMPTS Retry attempts on encountered error. -1 means infinite
times.
--ignore-errors Skip image on encountered error.
--overwrite Overwrite already translated images in batch mode.
--skip-no-text Skip image without text (Will not be saved).
--model-dir MODEL_DIR Model directory (by default ./models in project root)
--use-cuda Turn on/off cuda
--use-cuda-limited Turn on/off cuda (excluding offline translator)
--detector {default,ctd,craft,none} Text detector used for creating a text mask from an
image, DO NOT use craft for manga, it's not designed
for it
@@ -352,14 +360,6 @@ THA: Thai
image. Note the first translation service acts as
default if the language isnt defined. Example:
--translator-chain "google:JPN;sugoi:ENG".
--use-cuda Turn on/off cuda
--use-cuda-limited Turn on/off cuda (excluding offline translator)
--model-dir MODEL_DIR Model directory (by default ./models in project root)
--attempts ATTEMPTS Retry attempts on encountered error. -1 means infinite
times.
--ignore-errors Skip image on encountered error.
--overwrite Overwrite already translated images in batch mode.
--skip-no-text Skip image without text (Will not be saved).
--revert-upscaling Downscales the previously upscaled image after
translation back to original size (Use with --upscale-
ratio).
@@ -382,6 +382,8 @@ THA: Thai
full image size
--denoise-sigma DENOISE_SIGMA Used by colorizer and affects color strength, range
from 0 to 255 (default 30). -1 turns it off.
--mask-dilation-offset MASK_DILATION_OFFSET By how much to extend the text mask to remove left-over
text pixels of the original image.
--font-size FONT_SIZE Use fixed font size for rendering
--font-size-offset FONT_SIZE_OFFSET Offset font size by a given amount, positive number
increase font size and vice versa
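For reference, a possible invocation using the new flag (a hedged example: the entry point and -i input flag follow the usage shown earlier in this README; the path and the offset value are placeholders):

    python -m manga_translator -v --translator=google -l ENG --use-cuda --mask-dilation-offset 20 -i <path_to_image_folder>

A larger offset grows the refined text mask outward, which helps when thin halos of the original glyphs survive inpainting; 0 (the default) leaves the current behaviour unchanged.
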
18 changes: 10 additions & 8 deletions README_CN.md
@@ -119,6 +119,14 @@ THA: Thai
-v, --verbose Print debug info and save intermediate images in result
folder
-f, --format {png,webp,jpg,xcf,psd,pdf} Output format of the translation.
--attempts ATTEMPTS Retry attempts on encountered error. -1 means infinite
times.
--ignore-errors Skip image on encountered error.
--overwrite Overwrite already translated images in batch mode.
--skip-no-text Skip image without text (Will not be saved).
--model-dir MODEL_DIR Model directory (by default ./models in project root)
--use-cuda Turn on/off cuda
--use-cuda-limited Turn on/off cuda (excluding offline translator)
--detector {default,ctd,craft,none} Text detector used for creating a text mask from an
image, DO NOT use craft for manga, it's not designed
for it
@@ -139,14 +147,6 @@ THA: Thai
image. Note the first translation service acts as
default if the language isnt defined. Example:
--translator-chain "google:JPN;sugoi:ENG".
--use-cuda Turn on/off cuda
--use-cuda-limited Turn on/off cuda (excluding offline translator)
--model-dir MODEL_DIR Model directory (by default ./models in project root)
--attempts ATTEMPTS Retry attempts on encountered error. -1 means infinite
times.
--ignore-errors Skip image on encountered error.
--overwrite Overwrite already translated images in batch mode.
--skip-no-text Skip image without text (Will not be saved).
--revert-upscaling Downscales the previously upscaled image after
translation back to original size (Use with --upscale-
ratio).
@@ -169,6 +169,8 @@ THA: Thai
full image size
--denoise-sigma DENOISE_SIGMA Used by colorizer and affects color strength, range
from 0 to 255 (default 30). -1 turns it off.
--mask-dilation-offset MASK_DILATION_OFFSET By how much to extend the text mask to remove left-over
text pixels of the original image.
--font-size FONT_SIZE Use fixed font size for rendering
--font-size-offset FONT_SIZE_OFFSET Offset font size by a given amount, positive number
increase font size and vice versa
21 changes: 12 additions & 9 deletions manga_translator/args.py
@@ -93,6 +93,16 @@ def _format_action_invocation(self, action: argparse.Action) -> str:
parser.add_argument('-l', '--target-lang', default='CHS', type=str, choices=VALID_LANGUAGES, help='Destination language')
parser.add_argument('-v', '--verbose', action='store_true', help='Print debug info and save intermediate images in result folder')
parser.add_argument('-f', '--format', default=None, choices=OUTPUT_FORMATS, help='Output format of the translation.')
parser.add_argument('--attempts', default=0, type=int, help='Retry attempts on encountered error. -1 means infinite times.')
parser.add_argument('--ignore-errors', action='store_true', help='Skip image on encountered error.')
parser.add_argument('--overwrite', action='store_true', help='Overwrite already translated images in batch mode.')
parser.add_argument('--skip-no-text', action='store_true', help='Skip image without text (Will not be saved).')
parser.add_argument('--model-dir', default=None, type=dir_path, help='Model directory (by default ./models in project root)')

g = parser.add_mutually_exclusive_group()
g.add_argument('--use-cuda', action='store_true', help='Turn on/off cuda')
g.add_argument('--use-cuda-limited', action='store_true', help='Turn on/off cuda (excluding offline translator)')

parser.add_argument('--detector', default='default', type=str, choices=DETECTORS, help='Text detector used for creating a text mask from an image, DO NOT use craft for manga, it\'s not designed for it')
parser.add_argument('--ocr', default='48px_ctc', type=str, choices=OCRS, help='Optical character recognition (OCR) model to use')
parser.add_argument('--inpainter', default='lama_mpe', type=str, choices=INPAINTERS, help='Inpainting model to use')
@@ -105,15 +115,6 @@ def _format_action_invocation(self, action: argparse.Action) -> str:
g.add_argument('--translator-chain', default=None, type=translator_chain, help='Output of one translator goes in another. Example: --translator-chain "google:JPN;sugoi:ENG".')
g.add_argument('--selective-translation', default=None, type=translator_chain, help='Select a translator based on detected language in image. Note the first translation service acts as default if the language isnt defined. Example: --translator-chain "google:JPN;sugoi:ENG".')

g = parser.add_mutually_exclusive_group()
g.add_argument('--use-cuda', action='store_true', help='Turn on/off cuda')
g.add_argument('--use-cuda-limited', action='store_true', help='Turn on/off cuda (excluding offline translator)')

parser.add_argument('--model-dir', default=None, type=dir_path, help='Model directory (by default ./models in project root)')
parser.add_argument('--attempts', default=0, type=int, help='Retry attempts on encountered error. -1 means infinite times.')
parser.add_argument('--ignore-errors', action='store_true', help='Skip image on encountered error.')
parser.add_argument('--overwrite', action='store_true', help='Overwrite already translated images in batch mode.')
parser.add_argument('--skip-no-text', action='store_true', help='Skip image without text (Will not be saved).')
parser.add_argument('--revert-upscaling', action='store_true', help='Downscales the previously upscaled image after translation back to original size (Use with --upscale-ratio).')
parser.add_argument('--detection-size', default=1536, type=int, help='Size of image used for detection')
parser.add_argument('--det-rotate', action='store_true', help='Rotate the image for detection. Might improve detection.')
@@ -127,6 +128,8 @@ def _format_action_invocation(self, action: argparse.Action) -> str:
parser.add_argument('--inpainting-size', default=2048, type=int, help='Size of image used for inpainting (too large will result in OOM)')
parser.add_argument('--colorization-size', default=576, type=int, help='Size of image used for colorization. Set to -1 to use full image size')
parser.add_argument('--denoise-sigma', default=30, type=int, help='Used by colorizer and affects color strength, range from 0 to 255 (default 30). -1 turns it off.')
parser.add_argument('--mask-dilation-offset', default=0, type=int, help='By how much to extend the text mask to remove left-over text pixels of the original image.')

parser.add_argument('--font-size', default=None, type=int, help='Use fixed font size for rendering')
parser.add_argument('--font-size-offset', default=0, type=int, help='Offset font size by a given amount, positive number increase font size and vice versa')
parser.add_argument('--font-size-minimum', default=-1, type=int, help='Minimum output font size. Default is image_sides_sum/200')
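Note that --use-cuda and --use-cuda-limited remain in a mutually exclusive argparse group after the reorder, so passing both still aborts with a parser error. A standalone sketch of that behaviour (flag names and defaults taken from the diff above, everything else illustrative):

    import argparse

    parser = argparse.ArgumentParser()
    g = parser.add_mutually_exclusive_group()
    g.add_argument('--use-cuda', action='store_true')
    g.add_argument('--use-cuda-limited', action='store_true')
    parser.add_argument('--mask-dilation-offset', default=0, type=int)

    print(parser.parse_args(['--use-cuda', '--mask-dilation-offset', '20']))
    # Namespace(mask_dilation_offset=20, use_cuda=True, use_cuda_limited=False)
    # Passing both cuda flags instead would exit with:
    # "argument --use-cuda-limited: not allowed with argument --use-cuda"
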
5 changes: 2 additions & 3 deletions manga_translator/manga_translator.py
@@ -90,7 +90,6 @@ def __init__(self, params: dict = None):
torch.backends.cudnn.allow_tf32 = True

def parse_init_params(self, params: dict):
self.ignore_bubble=int(params.get('ignore_bubble', 0))
self.verbose = params.get('verbose', False)
self.ignore_errors = params.get('ignore_errors', False)

@@ -456,7 +455,7 @@ async def _run_detection(self, ctx: Context):
self.device, self.verbose)

async def _run_ocr(self, ctx: Context):
textlines = await dispatch_ocr(ctx.ocr, ctx.img_rgb, ctx.textlines, self.device, self.verbose, self.ignore_bubble)
textlines = await dispatch_ocr(ctx.ocr, ctx.img_rgb, ctx.textlines, ctx, self.device, self.verbose)

# Filter out regions by original text
new_textlines = []
@@ -517,7 +516,7 @@ async def _run_text_translation(self, ctx: Context):
return new_text_regions

async def _run_mask_refinement(self, ctx: Context):
return await dispatch_mask_refinement(ctx.text_regions, ctx.img_rgb, ctx.mask_raw, 'fit_text', self.verbose,self.ignore_bubble)
return await dispatch_mask_refinement(ctx.text_regions, ctx.img_rgb, ctx.mask_raw, 'fit_text', ctx.mask_dilation_offset, ctx.ignore_bubble, self.verbose)

async def _run_inpainting(self, ctx: Context):
return await dispatch_inpainting(ctx.inpainter, ctx.img_rgb, ctx.mask, ctx.inpainting_size, self.using_cuda, self.verbose)
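Both updated call sites read their settings from ctx rather than from instance attributes, which is why ignore_bubble no longer needs to be parsed in parse_init_params. In this codebase Context behaves like a dict with attribute access, so ctx.mask_dilation_offset resolves to the parsed CLI value; a rough stand-in for illustration only (not the repo's implementation):

    class Context(dict):
        # Attribute reads fall through to dict keys.
        def __getattr__(self, key):
            try:
                return self[key]
            except KeyError:
                raise AttributeError(key)

    ctx = Context(mask_dilation_offset=20, ignore_bubble=0)
    print(ctx.mask_dilation_offset, ctx.ignore_bubble)  # 20 0
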
4 changes: 2 additions & 2 deletions manga_translator/mask_refinement/__init__.py
@@ -6,7 +6,7 @@
from ..utils import TextBlock, Quadrilateral
from ..utils.bubble import is_ignore

async def dispatch(text_regions: List[TextBlock], raw_image: np.ndarray, raw_mask: np.ndarray, method: str = 'fit_text', verbose: bool = False, ignore_bubble: int = 0) -> np.ndarray:
async def dispatch(text_regions: List[TextBlock], raw_image: np.ndarray, raw_mask: np.ndarray, method: str = 'fit_text', dilation_offset: int = 0, ignore_bubble: int = 0, verbose: bool = False) -> np.ndarray:
# Larger sized mask images will probably have crisper and thinner mask segments due to being able to fit the text pixels better
# so we dont want to size them down as much to not loose information
scale_factor = max(min((raw_mask.shape[0] - raw_image.shape[0] / 3) / raw_mask.shape[0], 1), 0.5)
@@ -21,7 +21,7 @@ async def dispatch(text_regions: List[TextBlock], raw_image: np.ndarray, raw_mas
q = Quadrilateral(l * scale_factor, '', 0)
textlines.append(q)

final_mask = complete_mask(img_resized, mask_resized, textlines) if method == 'fit_text' else complete_mask_fill([txtln.aabb.xywh for txtln in textlines])
final_mask = complete_mask(img_resized, mask_resized, textlines, dilation_offset=dilation_offset) if method == 'fit_text' else complete_mask_fill([txtln.aabb.xywh for txtln in textlines])
if final_mask is None:
final_mask = np.zeros((raw_image.shape[0], raw_image.shape[1]), dtype = np.uint8)
else:
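The scale_factor heuristic in the surrounding context (larger raw masks are downsized less, and the factor is clamped to [0.5, 1]) can be sanity-checked with a few made-up heights:

    def scale_factor(mask_h, img_h):
        return max(min((mask_h - img_h / 3) / mask_h, 1), 0.5)

    print(scale_factor(mask_h=1024, img_h=1024))  # ~0.667
    print(scale_factor(mask_h=512,  img_h=1024))  # 0.5 (clamped)
    print(scale_factor(mask_h=4096, img_h=1024))  # ~0.917

dilation_offset itself is simply forwarded to complete_mask() as a keyword argument with a default of 0, so callers that omit it keep the previous behaviour.
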
4 changes: 2 additions & 2 deletions manga_translator/mask_refinement/text_mask_utils.py
@@ -93,7 +93,7 @@ def refine_mask(rgbimg, rawmask):
crf_mask = np.array(res * 255, dtype=np.uint8)
return crf_mask

def complete_mask(img: np.ndarray, mask: np.ndarray, textlines: List[Quadrilateral], keep_threshold = 1e-2):
def complete_mask(img: np.ndarray, mask: np.ndarray, textlines: List[Quadrilateral], keep_threshold = 1e-2, dilation_offset = 0):
bboxes = [txtln.aabb.xywh for txtln in textlines]
polys = [Polygon(txtln.pts) for txtln in textlines]
for (x, y, w, h) in bboxes:
@@ -166,7 +166,7 @@ def complete_mask(img: np.ndarray, mask: np.ndarray, textlines: List[Quadrilater
text_size = min(w1, h1, textlines[i].font_size)
x1, y1, w1, h1 = extend_rect(x1, y1, w1, h1, img.shape[1], img.shape[0], int(text_size * 0.1))
# TODO: Was text_size * 0.3 before. Need to think of better way to determine dilate_size.
dilate_size = max((int(text_size * 0.1) // 2) * 2 + 1, 3)
dilate_size = max((int((text_size + dilation_offset) * 0.1) // 2) * 2 + 1, 3)
# print(textlines[i].font_size, min(w1, h1), dilate_size)
kern = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (dilate_size, dilate_size))
cc_region = np.ascontiguousarray(cc[y1: y1 + h1, x1: x1 + w1])
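To make the effect of dilation_offset concrete, the kernel-size formula can be evaluated on its own (the formula is copied from the diff above; the text sizes below are made-up examples):

    def dilate_size(text_size, dilation_offset=0):
        # Kernel side length must be odd and at least 3.
        return max((int((text_size + dilation_offset) * 0.1) // 2) * 2 + 1, 3)

    for offset in (0, 10, 30):
        print(offset, [dilate_size(ts, offset) for ts in (20, 40, 80)])
    # 0  [3, 5, 9]
    # 10 [3, 5, 9]
    # 30 [5, 7, 11]

Because the offset is scaled by the same 0.1 factor as text_size, small offsets may not change the (odd-rounded) kernel at all, while larger ones grow it by a few pixels across every text region.
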
5 changes: 3 additions & 2 deletions manga_translator/ocr/__init__.py
@@ -26,8 +26,9 @@ async def prepare(ocr_key: str, device: str = 'cpu'):
await ocr.download()
await ocr.load(device)

async def dispatch(ocr_key: str, image: np.ndarray, regions: List[Quadrilateral], device: str = 'cpu', verbose: bool = False,ignore_bubble: int = 0) -> List[Quadrilateral]:
async def dispatch(ocr_key: str, image: np.ndarray, regions: List[Quadrilateral], args = None, device: str = 'cpu', verbose: bool = False) -> List[Quadrilateral]:
ocr = get_ocr(ocr_key)
if isinstance(ocr, OfflineOCR):
await ocr.load(device)
return await ocr.recognize(image, regions, verbose, ignore_bubble)
args = args or {}
return await ocr.recognize(image, regions, args, verbose)
8 changes: 4 additions & 4 deletions manga_translator/ocr/common.py
@@ -37,15 +37,15 @@ def _generate_text_direction(self, bboxes: List[Union[Quadrilateral, TextBlock]]
for node in nodes:
yield bboxes[node], majority_dir

async def recognize(self, image: np.ndarray, textlines: List[Quadrilateral], verbose: bool = False, ignore_bubble: int = 0) -> List[Quadrilateral]:
async def recognize(self, image: np.ndarray, textlines: List[Quadrilateral], args: dict, verbose: bool = False) -> List[Quadrilateral]:
'''
Performs the optical character recognition, using the `textlines` as areas of interests.
Returns a `textlines` list with the `textline.text` property set to the detected text string.
'''
return await self._recognize(image, textlines, verbose, ignore_bubble)
return await self._recognize(image, textlines, args, verbose)

@abstractmethod
async def _recognize(self, image: np.ndarray, textlines: List[Quadrilateral], verbose: bool = False, ingore_bubble: int = 0) -> List[Quadrilateral]:
async def _recognize(self, image: np.ndarray, textlines: List[Quadrilateral], args: dict, verbose: bool = False) -> List[Quadrilateral]:
pass


@@ -56,5 +56,5 @@ async def _recognize(self, *args, **kwargs):
return await self.infer(*args, **kwargs)

@abstractmethod
async def _infer(self, image: np.ndarray, textlines: List[Quadrilateral], verbose: bool = False, ignore_bubble: int = 0) -> List[Quadrilateral]:
async def _infer(self, image: np.ndarray, textlines: List[Quadrilateral], args: dict, verbose: bool = False) -> List[Quadrilateral]:
pass
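
The args parameter threaded through recognize()/_recognize() is a plain dict-like object (in practice the Context built from the parsed CLI arguments, as passed by dispatch_ocr above); implementations read optional settings from it with a default, and dispatch() substitutes an empty dict when nothing is passed so older call sites keep working. A minimal sketch of the pattern (names and values illustrative):

    def read_ocr_args(args=None):
        args = args or {}
        return args.get('ignore_bubble', 0)

    print(read_ocr_args())                       # 0 (default)
    print(read_ocr_args({'ignore_bubble': 25}))  # 25
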
5 changes: 3 additions & 2 deletions manga_translator/ocr/model_32px.py
@@ -49,10 +49,11 @@ async def _load(self, device: str):

async def _unload(self):
del self.model
async def _infer(self, image: np.ndarray, textlines: List[Quadrilateral], verbose: bool = False, ignore_bubble: int = 0) -> List[TextBlock]:

async def _infer(self, image: np.ndarray, textlines: List[Quadrilateral], args: dict, verbose: bool = False) -> List[TextBlock]:
text_height = 32
max_chunk_size = 16
ignore_bubble = args.get('ignore_bubble', 0)

quadrilaterals = list(self._generate_text_direction(textlines))
region_imgs = [q.get_transformed_region(image, d, text_height) for q, d in quadrilaterals]
5 changes: 3 additions & 2 deletions manga_translator/ocr/model_48px_ctc.py
@@ -53,9 +53,10 @@ async def _load(self, device: str):
async def _unload(self):
del self.model

async def _infer(self, image: np.ndarray, textlines: List[Quadrilateral], verbose: bool = False, ignore_bubble: int = 0) -> List[TextBlock]:
async def _infer(self, image: np.ndarray, textlines: List[Quadrilateral], args: dict, verbose: bool = False) -> List[TextBlock]:
text_height = 48
max_chunk_size = 16
ignore_bubble = args.get('ignore_bubble', 0)

quadrilaterals = list(self._generate_text_direction(textlines))
region_imgs = [q.get_transformed_region(image, d, text_height) for q, d in quadrilaterals]
@@ -79,7 +80,7 @@ async def _infer(self, image: np.ndarray, textlines: List[Quadrilateral], verbos
W = region_imgs[idx].shape[1]
tmp = region_imgs[idx]
# Determine whether to skip the text block, and return True to skip.
if ignore_bubble >=1 and ignore_bubble <=50 and is_ignore(region_imgs[idx], ignore_bubble):
if ignore_bubble >=1 and ignore_bubble <=50 and is_ignore(region_imgs[idx], ignore_bubble):
ix+=1
continue
region[i, :, : W, :]=tmp
