diff --git a/.flake8 b/.flake8 new file mode 100644 index 00000000..2bcd70e3 --- /dev/null +++ b/.flake8 @@ -0,0 +1,2 @@ +[flake8] +max-line-length = 88 diff --git a/src/doms_databasen/text_extraction.py b/src/doms_databasen/text_extraction.py index e98dd3c2..49f1fca1 100644 --- a/src/doms_databasen/text_extraction.py +++ b/src/doms_databasen/text_extraction.py @@ -55,11 +55,14 @@ def __init__(self, config: DictConfig): def extract_text(self, pdf_path: Path | str) -> str: """Extracts text from a PDF using easyocr or pypdf. - Some text is anonymized with boxes, and some text is anonymized with underlines. - This function tries to find these anonymization, read the anonymized text, + Some text is anonymized with boxes, and some text + is anonymized with underlines. + This function tries to find these anonymization, + read the anonymized text, and then remove the anonymized text from the image before reading the rest of the text with easyocr. - If a page has no anonymization or tables, the text is read with pypdf. + If a page has no anonymization or tables, + the text is read with pypdf. Args: pdf_path (Path | str): @@ -290,7 +293,7 @@ def _extract_underline_anonymization_boxes(self, image: np.ndarray) -> Tuple: ] return anonymized_boxes_underlines_, underlines - def _get_images(self, pdf_path: Path | str) -> Mapping[np.ndarray]: + def _get_images(self, pdf_path: Path | str) -> List[np.ndarray]: """Get images from PDF. Returns all images from PDF, except if debugging a single page. @@ -316,10 +319,10 @@ def _get_images(self, pdf_path: Path | str) -> Mapping[np.ndarray]: ), ) else: - images = map(np.array, convert_from_path(pdf_path=pdf_path, dpi=DPI)) + images = list(map(np.array, convert_from_path(pdf_path=pdf_path, dpi=DPI))) # Grayscale - images = map(lambda image: cv2.cvtColor(image, cv2.COLOR_BGR2GRAY), images) + images = list(map(lambda image: cv2.cvtColor(image, cv2.COLOR_BGR2GRAY), images)) return images def _find_tables(self, image: np.ndarray, read_tables: bool = False) -> List[dict]: @@ -909,7 +912,7 @@ def _union(self, box_1: dict, box_2: dict) -> float: return area_1 + area_2 - self._intersection(box_1=box_1, box_2=box_2) @staticmethod - def _area(box: dict) -> float: + def _area(box: dict) -> int: """Calculates the area of a box. Args: @@ -917,7 +920,7 @@ def _area(box: dict) -> float: Anonymized box with coordinates. Returns: - float: + int: Area of the box. """ row_min, col_min, row_max, col_max = box["coordinates"] @@ -981,7 +984,8 @@ def _on_same_line(self, y: int, y_prev: int) -> bool: y_prev (int): y coordinate of top left corner of previous bounding box. max_y_difference (int): - Maximum difference between y coordinates of two bounding boxes on the same line. + Maximum difference between y coordinates of two + bounding boxes on the same line. Returns: bool: @@ -1730,20 +1734,6 @@ def _remove_inner_boxes(self, boxes: List[dict]) -> List[dict]: boxes_.append(box) return boxes_ - def _area(self, box: dict) -> int: - """Calculates the area of a box. - - Args: - box (dict): - Anonymized box with coordinates. - - Returns: - int: - Area of the box. - """ - row_min, col_min, row_max, col_max = box["coordinates"] - return (row_max - row_min) * (col_max - col_min) - def _inner_box(self, boxes: List[dict], box: dict) -> bool: """Determine if box is inside another box. @@ -1844,7 +1834,8 @@ def _process_crop_before_read( scale = self._get_scale(box_length=box_length) crop_scaled = self._scale_image(image=crop_refined, scale=scale) - # Ensure that highest pixel value is 255, else sharpening might not work as expected. + # Ensure that highest pixel value is 255, else + # sharpening might not work as expected. crop_scaled = np.array(crop_scaled / crop_scaled.max() * 255, dtype=np.uint8) crop_boundary = self._add_boundary( @@ -2060,7 +2051,8 @@ def _remove_black_border(self, blob_image: np.ndarray) -> np.ndarray: return blob_image def _split_blob_to_multiple_boxes(self, blob: RegionProperties) -> List[dict]: - """This function is called if a blob is not splitted correctly with initial methods. + """This function is called if a blob is not splitted + correctly with initial methods. Args: blob (RegionProperties): @@ -2462,8 +2454,10 @@ def _remove_boundary_noise( ) -> np.ndarray: """Removes noise on the boundary of an anonymized box. - All white pixels in a perfect bounding box should be a pixel of a relevant character. - Some images have white pixel defect at the boundary of the bounding box, and + All white pixels in a perfect bounding box + should be a pixel of a relevant character. + Some images have white pixel defect at the + boundary of the bounding box, and this function removes those white pixels. Args: @@ -2517,7 +2511,8 @@ def _too_few_pixels(self, blob: RegionProperties, touches_boundary: bool) -> boo Returns: bool: - True if blob has too few pixels to be a relevant character. False otherwise. + True if blob has too few pixels to + be a relevant character. False otherwise. """ coords = blob.coords return ( @@ -2540,7 +2535,8 @@ def _low_longest_distance_from_boundary( Returns: bool: - True if blob has a low longest distance from the boundary of the image. False otherwise. + True if blob has a low longest distance from the + boundary of the image. False otherwise. """ n = min(crop.shape) return self._maximum_distance_from_boundary(crop=crop, blob=blob) < n * 0.3 @@ -2550,9 +2546,12 @@ def _maximum_distance_from_boundary( ) -> float: """Get maximum distance from blob to boundary of image. - E.g. if the minimum distance from the blob to the top boundary of the image is 5, - and the minimum distance from the blob to the bottom boundary of the image is 10, - to the left boundary is 3, and to the right boundary is 7, then the maximum distance + E.g. if the minimum distance from the blob to + the top boundary of the image is 5, + and the minimum distance from the blob to + the bottom boundary of the image is 10, + to the left boundary is 3, and to the right + boundary is 7, then the maximum distance from the blob to the boundary of the image is 10. Used in _remove_boundary_noise to determine if a blob is noise or not. @@ -2838,6 +2837,20 @@ def _read_text_with_tika(pdf_path: str) -> str: except: pass return text.strip() + + def _get_text_from_pages(pages: dict) -> str: + """Get text from pages. + + Args: + pages (dict): + Pages with text and extraction method. + + Returns: + pdf_text (str): + Text from pages. + """ + pdf_text = "\n\n".join(page["text"] for page in pages.values()) + return pdf_text # This class is not used, but is kept for future reference.