Skip to content

Commit

Permalink
Formatting
Browse files Browse the repository at this point in the history
  • Loading branch information
oliverkinch committed Feb 13, 2024
1 parent 10d88dc commit 2ba58aa
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 32 deletions.
2 changes: 2 additions & 0 deletions .flake8
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[flake8]
max-line-length = 88
77 changes: 45 additions & 32 deletions src/doms_databasen/text_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,11 +55,14 @@ def __init__(self, config: DictConfig):
def extract_text(self, pdf_path: Path | str) -> str:
"""Extracts text from a PDF using easyocr or pypdf.
Some text is anonymized with boxes, and some text is anonymized with underlines.
This function tries to find these anonymization, read the anonymized text,
Some text is anonymized with boxes, and some text
is anonymized with underlines.
This function tries to find these anonymizations,
read the anonymized text,
and then remove the anonymized text from the image before
reading the rest of the text with easyocr.
If a page has no anonymization or tables, the text is read with pypdf.
If a page has no anonymization or tables,
the text is read with pypdf.
Args:
pdf_path (Path | str):
Expand Down Expand Up @@ -290,7 +293,7 @@ def _extract_underline_anonymization_boxes(self, image: np.ndarray) -> Tuple:
]
return anonymized_boxes_underlines_, underlines

def _get_images(self, pdf_path: Path | str) -> Mapping[np.ndarray]:
def _get_images(self, pdf_path: Path | str) -> List[np.ndarray]:
"""Get images from PDF.
Returns all images from PDF, except if debugging a single page.
Expand All @@ -316,10 +319,10 @@ def _get_images(self, pdf_path: Path | str) -> Mapping[np.ndarray]:
),
)
else:
images = map(np.array, convert_from_path(pdf_path=pdf_path, dpi=DPI))
images = list(map(np.array, convert_from_path(pdf_path=pdf_path, dpi=DPI)))

# Grayscale
images = map(lambda image: cv2.cvtColor(image, cv2.COLOR_BGR2GRAY), images)
images = list(map(lambda image: cv2.cvtColor(image, cv2.COLOR_BGR2GRAY), images))
return images

def _find_tables(self, image: np.ndarray, read_tables: bool = False) -> List[dict]:
Expand Down Expand Up @@ -909,15 +912,15 @@ def _union(self, box_1: dict, box_2: dict) -> float:
return area_1 + area_2 - self._intersection(box_1=box_1, box_2=box_2)

@staticmethod
def _area(box: dict) -> float:
def _area(box: dict) -> int:
"""Calculates the area of a box.
Args:
box (dict):
Anonymized box with coordinates.
Returns:
float:
int:
Area of the box.
"""
row_min, col_min, row_max, col_max = box["coordinates"]
Expand Down Expand Up @@ -981,7 +984,8 @@ def _on_same_line(self, y: int, y_prev: int) -> bool:
y_prev (int):
y coordinate of top left corner of previous bounding box.
max_y_difference (int):
Maximum difference between y coordinates of two bounding boxes on the same line.
Maximum difference between y coordinates of two
bounding boxes on the same line.
Returns:
bool:
Expand Down Expand Up @@ -1730,20 +1734,6 @@ def _remove_inner_boxes(self, boxes: List[dict]) -> List[dict]:
boxes_.append(box)
return boxes_

def _area(self, box: dict) -> int:
"""Calculates the area of a box.
Args:
box (dict):
Anonymized box with coordinates.
Returns:
int:
Area of the box.
"""
row_min, col_min, row_max, col_max = box["coordinates"]
return (row_max - row_min) * (col_max - col_min)

def _inner_box(self, boxes: List[dict], box: dict) -> bool:
"""Determine if box is inside another box.
Expand Down Expand Up @@ -1844,7 +1834,8 @@ def _process_crop_before_read(
scale = self._get_scale(box_length=box_length)
crop_scaled = self._scale_image(image=crop_refined, scale=scale)

# Ensure that highest pixel value is 255, else sharpening might not work as expected.
# Ensure that highest pixel value is 255, else
# sharpening might not work as expected.
crop_scaled = np.array(crop_scaled / crop_scaled.max() * 255, dtype=np.uint8)

crop_boundary = self._add_boundary(
Expand Down Expand Up @@ -2060,7 +2051,8 @@ def _remove_black_border(self, blob_image: np.ndarray) -> np.ndarray:
return blob_image

def _split_blob_to_multiple_boxes(self, blob: RegionProperties) -> List[dict]:
"""This function is called if a blob is not splitted correctly with initial methods.
"""This function is called if a blob is not splitted
correctly with initial methods.
Args:
blob (RegionProperties):
Expand Down Expand Up @@ -2462,8 +2454,10 @@ def _remove_boundary_noise(
) -> np.ndarray:
"""Removes noise on the boundary of an anonymized box.
All white pixels in a perfect bounding box should be a pixel of a relevant character.
Some images have white pixel defect at the boundary of the bounding box, and
All white pixels in a perfect bounding box
should be a pixel of a relevant character.
Some images have white pixel defects at the
boundary of the bounding box, and
this function removes those white pixels.
Args:
Expand Down Expand Up @@ -2517,7 +2511,8 @@ def _too_few_pixels(self, blob: RegionProperties, touches_boundary: bool) -> boo
Returns:
bool:
True if blob has too few pixels to be a relevant character. False otherwise.
True if blob has too few pixels to
be a relevant character. False otherwise.
"""
coords = blob.coords
return (
Expand All @@ -2540,7 +2535,8 @@ def _low_longest_distance_from_boundary(
Returns:
bool:
True if blob has a low longest distance from the boundary of the image. False otherwise.
True if blob has a low longest distance from the
boundary of the image. False otherwise.
"""
n = min(crop.shape)
return self._maximum_distance_from_boundary(crop=crop, blob=blob) < n * 0.3
Expand All @@ -2550,9 +2546,12 @@ def _maximum_distance_from_boundary(
) -> float:
"""Get maximum distance from blob to boundary of image.
E.g. if the minimum distance from the blob to the top boundary of the image is 5,
and the minimum distance from the blob to the bottom boundary of the image is 10,
to the left boundary is 3, and to the right boundary is 7, then the maximum distance
E.g. if the minimum distance from the blob to
the top boundary of the image is 5,
and the minimum distance from the blob to
the bottom boundary of the image is 10,
to the left boundary is 3, and to the right
boundary is 7, then the maximum distance
from the blob to the boundary of the image is 10.
Used in _remove_boundary_noise to determine if a blob is noise or not.
Expand Down Expand Up @@ -2838,6 +2837,20 @@ def _read_text_with_tika(pdf_path: str) -> str:
except:
pass
return text.strip()

def _get_text_from_pages(pages: dict) -> str:
"""Get text from pages.
Args:
pages (dict):
Pages with text and extraction method.
Returns:
pdf_text (str):
Text from pages.
"""
pdf_text = "\n\n".join(page["text"] for page in pages.values())
return pdf_text


# This class is not used, but is kept for future reference.
Expand Down

0 comments on commit 2ba58aa

Please sign in to comment.