Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Develop #51

Merged
merged 3 commits into from
Jun 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 106 additions & 10 deletions openbharatocr/ocr/pan.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,23 +7,22 @@


def clean_input(match):
    """
    Flattens extracted text chunks into individual lines.

    Each chunk is split on newline characters and the pieces are collected,
    in order, into one list. Nothing is filtered here: empty strings produced
    by leading, trailing or consecutive newlines are kept, and stopwords are
    not removed by this function.

    Args:
        match (list): A list of extracted text chunks (strings).

    Returns:
        list: Every newline-separated piece of every chunk, in original order.
    """
    # split("\n") (not splitlines()) so empty leading/trailing pieces survive.
    return [chunk for name in match for chunk in name.split("\n")]


def extract_all_names(input):
regex = r"\n[A-Z\s]+\b"
match = re.findall(regex, input)

names = []
cleaned = clean_input(match)

stopwords = ["INDIA", "OF", "TAX", "GOVT", "DEPARTMENT", "INCOME"]

names = [
Expand All @@ -35,7 +34,34 @@ def extract_all_names(input):
return names


def extract_all_names(input):
    """
    Extracts candidate name lines from the given text using a regular expression.

    The pattern matches a newline followed by a run of upper-case letters and
    whitespace. Every match is passed to clean_input and the resulting list is
    returned unchanged; no stopword filtering happens in this function.

    NOTE(review): an earlier definition of extract_all_names appears to exist
    in this module. Being defined later, this one would replace it when the
    module is imported -- confirm that the duplicate is intentional.

    Args:
        input (str): The text to extract names from.

    Returns:
        list: The list produced by clean_input for all regex matches.
    """
    # A line break followed by capitals/whitespace, ending on a word boundary.
    regex = r"\n[A-Z\s]+\b"
    return clean_input(re.findall(regex, input))


def extract_pan(input):
"""
Extracts the PAN number from the given text using a regular expression.

Args:
input (str): The text to extract the PAN number from.

Returns:
str: The extracted PAN number, or an empty string if not found.
"""
regex = r"[A-Z]{5}[0-9]{4}[A-Z]"
match = re.search(regex, input)
pan_number = match.group(0) if match else ""
Expand All @@ -44,6 +70,15 @@ def extract_pan(input):


def extract_dob(input):
"""
Extracts the date of birth from the given text using a regular expression.

Args:
input (str): The text to extract the date of birth from.

Returns:
str: The extracted date of birth in a common format (DD/MM/YYYY), or an empty string if not found.
"""
regex = r"\b(\d{2}[/\-.]\d{2}[/\-.](?:\d{4}|\d{2}))\b"
match = re.search(regex, input)
dob = match.group(0) if match else ""
Expand All @@ -52,6 +87,18 @@ def extract_dob(input):


def extract_pan_details(image_path):
"""
Extracts PAN details (full name, parent's name, date of birth, PAN number) from a PAN card image.

This version attempts extraction from the original image and a converted JPEG version
to improve compatibility.

Args:
image_path (str): The path to the PAN card image.

Returns:
dict: A dictionary containing extracted PAN details.
"""
image = Image.open(image_path)
extracted_text = pytesseract.image_to_string(image)

Expand Down Expand Up @@ -79,6 +126,25 @@ def extract_pan_details(image_path):


def preprocess_for_sketch(image_path):
"""
Preprocesses an image to convert it into a black and white sketch-like look
for improved text extraction.

This function performs several image processing steps:

1. Reads the image using OpenCV.
2. Converts the image to grayscale.
3. Applies Gaussian blur to smooth the image and reduce noise.
4. Applies adaptive thresholding to convert the image to binary (black and white).
5. Applies morphological operations (opening) to reduce noise and enhance text regions.
6. Inverts the image colors for better text recognition by Tesseract.

Args:
image_path (str): The path to the image.

Returns:
numpy.ndarray: The preprocessed image in a black and white sketch-like format.
"""
# Read the image
image = cv2.imread(image_path)

Expand All @@ -104,6 +170,19 @@ def preprocess_for_sketch(image_path):


def extract_pan_details_version2(image_path):
"""
Extracts PAN details (full name, parent's name, date of birth, PAN number) from a PAN card image
using a pre-processing step that converts the image to a sketch-like format.

This version aims to improve extraction accuracy in cases where Version 1 might struggle.

Args:
image_path (str): The path to the PAN card image.

Returns:
dict: A dictionary containing extracted PAN details.
"""

# Preprocess the image to convert it into a black and white sketch-like look
preprocessed_image = preprocess_for_sketch(image_path)

Expand All @@ -126,6 +205,23 @@ def extract_pan_details_version2(image_path):


def pan(image_path):
"""
Extracts PAN details (full name, parent's name, date of birth, PAN number) from a PAN card image.

This function attempts extraction using two versions:

1. Version 1: Extracts details from the original image and a converted JPEG version
to improve compatibility.
2. Version 2: If any details are missing from Version 1, it applies a pre-processing
step that converts the image to a sketch-like format and then extracts details.

Args:
image_path (str): The path to the PAN card image.

Returns:
dict: A dictionary containing extracted PAN details, with missing details from
Version 1 filled in by Version 2 if necessary.
"""
# Run Version 1
result = extract_pan_details(image_path)

Expand Down
130 changes: 130 additions & 0 deletions openbharatocr/ocr/passbook.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,15 @@


def extract_name(input):
"""
Extracts the customer name from the given text using a regular expression.

Args:
input (str): The text to extract the name from.

Returns:
str: The extracted customer name, or None if not found.
"""
regex = re.compile(r"Customer Name\s+([A-Z\s]+)")
match = re.search(regex, input)
if match:
Expand All @@ -12,6 +21,15 @@ def extract_name(input):


def extract_open_date(input):
"""
Extracts the account open date from the given text using a regular expression.

Args:
input (str): The text to extract the open date from.

Returns:
str: The extracted account open date in DD MMM YYYY format, or None if not found.
"""
regex = re.compile(r"Open Date\s*(\d{1,2} \w{3} \d{4})")
match = re.search(regex, input)
if match:
Expand All @@ -20,6 +38,19 @@ def extract_open_date(input):


def extract_bank_name(input):
"""
Extracts the bank name from the given text using a regular expression.

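This function searches for text ending in the upper-case keywords "BANK",
"BANK LTD", "BANK LIMITED", or "CREDIT UNION". The pattern is compiled with
re.MULTILINE only, so the keywords are matched case-sensitively; the leading
name part may span line breaks because it allows whitespace.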

Args:
input (str): The text to extract the bank name from.

Returns:
str: The extracted bank name, or None if not found.
"""
regex = re.compile(
r"\b[A-Za-z\s&]+(?:BANK|BANK LTD|BANK LIMITED|CREDIT UNION)\b", re.MULTILINE
)
Expand All @@ -31,6 +62,18 @@ def extract_bank_name(input):


def extract_phone(input):
"""
Extracts the phone number from the given text using a regular expression.

This function searches for patterns starting with "Mobile No" and extracts
the following digits, considering case-insensitivity.

Args:
input (str): The text to extract the phone number from.

Returns:
str: The extracted phone number, or None if not found.
"""
regex = re.compile(r"Mobile No\s*(\d+)", re.IGNORECASE)
match = re.search(regex, input)
if match:
Expand All @@ -39,6 +82,18 @@ def extract_phone(input):


def extract_branch_name(input):
"""
Extracts the branch name from the given text using a regular expression.

This function searches for patterns starting with "Branch Name" and extracts
the following text, considering case-insensitivity.

Args:
input (str): The text to extract the branch name from.

Returns:
str: The extracted branch name, or None if not found.
"""
regex = re.compile(r"Branch Name\s*([A-Za-z\d\s-]+)", re.IGNORECASE)
match = re.search(regex, input)
if match:
Expand All @@ -47,6 +102,18 @@ def extract_branch_name(input):


def extract_nomination_name(input):
"""
Extracts the nomination name from the given text using a regular expression.

This function searches for the label "Nomination" (the pattern also accepts
"Nominanon", presumably an OCR misreading) followed by two capitalized words.

Args:
input (str): The text to extract the nomination name from.

Returns:
str: The extracted nomination name (full name), or None if not found.
"""
regex = re.compile(r"Nomina(?:non|tion)\s+([A-Z][a-z]+\s[A-Z][a-z]+)")
match = re.search(regex, input)
if match:
Expand All @@ -55,6 +122,20 @@ def extract_nomination_name(input):


def extract_email(input):
"""
Extracts the email address from the given text using a regular expression.

This function searches for email addresses in the format of username@domain.com,
where username can contain letters, numbers, periods, underscores, plus signs,
and hyphens, and domain can contain letters, numbers, periods, and hyphens.

Args:
input (str): The text to extract the email address from.

Returns:
str: The extracted email address, or None if not found.
"""

regex = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
match = re.search(regex, input)
if match:
Expand All @@ -63,6 +144,18 @@ def extract_email(input):


def extract_account_no(input):
"""
Extracts the account number from the given text using a regular expression.

This function searches for patterns containing "Account Number:" followed by
9 to 12 digits, considering case-insensitivity.

Args:
input (str): The text to extract the account number from.

Returns:
str: The extracted account number, or None if not found.
"""
regex = re.compile(r"Account Number:\s*(\d{9,12})", re.IGNORECASE)
match = re.search(regex, input)
if match:
Expand All @@ -71,6 +164,18 @@ def extract_account_no(input):


def extract_cif_no(input):
"""
Extracts the CIF number from the given text using a regular expression.

This function searches for patterns containing "CIF" (case-insensitive),
optionally followed by "No" or ".", and then extracts the following digits.

Args:
input (str): The text to extract the CIF number from.

Returns:
str: The extracted CIF number, or None if not found.
"""
regex = re.compile(r"CIF(?: No)?\.?\s*(\d+)", re.IGNORECASE)
match = re.search(regex, input)
if match:
Expand All @@ -79,6 +184,19 @@ def extract_cif_no(input):


def extract_address(input):
"""
Extracts the address from the given text using a regular expression.

This function attempts to extract addresses using a list of patterns that
commonly represent addresses. The patterns include house numbers, street names,
city/town names, and postal codes.

Args:
input (str): The text to extract the address from.

Returns:
str: The extracted address, or None if no matching pattern is found.
"""
regex = [
r"\d+\s[A-Za-z\s,]+(?:Road|Street|Avenue|Boulevard|Lane|Drive|Court|Place|Square|Plaza|Terrace|Trail|Parkway|Circle)\s*,?\s*(?:\d{5}|\d{5}-\d{4})?",
r"\d+\s[A-Za-z\s,]+(?:Road|Street|Avenue|Boulevard|Lane|Drive|Court|Place|Square|Plaza|Terrace|Trail|Parkway|Circle)",
Expand All @@ -97,6 +215,18 @@ def extract_address(input):


def parse_passbook_frontpage(image_path):
"""
Parses a passbook front page image to extract various customer and account information.

This function uses EasyOCR to read text from the image and then employs regular expressions
to extract specific details like name, account number, address, phone number, etc.

Args:
image_path (str): The path to the passbook front page image.

Returns:
dict: A dictionary containing the extracted passbook information.
"""
reader = easyocr.Reader(["en"])

image = cv2.imread(image_path)
Expand Down
Loading
Loading