Skip to content

Commit

Permalink
Added the updated logic for Water Bill OCR
Browse files Browse the repository at this point in the history
  • Loading branch information
isumitjha committed May 21, 2024
1 parent f761d59 commit a8c2ea8
Showing 1 changed file with 80 additions and 51 deletions.
131 changes: 80 additions & 51 deletions openbharatocr/ocr/water_bill.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,24 @@
from PIL import Image
from datetime import datetime

def preprocess_for_bold_text(image):
    """Prepare an image for OCR by binarising and sharpening it.

    Steps, in order: grayscale conversion, morphological opening, a contrast
    boost, Otsu thresholding and a 3x3 sharpening convolution.

    Args:
        image: Image array passed to ``cv2.cvtColor`` with
            ``cv2.COLOR_BGR2GRAY`` (so presumably BGR channel order, as
            produced by ``cv2.imread`` -- confirm against the caller).

    Returns:
        The thresholded and sharpened single-channel image.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # NOTE(review): a 1x1 structuring element should leave the image
    # unchanged by the opening; a larger kernel may have been intended.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1))
    opening = cv2.morphologyEx(gray, cv2.MORPH_OPEN, kernel)

    # 2 * opening - 0.5 * opening: scales intensities by 1.5.
    contrast = cv2.addWeighted(opening, 2, opening, -0.5, 0)

    # With THRESH_OTSU the threshold is chosen automatically; the 0 passed
    # as the threshold value is ignored.
    _, binary = cv2.threshold(contrast, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)

    sharpen_kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
    sharpened = cv2.filter2D(binary, -1, sharpen_kernel)

    return sharpened

def extract_name(input):
regex = r"Name:\s*(.*?)(?:\.\s|(?=\n))"
match = re.search(regex, input)
name = match.group(1).strip() if match else ""

if name == "":
Expand All @@ -16,85 +29,97 @@ def extract_name(input):
name = match.group(1).strip() if match else ""

return name



def extract_bill_amount(input):
    """Extract the amount that follows the "Bill Amount (Rs.)" label.

    The label is matched case-insensitively. Any amount of whitespace
    (including none) is accepted around the optional colon, and a decimal
    part is kept when present so the amount is not silently truncated.

    Args:
        input (str): OCR text of the bill.

    Returns:
        str: The amount as it appears in the text (e.g. "1520" or
        "1520.75"), or "" when the label is not found.
    """
    regex = r"Bill Amount \(Rs\.\)\s*:?\s*(\d+(?:\.\d+)?)"
    match = re.search(regex, input, re.IGNORECASE)
    return match.group(1) if match else ""

def extract_meter_number(input):
    """Return the value printed after the "Meter No." label.

    The label is matched case-insensitively; the value is either a run of
    digits or the literal "NA".

    Args:
        input (str): OCR text of the bill.

    Returns:
        str: The meter number (or "NA" as written), or "" if the label is
        absent.
    """
    found = re.search(r"Meter No\.\s*:\s*(\d+|NA)", input, flags=re.IGNORECASE)
    if found is None:
        return ""
    return found.group(1).strip()

def extract_all_dates(input):
    """Find every ``DD-MON-YYYY`` date in the text and return them sorted.

    Candidates such as "21-MAY-2024" are located with a regex and parsed
    with ``%d-%b-%Y``. Candidates that look like a date but are not valid
    (e.g. "31-FEB-2024") are skipped instead of raising.

    Args:
        input (str): OCR text of the bill.

    Returns:
        list[str]: The valid dates in chronological order, each formatted
        as ``DD-MM-YYYY``. Empty when no date is found.
    """
    # Month abbreviation may be upper- or mixed-case; strptime's %b accepts
    # both, so the regex should not be stricter than the parser.
    regex = r"\b(\d{1,2}-[A-Za-z]{3}-\d{4})\b"

    parsed_dates = []
    for candidate in re.findall(regex, input):
        try:
            parsed_dates.append(datetime.strptime(candidate, "%d-%b-%Y"))
        except ValueError:
            # Regex hit that is not a real calendar date.
            continue

    return [parsed.strftime("%d-%m-%Y") for parsed in sorted(parsed_dates)]


def extract_phone(input):
    """Extract the digits printed after the "Mobile No." label.

    Args:
        input (str): OCR text of the bill.

    Returns:
        str: The run of digits following "Mobile No. :", or "" when the
        label is not found. The label match is case-sensitive.
    """
    regex = r"Mobile No\.\s*:\s*(\d+)"
    match = re.search(regex, input)
    return match.group(1).strip() if match else ""


def extract_address(input):
    """Extract the text that follows the "Address" label.

    The capture starts after "Address :" (case-insensitive, whitespace
    around the colon optional) and lazily extends, across newlines, until
    the next "<letters/spaces>:" label or the end of the text.

    Args:
        input (str): OCR text of the bill.

    Returns:
        str: The address with surrounding whitespace stripped, or "" when
        no "Address" label is present.
    """
    # NOTE(review): the label lookahead allows whitespace inside the label
    # and is case-insensitive, so a purely alphabetic tail of the address
    # directly before the next label (e.g. "12 Main Road\nName:") is taken
    # as part of that label and the address is cut short ("12"). Matching
    # behaviour is intentionally left unchanged here.
    regex = r"Address\s*:\s*(.*?)(?=\s*[A-Z][a-zA-Z\s]*:|$)"
    match = re.search(regex, input, re.DOTALL | re.IGNORECASE)
    return match.group(1).strip() if match else ""


def extract_meter_type(input):
regex = r"Meter\s*Type\s*.DJB.Pvt.:\s*([A-Z]+)\s"

def extract_mr_code(input):
    """Extract the value printed after the "Zone/MR Code:" label.

    The label and value are matched case-insensitively. The value is a run
    of letters, digits and "/" optionally followed by whitespace and a
    second such run.

    Args:
        input (str): OCR text of the bill.

    Returns:
        str: The zone/MR code with surrounding whitespace stripped, or ""
        when the label is not found.
    """
    regex = r"Zone/MR\s*Code:\s*([A-Z0-9/]+\s*[A-Z0-9/]*)"
    match = re.search(regex, input, re.IGNORECASE)
    mr_code = match.group(1).strip() if match else ""
    return mr_code


def extract_bill_amount(input):

regex = r"total\sdue\sdate\s*(\d+)\n"
def extract_area_code(input):
    """Extract the value printed after the "Area Code" label.

    The label is matched case-insensitively; the value is a run of word
    characters, "/" and "-".

    Args:
        input (str): OCR text of the bill.

    Returns:
        str: The area code, or "" when the label is not found.
    """
    regex = r"Area Code\s*:\s*([\w/-]+)"
    match = re.search(regex, input, re.IGNORECASE)
    area_code = match.group(1).strip() if match else ""
    return area_code

if bill_amount == "":
regex = r".Rs.\s*(\d+)\n"
matches = re.findall(regex, input, re.IGNORECASE)
bill_amount = matches[-1] if len(matches) > 0 else ""
def extract_bill_number(input):
    """Return the digits printed after the "Bill No." label.

    The colon after the label is optional. The label match is
    case-sensitive.

    Args:
        input (str): OCR text of the bill.

    Returns:
        str: The bill number digits, or "" if the label is absent.
    """
    found = re.search(r"Bill No\.\s*(?::\s*)?(\d+)", input)
    if not found:
        return ""
    return found.group(1).strip()

return bill_amount
def extract_govt_body(input):
    """Report the issuing body if the text mentions "Delhi Jal Board".

    The search is case-insensitive and the text is returned exactly as it
    appears in the input.

    Args:
        input (str): OCR text of the bill.

    Returns:
        str: The matched text, or "Unknown" when it does not occur.
    """
    found = re.search(r"Delhi Jal Board", input, flags=re.IGNORECASE)
    if found is None:
        return "Unknown"
    return found.group(0).strip()

def extract_bill_date(input):
    """Extract the ``DD-MON-YYYY`` date that follows the "Bill Date" label.

    The label and month are matched case-insensitively. Any amount of
    whitespace (including none) is accepted around the optional colon, and
    the day may have one or two digits.

    Args:
        input (str): OCR text of the bill.

    Returns:
        str: The date exactly as printed (e.g. "21-MAY-2024"), or "" when
        the label is not found. "Bill Due Date" is not matched by this
        label.
    """
    regex = r"Bill Date\s*:?\s*(\d{1,2}-[A-Z]{3}-\d{4})"
    match = re.search(regex, input, re.IGNORECASE)
    return match.group(1) if match else ""

def extract_bill_due_date(input):
    """Extract the ``DD-MON-YYYY`` date that follows "Bill Due Date".

    The label and month are matched case-insensitively. Any amount of
    whitespace (including none) is accepted around the optional colon, and
    the day may have one or two digits.

    Args:
        input (str): OCR text of the bill.

    Returns:
        str: The due date exactly as printed (e.g. "05-JUN-2024"), or ""
        when the label is not found.
    """
    regex = r"Bill Due Date\s*:?\s*(\d{1,2}-[A-Z]{3}-\d{4})"
    match = re.search(regex, input, re.IGNORECASE)
    return match.group(1) if match else ""

def extract_water_bill_details(image_path):

image = Image.open(image_path)
extracted_text = pytesseract.image_to_string(image)

name = extract_name(extracted_text)

dates = extract_all_dates(extracted_text)
bill_date = dates[0] if len(dates) > 0 else ""
due_date = dates[1] if len(dates) > 1 else ""

bill_date = extract_bill_date(extracted_text)
due_date = extract_bill_due_date(extracted_text)
address = extract_address(extracted_text)
phone = extract_phone(extracted_text)

meter_type = extract_meter_type(extracted_text)
bill_amount = extract_bill_amount(extracted_text)
mr_code = extract_mr_code(extracted_text)
area_code = extract_area_code(extracted_text)
meter_number = extract_meter_number(extracted_text)
bill_number = extract_bill_number(extracted_text)
govt_body = extract_govt_body(extracted_text)

return {
"Name": name,
Expand All @@ -103,9 +128,13 @@ def extract_water_bill_details(image_path):
"Bill Date": bill_date,
"Due Date": due_date,
"Address": address,
"Meter Type": meter_type,
"Zone/MR Code": mr_code,
"Area Code": area_code,
"Meter Number": meter_number,
"Bill Number": bill_number,
"Source/Govt Body Name": govt_body,
}


def water_bill(image_path):
    """Public entry point: extract the details of a water bill image.

    Args:
        image_path: Path of the bill image, forwarded unchanged to
            ``extract_water_bill_details``.

    Returns:
        Whatever ``extract_water_bill_details`` returns for that path.
    """
    details = extract_water_bill_details(image_path)
    return details

0 comments on commit a8c2ea8

Please sign in to comment.