diff --git a/openbharatocr/ocr/pan.py b/openbharatocr/ocr/pan.py index c5516e3..98c3946 100644 --- a/openbharatocr/ocr/pan.py +++ b/openbharatocr/ocr/pan.py @@ -7,6 +7,15 @@ def clean_input(match): + """ + Cleans the extracted text by splitting lines and removing stopwords. + + Args: + match (list): A list of extracted text chunks. + + Returns: + list: A cleaned list of individual names. + """ cleaned = [] for name in match: @@ -14,16 +23,6 @@ def clean_input(match): for chunk in split_name: cleaned.append(chunk) - return cleaned - - -def extract_all_names(input): - regex = r"\n[A-Z\s]+\b" - match = re.findall(regex, input) - - names = [] - cleaned = clean_input(match) - stopwords = ["INDIA", "OF", "TAX", "GOVT", "DEPARTMENT", "INCOME"] names = [ @@ -35,7 +34,34 @@ def extract_all_names(input): return names +def extract_all_names(input): + """ + Extracts all names from the given text using a regular expression and performs basic cleaning. + + Args: + input (str): The text to extract names from. + + Returns: + list: A list of extracted names. + """ + regex = r"\n[A-Z\s]+\b" + match = re.findall(regex, input) + + names = [] + cleaned = clean_input(match) + return cleaned + + def extract_pan(input): + """ + Extracts the PAN number from the given text using a regular expression. + + Args: + input (str): The text to extract the PAN number from. + + Returns: + str: The extracted PAN number, or an empty string if not found. + """ regex = r"[A-Z]{5}[0-9]{4}[A-Z]" match = re.search(regex, input) pan_number = match.group(0) if match else "" @@ -44,6 +70,15 @@ def extract_pan(input): def extract_dob(input): + """ + Extracts the date of birth from the given text using a regular expression. + + Args: + input (str): The text to extract the date of birth from. + + Returns: + str: The extracted date of birth in a common format (DD/MM/YYYY), or an empty string if not found. 
+ """ regex = r"\b(\d{2}[/\-.]\d{2}[/\-.](?:\d{4}|\d{2}))\b" match = re.search(regex, input) dob = match.group(0) if match else "" @@ -52,6 +87,18 @@ def extract_dob(input): def extract_pan_details(image_path): + """ + Extracts PAN details (full name, parent's name, date of birth, PAN number) from a PAN card image. + + This version attempts extraction from the original image and a converted JPEG version + to improve compatibility. + + Args: + image_path (str): The path to the PAN card image. + + Returns: + dict: A dictionary containing extracted PAN details. + """ image = Image.open(image_path) extracted_text = pytesseract.image_to_string(image) @@ -79,6 +126,25 @@ def extract_pan_details(image_path): def preprocess_for_sketch(image_path): + """ + Preprocesses an image to convert it into a black and white sketch-like look + for improved text extraction. + + This function performs several image processing steps: + + 1. Reads the image using OpenCV. + 2. Converts the image to grayscale. + 3. Applies Gaussian blur to smooth the image and reduce noise. + 4. Applies adaptive thresholding to convert the image to binary (black and white). + 5. Applies morphological operations (opening) to reduce noise and enhance text regions. + 6. Inverts the image colors for better text recognition by Tesseract. + + Args: + image_path (str): The path to the image. + + Returns: + numpy.ndarray: The preprocessed image in a black and white sketch-like format. + """ # Read the image image = cv2.imread(image_path) @@ -104,6 +170,19 @@ def preprocess_for_sketch(image_path): def extract_pan_details_version2(image_path): + """ + Extracts PAN details (full name, parent's name, date of birth, PAN number) from a PAN card image + using a pre-processing step that converts the image to a sketch-like format. + + This version aims to improve extraction accuracy in cases where Version 1 might struggle. + + Args: + image_path (str): The path to the PAN card image. 
+ + Returns: + dict: A dictionary containing extracted PAN details. + """ + # Preprocess the image to convert it into a black and white sketch-like look preprocessed_image = preprocess_for_sketch(image_path) @@ -126,6 +205,23 @@ def extract_pan_details_version2(image_path): def pan(image_path): + """ + Extracts PAN details (full name, parent's name, date of birth, PAN number) from a PAN card image. + + This function attempts extraction using two versions: + + 1. Version 1: Extracts details from the original image and a converted JPEG version + to improve compatibility. + 2. Version 2: If any details are missing from Version 1, it applies a pre-processing + step that converts the image to a sketch-like format and then extracts details. + + Args: + image_path (str): The path to the PAN card image. + + Returns: + dict: A dictionary containing extracted PAN details, with missing details from + Version 1 filled in by Version 2 if necessary. + """ # Run Version 1 result = extract_pan_details(image_path) diff --git a/openbharatocr/ocr/passbook.py b/openbharatocr/ocr/passbook.py index 08fbd9f..902d3e4 100644 --- a/openbharatocr/ocr/passbook.py +++ b/openbharatocr/ocr/passbook.py @@ -4,6 +4,15 @@ def extract_name(input): + """ + Extracts the customer name from the given text using a regular expression. + + Args: + input (str): The text to extract the name from. + + Returns: + str: The extracted customer name, or None if not found. + """ regex = re.compile(r"Customer Name\s+([A-Z\s]+)") match = re.search(regex, input) if match: @@ -12,6 +21,15 @@ def extract_name(input): def extract_open_date(input): + """ + Extracts the account open date from the given text using a regular expression. + + Args: + input (str): The text to extract the open date from. + + Returns: + str: The extracted account open date in DD MMM YYYY format, or None if not found. 
+ """ regex = re.compile(r"Open Date\s*(\d{1,2} \w{3} \d{4})") match = re.search(regex, input) if match: @@ -20,6 +38,19 @@ def extract_open_date(input): def extract_bank_name(input): + """ + Extracts the bank name from the given text using a regular expression. + + This function searches for a run of letters, whitespace, or "&" ending in + uppercase "BANK", "BANK LTD", "BANK LIMITED", or "CREDIT UNION". The match is + case-sensitive and may span line breaks, since "\s" also matches newlines. + + Args: + input (str): The text to extract the bank name from. + + Returns: + str: The extracted bank name, or None if not found. + """ regex = re.compile( r"\b[A-Za-z\s&]+(?:BANK|BANK LTD|BANK LIMITED|CREDIT UNION)\b", re.MULTILINE ) @@ -31,6 +62,18 @@ def extract_bank_name(input): def extract_phone(input): + """ + Extracts the phone number from the given text using a regular expression. + + This function searches for patterns starting with "Mobile No" and extracts + the following digits, considering case-insensitivity. + + Args: + input (str): The text to extract the phone number from. + + Returns: + str: The extracted phone number, or None if not found. + """ regex = re.compile(r"Mobile No\s*(\d+)", re.IGNORECASE) match = re.search(regex, input) if match: @@ -39,6 +82,18 @@ def extract_phone(input): def extract_branch_name(input): + """ + Extracts the branch name from the given text using a regular expression. + + This function searches for patterns starting with "Branch Name" and extracts + the following text, considering case-insensitivity. + + Args: + input (str): The text to extract the branch name from. + + Returns: + str: The extracted branch name, or None if not found. + """ regex = re.compile(r"Branch Name\s*([A-Za-z\d\s-]+)", re.IGNORECASE) match = re.search(regex, input) if match: @@ -47,6 +102,18 @@ def extract_branch_name(input): def extract_nomination_name(input): + """ + Extracts the nomination name from the given text using a regular expression.
+ + This function searches for "Nomination" or "Nominanon" (presumably an OCR + misreading of it) followed by two capitalized words. + + Args: + input (str): The text to extract the nomination name from. + + Returns: + str: The extracted nomination name (full name), or None if not found. + """ regex = re.compile(r"Nomina(?:non|tion)\s+([A-Z][a-z]+\s[A-Z][a-z]+)") match = re.search(regex, input) if match: @@ -55,6 +122,20 @@ def extract_nomination_name(input): def extract_email(input): + """ + Extracts the email address from the given text using a regular expression. + + This function searches for email addresses in the format of username@domain.com, + where username can contain letters, numbers, periods, underscores, percent signs, plus signs, + and hyphens, and domain can contain letters, numbers, periods, and hyphens. + + Args: + input (str): The text to extract the email address from. + + Returns: + str: The extracted email address, or None if not found. + """ + regex = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}") match = re.search(regex, input) if match: @@ -63,6 +144,18 @@ def extract_email(input): def extract_account_no(input): + """ + Extracts the account number from the given text using a regular expression. + + This function searches for patterns containing "Account Number:" followed by + 9 to 12 digits, considering case-insensitivity. + + Args: + input (str): The text to extract the account number from. + + Returns: + str: The extracted account number, or None if not found. + """ regex = re.compile(r"Account Number:\s*(\d{9,12})", re.IGNORECASE) match = re.search(regex, input) if match: @@ -71,6 +164,18 @@ def extract_account_no(input): def extract_cif_no(input): + """ + Extracts the CIF number from the given text using a regular expression. + + This function searches for patterns containing "CIF" (case-insensitive), + optionally followed by "No" or ".", and then extracts the following digits. + + Args: + input (str): The text to extract the CIF number from.
+ + Returns: + str: The extracted CIF number, or None if not found. + """ regex = re.compile(r"CIF(?: No)?\.?\s*(\d+)", re.IGNORECASE) match = re.search(regex, input) if match: @@ -79,6 +184,19 @@ def extract_cif_no(input): def extract_address(input): + """ + Extracts the address from the given text using a regular expression. + + This function attempts to extract addresses using a list of patterns that + commonly represent addresses. The patterns include house numbers, street names, + city/town names, and postal codes. + + Args: + input (str): The text to extract the address from. + + Returns: + str: The extracted address, or None if no matching pattern is found. + """ regex = [ r"\d+\s[A-Za-z\s,]+(?:Road|Street|Avenue|Boulevard|Lane|Drive|Court|Place|Square|Plaza|Terrace|Trail|Parkway|Circle)\s*,?\s*(?:\d{5}|\d{5}-\d{4})?", r"\d+\s[A-Za-z\s,]+(?:Road|Street|Avenue|Boulevard|Lane|Drive|Court|Place|Square|Plaza|Terrace|Trail|Parkway|Circle)", @@ -97,6 +215,18 @@ def extract_address(input): def parse_passbook_frontpage(image_path): + """ + Parses a passbook front page image to extract various customer and account information. + + This function uses EasyOCR to read text from the image and then employs regular expressions + to extract specific details like name, account number, address, phone number, etc. + + Args: + image_path (str): The path to the passbook front page image. + + Returns: + dict: A dictionary containing the extracted passbook information. + """ reader = easyocr.Reader(["en"]) image = cv2.imread(image_path) diff --git a/openbharatocr/ocr/passport.py b/openbharatocr/ocr/passport.py index 6d317d2..dee0e43 100644 --- a/openbharatocr/ocr/passport.py +++ b/openbharatocr/ocr/passport.py @@ -6,6 +6,23 @@ def preprocess_for_bold_text(image): + """ + Preprocesses an image to enhance bold text for improved OCR extraction. + + This function performs several image processing steps: + + 1. Converts the image to grayscale. + 2. 
Applies morphological opening to reduce noise. + 3. Increases contrast to make bold text more prominent. + 4. Applies binarization with Otsu's thresholding. + 5. Applies sharpening to further enhance text edges. + + Args: + image (numpy.ndarray): The image to preprocess. + + Returns: + numpy.ndarray: The preprocessed image with enhanced bold text. + """ gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1)) @@ -21,6 +38,19 @@ def preprocess_for_bold_text(image): def extract_names(input): + """ + Extracts first and last name from the given text using regular expressions. + + This function searches for patterns containing "Names" or "Surname" followed by + one or more words, considering case-insensitivity. + + Args: + input (str): The text to extract names from. + + Returns: + tuple: A tuple containing the extracted first name and last name (both strings), + or empty strings if not found. + """ name_regex = r"Names[\s:]+([A-Za-z\s]+)(?:\n|$)" surname_regex = r"Surname[\s:]+([A-Za-z\s]+)(?:\n|$)" @@ -34,6 +64,17 @@ def extract_names(input): def extract_all_dates(input): + """ + Extracts all dates in DD/MM/YYYY or DD-MM-YYYY format from the given text using a regular expression. + + This function sorts the extracted dates by their year component only and removes duplicates. + + Args: + input (str): The text to extract dates from. + + Returns: + list: A list of extracted dates in sorted order (strings). + """ regex = r"\b(\d{2}[/\-.]\d{2}[/\-.](?:\d{4}|\d{2}))\b" dates = re.findall(regex, input) dates = sorted(dates, key=lambda x: int(re.split(r"[-/]", x)[-1])) @@ -50,6 +91,19 @@ def extract_all_dates(input): def extract_all_places(input): + """ + Extracts place names from the text following the last identified date in the document, + assuming a newline separates dates and places. + + This function filters out anything except letters, punctuation, spaces, and apostrophes, + and removes single-character entries.
+ + Args: + input (str): The text to extract places from. + + Returns: + list: A list of extracted place names (strings). + """ dates = re.findall(r"\b(\d{2}[/\-.]\d{2}[/\-.](?:\d{4}|\d{2}))\b", input) last_date = dates[-1] if dates else None @@ -65,6 +119,17 @@ def extract_all_places(input): def extract_passport_number(input): + """ + Extracts the passport number from the given text using a regular expression. + + This function searches for a pattern starting with a capital letter followed by 7 digits. + + Args: + input (str): The text to extract the passport number from. + + Returns: + str: The extracted passport number, or an empty string if not found. + """ regex = r"[A-Z][0-9]{7}" match = re.search(regex, input) passport_number = match.group(0) if match else "" @@ -73,6 +138,20 @@ def extract_passport_number(input): def extract_details(input): + """ + Extracts name, surname, and gender from the given text using a combination of + regular expressions and heuristics. + + This function assumes lines with only uppercase characters separated by spaces + represent the name and surname. It also checks the last line for "M" or "F" to infer gender. + + Args: + input (str): The text to extract details from. + + Returns: + tuple: A tuple containing extracted gender (string), first name (string), + and last name (string). Empty strings are returned if not found. + """ lines = input.split("\n") clean = [] for line in lines: @@ -101,6 +180,27 @@ def extract_details(input): def extract_passport_details(image_path): + """ + Extracts passport details from an image using a combination of OCR and text processing. + + This function performs the following steps: + + 1. Reads the image using Pillow. + 2. Extracts text using Tesseract (saves a JPEG copy for pre-processing). + 3. Preprocesses the image (JPEG copy) to enhance bold text for OCR. + 4. Extracts text again using Tesseract on the preprocessed image. + 5. 
Extracts dates (DoB, DoI, Expiry) using regular expressions from the original text. + 6. Extracts passport number using a regular expression from the original text. + 7. Extracts places (PoB, PoI) based on text following the last date. + 8. Extracts name, surname, and gender using heuristics on cleaned preprocessed text. + + Args: + image_path (str): The path to the passport image. + + Returns: + dict: A dictionary containing extracted passport details with keys like + "Name", "Surname", "Passport Number", etc. + """ image = Image.open(image_path) extracted_text = pytesseract.image_to_string(image) image.save("image.jpg", "JPEG") @@ -138,4 +238,13 @@ def extract_passport_details(image_path): def passport(image_path): + """ + Extracts passport details from an image using the extract_passport_details function. + + Args: + image_path (str): The path to the passport image. + + Returns: + dict: A dictionary containing extracted passport details. + """ return extract_passport_details(image_path) diff --git a/openbharatocr/ocr/vehicle_registration.py b/openbharatocr/ocr/vehicle_registration.py index a564112..d7ae30c 100644 --- a/openbharatocr/ocr/vehicle_registration.py +++ b/openbharatocr/ocr/vehicle_registration.py @@ -4,6 +4,20 @@ def extract_names(input): + """ + Extracts owner name and son/wife/daughter of (SWD) information from the given text using regular expressions. + + This function attempts to extract names in three ways, prioritizing formats + containing "Dual Owner", "NAME", and "S/O W/D" patterns. + + Args: + input (str): The text to extract names from. + + Returns: + tuple: A tuple containing the extracted full name (string) + and son/wife/daughter of information (SWD, string), + or empty strings if not found. 
+ """ regex_swd = r"dual\sOwner\)?\s*:?\s*([A-Z.]+\s[A-Z.]+\s[A-Z.]+)" match = re.search(regex_swd, input, re.IGNORECASE) swd = match.group(1) if match else "" @@ -26,6 +40,18 @@ def extract_names(input): def extract_reg_number(input): + """ + Extracts the vehicle registration number from the given text using a regular expression. + + This function searches for a run of 10 uppercase letters or digits, using a + lookahead to require a digit at or after the start of that run on the same line. + + Args: + input (str): The text to extract the registration number from. + + Returns: + str: The extracted registration number, or an empty string if not found. + """ regex = r"(?=.*\d)[A-Z0-9]{10}" match = re.search(regex, input) reg_number = match.group(0) if match else "" @@ -34,6 +60,17 @@ def extract_reg_number(input): def extract_chasis(input): + """ + Extracts the chasis number from the given text using a regular expression. + + This function searches for a run of 17 or 18 uppercase letters or digits. + + Args: + input (str): The text to extract the chasis number from. + + Returns: + str: The extracted chasis number, or an empty string if not found. + """ regex = r"[A-Z0-9]{17,18}" match = re.search(regex, input) chasis = match.group(0) if match else "" @@ -42,6 +79,18 @@ def extract_chasis(input): def extract_fuel_type(input): + """ + Extracts the fuel type from the given text using a regular expression. + + This function searches for patterns containing "Fuel Type" or "Fuel" followed by + a colon, period, or whitespace, and then extracts the following text containing letters and slashes. + + Args: + input (str): The text to extract the fuel type from. + + Returns: + str: The extracted fuel type, or an empty string if not found.
+ """ regex = r"Fuel(?:\s+Type)?\s*[\s:\.]\s*([A-Z/]+)\s" match = re.search(regex, input, re.IGNORECASE) fuel_type = match.group(1) if match else "" @@ -49,6 +98,18 @@ def extract_fuel_type(input): def extract_vehicle_class(input): + """ + Extracts the vehicle class from the given text using a regular expression. + + This function searches for "Vehicle Class" (matched loosely as "Veh.c.e Class") or "Veh Cl" + followed by a colon or whitespace, and then matches two whitespace-separated tokens. + + Args: + input (str): The text to extract the vehicle class from. + + Returns: + str: The extracted vehicle class (two words combined), or an empty string if not found. + """ regex = r"(?:Veh.c.e\sClass|Veh\sCl)\s*[\s:]\s*([A-Z0-9/()-]+)\s([A-Z0-9/()-]+)\s" match = re.search(regex, input, re.IGNORECASE) vehicle_class = match.group(1) if match else "" @@ -56,6 +117,18 @@ def extract_vehicle_class(input): def extract_manufacturer(input): + """ + Extracts the vehicle manufacturer from the given text using a regular expression. + + This function searches for a pattern containing "MFR" followed by a colon and extracts + the following text containing letters and spaces. + + Args: + input (str): The text to extract the manufacturer from. + + Returns: + str: The extracted manufacturer, or an empty string if not found. + """ regex = r"MFR\s*:\s*([A-Z\s]+)\n" match = re.search(regex, input, re.IGNORECASE) manufacturer = match.group(1) if match else "" @@ -63,6 +136,18 @@ def extract_manufacturer(input): def extract_tax_info(input): + """ + Extracts tax information (up to which month/year) from the given text using a regular expression. + + This function searches for a pattern containing "Tax Up To" (or "Tax Upto") followed by a colon, + and then extracts the following run of letters (assuming it represents the month/year). + + Args: + input (str): The text to extract the tax information from. + + Returns: + str: The extracted tax information (month/year), or an empty string if not found.
+ """ regex = r"Tax\sUp\s{0,1}to\s*:\s*([A-Z]+)\s" match = re.search(regex, input, re.IGNORECASE) tax_up_to = match.group(1) if match else "" @@ -70,6 +155,19 @@ def extract_tax_info(input): def extract_model(input): + """ + Extracts the vehicle model from the given text using a regular expression. + + This function searches for a pattern containing "Model" followed by a colon or space, + and then extracts the following text containing letters, numbers, forward slashes, + hyphens, parentheses, periods, and spaces (up to 4 words). + + Args: + input (str): The text to extract the model from. + + Returns: + str: The extracted vehicle model, or an empty string if not found. + """ regex = r"Mode.\s*[\s:]\s*([A-Z0-9/+()-.]+(?:\s+[^\w\n]*[A-Z0-9/+()-.]+){0,3})\s" match = re.search(regex, input, re.IGNORECASE) model = match.group(1) if match else "" @@ -77,6 +175,18 @@ def extract_model(input): def extract_all_dates(input_text): + """ + Extracts all dates from the given text using a regular expression. + + This function searches for patterns in formats like DD/MM/YYYY, DD-MM-YYYY, or + MMM/YYYY, and sorts the extracted dates chronologically. + + Args: + input_text (str): The text to extract dates from. + + Returns: + list: A list of extracted dates sorted in ascending order (strings). + """ regex = r"\b(\d{1,2}[/\-.](?:\d{2}|\d{4}|\w{3})[/\-.]\d{2,4})\b" dates = re.findall(regex, input_text) sorted_dates = sorted( @@ -87,6 +197,18 @@ def extract_all_dates(input_text): def extract_address(input): + """ + Extracts the address from the given text using a regular expression. + + This function searches for patterns containing "Address" (optional colon) + followed by any characters and spaces, prioritizing lines ending with a postal code (6 digits). + + Args: + input (str): The text to extract the address from. + + Returns: + str: The extracted address, or an empty string if not found. 
+ """ regex = r"Address:?\s*((?:.|\n)*?\d{6})" match = re.search(regex, input, re.IGNORECASE) address = match.group(1) if match else "" @@ -95,6 +217,27 @@ def extract_address(input): def extract_vehicle_registration_details(image_path): + """ + Extracts vehicle registration details from an image using a combination of OCR and text processing. + + This function performs the following steps: + + 1. Reads the image using Pillow. + 2. Extracts text using Tesseract (assuming the text is in a supported language). + 3. Extracts owner name and son/wife/daughter of (SWD) information using regular expressions. + 4. Extracts all dates using a regular expression and sorts them chronologically. + 5. Extracts vehicle details like registration number, chasis number, fuel type, + vehicle class, and model using regular expressions. + 6. Extracts manufacturer and tax information using regular expressions. + 7. Extracts address using a regular expression prioritizing lines ending with a postal code. + + Args: + image_path (str): The path to the vehicle registration image. + + Returns: + dict: A dictionary containing extracted vehicle registration details with keys like + "Registration Number", "Chasis Number", "Full Name", etc. + """ image = Image.open(image_path) extracted_text = pytesseract.image_to_string(image) @@ -134,4 +277,13 @@ def extract_vehicle_registration_details(image_path): def vehicle_registration(image_path): + """ + Extracts vehicle registration details from an image using the extract_vehicle_registration_details function. + + Args: + image_path (str): The path to the vehicle registration image. + + Returns: + dict: A dictionary containing extracted vehicle registration details. 
+ """ return extract_vehicle_registration_details(image_path) diff --git a/openbharatocr/ocr/voter_id.py b/openbharatocr/ocr/voter_id.py index ec51307..d3a2e69 100644 --- a/openbharatocr/ocr/voter_id.py +++ b/openbharatocr/ocr/voter_id.py @@ -17,6 +17,24 @@ def preprocess_for_bold_text(image): + """ + Preprocesses an image to enhance bold text for improved OCR extraction. + + This function performs the following steps: + + 1. Converts the image to grayscale. + 2. Applies morphological opening (erosion followed by dilation) with a rectangular kernel + to reduce noise, especially around bold text. + 3. Increases contrast using weighted addition to make bold text stand out more. + 4. Applies binarization with Otsu's thresholding to separate foreground (text) from background. + 5. Applies sharpening using a Laplacian filter to further enhance edges of bold text. + + Args: + image (numpy.ndarray): The image to preprocess. + + Returns: + numpy.ndarray: The preprocessed image with enhanced bold text. + """ gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1)) @@ -32,6 +50,34 @@ def preprocess_for_bold_text(image): def extract_voter_details_yolo(image_path): + """ + Extracts voter information from a voter ID image using YOLO object detection and OCR. + + This function performs the following steps: + + 1. Reads the image using Pillow. + 2. Loads a pre-trained YOLO model for object detection. (YOLO_CFG and YOLO_WEIGHT are assumed to be defined elsewhere) + 3. Defines classes to be detected: "elector" (elector's name), "relation" (father's name), "voterid". + 4. Converts the image to RGB format (assuming YOLO expects RGB). + 5. Uses a temporary directory to store a temporary image file for processing. + 6. Reads the temporary image using OpenCV. + 7. Performs object detection using YOLO, identifying bounding boxes and class labels for detected objects. + 8. 
Initializes empty dictionaries for boxes, confidences, class IDs, and detected texts. + 9. Iterates through each detected object's information: + - Extracts bounding box coordinates (x, y, width, height). + - Calculates absolute coordinates based on the image size. + - Crops the image based on the bounding box to isolate the detected region. + - Skips empty crops to avoid errors. + - Extracts text from the cropped image using Tesseract with configuration for single block processing. + - Stores the extracted text along with the corresponding class label in the detected_texts dictionary. + 10. Returns the dictionary containing detected texts categorized by their labels (e.g., "elector": "John Doe"). + + Args: + image_path (str): The path to the voter ID image. + + Returns: + dict: A dictionary containing voter information extracted using OCR, categorized by labels (e.g., "elector": "John Doe"). + """ image = Image.open(image_path) net = cv2.dnn.readNetFromDarknet(YOLO_CFG, YOLO_WEIGHT) classes = ["elector", "relation", "voterid"] @@ -100,6 +146,18 @@ def extract_voter_details_yolo(image_path): def extract_voter_id(input): + """ + Extracts the voter ID number from the given text using a regular expression. + + This function searches for a pattern containing 0 to 3 optional characters followed by 7 digits. + + Args: + input (str): The text to extract the voter ID from. + + Returns: + str: The extracted voter ID, or an empty string if not found. + """ + regex = r".{0,3}[0-9]{7}" match = re.search(regex, input) voter_id = match.group(0) if match else "" @@ -108,6 +166,18 @@ def extract_voter_id(input): def extract_names(input): + """ + Extracts names from the given text using a regular expression. + + This function searches for the word "Name" followed by an optional colon, equal sign, or plus sign, and then captures any following characters. + It extracts all occurrences and returns a list, handling potential multiple names. 
+ + Args: + input (str): The text to extract names from. + + Returns: + list: A list of extracted names (strings), or an empty list if not found. + """ regex = r"Name\s*[:=+]?\s*(.*)" matches = re.findall(regex, input, re.IGNORECASE) names = [match.strip() for match in matches] if matches else [] @@ -116,6 +186,20 @@ def extract_names(input): def extract_lines_with_uppercase_words(input): + """ + Extracts lines containing sequences of uppercase words from the given text. + + This function iterates through lines in the input text: + 1. Uses a regular expression to search for lines containing one or more uppercase words separated by spaces. + 2. If a match is found, extracts all uppercase words using the same regular expression and appends them to a list. + + Args: + input (str): The text to extract lines with uppercase words from. + + Returns: + list: A list of extracted lines containing sequences of uppercase words (strings), + or an empty list if none are found. + """ lines_with_uppercase_words = [] pattern = r"\b[A-Z]+(?:\s+[A-Z]+)*\b" for line in input.split("\n"): @@ -127,6 +211,19 @@ def extract_lines_with_uppercase_words(input): def extract_gender(input): + """ + Extracts the gender from the given text using case-insensitive matching. + + This function searches for the presence of "Female" or "Male" (or their uppercase equivalents) in the input text. + It returns "Female" if a match for "Female" is found, "Male" if a match for "Male" is found, otherwise returns an empty string. + + Args: + input (str): The text to extract the gender from. + + Returns: + str: The extracted gender ("Female" or "Male"), or an empty string if not found. + """ + if "Female" in input or "FEMALE" in input: return "Female" elif "Male" in input or "MALE" in input: @@ -136,6 +233,19 @@ def extract_gender(input): def extract_date(input): + """ + Extracts the date of birth from the given text using a regular expression. 
+ + This function searches for a pattern containing two digits followed by a separator (slash, hyphen, or dot), + another two digits followed by a separator, and then either four or two digits representing the year. + The entire pattern must be surrounded by word boundaries. + + Args: + input (str): The text to extract the date of birth from. + + Returns: + str: The extracted date of birth (in format DD/MM/YYYY or DD-MM-YYYY), or an empty string if not found. + """ regex = r"\b([0-9X]{2}[/\-.][0-9X]{2}[/\-.](?:\d{4}|\d{2}))\b" match = re.search(regex, input) dob = match.group(0) if match else "" @@ -144,6 +254,19 @@ def extract_date(input): def extract_address(input): + """ + Extracts the address from the given text using a regular expression with two approaches. + + This function prioritizes lines containing "Address" followed by an optional colon and any characters/spaces, + ending with a postal code (6 digits). If not found, it attempts to extract any line containing + an address-like pattern (alphanumeric characters, punctuation, spaces) ending with a postal code. + + Args: + input (str): The text to extract the address from. + + Returns: + str: The extracted address (including postal code), or an empty string if not found. + """ regex = r"Address\s*:?\s*[A-Za-z0-9:,-.\n\s\/]+[0-9]{6}" match = re.search(regex, input) address = match.group(0) if match else "" @@ -157,6 +280,33 @@ def extract_address(input): def extract_voterid_details_front(image_path): + """ + Extracts voter information from the front side of a voter ID image using OCR with fallback for non-standard layouts. + + This function performs the following steps: + + 1. Opens the image using Pillow. + 2. Extracts text using Tesseract (assuming the text is in a supported language). + 3. Extracts voter ID, names (elector's and father's), gender, and date of birth using regular expressions. + 4. Converts the image to RGB format. + 5. Creates a temporary file to store a preprocessed image. + 6. 
Reads the image using OpenCV. + 7. Applies pre-processing to enhance bold text for better OCR. + 8. Extracts text again from the preprocessed image. + 9. If elector's name is not found using the initial extraction: + - Extracts lines containing sequences of uppercase words, potentially containing names. + - Assigns the last two words (assuming the second-last is the father's name) to elector's name and father's name. + 10. Similar logic is applied to extract the date of birth and voter ID if not found initially. + 11. Extracts gender using string matching. + 12. Returns a dictionary containing extracted voter information. + + Args: + image_path (str): The path to the front side of the voter ID image. + + Returns: + dict: A dictionary containing extracted voter information + (e.g., "Voter ID", "Elector's Name", "Father's Name", "Gender", "Date of Birth"). + """ image = Image.open(image_path) extracted_text = pytesseract.image_to_string(image) @@ -205,6 +355,23 @@ def extract_voterid_details_front(image_path): def extract_voterid_details_back(image_path): + """ + Extracts address and date of issue from the back side of a voter ID image using OCR. + + This function performs the following steps: + + 1. Opens the image using Pillow. + 2. Extracts text using Tesseract (assuming the text is in a supported language). + 3. Extracts address and date of issue using regular expressions. + 4. Returns a dictionary containing extracted information. + + Args: + image_path (str): The path to the back side of the voter ID image. + + Returns: + dict: A dictionary containing extracted information + (e.g., "Address", "Date of Issue"). + """ image = Image.open(image_path) extracted_text = pytesseract.image_to_string(image) @@ -215,6 +382,21 @@ def extract_voterid_details_back(image_path): def voter_id_front(front_path): + """ + Extracts voter information from the front side of a voter ID image using an adaptive approach. 
+ + This function first performs basic OCR to see if the layout includes keywords + like "Date", "Age", "Sex", or "Gender". If these keywords are found, it assumes + a standard layout and uses the `extract_voterid_details_front` function for extraction. + Otherwise, it employs the `extract_voter_details_yolo` function, which might be + more suitable for non-standard layouts that may require object detection. + + Args: + front_path (str): The path to the front side of the voter ID image. + + Returns: + dict: A dictionary containing extracted voter information. + """ image = Image.open(front_path) extracted_text = pytesseract.image_to_string(image) @@ -230,5 +412,18 @@ def voter_id_front(front_path): def voter_id_back(back_path): + """ + Extracts address and date of issue from the back side of a voter ID image. + + This function calls the `extract_voterid_details_back` function to process the + back side image and extract relevant information. + + Args: + back_path (str): The path to the back side of the voter ID image. + + Returns: + dict: A dictionary containing extracted information + (e.g., "Address", "Date of Issue"). + """ back_details = extract_voterid_details_back(back_path) return back_details diff --git a/openbharatocr/ocr/water_bill.py b/openbharatocr/ocr/water_bill.py index f10c5d2..36eafad 100644 --- a/openbharatocr/ocr/water_bill.py +++ b/openbharatocr/ocr/water_bill.py @@ -6,6 +6,14 @@ def preprocess_for_bold_text(image): + """Preprocesses an image to enhance bold text for OCR. + + Args: + image (numpy.ndarray): The image to preprocess. + + Returns: + numpy.ndarray: The preprocessed image. + """ gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1)) @@ -21,6 +29,14 @@ def preprocess_for_bold_text(image): def extract_name(input): + """Extracts the customer name from the text using regular expressions. + + Args: + input (str): The text to extract the name from. 
+ + Returns: + str: The extracted customer name (or empty string if not found). + """ regex = r"Name:\s*(.*?)(?:\.\s|(?=\n))" match = re.search(regex, input) name = match.group(1).strip() if match else "" @@ -34,6 +50,14 @@ def extract_name(input): def extract_bill_amount(input): + """Extracts the bill amount from the text using a regular expression. + + Args: + input (str): The text to extract the bill amount from. + + Returns: + str: The extracted bill amount (or empty string if not found). + """ regex = r"Bill Amount \(Rs\.\)\s*:? (\d+)" match = re.search(regex, input, re.IGNORECASE) bill_amount = match.group(1).strip() if match else "" @@ -41,6 +65,18 @@ def extract_bill_amount(input): def extract_meter_number(input): + """ + Extracts the meter number from the water bill text using a regular expression. + + This function assumes the meter number is labeled as "Meter No." (matched case-insensitively) followed by + a colon, with optional whitespace around it, and expects digits (0-9) or "NA" to represent the number. + + Args: + input (str): The text to extract the meter number from (typically the OCR output from a water bill image). + + Returns: + str: The extracted meter number (or an empty string if not found). + """ regex = r"Meter No\.\s*:\s*(\d+|NA)" match = re.search(regex, input, re.IGNORECASE) meter_number = match.group(1).strip() if match else "" @@ -48,6 +84,20 @@ def extract_meter_number(input): def extract_all_dates(input): + """ + Extracts all dates in the format DD-MMM-YYYY from the given text using a regular expression. + + This function assumes dates are in the format DD-MMM-YYYY with an uppercase month abbreviation (e.g., 15-JAN-2024). + It extracts all matching occurrences, parses them as datetime objects, + sorts them chronologically, and returns them as formatted strings (DD-MM-YYYY). + + Args: + input (str): The text to extract dates from. + + Returns: + list: A list of extracted dates in YYYY-MM-DD format (sorted chronologically), + or an empty list if no dates are found. 
+ """ regex = r"\b(\d{1,2}-[A-Z]{3}-\d{4})\b" dates = re.findall(regex, input) formatted_dates = [] @@ -119,6 +169,21 @@ def extract_bill_due_date(input): def extract_water_bill_details(image_path): + """Extracts water bill details from an image using OCR and regular expressions. + + This function performs the following steps: + + 1. Opens the image using Pillow. + 2. Extracts text using Tesseract (assuming the text is in a supported language). + 3. Extracts various water bill details using specific regular expressions. + + Args: + image_path (str): The path to the water bill image. + + Returns: + dict: A dictionary containing extracted water bill information + (e.g., "Name", "Bill Amount", "Bill Date", etc.). + """ image = Image.open(image_path) extracted_text = pytesseract.image_to_string(image) @@ -151,4 +216,14 @@ def extract_water_bill_details(image_path): def water_bill(image_path): + """Extracts water bill details from an image. + + This function is a wrapper for `extract_water_bill_details`. + + Args: + image_path (str): The path to the water bill image. + + Returns: + dict: A dictionary containing extracted water bill information. 
+ """ return extract_water_bill_details(image_path) diff --git a/openbharatocr/unit_tests/test_pan.py b/openbharatocr/unit_tests/test_pan.py index 64ea420..2403b1f 100644 --- a/openbharatocr/unit_tests/test_pan.py +++ b/openbharatocr/unit_tests/test_pan.py @@ -22,7 +22,7 @@ def test_clean_input_without_newlines(self): def test_clean_input_with_empty_string(self): match = [""] - expected_output = [""] + expected_output = [] assert clean_input(match) == expected_output def test_clean_input_with_none(self): diff --git a/setup.py b/setup.py index a3e806d..2e08f52 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ setuptools.setup( name="openbharatocr", - version="0.4.0", + version="0.4.1", description="openbharatocr is an opensource python library for ocr Indian government documents", long_description=long_description, long_description_content_type="text/markdown",