essentiasoftserv · nerdyk3 · Jun 21, 2024 · Jun 21, 2024 · Jun 21, 2024 · Jun 21, 2024
diff --git a/openbharatocr/ocr/pan.py b/openbharatocr/ocr/pan.py
@@ -7,23 +7,22 @@
 
 
 def clean_input(match):
+    """
+    Cleans the extracted text by splitting lines and removing stopwords.
+
+    Args:
+        match (list): A list of extracted text chunks.
+
+    Returns:
+        list: A cleaned list of individual names.
+    """
     cleaned = []
 
     for name in match:
         split_name = name.split("\n")
         for chunk in split_name:
             cleaned.append(chunk)
 
-    return cleaned
-
-
-def extract_all_names(input):
-    regex = r"\n[A-Z\s]+\b"
-    match = re.findall(regex, input)
-
-    names = []
-    cleaned = clean_input(match)
-
     stopwords = ["INDIA", "OF", "TAX", "GOVT", "DEPARTMENT", "INCOME"]
 
     names = [
@@ -35,7 +34,34 @@ def extract_all_names(input):
     return names
 
 
+def extract_all_names(input):
+    """
+    Extracts all names from the given text using a regular expression and performs basic cleaning.
+
+    Args:
+        input (str): The text to extract names from.
+
+    Returns:
+        list: A list of extracted names.
+    """
+    regex = r"\n[A-Z\s]+\b"
+    match = re.findall(regex, input)
+
+    names = []
+    cleaned = clean_input(match)
+    return cleaned
+
+
 def extract_pan(input):
+    """
+    Extracts the PAN number from the given text using a regular expression.
+
+    Args:
+        input (str): The text to extract the PAN number from.
+
+    Returns:
+        str: The extracted PAN number, or an empty string if not found.
+    """
     regex = r"[A-Z]{5}[0-9]{4}[A-Z]"
     match = re.search(regex, input)
     pan_number = match.group(0) if match else ""
@@ -44,6 +70,15 @@ def extract_pan(input):
 
 
 def extract_dob(input):
+    """
+    Extracts the date of birth from the given text using a regular expression.
+
+    Args:
+        input (str): The text to extract the date of birth from.
+
+    Returns:
+        str: The first date found, as written in the text (DD/MM/YYYY, with "/", "-" or "." separators and a two- or four-digit year), or an empty string if not found.
+    """
     regex = r"\b(\d{2}[/\-.]\d{2}[/\-.](?:\d{4}|\d{2}))\b"
     match = re.search(regex, input)
     dob = match.group(0) if match else ""
@@ -52,6 +87,18 @@ def extract_dob(input):
 
 
 def extract_pan_details(image_path):
+    """
+    Extracts PAN details (full name, parent's name, date of birth, PAN number) from a PAN card image.
+
+    This version attempts extraction from the original image and a converted JPEG version
+    to improve compatibility.
+
+    Args:
+        image_path (str): The path to the PAN card image.
+
+    Returns:
+        dict: A dictionary containing extracted PAN details.
+    """
     image = Image.open(image_path)
     extracted_text = pytesseract.image_to_string(image)
 
@@ -79,6 +126,25 @@ def extract_pan_details(image_path):
 
 
 def preprocess_for_sketch(image_path):
+    """
+    Preprocesses an image to convert it into a black and white sketch-like look
+    for improved text extraction.
+
+    This function performs several image processing steps:
+
+    1. Reads the image using OpenCV.
+    2. Converts the image to grayscale.
+    3. Applies Gaussian blur to smooth the image and reduce noise.
+    4. Applies adaptive thresholding to convert the image to binary (black and white).
+    5. Applies morphological operations (opening) to reduce noise and enhance text regions.
+    6. Inverts the image colors for better text recognition by Tesseract.
+
+    Args:
+        image_path (str): The path to the image.
+
+    Returns:
+        numpy.ndarray: The preprocessed image in a black and white sketch-like format.
+    """
     # Read the image
     image = cv2.imread(image_path)
 
@@ -104,6 +170,19 @@ def preprocess_for_sketch(image_path):
 
 
 def extract_pan_details_version2(image_path):
+    """
+    Extracts PAN details (full name, parent's name, date of birth, PAN number) from a PAN card image
+    using a pre-processing step that converts the image to a sketch-like format.
+
+    This version aims to improve extraction accuracy in cases where Version 1 might struggle.
+
+    Args:
+        image_path (str): The path to the PAN card image.
+
+    Returns:
+        dict: A dictionary containing extracted PAN details.
+    """
+
     # Preprocess the image to convert it into a black and white sketch-like look
     preprocessed_image = preprocess_for_sketch(image_path)
 
@@ -126,6 +205,23 @@ def extract_pan_details_version2(image_path):
 
 
 def pan(image_path):
+    """
+    Extracts PAN details (full name, parent's name, date of birth, PAN number) from a PAN card image.
+
+    This function attempts extraction using two versions:
+
+    1. Version 1: Extracts details from the original image and a converted JPEG version
+       to improve compatibility.
+    2. Version 2: If any details are missing from Version 1, it applies a pre-processing
+       step that converts the image to a sketch-like format and then extracts details.
+
+    Args:
+        image_path (str): The path to the PAN card image.
+
+    Returns:
+        dict: A dictionary containing extracted PAN details, with missing details from
+              Version 1 filled in by Version 2 if necessary.
+    """
     # Run Version 1
     result = extract_pan_details(image_path)
 

diff --git a/openbharatocr/ocr/passbook.py b/openbharatocr/ocr/passbook.py
@@ -4,6 +4,15 @@
 
 
 def extract_name(input):
+    """
+    Extracts the customer name from the given text using a regular expression.
+
+    Args:
+        input (str): The text to extract the name from.
+
+    Returns:
+        str: The extracted customer name, or None if not found.
+    """
     regex = re.compile(r"Customer Name\s+([A-Z\s]+)")
     match = re.search(regex, input)
     if match:
@@ -12,6 +21,15 @@ def extract_name(input):
 
 
 def extract_open_date(input):
+    """
+    Extracts the account open date from the given text using a regular expression.
+
+    Args:
+        input (str): The text to extract the open date from.
+
+    Returns:
+        str: The extracted account open date in DD MMM YYYY format, or None if not found.
+    """
     regex = re.compile(r"Open Date\s*(\d{1,2} \w{3} \d{4})")
     match = re.search(regex, input)
     if match:
@@ -20,6 +38,19 @@ def extract_open_date(input):
 
 
 def extract_bank_name(input):
+    """
+    Extracts the bank name from the given text using a regular expression.
+
+    This function searches for a run of letters, whitespace, and "&" ending in
+    upper-case "BANK", "BANK LTD", "BANK LIMITED", or "CREDIT UNION". The match
+    is case-sensitive, and the whitespace in the name may include line breaks.
+
+    Args:
+        input (str): The text to extract the bank name from.
+
+    Returns:
+        str: The extracted bank name, or None if not found.
+    """
     regex = re.compile(
         r"\b[A-Za-z\s&]+(?:BANK|BANK LTD|BANK LIMITED|CREDIT UNION)\b", re.MULTILINE
     )
@@ -31,6 +62,18 @@ def extract_bank_name(input):
 
 
 def extract_phone(input):
+    """
+    Extracts the phone number from the given text using a regular expression.
+
+    This function searches for patterns starting with "Mobile No" and extracts
+    the following digits, considering case-insensitivity.
+
+    Args:
+        input (str): The text to extract the phone number from.
+
+    Returns:
+        str: The extracted phone number, or None if not found.
+    """
     regex = re.compile(r"Mobile No\s*(\d+)", re.IGNORECASE)
     match = re.search(regex, input)
     if match:
@@ -39,6 +82,18 @@ def extract_phone(input):
 
 
 def extract_branch_name(input):
+    """
+    Extracts the branch name from the given text using a regular expression.
+
+    This function searches for patterns starting with "Branch Name" and extracts
+    the following text, considering case-insensitivity.
+
+    Args:
+        input (str): The text to extract the branch name from.
+
+    Returns:
+        str: The extracted branch name, or None if not found.
+    """
     regex = re.compile(r"Branch Name\s*([A-Za-z\d\s-]+)", re.IGNORECASE)
     match = re.search(regex, input)
     if match:
@@ -47,6 +102,18 @@ def extract_branch_name(input):
 
 
 def extract_nomination_name(input):
+    """
+    Extracts the nomination name from the given text using a regular expression.
+
+    This function searches for "Nomination" (or "Nominanon", presumably a common
+    OCR misreading of it) followed by two capitalized words.
+
+    Args:
+        input (str): The text to extract the nomination name from.
+
+    Returns:
+        str: The extracted nomination name (full name), or None if not found.
+    """
     regex = re.compile(r"Nomina(?:non|tion)\s+([A-Z][a-z]+\s[A-Z][a-z]+)")
     match = re.search(regex, input)
     if match:
@@ -55,6 +122,20 @@ def extract_nomination_name(input):
 
 
 def extract_email(input):
+    """
+    Extracts the email address from the given text using a regular expression.
+
+    This function searches for email addresses in the format of username@domain.tld,
+    where username can contain letters, numbers, periods, underscores, percent signs,
+    plus signs, and hyphens, and domain can contain letters, numbers, periods, and hyphens.
+
+    Args:
+        input (str): The text to extract the email address from.
+
+    Returns:
+        str: The extracted email address, or None if not found.
+    """
+
     regex = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
     match = re.search(regex, input)
     if match:
@@ -63,6 +144,18 @@ def extract_email(input):
 
 
 def extract_account_no(input):
+    """
+    Extracts the account number from the given text using a regular expression.
+
+    This function searches for patterns containing "Account Number:" followed by
+    9 to 12 digits, considering case-insensitivity.
+
+    Args:
+        input (str): The text to extract the account number from.
+
+    Returns:
+        str: The extracted account number, or None if not found.
+    """
     regex = re.compile(r"Account Number:\s*(\d{9,12})", re.IGNORECASE)
     match = re.search(regex, input)
     if match:
@@ -71,6 +164,18 @@ def extract_account_no(input):
 
 
 def extract_cif_no(input):
+    """
+    Extracts the CIF number from the given text using a regular expression.
+
+    This function searches for patterns containing "CIF" (case-insensitive),
+    optionally followed by "No" or ".", and then extracts the following digits.
+
+    Args:
+        input (str): The text to extract the CIF number from.
+
+    Returns:
+        str: The extracted CIF number, or None if not found.
+    """
     regex = re.compile(r"CIF(?: No)?\.?\s*(\d+)", re.IGNORECASE)
     match = re.search(regex, input)
     if match:
@@ -79,6 +184,19 @@ def extract_cif_no(input):
 
 
 def extract_address(input):
+    """
+    Extracts the address from the given text using a regular expression.
+
+    This function attempts to extract addresses using a list of patterns that
+    commonly represent addresses. The patterns include house numbers, street names,
+    city/town names, and postal codes.
+
+    Args:
+        input (str): The text to extract the address from.
+
+    Returns:
+        str: The extracted address, or None if no matching pattern is found.
+    """
     regex = [
         r"\d+\s[A-Za-z\s,]+(?:Road|Street|Avenue|Boulevard|Lane|Drive|Court|Place|Square|Plaza|Terrace|Trail|Parkway|Circle)\s*,?\s*(?:\d{5}|\d{5}-\d{4})?",
         r"\d+\s[A-Za-z\s,]+(?:Road|Street|Avenue|Boulevard|Lane|Drive|Court|Place|Square|Plaza|Terrace|Trail|Parkway|Circle)",
@@ -97,6 +215,18 @@ def extract_address(input):
 
 
 def parse_passbook_frontpage(image_path):
+    """
+    Parses a passbook front page image to extract various customer and account information.
+
+    This function uses EasyOCR to read text from the image and then employs regular expressions
+    to extract specific details like name, account number, address, phone number, etc.
+
+    Args:
+        image_path (str): The path to the passbook front page image.
+
+    Returns:
+        dict: A dictionary containing the extracted passbook information.
+    """
     reader = easyocr.Reader(["en"])
 
     image = cv2.imread(image_path)