Skip to content

Commit

Permalink
Move sanitize_value to be common, Fix InPassportRecognizer (#1519)
Browse files Browse the repository at this point in the history
* Move sanitize_value into the shared EntityRecognizer class (entity_recognizer.py) so all recognizers use one implementation
Fix Indian Passport Number regex

* fix regex

* export module

* export module

* fix imports

* move to entity_recognizer.py

* move to entity_recognizer.py
  • Loading branch information
SharonHart authored Jan 26, 2025
1 parent 6f840ea commit 35ab8ae
Show file tree
Hide file tree
Showing 22 changed files with 187 additions and 264 deletions.
2 changes: 0 additions & 2 deletions presidio-analyzer/presidio_analyzer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
from presidio_analyzer.analyzer_engine import AnalyzerEngine
from presidio_analyzer.batch_analyzer_engine import BatchAnalyzerEngine
from presidio_analyzer.analyzer_request import AnalyzerRequest
from presidio_analyzer.analyzer_utils import PresidioAnalyzerUtils
from presidio_analyzer.context_aware_enhancers import ContextAwareEnhancer
from presidio_analyzer.context_aware_enhancers import LemmaContextAwareEnhancer
from presidio_analyzer.analyzer_engine_provider import AnalyzerEngineProvider
Expand Down Expand Up @@ -51,6 +50,5 @@
"ContextAwareEnhancer",
"LemmaContextAwareEnhancer",
"BatchAnalyzerEngine",
"PresidioAnalyzerUtils",
"AnalyzerEngineProvider",
]
75 changes: 0 additions & 75 deletions presidio-analyzer/presidio_analyzer/analyzer_utils.py

This file was deleted.

15 changes: 14 additions & 1 deletion presidio-analyzer/presidio_analyzer/entity_recognizer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import logging
from abc import abstractmethod
from typing import Dict, List, Optional
from typing import Dict, List, Optional, Tuple

from presidio_analyzer import RecognizerResult
from presidio_analyzer.nlp_engine import NlpArtifacts
Expand Down Expand Up @@ -196,3 +196,16 @@ def remove_duplicates(results: List[RecognizerResult]) -> List[RecognizerResult]
filtered_results.append(result)

return filtered_results

@staticmethod
def sanitize_value(text: str, replacement_pairs: List[Tuple[str, str]]) -> str:
    """
    Return the text after applying every replacement pair in order.

    Each pair is applied with ``str.replace``, so every occurrence of the
    search string is substituted, and later pairs operate on the output of
    the earlier ones.

    :param text: string to clean up
    :param replacement_pairs: (search, replacement) tuples, applied in order
    :return: the string with all replacements applied
    """
    cleaned = text
    for old, new in replacement_pairs:
        cleaned = cleaned.replace(old, new)
    return cleaned
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import List, Optional, Tuple

from presidio_analyzer import Pattern, PatternRecognizer
from presidio_analyzer import EntityRecognizer, Pattern, PatternRecognizer


class AbaRoutingRecognizer(PatternRecognizer):
Expand Down Expand Up @@ -59,7 +59,9 @@ def __init__(
)

def validate_result(self, pattern_text: str) -> bool: # noqa D102
sanitized_value = self.__sanitize_value(pattern_text, self.replacement_pairs)
sanitized_value = EntityRecognizer.sanitize_value(
pattern_text, self.replacement_pairs
)
return self.__checksum(sanitized_value)

@staticmethod
Expand All @@ -68,9 +70,3 @@ def __checksum(sanitized_value: str) -> bool:
for idx, m in enumerate([3, 7, 1, 3, 7, 1, 3, 7, 1]):
s += int(sanitized_value[idx]) * m
return s % 10 == 0

@staticmethod
def __sanitize_value(text: str, replacement_pairs: List[Tuple[str, str]]) -> str:
for search_string, replacement_string in replacement_pairs:
text = text.replace(search_string, replacement_string)
return text
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import List, Optional, Tuple

from presidio_analyzer import Pattern, PatternRecognizer
from presidio_analyzer import EntityRecognizer, Pattern, PatternRecognizer


class AuAbnRecognizer(PatternRecognizer):
Expand Down Expand Up @@ -72,7 +72,7 @@ def validate_result(self, pattern_text: str) -> bool:
:return: A bool indicating whether the validation was successful.
"""
# Pre-processing before validation checks
text = self.__sanitize_value(pattern_text, self.replacement_pairs)
text = EntityRecognizer.sanitize_value(pattern_text, self.replacement_pairs)
abn_list = [int(digit) for digit in text if not digit.isspace()]

# Set weights based on digit position
Expand All @@ -85,9 +85,3 @@ def validate_result(self, pattern_text: str) -> bool:
sum_product += abn_list[i] * weight[i]
remainder = sum_product % 89
return remainder == 0

@staticmethod
def __sanitize_value(text: str, replacement_pairs: List[Tuple[str, str]]) -> str:
for search_string, replacement_string in replacement_pairs:
text = text.replace(search_string, replacement_string)
return text
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import List, Optional, Tuple

from presidio_analyzer import Pattern, PatternRecognizer
from presidio_analyzer import EntityRecognizer, Pattern, PatternRecognizer


class AuAcnRecognizer(PatternRecognizer):
Expand Down Expand Up @@ -69,7 +69,7 @@ def validate_result(self, pattern_text: str) -> bool:
:return: A bool indicating whether the validation was successful.
"""
# Pre-processing before validation checks
text = self.__sanitize_value(pattern_text, self.replacement_pairs)
text = EntityRecognizer.sanitize_value(pattern_text, self.replacement_pairs)
acn_list = [int(digit) for digit in text if not digit.isspace()]

# Set weights based on digit position
Expand All @@ -82,9 +82,3 @@ def validate_result(self, pattern_text: str) -> bool:
remainder = sum_product % 10
complement = 10 - remainder
return complement == acn_list[-1]

@staticmethod
def __sanitize_value(text: str, replacement_pairs: List[Tuple[str, str]]) -> str:
for search_string, replacement_string in replacement_pairs:
text = text.replace(search_string, replacement_string)
return text
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import List, Optional, Tuple

from presidio_analyzer import Pattern, PatternRecognizer
from presidio_analyzer import EntityRecognizer, Pattern, PatternRecognizer


class AuMedicareRecognizer(PatternRecognizer):
Expand Down Expand Up @@ -69,7 +69,7 @@ def validate_result(self, pattern_text: str) -> bool:
:return: A bool indicating whether the validation was successful.
"""
# Pre-processing before validation checks
text = self.__sanitize_value(pattern_text, self.replacement_pairs)
text = EntityRecognizer.sanitize_value(pattern_text, self.replacement_pairs)
medicare_list = [int(digit) for digit in text if not digit.isspace()]

# Set weights based on digit position
Expand All @@ -81,9 +81,3 @@ def validate_result(self, pattern_text: str) -> bool:
sum_product += medicare_list[i] * weight[i]
remainder = sum_product % 10
return remainder == medicare_list[8]

@staticmethod
def __sanitize_value(text: str, replacement_pairs: List[Tuple[str, str]]) -> str:
for search_string, replacement_string in replacement_pairs:
text = text.replace(search_string, replacement_string)
return text
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import List, Optional, Tuple

from presidio_analyzer import Pattern, PatternRecognizer
from presidio_analyzer import EntityRecognizer, Pattern, PatternRecognizer


class AuTfnRecognizer(PatternRecognizer):
Expand Down Expand Up @@ -75,7 +75,7 @@ def validate_result(self, pattern_text: str) -> bool:
:return: A bool indicating whether the validation was successful.
"""
# Pre-processing before validation checks
text = self.__sanitize_value(pattern_text, self.replacement_pairs)
text = EntityRecognizer.sanitize_value(pattern_text, self.replacement_pairs)
tfn_list = [int(digit) for digit in text if not digit.isspace()]

# Set weights based on digit position
Expand All @@ -87,9 +87,3 @@ def validate_result(self, pattern_text: str) -> bool:
sum_product += tfn_list[i] * weight[i]
remainder = sum_product % 11
return remainder == 0

@staticmethod
def __sanitize_value(text: str, replacement_pairs: List[Tuple[str, str]]) -> str:
for search_string, replacement_string in replacement_pairs:
text = text.replace(search_string, replacement_string)
return text
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import List, Optional, Tuple

from presidio_analyzer import Pattern, PatternRecognizer
from presidio_analyzer import EntityRecognizer, Pattern, PatternRecognizer


class CreditCardRecognizer(PatternRecognizer):
Expand Down Expand Up @@ -59,7 +59,9 @@ def __init__(
)

def validate_result(self, pattern_text: str) -> bool: # noqa D102
sanitized_value = self.__sanitize_value(pattern_text, self.replacement_pairs)
sanitized_value = EntityRecognizer.sanitize_value(
pattern_text, self.replacement_pairs
)
checksum = self.__luhn_checksum(sanitized_value)

return checksum
Expand All @@ -76,9 +78,3 @@ def digits_of(n: str) -> List[int]:
for d in even_digits:
checksum += sum(digits_of(str(d * 2)))
return checksum % 10 == 0

@staticmethod
def __sanitize_value(text: str, replacement_pairs: List[Tuple[str, str]]) -> str:
for search_string, replacement_string in replacement_pairs:
text = text.replace(search_string, replacement_string)
return text
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import List, Optional, Tuple

from presidio_analyzer import Pattern, PatternRecognizer
from presidio_analyzer import EntityRecognizer, Pattern, PatternRecognizer


class EsNieRecognizer(PatternRecognizer):
Expand Down Expand Up @@ -39,6 +39,9 @@ def __init__(
supported_entity: str = "ES_NIE",
replacement_pairs: Optional[List[Tuple[str, str]]] = None,
):
self.replacement_pairs = (
replacement_pairs if replacement_pairs else [("-", ""), (" ", "")]
)
patterns = patterns if patterns else self.PATTERNS
context = context if context else self.CONTEXT
super().__init__(
Expand All @@ -51,7 +54,9 @@ def __init__(
def validate_result(self, pattern_text: str) -> bool:
"""Validate the pattern by using the control character."""

pattern_text = EsNieRecognizer.__sanitize_value(pattern_text)
pattern_text = EntityRecognizer.sanitize_value(
pattern_text, self.replacement_pairs
)

letters = "TRWAGMYFPDXBNJZSQVHLCKE"
letter = pattern_text[-1]
Expand All @@ -66,7 +71,3 @@ def validate_result(self, pattern_text: str) -> bool:
# replace XYZ with 012, and check the mod 23
number = int(str("XYZ".index(pattern_text[0])) + pattern_text[1:-1])
return letter == letters[number % 23]

@staticmethod
def __sanitize_value(text: str) -> str:
return text.replace("-", "").replace(" ", "")
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import List, Optional, Tuple

from presidio_analyzer import Pattern, PatternRecognizer
from presidio_analyzer import EntityRecognizer, Pattern, PatternRecognizer


class EsNifRecognizer(PatternRecognizer):
Expand Down Expand Up @@ -47,12 +47,10 @@ def __init__(
)

def validate_result(self, pattern_text: str) -> bool: # noqa D102
pattern_text = EsNifRecognizer.__sanitize_value(pattern_text)
pattern_text = EntityRecognizer.sanitize_value(
pattern_text, self.replacement_pairs
)
letter = pattern_text[-1]
number = int("".join(filter(str.isdigit, pattern_text)))
letters = "TRWAGMYFPDXBNJZSQVHLCKE"
return letter == letters[number % 23]

@staticmethod
def __sanitize_value(text: str) -> str:
return text.replace("-", "").replace(" ", "")
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,9 @@ def __init__(

def validate_result(self, pattern_text: str): # noqa D102
try:
pattern_text = self.__sanitize_value(pattern_text, self.replacement_pairs)
pattern_text = EntityRecognizer.sanitize_value(
pattern_text, self.replacement_pairs
)
is_valid_checksum = (
self.__generate_iban_check_digits(pattern_text, self.LETTERS)
== pattern_text[2:4]
Expand Down Expand Up @@ -204,9 +206,3 @@ def __is_valid_format(
return country_regex and re.match(country_regex, iban, flags=flags)

return False

@staticmethod
def __sanitize_value(text: str, replacement_pairs: List[Tuple[str, str]]) -> str:
for search_string, replacement_string in replacement_pairs:
text = text.replace(search_string, replacement_string)
return text
Loading

0 comments on commit 35ab8ae

Please sign in to comment.