From 736bf7b6aab018442bb0c06a89f65deb3ff1daab Mon Sep 17 00:00:00 2001 From: Tomos Williams Date: Thu, 26 Sep 2024 13:36:14 +0100 Subject: [PATCH 1/2] working character checker for good name, party name and party address --- .../commands/get_lite_specialcharacters.py | 126 ++++++++++++++++++ 1 file changed, 126 insertions(+) create mode 100644 api/support/management/commands/get_lite_specialcharacters.py diff --git a/api/support/management/commands/get_lite_specialcharacters.py b/api/support/management/commands/get_lite_specialcharacters.py new file mode 100644 index 000000000..f19409961 --- /dev/null +++ b/api/support/management/commands/get_lite_specialcharacters.py @@ -0,0 +1,126 @@ +from api.applications.models import GoodOnApplication, PartyOnApplication, CaseStatusEnum +from django.core.management.base import BaseCommand +import csv +import re +from django.db.models import Q + + +class SpecialCharacterFinder: + match_string = r"[^a-zA-Z0-9 .,\-\)\(\/'+:=\?\!\"%&\*;\<\>]" + fieldnames = [] + + results = [] + + def __init__(self, filename, data): + self.filename = filename + self.results = self.check_data(data) + self.write_to_csv() + + def check_regex(self, value): + match_regex = re.sub(self.match_string, "", value) + if len(match_regex) < len(value): + return set(value).difference(set(match_regex)) + + def get_value(self, entry): + return entry + + def check_data(self, data): + results = [] + for entry in data: + value = self.get_value(entry) + if match := self.check_regex(value): + results.append(self.format_results(entry, match)) + return results + + def format_results(self, data, match): + return { + "org_name": data.application.organisation.name, + "good_id": data.good.id, + "reference_code": data.application.reference_code, + "value": data.good.name, + "match": match, + } + + def write_to_csv(self): + with open(f"{self.filename}.csv", "w", newline="") as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=self.fieldnames) + writer.writeheader() + 
writer.writerows(self.results) + + +class GoodSpecialCharacterFinder(SpecialCharacterFinder): + fieldnames = ["org_name", "reference_code", "good_id", "value", "match"] + + def get_value(self, entry): + return entry.good.name + + +class PartyNameSpecialCharacterFinder(SpecialCharacterFinder): + fieldnames = ["org_name", "reference_code", "party_id", "value", "match"] + + def get_value(self, entry): + return entry.party.name + + def format_results(self, data, match): + return { + "org_name": data.application.organisation.name, + "party_id": data.party.id, + "reference_code": data.application.reference_code, + "value": data.party.name, + "match": match, + } + + +class PartyAddressSpecialCharacterFinder(SpecialCharacterFinder): + match_string = r"[^a-zA-Z0-9 .,\-\)\(\/'+:=\?\!\"%&\*;\<\>\r\n]" + fieldnames = ["org_name", "reference_code", "party_id", "value", "match"] + + def get_value(self, entry): + return entry.party.address + + def format_results(self, data, match): + return { + "org_name": data.application.organisation.name, + "party_id": data.party.id, + "reference_code": data.application.reference_code, + "value": data.party.address, + "match": match, + } + + +class Command(BaseCommand): + help = """ + Command to check special characters within LITE + + This will generate csvs for good.name, party.name and party.address which can be retrieved using: + cf shh -c "cat app/csvname.csv > csvname.csv + + to be passed forward to support so that exporters can be contacted to review the fields raised + """ + + def handle(self, *args, **options): + + name_match_string = r"^[a-zA-Z0-9 .,\-\)\(\/'+:=\?\!\"%&\*;\<\>]+$" + address_match_string = r"^[a-zA-Z0-9 .,\-\)\(\/'+:=\?\!\"%&\*;\<\>\r\n]+$" + + # get goods that don't match the string and are not finalised + goa = GoodOnApplication.objects.filter( + ~Q(good__name__iregex=name_match_string) + & ~Q(application__status__status__in=CaseStatusEnum._terminal_statuses) + ) + + # get parties that don't match the string and are 
not finalised + party_matches = PartyOnApplication.objects.filter( + Q(~Q(party__name__iregex=name_match_string) | ~Q(party__address__iregex=address_match_string)) + & ~Q(application__status__status__in=CaseStatusEnum._terminal_statuses) + ) + + GoodSpecialCharacterFinder("good_names", goa) + PartyNameSpecialCharacterFinder("party_names", party_matches) + PartyAddressSpecialCharacterFinder("party_address", party_matches) + + +# retrieve file: +# cf ssh lite-api-uat -c "cat app/good_names.csv" > good_names.csv +# cf ssh lite-api-uat -c "cat app/party_names.csv" > party_names.csv +# cf ssh lite-api-uat -c "cat app/party_address.csv" > party_address.csv From 00eb1e6e3b870ee589e76834c2b9ab428c60aa04 Mon Sep 17 00:00:00 2001 From: Tomos Williams Date: Tue, 1 Oct 2024 13:41:08 +0100 Subject: [PATCH 2/2] removes duplicate on good_id, could be useful to do this for other fields or load this into pandas to interrogate a bit better. --- .../commands/get_lite_specialcharacters.py | 24 +++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/api/support/management/commands/get_lite_specialcharacters.py b/api/support/management/commands/get_lite_specialcharacters.py index f19409961..8f13c08ba 100644 --- a/api/support/management/commands/get_lite_specialcharacters.py +++ b/api/support/management/commands/get_lite_specialcharacters.py @@ -10,6 +10,7 @@ class SpecialCharacterFinder: fieldnames = [] results = [] + unique_result = {} def __init__(self, filename, data): self.filename = filename @@ -24,12 +25,18 @@ def check_regex(self, value): def get_value(self, entry): return entry + def get_id(self, entry): + return entry + def check_data(self, data): results = [] for entry in data: - value = self.get_value(entry) - if match := self.check_regex(value): - results.append(self.format_results(entry, match)) + id = self.get_id(entry) + if not self.unique_result.get(id): + value = self.get_value(entry) + if match := self.check_regex(value): + 
results.append(self.format_results(entry, match)) + self.unique_result[id] = True return results def format_results(self, data, match): @@ -54,6 +61,9 @@ class GoodSpecialCharacterFinder(SpecialCharacterFinder): def get_value(self, entry): return entry.good.name + def get_id(self, entry): + return str(entry.good.id) + class PartyNameSpecialCharacterFinder(SpecialCharacterFinder): fieldnames = ["org_name", "reference_code", "party_id", "value", "match"] @@ -61,6 +71,9 @@ class PartyNameSpecialCharacterFinder(SpecialCharacterFinder): def get_value(self, entry): return entry.party.name + def get_id(self, entry): + return str(entry.party.id) + def format_results(self, data, match): return { "org_name": data.application.organisation.name, @@ -78,6 +91,9 @@ class PartyAddressSpecialCharacterFinder(SpecialCharacterFinder): def get_value(self, entry): return entry.party.address + def get_id(self, entry): + return str(entry.party.id) + def format_results(self, data, match): return { "org_name": data.application.organisation.name, @@ -120,7 +136,7 @@ def handle(self, *args, **options): PartyAddressSpecialCharacterFinder("party_address", party_matches) -# retrieve file: +# # retrieve file: # cf ssh lite-api-uat -c "cat app/good_names.csv" > good_names.csv # cf ssh lite-api-uat -c "cat app/party_names.csv" > party_names.csv # cf ssh lite-api-uat -c "cat app/party_address.csv" > party_address.csv