Refactoring this to use the cleaner common import pattern (#22)
* Performing a back merge (#18)

* Fixing Null field on empty string (#17)

* Adding handling for North Carolina voter and history importing

* Changing this to build against the oldest GCC I can make python 3.11 work on

* Finishing up the build changes

* Refactoring voter parsing to history pattern

* Refactoring this to conform to new csv patterns

jamjon3 authored Jun 4, 2023
1 parent 72388e6 commit a0e5e2f
Showing 14 changed files with 463 additions and 440 deletions.
5 changes: 4 additions & 1 deletion .gitignore
@@ -157,4 +157,7 @@ cython_debug/
 # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
+#.idea/
+
+# RPM spec files
+*.spec
2 changes: 2 additions & 0 deletions .idea/VoterWarehouse.iml

Some generated files are not rendered by default.

226 changes: 80 additions & 146 deletions Import/Florida.py
@@ -2,11 +2,16 @@

 # Handles Raw data import methods for Florida data

+import csv
 import datetime
+import io
 import zipfile

-import Warehouse.FloridaSQL
+import Warehouse.FloridaSQL as StateSQL
 from Import.State import State
+from Import.FloridaCodes import __history_import_map__
+from Import.FloridaCodes import __voter_import_map__
+from Import.FloridaCodes import __suppress_keys__


 class Florida(State):
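
Import/FloridaCodes.py is among the 14 changed files but is not rendered in this view. Its shape can be inferred from how its three exports are used in this diff: .keys() supplies the csv.DictReader fieldnames, and each value is iterated as a list of alternate source-column names. A hedged sketch follows; the canonical keys match the old history_keys/voter_keys lists removed below, but the alternate header spellings are illustrative assumptions, not the real Florida extract headers:

# Hypothetical sketch of Import/FloridaCodes.py, inferred from usage in this
# diff; keys become canonical column names, values list alternate headers
# that may appear in a source row (the spellings here are assumptions).
__history_import_map__ = {
    "county_code": ["County Code"],
    "voter_id": ["Voter ID"],
    "election_date": ["Election Date"],
    "election_type": ["Election Type"],
    "history_code": ["History Code"],
}

__voter_import_map__ = {
    "county_code": ["County Code"],
    "voter_id": ["Voter ID"],
    "name_last": ["Name Last"],
    # ... one entry per column in the old voter_keys list ...
}

__suppress_keys__ = [
    "name_last",
    "name_first",
    # ... the remaining fields blanked for protected records ...
]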
@@ -17,149 +22,71 @@ class Florida(State):
     valid_import_types = {
         "voters": {
             "sql": "set_voter",
-            "parse": "parse_raw_voter_into_tuple",
-            "batch_size": 200000
+            "parse": "parse_voter_into_tuple",
+            "batch_size": 200000,
+            "fields": __voter_import_map__.keys()
         },
         "histories": {
             "sql": "set_history",
-            "parse": "parse_raw_history_into_tuple",
-            "batch_size": 200000
+            "parse": "parse_history_into_tuple",
+            "batch_size": 200000,
+            "fields": __history_import_map__.keys()
         }
     }
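
valid_import_types now works as a small dispatch table: import_source (in the last hunk below) resolves the parse method by name, builds the prepared SQL through the new StateSQL alias, and hands the "fields" entry to csv.DictReader. Roughly, for the "voters" type:

# Paraphrase of how import_source consumes this dispatch table (see the
# import_source hunk below); "voters" may be swapped for "histories".
cfg = Florida.valid_import_types["voters"]
parse = getattr(Florida, cfg["parse"])    # Florida.parse_voter_into_tuple, now a @staticmethod
sql = getattr(StateSQL, cfg["sql"])()     # Warehouse.FloridaSQL.set_voter() -> prepared statement
fieldnames = cfg["fields"]                # __voter_import_map__.keys(), fed to csv.DictReader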

-    history_keys = [
-        "county_code",
-        "voter_id",
-        "election_date",
-        "election_type",
-        "history_code"
-    ]
-
-    voter_keys = [
-        "county_code",
-        "voter_id",
-        "name_last",
-        "name_suffix",
-        "name_first",
-        "name_middle",
-        "suppress_address",
-        "residence_address_line_1",
-        "residence_address_line_2",
-        "residence_city",
-        "residence_state",
-        "residence_zipcode",
-        "mailing_address_line_1",
-        "mailing_address_line_2",
-        "mailing_address_line_3",
-        "mailing_city",
-        "mailing_state",
-        "mailing_zipcode",
-        "mailing_country",
-        "gender",
-        "race",
-        "birth_date",
-        "registration_date",
-        "party_affiliation",
-        "precinct",
-        "precinct_group",
-        "precinct_split",
-        "precinct_suffix",
-        "voter_status",
-        "congressional_district",
-        "house_district",
-        "senate_district",
-        "county_commission_district",
-        "school_board_district",
-        "daytime_area_code",
-        "daytime_phone_number",
-        "daytime_phone_extension",
-        "email_address"
-    ]
-
-    suppress_keys = [
-        "name_last",
-        "name_suffix",
-        "name_first",
-        "name_middle",
-        "residence_address_line_1",
-        "residence_address_line_2",
-        "residence_city",
-        "residence_state",
-        "residence_zipcode",
-        "mailing_address_line_1",
-        "mailing_address_line_2",
-        "mailing_address_line_3",
-        "mailing_city",
-        "mailing_state",
-        "mailing_zipcode",
-        "mailing_country",
-        "precinct",
-        "precinct_group",
-        "precinct_split",
-        "precinct_suffix",
-        "congressional_district",
-        "house_district",
-        "senate_district",
-        "county_commission_district",
-        "school_board_district",
-        "daytime_area_code",
-        "daytime_phone_number",
-        "daytime_phone_extension",
-        "email_address"
-    ]
-
-    def parse_raw_history_into_tuple(self, history_raw: bytes, export_date: str) -> tuple[str | None, ...]:
+    @staticmethod
+    def parse_history_into_tuple(history: dict, export_date: str) -> tuple[str | None, ...]:
         """
         Build a temporary dictionary from raw voter binary string
-        :param bytes history_raw: A byte string containing a raw row of history data
+        :param dict history: A dictionary containing a raw row of history data
         :param str export_date: Export date string in YYYY-MM-DD format
         :return: A tuple of SQL ready prepared parameters
         :rtype: tuple[str | None, ...]
         """
-        row: dict[str, bytes | str | None] = dict(
-            zip(
-                self.history_keys,
-                history_raw.strip().split(b"\t")
-            )
-        )
-        # Ensure all values are converted from binary to utf-8 strings and fill in missing keys
-        for k in self.history_keys:
-            if k not in row:
-                row[k] = ""
+        row = {}
+        for k, v in __history_import_map__.items():
+            if k in history:
+                # handle matching keys
+                row[k] = history[k].strip()
             else:
-                row[k] = row[k].decode('utf-8')
+                # handle non-matching keys
+                row[k] = ''
+                for match in v:
+                    if match in history:
+                        row[k] = history[match].strip()
+                        break
         # Appending the export date to the dictionary
         row["export_date"] = export_date
         # Converting dates to the right format and fixing any dates that have been suppressed or invalid to None
         for k in ["election_date"]:
             if len(row[k]) < 10:
                 row[k] = None
             else:
                 row[k] = datetime.datetime.strptime(row[k], "%m/%d/%Y").strftime('%Y-%m-%d')
         return tuple(row.values())
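
Because the parsers are now static methods over plain dicts, they can be smoke-tested without a zip file or a database; parse_voter_into_tuple below follows the identical pattern over __voter_import_map__. A minimal sketch, assuming __history_import_map__ keeps the old history_keys order (Python dicts preserve insertion order, so tuple(row.values()) is stable):

# Hypothetical smoke test for the dict-based history parser; the field
# names come from the old history_keys list, the values are made up.
row = {
    "county_code": "DAD",
    "voter_id": "123456789",
    "election_date": "11/08/2022",
    "election_type": "GEN",
    "history_code": "A",
}
print(Florida.parse_history_into_tuple(row, "2023-06-04"))
# Expected: ('DAD', '123456789', '2022-11-08', 'GEN', 'A', '2023-06-04')
# Note the MM/DD/YYYY -> ISO conversion and the appended export_date.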

-    def parse_raw_voter_into_tuple(self, voter_raw: bytes, export_date: str) -> tuple[str | None, ...]:
+    @staticmethod
+    def parse_voter_into_tuple(voter: dict[str, str | None], export_date: str) -> tuple[str | None, ...]:
         """
         Build a temporary dictionary from raw voter binary string
-        :param bytes voter_raw: A byte string containing a raw row of voter data
+        :param dict[str, str | None] voter: A byte string containing a raw row of voter data
         :param str export_date: Export date string in YYYY-MM-DD format
         :return: A tuple of SQL ready prepared parameters
         :rtype: tuple[str | None, ...]
         """
-        row: dict[str, bytes | str | None] = dict(
-            zip(
-                self.voter_keys,
-                voter_raw.strip().split(b"\t")
-            )
-        )
-        # Ensure all values are converted from binary to utf-8 strings and fill in missing keys
-        for k in self.voter_keys:
-            if k not in row:
-                row[k] = ""
+        row = {}
+        for k, v in __voter_import_map__.items():
+            if k in voter:
+                # handle matching keys
+                row[k] = voter[k].strip()
             else:
-                row[k] = row[k].decode('utf-8')
+                # handle non-matching keys
+                row[k] = ''
+                for match in v:
+                    if match in voter:
+                        row[k] = voter[match].strip()
+                        break
         # Appending the export date to the dictionary
         row["export_date"] = export_date
         # Converting dates to the right format and fixing any dates that have been suppressed or invalid to None
@@ -169,7 +96,7 @@ def parse_raw_voter_into_tuple
             else:
                 row[k] = datetime.datetime.strptime(row[k], "%m/%d/%Y").strftime('%Y-%m-%d')
         # Blanking out any suppressed fields with * to empty string
-        for k in self.suppress_keys:
+        for k in __suppress_keys__:
             if row[k] == "*":
                 row[k] = ""
         # Ensuring email addresses are in lower case
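
Behavior here is unchanged: Florida marks protected fields with a literal "*", which the importer blanks before insert. A toy illustration, using two fields that do appear in the old suppress_keys list above:

# Toy check of the suppression rule; field choice is from the old
# suppress_keys list, values are made up.
row = {"name_last": "*", "email_address": "Voter@Example.com"}
for k in ["name_last", "email_address"]:
    if row[k] == "*":
        row[k] = ""
print(row)  # {'name_last': '', 'email_address': 'Voter@Example.com'}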
@@ -187,46 +114,53 @@ def import_source(self, file: str, t: str) -> None:
         :return: None
         """
         self.db.init_schema()
-        try:
-            with zipfile.ZipFile(file, mode="r") as archive:
-                for info in archive.infolist():
-                    print(f"Filename: {info.filename}")
-                    print(f"Modified: {datetime.datetime(*info.date_time)}")
-                    print(f"Normal size: {info.file_size} bytes")
-                    print(f"Compressed size: {info.compress_size} bytes")
-                    print("-" * 20)
-                    data = []
-                    records_imported = 0
-                    for line in archive.read(info.filename).split(b"\n"):
-                        if len(line.strip()) >= 3:
-                            if t in self.valid_import_types.keys():
+        if t in self.valid_import_types.keys():
+            try:
+                with zipfile.ZipFile(file, mode="r") as archive:
+                    for info in archive.infolist():
+                        print(f"Filename: {info.filename}")
+                        print(f"Modified: {datetime.datetime(*info.date_time)}")
+                        print(f"Normal size: {info.file_size} bytes")
+                        print(f"Compressed size: {info.compress_size} bytes")
+                        print("-" * 20)
+                        data = []
+                        records_imported = 0
+                        with archive.open(info.filename, "r") as f:
+                            reader = csv.DictReader(
+                                io.TextIOWrapper(
+                                    f,
+                                    newline='\r\n',
+                                    encoding='utf-8',
+                                    errors='ignore'
+                                ),
+                                delimiter="\t",
+                                fieldnames=self.valid_import_types[t]["fields"]
+                            )
+                            for row in reader:
                                 if len(data) < self.db.batch_limits[t]:
                                     data.append(getattr(self, self.valid_import_types[t]["parse"])(
-                                        line,
+                                        row,
                                         datetime.datetime(*info.date_time).strftime("%Y-%m-%d")
                                     ))
                                 else:
                                     print(f"Importing batch of {len(data)} records from {info.filename}..")
                                     self.db.executemany_prepared_sql(
-                                        getattr(Warehouse.FloridaSQL, self.valid_import_types[t]["sql"])(),
+                                        getattr(StateSQL, self.valid_import_types[t]["sql"])(),
                                         data
                                     )
                                     records_imported += len(data)
                                     data = []
-                            else:
-                                raise ValueError(f"Usage: Type 't' {t} is not valid")
-                    if len(data) > 0:
-                        if t in self.valid_import_types.keys():
-                            print(f"Importing batch of {len(data)} records from {info.filename}..")
-                            self.db.executemany_prepared_sql(
-                                getattr(Warehouse.FloridaSQL, self.valid_import_types[t]["sql"])(),
-                                data
-                            )
-                            records_imported += len(data)
-                        else:
-                            raise ValueError(f"Usage: Type 't' {t} is not valid")
-                    print(f"{records_imported} total records imported")
-                    print("-" * 20)
-        except Exception as error:
-            print('Caught this error: ' + repr(error))
-            raise
+                        if len(data) > 0:
+                            print(f"Importing batch of {len(data)} records from {info.filename}..")
+                            self.db.executemany_prepared_sql(
+                                getattr(StateSQL, self.valid_import_types[t]["sql"])(),
+                                data
+                            )
+                            records_imported += len(data)
+                        print(f"{records_imported} total records imported")
+                        print("-" * 20)
+            except Exception as error:
+                print('Caught this error: ' + repr(error))
+                raise
+        else:
+            raise ValueError(f"Usage: Type 't' {t} is not valid")
