Refactoring this to use the cleaner common import pattern (#22)
* Performing a back merge (#18)

* Fixing Null field on empty string (#17)

* Adding handling for North Carolina voter and history importing

* Changing this to build against the oldest GCC I can make python 3.11 work on

* Finishing up the build changes

* Refactoring voter parsing to history pattern

* Refactoring this to conform to new csv patterns

jamjon3 authored Jun 4, 2023
1 parent 72388e6 commit a0e5e2f
Showing 14 changed files with 463 additions and 440 deletions.
5 changes: 4 additions & 1 deletion .gitignore
@@ -157,4 +157,7 @@ cython_debug/
 # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
+#.idea/
+
+# RPM spec files
+*.spec
2 changes: 2 additions & 0 deletions .idea/VoterWarehouse.iml

Some generated files are not rendered by default.

226 changes: 80 additions & 146 deletions Import/Florida.py
@@ -2,11 +2,16 @@

 # Handles Raw data import methods for Florida data

+import csv
 import datetime
+import io
 import zipfile

-import Warehouse.FloridaSQL
+import Warehouse.FloridaSQL as StateSQL
 from Import.State import State
+from Import.FloridaCodes import __history_import_map__
+from Import.FloridaCodes import __voter_import_map__
+from Import.FloridaCodes import __suppress_keys__


 class Florida(State):
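
Import/FloridaCodes.py is among the 14 changed files but is not rendered in this view. Its shape can be inferred from how its three exports are used in this diff: .keys() supplies the csv.DictReader fieldnames, and each value is iterated as a list of alternate source-column names. A hedged sketch follows; the canonical keys match the old history_keys/voter_keys lists removed below, but the alternate header spellings are illustrative assumptions, not the real Florida extract headers:

# Hypothetical sketch of Import/FloridaCodes.py, inferred from usage in this
# diff; keys become canonical column names, values list alternate headers
# that may appear in a source row (the spellings here are assumptions).
__history_import_map__ = {
    "county_code": ["County Code"],
    "voter_id": ["Voter ID"],
    "election_date": ["Election Date"],
    "election_type": ["Election Type"],
    "history_code": ["History Code"],
}

__voter_import_map__ = {
    "county_code": ["County Code"],
    "voter_id": ["Voter ID"],
    "name_last": ["Name Last"],
    # ... one entry per column in the old voter_keys list ...
}

__suppress_keys__ = [
    "name_last",
    "name_first",
    # ... the remaining fields blanked for protected records ...
]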
@@ -17,149 +22,71 @@ class Florida(State):
     valid_import_types = {
         "voters": {
             "sql": "set_voter",
-            "parse": "parse_raw_voter_into_tuple",
-            "batch_size": 200000
+            "parse": "parse_voter_into_tuple",
+            "batch_size": 200000,
+            "fields": __voter_import_map__.keys()
         },
         "histories": {
             "sql": "set_history",
-            "parse": "parse_raw_history_into_tuple",
-            "batch_size": 200000
+            "parse": "parse_history_into_tuple",
+            "batch_size": 200000,
+            "fields": __history_import_map__.keys()
         }
     }
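
valid_import_types now works as a small dispatch table: import_source (in the last hunk below) resolves the parse method by name, builds the prepared SQL through the new StateSQL alias, and hands the "fields" entry to csv.DictReader. Roughly, for the "voters" type:

# Paraphrase of how import_source consumes this dispatch table (see the
# import_source hunk below); "voters" may be swapped for "histories".
cfg = Florida.valid_import_types["voters"]
parse = getattr(Florida, cfg["parse"])    # Florida.parse_voter_into_tuple, now a @staticmethod
sql = getattr(StateSQL, cfg["sql"])()     # Warehouse.FloridaSQL.set_voter() -> prepared statement
fieldnames = cfg["fields"]                # __voter_import_map__.keys(), fed to csv.DictReader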

-    history_keys = [
-        "county_code",
-        "voter_id",
-        "election_date",
-        "election_type",
-        "history_code"
-    ]
-
-    voter_keys = [
-        "county_code",
-        "voter_id",
-        "name_last",
-        "name_suffix",
-        "name_first",
-        "name_middle",
-        "suppress_address",
-        "residence_address_line_1",
-        "residence_address_line_2",
-        "residence_city",
-        "residence_state",
-        "residence_zipcode",
-        "mailing_address_line_1",
-        "mailing_address_line_2",
-        "mailing_address_line_3",
-        "mailing_city",
-        "mailing_state",
-        "mailing_zipcode",
-        "mailing_country",
-        "gender",
-        "race",
-        "birth_date",
-        "registration_date",
-        "party_affiliation",
-        "precinct",
-        "precinct_group",
-        "precinct_split",
-        "precinct_suffix",
-        "voter_status",
-        "congressional_district",
-        "house_district",
-        "senate_district",
-        "county_commission_district",
-        "school_board_district",
-        "daytime_area_code",
-        "daytime_phone_number",
-        "daytime_phone_extension",
-        "email_address"
-    ]
-
-    suppress_keys = [
-        "name_last",
-        "name_suffix",
-        "name_first",
-        "name_middle",
-        "residence_address_line_1",
-        "residence_address_line_2",
-        "residence_city",
-        "residence_state",
-        "residence_zipcode",
-        "mailing_address_line_1",
-        "mailing_address_line_2",
-        "mailing_address_line_3",
-        "mailing_city",
-        "mailing_state",
-        "mailing_zipcode",
-        "mailing_country",
-        "precinct",
-        "precinct_group",
-        "precinct_split",
-        "precinct_suffix",
-        "congressional_district",
-        "house_district",
-        "senate_district",
-        "county_commission_district",
-        "school_board_district",
-        "daytime_area_code",
-        "daytime_phone_number",
-        "daytime_phone_extension",
-        "email_address"
-    ]
-
-    def parse_raw_history_into_tuple(self, history_raw: bytes, export_date: str) -> tuple[str | None, ...]:
+    @staticmethod
+    def parse_history_into_tuple(history: dict, export_date: str) -> tuple[str | None, ...]:
         """
         Build a temporary dictionary from raw voter binary string
-        :param bytes history_raw: A byte string containing a raw row of history data
+        :param dict history: A dictionary containing a raw row of history data
         :param str export_date: Export date string in YYYY-MM-DD format
         :return: A tuple of SQL ready prepared parameters
         :rtype: tuple[str | None, ...]
         """
-        row: dict[str, bytes | str | None] = dict(
-            zip(
-                self.history_keys,
-                history_raw.strip().split(b"\t")
-            )
-        )
-        # Ensure all values are converted from binary to utf-8 strings and fill in missing keys
-        for k in self.history_keys:
-            if k not in row:
-                row[k] = ""
+        row = {}
+        for k, v in __history_import_map__.items():
+            if k in history:
+                # handle matching keys
+                row[k] = history[k].strip()
             else:
-                row[k] = row[k].decode('utf-8')
+                # handle non-matching keys
+                row[k] = ''
+                for match in v:
+                    if match in history:
+                        row[k] = history[match].strip()
+                        break
         # Appending the export date to the dictionary
         row["export_date"] = export_date
         # Converting dates to the right format and fixing any dates that have been suppressed or invalid to None
         for k in ["election_date"]:
             if len(row[k]) < 10:
                 row[k] = None
             else:
                 row[k] = datetime.datetime.strptime(row[k], "%m/%d/%Y").strftime('%Y-%m-%d')
         return tuple(row.values())
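
Because the parsers are now static methods over plain dicts, they can be smoke-tested without a zip file or a database; parse_voter_into_tuple below follows the identical pattern over __voter_import_map__. A minimal sketch, assuming __history_import_map__ keeps the old history_keys order (Python dicts preserve insertion order, so tuple(row.values()) is stable):

# Hypothetical smoke test for the dict-based history parser; the field
# names come from the old history_keys list, the values are made up.
row = {
    "county_code": "DAD",
    "voter_id": "123456789",
    "election_date": "11/08/2022",
    "election_type": "GEN",
    "history_code": "A",
}
print(Florida.parse_history_into_tuple(row, "2023-06-04"))
# Expected: ('DAD', '123456789', '2022-11-08', 'GEN', 'A', '2023-06-04')
# Note the MM/DD/YYYY -> ISO conversion and the appended export_date.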

-    def parse_raw_voter_into_tuple(self, voter_raw: bytes, export_date: str) -> tuple[str | None, ...]:
+    @staticmethod
+    def parse_voter_into_tuple(voter: dict[str, str | None], export_date: str) -> tuple[str | None, ...]:
         """
         Build a temporary dictionary from raw voter binary string
-        :param bytes voter_raw: A byte string containing a raw row of voter data
+        :param dict[str, str | None] voter: A byte string containing a raw row of voter data
         :param str export_date: Export date string in YYYY-MM-DD format
         :return: A tuple of SQL ready prepared parameters
         :rtype: tuple[str | None, ...]
         """
-        row: dict[str, bytes | str | None] = dict(
-            zip(
-                self.voter_keys,
-                voter_raw.strip().split(b"\t")
-            )
-        )
-        # Ensure all values are converted from binary to utf-8 strings and fill in missing keys
-        for k in self.voter_keys:
-            if k not in row:
-                row[k] = ""
+        row = {}
+        for k, v in __voter_import_map__.items():
+            if k in voter:
+                # handle matching keys
+                row[k] = voter[k].strip()
             else:
-                row[k] = row[k].decode('utf-8')
+                # handle non-matching keys
+                row[k] = ''
+                for match in v:
+                    if match in voter:
+                        row[k] = voter[match].strip()
+                        break
         # Appending the export date to the dictionary
         row["export_date"] = export_date
         # Converting dates to the right format and fixing any dates that have been suppressed or invalid to None
@@ -169,7 +96,7 @@ def parse_raw_voter_into_tuple
             else:
                 row[k] = datetime.datetime.strptime(row[k], "%m/%d/%Y").strftime('%Y-%m-%d')
         # Blanking out any suppressed fields with * to empty string
-        for k in self.suppress_keys:
+        for k in __suppress_keys__:
             if row[k] == "*":
                 row[k] = ""
         # Ensuring email addresses are in lower case
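
Behavior here is unchanged: Florida marks protected fields with a literal "*", which the importer blanks before insert. A toy illustration, using two fields that do appear in the old suppress_keys list above:

# Toy check of the suppression rule; field choice is from the old
# suppress_keys list, values are made up.
row = {"name_last": "*", "email_address": "Voter@Example.com"}
for k in ["name_last", "email_address"]:
    if row[k] == "*":
        row[k] = ""
print(row)  # {'name_last': '', 'email_address': 'Voter@Example.com'}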
@@ -187,46 +114,53 @@ def import_source(self, file: str, t: str) -> None:
         :return: None
         """
         self.db.init_schema()
-        try:
-            with zipfile.ZipFile(file, mode="r") as archive:
-                for info in archive.infolist():
-                    print(f"Filename: {info.filename}")
-                    print(f"Modified: {datetime.datetime(*info.date_time)}")
-                    print(f"Normal size: {info.file_size} bytes")
-                    print(f"Compressed size: {info.compress_size} bytes")
-                    print("-" * 20)
-                    data = []
-                    records_imported = 0
-                    for line in archive.read(info.filename).split(b"\n"):
-                        if len(line.strip()) >= 3:
-                            if t in self.valid_import_types.keys():
+        if t in self.valid_import_types.keys():
+            try:
+                with zipfile.ZipFile(file, mode="r") as archive:
+                    for info in archive.infolist():
+                        print(f"Filename: {info.filename}")
+                        print(f"Modified: {datetime.datetime(*info.date_time)}")
+                        print(f"Normal size: {info.file_size} bytes")
+                        print(f"Compressed size: {info.compress_size} bytes")
+                        print("-" * 20)
+                        data = []
+                        records_imported = 0
+                        with archive.open(info.filename, "r") as f:
+                            reader = csv.DictReader(
+                                io.TextIOWrapper(
+                                    f,
+                                    newline='\r\n',
+                                    encoding='utf-8',
+                                    errors='ignore'
+                                ),
+                                delimiter="\t",
+                                fieldnames=self.valid_import_types[t]["fields"]
+                            )
+                            for row in reader:
                                 if len(data) < self.db.batch_limits[t]:
                                     data.append(getattr(self, self.valid_import_types[t]["parse"])(
-                                        line,
+                                        row,
                                         datetime.datetime(*info.date_time).strftime("%Y-%m-%d")
                                     ))
                                 else:
                                     print(f"Importing batch of {len(data)} records from {info.filename}..")
                                     self.db.executemany_prepared_sql(
-                                        getattr(Warehouse.FloridaSQL, self.valid_import_types[t]["sql"])(),
+                                        getattr(StateSQL, self.valid_import_types[t]["sql"])(),
                                         data
                                     )
                                     records_imported += len(data)
                                     data = []
-                            else:
-                                raise ValueError(f"Usage: Type 't' {t} is not valid")
-                    if len(data) > 0:
-                        if t in self.valid_import_types.keys():
-                            print(f"Importing batch of {len(data)} records from {info.filename}..")
-                            self.db.executemany_prepared_sql(
-                                getattr(Warehouse.FloridaSQL, self.valid_import_types[t]["sql"])(),
-                                data
-                            )
-                            records_imported += len(data)
-                        else:
-                            raise ValueError(f"Usage: Type 't' {t} is not valid")
-                    print(f"{records_imported} total records imported")
-                    print("-" * 20)
-        except Exception as error:
-            print('Caught this error: ' + repr(error))
-            raise
+                        if len(data) > 0:
+                            print(f"Importing batch of {len(data)} records from {info.filename}..")
+                            self.db.executemany_prepared_sql(
+                                getattr(StateSQL, self.valid_import_types[t]["sql"])(),
+                                data
+                            )
+                            records_imported += len(data)
+                        print(f"{records_imported} total records imported")
+                        print("-" * 20)
+            except Exception as error:
+                print('Caught this error: ' + repr(error))
+                raise
+        else:
+            raise ValueError(f"Usage: Type 't' {t} is not valid")
