Skip to content

Commit

Permalink
Merge pull request #8 from tulip/dan.schema_helper_functions
Browse files Browse the repository at this point in the history
Add Helper Functions for Cleaning Data
  • Loading branch information
henryivesjones authored Feb 28, 2023
2 parents d3cf2ef + dc9f595 commit 689b626
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 4 deletions.
33 changes: 33 additions & 0 deletions relationalize/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,39 @@ def drop_null_columns(self) -> int:
del self.schema[column]
return len(columns_to_drop)

def drop_special_char_columns(self) -> int:
    """
    Drops columns from the schema whose name contains any character that is
    neither alphanumeric (per ``str.isalnum``) nor a literal space (" ").

    Only the space character is exempt: other whitespace such as tabs or
    newlines counts as a special character. An empty column name contains
    no special characters and is therefore kept.

    Returns the # of columns that were dropped.
    """
    # Collect the names first; deleting from a dict while iterating over it
    # raises a RuntimeError.
    columns_to_drop = [
        key
        for key in self.schema
        if any(not (c.isalnum() or c == " ") for c in key)
    ]

    for column in columns_to_drop:
        del self.schema[column]
    return len(columns_to_drop)

def drop_duplicate_columns(self) -> int:
    """
    Drops columns from the schema whose name duplicates an earlier column's
    name when compared case-insensitively (using ``str.casefold``).
    Keeps the first column it reads. Whitespace is significant, so "abc" and
    "abc " are treated as different columns.

    Returns the # of columns that were dropped.
    """
    seen_folded_keys = set()
    columns_to_drop = []
    for key in self.schema:
        # Fold once and reuse the result for both the lookup and the insert.
        folded = key.casefold()
        if folded in seen_folded_keys:
            columns_to_drop.append(key)
        else:
            seen_folded_keys.add(folded)

    # Delete after the scan; a dict cannot change size while being iterated.
    for column in columns_to_drop:
        del self.schema[column]
    return len(columns_to_drop)

def read_object(self, object: Dict):
"""
Read an object and merge into the current schema.
Expand Down
45 changes: 41 additions & 4 deletions test/schema.test.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,10 +177,10 @@ def test_drop_null_columns(self):
self.assertDictEqual({}, schema1.schema)

schema2 = Schema()
schema1.read_object(CASE_3)
schema1.read_object(CASE_4)
schema1.drop_null_columns()
self.assertDictEqual({"1": "int"}, schema1.schema)
schema2.read_object(CASE_3)
schema2.read_object(CASE_4)
schema2.drop_null_columns()
self.assertDictEqual({"1": "int"}, schema2.schema)

def test_generate_output_columns_no_choice(self):
schema1 = Schema()
Expand All @@ -196,6 +196,43 @@ def test_generate_output_columns_choice(self):
schema1.generate_output_columns(),
)

def test_drop_special_char_columns(self):
    """Check which column names drop_special_char_columns removes and keeps."""
    # Three of the five names contain characters other than letters,
    # digits or spaces and are expected to be removed.
    with_specials = Schema()
    with_specials.read_object(
        {"abc ": 1, "def@#": 1, "$$ghi": 1, "jkl": 1, "!@#mno": 1}
    )
    dropped_count = with_specials.drop_special_char_columns()
    self.assertEqual(3, dropped_count)
    self.assertEqual(with_specials.schema, {"abc ": "int", "jkl": "int"})

    # Names made up only of letters and spaces must all survive.
    without_specials = Schema()
    without_specials.read_object({"abc": 1, "def": 2, "GH I ": 3})
    dropped_count = without_specials.drop_special_char_columns()
    self.assertEqual(0, dropped_count)
    self.assertEqual(
        without_specials.schema,
        {"abc": "int", "def": "int", "GH I ": "int"},
    )

def test_drop_duplicate_columns(self):
    """Check that drop_duplicate_columns keeps the first of each case-variant name."""
    # Each case: (object to read, expected drop count, expected schema).
    cases = [
        (
            {"ABc ": 1, "DEf ": 1, "ghi": 1, "jkl": 1, "ABC": 1, "abc ": 1, "JkL": 1},
            2,
            {"ABc ": "int", "DEf ": "int", "ghi": "int", "jkl": "int", "ABC": "int"},
        ),
        (
            {"abc": 1, "ABC": 2, "ABc": 3, "abC ": 4, "D E F": 5, "DEF": 5},
            2,
            {"abc": "int", "abC ": "int", "D E F": "int", "DEF": "int"},
        ),
        (
            {"abc": 1, "def": 2, "GH I ": 3, "abC ": 4, "D E F": 5},
            0,
            {"abc": "int", "def": "int", "GH I ": "int", "abC ": "int", "D E F": "int"},
        ),
    ]
    for source_object, expected_dropped, expected_schema in cases:
        schema = Schema()
        schema.read_object(source_object)
        self.assertEqual(expected_dropped, schema.drop_duplicate_columns())
        self.assertEqual(schema.schema, expected_schema)


# Run the unittest test runner only when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()

0 comments on commit 689b626

Please sign in to comment.