def drop_special_char_columns(self) -> int:
    """
    Drops columns whose name contains a special character from the schema.

    A character is "special" when it is neither alphanumeric (per
    ``str.isalnum``) nor the literal space character. Other whitespace
    such as tabs or newlines is therefore treated as special. An empty
    column name contains no special characters and is kept.

    Returns the # of columns that were dropped.
    """
    # Collect first, delete afterwards: a dict must not change size while
    # it is being iterated.
    columns_to_drop = [
        key
        for key in self.schema
        if not all(c.isalnum() or c == " " for c in key)
    ]

    for column in columns_to_drop:
        del self.schema[column]
    return len(columns_to_drop)


def drop_duplicate_columns(self) -> int:
    """
    Drops columns from the schema whose name duplicates an earlier column
    when compared case-insensitively (via ``str.casefold``).

    The first column encountered in schema iteration order is kept; later
    case-variants of it are dropped. Whitespace is significant, so "abc"
    and "abc " are considered different columns.

    Returns the # of columns that were dropped.
    """
    seen_folded_keys = set()
    columns_to_drop = []
    for key in self.schema:
        folded = key.casefold()
        if folded in seen_folded_keys:
            columns_to_drop.append(key)
        else:
            seen_folded_keys.add(folded)

    # Deletion is deferred so the schema is not mutated during iteration.
    for column in columns_to_drop:
        del self.schema[column]
    return len(columns_to_drop)
def test_drop_special_char_columns(self):
    """Columns with non-alphanumeric, non-space characters in the name are dropped."""
    # Three of the five names contain symbols; a trailing space is allowed.
    schema1 = Schema()
    schema1.read_object({"abc ": 1, "def@#": 1, "$$ghi": 1, "jkl": 1, "!@#mno": 1})
    self.assertEqual(3, schema1.drop_special_char_columns())
    self.assertDictEqual({"abc ": "int", "jkl": "int"}, schema1.schema)

    # Names made only of letters and spaces must all survive.
    schema2 = Schema()
    schema2.read_object({"abc": 1, "def": 2, "GH I ": 3})
    self.assertEqual(0, schema2.drop_special_char_columns())
    self.assertDictEqual(
        {"abc": "int", "def": "int", "GH I ": "int"},
        schema2.schema,
    )


def test_drop_duplicate_columns(self):
    """Later case-variants of a column name are dropped; the first one read is kept."""
    # "abc " duplicates "ABc " and "JkL" duplicates "jkl"; "ABC" has no
    # trailing space, so it is a distinct name and stays.
    schema1 = Schema()
    schema1.read_object(
        {"ABc ": 1, "DEf ": 1, "ghi": 1, "jkl": 1, "ABC": 1, "abc ": 1, "JkL": 1}
    )
    self.assertEqual(2, schema1.drop_duplicate_columns())
    self.assertDictEqual(
        {"ABc ": "int", "DEf ": "int", "ghi": "int", "jkl": "int", "ABC": "int"},
        schema1.schema,
    )

    # "ABC" and "ABc" both collide with "abc"; interior or trailing spaces
    # make "abC ", "D E F" and "DEF" distinct names.
    schema2 = Schema()
    schema2.read_object(
        {"abc": 1, "ABC": 2, "ABc": 3, "abC ": 4, "D E F": 5, "DEF": 5}
    )
    self.assertEqual(2, schema2.drop_duplicate_columns())
    self.assertDictEqual(
        {"abc": "int", "abC ": "int", "D E F": "int", "DEF": "int"},
        schema2.schema,
    )

    # No two names match once case is ignored, so nothing is dropped.
    schema3 = Schema()
    schema3.read_object({"abc": 1, "def": 2, "GH I ": 3, "abC ": 4, "D E F": 5})
    self.assertEqual(0, schema3.drop_duplicate_columns())
    self.assertDictEqual(
        {"abc": "int", "def": "int", "GH I ": "int", "abC ": "int", "D E F": "int"},
        schema3.schema,
    )


if __name__ == "__main__":
    unittest.main()