From e5cda0804fbb46973238f46d7ab90f19d465a929 Mon Sep 17 00:00:00 2001 From: Michael Terry Date: Wed, 25 Oct 2023 15:24:09 -0400 Subject: [PATCH] build: bump delta-spark to 3.0 --- pyproject.toml | 2 +- tests/formats/test_deltalake.py | 29 ++++++++++++----------------- 2 files changed, 13 insertions(+), 18 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 827f44c8..02c03693 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,7 @@ requires-python = ">= 3.10" # to fix any breakages since users won't immediately see the problem). dependencies = [ "ctakesclient >= 5.1, < 6", - "delta-spark >= 2.3, < 3", + "delta-spark >= 3, < 4", "fhirclient < 5", "httpx < 1", "inscriptis < 3", diff --git a/tests/formats/test_deltalake.py b/tests/formats/test_deltalake.py index af80a87b..b9b9754e 100644 --- a/tests/formats/test_deltalake.py +++ b/tests/formats/test_deltalake.py @@ -189,16 +189,12 @@ def test_altered_field(self): ] ) self.assertTrue(self.store(self.df(a=1), schema=schema)) + + # Confirm that Delta Lake will error out when presented with an altered type, with or without a schema. self.assertFalse(self.store(self.df(b="string"), schema=schema)) - self.assert_lake_equal(self.df(a=1)) + self.assertFalse(self.store(self.df(b="string"))) - # And just confirm the mildly buggy behavior that Delta Lake will silently ignore - # altered types when we don't force a schema. This is one reason we like to force a schema! - # We don't desire or care about this behavior, but just testing it here as a sort of documentation, - # in case they ever fix that, and then we get to know about it. - # Upstream issue: https://github.com/delta-io/delta/issues/1551 - self.assertTrue(self.store(self.df(b="string"))) - self.assert_lake_equal([{"id": "a", "value": 1}, {"id": "b"}]) + self.assert_lake_equal(self.df(a=1)) def test_schema_has_names(self): """Verify that the lake's schemas has valid nested names, which may not always happen with spark""" @@ -274,15 +270,14 @@ def test_merged_schema_for_resource(self): @ddt.data( # In general, the first type used wins - (pyarrow.int64(), 2000, pyarrow.int32(), 2000, "long", 2000), - (pyarrow.int32(), 2000, pyarrow.int64(), 2000, "integer", 2000), - (pyarrow.int64(), 3000000000, pyarrow.int32(), 2000, "long", 2000), - # Interestingly, delta lake will silently down-convert for us. - # This is not an expected scenario, but we should beware this gotcha. - (pyarrow.int32(), 2000, pyarrow.int64(), 3000000000, "integer", -1294967296), + (pyarrow.int64(), 2000, pyarrow.int32(), 2001, True, "long", 2001), + (pyarrow.int32(), 2000, pyarrow.int64(), 2001, True, "integer", 2001), + (pyarrow.int64(), 3000000000, pyarrow.int32(), 2000, True, "long", 2000), + # Delta lake will refuse to store too large a value for the type + (pyarrow.int32(), 2000, pyarrow.int64(), 3000000000, False, "integer", 2000), ) @ddt.unpack - def test_column_type_merges(self, type1, val1, type2, val2, expected_type, expected_value): + def test_column_type_merges(self, type1, val1, type2, val2, expected_success, expected_type, expected_value): """Verify that if we write a slightly different, but compatible field to the delta lake, it works""" schema1 = pyarrow.schema( [ @@ -290,7 +285,7 @@ def test_column_type_merges(self, type1, val1, type2, val2, expected_type, expec pyarrow.field("int", type1), ] ) - self.store([{"id": "1", "int": val1}], schema=schema1) + self.assertTrue(self.store([{"id": "1", "int": val1}], schema=schema1)) schema2 = pyarrow.schema( [ @@ -298,7 +293,7 @@ def test_column_type_merges(self, type1, val1, type2, val2, expected_type, expec pyarrow.field("int", type2), ] ) - self.store([{"id": "1", "int": val2}], schema=schema2) + self.assertEqual(expected_success, self.store([{"id": "1", "int": val2}], schema=schema2)) table_path = os.path.join(self.output_dir, "patient") table_df = DeltaLakeFormat.spark.read.format("delta").load(table_path)