From d9efdbb90fcf564b3d3f460ebdce9d1966106fcd Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Fri, 5 Jan 2024 14:02:41 +0100 Subject: [PATCH] Arrow: Use case instead of wrapping a map/list Wrapping the list seems to introduce an odd behavior where `null` values are converted to an empty list `[]`. Resolves #251 --- dev/provision.py | 20 ++++++++++++++++++++ pyiceberg/io/pyarrow.py | 16 ++-------------- tests/test_integration.py | 14 +++++++++++++- 3 files changed, 35 insertions(+), 15 deletions(-) diff --git a/dev/provision.py b/dev/provision.py index 9917cd3f20..793ae70880 100644 --- a/dev/provision.py +++ b/dev/provision.py @@ -339,3 +339,23 @@ ) spark.sql("INSERT INTO default.test_table_add_column VALUES ('2', '2')") + +spark.sql( + """ +CREATE TABLE default.test_table_empty_list_and_map ( + col_list array, + col_map map +) +USING iceberg +TBLPROPERTIES ( + 'format-version'='1' +); +""" +) + +spark.sql( + """ +INSERT INTO default.test_table_empty_list_and_map +VALUES (null, null) +""" +) diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index a537cf7a30..21c7b7e58f 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -1052,24 +1052,12 @@ def field(self, field: NestedField, _: Optional[pa.Array], field_array: Optional return field_array def list(self, list_type: ListType, list_array: Optional[pa.Array], value_array: Optional[pa.Array]) -> Optional[pa.Array]: - return ( - pa.ListArray.from_arrays(list_array.offsets, self.cast_if_needed(list_type.element_field, value_array)) - if isinstance(list_array, pa.ListArray) - else None - ) + return list_array.cast(schema_to_pyarrow(list_type)) if isinstance(list_array, pa.ListArray) else None def map( self, map_type: MapType, map_array: Optional[pa.Array], key_result: Optional[pa.Array], value_result: Optional[pa.Array] ) -> Optional[pa.Array]: - return ( - pa.MapArray.from_arrays( - map_array.offsets, - self.cast_if_needed(map_type.key_field, key_result), - self.cast_if_needed(map_type.value_field, value_result), - ) - if isinstance(map_array, pa.MapArray) - else None - ) + return map_array.cast(schema_to_pyarrow(map_type)) if isinstance(map_array, pa.MapArray) else None def primitive(self, _: PrimitiveType, array: Optional[pa.Array]) -> Optional[pa.Array]: return array diff --git a/tests/test_integration.py b/tests/test_integration.py index 2a173be3b3..602cb0263d 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -93,6 +93,11 @@ def table_test_table_sanitized_character(catalog: Catalog) -> Table: return catalog.load_table("default.test_table_sanitized_character") +@pytest.fixture() +def table_test_empty_list_and_map(catalog: Catalog) -> Table: + return catalog.load_table("default.test_table_empty_list_and_map") + + TABLE_NAME = ("default", "t1") @@ -417,8 +422,15 @@ def test_upgrade_table_version(table_test_table_version: Table) -> None: @pytest.mark.integration -def test_reproduce_issue(table_test_table_sanitized_character: Table) -> None: +def test_sanitize_column_names(table_test_table_sanitized_character: Table) -> None: arrow_table = table_test_table_sanitized_character.scan().to_arrow() assert len(arrow_table.schema.names), 1 assert len(table_test_table_sanitized_character.schema().fields), 1 assert arrow_table.schema.names[0] == table_test_table_sanitized_character.schema().fields[0].name + + +@pytest.mark.integration +def test_null_list_and_map(table_test_empty_list_and_map: Table) -> None: + arrow_table = table_test_empty_list_and_map.scan().to_arrow() + assert arrow_table["col_list"].to_pylist() == [None] + assert arrow_table["col_map"].to_pylist() == [None]