Skip to content

Commit

Permalink
Arrow: Use case instead of wrapping a map/list
Browse files Browse the repository at this point in the history
Wrapping the list seems to introduce an odd behavior where
`null` values are converted to an empty list `[]`.

Resolves #251
  • Loading branch information
Fokko committed Jan 5, 2024
1 parent dba1ef8 commit d9efdbb
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 15 deletions.
20 changes: 20 additions & 0 deletions dev/provision.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,3 +339,23 @@
)

spark.sql("INSERT INTO default.test_table_add_column VALUES ('2', '2')")

spark.sql(
"""
CREATE TABLE default.test_table_empty_list_and_map (
col_list array<int>,
col_map map<int, int>
)
USING iceberg
TBLPROPERTIES (
'format-version'='1'
);
"""
)

spark.sql(
"""
INSERT INTO default.test_table_empty_list_and_map
VALUES (null, null)
"""
)
16 changes: 2 additions & 14 deletions pyiceberg/io/pyarrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -1052,24 +1052,12 @@ def field(self, field: NestedField, _: Optional[pa.Array], field_array: Optional
return field_array

def list(self, list_type: ListType, list_array: Optional[pa.Array], value_array: Optional[pa.Array]) -> Optional[pa.Array]:
return (
pa.ListArray.from_arrays(list_array.offsets, self.cast_if_needed(list_type.element_field, value_array))
if isinstance(list_array, pa.ListArray)
else None
)
return list_array.cast(schema_to_pyarrow(list_type)) if isinstance(list_array, pa.ListArray) else None

def map(
self, map_type: MapType, map_array: Optional[pa.Array], key_result: Optional[pa.Array], value_result: Optional[pa.Array]
) -> Optional[pa.Array]:
return (
pa.MapArray.from_arrays(
map_array.offsets,
self.cast_if_needed(map_type.key_field, key_result),
self.cast_if_needed(map_type.value_field, value_result),
)
if isinstance(map_array, pa.MapArray)
else None
)
return map_array.cast(schema_to_pyarrow(map_type)) if isinstance(map_array, pa.MapArray) else None

def primitive(self, _: PrimitiveType, array: Optional[pa.Array]) -> Optional[pa.Array]:
return array
Expand Down
14 changes: 13 additions & 1 deletion tests/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,11 @@ def table_test_table_sanitized_character(catalog: Catalog) -> Table:
return catalog.load_table("default.test_table_sanitized_character")


@pytest.fixture()
def table_test_empty_list_and_map(catalog: Catalog) -> Table:
return catalog.load_table("default.test_table_empty_list_and_map")


TABLE_NAME = ("default", "t1")


Expand Down Expand Up @@ -417,8 +422,15 @@ def test_upgrade_table_version(table_test_table_version: Table) -> None:


@pytest.mark.integration
def test_reproduce_issue(table_test_table_sanitized_character: Table) -> None:
def test_sanitize_column_names(table_test_table_sanitized_character: Table) -> None:
arrow_table = table_test_table_sanitized_character.scan().to_arrow()
assert len(arrow_table.schema.names), 1
assert len(table_test_table_sanitized_character.schema().fields), 1
assert arrow_table.schema.names[0] == table_test_table_sanitized_character.schema().fields[0].name


@pytest.mark.integration
def test_null_list_and_map(table_test_empty_list_and_map: Table) -> None:
arrow_table = table_test_empty_list_and_map.scan().to_arrow()
assert arrow_table["col_list"].to_pylist() == [None]
assert arrow_table["col_map"].to_pylist() == [None]

0 comments on commit d9efdbb

Please sign in to comment.