From a576fc9ae7619d329f4d9bcd667231fc1be94a57 Mon Sep 17 00:00:00 2001 From: Kibum Park <38655427+castedice@users.noreply.github.com> Date: Sun, 11 Feb 2024 00:57:43 +0900 Subject: [PATCH] Arrow: Support Large Binary when using `to_arrow` (#409) * Arrow: Support Large Binary * Merge with binary --------- Co-authored-by: Fokko Driesprong --- pyiceberg/io/pyarrow.py | 4 ++-- tests/integration/test_writes.py | 2 +- tests/io/test_pyarrow.py | 2 +- tests/io/test_pyarrow_visitor.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index 86571449fb..57f09ba172 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -533,7 +533,7 @@ def visit_uuid(self, _: UUIDType) -> pa.DataType: return pa.binary(16) def visit_binary(self, _: BinaryType) -> pa.DataType: - return pa.binary() + return pa.large_binary() def _convert_scalar(value: Any, iceberg_type: IcebergType) -> pa.scalar: @@ -882,7 +882,7 @@ def primitive(self, primitive: pa.DataType) -> PrimitiveType: return TimestamptzType() elif primitive.tz is None: return TimestampType() - elif pa.types.is_binary(primitive): + elif pa.types.is_binary(primitive) or pa.types.is_large_binary(primitive): return BinaryType() elif pa.types.is_fixed_size_binary(primitive): primitive = cast(pa.FixedSizeBinaryType, primitive) diff --git a/tests/integration/test_writes.py b/tests/integration/test_writes.py index c08916bc68..58ab830319 100644 --- a/tests/integration/test_writes.py +++ b/tests/integration/test_writes.py @@ -140,7 +140,7 @@ def pa_schema() -> pa.Schema: # ("time", pa.time64("us")), # Not natively supported by Arrow # ("uuid", pa.fixed(16)), - ("binary", pa.binary()), + ("binary", pa.large_binary()), ("fixed", pa.binary(16)), ]) diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py index 745de1a3d3..a3dd56db7f 100644 --- a/tests/io/test_pyarrow.py +++ b/tests/io/test_pyarrow.py @@ -467,7 +467,7 @@ def test_string_type_to_pyarrow() -> None: def test_binary_type_to_pyarrow() -> None: iceberg_type = BinaryType() - assert visit(iceberg_type, _ConvertToArrowSchema()) == pa.binary() + assert visit(iceberg_type, _ConvertToArrowSchema()) == pa.large_binary() def test_struct_type_to_pyarrow(table_schema_simple: Schema) -> None: diff --git a/tests/io/test_pyarrow_visitor.py b/tests/io/test_pyarrow_visitor.py index c6ba18c7b0..7d35cae424 100644 --- a/tests/io/test_pyarrow_visitor.py +++ b/tests/io/test_pyarrow_visitor.py @@ -215,7 +215,7 @@ def test_pyarrow_string_to_iceberg() -> None: def test_pyarrow_variable_binary_to_iceberg() -> None: - pyarrow_type = pa.binary() + pyarrow_type = pa.large_binary() converted_iceberg_type = visit_pyarrow(pyarrow_type, _ConvertToIceberg()) assert converted_iceberg_type == BinaryType() assert visit(converted_iceberg_type, _ConvertToArrowSchema()) == pyarrow_type