Skip to content

Commit

Permalink
Merge branch 'main' of github.com:apache/iceberg-python into fd-write
Browse files Browse the repository at this point in the history
  • Loading branch information
Fokko committed Jan 12, 2024
2 parents bbc0b35 + 4593208 commit d441af9
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 22 deletions.
28 changes: 12 additions & 16 deletions pyiceberg/io/pyarrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,10 +161,10 @@
ONE_MEGABYTE = 1024 * 1024
BUFFER_SIZE = "buffer-size"
ICEBERG_SCHEMA = b"iceberg.schema"
FIELD_ID = b"PARQUET:field_id"
DOC = "doc"
PYARROW_FIELD_ID_KEYS = [b"PARQUET:field_id", b"field_id"]
PYARROW_FIELD_DOC_KEYS = [b"PARQUET:field_doc", b"field_doc", b"doc"]
# The "PARQUET:" prefix marks this metadata key as Parquet-specific; here it holds the field ID.
PYARROW_PARQUET_FIELD_ID_KEY = b"PARQUET:field_id"
PYARROW_FIELD_DOC_KEY = b"doc"

T = TypeVar("T")

Expand Down Expand Up @@ -466,7 +466,9 @@ def field(self, field: NestedField, field_result: pa.DataType) -> pa.Field:
name=field.name,
type=field_result,
nullable=field.optional,
metadata={DOC: field.doc, FIELD_ID: str(field.field_id)} if field.doc else {FIELD_ID: str(field.field_id)},
metadata={PYARROW_FIELD_DOC_KEY: field.doc, PYARROW_PARQUET_FIELD_ID_KEY: str(field.field_id)}
if field.doc
else {PYARROW_PARQUET_FIELD_ID_KEY: str(field.field_id)},
)

def list(self, list_type: ListType, element_result: pa.DataType) -> pa.DataType:
Expand Down Expand Up @@ -730,25 +732,19 @@ def primitive(self, primitive: pa.DataType) -> Optional[T]:


def _get_field_id(field: pa.Field) -> Optional[int]:
for pyarrow_field_id_key in PYARROW_FIELD_ID_KEYS:
if field_id_str := field.metadata.get(pyarrow_field_id_key):
return int(field_id_str.decode())
return None


def _get_field_doc(field: pa.Field) -> Optional[str]:
for pyarrow_doc_key in PYARROW_FIELD_DOC_KEYS:
if doc_str := field.metadata.get(pyarrow_doc_key):
return doc_str.decode()
return None
return (
int(field_id_str.decode())
if (field.metadata and (field_id_str := field.metadata.get(PYARROW_PARQUET_FIELD_ID_KEY)))
else None
)


class _ConvertToIceberg(PyArrowSchemaVisitor[Union[IcebergType, Schema]]):
def _convert_fields(self, arrow_fields: Iterable[pa.Field], field_results: List[Optional[IcebergType]]) -> List[NestedField]:
fields = []
for i, field in enumerate(arrow_fields):
field_id = _get_field_id(field)
field_doc = _get_field_doc(field)
field_doc = doc_str.decode() if (field.metadata and (doc_str := field.metadata.get(PYARROW_FIELD_DOC_KEY))) else None
field_type = field_results[i]
if field_type is not None and field_id is not None:
fields.append(NestedField(field_id, field.name, field_type, required=not field.nullable, doc=field_doc))
Expand Down
12 changes: 6 additions & 6 deletions tests/io/test_pyarrow_visitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,9 +208,9 @@ def test_pyarrow_variable_binary_to_iceberg() -> None:

def test_pyarrow_struct_to_iceberg() -> None:
pyarrow_struct = pa.struct([
pa.field("foo", pa.string(), nullable=True, metadata={"field_id": "1", "doc": "foo doc"}),
pa.field("bar", pa.int32(), nullable=False, metadata={"field_id": "2"}),
pa.field("baz", pa.bool_(), nullable=True, metadata={"field_id": "3"}),
pa.field("foo", pa.string(), nullable=True, metadata={"PARQUET:field_id": "1", "doc": "foo doc"}),
pa.field("bar", pa.int32(), nullable=False, metadata={"PARQUET:field_id": "2"}),
pa.field("baz", pa.bool_(), nullable=True, metadata={"PARQUET:field_id": "3"}),
])
expected = StructType(
NestedField(field_id=1, name="foo", field_type=StringType(), required=False, doc="foo doc"),
Expand All @@ -221,7 +221,7 @@ def test_pyarrow_struct_to_iceberg() -> None:


def test_pyarrow_list_to_iceberg() -> None:
pyarrow_list = pa.list_(pa.field("element", pa.int32(), nullable=False, metadata={"field_id": "1"}))
pyarrow_list = pa.list_(pa.field("element", pa.int32(), nullable=False, metadata={"PARQUET:field_id": "1"}))
expected = ListType(
element_id=1,
element_type=IntegerType(),
Expand All @@ -232,8 +232,8 @@ def test_pyarrow_list_to_iceberg() -> None:

def test_pyarrow_map_to_iceberg() -> None:
pyarrow_map = pa.map_(
pa.field("key", pa.int32(), nullable=False, metadata={"field_id": "1"}),
pa.field("value", pa.string(), nullable=False, metadata={"field_id": "2"}),
pa.field("key", pa.int32(), nullable=False, metadata={"PARQUET:field_id": "1"}),
pa.field("value", pa.string(), nullable=False, metadata={"PARQUET:field_id": "2"}),
)
expected = MapType(
key_id=1,
Expand Down

0 comments on commit d441af9

Please sign in to comment.