Skip to content

Commit

Permalink
feat(python): Add StringView and BinaryView IO to Python bindings (#637)
Browse files Browse the repository at this point in the history
This PR implements StringView and BinaryView support in the Python
bindings. It is a thin wrapper around the C functions that were added,
although we should perhaps move some of the buffer info calculation into
the C library, since I had to work around the same gap in the R bindings
as well.

```python
import nanoarrow as na

array = na.Array(["abc", "def", None, "longer than 12 bytes"], na.string_view())
array
#> nanoarrow.Array<string_view>[4]
#> 'abc'
#> 'def'
#> None
#> 'longer than 12 bytes'
array.buffers
#> (nanoarrow.c_buffer.CBufferView(bool[1 b] 11010000),
#>  nanoarrow.c_buffer.CBufferView(string_view[64 b] b'\x03\x00\x00\x00abc\x00\x00\x00\x00\x00\x00\x00\x00\x00'...),
#>  nanoarrow.c_buffer.CBufferView(string[20 b] b'longer than 12 bytes'),
#>  nanoarrow.c_buffer.CBufferView(int64[8 b] 20))
```

---------

Co-authored-by: William Ayd <[email protected]>
  • Loading branch information
paleolimbot and WillAyd authored Sep 30, 2024
1 parent 97e7c61 commit d6ef480
Show file tree
Hide file tree
Showing 8 changed files with 163 additions and 18 deletions.
83 changes: 73 additions & 10 deletions python/src/nanoarrow/_array.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,12 @@ from cpython.pycapsule cimport PyCapsule_GetPointer
from cpython.unicode cimport PyUnicode_AsUTF8AndSize
from cpython cimport (
Py_buffer,
PyObject_GetBuffer,
PyBuffer_Release,
PyBUF_ANY_CONTIGUOUS,
PyBUF_FORMAT,
PyBytes_FromStringAndSize,
PyObject_GetBuffer,
PyUnicode_FromStringAndSize,
)

from nanoarrow_c cimport (
Expand All @@ -43,6 +45,9 @@ from nanoarrow_c cimport (
ArrowArrayView,
ArrowArrayViewComputeNullCount,
ArrowArrayViewInitFromSchema,
ArrowArrayViewIsNull,
ArrowArrayViewGetStringUnsafe,
ArrowArrayViewGetBytesUnsafe,
ArrowArrayViewSetArray,
ArrowArrayViewSetArrayMinimal,
ArrowBitCountSet,
Expand All @@ -57,6 +62,7 @@ from nanoarrow_c cimport (
ArrowValidationLevel,
NANOARROW_BUFFER_TYPE_DATA,
NANOARROW_BUFFER_TYPE_DATA_OFFSET,
NANOARROW_BUFFER_TYPE_DATA_VIEW,
NANOARROW_BUFFER_TYPE_TYPE_ID,
NANOARROW_BUFFER_TYPE_UNION_OFFSET,
NANOARROW_BUFFER_TYPE_VALIDITY,
Expand All @@ -78,6 +84,7 @@ from nanoarrow._device cimport Device, CSharedSyncEvent

from nanoarrow._buffer cimport CBuffer, CBufferView
from nanoarrow._schema cimport CSchema, CLayout
from nanoarrow cimport _types
from nanoarrow._utils cimport (
alloc_c_array,
alloc_c_device_array,
Expand Down Expand Up @@ -189,13 +196,48 @@ cdef class CArrayView:

@property
def n_buffers(self):
    """Number of buffers exposed by this array view.

    For string view/binary view storage the count is computed from the
    number of variadic buffers; for every other storage type it is taken
    from the layout.
    """
    if not _types.is_data_view(self._ptr.storage_type):
        return self.layout.n_buffers

    # Two leading buffers, then the variadic buffers, then one trailing buffer
    return 2 + self._ptr.n_variadic_buffers + 1

def buffer_type(self, int64_t i):
def _buffer_info(self, int64_t i):
if i < 0 or i >= self.n_buffers:
raise IndexError(f"{i} out of range [0, {self.n_buffers}]")

buffer_type = self._ptr.layout.buffer_type[i]
if (
_types.is_data_view(self._ptr.storage_type)
and i == (2 + self._ptr.n_variadic_buffers)
):
return (
NANOARROW_BUFFER_TYPE_DATA,
_types.INT64,
64,
<uintptr_t>self._ptr.array.buffers[i],
(self._ptr.n_variadic_buffers) * 8
)
elif (
_types.is_data_view(self._ptr.storage_type)
and i >= 2
):
return (
NANOARROW_BUFFER_TYPE_DATA,
_types.STRING if int(self._ptr.storage_type) == _types.STRING_VIEW else _types.BINARY,
0,
<uintptr_t>self._ptr.array.buffers[i],
(<int64_t*>self._ptr.array.buffers[2 + self._ptr.n_variadic_buffers])[i - 2]
)

return (
self._ptr.layout.buffer_type[i],
self._ptr.layout.buffer_data_type[i],
self._ptr.layout.element_size_bits[i],
<uintptr_t>self._ptr.buffer_views[i].data.data,
self._ptr.buffer_views[i].size_bytes
)

def buffer_type(self, int64_t i):
buffer_type = self._buffer_info(i)[0]
if buffer_type == NANOARROW_BUFFER_TYPE_VALIDITY:
return "validity"
elif buffer_type == NANOARROW_BUFFER_TYPE_TYPE_ID:
Expand All @@ -206,14 +248,17 @@ cdef class CArrayView:
return "data_offset"
elif buffer_type == NANOARROW_BUFFER_TYPE_DATA:
return "data"
elif buffer_type == NANOARROW_BUFFER_TYPE_DATA_VIEW:
return "data_view"
else:
return "none"

def buffer(self, int64_t i):
if i < 0 or i >= self.n_buffers:
raise IndexError(f"{i} out of range [0, {self.n_buffers}]")
_, data_type, element_size_bits, addr, size = self._buffer_info(i)

cdef ArrowBufferView* buffer_view = &(self._ptr.buffer_views[i])
cdef ArrowBufferView buffer_view
buffer_view.data.data = <void*>addr
buffer_view.size_bytes = size

# Check the buffer size here because the error later is cryptic.
# Buffer sizes are set to -1 when they are "unknown", so because of errors
Expand All @@ -224,10 +269,10 @@ cdef class CArrayView:

return CBufferView(
self._array_base,
<uintptr_t>buffer_view.data.data,
buffer_view.size_bytes,
self._ptr.layout.buffer_data_type[i],
self._ptr.layout.element_size_bits[i],
addr,
size,
data_type,
element_size_bits,
self._event
)

Expand All @@ -249,6 +294,24 @@ cdef class CArrayView:

return dictionary

def _iter_bytes(self, int64_t offset, int64_t length) -> bytes | None:
    """Iterate over a range of elements as ``bytes`` objects.

    This is a generator: it yields one item per element, ``None`` for a
    null element and a new ``bytes`` object otherwise.

    Parameters
    ----------
    offset : int
        Index of the first element to yield.
    length : int
        Number of elements to yield.
    """
    cdef ArrowBufferView item_view
    # ``length`` is an element count, not an end index, so the stop value is
    # offset + length (identical to the previous behaviour when offset == 0).
    for i in range(offset, offset + length):
        if ArrowArrayViewIsNull(self._ptr, i):
            yield None
        else:
            item_view = ArrowArrayViewGetBytesUnsafe(self._ptr, i)
            yield PyBytes_FromStringAndSize(item_view.data.as_char, item_view.size_bytes)

def _iter_str(self, int64_t offset, int64_t length) -> str | None:
    """Iterate over a range of elements as ``str`` objects.

    This is a generator: it yields one item per element, ``None`` for a
    null element and a new ``str`` object otherwise.

    Parameters
    ----------
    offset : int
        Index of the first element to yield.
    length : int
        Number of elements to yield.
    """
    cdef ArrowStringView item_view
    # ``length`` is an element count, not an end index, so the stop value is
    # offset + length (identical to the previous behaviour when offset == 0).
    for i in range(offset, offset + length):
        if ArrowArrayViewIsNull(self._ptr, i):
            yield None
        else:
            item_view = ArrowArrayViewGetStringUnsafe(self._ptr, i)
            yield PyUnicode_FromStringAndSize(item_view.data, item_view.size_bytes)

def __repr__(self):
return _repr_utils.array_view_repr(self)

Expand Down
2 changes: 2 additions & 0 deletions python/src/nanoarrow/_types.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,8 @@ cpdef bint has_time_unit(int type_id)

cpdef bint is_union(int type_id)

cpdef bint is_data_view(int type_id)

cdef int to_format(int type_id, int element_size_bits, size_t out_size, char* out)

cdef tuple from_format(format)
7 changes: 7 additions & 0 deletions python/src/nanoarrow/_types.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import enum
from typing import Callable, ClassVar

BINARY: CArrowType
BINARY_VIEW: CArrowType
BOOL: CArrowType
DATE32: CArrowType
DATE64: CArrowType
Expand Down Expand Up @@ -47,8 +48,10 @@ LARGE_STRING: CArrowType
LIST: CArrowType
MAP: CArrowType
NA: CArrowType
RUN_END_ENCODED: CArrowType
SPARSE_UNION: CArrowType
STRING: CArrowType
STRING_VIEW: CArrowType
STRUCT: CArrowType
TIME32: CArrowType
TIME64: CArrowType
Expand All @@ -61,6 +64,7 @@ UNINITIALIZED: CArrowType
__pyx_capi__: dict
__test__: dict
has_time_unit: _cython_3_0_11.cython_function_or_method
is_data_view: _cython_3_0_11.cython_function_or_method
is_decimal: _cython_3_0_11.cython_function_or_method
is_fixed_size: _cython_3_0_11.cython_function_or_method
is_floating_point: _cython_3_0_11.cython_function_or_method
Expand All @@ -72,6 +76,7 @@ sys_byteorder: str
class CArrowType(enum.IntFlag):
__new__: ClassVar[Callable] = ...
BINARY: ClassVar[CArrowType] = ...
BINARY_VIEW: ClassVar[CArrowType] = ...
BOOL: ClassVar[CArrowType] = ...
DATE32: ClassVar[CArrowType] = ...
DATE64: ClassVar[CArrowType] = ...
Expand Down Expand Up @@ -99,8 +104,10 @@ class CArrowType(enum.IntFlag):
LIST: ClassVar[CArrowType] = ...
MAP: ClassVar[CArrowType] = ...
NA: ClassVar[CArrowType] = ...
RUN_END_ENCODED: ClassVar[CArrowType] = ...
SPARSE_UNION: ClassVar[CArrowType] = ...
STRING: ClassVar[CArrowType] = ...
STRING_VIEW: ClassVar[CArrowType] = ...
STRUCT: ClassVar[CArrowType] = ...
TIME32: ClassVar[CArrowType] = ...
TIME64: ClassVar[CArrowType] = ...
Expand Down
11 changes: 11 additions & 0 deletions python/src/nanoarrow/_types.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,14 @@ cpdef bint is_union(int type_id):
)


cpdef bint is_data_view(int type_id):
    """Return True when type_id is the binary view or the string view type"""
    if type_id == _types.BINARY_VIEW:
        return True
    return type_id == _types.STRING_VIEW


cdef tuple from_format(format):
"""Convert a Python buffer protocol format string to a itemsize/type_id tuple
Expand Down Expand Up @@ -236,6 +244,9 @@ cdef int to_format(int type_id, int element_size_bits, size_t out_size, char* ou
elif type_id == _types.DECIMAL256:
format_const = "32s"
element_size_bits_calc = 256
elif is_data_view(type_id):
format_const = "16s"
element_size_bits_calc = 128
else:
raise ValueError(f"Unsupported Arrow type_id for format conversion: {type_id}")

Expand Down
2 changes: 2 additions & 0 deletions python/src/nanoarrow/c_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -547,8 +547,10 @@ def _append_using_buffer_builder(self, obj: Iterable) -> None:
_types.BINARY: "_append_bytes",
_types.LARGE_BINARY: "_append_bytes",
_types.FIXED_SIZE_BINARY: "_append_bytes",
_types.BINARY_VIEW: "_append_bytes",
_types.STRING: "_append_strings",
_types.LARGE_STRING: "_append_strings",
_types.STRING_VIEW: "_append_strings",
_types.INT8: "_append_using_array",
_types.UINT8: "_append_using_array",
_types.INT16: "_append_using_array",
Expand Down
8 changes: 8 additions & 0 deletions python/src/nanoarrow/iterator.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,12 @@ def _binary_iter(self, offset, length):
for start, end in zip(starts, ends):
yield bytes(data[start:end])

def _binary_view_iter(self, offset, length):
    """Return the array view's ``_iter_bytes()`` iterator for the given range."""
    array_view = self._array_view
    return array_view._iter_bytes(offset, length)

def _string_view_iter(self, offset, length):
    """Return the array view's ``_iter_str()`` iterator for the given range."""
    array_view = self._array_view
    return array_view._iter_str(offset, length)

def _decimal_iter(self, offset, length):
from decimal import Context, Decimal
from sys import byteorder
Expand Down Expand Up @@ -564,6 +570,8 @@ def _get_tzinfo(tz_string, strategy=None):
_types.DURATION: "_duration_iter",
_types.DECIMAL128: "_decimal_iter",
_types.DECIMAL256: "_decimal_iter",
_types.STRING_VIEW: "_string_view_iter",
_types.BINARY_VIEW: "_binary_view_iter",
}

_PRIMITIVE_TYPE_NAMES = [
Expand Down
40 changes: 40 additions & 0 deletions python/tests/test_c_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,32 @@ def test_c_array_from_iterable_string():
na.c_array([b"1234"], na.string())


def test_c_array_from_iterable_string_view():
    """Build string_view arrays from Python strings and check their buffers."""
    # First case: one value is long enough to need a variadic data buffer
    long_value = "a string longer than 12 bytes"
    with_variadic = na.c_array(["abc", None, long_value], na.string_view())
    assert with_variadic.length == 3
    assert with_variadic.null_count == 1
    assert with_variadic.n_buffers == 4

    view = with_variadic.view()
    assert len(view.buffer(0)) == 1
    assert bytes(view.buffer(2)) == b"a string longer than 12 bytes"
    assert list(view.buffer(3)) == [len(long_value)]

    # Make sure this also works when all strings are inlined (i.e., no variadic buffers)
    all_inlined = na.c_array(["abc", None, "short string"], na.string_view())
    assert all_inlined.length == 3
    assert all_inlined.null_count == 1
    assert all_inlined.n_buffers == 3

    view = all_inlined.view()
    assert len(view.buffer(0)) == 1
    assert len(view.buffer(1)) == 3
    assert len(bytes(view.buffer(1))) == 3 * 16
    assert list(view.buffer(2)) == []


def test_c_array_from_iterable_bytes():
string = na.c_array([b"abc", None, b"defg"], na.binary())
assert string.length == 3
Expand All @@ -311,6 +337,20 @@ def test_c_array_from_iterable_bytes():
na.c_array([buf_2d], na.binary())


def test_c_array_from_iterable__view():
    """Build a binary_view array from Python bytes and check its buffers."""
    long_value = b"a string longer than 12 bytes"
    binary = na.c_array([b"abc", None, long_value], na.binary_view())
    assert binary.length == 3
    assert binary.null_count == 1
    assert binary.n_buffers == 4

    view = binary.view()
    assert len(view.buffer(0)) == 1
    # Buffer 2 holds the long value; buffer 3 holds that buffer's size
    assert bytes(view.buffer(2)) == b"a string longer than 12 bytes"
    assert list(view.buffer(3)) == [len("a string longer than 12 bytes")]


def test_c_array_from_iterable_non_empty_nullable_without_nulls():
c_array = na.c_array([1, 2, 3], na.int32())
assert c_array.length == 3
Expand Down
28 changes: 20 additions & 8 deletions python/tests/test_iterator.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,35 +68,47 @@ def test_iterator_nullable_primitive():
assert list(iter_py(sliced)) == [2, 3, None]


def test_iterator_string():
array = na.c_array(["ab", "cde"], na.string())
@pytest.mark.parametrize(
"arrow_type", [na.string(), na.large_string(), na.string_view()]
)
def test_iterator_string(arrow_type):
array = na.c_array(["ab", "cde"], arrow_type)

assert list(iter_py(array)) == ["ab", "cde"]

sliced = array[1:]
assert list(iter_py(sliced)) == ["cde"]


def test_iterator_nullable_string():
array = na.c_array(["ab", "cde", None], na.string())
@pytest.mark.parametrize(
"arrow_type", [na.string(), na.large_string(), na.string_view()]
)
def test_iterator_nullable_string(arrow_type):
array = na.c_array(["ab", "cde", None], arrow_type)

assert list(iter_py(array)) == ["ab", "cde", None]

sliced = array[1:]
assert list(iter_py(sliced)) == ["cde", None]


def test_iterator_binary():
array = na.c_array([b"ab", b"cde"], na.binary())
@pytest.mark.parametrize(
"arrow_type", [na.binary(), na.large_binary(), na.binary_view()]
)
def test_iterator_binary(arrow_type):
array = na.c_array([b"ab", b"cde"], arrow_type)

assert list(iter_py(array)) == [b"ab", b"cde"]

sliced = array[1:]
assert list(iter_py(sliced)) == [b"cde"]


def test_iterator_nullable_binary():
array = na.c_array([b"ab", b"cde", None], na.binary())
@pytest.mark.parametrize(
"arrow_type", [na.binary(), na.large_binary(), na.binary_view()]
)
def test_iterator_nullable_binary(arrow_type):
array = na.c_array([b"ab", b"cde", None], arrow_type)

assert list(iter_py(array)) == [b"ab", b"cde", None]

Expand Down

0 comments on commit d6ef480

Please sign in to comment.