feat(python): Add StringView and BinaryView IO to Python bindings (#637)

This PR implements StringView and BinaryView support in the Python bindings. It is a thin wrapper around the C functions that were added, although we should perhaps move some of the buffer info calculation into the C library, since I had to work around the same gap in the R bindings as well.

```python
import nanoarrow as na

array = na.Array(["abc", "def", None, "longer than 12 bytes"], na.string_view())
array
#> nanoarrow.Array<string_view>[4]
#> 'abc'
#> 'def'
#> None
#> 'longer than 12 bytes'

array.buffers
#> (nanoarrow.c_buffer.CBufferView(bool[1 b] 11010000),
#>  nanoarrow.c_buffer.CBufferView(string_view[64 b] b'\x03\x00\x00\x00abc\x00\x00\x00\x00\x00\x00\x00\x00\x00'...),
#>  nanoarrow.c_buffer.CBufferView(string[20 b] b'longer than 12 bytes'),
#>  nanoarrow.c_buffer.CBufferView(int64[8 b] 20))
```

---------

Co-authored-by: William Ayd <[email protected]>
apache · Sep 30, 2024 · d6ef480 · d6ef480
1 parent 97e7c61
commit d6ef480
Show file tree

Hide file tree

Showing 8 changed files with 163 additions and 18 deletions.
diff --git a/python/src/nanoarrow/_array.pyx b/python/src/nanoarrow/_array.pyx
@@ -22,10 +22,12 @@ from cpython.pycapsule cimport PyCapsule_GetPointer
 from cpython.unicode cimport PyUnicode_AsUTF8AndSize
 from cpython cimport (
     Py_buffer,
-    PyObject_GetBuffer,
     PyBuffer_Release,
     PyBUF_ANY_CONTIGUOUS,
     PyBUF_FORMAT,
+    PyBytes_FromStringAndSize,
+    PyObject_GetBuffer,
+    PyUnicode_FromStringAndSize,
 )
 
 from nanoarrow_c cimport (
@@ -43,6 +45,9 @@ from nanoarrow_c cimport (
     ArrowArrayView,
     ArrowArrayViewComputeNullCount,
     ArrowArrayViewInitFromSchema,
+    ArrowArrayViewIsNull,
+    ArrowArrayViewGetStringUnsafe,
+    ArrowArrayViewGetBytesUnsafe,
     ArrowArrayViewSetArray,
     ArrowArrayViewSetArrayMinimal,
     ArrowBitCountSet,
@@ -57,6 +62,7 @@ from nanoarrow_c cimport (
     ArrowValidationLevel,
     NANOARROW_BUFFER_TYPE_DATA,
     NANOARROW_BUFFER_TYPE_DATA_OFFSET,
+    NANOARROW_BUFFER_TYPE_DATA_VIEW,
     NANOARROW_BUFFER_TYPE_TYPE_ID,
     NANOARROW_BUFFER_TYPE_UNION_OFFSET,
     NANOARROW_BUFFER_TYPE_VALIDITY,
@@ -78,6 +84,7 @@ from nanoarrow._device cimport Device, CSharedSyncEvent
 
 from nanoarrow._buffer cimport CBuffer, CBufferView
 from nanoarrow._schema cimport CSchema, CLayout
+from nanoarrow cimport _types
 from nanoarrow._utils cimport (
     alloc_c_array,
     alloc_c_device_array,
@@ -189,13 +196,48 @@ cdef class CArrayView:
 
     @property
     def n_buffers(self):
+        # Binary/string view arrays report two leading buffers, then one buffer
+        # per variadic data buffer, then one trailing buffer (the int64 buffer
+        # that _buffer_info() describes at index 2 + n_variadic_buffers).
+        if _types.is_data_view(self._ptr.storage_type):
+            return 2 + self._ptr.n_variadic_buffers + 1
+
         return self.layout.n_buffers
 
-    def buffer_type(self, int64_t i):
+    def _buffer_info(self, int64_t i):
+        # Describe buffer i as a tuple of
+        # (buffer_type, data_type, element_size_bits, address, size_bytes).
+        # Raises IndexError when i is outside the range of n_buffers.
         if i < 0 or i >= self.n_buffers:
             raise IndexError(f"{i} out of range [0, {self.n_buffers}]")
 
-        buffer_type = self._ptr.layout.buffer_type[i]
+        # Last buffer of a view array: reported as an int64 data buffer with
+        # 8 bytes per variadic buffer. The branch below reads per-buffer sizes
+        # from this same buffer.
+        if (
+            _types.is_data_view(self._ptr.storage_type)
+            and i == (2 + self._ptr.n_variadic_buffers)
+        ):
+            return (
+                NANOARROW_BUFFER_TYPE_DATA,
+                _types.INT64,
+                64,
+                <uintptr_t>self._ptr.array.buffers[i],
+                (self._ptr.n_variadic_buffers) * 8
+            )
+        # Variadic data buffers (index 2 up to, but excluding, the last buffer):
+        # reported as STRING or BINARY data depending on the storage type; the
+        # size in bytes is element (i - 2) of the trailing int64 buffer.
+        elif (
+            _types.is_data_view(self._ptr.storage_type)
+            and i >= 2
+        ):
+            return (
+                NANOARROW_BUFFER_TYPE_DATA,
+                _types.STRING if int(self._ptr.storage_type) == _types.STRING_VIEW else _types.BINARY,
+                0,
+                <uintptr_t>self._ptr.array.buffers[i],
+                (<int64_t*>self._ptr.array.buffers[2 + self._ptr.n_variadic_buffers])[i - 2]
+            )
+
+        # Everything else (including buffers 0 and 1 of a view array) is
+        # described by the layout and the precomputed buffer views.
+        return (
+            self._ptr.layout.buffer_type[i],
+            self._ptr.layout.buffer_data_type[i],
+            self._ptr.layout.element_size_bits[i],
+            <uintptr_t>self._ptr.buffer_views[i].data.data,
+            self._ptr.buffer_views[i].size_bytes
+        )
+
+    def buffer_type(self, int64_t i):
+        buffer_type = self._buffer_info(i)[0]
         if buffer_type == NANOARROW_BUFFER_TYPE_VALIDITY:
             return "validity"
         elif buffer_type == NANOARROW_BUFFER_TYPE_TYPE_ID:
@@ -206,14 +248,17 @@ cdef class CArrayView:
             return "data_offset"
         elif buffer_type == NANOARROW_BUFFER_TYPE_DATA:
             return "data"
+        elif buffer_type == NANOARROW_BUFFER_TYPE_DATA_VIEW:
+            return "data_view"
         else:
             return "none"
 
     def buffer(self, int64_t i):
-        if i < 0 or i >= self.n_buffers:
-            raise IndexError(f"{i} out of range [0, {self.n_buffers}]")
+        _, data_type, element_size_bits, addr, size = self._buffer_info(i)
 
-        cdef ArrowBufferView* buffer_view = &(self._ptr.buffer_views[i])
+        cdef ArrowBufferView buffer_view
+        buffer_view.data.data = <void*>addr
+        buffer_view.size_bytes = size
 
         # Check the buffer size here because the error later is cryptic.
         # Buffer sizes are set to -1 when they are "unknown", so because of errors
@@ -224,10 +269,10 @@ cdef class CArrayView:
 
         return CBufferView(
             self._array_base,
-            <uintptr_t>buffer_view.data.data,
-            buffer_view.size_bytes,
-            self._ptr.layout.buffer_data_type[i],
-            self._ptr.layout.element_size_bits[i],
+            addr,
+            size,
+            data_type,
+            element_size_bits,
             self._event
         )
 
@@ -249,6 +294,24 @@ cdef class CArrayView:
 
         return dictionary
 
+    def _iter_bytes(self, int64_t offset, int64_t length) -> bytes | None:
+        """Yield ``length`` elements starting at ``offset`` as ``bytes``
+
+        Null elements are yielded as ``None``. ``offset`` is the index of the
+        first element to visit and ``length`` is the number of elements, so
+        the indices visited are ``offset`` through ``offset + length - 1``.
+        """
+        cdef ArrowBufferView item_view
+        # length is an element count, not an end index.
+        # NOTE(review): indices are passed straight to the *Unsafe getter, which
+        # presumably does no bounds checking -- callers must keep
+        # offset + length within the array; confirm against the C API.
+        for i in range(offset, offset + length):
+            if ArrowArrayViewIsNull(self._ptr, i):
+                yield None
+            else:
+                item_view = ArrowArrayViewGetBytesUnsafe(self._ptr, i)
+                yield PyBytes_FromStringAndSize(item_view.data.as_char, item_view.size_bytes)
+
+    def _iter_str(self, int64_t offset, int64_t length) -> str | None:
+        """Yield ``length`` elements starting at ``offset`` as ``str``
+
+        Null elements are yielded as ``None``. ``offset`` is the index of the
+        first element to visit and ``length`` is the number of elements, so
+        the indices visited are ``offset`` through ``offset + length - 1``.
+        """
+        cdef ArrowStringView item_view
+        # length is an element count, not an end index.
+        # NOTE(review): indices are passed straight to the *Unsafe getter, which
+        # presumably does no bounds checking -- callers must keep
+        # offset + length within the array; confirm against the C API.
+        for i in range(offset, offset + length):
+            if ArrowArrayViewIsNull(self._ptr, i):
+                yield None
+            else:
+                item_view = ArrowArrayViewGetStringUnsafe(self._ptr, i)
+                yield PyUnicode_FromStringAndSize(item_view.data, item_view.size_bytes)
+
     def __repr__(self):
         return _repr_utils.array_view_repr(self)
 

diff --git a/python/src/nanoarrow/_types.pxd b/python/src/nanoarrow/_types.pxd
@@ -90,6 +90,8 @@ cpdef bint has_time_unit(int type_id)
 
 cpdef bint is_union(int type_id)
 
+cpdef bint is_data_view(int type_id)
+
 cdef int to_format(int type_id, int element_size_bits, size_t out_size, char* out)
 
 cdef tuple from_format(format)
diff --git a/python/src/nanoarrow/_types.pyi b/python/src/nanoarrow/_types.pyi
@@ -20,6 +20,7 @@ import enum
 from typing import Callable, ClassVar
 
 BINARY: CArrowType
+BINARY_VIEW: CArrowType
 BOOL: CArrowType
 DATE32: CArrowType
 DATE64: CArrowType
@@ -47,8 +48,10 @@ LARGE_STRING: CArrowType
 LIST: CArrowType
 MAP: CArrowType
 NA: CArrowType
+RUN_END_ENCODED: CArrowType
 SPARSE_UNION: CArrowType
 STRING: CArrowType
+STRING_VIEW: CArrowType
 STRUCT: CArrowType
 TIME32: CArrowType
 TIME64: CArrowType
@@ -61,6 +64,7 @@ UNINITIALIZED: CArrowType
 __pyx_capi__: dict
 __test__: dict
 has_time_unit: _cython_3_0_11.cython_function_or_method
+is_data_view: _cython_3_0_11.cython_function_or_method
 is_decimal: _cython_3_0_11.cython_function_or_method
 is_fixed_size: _cython_3_0_11.cython_function_or_method
 is_floating_point: _cython_3_0_11.cython_function_or_method
@@ -72,6 +76,7 @@ sys_byteorder: str
 class CArrowType(enum.IntFlag):
     __new__: ClassVar[Callable] = ...
     BINARY: ClassVar[CArrowType] = ...
+    BINARY_VIEW: ClassVar[CArrowType] = ...
     BOOL: ClassVar[CArrowType] = ...
     DATE32: ClassVar[CArrowType] = ...
     DATE64: ClassVar[CArrowType] = ...
@@ -99,8 +104,10 @@ class CArrowType(enum.IntFlag):
     LIST: ClassVar[CArrowType] = ...
     MAP: ClassVar[CArrowType] = ...
     NA: ClassVar[CArrowType] = ...
+    RUN_END_ENCODED: ClassVar[CArrowType] = ...
     SPARSE_UNION: ClassVar[CArrowType] = ...
     STRING: ClassVar[CArrowType] = ...
+    STRING_VIEW: ClassVar[CArrowType] = ...
     STRUCT: ClassVar[CArrowType] = ...
     TIME32: ClassVar[CArrowType] = ...
     TIME64: ClassVar[CArrowType] = ...

diff --git a/python/src/nanoarrow/_types.pyx b/python/src/nanoarrow/_types.pyx
@@ -109,6 +109,14 @@ cpdef bint is_union(int type_id):
     )
 
 
+cpdef bint is_data_view(int type_id):
+    """Return True when type_id is the binary view or string view type"""
+    return type_id == _types.BINARY_VIEW or type_id == _types.STRING_VIEW
+
+
 cdef tuple from_format(format):
     """Convert a Python buffer protocol format string to a itemsize/type_id tuple
 
@@ -236,6 +244,9 @@ cdef int to_format(int type_id, int element_size_bits, size_t out_size, char* ou
     elif type_id == _types.DECIMAL256:
         format_const = "32s"
         element_size_bits_calc = 256
+    elif is_data_view(type_id):
+        format_const = "16s"
+        element_size_bits_calc = 128
     else:
         raise ValueError(f"Unsupported Arrow type_id for format conversion: {type_id}")
 

diff --git a/python/src/nanoarrow/c_array.py b/python/src/nanoarrow/c_array.py
@@ -547,8 +547,10 @@ def _append_using_buffer_builder(self, obj: Iterable) -> None:
     _types.BINARY: "_append_bytes",
     _types.LARGE_BINARY: "_append_bytes",
     _types.FIXED_SIZE_BINARY: "_append_bytes",
+    _types.BINARY_VIEW: "_append_bytes",
     _types.STRING: "_append_strings",
     _types.LARGE_STRING: "_append_strings",
+    _types.STRING_VIEW: "_append_strings",
     _types.INT8: "_append_using_array",
     _types.UINT8: "_append_using_array",
     _types.INT16: "_append_using_array",

diff --git a/python/src/nanoarrow/iterator.py b/python/src/nanoarrow/iterator.py
@@ -322,6 +322,12 @@ def _binary_iter(self, offset, length):
             for start, end in zip(starts, ends):
                 yield bytes(data[start:end])
 
+    def _binary_view_iter(self, offset, length):
+        """Iterate binary view elements by delegating to the array view."""
+        array_view = self._array_view
+        return array_view._iter_bytes(offset, length)
+
+    def _string_view_iter(self, offset, length):
+        """Iterate string view elements by delegating to the array view."""
+        array_view = self._array_view
+        return array_view._iter_str(offset, length)
+
     def _decimal_iter(self, offset, length):
         from decimal import Context, Decimal
         from sys import byteorder
@@ -564,6 +570,8 @@ def _get_tzinfo(tz_string, strategy=None):
     _types.DURATION: "_duration_iter",
     _types.DECIMAL128: "_decimal_iter",
     _types.DECIMAL256: "_decimal_iter",
+    _types.STRING_VIEW: "_string_view_iter",
+    _types.BINARY_VIEW: "_binary_view_iter",
 }
 
 _PRIMITIVE_TYPE_NAMES = [

diff --git a/python/tests/test_c_array.py b/python/tests/test_c_array.py
@@ -288,6 +288,32 @@ def test_c_array_from_iterable_string():
         na.c_array([b"1234"], na.string())
 
 
+def test_c_array_from_iterable_string_view():
+    """Build string_view arrays from iterables and check their buffers."""
+    # Case 1: one value is longer than 12 bytes, so a data buffer is expected
+    # in addition to the validity, view and trailing sizes buffers.
+    with_long_value = na.c_array(
+        ["abc", None, "a string longer than 12 bytes"], na.string_view()
+    )
+    assert with_long_value.length == 3
+    assert with_long_value.null_count == 1
+    assert with_long_value.n_buffers == 4
+
+    long_view = with_long_value.view()
+    assert len(long_view.buffer(0)) == 1
+    assert bytes(long_view.buffer(2)) == b"a string longer than 12 bytes"
+    assert list(long_view.buffer(3)) == [len("a string longer than 12 bytes")]
+
+    # Case 2: every value is inlined (i.e., no variadic buffers), which leaves
+    # only three buffers and an empty trailing buffer.
+    all_inlined = na.c_array(["abc", None, "short string"], na.string_view())
+    assert all_inlined.length == 3
+    assert all_inlined.null_count == 1
+    assert all_inlined.n_buffers == 3
+
+    inlined_view = all_inlined.view()
+    assert len(inlined_view.buffer(0)) == 1
+    assert len(inlined_view.buffer(1)) == 3
+    assert len(bytes(inlined_view.buffer(1))) == 3 * 16
+    assert list(inlined_view.buffer(2)) == []
+
+
 def test_c_array_from_iterable_bytes():
     string = na.c_array([b"abc", None, b"defg"], na.binary())
     assert string.length == 3
@@ -311,6 +337,20 @@ def test_c_array_from_iterable_bytes():
         na.c_array([buf_2d], na.binary())
 
 
+def test_c_array_from_iterable__view():
+    """Build a binary_view array from an iterable and check its buffers."""
+    # NOTE(review): the function name looks like it dropped "binary"
+    # (test_c_array_from_iterable_binary_view) -- confirm before renaming.
+    binary = na.c_array(
+        [b"abc", None, b"a string longer than 12 bytes"], na.binary_view()
+    )
+    assert binary.length == 3
+    assert binary.null_count == 1
+    assert binary.n_buffers == 4
+
+    view = binary.view()
+    assert len(view.buffer(0)) == 1
+    assert bytes(view.buffer(2)) == b"a string longer than 12 bytes"
+    assert list(view.buffer(3)) == [len("a string longer than 12 bytes")]
+
 def test_c_array_from_iterable_non_empty_nullable_without_nulls():
     c_array = na.c_array([1, 2, 3], na.int32())
     assert c_array.length == 3

diff --git a/python/tests/test_iterator.py b/python/tests/test_iterator.py
@@ -68,35 +68,47 @@ def test_iterator_nullable_primitive():
     assert list(iter_py(sliced)) == [2, 3, None]
 
 
-def test_iterator_string():
-    array = na.c_array(["ab", "cde"], na.string())
[email protected](
+    "arrow_type", [na.string(), na.large_string(), na.string_view()]
+)
+def test_iterator_string(arrow_type):
+    # Iterating should yield the same Python strings for each parametrized
+    # string type, both for the full array and for a slice.
+    array = na.c_array(["ab", "cde"], arrow_type)
 
     assert list(iter_py(array)) == ["ab", "cde"]
 
     sliced = array[1:]
     assert list(iter_py(sliced)) == ["cde"]
 
 
-def test_iterator_nullable_string():
-    array = na.c_array(["ab", "cde", None], na.string())
[email protected](
+    "arrow_type", [na.string(), na.large_string(), na.string_view()]
+)
+def test_iterator_nullable_string(arrow_type):
+    # Null elements should come back as None for each parametrized string
+    # type, both for the full array and for a slice.
+    array = na.c_array(["ab", "cde", None], arrow_type)
 
     assert list(iter_py(array)) == ["ab", "cde", None]
 
     sliced = array[1:]
     assert list(iter_py(sliced)) == ["cde", None]
 
 
-def test_iterator_binary():
-    array = na.c_array([b"ab", b"cde"], na.binary())
[email protected](
+    "arrow_type", [na.binary(), na.large_binary(), na.binary_view()]
+)
+def test_iterator_binary(arrow_type):
+    # Iterating should yield the same bytes objects for each parametrized
+    # binary type, both for the full array and for a slice.
+    array = na.c_array([b"ab", b"cde"], arrow_type)
 
     assert list(iter_py(array)) == [b"ab", b"cde"]
 
     sliced = array[1:]
     assert list(iter_py(sliced)) == [b"cde"]
 
 
-def test_iterator_nullable_binary():
-    array = na.c_array([b"ab", b"cde", None], na.binary())
+@pytest.mark.parametrize(
+    "arrow_type", [na.binary(), na.large_binary(), na.binary_view()]
+)
+def test_iterator_nullable_binary(arrow_type):
+    array = na.c_array([b"ab", b"cde", None], arrow_type)
 
     assert list(iter_py(array)) == [b"ab", b"cde", None]