diff --git a/src/nanoarrow/common/array.c b/src/nanoarrow/common/array.c index 37e767826..c5287c8e2 100644 --- a/src/nanoarrow/common/array.c +++ b/src/nanoarrow/common/array.c @@ -33,6 +33,12 @@ static void ArrowArrayReleaseInternal(struct ArrowArray* array) { ArrowBitmapReset(&private_data->bitmap); ArrowBufferReset(&private_data->buffers[0]); ArrowBufferReset(&private_data->buffers[1]); + ArrowFree(private_data->buffer_data); + for (int32_t i = 0; i < private_data->n_variadic_buffers; ++i) { + ArrowBufferReset(&private_data->variadic_buffers[i]); + } + ArrowFree(private_data->variadic_buffers); + ArrowFree(private_data->variadic_buffer_sizes); ArrowFree(private_data); } @@ -74,8 +80,6 @@ static ArrowErrorCode ArrowArraySetStorageType(struct ArrowArray* array, case NANOARROW_TYPE_UNINITIALIZED: case NANOARROW_TYPE_NA: case NANOARROW_TYPE_RUN_END_ENCODED: - case NANOARROW_TYPE_BINARY_VIEW: - case NANOARROW_TYPE_STRING_VIEW: array->n_buffers = 0; break; @@ -107,6 +111,8 @@ static ArrowErrorCode ArrowArraySetStorageType(struct ArrowArray* array, case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: case NANOARROW_TYPE_FIXED_SIZE_BINARY: case NANOARROW_TYPE_DENSE_UNION: + case NANOARROW_TYPE_BINARY_VIEW: + case NANOARROW_TYPE_STRING_VIEW: array->n_buffers = 2; break; case NANOARROW_TYPE_STRING: @@ -151,12 +157,17 @@ ArrowErrorCode ArrowArrayInitFromType(struct ArrowArray* array, ArrowBitmapInit(&private_data->bitmap); ArrowBufferInit(&private_data->buffers[0]); ArrowBufferInit(&private_data->buffers[1]); - private_data->buffer_data[0] = NULL; - private_data->buffer_data[1] = NULL; - private_data->buffer_data[2] = NULL; + private_data->buffer_data = + (const void**)ArrowMalloc(sizeof(void*) * NANOARROW_MAX_FIXED_BUFFERS); + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; ++i) { + private_data->buffer_data[i] = NULL; + } + private_data->n_variadic_buffers = 0; + private_data->variadic_buffers = NULL; + private_data->variadic_buffer_sizes = NULL; array->private_data = private_data; - array->buffers = (const void**)(&private_data->buffer_data); + array->buffers = (const void**)(private_data->buffer_data); // These are not technically "storage" in the sense that they do not appear // in the ArrowSchemaView's storage_type member; however, allowing them here @@ -452,6 +463,19 @@ static ArrowErrorCode ArrowArrayFinalizeBuffers(struct ArrowArray* array) { NANOARROW_RETURN_NOT_OK(ArrowArrayFinalizeBuffers(array->dictionary)); } + const int32_t n_vbuf = private_data->n_variadic_buffers; + if (n_vbuf > 0) { + // TODO: the hard-coded 2's are tied to binary/string view implementations + // but maybe we should make them generic + private_data->buffer_data = + realloc(private_data->buffer_data, sizeof(void*) * (2 + n_vbuf + 1)); + for (int32_t i = 0; i < n_vbuf; ++i) { + private_data->buffer_data[2 + i] = private_data->variadic_buffers[i].data; + } + private_data->buffer_data[n_vbuf + 2] = private_data->variadic_buffer_sizes; + array->buffers = (const void**)(private_data->buffer_data); + } + return NANOARROW_OK; } @@ -459,7 +483,11 @@ static void ArrowArrayFlushInternalPointers(struct ArrowArray* array) { struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; - for (int64_t i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { + // TODO: this does not seem correct - should probably merge changes from + // ArrowArrayFinalizeBuffers into this + const int32_t limit = + private_data->n_variadic_buffers == 0 ? NANOARROW_MAX_FIXED_BUFFERS : 2; + for (int32_t i = 0; i < limit; i++) { private_data->buffer_data[i] = ArrowArrayBuffer(array, i)->data; } @@ -734,19 +762,20 @@ static int ArrowArrayViewSetArrayInternal(struct ArrowArrayView* array_view, if (array_view->storage_type == NANOARROW_TYPE_STRING_VIEW || array_view->storage_type == NANOARROW_TYPE_BINARY_VIEW) { const int64_t n_buffers = array->n_buffers; - const int64_t n_fixed_buffers = 2; + const int32_t n_fixed_buffers = 2; - int64_t n_variadic_buffers = n_buffers - n_fixed_buffers - 1; + // TODO: bounds checking + int32_t n_variadic_buffers = (int32_t)(n_buffers - n_fixed_buffers - 1); // Theoretically if the variadic buffers are not used the above subtraction // could yield a negative number n_variadic_buffers = (n_variadic_buffers < 0) ? 0 : n_variadic_buffers; array_view->n_variadic_buffers = n_variadic_buffers; // TODO: check malloc failures - array_view->variadic_buffer_views = - ArrowMalloc(sizeof(struct ArrowBufferView) * n_variadic_buffers); + array_view->variadic_buffer_views = (struct ArrowBufferView*)ArrowMalloc( + sizeof(struct ArrowBufferView) * n_variadic_buffers); - for (int64_t i = 0; i < n_variadic_buffers; ++i) { + for (int32_t i = 0; i < n_variadic_buffers; ++i) { ++buffers_required; array_view->variadic_buffer_views[i].data.data = array->buffers[i + n_fixed_buffers]; @@ -762,7 +791,7 @@ static int ArrowArrayViewSetArrayInternal(struct ArrowArrayView* array_view, // final buffer ++buffers_required; - array_view->variadic_buffer_sizes = (int64_t*)array->buffers[n_buffers - 1]; + array_view->variadic_buffer_sizes = (int32_t*)array->buffers[n_buffers - 1]; } if (buffers_required != array->n_buffers) { diff --git a/src/nanoarrow/common/array_test.cc b/src/nanoarrow/common/array_test.cc index 4e9f4a405..5e2403cde 100644 --- a/src/nanoarrow/common/array_test.cc +++ b/src/nanoarrow/common/array_test.cc @@ -904,7 +904,7 @@ TEST(ArrayTest, ArrayTestAppendToStringViewArray) { // Check that we can reserve ASSERT_EQ(ArrowArrayReserve(&array, 5), NANOARROW_OK); EXPECT_EQ(ArrowArrayBuffer(&array, 1)->capacity_bytes, - (5 + 1) * sizeof(union ArrowBinaryViewType)); + 5 * sizeof(union ArrowBinaryViewType)); EXPECT_EQ(ArrowArrayAppendString(&array, "1234"_asv), NANOARROW_OK); EXPECT_EQ(ArrowArrayAppendNull(&array, 2), NANOARROW_OK); @@ -916,10 +916,21 @@ TEST(ArrayTest, ArrayTestAppendToStringViewArray) { EXPECT_EQ(array.length, 5); EXPECT_EQ(array.null_count, 2); auto validity_buffer = reinterpret_cast(array.buffers[0]); - auto data_buffer = reinterpret_cast(array.buffers[1]); - auto out_of_line_buffer = reinterpret_cast(array.buffers[2]); + auto inline_buffer = + reinterpret_cast(array.buffers[1]); + auto variable_buffer = reinterpret_cast(array.buffers[2]); + auto sizes_buffer = reinterpret_cast(array.buffers[3]); + EXPECT_EQ(validity_buffer[0], 0b00011001); - EXPECT_EQ(memcmp(data_buffer, "1234", 4), 0); + EXPECT_EQ(memcmp(inline_buffer[0].inlined.data, "1234", 4), 0); + EXPECT_EQ(inline_buffer[0].inlined.size, 4); + EXPECT_EQ(memcmp(inline_buffer[3].ref.data, "long", 4), 0); + EXPECT_EQ(inline_buffer[3].ref.size, 27); + EXPECT_EQ(inline_buffer[3].ref.buffer_index, 0); + EXPECT_EQ(inline_buffer[3].ref.offset, 0); + + EXPECT_EQ(memcmp(variable_buffer, "longer_than_the_inline_size", 27), 0); + EXPECT_EQ(sizes_buffer[0], 27); // TODO: need to add overload for ViewArrayAsBytes /* diff --git a/src/nanoarrow/common/inline_array.h b/src/nanoarrow/common/inline_array.h index dafe50e52..a56430f34 100644 --- a/src/nanoarrow/common/inline_array.h +++ b/src/nanoarrow/common/inline_array.h @@ -22,6 +22,7 @@ #include #include #include +#include #include #include "nanoarrow/common/inline_buffer.h" @@ -501,20 +502,37 @@ static inline ArrowErrorCode ArrowArrayAppendBytes(struct ArrowArray* array, private_data->storage_type == NANOARROW_TYPE_BINARY_VIEW) { struct ArrowBuffer* data_buffer = ArrowArrayBuffer(array, 1); union ArrowBinaryViewType bvt; - bvt.inlined.size = value.size_bytes; + // TODO: bounds check + bvt.inlined.size = (int32_t)value.size_bytes; if (value.size_bytes <= NANOARROW_BINARY_VIEW_INLINE_SIZE) { memcpy(bvt.inlined.data, value.data.as_char, value.size_bytes); } else { - struct ArrowBuffer* current_out_buffer = ArrowArrayBuffer(array, 2); - // TODO: with C11 would be great to static_assert that sizeof(bvt.ref.data) <= - // NANOARROW_BINARY_VIEW_INLINE_SIZE - // assert(sizeof(bvt.ref.data) <= NANOARROW_BINARY_VIEW_INLINE_SIZE); - memcpy(bvt.ref.data, value.data.as_char, sizeof(bvt.ref.data)); - // TODO: do not hardcode buffer position - bvt.ref.buffer_index = 0; - bvt.ref.offset = current_out_buffer->size_bytes; - NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(current_out_buffer, &bvt, sizeof(bvt))); + int32_t n_vbufs = private_data->n_variadic_buffers; + if (n_vbufs == 0 || + private_data->variadic_buffers[n_vbufs - 1].size_bytes + value.size_bytes > + NANOARROW_BINARY_VIEW_BLOCK_SIZE) { + // TODO: ArrowMalloc? + ++n_vbufs; + // TODO: these should be realloc for > 0 case + private_data->variadic_buffers = + (struct ArrowBuffer*)malloc(sizeof(struct ArrowBuffer) * (n_vbufs)); + private_data->variadic_buffer_sizes = + (int32_t*)malloc(sizeof(int32_t) * (n_vbufs)); + ArrowBufferInit(&private_data->variadic_buffers[n_vbufs - 1]); + private_data->n_variadic_buffers = n_vbufs; + } + + struct ArrowBuffer* variadic_buf = &private_data->variadic_buffers[n_vbufs - 1]; + memcpy(bvt.ref.data, value.data.as_char, NANOARROW_BINARY_VIEW_PREVIEW_SIZE); + bvt.ref.buffer_index = n_vbufs - 1; + // TODO: bounds check + bvt.ref.offset = (int32_t)variadic_buf->size_bytes; + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppend(variadic_buf, value.data.as_char, value.size_bytes)); + // TODO: bounds check + private_data->variadic_buffer_sizes[n_vbufs - 1] = + (int32_t)variadic_buf->size_bytes; } NANOARROW_RETURN_NOT_OK(ArrowBufferAppend(data_buffer, &bvt, sizeof(bvt))); } else { diff --git a/src/nanoarrow/common/inline_types.h b/src/nanoarrow/common/inline_types.h index f00b645a0..fd3689f55 100644 --- a/src/nanoarrow/common/inline_types.h +++ b/src/nanoarrow/common/inline_types.h @@ -626,12 +626,8 @@ enum ArrowBufferType { NANOARROW_BUFFER_TYPE_DATA_VIEW }; -/// \brief The maximum number of buffers in an ArrowArrayView or ArrowLayout +/// \brief The maximum number of fixed buffers in an ArrowArrayView or ArrowLayout /// \ingroup nanoarrow-array-view -/// -/// All currently supported types have 3 buffers or fewer; however, future types -/// may involve a variable number of buffers (e.g., string view). These buffers -/// will be represented by separate members of the ArrowArrayView or ArrowLayout. #define NANOARROW_MAX_FIXED_BUFFERS 3 /// \brief An non-owning view of a string @@ -817,10 +813,10 @@ struct ArrowArrayView { int8_t* union_type_id_map; /// \brief Number of variadic buffers - int64_t n_variadic_buffers; + int32_t n_variadic_buffers; /// \brief Size of each variadic buffer - int64_t* variadic_buffer_sizes; + int32_t* variadic_buffer_sizes; /// \brief Variadic buffer contents struct ArrowBufferView* variadic_buffer_views; @@ -838,8 +834,8 @@ struct ArrowArrayPrivateData { // The array of pointers to buffers. This must be updated after a sequence // of appends to synchronize its values with the actual buffer addresses - // (which may have ben reallocated uring that time) - const void* buffer_data[NANOARROW_MAX_FIXED_BUFFERS]; + // (which may have ben reallocated during that time) + const void** buffer_data; // The storage data type, or NANOARROW_TYPE_UNINITIALIZED if unknown enum ArrowType storage_type; @@ -851,6 +847,15 @@ struct ArrowArrayPrivateData { // In the future this could be replaced with a type id<->child mapping // to support constructing unions in append mode where type_id != child_index int8_t union_type_id_is_child_index; + + // Number of variadic buffers for binary view types + int32_t n_variadic_buffers; + + // Variadic buffers for binary view types + struct ArrowBuffer* variadic_buffers; + + // Size of each variadic buffer in bytes + int32_t* variadic_buffer_sizes; }; /// \brief A representation of an interval.