Skip to content

Commit

Permalink
fix: Properly ingest Binary View types without variadic buffers (#635)
Browse files Browse the repository at this point in the history
closes #634
  • Loading branch information
WillAyd authored Sep 27, 2024
1 parent 53c9f8f commit 97e7c61
Show file tree
Hide file tree
Showing 3 changed files with 129 additions and 43 deletions.
10 changes: 4 additions & 6 deletions src/nanoarrow/common/array.c
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ static ArrowErrorCode ArrowArraySetStorageType(struct ArrowArray* array,
break;
case NANOARROW_TYPE_BINARY_VIEW:
case NANOARROW_TYPE_STRING_VIEW:
array->n_buffers = NANOARROW_BINARY_VIEW_FIXED_BUFFERS;
array->n_buffers = NANOARROW_BINARY_VIEW_FIXED_BUFFERS + 1;
break;
case NANOARROW_TYPE_STRING:
case NANOARROW_TYPE_LARGE_STRING:
Expand Down Expand Up @@ -765,11 +765,9 @@ static int ArrowArrayViewSetArrayInternal(struct ArrowArrayView* array_view,
const int32_t nfixed_buf = NANOARROW_BINARY_VIEW_FIXED_BUFFERS;

const int32_t nvariadic_buf = (int32_t)(n_buffers - nfixed_buf - 1);
if (nvariadic_buf > 0) {
array_view->n_variadic_buffers = nvariadic_buf;
buffers_required += nvariadic_buf + 1;
array_view->variadic_buffer_sizes = (int64_t*)array->buffers[n_buffers - 1];
}
array_view->n_variadic_buffers = nvariadic_buf;
buffers_required += nvariadic_buf + 1;
array_view->variadic_buffer_sizes = (int64_t*)array->buffers[n_buffers - 1];
}

if (buffers_required != array->n_buffers) {
Expand Down
149 changes: 117 additions & 32 deletions src/nanoarrow/common/array_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -895,9 +895,48 @@ TEST(ArrayTest, ArrayTestAppendToLargeStringArray) {
ArrowArrayRelease(&array);
}

template <enum ArrowType ArrowT, typename ValueT,
ArrowErrorCode (*AppendFunc)(struct ArrowArray*, ValueT)>
void TestAppendToDataViewArray() {
template <enum ArrowType ArrowT, typename ValueT>
void TestAppendToInlinedDataViewArray(
std::function<ArrowErrorCode(struct ArrowArray*, ValueT)> AppendFunc) {
struct ArrowArray array;

ASSERT_EQ(ArrowArrayInitFromType(&array, ArrowT), NANOARROW_OK);
EXPECT_EQ(ArrowArrayStartAppending(&array), NANOARROW_OK);

// Check that we can reserve
ASSERT_EQ(ArrowArrayReserve(&array, 5), NANOARROW_OK);
EXPECT_EQ(ArrowArrayBuffer(&array, 1)->capacity_bytes,
5 * sizeof(union ArrowBinaryView));

EXPECT_EQ(AppendFunc(&array, ValueT{{"inlinestring"}, 12}), NANOARROW_OK);
EXPECT_EQ(ArrowArrayAppendNull(&array, 2), NANOARROW_OK);
EXPECT_EQ(ArrowArrayAppendEmpty(&array, 1), NANOARROW_OK);
EXPECT_EQ(ArrowArrayFinishBuildingDefault(&array, nullptr), NANOARROW_OK);

EXPECT_EQ(array.length, 4);
EXPECT_EQ(array.null_count, 2);
EXPECT_EQ(array.n_buffers, 3);
auto validity_buffer = reinterpret_cast<const uint8_t*>(array.buffers[0]);
auto inline_buffer = reinterpret_cast<const union ArrowBinaryView*>(array.buffers[1]);
auto sizes_buffer = reinterpret_cast<const int64_t*>(array.buffers[2]);

EXPECT_EQ(validity_buffer[0], 0b00001001);
EXPECT_EQ(memcmp(inline_buffer[0].inlined.data, "inlinestring", 12), 0);
EXPECT_EQ(inline_buffer[0].inlined.size, 12);

EXPECT_EQ(sizes_buffer, nullptr);

// TODO: issue #633
/*
EXPECT_THAT(nanoarrow::ViewArrayAsBytes<64>(&array),
ElementsAre("1234"_asv, NA, NA, "56789"_asv, ""_asv));
*/
ArrowArrayRelease(&array);
};

template <enum ArrowType ArrowT, typename ValueT>
void TestAppendToDataViewArray(
std::function<ArrowErrorCode(struct ArrowArray*, ValueT)> AppendFunc) {
struct ArrowArray array;

ASSERT_EQ(ArrowArrayInitFromType(&array, ArrowT), NANOARROW_OK);
Expand Down Expand Up @@ -925,6 +964,7 @@ void TestAppendToDataViewArray() {

EXPECT_EQ(array.length, 7);
EXPECT_EQ(array.null_count, 2);
EXPECT_EQ(array.n_buffers, 5);
auto validity_buffer = reinterpret_cast<const uint8_t*>(array.buffers[0]);
auto inline_buffer = reinterpret_cast<const union ArrowBinaryView*>(array.buffers[1]);
auto vbuf1 = reinterpret_cast<const char*>(array.buffers[2]);
Expand Down Expand Up @@ -964,13 +1004,17 @@ void TestAppendToDataViewArray() {
};

TEST(ArrayTest, ArrayTestAppendToBinaryViewArray) {
TestAppendToDataViewArray<NANOARROW_TYPE_STRING_VIEW, struct ArrowStringView,
ArrowArrayAppendString>();
TestAppendToInlinedDataViewArray<NANOARROW_TYPE_STRING_VIEW, struct ArrowStringView>(
ArrowArrayAppendString);
TestAppendToDataViewArray<NANOARROW_TYPE_STRING_VIEW, struct ArrowStringView>(
ArrowArrayAppendString);
};

TEST(ArrayTest, ArrayTestAppendToStringViewArray) {
TestAppendToDataViewArray<NANOARROW_TYPE_BINARY_VIEW, struct ArrowBufferView,
ArrowArrayAppendBytes>();
TestAppendToInlinedDataViewArray<NANOARROW_TYPE_BINARY_VIEW, struct ArrowBufferView>(
ArrowArrayAppendBytes);
TestAppendToDataViewArray<NANOARROW_TYPE_BINARY_VIEW, struct ArrowBufferView>(
ArrowArrayAppendBytes);
};

TEST(ArrayTest, ArrayTestAppendToFixedSizeBinaryArray) {
Expand Down Expand Up @@ -3343,8 +3387,49 @@ TEST(ArrayViewTest, ArrayViewTestGetString) {
TestGetFromBinary<FixedSizeBinaryBuilder>(fixed_size_builder);
}

template <typename BuilderClass>
void TestGetFromBinaryView(BuilderClass& builder) {
template <typename BuilderClass, typename ValueT>
void TestGetFromInlinedBinaryView(
BuilderClass& builder,
std::function<ValueT(const struct ArrowArrayView*, int64_t)> GetValueFunc,
std::function<const void*(const ValueT*)> GetValueDataFunc) {
struct ArrowArray array;
struct ArrowSchema schema;
struct ArrowArrayView array_view;
struct ArrowError error;

auto type = builder.type();
ARROW_EXPECT_OK(builder.Append("1234"));
ARROW_EXPECT_OK(builder.AppendNulls(2));
ARROW_EXPECT_OK(builder.Append("four"));

auto maybe_arrow_array = builder.Finish();
ARROW_EXPECT_OK(maybe_arrow_array);
auto arrow_array = maybe_arrow_array.ValueUnsafe();

ARROW_EXPECT_OK(ExportArray(*arrow_array, &array, &schema));
ASSERT_EQ(ArrowArrayViewInitFromSchema(&array_view, &schema, &error), NANOARROW_OK);
ASSERT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK);
EXPECT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error),
NANOARROW_OK);

EXPECT_EQ(array_view.n_variadic_buffers, 0);
EXPECT_EQ(ArrowArrayViewIsNull(&array_view, 2), 1);
EXPECT_EQ(ArrowArrayViewIsNull(&array_view, 3), 0);

const auto value = GetValueFunc(&array_view, 3);
EXPECT_EQ(value.size_bytes, strlen("four"));
EXPECT_EQ(memcmp(GetValueDataFunc(&value), "four", value.size_bytes), 0);

ArrowArrayViewReset(&array_view);
ArrowArrayRelease(&array);
ArrowSchemaRelease(&schema);
}

template <typename BuilderClass, typename ValueT>
void TestGetFromBinaryView(
BuilderClass& builder,
std::function<ValueT(const struct ArrowArrayView*, int64_t)> GetValueFunc,
std::function<const void*(const ValueT*)> GetValueDataFunc) {
struct ArrowArray array;
struct ArrowSchema schema;
struct ArrowArrayView array_view;
Expand Down Expand Up @@ -3380,29 +3465,17 @@ void TestGetFromBinaryView(BuilderClass& builder) {
EXPECT_EQ(ArrowArrayViewIsNull(&array_view, 2), 1);
EXPECT_EQ(ArrowArrayViewIsNull(&array_view, 3), 0);

auto string_view = ArrowArrayViewGetStringUnsafe(&array_view, 3);
EXPECT_EQ(string_view.size_bytes, strlen("four"));
EXPECT_EQ(memcmp(string_view.data, "four", string_view.size_bytes), 0);

auto buffer_view = ArrowArrayViewGetBytesUnsafe(&array_view, 3);
EXPECT_EQ(buffer_view.size_bytes, strlen("four"));
EXPECT_EQ(memcmp(buffer_view.data.as_char, "four", buffer_view.size_bytes), 0);

string_view = ArrowArrayViewGetStringUnsafe(&array_view, 4);
EXPECT_EQ(string_view.size_bytes, str1.size());
EXPECT_EQ(memcmp(string_view.data, str1.c_str(), string_view.size_bytes), 0);
const auto value1 = GetValueFunc(&array_view, 3);
EXPECT_EQ(value1.size_bytes, strlen("four"));
EXPECT_EQ(memcmp(GetValueDataFunc(&value1), "four", value1.size_bytes), 0);

string_view = ArrowArrayViewGetStringUnsafe(&array_view, 6);
EXPECT_EQ(string_view.size_bytes, str2.size());
EXPECT_EQ(memcmp(string_view.data, str2.c_str(), string_view.size_bytes), 0);
const auto value2 = GetValueFunc(&array_view, 4);
EXPECT_EQ(value2.size_bytes, str1.size());
EXPECT_EQ(memcmp(GetValueDataFunc(&value2), str1.c_str(), value2.size_bytes), 0);

buffer_view = ArrowArrayViewGetBytesUnsafe(&array_view, 4);
EXPECT_EQ(buffer_view.size_bytes, str1.size());
EXPECT_EQ(memcmp(buffer_view.data.as_char, str1.c_str(), buffer_view.size_bytes), 0);

buffer_view = ArrowArrayViewGetBytesUnsafe(&array_view, 6);
EXPECT_EQ(buffer_view.size_bytes, str2.size());
EXPECT_EQ(memcmp(buffer_view.data.as_char, str2.c_str(), buffer_view.size_bytes), 0);
const auto value3 = GetValueFunc(&array_view, 6);
EXPECT_EQ(value3.size_bytes, str2.size());
EXPECT_EQ(memcmp(GetValueDataFunc(&value3), str2.c_str(), value3.size_bytes), 0);

ArrowArrayViewReset(&array_view);
ArrowArrayRelease(&array);
Expand All @@ -3411,10 +3484,22 @@ void TestGetFromBinaryView(BuilderClass& builder) {

TEST(ArrayViewTest, ArrayViewTestGetStringView) {
auto string_view_builder = StringViewBuilder();
TestGetFromBinaryView<StringViewBuilder>(string_view_builder);
const auto get_string_view = [](const struct ArrowStringView* sv) { return sv->data; };
TestGetFromInlinedBinaryView<StringViewBuilder, struct ArrowStringView>(
string_view_builder, ArrowArrayViewGetStringUnsafe, get_string_view);
TestGetFromBinaryView<StringViewBuilder, struct ArrowStringView>(
string_view_builder, ArrowArrayViewGetStringUnsafe, get_string_view);
}

TEST(ArrayViewTest, ArrayViewTestGetBinaryView) {
auto binary_view_builder = BinaryViewBuilder();
TestGetFromBinaryView<BinaryViewBuilder>(binary_view_builder);
const auto get_buffer_view = [](const struct ArrowBufferView* bv) {
return bv->data.data;
};
TestGetFromInlinedBinaryView<BinaryViewBuilder, struct ArrowBufferView>(
binary_view_builder, ArrowArrayViewGetBytesUnsafe, get_buffer_view);
TestGetFromBinaryView<BinaryViewBuilder, struct ArrowBufferView>(
binary_view_builder, ArrowArrayViewGetBytesUnsafe, get_buffer_view);
}

TEST(ArrayViewTest, ArrayViewTestGetIntervalYearMonth) {
Expand Down
13 changes: 8 additions & 5 deletions src/nanoarrow/common/inline_array.h
Original file line number Diff line number Diff line change
Expand Up @@ -468,6 +468,8 @@ static inline ArrowErrorCode ArrowArrayAppendDouble(struct ArrowArray* array,
return NANOARROW_OK;
}

// Binary views only have two fixed buffers, but be aware that they must also
// always have more 1 buffer to store variadic buffer sizes (even if there are none)
#define NANOARROW_BINARY_VIEW_FIXED_BUFFERS 2
#define NANOARROW_BINARY_VIEW_INLINE_SIZE 12
#define NANOARROW_BINARY_VIEW_PREFIX_SIZE 4
Expand Down Expand Up @@ -504,27 +506,28 @@ static inline int32_t ArrowArrayVariadicBufferCount(struct ArrowArray* array) {
static inline ArrowErrorCode ArrowArrayAddVariadicBuffers(struct ArrowArray* array,
int32_t nbuffers) {
const int32_t n_current_bufs = ArrowArrayVariadicBufferCount(array);
const int32_t n_bufs_needed = n_current_bufs + nbuffers;
const int32_t nvariadic_bufs_needed = n_current_bufs + nbuffers;

struct ArrowArrayPrivateData* private_data =
(struct ArrowArrayPrivateData*)array->private_data;

private_data->variadic_buffers = (struct ArrowBuffer*)ArrowRealloc(
private_data->variadic_buffers, sizeof(struct ArrowBuffer) * n_bufs_needed);
private_data->variadic_buffers, sizeof(struct ArrowBuffer) * nvariadic_bufs_needed);
if (private_data->variadic_buffers == NULL) {
return ENOMEM;
}
private_data->variadic_buffer_sizes = (int64_t*)ArrowRealloc(
private_data->variadic_buffer_sizes, sizeof(int64_t) * n_bufs_needed);
private_data->variadic_buffer_sizes, sizeof(int64_t) * nvariadic_bufs_needed);
if (private_data->variadic_buffer_sizes == NULL) {
return ENOMEM;
}

for (int32_t i = n_current_bufs; i < n_bufs_needed; i++) {
for (int32_t i = n_current_bufs; i < nvariadic_bufs_needed; i++) {
ArrowBufferInit(&private_data->variadic_buffers[i]);
private_data->variadic_buffer_sizes[i] = 0;
}
private_data->n_variadic_buffers = n_bufs_needed;
private_data->n_variadic_buffers = nvariadic_bufs_needed;
array->n_buffers = NANOARROW_BINARY_VIEW_FIXED_BUFFERS + 1 + nvariadic_bufs_needed;

return NANOARROW_OK;
}
Expand Down

0 comments on commit 97e7c61

Please sign in to comment.