From bc8f990b8d97728859f604c6da2ab14c3b778284 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 27 Nov 2023 16:35:32 -0400 Subject: [PATCH 01/19] first stabs at buffer converters --- src/nanoarrow/nanoarrow_testing.hpp | 136 ++++++++++++++++++++++++++++ 1 file changed, 136 insertions(+) diff --git a/src/nanoarrow/nanoarrow_testing.hpp b/src/nanoarrow/nanoarrow_testing.hpp index 454a6da04..f1746458c 100644 --- a/src/nanoarrow/nanoarrow_testing.hpp +++ b/src/nanoarrow/nanoarrow_testing.hpp @@ -16,6 +16,7 @@ // under the License. #include +#include #include #include @@ -1053,6 +1054,141 @@ class TestingJSONReader { return NANOARROW_OK; } + template + ArrowErrorCode SetBufferInt(const json& value, ArrowBuffer* buffer, ArrowError* error) { + // NANOARROW_RETURN_NOT_OK() interacts poorly with multiple template args + using SetItem = SetBufferIntItem; + NANOARROW_RETURN_NOT_OK(Check(value.is_array(), error, "int buffer must be array")); + + for (const auto& item : value) { + NANOARROW_RETURN_NOT_OK(SetItem(item, buffer, error)); + } + + return NANOARROW_OK; + } + + template + ArrowErrorCode SetBufferIntItem(const json& item, ArrowBuffer* buffer, + ArrowError* error) { + if (item.is_string()) { + try { + // The JSON parser here can handle up to 2^64 - 1 + auto item_int = json::parse(item.get()); + return SetBufferIntItem(item_int, buffer, error); + } catch (json::parse_error& e) { + ArrowErrorSet(error, + "integer buffer item encoded as string must parse as integer: %s", + item.dump().c_str()); + return EINVAL; + } + } + + NANOARROW_RETURN_NOT_OK( + Check(item.is_number_integer(), error, + "integer buffer item must be integer number or string")); + auto item_int = item.get(); + + NANOARROW_RETURN_NOT_OK(Check(item_int >= std::numeric_limits::lowest() && + item_int <= std::numeric_limits::max(), + error, "integer buffer item outside type limits")) + + T buffer_value = item_int; + NANOARROW_RETURN_NOT_OK_WITH_ERROR( + ArrowBufferAppend(buffer, &buffer_value, sizeof(T)), error); + + return NANOARROW_OK; + } + + template + ArrowErrorCode SetBuffersString(const json& value, ArrowBuffer* offsets, + ArrowBuffer* data, ArrowError* error) { + NANOARROW_RETURN_NOT_OK( + Check(value.is_array(), error, "utf8 data buffer must be array")); + int64_t last_offset = 0; + T offset_buffer_value = static_cast(last_offset); + NANOARROW_RETURN_NOT_OK_WITH_ERROR( + ArrowBufferAppend(offsets, &offset_buffer_value, sizeof(T)), error); + + for (const auto& item : value) { + NANOARROW_RETURN_NOT_OK( + Check(value.is_string(), error, "utf8 data buffer item must be string")); + auto item_str = item.get(); + + // Append data + NANOARROW_RETURN_NOT_OK_WITH_ERROR( + ArrowBufferAppend(data, reinterpret_cast(item_str.data()), + item.size()), + error); + + // Append offset + last_offset += item_str.size(); + offset_buffer_value = static_cast(last_offset); + NANOARROW_RETURN_NOT_OK_WITH_ERROR( + ArrowBufferAppend(offsets, &offset_buffer_value, sizeof(T)), error); + } + + // Check if overflow occurred + NANOARROW_RETURN_NOT_OK( + Check(last_offset <= std::numeric_limits::max(), error, + "utf8 data buffer overflowed maximum value of offset type")); + + return NANOARROW_OK; + } + + template + ArrowErrorCode SetBuffersBinary(const json& value, ArrowBuffer* offsets, + ArrowBuffer* data, ArrowError* error) { + NANOARROW_RETURN_NOT_OK( + Check(value.is_array(), error, "binary data buffer must be array")); + int64_t last_offset = 0; + T offset_buffer_value = static_cast(last_offset); + NANOARROW_RETURN_NOT_OK_WITH_ERROR( + ArrowBufferAppend(offsets, &offset_buffer_value, sizeof(T)), error); + + for (const auto& item : value) { + NANOARROW_RETURN_NOT_OK( + Check(value.is_string(), error, "binary data buffer item must be string")); + auto item_str = item.get(); + + int64_t item_size_bytes = item_str.size() / 2; + NANOARROW_RETURN_NOT_OK(Check((item_size_bytes * 2) == item_str.size(), error, + "binary data buffer item must have even size")); + + NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowBufferReserve(data, item_size_bytes), + error); + for (int64_t i = 0; i < item_str.size(); i += 2) { + std::string byte_hex = item_str.substr(i, 2); + char* end_ptr; + uint8_t byte = std::strtoul(byte_hex.data(), &end_ptr, 16); + NANOARROW_RETURN_NOT_OK(Check( + end_ptr != (byte_hex.data() == 2), error, + "binary data buffer item must contain a valid hex-encoded byte string")); + + data->data[data->size_bytes] = byte; + data->size_bytes++; + } + + // Append data + NANOARROW_RETURN_NOT_OK_WITH_ERROR( + ArrowBufferAppend(data, reinterpret_cast(item_str.data()), + item.size()), + error); + + // Append offset + last_offset += item_size_bytes; + offset_buffer_value = static_cast(last_offset); + NANOARROW_RETURN_NOT_OK_WITH_ERROR( + ArrowBufferAppend(offsets, &offset_buffer_value, sizeof(T)), error); + } + + // Check if overflow occurred + NANOARROW_RETURN_NOT_OK( + Check(last_offset <= std::numeric_limits::max(), error, + "binary data buffer overflowed maximum value of offset type")); + + return NANOARROW_OK; + } + ArrowErrorCode Check(bool value, ArrowError* error, const std::string& err) { if (value) { return NANOARROW_OK; From 19f684cf4036c5a2a93a27c0f6814402fba06044 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 27 Nov 2023 16:41:36 -0400 Subject: [PATCH 02/19] bitmap setter --- src/nanoarrow/nanoarrow_testing.hpp | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/nanoarrow/nanoarrow_testing.hpp b/src/nanoarrow/nanoarrow_testing.hpp index f1746458c..59b6df29e 100644 --- a/src/nanoarrow/nanoarrow_testing.hpp +++ b/src/nanoarrow/nanoarrow_testing.hpp @@ -1054,6 +1054,21 @@ class TestingJSONReader { return NANOARROW_OK; } + ArrowErrorCode SetBufferBitmap(const json& value, ArrowBitmap* bitmap, + ArrowError* error) { + NANOARROW_RETURN_NOT_OK( + Check(value.is_array(), error, "bitmap buffer must be array")); + + for (const auto& item : value) { + NANOARROW_RETURN_NOT_OK(Check(item.is_boolean() || item.is_number_integer(), error, + "bitmap item must be bool or integer")); + NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowBitmapAppend(bitmap, item.get(), 1), + error); + } + + return NANOARROW_OK; + } + template ArrowErrorCode SetBufferInt(const json& value, ArrowBuffer* buffer, ArrowError* error) { // NANOARROW_RETURN_NOT_OK() interacts poorly with multiple template args From e710119f2b078395b3b845acde878435189cc79a Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 27 Nov 2023 16:44:05 -0400 Subject: [PATCH 03/19] building --- src/nanoarrow/nanoarrow_testing.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/nanoarrow/nanoarrow_testing.hpp b/src/nanoarrow/nanoarrow_testing.hpp index 59b6df29e..9f4effff0 100644 --- a/src/nanoarrow/nanoarrow_testing.hpp +++ b/src/nanoarrow/nanoarrow_testing.hpp @@ -1071,12 +1071,12 @@ class TestingJSONReader { template ArrowErrorCode SetBufferInt(const json& value, ArrowBuffer* buffer, ArrowError* error) { - // NANOARROW_RETURN_NOT_OK() interacts poorly with multiple template args - using SetItem = SetBufferIntItem; NANOARROW_RETURN_NOT_OK(Check(value.is_array(), error, "int buffer must be array")); for (const auto& item : value) { - NANOARROW_RETURN_NOT_OK(SetItem(item, buffer, error)); + // NANOARROW_RETURN_NOT_OK() interacts poorly with multiple template args + ArrowErrorCode result = SetBufferIntItem(item, buffer, error); + NANOARROW_RETURN_NOT_OK(result); } return NANOARROW_OK; @@ -1105,7 +1105,7 @@ class TestingJSONReader { NANOARROW_RETURN_NOT_OK(Check(item_int >= std::numeric_limits::lowest() && item_int <= std::numeric_limits::max(), - error, "integer buffer item outside type limits")) + error, "integer buffer item outside type limits")); T buffer_value = item_int; NANOARROW_RETURN_NOT_OK_WITH_ERROR( @@ -1176,7 +1176,7 @@ class TestingJSONReader { char* end_ptr; uint8_t byte = std::strtoul(byte_hex.data(), &end_ptr, 16); NANOARROW_RETURN_NOT_OK(Check( - end_ptr != (byte_hex.data() == 2), error, + end_ptr == (byte_hex.data() + 2), error, "binary data buffer item must contain a valid hex-encoded byte string")); data->data[data->size_bytes] = byte; From 893d5ba962f3fe4c41326e39400b8a7befb6adf5 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 28 Nov 2023 13:26:45 -0400 Subject: [PATCH 04/19] more cases --- src/nanoarrow/nanoarrow_testing.hpp | 191 ++++++++++++++++++++++++++++ 1 file changed, 191 insertions(+) diff --git a/src/nanoarrow/nanoarrow_testing.hpp b/src/nanoarrow/nanoarrow_testing.hpp index 9f4effff0..09a075e00 100644 --- a/src/nanoarrow/nanoarrow_testing.hpp +++ b/src/nanoarrow/nanoarrow_testing.hpp @@ -1054,12 +1054,193 @@ class TestingJSONReader { return NANOARROW_OK; } + ArrowErrorCode SetArrayColumn(const json& value, ArrowArrayView* array_view, + ArrowArray* array, ArrowError* error, + const std::string& parent_error_prefix = "") { + NANOARROW_RETURN_NOT_OK( + Check(value.is_object(), error, "Expected Column to be a JSON object")); + + // Check + resolve name early to generate better error messages + NANOARROW_RETURN_NOT_OK( + Check(value.contains("name"), error, "Column missing key 'name'")); + + const auto& name = value["name"]; + NANOARROW_RETURN_NOT_OK(Check(name.is_null() || name.is_string(), error, + "Column name must be string or null")); + + std::string error_prefix; + if (name.is_string()) { + error_prefix = parent_error_prefix + "-> Column '" + name.get() + "' "; + } else { + error_prefix = parent_error_prefix + "-> Column "; + } + + // Check, resolve, and recurse children + NANOARROW_RETURN_NOT_OK( + Check(array_view->n_children > 0 || value.contains("children"), error, + error_prefix + "missing key children")); + + if (value.contains("children")) { + const auto& children = value["children"]; + NANOARROW_RETURN_NOT_OK( + Check(children.is_array(), error, error_prefix + "children must be array")); + NANOARROW_RETURN_NOT_OK(Check(children.size() == array_view->n_children, error, + error_prefix + "children has incorrect size")); + + for (int64_t i = 0; i < array_view->n_children; i++) { + NANOARROW_RETURN_NOT_OK(SetArrayColumn(children[i], array_view->children[i], + array->children[i], error, error_prefix)); + } + } + + // Build buffers + for (int i = 0; i < 3; i++) { + NANOARROW_RETURN_NOT_OK( + PrefixError(SetArrayColumnBuffers(value, array_view, array, i, error), error, + error_prefix)); + } + + // Check + resolve count + NANOARROW_RETURN_NOT_OK( + Check(value.contains("count"), error, error_prefix + "missing key 'count'")); + const auto& count = value["count"]; + NANOARROW_RETURN_NOT_OK( + Check(count.is_number_integer(), error, error_prefix + "count must be integer")); + array_view->length = count.get(); + + // Set ArrayView buffer views. This is because ArrowArrayInitFromSchema() doesn't + // support custom type ids for unions but the ArrayView does (otherwise + // ArrowArrayFinishBuilding() would work). + for (int i = 0; i < 3; i++) { + ArrowBuffer* buffer = ArrowArrayBuffer(array, i); + ArrowBufferView* buffer_view = array_view->buffer_views + i; + buffer_view->data.as_uint8 = buffer->data; + buffer_view->size_bytes = buffer->size_bytes; + } + + // Validate the array view + NANOARROW_RETURN_NOT_OK(PrefixError( + ArrowArrayViewValidate(array_view, NANOARROW_VALIDATION_LEVEL_FULL, error), error, + "failed to validate: ")); + + // Flush length and buffer pointers to the Array + array->length = array_view->length; + NANOARROW_RETURN_NOT_OK_WITH_ERROR( + ArrowArrayFinishBuilding(array, NANOARROW_VALIDATION_LEVEL_NONE, nullptr), error); + + return NANOARROW_OK; + } + + ArrowErrorCode SetArrayColumnBuffers(const json& value, ArrowArrayView* array_view, + ArrowArray* array, int buffer_i, + ArrowError* error) { + ArrowBuffer* buffer = ArrowArrayBuffer(array, buffer_i); + + switch (array_view->layout.buffer_type[buffer_i]) { + case NANOARROW_BUFFER_TYPE_VALIDITY: { + NANOARROW_RETURN_NOT_OK( + Check(value.contains("VALIDITY"), error, "missing key 'VALIDITY'")); + const auto& validity = value["VALIDITY"]; + NANOARROW_RETURN_NOT_OK( + SetBufferBitmap(validity, ArrowArrayValidityBitmap(array), error)); + break; + } + case NANOARROW_BUFFER_TYPE_TYPE_ID: { + NANOARROW_RETURN_NOT_OK( + Check(value.contains("TYPE_ID"), error, "missing key 'TYPE_ID'")); + const auto& type_id = value["TYPE_ID"]; + NANOARROW_RETURN_NOT_OK(SetBufferInt(type_id, buffer, error)); + break; + } + case NANOARROW_BUFFER_TYPE_UNION_OFFSET: { + NANOARROW_RETURN_NOT_OK( + Check(value.contains("OFFSET"), error, "missing key 'OFFSET'")); + const auto& offset = value["OFFSET"]; + NANOARROW_RETURN_NOT_OK(SetBufferInt(offset, buffer, error)); + break; + } + case NANOARROW_BUFFER_TYPE_DATA_OFFSET: { + // String/Binary just encodes values, not offset + data. + switch (array_view->storage_type) { + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_LARGE_STRING: + case NANOARROW_TYPE_BINARY: + case NANOARROW_TYPE_LARGE_BINARY: + return NANOARROW_OK; + default: + break; + } + + NANOARROW_RETURN_NOT_OK( + Check(value.contains("OFFSET"), error, "missing key 'OFFSET'")); + const auto& offset = value["OFFSET"]; + + if (array_view->layout.element_size_bits[buffer_i] == 32) { + NANOARROW_RETURN_NOT_OK(SetBufferInt(offset, buffer, error)); + } else { + NANOARROW_RETURN_NOT_OK(SetBufferInt(offset, buffer, error)); + } + break; + } + + case NANOARROW_BUFFER_TYPE_DATA: { + NANOARROW_RETURN_NOT_OK( + Check(value.contains("DATA"), error, "missing key 'DATA'")); + const auto& data = value["DATA"]; + + switch (array_view->storage_type) { + case NANOARROW_TYPE_INT8: + return SetBufferInt(data, buffer, error); + case NANOARROW_TYPE_UINT8: + return SetBufferInt(data, buffer, error); + case NANOARROW_TYPE_INT16: + return SetBufferInt(data, buffer, error); + case NANOARROW_TYPE_UINT16: + return SetBufferInt(data, buffer, error); + case NANOARROW_TYPE_INT32: + return SetBufferInt(data, buffer, error); + case NANOARROW_TYPE_UINT32: + return SetBufferInt(data, buffer, error); + case NANOARROW_TYPE_INT64: + return SetBufferInt(data, buffer, error); + case NANOARROW_TYPE_UINT64: + return SetBufferInt(data, buffer, error); + + case NANOARROW_TYPE_STRING: + return SetBuffersString(data, ArrowArrayBuffer(array, buffer_i - 1), + buffer, error); + case NANOARROW_TYPE_LARGE_STRING: + return SetBuffersString(data, ArrowArrayBuffer(array, buffer_i - 1), + buffer, error); + case NANOARROW_TYPE_BINARY: + return SetBuffersBinary(data, ArrowArrayBuffer(array, buffer_i - 1), + buffer, error); + case NANOARROW_TYPE_LARGE_BINARY: + return SetBuffersBinary(data, ArrowArrayBuffer(array, buffer_i - 1), + buffer, error); + + default: + ArrowErrorSet(error, "storage type %s DATA buffer not supported", + ArrowTypeString(array_view->storage_type)); + return ENOTSUP; + } + break; + } + case NANOARROW_BUFFER_TYPE_NONE: + break; + } + + return NANOARROW_OK; + } + ArrowErrorCode SetBufferBitmap(const json& value, ArrowBitmap* bitmap, ArrowError* error) { NANOARROW_RETURN_NOT_OK( Check(value.is_array(), error, "bitmap buffer must be array")); for (const auto& item : value) { + // Some example files write bitmaps as [true, false, true] but the documentation + // says [1, 0, 1]. Accept both for simplicity. NANOARROW_RETURN_NOT_OK(Check(item.is_boolean() || item.is_number_integer(), error, "bitmap item must be bool or integer")); NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowBitmapAppend(bitmap, item.get(), 1), @@ -1204,6 +1385,16 @@ class TestingJSONReader { return NANOARROW_OK; } + ArrowErrorCode PrefixError(ArrowErrorCode value, ArrowError* error, + const std::string& prefix) { + if (value != NANOARROW_OK && error != nullptr) { + std::string msg = prefix + error->message; + ArrowErrorSet(error, "%s", msg.c_str()); + } + + return value; + } + ArrowErrorCode Check(bool value, ArrowError* error, const std::string& err) { if (value) { return NANOARROW_OK; From 88f993142c3324a6a2b62d13ad611e27f890b2f6 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 28 Nov 2023 14:54:19 -0400 Subject: [PATCH 05/19] one basic column test --- src/nanoarrow/nanoarrow_testing.hpp | 31 +++++++++++++++++-- src/nanoarrow/nanoarrow_testing_test.cc | 40 +++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 2 deletions(-) diff --git a/src/nanoarrow/nanoarrow_testing.hpp b/src/nanoarrow/nanoarrow_testing.hpp index 09a075e00..9a902504b 100644 --- a/src/nanoarrow/nanoarrow_testing.hpp +++ b/src/nanoarrow/nanoarrow_testing.hpp @@ -652,6 +652,33 @@ class TestingJSONReader { } } + ArrowErrorCode ReadColumn(const std::string& value, const ArrowSchema* schema, + ArrowArray* out, ArrowError* error = nullptr) { + try { + auto obj = json::parse(value); + + // ArrowArrayView to enable validation + nanoarrow::UniqueArrayView array_view; + NANOARROW_RETURN_NOT_OK(ArrowArrayViewInitFromSchema( + array_view.get(), const_cast(schema), error)); + + // ArrowArray to hold memory + nanoarrow::UniqueArray array; + NANOARROW_RETURN_NOT_OK( + ArrowArrayInitFromSchema(array.get(), const_cast(schema), error)); + + // Parse the JSON into the array + NANOARROW_RETURN_NOT_OK(SetArrayColumn(obj, array_view.get(), array.get(), error)); + + // Return the result + ArrowArrayMove(array.get(), out); + return NANOARROW_OK; + } catch (std::exception& e) { + ArrowErrorSet(error, "Exception in TestingJSONReader::ReadColumn(): %s", e.what()); + return EINVAL; + } + } + private: ArrowErrorCode SetSchema(ArrowSchema* schema, const json& value, ArrowError* error) { NANOARROW_RETURN_NOT_OK( @@ -1077,7 +1104,7 @@ class TestingJSONReader { // Check, resolve, and recurse children NANOARROW_RETURN_NOT_OK( - Check(array_view->n_children > 0 || value.contains("children"), error, + Check(array_view->n_children == 0 || value.contains("children"), error, error_prefix + "missing key children")); if (value.contains("children")) { @@ -1121,7 +1148,7 @@ class TestingJSONReader { // Validate the array view NANOARROW_RETURN_NOT_OK(PrefixError( ArrowArrayViewValidate(array_view, NANOARROW_VALIDATION_LEVEL_FULL, error), error, - "failed to validate: ")); + error_prefix + "failed to validate: ")); // Flush length and buffer pointers to the Array array->length = array_view->length; diff --git a/src/nanoarrow/nanoarrow_testing_test.cc b/src/nanoarrow/nanoarrow_testing_test.cc index 73c8f79ca..a1a404e52 100644 --- a/src/nanoarrow/nanoarrow_testing_test.cc +++ b/src/nanoarrow/nanoarrow_testing_test.cc @@ -912,3 +912,43 @@ TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldUnion) { TestTypeError(R"({"name": "union", "mode": "NOT_A_MODE", "typeIds": []})", "Type[name=='union'] mode must be 'DENSE' or 'SPARSE'"); } + +TEST(NanoarrowTestingTest, NanoarrowTestingTestReadColumnBasic) { + nanoarrow::UniqueSchema schema; + nanoarrow::UniqueArray array; + ArrowError error; + error.message[0] = '\0'; + + TestingJSONReader reader; + + ASSERT_EQ( + reader.ReadField( + R"({"name": null, "nullable": true, "type": {"name": "null"}, "children": [], "metadata": null})", + schema.get()), + NANOARROW_OK); + + ASSERT_EQ(reader.ReadColumn(R"({"name": null, "count": 2})", schema.get(), array.get(), + &error), + NANOARROW_OK) + << error.message; + EXPECT_EQ(array->length, 2); + + // Check invalid JSON + EXPECT_EQ(reader.ReadColumn(R"({)", schema.get(), array.get()), EINVAL); + + // Check at least one failed Check() + EXPECT_EQ( + reader.ReadColumn(R"("this is not a JSON object")", schema.get(), array.get()), + EINVAL); + + // Check at least one failed PrefixError() + EXPECT_EQ(reader.ReadColumn(R"({"name": "colname", "count": "not an integer"})", + schema.get(), array.get(), &error), + EINVAL); + EXPECT_STREQ(error.message, "-> Column 'colname' count must be integer"); + + // Check that field is validated + EXPECT_EQ( + reader.ReadColumn(R"({"name": null, "count": -1})", schema.get(), array.get()), + EINVAL); +} From 31100a558e36c367a4708775f5ea56199e49422b Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 28 Nov 2023 14:59:14 -0400 Subject: [PATCH 06/19] move length/offset validation to array view --- src/nanoarrow/array.c | 25 ++++++++++++------------- src/nanoarrow/array_test.cc | 4 ++-- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/src/nanoarrow/array.c b/src/nanoarrow/array.c index 1e59777ea..6ca33bcda 100644 --- a/src/nanoarrow/array.c +++ b/src/nanoarrow/array.c @@ -671,19 +671,6 @@ void ArrowArrayViewSetLength(struct ArrowArrayView* array_view, int64_t length) static int ArrowArrayViewSetArrayInternal(struct ArrowArrayView* array_view, struct ArrowArray* array, struct ArrowError* error) { - // Check length and offset - if (array->offset < 0) { - ArrowErrorSet(error, "Expected array offset >= 0 but found array offset of %ld", - (long)array->offset); - return EINVAL; - } - - if (array->length < 0) { - ArrowErrorSet(error, "Expected array length >= 0 but found array length of %ld", - (long)array->length); - return EINVAL; - } - array_view->array = array; array_view->offset = array->offset; array_view->length = array->length; @@ -749,6 +736,18 @@ static int ArrowArrayViewSetArrayInternal(struct ArrowArrayView* array_view, static int ArrowArrayViewValidateMinimal(struct ArrowArrayView* array_view, struct ArrowError* error) { + if (array_view->length < 0) { + ArrowErrorSet(error, "Expected length >= 0 but found length %ld", + (long)array_view->length); + return EINVAL; + } + + if (array_view->offset < 0) { + ArrowErrorSet(error, "Expected offset >= 0 but found offset %ld", + (long)array_view->offset); + return EINVAL; + } + // Calculate buffer sizes that do not require buffer access. If marked as // unknown, assign the buffer size; otherwise, validate it. int64_t offset_plus_length = array_view->offset + array_view->length; diff --git a/src/nanoarrow/array_test.cc b/src/nanoarrow/array_test.cc index 658595781..6a2200809 100644 --- a/src/nanoarrow/array_test.cc +++ b/src/nanoarrow/array_test.cc @@ -1589,12 +1589,12 @@ TEST(ArrayTest, ArrayViewTestBasic) { // Expect error for bad offset + length array.length = -1; EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), EINVAL); - EXPECT_STREQ(error.message, "Expected array length >= 0 but found array length of -1"); + EXPECT_STREQ(error.message, "Expected length >= 0 but found length -1"); array.length = 3; array.offset = -1; EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), EINVAL); - EXPECT_STREQ(error.message, "Expected array offset >= 0 but found array offset of -1"); + EXPECT_STREQ(error.message, "Expected offset >= 0 but found offset -1"); array.offset = 0; // Expect error for the wrong number of buffers From 7953e9bf0ff99f6ad88fd320f8dc7e441eab002d Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 28 Nov 2023 15:18:06 -0400 Subject: [PATCH 07/19] tack on roundtrip for column json --- src/nanoarrow/nanoarrow_testing_test.cc | 119 ++++++++++++++---------- 1 file changed, 70 insertions(+), 49 deletions(-) diff --git a/src/nanoarrow/nanoarrow_testing_test.cc b/src/nanoarrow/nanoarrow_testing_test.cc index a1a404e52..791048708 100644 --- a/src/nanoarrow/nanoarrow_testing_test.cc +++ b/src/nanoarrow/nanoarrow_testing_test.cc @@ -747,26 +747,87 @@ TEST(NanoarrowTestingTest, NanoarrowTestingTestReadFieldNested) { EXPECT_STREQ(schema->children[0]->format, "n"); } -void TestFieldRoundtrip(const std::string& field_json) { +TEST(NanoarrowTestingTest, NanoarrowTestingTestReadColumnBasic) { + nanoarrow::UniqueSchema schema; + nanoarrow::UniqueArray array; + ArrowError error; + error.message[0] = '\0'; + + TestingJSONReader reader; + + ASSERT_EQ( + reader.ReadField( + R"({"name": null, "nullable": true, "type": {"name": "null"}, "children": [], "metadata": null})", + schema.get()), + NANOARROW_OK); + + ASSERT_EQ(reader.ReadColumn(R"({"name": null, "count": 2})", schema.get(), array.get(), + &error), + NANOARROW_OK) + << error.message; + EXPECT_EQ(array->length, 2); + + // Check invalid JSON + EXPECT_EQ(reader.ReadColumn(R"({)", schema.get(), array.get()), EINVAL); + + // Check at least one failed Check() + EXPECT_EQ( + reader.ReadColumn(R"("this is not a JSON object")", schema.get(), array.get()), + EINVAL); + + // Check at least one failed PrefixError() + EXPECT_EQ(reader.ReadColumn(R"({"name": "colname", "count": "not an integer"})", + schema.get(), array.get(), &error), + EINVAL); + EXPECT_STREQ(error.message, "-> Column 'colname' count must be integer"); + + // Check that field is validated + EXPECT_EQ( + reader.ReadColumn(R"({"name": null, "count": -1})", schema.get(), array.get()), + EINVAL); +} + +void TestFieldRoundtrip(const std::string& field_json, + const std::string& column_json = "") { nanoarrow::UniqueSchema schema; TestingJSONReader reader; TestingJSONWriter writer; ArrowError error; error.message[0] = '\0'; - int result = reader.ReadField(field_json, schema.get(), &error); - ASSERT_EQ(result, NANOARROW_OK) << "Error: " << error.message; + ASSERT_EQ(reader.ReadField(field_json, schema.get(), &error), NANOARROW_OK) + << "Error: " << error.message; + + std::stringstream json_roundtrip; + ASSERT_EQ(writer.WriteField(json_roundtrip, schema.get()), NANOARROW_OK); + EXPECT_EQ(json_roundtrip.str(), field_json); + + if (column_json == "") { + return; + } + + nanoarrow::UniqueArray array; + ASSERT_EQ(reader.ReadColumn(column_json, schema.get(), array.get(), &error), + NANOARROW_OK) + << error.message; + + nanoarrow::UniqueArrayView array_view; + ASSERT_EQ(ArrowArrayViewInitFromSchema(array_view.get(), schema.get(), nullptr), + NANOARROW_OK); + ASSERT_EQ(ArrowArrayViewSetArray(array_view.get(), array.get(), nullptr), NANOARROW_OK); - std::stringstream field_json_roundtrip; - ASSERT_EQ(writer.WriteField(field_json_roundtrip, schema.get()), NANOARROW_OK); - EXPECT_EQ(field_json_roundtrip.str(), field_json); + json_roundtrip.str(""); + ASSERT_EQ(writer.WriteColumn(json_roundtrip, schema.get(), array_view.get()), + NANOARROW_OK); + EXPECT_EQ(json_roundtrip.str(), column_json); } -void TestTypeRoundtrip(const std::string& type_json) { +void TestTypeRoundtrip(const std::string& type_json, + const std::string& column_json = "") { std::stringstream field_json_builder; field_json_builder << R"({"name": null, "nullable": true, "type": )" << type_json << R"(, "children": [], "metadata": null})"; - TestFieldRoundtrip(field_json_builder.str()); + TestFieldRoundtrip(field_json_builder.str(), column_json); } void TestFieldError(const std::string& field_json, const std::string& msg, @@ -789,7 +850,7 @@ void TestTypeError(const std::string& type_json, const std::string& msg, } TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldPrimitive) { - TestTypeRoundtrip(R"({"name": "null"})"); + TestTypeRoundtrip(R"({"name": "null"})", R"({"name": null, "count": 2})"); TestTypeRoundtrip(R"({"name": "bool"})"); TestTypeRoundtrip(R"({"name": "utf8"})"); TestTypeRoundtrip(R"({"name": "largeutf8"})"); @@ -912,43 +973,3 @@ TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldUnion) { TestTypeError(R"({"name": "union", "mode": "NOT_A_MODE", "typeIds": []})", "Type[name=='union'] mode must be 'DENSE' or 'SPARSE'"); } - -TEST(NanoarrowTestingTest, NanoarrowTestingTestReadColumnBasic) { - nanoarrow::UniqueSchema schema; - nanoarrow::UniqueArray array; - ArrowError error; - error.message[0] = '\0'; - - TestingJSONReader reader; - - ASSERT_EQ( - reader.ReadField( - R"({"name": null, "nullable": true, "type": {"name": "null"}, "children": [], "metadata": null})", - schema.get()), - NANOARROW_OK); - - ASSERT_EQ(reader.ReadColumn(R"({"name": null, "count": 2})", schema.get(), array.get(), - &error), - NANOARROW_OK) - << error.message; - EXPECT_EQ(array->length, 2); - - // Check invalid JSON - EXPECT_EQ(reader.ReadColumn(R"({)", schema.get(), array.get()), EINVAL); - - // Check at least one failed Check() - EXPECT_EQ( - reader.ReadColumn(R"("this is not a JSON object")", schema.get(), array.get()), - EINVAL); - - // Check at least one failed PrefixError() - EXPECT_EQ(reader.ReadColumn(R"({"name": "colname", "count": "not an integer"})", - schema.get(), array.get(), &error), - EINVAL); - EXPECT_STREQ(error.message, "-> Column 'colname' count must be integer"); - - // Check that field is validated - EXPECT_EQ( - reader.ReadColumn(R"({"name": null, "count": -1})", schema.get(), array.get()), - EINVAL); -} From 8964f2c6dd09f9b5123ad2a4e12b14820673d29f Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 28 Nov 2023 15:40:42 -0400 Subject: [PATCH 08/19] with integer tests --- src/nanoarrow/nanoarrow_testing.hpp | 8 +++--- src/nanoarrow/nanoarrow_testing_test.cc | 35 ++++++++++++++++++------- 2 files changed, 31 insertions(+), 12 deletions(-) diff --git a/src/nanoarrow/nanoarrow_testing.hpp b/src/nanoarrow/nanoarrow_testing.hpp index 9a902504b..6f461a1df 100644 --- a/src/nanoarrow/nanoarrow_testing.hpp +++ b/src/nanoarrow/nanoarrow_testing.hpp @@ -1311,9 +1311,11 @@ class TestingJSONReader { "integer buffer item must be integer number or string")); auto item_int = item.get(); - NANOARROW_RETURN_NOT_OK(Check(item_int >= std::numeric_limits::lowest() && - item_int <= std::numeric_limits::max(), - error, "integer buffer item outside type limits")); + NANOARROW_RETURN_NOT_OK(Check( + item_int >= std::numeric_limits::lowest() && + item_int <= std::numeric_limits::max(), + error, + "integer buffer item '" + std::to_string(item_int) + "' outside type limits")); T buffer_value = item_int; NANOARROW_RETURN_NOT_OK_WITH_ERROR( diff --git a/src/nanoarrow/nanoarrow_testing_test.cc b/src/nanoarrow/nanoarrow_testing_test.cc index 791048708..f39546abd 100644 --- a/src/nanoarrow/nanoarrow_testing_test.cc +++ b/src/nanoarrow/nanoarrow_testing_test.cc @@ -862,20 +862,36 @@ TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldPrimitive) { } TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldInt) { - TestTypeRoundtrip(R"({"name": "int", "bitWidth": 8, "isSigned": true})"); - TestTypeRoundtrip(R"({"name": "int", "bitWidth": 16, "isSigned": true})"); - TestTypeRoundtrip(R"({"name": "int", "bitWidth": 32, "isSigned": true})"); - TestTypeRoundtrip(R"({"name": "int", "bitWidth": 64, "isSigned": true})"); + TestTypeRoundtrip( + R"({"name": "int", "bitWidth": 8, "isSigned": true})", + R"({"name": null, "count": 3, "VALIDITY": [1, 1, 1], "DATA": [-128, 0, 127]})"); + TestTypeRoundtrip( + R"({"name": "int", "bitWidth": 16, "isSigned": true})", + R"({"name": null, "count": 3, "VALIDITY": [1, 1, 1], "DATA": [-129, 0, 127]})"); + TestTypeRoundtrip( + R"({"name": "int", "bitWidth": 32, "isSigned": true})", + R"({"name": null, "count": 3, "VALIDITY": [1, 1, 1], "DATA": [-130, 0, 127]})"); + TestTypeRoundtrip( + R"({"name": "int", "bitWidth": 64, "isSigned": true})", + R"({"name": null, "count": 3, "VALIDITY": [1, 1, 1], "DATA": ["-131", "0", "127"]})"); TestTypeError(R"({"name": "int", "bitWidth": 1, "isSigned": true})", "Type[name=='int'] bitWidth must be 8, 16, 32, or 64"); } TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldUInt) { - TestTypeRoundtrip(R"({"name": "int", "bitWidth": 8, "isSigned": false})"); - TestTypeRoundtrip(R"({"name": "int", "bitWidth": 16, "isSigned": false})"); - TestTypeRoundtrip(R"({"name": "int", "bitWidth": 32, "isSigned": false})"); - TestTypeRoundtrip(R"({"name": "int", "bitWidth": 64, "isSigned": false})"); + TestTypeRoundtrip( + R"({"name": "int", "bitWidth": 8, "isSigned": false})", + R"({"name": null, "count": 3, "VALIDITY": [0, 1, 1], "DATA": [0, 0, 255]})"); + TestTypeRoundtrip( + R"({"name": "int", "bitWidth": 16, "isSigned": false})", + R"({"name": null, "count": 3, "VALIDITY": [0, 1, 1], "DATA": [0, 0, 256]})"); + TestTypeRoundtrip( + R"({"name": "int", "bitWidth": 32, "isSigned": false})", + R"({"name": null, "count": 3, "VALIDITY": [0, 1, 1], "DATA": [0, 0, 257]})"); + TestTypeRoundtrip( + R"({"name": "int", "bitWidth": 64, "isSigned": false})", + R"({"name": null, "count": 3, "VALIDITY": [0, 1, 1], "DATA": ["0", "0", "258"]})"); TestTypeError(R"({"name": "int", "bitWidth": 1, "isSigned": false})", "Type[name=='int'] bitWidth must be 8, 16, 32, or 64"); @@ -929,7 +945,8 @@ TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldStruct) { // Empty TestFieldRoundtrip( R"({"name": null, "nullable": true, "type": {"name": "struct"}, "children": [)" - R"(], "metadata": null})"); + R"(], "metadata": null})", + R"({"name": null, "count": 0, "VALIDITY": [], "children": []})"); // Non-empty TestFieldRoundtrip( From fd07808c7aa0d0b8576c52b47658ca11353521d0 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 28 Nov 2023 15:47:57 -0400 Subject: [PATCH 09/19] better error for signed integer in uint64 column --- src/nanoarrow/nanoarrow_testing.hpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/nanoarrow/nanoarrow_testing.hpp b/src/nanoarrow/nanoarrow_testing.hpp index 6f461a1df..9771ecbcd 100644 --- a/src/nanoarrow/nanoarrow_testing.hpp +++ b/src/nanoarrow/nanoarrow_testing.hpp @@ -1309,6 +1309,11 @@ class TestingJSONReader { NANOARROW_RETURN_NOT_OK( Check(item.is_number_integer(), error, "integer buffer item must be integer number or string")); + NANOARROW_RETURN_NOT_OK( + Check(std::numeric_limits::is_signed || item.is_number_unsigned(), error, + "expected unsigned integer buffer item but found signed integer '" + + item.dump() + "'")); + auto item_int = item.get(); NANOARROW_RETURN_NOT_OK(Check( From 00dea858fdf2fbd62499289eb9ec8e63635f5c4c Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 28 Nov 2023 16:29:02 -0400 Subject: [PATCH 10/19] fix string/binary --- src/nanoarrow/nanoarrow_testing.hpp | 66 ++++++++++++------------- src/nanoarrow/nanoarrow_testing_test.cc | 19 +++++-- 2 files changed, 47 insertions(+), 38 deletions(-) diff --git a/src/nanoarrow/nanoarrow_testing.hpp b/src/nanoarrow/nanoarrow_testing.hpp index 9771ecbcd..4736324ef 100644 --- a/src/nanoarrow/nanoarrow_testing.hpp +++ b/src/nanoarrow/nanoarrow_testing.hpp @@ -1187,17 +1187,6 @@ class TestingJSONReader { break; } case NANOARROW_BUFFER_TYPE_DATA_OFFSET: { - // String/Binary just encodes values, not offset + data. - switch (array_view->storage_type) { - case NANOARROW_TYPE_STRING: - case NANOARROW_TYPE_LARGE_STRING: - case NANOARROW_TYPE_BINARY: - case NANOARROW_TYPE_LARGE_BINARY: - return NANOARROW_OK; - default: - break; - } - NANOARROW_RETURN_NOT_OK( Check(value.contains("OFFSET"), error, "missing key 'OFFSET'")); const auto& offset = value["OFFSET"]; @@ -1334,34 +1323,36 @@ class TestingJSONReader { ArrowBuffer* data, ArrowError* error) { NANOARROW_RETURN_NOT_OK( Check(value.is_array(), error, "utf8 data buffer must be array")); + + // Check offsets against values + const T* expected_offset = reinterpret_cast(offsets->data); + NANOARROW_RETURN_NOT_OK(Check( + offsets->size_bytes == ((value.size() + 1) * sizeof(T)), error, + "Expected offset buffer with " + std::to_string(value.size()) + " elements")); + NANOARROW_RETURN_NOT_OK( + Check(*expected_offset++ == 0, error, "first offset must be zero")); + int64_t last_offset = 0; - T offset_buffer_value = static_cast(last_offset); - NANOARROW_RETURN_NOT_OK_WITH_ERROR( - ArrowBufferAppend(offsets, &offset_buffer_value, sizeof(T)), error); for (const auto& item : value) { NANOARROW_RETURN_NOT_OK( - Check(value.is_string(), error, "utf8 data buffer item must be string")); + Check(item.is_string(), error, "utf8 data buffer item must be string")); auto item_str = item.get(); // Append data NANOARROW_RETURN_NOT_OK_WITH_ERROR( ArrowBufferAppend(data, reinterpret_cast(item_str.data()), - item.size()), + item_str.size()), error); - // Append offset + // Check offset last_offset += item_str.size(); - offset_buffer_value = static_cast(last_offset); - NANOARROW_RETURN_NOT_OK_WITH_ERROR( - ArrowBufferAppend(offsets, &offset_buffer_value, sizeof(T)), error); + NANOARROW_RETURN_NOT_OK(Check(*expected_offset++ == last_offset, error, + "Expected offset value " + + std::to_string(last_offset) + + " at utf8 data buffer item " + item.dump())); } - // Check if overflow occurred - NANOARROW_RETURN_NOT_OK( - Check(last_offset <= std::numeric_limits::max(), error, - "utf8 data buffer overflowed maximum value of offset type")); - return NANOARROW_OK; } @@ -1370,14 +1361,20 @@ class TestingJSONReader { ArrowBuffer* data, ArrowError* error) { NANOARROW_RETURN_NOT_OK( Check(value.is_array(), error, "binary data buffer must be array")); + + // Check offsets against values + const T* expected_offset = reinterpret_cast(offsets->data); + NANOARROW_RETURN_NOT_OK(Check( + offsets->size_bytes == ((value.size() + 1) * sizeof(T)), error, + "Expected offset buffer with " + std::to_string(value.size()) + " elements")); + NANOARROW_RETURN_NOT_OK( + Check(*expected_offset++ == 0, error, "first offset must be zero")); + int64_t last_offset = 0; - T offset_buffer_value = static_cast(last_offset); - NANOARROW_RETURN_NOT_OK_WITH_ERROR( - ArrowBufferAppend(offsets, &offset_buffer_value, sizeof(T)), error); for (const auto& item : value) { NANOARROW_RETURN_NOT_OK( - Check(value.is_string(), error, "binary data buffer item must be string")); + Check(item.is_string(), error, "binary data buffer item must be string")); auto item_str = item.get(); int64_t item_size_bytes = item_str.size() / 2; @@ -1401,14 +1398,15 @@ class TestingJSONReader { // Append data NANOARROW_RETURN_NOT_OK_WITH_ERROR( ArrowBufferAppend(data, reinterpret_cast(item_str.data()), - item.size()), + item_size_bytes), error); - // Append offset + // Check offset last_offset += item_size_bytes; - offset_buffer_value = static_cast(last_offset); - NANOARROW_RETURN_NOT_OK_WITH_ERROR( - ArrowBufferAppend(offsets, &offset_buffer_value, sizeof(T)), error); + NANOARROW_RETURN_NOT_OK(Check(*expected_offset++ == last_offset, error, + "Expected offset value " + + std::to_string(last_offset) + + " at binary data buffer item " + item.dump())); } // Check if overflow occurred diff --git a/src/nanoarrow/nanoarrow_testing_test.cc b/src/nanoarrow/nanoarrow_testing_test.cc index f39546abd..a4b048849 100644 --- a/src/nanoarrow/nanoarrow_testing_test.cc +++ b/src/nanoarrow/nanoarrow_testing_test.cc @@ -852,15 +852,26 @@ void TestTypeError(const std::string& type_json, const std::string& msg, TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldPrimitive) { TestTypeRoundtrip(R"({"name": "null"})", R"({"name": null, "count": 2})"); TestTypeRoundtrip(R"({"name": "bool"})"); - TestTypeRoundtrip(R"({"name": "utf8"})"); - TestTypeRoundtrip(R"({"name": "largeutf8"})"); - TestTypeRoundtrip(R"({"name": "binary"})"); - TestTypeRoundtrip(R"({"name": "largebinary"})"); TestTypeError(R"({"name": "an unsupported type"})", "Unsupported Type name: 'an unsupported type'", ENOTSUP); } +TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldString) { + TestTypeRoundtrip( + R"({"name": "utf8"})", + R"({"name": null, "count": 2, "VALIDITY": [1, 0], "OFFSET": [0, 3, 3], "DATA": ["abc", ""]})"); + TestTypeRoundtrip( + R"({"name": "largeutf8"})", + R"({"name": null, "count": 2, "VALIDITY": [1, 0], "OFFSET": ["0", "3", "3"], "DATA": ["abc", ""]})"); + TestTypeRoundtrip( + R"({"name": "binary"})", + R"({"name": null, "count": 2, "VALIDITY": [1, 0], "OFFSET": [0, 3, 3], "DATA": ["00FFA0", ""]})"); + TestTypeRoundtrip( + R"({"name": "largebinary"})", + R"({"name": null, "count": 2, "VALIDITY": [1, 0], "OFFSET": ["0", "3", "3"], "DATA": ["00FFA0", ""]})"); +} + TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldInt) { TestTypeRoundtrip( R"({"name": "int", "bitWidth": 8, "isSigned": true})", From 46e7c5fe75d3ae0a7e4eb31e8bcbac2fd15f289d Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 28 Nov 2023 16:48:04 -0400 Subject: [PATCH 11/19] fixed size binary --- src/nanoarrow/nanoarrow_testing.hpp | 54 +++++++++++++------------ src/nanoarrow/nanoarrow_testing_test.cc | 4 +- 2 files changed, 32 insertions(+), 26 deletions(-) diff --git a/src/nanoarrow/nanoarrow_testing.hpp b/src/nanoarrow/nanoarrow_testing.hpp index 4736324ef..87738535b 100644 --- a/src/nanoarrow/nanoarrow_testing.hpp +++ b/src/nanoarrow/nanoarrow_testing.hpp @@ -1234,6 +1234,10 @@ class TestingJSONReader { case NANOARROW_TYPE_LARGE_BINARY: return SetBuffersBinary(data, ArrowArrayBuffer(array, buffer_i - 1), buffer, error); + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + return SetBuffersBinary( + data, nullptr, buffer, error, + array_view->layout.element_size_bits[buffer_i] / 8); default: ArrowErrorSet(error, "storage type %s DATA buffer not supported", @@ -1358,17 +1362,21 @@ class TestingJSONReader { template ArrowErrorCode SetBuffersBinary(const json& value, ArrowBuffer* offsets, - ArrowBuffer* data, ArrowError* error) { + ArrowBuffer* data, ArrowError* error, + int64_t fixed_size = 0) { NANOARROW_RETURN_NOT_OK( Check(value.is_array(), error, "binary data buffer must be array")); - // Check offsets against values - const T* expected_offset = reinterpret_cast(offsets->data); - NANOARROW_RETURN_NOT_OK(Check( - offsets->size_bytes == ((value.size() + 1) * sizeof(T)), error, - "Expected offset buffer with " + std::to_string(value.size()) + " elements")); - NANOARROW_RETURN_NOT_OK( - Check(*expected_offset++ == 0, error, "first offset must be zero")); + // Check offsets against values if not fixed size + const T* expected_offset = nullptr; + if (fixed_size == 0) { + expected_offset = reinterpret_cast(offsets->data); + NANOARROW_RETURN_NOT_OK(Check( + offsets->size_bytes == ((value.size() + 1) * sizeof(T)), error, + "Expected offset buffer with " + std::to_string(value.size()) + " elements")); + NANOARROW_RETURN_NOT_OK( + Check(*expected_offset++ == 0, error, "first offset must be zero")); + } int64_t last_offset = 0; @@ -1395,25 +1403,21 @@ class TestingJSONReader { data->size_bytes++; } - // Append data - NANOARROW_RETURN_NOT_OK_WITH_ERROR( - ArrowBufferAppend(data, reinterpret_cast(item_str.data()), - item_size_bytes), - error); - - // Check offset - last_offset += item_size_bytes; - NANOARROW_RETURN_NOT_OK(Check(*expected_offset++ == last_offset, error, - "Expected offset value " + - std::to_string(last_offset) + - " at binary data buffer item " + item.dump())); + // Check offset or fixed size + if (fixed_size == 0) { + last_offset += item_size_bytes; + NANOARROW_RETURN_NOT_OK(Check(*expected_offset++ == last_offset, error, + "Expected offset value " + + std::to_string(last_offset) + + " at binary data buffer item " + item.dump())); + } else { + NANOARROW_RETURN_NOT_OK(Check(item_size_bytes == fixed_size, error, + "Expected fixed size binary value of size " + + std::to_string(fixed_size) + + " at binary data buffer item " + item.dump())); + } } - // Check if overflow occurred - NANOARROW_RETURN_NOT_OK( - Check(last_offset <= std::numeric_limits::max(), error, - "binary data buffer overflowed maximum value of offset type")); - return NANOARROW_OK; } diff --git a/src/nanoarrow/nanoarrow_testing_test.cc b/src/nanoarrow/nanoarrow_testing_test.cc index a4b048849..d52ffbf51 100644 --- a/src/nanoarrow/nanoarrow_testing_test.cc +++ b/src/nanoarrow/nanoarrow_testing_test.cc @@ -919,7 +919,9 @@ TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldFloatingPoint) { } TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldFixedSizeBinary) { - TestTypeRoundtrip(R"({"name": "fixedsizebinary", "byteWidth": 123})"); + TestTypeRoundtrip( + R"({"name": "fixedsizebinary", "byteWidth": 3})", + R"({"name": null, "count": 2, "VALIDITY": [1, 0], "DATA": ["00FFA0", "000000"]})"); } TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldDecimal) { From 07a1fa56d610e47f4d9d44acb945dce32440cad5 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 28 Nov 2023 17:03:47 -0400 Subject: [PATCH 12/19] floatingpoint --- src/nanoarrow/nanoarrow_testing.hpp | 44 ++++++++++++++++++++++--- src/nanoarrow/nanoarrow_testing_test.cc | 17 +++++++--- 2 files changed, 52 insertions(+), 9 deletions(-) diff --git a/src/nanoarrow/nanoarrow_testing.hpp b/src/nanoarrow/nanoarrow_testing.hpp index 87738535b..3db97ce16 100644 --- a/src/nanoarrow/nanoarrow_testing.hpp +++ b/src/nanoarrow/nanoarrow_testing.hpp @@ -1205,6 +1205,12 @@ class TestingJSONReader { const auto& data = value["DATA"]; switch (array_view->storage_type) { + case NANOARROW_TYPE_BOOL: { + nanoarrow::UniqueBitmap bitmap; + NANOARROW_RETURN_NOT_OK(SetBufferBitmap(data, bitmap.get(), error)); + ArrowBufferMove(&bitmap->buffer, buffer); + return NANOARROW_OK; + } case NANOARROW_TYPE_INT8: return SetBufferInt(data, buffer, error); case NANOARROW_TYPE_UINT8: @@ -1222,6 +1228,11 @@ class TestingJSONReader { case NANOARROW_TYPE_UINT64: return SetBufferInt(data, buffer, error); + case NANOARROW_TYPE_FLOAT: + return SetBufferFloatingPoint(data, buffer, error); + case NANOARROW_TYPE_DOUBLE: + return SetBufferFloatingPoint(data, buffer, error); + case NANOARROW_TYPE_STRING: return SetBuffersString(data, ArrowArrayBuffer(array, buffer_i - 1), buffer, error); @@ -1309,11 +1320,10 @@ class TestingJSONReader { auto item_int = item.get(); - NANOARROW_RETURN_NOT_OK(Check( - item_int >= std::numeric_limits::lowest() && - item_int <= std::numeric_limits::max(), - error, - "integer buffer item '" + std::to_string(item_int) + "' outside type limits")); + NANOARROW_RETURN_NOT_OK( + Check(item_int >= std::numeric_limits::lowest() && + item_int <= std::numeric_limits::max(), + error, "integer buffer item '" + item.dump() + "' outside type limits")); T buffer_value = item_int; NANOARROW_RETURN_NOT_OK_WITH_ERROR( @@ -1322,6 +1332,30 @@ class TestingJSONReader { return NANOARROW_OK; } + template + ArrowErrorCode SetBufferFloatingPoint(const json& value, ArrowBuffer* buffer, + ArrowError* error) { + NANOARROW_RETURN_NOT_OK( + Check(value.is_array(), error, "floatingpoint buffer must be array")); + + for (const auto& item : value) { + NANOARROW_RETURN_NOT_OK( + Check(item.is_number(), error, "floatingpoint buffer item must be number")); + double item_dbl = item.get(); + + NANOARROW_RETURN_NOT_OK(Check( + item_dbl >= std::numeric_limits::lowest() && + item_dbl <= std::numeric_limits::max(), + error, "floatingpoint buffer item '" + item.dump() + "' outside type limits")); + + T buffer_value = item_dbl; + NANOARROW_RETURN_NOT_OK_WITH_ERROR( + ArrowBufferAppend(buffer, &buffer_value, sizeof(T)), error); + } + + return NANOARROW_OK; + } + template ArrowErrorCode SetBuffersString(const json& value, ArrowBuffer* offsets, ArrowBuffer* data, ArrowError* error) { diff --git a/src/nanoarrow/nanoarrow_testing_test.cc b/src/nanoarrow/nanoarrow_testing_test.cc index d52ffbf51..d9ed452eb 100644 --- a/src/nanoarrow/nanoarrow_testing_test.cc +++ b/src/nanoarrow/nanoarrow_testing_test.cc @@ -849,14 +849,19 @@ void TestTypeError(const std::string& type_json, const std::string& msg, TestFieldError(field_json_builder.str(), msg, code); } -TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldPrimitive) { +TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldNull) { TestTypeRoundtrip(R"({"name": "null"})", R"({"name": null, "count": 2})"); - TestTypeRoundtrip(R"({"name": "bool"})"); TestTypeError(R"({"name": "an unsupported type"})", "Unsupported Type name: 'an unsupported type'", ENOTSUP); } +TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldBool) { + TestTypeRoundtrip( + R"({"name": "bool"})", + R"({"name": null, "count": 3, "VALIDITY": [0, 1, 1], "DATA": [0, 1, 0]})"); +} + TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldString) { TestTypeRoundtrip( R"({"name": "utf8"})", @@ -910,8 +915,12 @@ TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldUInt) { TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldFloatingPoint) { TestTypeRoundtrip(R"({"name": "floatingpoint", "precision": "HALF"})"); - TestTypeRoundtrip(R"({"name": "floatingpoint", "precision": "SINGLE"})"); - TestTypeRoundtrip(R"({"name": "floatingpoint", "precision": "DOUBLE"})"); + TestTypeRoundtrip( + R"({"name": "floatingpoint", "precision": "SINGLE"})", + R"({"name": null, "count": 3, "VALIDITY": [0, 1, 1], "DATA": [0.000, 1.230, 4.560]})"); + TestTypeRoundtrip( + R"({"name": "floatingpoint", "precision": "DOUBLE"})", + R"({"name": null, "count": 3, "VALIDITY": [0, 1, 1], "DATA": [0.000, 1.230, 4.560]})"); TestTypeError( R"({"name": "floatingpoint", "precision": "NOT_A_PRECISION"})", From 112287530053851a47072aa4c731026a67530089 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 29 Nov 2023 10:12:18 -0400 Subject: [PATCH 13/19] review comments --- .../src/nanoarrow/nanoarrow_device.c | 2 +- .../src/nanoarrow/nanoarrow_ipc_decoder.c | 4 +-- src/nanoarrow/array.c | 8 ++--- src/nanoarrow/array_inline.h | 6 ++-- src/nanoarrow/nanoarrow_testing.hpp | 29 ++++++++++++++----- src/nanoarrow/nanoarrow_types.h | 20 +++++++++---- 6 files changed, 45 insertions(+), 24 deletions(-) diff --git a/extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.c b/extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.c index 4be7a9374..c4df3d144 100644 --- a/extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.c +++ b/extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.c @@ -408,7 +408,7 @@ static ArrowErrorCode ArrowDeviceArrayViewCopyInternal(struct ArrowDevice* devic dst->offset = src->offset; dst->null_count = src->null_count; - for (int i = 0; i < 3; i++) { + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { if (src->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_NONE) { break; } diff --git a/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder.c b/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder.c index 2fac3c7d7..9e4a6c9b5 100644 --- a/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder.c +++ b/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder.c @@ -1155,7 +1155,7 @@ static void ArrowIpcDecoderInitFields(struct ArrowIpcField* fields, field->array = array; field->buffer_offset = *n_buffers; - for (int i = 0; i < 3; i++) { + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { *n_buffers += array_view->layout.buffer_type[i] != NANOARROW_BUFFER_TYPE_NONE; } @@ -1524,7 +1524,7 @@ static int ArrowIpcDecoderWalkSetArrayView(struct ArrowIpcArraySetter* setter, array_view->null_count = ns(FieldNode_null_count(field)); setter->field_i += 1; - for (int64_t i = 0; i < 3; i++) { + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_NONE) { break; } diff --git a/src/nanoarrow/array.c b/src/nanoarrow/array.c index 6ca33bcda..a0e711eca 100644 --- a/src/nanoarrow/array.c +++ b/src/nanoarrow/array.c @@ -437,7 +437,7 @@ static void ArrowArrayFlushInternalPointers(struct ArrowArray* array) { struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; - for (int64_t i = 0; i < 3; i++) { + for (int64_t i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { private_data->buffer_data[i] = ArrowArrayBuffer(array, i)->data; } @@ -621,7 +621,7 @@ void ArrowArrayViewReset(struct ArrowArrayView* array_view) { } void ArrowArrayViewSetLength(struct ArrowArrayView* array_view, int64_t length) { - for (int i = 0; i < 3; i++) { + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { int64_t element_size_bytes = array_view->layout.element_size_bits[i] / 8; switch (array_view->layout.buffer_type[i]) { @@ -677,7 +677,7 @@ static int ArrowArrayViewSetArrayInternal(struct ArrowArrayView* array_view, array_view->null_count = array->null_count; int64_t buffers_required = 0; - for (int i = 0; i < 3; i++) { + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_NONE) { break; } @@ -1102,7 +1102,7 @@ static int ArrowAssertInt8In(struct ArrowBufferView view, const int8_t* values, static int ArrowArrayViewValidateFull(struct ArrowArrayView* array_view, struct ArrowError* error) { - for (int i = 0; i < 3; i++) { + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { switch (array_view->layout.buffer_type[i]) { case NANOARROW_BUFFER_TYPE_DATA_OFFSET: if (array_view->layout.element_size_bits[i] == 32) { diff --git a/src/nanoarrow/array_inline.h b/src/nanoarrow/array_inline.h index 96fdf573b..c089d2bf0 100644 --- a/src/nanoarrow/array_inline.h +++ b/src/nanoarrow/array_inline.h @@ -140,7 +140,7 @@ static inline ArrowErrorCode ArrowArrayStartAppending(struct ArrowArray* array) } // Initialize any data offset buffer with a single zero - for (int i = 0; i < 3; i++) { + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { if (private_data->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_DATA_OFFSET && private_data->layout.element_size_bits[i] == 64) { NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt64(ArrowArrayBuffer(array, i), 0)); @@ -163,7 +163,7 @@ static inline ArrowErrorCode ArrowArrayStartAppending(struct ArrowArray* array) } static inline ArrowErrorCode ArrowArrayShrinkToFit(struct ArrowArray* array) { - for (int64_t i = 0; i < 3; i++) { + for (int64_t i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { struct ArrowBuffer* buffer = ArrowArrayBuffer(array, i); NANOARROW_RETURN_NOT_OK(ArrowBufferResize(buffer, buffer->size_bytes, 1)); } @@ -278,7 +278,7 @@ static inline ArrowErrorCode _ArrowArrayAppendEmptyInternal(struct ArrowArray* a struct ArrowBuffer* buffer; int64_t size_bytes; - for (int i = 0; i < 3; i++) { + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { buffer = ArrowArrayBuffer(array, i); size_bytes = private_data->layout.element_size_bits[i] / 8; diff --git a/src/nanoarrow/nanoarrow_testing.hpp b/src/nanoarrow/nanoarrow_testing.hpp index 3db97ce16..1b360f335 100644 --- a/src/nanoarrow/nanoarrow_testing.hpp +++ b/src/nanoarrow/nanoarrow_testing.hpp @@ -622,10 +622,14 @@ class TestingJSONReader { using json = nlohmann::json; public: - ArrowErrorCode ReadSchema(const std::string& value, ArrowSchema* out, + /// \brief Read JSON representing a Schema + /// + /// Reads a JSON object in the form `{"fields": [...], "metadata": [...]}`, + /// propagating `out` on success. + ArrowErrorCode ReadSchema(const std::string& schema_json, ArrowSchema* out, ArrowError* error = nullptr) { try { - auto obj = json::parse(value); + auto obj = json::parse(schema_json); nanoarrow::UniqueSchema schema; NANOARROW_RETURN_NOT_OK(SetSchema(schema.get(), obj, error)); @@ -637,10 +641,14 @@ class TestingJSONReader { } } - ArrowErrorCode ReadField(const std::string& value, ArrowSchema* out, + /// \brief Read JSON representing a Field + /// + /// Read a JSON object in the form `{"name" : "col", "type": {...}, ...}`, + /// propagating `out` on success. + ArrowErrorCode ReadField(const std::string& field_json, ArrowSchema* out, ArrowError* error = nullptr) { try { - auto obj = json::parse(value); + auto obj = json::parse(field_json); nanoarrow::UniqueSchema schema; NANOARROW_RETURN_NOT_OK(SetField(schema.get(), obj, error)); @@ -652,10 +660,15 @@ class TestingJSONReader { } } - ArrowErrorCode ReadColumn(const std::string& value, const ArrowSchema* schema, + /// \brief Read JSON representing a Column + /// + /// Read a JSON object in the form + /// `{"name": "col", "count": 123, "VALIDITY": [...], ...}`, propagating + /// `out` on success. + ArrowErrorCode ReadColumn(const std::string& column_json, const ArrowSchema* schema, ArrowArray* out, ArrowError* error = nullptr) { try { - auto obj = json::parse(value); + auto obj = json::parse(column_json); // ArrowArrayView to enable validation nanoarrow::UniqueArrayView array_view; @@ -1121,7 +1134,7 @@ class TestingJSONReader { } // Build buffers - for (int i = 0; i < 3; i++) { + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { NANOARROW_RETURN_NOT_OK( PrefixError(SetArrayColumnBuffers(value, array_view, array, i, error), error, error_prefix)); @@ -1138,7 +1151,7 @@ class TestingJSONReader { // Set ArrayView buffer views. This is because ArrowArrayInitFromSchema() doesn't // support custom type ids for unions but the ArrayView does (otherwise // ArrowArrayFinishBuilding() would work). - for (int i = 0; i < 3; i++) { + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { ArrowBuffer* buffer = ArrowArrayBuffer(array, i); ArrowBufferView* buffer_view = array_view->buffer_views + i; buffer_view->data.as_uint8 = buffer->data; diff --git a/src/nanoarrow/nanoarrow_types.h b/src/nanoarrow/nanoarrow_types.h index e96207ead..2ea16b2c5 100644 --- a/src/nanoarrow/nanoarrow_types.h +++ b/src/nanoarrow/nanoarrow_types.h @@ -449,6 +449,14 @@ enum ArrowBufferType { NANOARROW_BUFFER_TYPE_DATA }; +/// \brief The maximum number of buffers in an ArrowArrayView or ArrowLayout +/// \ingroup nanoarrow-array-view +/// +/// All currently supported types have 3 buffers or fewer; however, future types +/// may involve a variable number of buffers (e.g., string view). These buffers +/// will be represented by separate members of the ArrowArrayView or ArrowLayout. +#define NANOARROW_MAX_FIXED_BUFFERS 3 + /// \brief An non-owning view of a string /// \ingroup nanoarrow-utils struct ArrowStringView { @@ -561,13 +569,13 @@ struct ArrowBitmap { /// the length and offset of the array. struct ArrowLayout { /// \brief The function of each buffer - enum ArrowBufferType buffer_type[3]; + enum ArrowBufferType buffer_type[NANOARROW_MAX_FIXED_BUFFERS]; /// \brief The data type of each buffer - enum ArrowType buffer_data_type[3]; + enum ArrowType buffer_data_type[NANOARROW_MAX_FIXED_BUFFERS]; /// \brief The size of an element each buffer or 0 if this size is variable or unknown - int64_t element_size_bits[3]; + int64_t element_size_bits[NANOARROW_MAX_FIXED_BUFFERS]; /// \brief The number of elements in the child array per element in this array for a /// fixed-size list @@ -609,7 +617,7 @@ struct ArrowArrayView { struct ArrowLayout layout; /// \brief This Array's buffers as ArrowBufferView objects - struct ArrowBufferView buffer_views[3]; + struct ArrowBufferView buffer_views[NANOARROW_MAX_FIXED_BUFFERS]; /// \brief The number of children of this view int64_t n_children; @@ -637,12 +645,12 @@ struct ArrowArrayPrivateData { struct ArrowBitmap bitmap; // Holder for additional buffers as required - struct ArrowBuffer buffers[2]; + struct ArrowBuffer buffers[NANOARROW_MAX_FIXED_BUFFERS - 1]; // The array of pointers to buffers. This must be updated after a sequence // of appends to synchronize its values with the actual buffer addresses // (which may have ben reallocated uring that time) - const void* buffer_data[3]; + const void* buffer_data[NANOARROW_MAX_FIXED_BUFFERS]; // The storage data type, or NANOARROW_TYPE_UNINITIALIZED if unknown enum ArrowType storage_type; From a96f9bd722c13ff7e507e206045565351d0db8d7 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 29 Nov 2023 10:30:07 -0400 Subject: [PATCH 14/19] separate fixed size and non-fixed size --- src/nanoarrow/nanoarrow_testing.hpp | 124 +++++++++++++++------------- 1 file changed, 68 insertions(+), 56 deletions(-) diff --git a/src/nanoarrow/nanoarrow_testing.hpp b/src/nanoarrow/nanoarrow_testing.hpp index 1b360f335..e50ea1fd8 100644 --- a/src/nanoarrow/nanoarrow_testing.hpp +++ b/src/nanoarrow/nanoarrow_testing.hpp @@ -1247,21 +1247,20 @@ class TestingJSONReader { return SetBufferFloatingPoint(data, buffer, error); case NANOARROW_TYPE_STRING: - return SetBuffersString(data, ArrowArrayBuffer(array, buffer_i - 1), - buffer, error); + return SetBufferString(data, ArrowArrayBuffer(array, buffer_i - 1), + buffer, error); case NANOARROW_TYPE_LARGE_STRING: - return SetBuffersString(data, ArrowArrayBuffer(array, buffer_i - 1), - buffer, error); + return SetBufferString(data, ArrowArrayBuffer(array, buffer_i - 1), + buffer, error); case NANOARROW_TYPE_BINARY: - return SetBuffersBinary(data, ArrowArrayBuffer(array, buffer_i - 1), - buffer, error); + return SetBufferBinary(data, ArrowArrayBuffer(array, buffer_i - 1), + buffer, error); case NANOARROW_TYPE_LARGE_BINARY: - return SetBuffersBinary(data, ArrowArrayBuffer(array, buffer_i - 1), - buffer, error); + return SetBufferBinary(data, ArrowArrayBuffer(array, buffer_i - 1), + buffer, error); case NANOARROW_TYPE_FIXED_SIZE_BINARY: - return SetBuffersBinary( - data, nullptr, buffer, error, - array_view->layout.element_size_bits[buffer_i] / 8); + return SetBufferFixedSizeBinary( + data, buffer, array_view->layout.element_size_bits[buffer_i] / 8, error); default: ArrowErrorSet(error, "storage type %s DATA buffer not supported", @@ -1370,8 +1369,8 @@ class TestingJSONReader { } template - ArrowErrorCode SetBuffersString(const json& value, ArrowBuffer* offsets, - ArrowBuffer* data, ArrowError* error) { + ArrowErrorCode SetBufferString(const json& value, ArrowBuffer* offsets, + ArrowBuffer* data, ArrowError* error) { NANOARROW_RETURN_NOT_OK( Check(value.is_array(), error, "utf8 data buffer must be array")); @@ -1408,61 +1407,74 @@ class TestingJSONReader { } template - ArrowErrorCode SetBuffersBinary(const json& value, ArrowBuffer* offsets, - ArrowBuffer* data, ArrowError* error, - int64_t fixed_size = 0) { + ArrowErrorCode SetBufferBinary(const json& value, ArrowBuffer* offsets, + ArrowBuffer* data, ArrowError* error) { NANOARROW_RETURN_NOT_OK( Check(value.is_array(), error, "binary data buffer must be array")); // Check offsets against values if not fixed size - const T* expected_offset = nullptr; - if (fixed_size == 0) { - expected_offset = reinterpret_cast(offsets->data); - NANOARROW_RETURN_NOT_OK(Check( - offsets->size_bytes == ((value.size() + 1) * sizeof(T)), error, - "Expected offset buffer with " + std::to_string(value.size()) + " elements")); - NANOARROW_RETURN_NOT_OK( - Check(*expected_offset++ == 0, error, "first offset must be zero")); + const T* expected_offset = reinterpret_cast(offsets->data); + NANOARROW_RETURN_NOT_OK(Check( + offsets->size_bytes == ((value.size() + 1) * sizeof(T)), error, + "Expected offset buffer with " + std::to_string(value.size()) + " elements")); + NANOARROW_RETURN_NOT_OK( + Check(*expected_offset++ == 0, error, "first offset must be zero")); + + for (const auto& item : value) { + NANOARROW_RETURN_NOT_OK(AppendBinaryElement(item, data, error)); + + // Check offset + NANOARROW_RETURN_NOT_OK(Check(*expected_offset++ == data->size_bytes, error, + "Expected offset value " + + std::to_string(data->size_bytes) + + " at binary data buffer item " + item.dump())); } + return NANOARROW_OK; + } + + ArrowErrorCode SetBufferFixedSizeBinary(const json& value, ArrowBuffer* data, + int64_t fixed_size, ArrowError* error) { + NANOARROW_RETURN_NOT_OK( + Check(value.is_array(), error, "binary data buffer must be array")); + int64_t last_offset = 0; for (const auto& item : value) { - NANOARROW_RETURN_NOT_OK( - Check(item.is_string(), error, "binary data buffer item must be string")); - auto item_str = item.get(); + NANOARROW_RETURN_NOT_OK(AppendBinaryElement(item, data, error)); + int64_t item_size_bytes = data->size_bytes - last_offset; + + NANOARROW_RETURN_NOT_OK(Check(item_size_bytes == fixed_size, error, + "Expected fixed size binary value of size " + + std::to_string(fixed_size) + + " at binary data buffer item " + item.dump())); + last_offset = data->size_bytes; + } - int64_t item_size_bytes = item_str.size() / 2; - NANOARROW_RETURN_NOT_OK(Check((item_size_bytes * 2) == item_str.size(), error, - "binary data buffer item must have even size")); + return NANOARROW_OK; + } - NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowBufferReserve(data, item_size_bytes), - error); - for (int64_t i = 0; i < item_str.size(); i += 2) { - std::string byte_hex = item_str.substr(i, 2); - char* end_ptr; - uint8_t byte = std::strtoul(byte_hex.data(), &end_ptr, 16); - NANOARROW_RETURN_NOT_OK(Check( - end_ptr == (byte_hex.data() + 2), error, - "binary data buffer item must contain a valid hex-encoded byte string")); - - data->data[data->size_bytes] = byte; - data->size_bytes++; - } + ArrowErrorCode AppendBinaryElement(const json& item, ArrowBuffer* data, + ArrowError* error) { + NANOARROW_RETURN_NOT_OK( + Check(item.is_string(), error, "binary data buffer item must be string")); + auto item_str = item.get(); + + int64_t item_size_bytes = item_str.size() / 2; + NANOARROW_RETURN_NOT_OK(Check((item_size_bytes * 2) == item_str.size(), error, + "binary data buffer item must have even size")); + + NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowBufferReserve(data, item_size_bytes), error); + for (int64_t i = 0; i < item_str.size(); i += 2) { + std::string byte_hex = item_str.substr(i, 2); + char* end_ptr; + uint8_t byte = std::strtoul(byte_hex.data(), &end_ptr, 16); + NANOARROW_RETURN_NOT_OK( + Check(end_ptr == (byte_hex.data() + 2), error, + "binary data buffer item must contain a valid hex-encoded byte string")); - // Check offset or fixed size - if (fixed_size == 0) { - last_offset += item_size_bytes; - NANOARROW_RETURN_NOT_OK(Check(*expected_offset++ == last_offset, error, - "Expected offset value " + - std::to_string(last_offset) + - " at binary data buffer item " + item.dump())); - } else { - NANOARROW_RETURN_NOT_OK(Check(item_size_bytes == fixed_size, error, - "Expected fixed size binary value of size " + - std::to_string(fixed_size) + - " at binary data buffer item " + item.dump())); - } + data->data[data->size_bytes] = byte; + data->size_bytes++; } return NANOARROW_OK; From 0b79088a8f83a920a0543ad81d1326abbb8f046a Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 29 Nov 2023 10:32:03 -0400 Subject: [PATCH 15/19] use json::exception --- src/nanoarrow/nanoarrow_testing.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/nanoarrow/nanoarrow_testing.hpp b/src/nanoarrow/nanoarrow_testing.hpp index e50ea1fd8..bfb15462d 100644 --- a/src/nanoarrow/nanoarrow_testing.hpp +++ b/src/nanoarrow/nanoarrow_testing.hpp @@ -635,7 +635,7 @@ class TestingJSONReader { NANOARROW_RETURN_NOT_OK(SetSchema(schema.get(), obj, error)); ArrowSchemaMove(schema.get(), out); return NANOARROW_OK; - } catch (std::exception& e) { + } catch (json::exception& e) { ArrowErrorSet(error, "Exception in TestingJSONReader::ReadSchema(): %s", e.what()); return EINVAL; } @@ -654,7 +654,7 @@ class TestingJSONReader { NANOARROW_RETURN_NOT_OK(SetField(schema.get(), obj, error)); ArrowSchemaMove(schema.get(), out); return NANOARROW_OK; - } catch (std::exception& e) { + } catch (json::exception& e) { ArrowErrorSet(error, "Exception in TestingJSONReader::ReadField(): %s", e.what()); return EINVAL; } @@ -686,7 +686,7 @@ class TestingJSONReader { // Return the result ArrowArrayMove(array.get(), out); return NANOARROW_OK; - } catch (std::exception& e) { + } catch (json::exception& e) { ArrowErrorSet(error, "Exception in TestingJSONReader::ReadColumn(): %s", e.what()); return EINVAL; } From 2e8b76f003be453b56a62a818a41e8768eed461d Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 29 Nov 2023 10:39:12 -0400 Subject: [PATCH 16/19] fix python --- python/bootstrap.py | 5 + python/src/nanoarrow/nanoarrow_testing.hpp | 1508 ++++++++++++++++++++ 2 files changed, 1513 insertions(+) create mode 100644 python/src/nanoarrow/nanoarrow_testing.hpp diff --git a/python/bootstrap.py b/python/bootstrap.py index 9e54cb704..bbb5d6693 100644 --- a/python/bootstrap.py +++ b/python/bootstrap.py @@ -36,6 +36,9 @@ def generate_nanoarrow_pxd(self, file_in, file_out): # Strip comments content = self.re_comment.sub("", content) + # Replace NANOARROW_MAX_FIXED_BUFFERS with its value + content = self.re_max_buffers.sub("3", content) + # Find types and function definitions types = self._find_types(content) func_defs = self._find_func_defs(content) @@ -59,6 +62,7 @@ def generate_nanoarrow_pxd(self, file_in, file_out): output.write(b"\n") output.write(b" ctypedef int ArrowErrorCode\n") output.write(b" cdef int NANOARROW_OK\n") + output.write(b" cdef int NANOARROW_MAX_FIXED_BUFFERS\n") output.write(b"\n") for type in types_cython: @@ -71,6 +75,7 @@ def generate_nanoarrow_pxd(self, file_in, file_out): def _define_regexes(self): self.re_comment = re.compile(r"\s*//[^\n]*") + self.re_max_buffers = re.compile(r"NANOARROW_MAX_FIXED_BUFFERS") self.re_type = re.compile( r"(?Pstruct|union|enum) (?PArrow[^ ]+) {(?P[^}]*)}" ) diff --git a/python/src/nanoarrow/nanoarrow_testing.hpp b/python/src/nanoarrow/nanoarrow_testing.hpp new file mode 100644 index 000000000..bfb15462d --- /dev/null +++ b/python/src/nanoarrow/nanoarrow_testing.hpp @@ -0,0 +1,1508 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include + +#include + +#include "nanoarrow.hpp" + +#ifndef NANOARROW_TESTING_HPP_INCLUDED +#define NANOARROW_TESTING_HPP_INCLUDED + +/// \defgroup nanoarrow_testing Nanoarrow Testing Helpers +/// +/// Utilities for testing nanoarrow structures and functions. + +namespace nanoarrow { + +namespace testing { + +/// \defgroup nanoarrow_testing-json Integration test helpers +/// +/// See testing format documentation for details of the JSON representation. This +/// representation is not canonical but can be used to implement integration tests with +/// other implementations. +/// +/// @{ + +/// \brief Writer for the Arrow integration testing JSON format +class TestingJSONWriter { + public: + /// \brief Write a schema to out + /// + /// Creates output like `{"fields": [...], "metadata": [...]}`. + ArrowErrorCode WriteSchema(std::ostream& out, const ArrowSchema* schema) { + // Make sure we have a struct + if (std::string(schema->format) != "+s") { + return EINVAL; + } + + out << "{"; + + // Write fields + out << R"("fields": )"; + if (schema->n_children == 0) { + out << "[]"; + } else { + out << "["; + NANOARROW_RETURN_NOT_OK(WriteField(out, schema->children[0])); + for (int64_t i = 1; i < schema->n_children; i++) { + out << ", "; + NANOARROW_RETURN_NOT_OK(WriteField(out, schema->children[i])); + } + out << "]"; + } + + // Write metadata + out << R"(, "metadata": )"; + NANOARROW_RETURN_NOT_OK(WriteMetadata(out, schema->metadata)); + + out << "}"; + return NANOARROW_OK; + } + + /// \brief Write a field to out + /// + /// Creates output like `{"name" : "col", "type": {...}, ...}` + ArrowErrorCode WriteField(std::ostream& out, const ArrowSchema* field) { + ArrowSchemaView view; + NANOARROW_RETURN_NOT_OK(ArrowSchemaViewInit(&view, (ArrowSchema*)field, nullptr)); + + out << "{"; + + // Write schema->name (may be null) + if (field->name == nullptr) { + out << R"("name": null)"; + } else { + out << R"("name": )"; + NANOARROW_RETURN_NOT_OK(WriteString(out, ArrowCharView(field->name))); + } + + // Write nullability + if (field->flags & ARROW_FLAG_NULLABLE) { + out << R"(, "nullable": true)"; + } else { + out << R"(, "nullable": false)"; + } + + // Write type + out << R"(, "type": )"; + NANOARROW_RETURN_NOT_OK(WriteType(out, &view)); + + // Write children + out << R"(, "children": )"; + if (field->n_children == 0) { + out << "[]"; + } else { + out << "["; + NANOARROW_RETURN_NOT_OK(WriteField(out, field->children[0])); + for (int64_t i = 1; i < field->n_children; i++) { + out << ", "; + NANOARROW_RETURN_NOT_OK(WriteField(out, field->children[i])); + } + out << "]"; + } + + // TODO: Dictionary (currently fails at WriteType) + + // Write metadata + out << R"(, "metadata": )"; + NANOARROW_RETURN_NOT_OK(WriteMetadata(out, field->metadata)); + + out << "}"; + return NANOARROW_OK; + } + + /// \brief Write the type portion of a field + /// + /// Creates output like `{"name": "int", ...}` + ArrowErrorCode WriteType(std::ostream& out, const ArrowSchema* field) { + ArrowSchemaView view; + NANOARROW_RETURN_NOT_OK(ArrowSchemaViewInit(&view, (ArrowSchema*)field, nullptr)); + NANOARROW_RETURN_NOT_OK(WriteType(out, &view)); + return NANOARROW_OK; + } + + /// \brief Write a "batch" to out + /// + /// Creates output like `{"count": 123, "columns": [...]}`. + ArrowErrorCode WriteBatch(std::ostream& out, const ArrowSchema* schema, + ArrowArrayView* value) { + // Make sure we have a struct + if (std::string(schema->format) != "+s") { + return EINVAL; + } + + out << "{"; + + // Write length + out << R"("count": )" << value->length; + + // Write children + out << R"(, "columns": )"; + NANOARROW_RETURN_NOT_OK(WriteChildren(out, schema, value)); + + out << "}"; + return NANOARROW_OK; + } + + /// \brief Write a column to out + /// + /// Creates output like `{"name": "col", "count": 123, "VALIDITY": [...], ...}`. + ArrowErrorCode WriteColumn(std::ostream& out, const ArrowSchema* field, + ArrowArrayView* value) { + out << "{"; + + // Write schema->name (may be null) + if (field->name == nullptr) { + out << R"("name": null)"; + } else { + out << R"("name": )"; + NANOARROW_RETURN_NOT_OK(WriteString(out, ArrowCharView(field->name))); + } + + // Write length + out << R"(, "count": )" << value->length; + + // Write the VALIDITY element if required + switch (value->storage_type) { + case NANOARROW_TYPE_NA: + case NANOARROW_TYPE_DENSE_UNION: + case NANOARROW_TYPE_SPARSE_UNION: + break; + default: + out << R"(, "VALIDITY": )"; + WriteBitmap(out, value->buffer_views[0].data.as_uint8, value->length); + break; + } + + // Write the TYPE_ID element if required + switch (value->storage_type) { + case NANOARROW_TYPE_SPARSE_UNION: + case NANOARROW_TYPE_DENSE_UNION: + out << R"(, "TYPE_ID": )"; + NANOARROW_RETURN_NOT_OK(WriteOffsetOrTypeID(out, value->buffer_views[0])); + break; + default: + break; + } + + // Write the OFFSET element if required + switch (value->storage_type) { + case NANOARROW_TYPE_BINARY: + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_DENSE_UNION: + case NANOARROW_TYPE_LIST: + out << R"(, "OFFSET": )"; + NANOARROW_RETURN_NOT_OK( + WriteOffsetOrTypeID(out, value->buffer_views[1])); + break; + case NANOARROW_TYPE_LARGE_LIST: + case NANOARROW_TYPE_LARGE_BINARY: + case NANOARROW_TYPE_LARGE_STRING: + out << R"(, "OFFSET": )"; + NANOARROW_RETURN_NOT_OK( + WriteOffsetOrTypeID(out, value->buffer_views[1])); + break; + default: + break; + } + + // Write the DATA element if required + switch (value->storage_type) { + case NANOARROW_TYPE_NA: + case NANOARROW_TYPE_STRUCT: + case NANOARROW_TYPE_LIST: + case NANOARROW_TYPE_LARGE_LIST: + case NANOARROW_TYPE_FIXED_SIZE_LIST: + case NANOARROW_TYPE_DENSE_UNION: + case NANOARROW_TYPE_SPARSE_UNION: + break; + default: + out << R"(, "DATA": )"; + NANOARROW_RETURN_NOT_OK(WriteData(out, value)); + break; + } + + switch (value->storage_type) { + case NANOARROW_TYPE_STRUCT: + case NANOARROW_TYPE_LIST: + case NANOARROW_TYPE_LARGE_LIST: + case NANOARROW_TYPE_FIXED_SIZE_LIST: + case NANOARROW_TYPE_DENSE_UNION: + case NANOARROW_TYPE_SPARSE_UNION: + out << R"(, "children": )"; + NANOARROW_RETURN_NOT_OK(WriteChildren(out, field, value)); + break; + default: + break; + } + + out << "}"; + return NANOARROW_OK; + } + + private: + ArrowErrorCode WriteType(std::ostream& out, const ArrowSchemaView* field) { + ArrowType type; + if (field->extension_name.data != nullptr) { + type = field->storage_type; + } else { + type = field->type; + } + + out << "{"; + + switch (field->type) { + case NANOARROW_TYPE_NA: + out << R"("name": "null")"; + break; + case NANOARROW_TYPE_BOOL: + out << R"("name": "bool")"; + break; + case NANOARROW_TYPE_INT8: + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_INT32: + case NANOARROW_TYPE_INT64: + out << R"("name": "int", "bitWidth": )" << field->layout.element_size_bits[1] + << R"(, "isSigned": true)"; + break; + case NANOARROW_TYPE_UINT8: + case NANOARROW_TYPE_UINT16: + case NANOARROW_TYPE_UINT64: + case NANOARROW_TYPE_UINT32: + out << R"("name": "int", "bitWidth": )" << field->layout.element_size_bits[1] + << R"(, "isSigned": false)"; + break; + case NANOARROW_TYPE_HALF_FLOAT: + out << R"("name": "floatingpoint", "precision": "HALF")"; + break; + case NANOARROW_TYPE_FLOAT: + out << R"("name": "floatingpoint", "precision": "SINGLE")"; + break; + case NANOARROW_TYPE_DOUBLE: + out << R"("name": "floatingpoint", "precision": "DOUBLE")"; + break; + case NANOARROW_TYPE_STRING: + out << R"("name": "utf8")"; + break; + case NANOARROW_TYPE_LARGE_STRING: + out << R"("name": "largeutf8")"; + break; + case NANOARROW_TYPE_BINARY: + out << R"("name": "binary")"; + break; + case NANOARROW_TYPE_LARGE_BINARY: + out << R"("name": "largebinary")"; + break; + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + out << R"("name": "fixedsizebinary", "byteWidth": )" << field->fixed_size; + break; + case NANOARROW_TYPE_DECIMAL128: + case NANOARROW_TYPE_DECIMAL256: + out << R"("name": "decimal", "bitWidth": )" << field->decimal_bitwidth + << R"(, "precision": )" << field->decimal_precision << R"(, "scale": )" + << field->decimal_scale; + break; + case NANOARROW_TYPE_STRUCT: + out << R"("name": "struct")"; + break; + case NANOARROW_TYPE_LIST: + out << R"("name": "list")"; + break; + case NANOARROW_TYPE_MAP: + out << R"("name": "map", "keysSorted": )"; + if (field->schema->flags & ARROW_FLAG_MAP_KEYS_SORTED) { + out << "true"; + } else { + out << "false"; + } + break; + case NANOARROW_TYPE_LARGE_LIST: + out << R"("name": "largelist")"; + break; + case NANOARROW_TYPE_FIXED_SIZE_LIST: + out << R"("name": "fixedsizelist", "listSize": )" + << field->layout.child_size_elements; + break; + case NANOARROW_TYPE_DENSE_UNION: + out << R"("name": "union", "mode": "DENSE", "typeIds": [)" + << field->union_type_ids << "]"; + break; + case NANOARROW_TYPE_SPARSE_UNION: + out << R"("name": "union", "mode": "SPARSE", "typeIds": [)" + << field->union_type_ids << "]"; + break; + + default: + // Not supported + return ENOTSUP; + } + + out << "}"; + return NANOARROW_OK; + } + + ArrowErrorCode WriteMetadata(std::ostream& out, const char* metadata) { + if (metadata == nullptr) { + out << "null"; + return NANOARROW_OK; + } + + ArrowMetadataReader reader; + NANOARROW_RETURN_NOT_OK(ArrowMetadataReaderInit(&reader, metadata)); + if (reader.remaining_keys == 0) { + out << "[]"; + return NANOARROW_OK; + } + + out << "["; + NANOARROW_RETURN_NOT_OK(WriteMetadataItem(out, &reader)); + while (reader.remaining_keys > 0) { + out << ", "; + NANOARROW_RETURN_NOT_OK(WriteMetadataItem(out, &reader)); + } + + out << "]"; + return NANOARROW_OK; + } + + ArrowErrorCode WriteMetadataItem(std::ostream& out, ArrowMetadataReader* reader) { + ArrowStringView key; + ArrowStringView value; + NANOARROW_RETURN_NOT_OK(ArrowMetadataReaderRead(reader, &key, &value)); + out << R"({"key": )"; + NANOARROW_RETURN_NOT_OK(WriteString(out, key)); + out << R"(, "value": )"; + NANOARROW_RETURN_NOT_OK(WriteString(out, value)); + out << "}"; + return NANOARROW_OK; + } + + void WriteBitmap(std::ostream& out, const uint8_t* bits, int64_t length) { + if (length == 0) { + out << "[]"; + return; + } + + out << "["; + + if (bits == nullptr) { + out << "1"; + for (int64_t i = 1; i < length; i++) { + out << ", 1"; + } + } else { + out << static_cast(ArrowBitGet(bits, 0)); + for (int64_t i = 1; i < length; i++) { + out << ", " << static_cast(ArrowBitGet(bits, i)); + } + } + + out << "]"; + } + + template + ArrowErrorCode WriteOffsetOrTypeID(std::ostream& out, ArrowBufferView content) { + if (content.size_bytes == 0) { + out << "[]"; + return NANOARROW_OK; + } + + const T* values = reinterpret_cast(content.data.data); + int64_t n_values = content.size_bytes / sizeof(T); + + out << "["; + + if (sizeof(T) == sizeof(int64_t)) { + // Ensure int64s are quoted (i.e, "123456") + out << R"(")" << values[0] << R"(")"; + for (int64_t i = 1; i < n_values; i++) { + out << R"(, ")" << values[i] << R"(")"; + } + } else { + // No need to quote smaller ints (i.e., 123456) + out << values[0]; + for (int64_t i = 1; i < n_values; i++) { + out << ", " << static_cast(values[i]); + } + } + + out << "]"; + return NANOARROW_OK; + } + + ArrowErrorCode WriteData(std::ostream& out, ArrowArrayView* value) { + if (value->length == 0) { + out << "[]"; + return NANOARROW_OK; + } + + out << "["; + + switch (value->storage_type) { + case NANOARROW_TYPE_BOOL: + case NANOARROW_TYPE_INT8: + case NANOARROW_TYPE_UINT8: + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_UINT16: + case NANOARROW_TYPE_INT32: + case NANOARROW_TYPE_UINT32: + // Regular JSON integers (i.e., 123456) + out << ArrowArrayViewGetIntUnsafe(value, 0); + for (int64_t i = 1; i < value->length; i++) { + out << ", " << ArrowArrayViewGetIntUnsafe(value, i); + } + break; + case NANOARROW_TYPE_INT64: + // Quoted integers to avoid overflow (i.e., "123456") + out << R"(")" << ArrowArrayViewGetIntUnsafe(value, 0) << R"(")"; + for (int64_t i = 1; i < value->length; i++) { + out << R"(, ")" << ArrowArrayViewGetIntUnsafe(value, i) << R"(")"; + } + break; + case NANOARROW_TYPE_UINT64: + // Quoted integers to avoid overflow (i.e., "123456") + out << R"(")" << ArrowArrayViewGetUIntUnsafe(value, 0) << R"(")"; + for (int64_t i = 1; i < value->length; i++) { + out << R"(, ")" << ArrowArrayViewGetUIntUnsafe(value, i) << R"(")"; + } + break; + + case NANOARROW_TYPE_FLOAT: + case NANOARROW_TYPE_DOUBLE: { + // JSON number to 3 decimal places + LocalizedStream local_stream_opt(out); + local_stream_opt.SetFixed(3); + + out << ArrowArrayViewGetDoubleUnsafe(value, 0); + for (int64_t i = 1; i < value->length; i++) { + out << ", " << ArrowArrayViewGetDoubleUnsafe(value, i); + } + break; + } + + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_LARGE_STRING: + NANOARROW_RETURN_NOT_OK( + WriteString(out, ArrowArrayViewGetStringUnsafe(value, 0))); + for (int64_t i = 1; i < value->length; i++) { + out << ", "; + NANOARROW_RETURN_NOT_OK( + WriteString(out, ArrowArrayViewGetStringUnsafe(value, i))); + } + break; + + case NANOARROW_TYPE_BINARY: + case NANOARROW_TYPE_LARGE_BINARY: + case NANOARROW_TYPE_FIXED_SIZE_BINARY: { + NANOARROW_RETURN_NOT_OK(WriteBytes(out, ArrowArrayViewGetBytesUnsafe(value, 0))); + for (int64_t i = 1; i < value->length; i++) { + out << ", "; + NANOARROW_RETURN_NOT_OK( + WriteBytes(out, ArrowArrayViewGetBytesUnsafe(value, i))); + } + break; + } + + default: + // Not supported + return ENOTSUP; + } + + out << "]"; + return NANOARROW_OK; + } + + ArrowErrorCode WriteString(std::ostream& out, ArrowStringView value) { + out << R"(")"; + + for (int64_t i = 0; i < value.size_bytes; i++) { + char c = value.data[i]; + if (c == '"') { + out << R"(\")"; + } else if (c == '\\') { + out << R"(\\)"; + } else if (c < 0) { + // Not supporting multibyte unicode yet + return ENOTSUP; + } else if (c < 20) { + // Data in the arrow-testing repo has a lot of content that requires escaping + // in this way (\uXXXX). + uint16_t utf16_bytes = static_cast(c); + + char utf16_esc[7]; + utf16_esc[6] = '\0'; + snprintf(utf16_esc, sizeof(utf16_esc), R"(\u%04x)", utf16_bytes); + out << utf16_esc; + } else { + out << c; + } + } + + out << R"(")"; + return NANOARROW_OK; + } + + ArrowErrorCode WriteBytes(std::ostream& out, ArrowBufferView value) { + out << R"(")"; + char hex[3]; + hex[2] = '\0'; + + for (int64_t i = 0; i < value.size_bytes; i++) { + snprintf(hex, sizeof(hex), "%02X", static_cast(value.data.as_uint8[i])); + out << hex; + } + out << R"(")"; + return NANOARROW_OK; + } + + ArrowErrorCode WriteChildren(std::ostream& out, const ArrowSchema* field, + ArrowArrayView* value) { + if (field->n_children == 0) { + out << "[]"; + return NANOARROW_OK; + } + + out << "["; + NANOARROW_RETURN_NOT_OK(WriteColumn(out, field->children[0], value->children[0])); + for (int64_t i = 1; i < field->n_children; i++) { + out << ", "; + NANOARROW_RETURN_NOT_OK(WriteColumn(out, field->children[i], value->children[i])); + } + out << "]"; + return NANOARROW_OK; + } + + class LocalizedStream { + public: + LocalizedStream(std::ostream& out) : out_(out) { + previous_locale_ = out.imbue(std::locale::classic()); + previous_precision_ = out.precision(); + fmt_flags_ = out.flags(); + out.setf(out.fixed); + } + + void SetFixed(int precision) { out_.precision(precision); } + + ~LocalizedStream() { + out_.flags(fmt_flags_); + out_.precision(previous_precision_); + out_.imbue(previous_locale_); + } + + private: + std::ostream& out_; + std::locale previous_locale_; + std::ios::fmtflags fmt_flags_; + std::streamsize previous_precision_; + }; +}; + +/// \brief Reader for the Arrow integration testing JSON format +class TestingJSONReader { + using json = nlohmann::json; + + public: + /// \brief Read JSON representing a Schema + /// + /// Reads a JSON object in the form `{"fields": [...], "metadata": [...]}`, + /// propagating `out` on success. + ArrowErrorCode ReadSchema(const std::string& schema_json, ArrowSchema* out, + ArrowError* error = nullptr) { + try { + auto obj = json::parse(schema_json); + nanoarrow::UniqueSchema schema; + + NANOARROW_RETURN_NOT_OK(SetSchema(schema.get(), obj, error)); + ArrowSchemaMove(schema.get(), out); + return NANOARROW_OK; + } catch (json::exception& e) { + ArrowErrorSet(error, "Exception in TestingJSONReader::ReadSchema(): %s", e.what()); + return EINVAL; + } + } + + /// \brief Read JSON representing a Field + /// + /// Read a JSON object in the form `{"name" : "col", "type": {...}, ...}`, + /// propagating `out` on success. + ArrowErrorCode ReadField(const std::string& field_json, ArrowSchema* out, + ArrowError* error = nullptr) { + try { + auto obj = json::parse(field_json); + nanoarrow::UniqueSchema schema; + + NANOARROW_RETURN_NOT_OK(SetField(schema.get(), obj, error)); + ArrowSchemaMove(schema.get(), out); + return NANOARROW_OK; + } catch (json::exception& e) { + ArrowErrorSet(error, "Exception in TestingJSONReader::ReadField(): %s", e.what()); + return EINVAL; + } + } + + /// \brief Read JSON representing a Column + /// + /// Read a JSON object in the form + /// `{"name": "col", "count": 123, "VALIDITY": [...], ...}`, propagating + /// `out` on success. + ArrowErrorCode ReadColumn(const std::string& column_json, const ArrowSchema* schema, + ArrowArray* out, ArrowError* error = nullptr) { + try { + auto obj = json::parse(column_json); + + // ArrowArrayView to enable validation + nanoarrow::UniqueArrayView array_view; + NANOARROW_RETURN_NOT_OK(ArrowArrayViewInitFromSchema( + array_view.get(), const_cast(schema), error)); + + // ArrowArray to hold memory + nanoarrow::UniqueArray array; + NANOARROW_RETURN_NOT_OK( + ArrowArrayInitFromSchema(array.get(), const_cast(schema), error)); + + // Parse the JSON into the array + NANOARROW_RETURN_NOT_OK(SetArrayColumn(obj, array_view.get(), array.get(), error)); + + // Return the result + ArrowArrayMove(array.get(), out); + return NANOARROW_OK; + } catch (json::exception& e) { + ArrowErrorSet(error, "Exception in TestingJSONReader::ReadColumn(): %s", e.what()); + return EINVAL; + } + } + + private: + ArrowErrorCode SetSchema(ArrowSchema* schema, const json& value, ArrowError* error) { + NANOARROW_RETURN_NOT_OK( + Check(value.is_object(), error, "Expected Schema to be a JSON object")); + NANOARROW_RETURN_NOT_OK( + Check(value.contains("fields"), error, "Schema missing key 'fields'")); + NANOARROW_RETURN_NOT_OK( + Check(value.contains("metadata"), error, "Schema missing key 'metadata'")); + + NANOARROW_RETURN_NOT_OK_WITH_ERROR( + ArrowSchemaInitFromType(schema, NANOARROW_TYPE_STRUCT), error); + + const auto& fields = value["fields"]; + NANOARROW_RETURN_NOT_OK( + Check(fields.is_array(), error, "Schema fields must be array")); + NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaAllocateChildren(schema, fields.size()), + error); + for (int64_t i = 0; i < schema->n_children; i++) { + NANOARROW_RETURN_NOT_OK(SetField(schema->children[i], fields[i], error)); + } + + NANOARROW_RETURN_NOT_OK(SetMetadata(schema, value["metadata"], error)); + + // Validate! + ArrowSchemaView schema_view; + NANOARROW_RETURN_NOT_OK(ArrowSchemaViewInit(&schema_view, schema, error)); + return NANOARROW_OK; + } + + ArrowErrorCode SetField(ArrowSchema* schema, const json& value, ArrowError* error) { + NANOARROW_RETURN_NOT_OK( + Check(value.is_object(), error, "Expected Field to be a JSON object")); + NANOARROW_RETURN_NOT_OK( + Check(value.contains("name"), error, "Field missing key 'name'")); + NANOARROW_RETURN_NOT_OK( + Check(value.contains("nullable"), error, "Field missing key 'nullable'")); + NANOARROW_RETURN_NOT_OK( + Check(value.contains("type"), error, "Field missing key 'type'")); + NANOARROW_RETURN_NOT_OK( + Check(value.contains("children"), error, "Field missing key 'children'")); + NANOARROW_RETURN_NOT_OK( + Check(value.contains("metadata"), error, "Field missing key 'metadata'")); + + ArrowSchemaInit(schema); + + const auto& name = value["name"]; + NANOARROW_RETURN_NOT_OK(Check(name.is_string() || name.is_null(), error, + "Field name must be string or null")); + if (name.is_string()) { + auto name_str = name.get(); + NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaSetName(schema, name_str.c_str()), + error); + } + + const auto& nullable = value["nullable"]; + NANOARROW_RETURN_NOT_OK( + Check(nullable.is_boolean(), error, "Field nullable must be boolean")); + if (nullable.get()) { + schema->flags |= ARROW_FLAG_NULLABLE; + } else { + schema->flags &= ~ARROW_FLAG_NULLABLE; + } + + NANOARROW_RETURN_NOT_OK(SetType(schema, value["type"], error)); + + const auto& children = value["children"]; + NANOARROW_RETURN_NOT_OK( + Check(children.is_array(), error, "Field children must be array")); + NANOARROW_RETURN_NOT_OK_WITH_ERROR( + ArrowSchemaAllocateChildren(schema, children.size()), error); + for (int64_t i = 0; i < schema->n_children; i++) { + NANOARROW_RETURN_NOT_OK(SetField(schema->children[i], children[i], error)); + } + + NANOARROW_RETURN_NOT_OK(SetMetadata(schema, value["metadata"], error)); + + // Validate! + ArrowSchemaView schema_view; + NANOARROW_RETURN_NOT_OK(ArrowSchemaViewInit(&schema_view, schema, error)); + return NANOARROW_OK; + } + + ArrowErrorCode SetType(ArrowSchema* schema, const json& value, ArrowError* error) { + NANOARROW_RETURN_NOT_OK(Check(value.is_object(), error, "Type must be object")); + NANOARROW_RETURN_NOT_OK( + Check(value.contains("name"), error, "Type missing key 'name'")); + + const auto& name = value["name"]; + NANOARROW_RETURN_NOT_OK(Check(name.is_string(), error, "Type name must be string")); + auto name_str = name.get(); + + if (name_str == "null") { + NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaSetType(schema, NANOARROW_TYPE_NA), + error); + } else if (name_str == "bool") { + NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaSetType(schema, NANOARROW_TYPE_BOOL), + error); + } else if (name_str == "int") { + NANOARROW_RETURN_NOT_OK(SetTypeInt(schema, value, error)); + } else if (name_str == "floatingpoint") { + NANOARROW_RETURN_NOT_OK(SetTypeFloatingPoint(schema, value, error)); + } else if (name_str == "decimal") { + NANOARROW_RETURN_NOT_OK(SetTypeDecimal(schema, value, error)); + } else if (name_str == "utf8") { + NANOARROW_RETURN_NOT_OK_WITH_ERROR( + ArrowSchemaSetType(schema, NANOARROW_TYPE_STRING), error); + } else if (name_str == "largeutf8") { + NANOARROW_RETURN_NOT_OK_WITH_ERROR( + ArrowSchemaSetType(schema, NANOARROW_TYPE_LARGE_STRING), error); + } else if (name_str == "binary") { + NANOARROW_RETURN_NOT_OK_WITH_ERROR( + ArrowSchemaSetType(schema, NANOARROW_TYPE_BINARY), error); + } else if (name_str == "largebinary") { + NANOARROW_RETURN_NOT_OK_WITH_ERROR( + ArrowSchemaSetType(schema, NANOARROW_TYPE_LARGE_BINARY), error); + } else if (name_str == "fixedsizebinary") { + NANOARROW_RETURN_NOT_OK(SetTypeFixedSizeBinary(schema, value, error)); + } else if (name_str == "list") { + NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaSetFormat(schema, "+l"), error); + } else if (name_str == "largelist") { + NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaSetFormat(schema, "+L"), error); + } else if (name_str == "fixedsizelist") { + NANOARROW_RETURN_NOT_OK(SetTypeFixedSizeList(schema, value, error)); + } else if (name_str == "map") { + NANOARROW_RETURN_NOT_OK(SetTypeMap(schema, value, error)); + } else if (name_str == "union") { + NANOARROW_RETURN_NOT_OK(SetTypeUnion(schema, value, error)); + } else if (name_str == "struct") { + NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaSetFormat(schema, "+s"), error); + } else { + ArrowErrorSet(error, "Unsupported Type name: '%s'", name_str.c_str()); + return ENOTSUP; + } + + return NANOARROW_OK; + } + + ArrowErrorCode SetTypeInt(ArrowSchema* schema, const json& value, ArrowError* error) { + NANOARROW_RETURN_NOT_OK(Check(value.contains("bitWidth"), error, + "Type[name=='int'] missing key 'bitWidth'")); + NANOARROW_RETURN_NOT_OK(Check(value.contains("isSigned"), error, + "Type[name=='int'] missing key 'isSigned'")); + + const auto& bitwidth = value["bitWidth"]; + NANOARROW_RETURN_NOT_OK(Check(bitwidth.is_number_integer(), error, + "Type[name=='int'] bitWidth must be integer")); + + const auto& issigned = value["isSigned"]; + NANOARROW_RETURN_NOT_OK(Check(issigned.is_boolean(), error, + "Type[name=='int'] isSigned must be boolean")); + + ArrowType type = NANOARROW_TYPE_UNINITIALIZED; + if (issigned.get()) { + switch (bitwidth.get()) { + case 8: + type = NANOARROW_TYPE_INT8; + break; + case 16: + type = NANOARROW_TYPE_INT16; + break; + case 32: + type = NANOARROW_TYPE_INT32; + break; + case 64: + type = NANOARROW_TYPE_INT64; + break; + default: + ArrowErrorSet(error, "Type[name=='int'] bitWidth must be 8, 16, 32, or 64"); + return EINVAL; + } + } else { + switch (bitwidth.get()) { + case 8: + type = NANOARROW_TYPE_UINT8; + break; + case 16: + type = NANOARROW_TYPE_UINT16; + break; + case 32: + type = NANOARROW_TYPE_UINT32; + break; + case 64: + type = NANOARROW_TYPE_UINT64; + break; + default: + ArrowErrorSet(error, "Type[name=='int'] bitWidth must be 8, 16, 32, or 64"); + return EINVAL; + } + } + + NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaSetType(schema, type), error); + return NANOARROW_OK; + } + + ArrowErrorCode SetTypeFloatingPoint(ArrowSchema* schema, const json& value, + ArrowError* error) { + NANOARROW_RETURN_NOT_OK(Check(value.contains("precision"), error, + "Type[name=='floatingpoint'] missing key 'precision'")); + + const auto& precision = value["precision"]; + NANOARROW_RETURN_NOT_OK(Check(precision.is_string(), error, + "Type[name=='floatingpoint'] bitWidth must be string")); + + ArrowType type = NANOARROW_TYPE_UNINITIALIZED; + auto precision_str = precision.get(); + if (precision_str == "HALF") { + type = NANOARROW_TYPE_HALF_FLOAT; + } else if (precision_str == "SINGLE") { + type = NANOARROW_TYPE_FLOAT; + } else if (precision_str == "DOUBLE") { + type = NANOARROW_TYPE_DOUBLE; + } else { + ArrowErrorSet( + error, + "Type[name=='floatingpoint'] precision must be 'HALF', 'SINGLE', or 'DOUBLE'"); + return EINVAL; + } + + NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaSetType(schema, type), error); + return NANOARROW_OK; + } + + ArrowErrorCode SetTypeFixedSizeBinary(ArrowSchema* schema, const json& value, + ArrowError* error) { + NANOARROW_RETURN_NOT_OK( + Check(value.contains("byteWidth"), error, + "Type[name=='fixedsizebinary'] missing key 'byteWidth'")); + + const auto& byteWidth = value["byteWidth"]; + NANOARROW_RETURN_NOT_OK( + Check(byteWidth.is_number_integer(), error, + "Type[name=='fixedsizebinary'] byteWidth must be integer")); + + NANOARROW_RETURN_NOT_OK_WITH_ERROR( + ArrowSchemaSetTypeFixedSize(schema, NANOARROW_TYPE_FIXED_SIZE_BINARY, + byteWidth.get()), + error); + return NANOARROW_OK; + } + + ArrowErrorCode SetTypeDecimal(ArrowSchema* schema, const json& value, + ArrowError* error) { + NANOARROW_RETURN_NOT_OK(Check(value.contains("bitWidth"), error, + "Type[name=='decimal'] missing key 'bitWidth'")); + NANOARROW_RETURN_NOT_OK(Check(value.contains("precision"), error, + "Type[name=='decimal'] missing key 'precision'")); + NANOARROW_RETURN_NOT_OK(Check(value.contains("scale"), error, + "Type[name=='decimal'] missing key 'scale'")); + + const auto& bitWidth = value["bitWidth"]; + NANOARROW_RETURN_NOT_OK(Check(bitWidth.is_number_integer(), error, + "Type[name=='decimal'] bitWidth must be integer")); + + ArrowType type; + switch (bitWidth.get()) { + case 128: + type = NANOARROW_TYPE_DECIMAL128; + break; + case 256: + type = NANOARROW_TYPE_DECIMAL256; + break; + default: + ArrowErrorSet(error, "Type[name=='decimal'] bitWidth must be 128 or 256"); + return EINVAL; + } + + const auto& precision = value["precision"]; + NANOARROW_RETURN_NOT_OK(Check(precision.is_number_integer(), error, + "Type[name=='decimal'] precision must be integer")); + + const auto& scale = value["scale"]; + NANOARROW_RETURN_NOT_OK(Check(scale.is_number_integer(), error, + "Type[name=='decimal'] scale must be integer")); + + NANOARROW_RETURN_NOT_OK_WITH_ERROR( + ArrowSchemaSetTypeDecimal(schema, type, precision.get(), scale.get()), + error); + + return NANOARROW_OK; + } + + ArrowErrorCode SetTypeMap(ArrowSchema* schema, const json& value, ArrowError* error) { + NANOARROW_RETURN_NOT_OK(Check(value.contains("keysSorted"), error, + "Type[name=='map'] missing key 'keysSorted'")); + + const auto& keys_sorted = value["keysSorted"]; + NANOARROW_RETURN_NOT_OK(Check(keys_sorted.is_boolean(), error, + "Type[name=='map'] keysSorted must be boolean")); + + if (keys_sorted.get()) { + schema->flags |= ARROW_FLAG_MAP_KEYS_SORTED; + } else { + schema->flags &= ~ARROW_FLAG_MAP_KEYS_SORTED; + } + + NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaSetFormat(schema, "+m"), error); + return NANOARROW_OK; + } + + ArrowErrorCode SetTypeFixedSizeList(ArrowSchema* schema, const json& value, + ArrowError* error) { + NANOARROW_RETURN_NOT_OK(Check(value.contains("listSize"), error, + "Type[name=='fixedsizelist'] missing key 'listSize'")); + + const auto& list_size = value["listSize"]; + NANOARROW_RETURN_NOT_OK( + Check(list_size.is_number_integer(), error, + "Type[name=='fixedsizelist'] listSize must be integer")); + + std::stringstream format_builder; + format_builder << "+w:" << list_size; + std::string format = format_builder.str(); + NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaSetFormat(schema, format.c_str()), + error); + return NANOARROW_OK; + } + + ArrowErrorCode SetTypeUnion(ArrowSchema* schema, const json& value, ArrowError* error) { + NANOARROW_RETURN_NOT_OK( + Check(value.contains("mode"), error, "Type[name=='union'] missing key 'mode'")); + NANOARROW_RETURN_NOT_OK(Check(value.contains("typeIds"), error, + "Type[name=='union'] missing key 'typeIds'")); + + const auto& mode = value["mode"]; + NANOARROW_RETURN_NOT_OK( + Check(mode.is_string(), error, "Type[name=='union'] mode must be string")); + + auto mode_str = mode.get(); + std::stringstream type_ids_format; + + if (mode_str == "DENSE") { + type_ids_format << "+ud:"; + } else if (mode_str == "SPARSE") { + type_ids_format << "+us:"; + } else { + ArrowErrorSet(error, "Type[name=='union'] mode must be 'DENSE' or 'SPARSE'"); + return EINVAL; + } + + const auto& type_ids = value["typeIds"]; + NANOARROW_RETURN_NOT_OK( + Check(type_ids.is_array(), error, "Type[name=='union'] typeIds must be array")); + + if (type_ids.size() > 0) { + for (size_t i = 0; i < type_ids.size(); i++) { + const auto& type_id = type_ids[i]; + NANOARROW_RETURN_NOT_OK( + Check(type_id.is_number_integer(), error, + "Type[name=='union'] typeIds item must be integer")); + type_ids_format << type_id; + + if ((i + 1) < type_ids.size()) { + type_ids_format << ","; + } + } + } + + std::string type_ids_format_str = type_ids_format.str(); + NANOARROW_RETURN_NOT_OK_WITH_ERROR( + ArrowSchemaSetFormat(schema, type_ids_format_str.c_str()), error); + + return NANOARROW_OK; + } + + ArrowErrorCode SetMetadata(ArrowSchema* schema, const json& value, ArrowError* error) { + NANOARROW_RETURN_NOT_OK(Check(value.is_null() || value.is_array(), error, + "Field or Schema metadata must be null or array")); + if (value.is_null()) { + return NANOARROW_OK; + } + + nanoarrow::UniqueBuffer metadata; + NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowMetadataBuilderInit(metadata.get(), nullptr), + error); + for (const auto& item : value) { + NANOARROW_RETURN_NOT_OK( + Check(item.is_object(), error, "metadata item must be object")); + NANOARROW_RETURN_NOT_OK( + Check(item.contains("key"), error, "metadata item missing key 'key'")); + NANOARROW_RETURN_NOT_OK( + Check(item.contains("value"), error, "metadata item missing key 'value'")); + + const auto& key = item["key"]; + const auto& value = item["value"]; + NANOARROW_RETURN_NOT_OK( + Check(key.is_string(), error, "metadata item key must be string")); + NANOARROW_RETURN_NOT_OK( + Check(value.is_string(), error, "metadata item value must be string")); + + auto key_str = key.get(); + auto value_str = value.get(); + NANOARROW_RETURN_NOT_OK_WITH_ERROR( + ArrowMetadataBuilderAppend(metadata.get(), ArrowCharView(key_str.c_str()), + ArrowCharView(value_str.c_str())), + error); + } + + NANOARROW_RETURN_NOT_OK_WITH_ERROR( + ArrowSchemaSetMetadata(schema, reinterpret_cast(metadata->data)), error); + return NANOARROW_OK; + } + + ArrowErrorCode SetArrayColumn(const json& value, ArrowArrayView* array_view, + ArrowArray* array, ArrowError* error, + const std::string& parent_error_prefix = "") { + NANOARROW_RETURN_NOT_OK( + Check(value.is_object(), error, "Expected Column to be a JSON object")); + + // Check + resolve name early to generate better error messages + NANOARROW_RETURN_NOT_OK( + Check(value.contains("name"), error, "Column missing key 'name'")); + + const auto& name = value["name"]; + NANOARROW_RETURN_NOT_OK(Check(name.is_null() || name.is_string(), error, + "Column name must be string or null")); + + std::string error_prefix; + if (name.is_string()) { + error_prefix = parent_error_prefix + "-> Column '" + name.get() + "' "; + } else { + error_prefix = parent_error_prefix + "-> Column "; + } + + // Check, resolve, and recurse children + NANOARROW_RETURN_NOT_OK( + Check(array_view->n_children == 0 || value.contains("children"), error, + error_prefix + "missing key children")); + + if (value.contains("children")) { + const auto& children = value["children"]; + NANOARROW_RETURN_NOT_OK( + Check(children.is_array(), error, error_prefix + "children must be array")); + NANOARROW_RETURN_NOT_OK(Check(children.size() == array_view->n_children, error, + error_prefix + "children has incorrect size")); + + for (int64_t i = 0; i < array_view->n_children; i++) { + NANOARROW_RETURN_NOT_OK(SetArrayColumn(children[i], array_view->children[i], + array->children[i], error, error_prefix)); + } + } + + // Build buffers + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { + NANOARROW_RETURN_NOT_OK( + PrefixError(SetArrayColumnBuffers(value, array_view, array, i, error), error, + error_prefix)); + } + + // Check + resolve count + NANOARROW_RETURN_NOT_OK( + Check(value.contains("count"), error, error_prefix + "missing key 'count'")); + const auto& count = value["count"]; + NANOARROW_RETURN_NOT_OK( + Check(count.is_number_integer(), error, error_prefix + "count must be integer")); + array_view->length = count.get(); + + // Set ArrayView buffer views. This is because ArrowArrayInitFromSchema() doesn't + // support custom type ids for unions but the ArrayView does (otherwise + // ArrowArrayFinishBuilding() would work). + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { + ArrowBuffer* buffer = ArrowArrayBuffer(array, i); + ArrowBufferView* buffer_view = array_view->buffer_views + i; + buffer_view->data.as_uint8 = buffer->data; + buffer_view->size_bytes = buffer->size_bytes; + } + + // Validate the array view + NANOARROW_RETURN_NOT_OK(PrefixError( + ArrowArrayViewValidate(array_view, NANOARROW_VALIDATION_LEVEL_FULL, error), error, + error_prefix + "failed to validate: ")); + + // Flush length and buffer pointers to the Array + array->length = array_view->length; + NANOARROW_RETURN_NOT_OK_WITH_ERROR( + ArrowArrayFinishBuilding(array, NANOARROW_VALIDATION_LEVEL_NONE, nullptr), error); + + return NANOARROW_OK; + } + + ArrowErrorCode SetArrayColumnBuffers(const json& value, ArrowArrayView* array_view, + ArrowArray* array, int buffer_i, + ArrowError* error) { + ArrowBuffer* buffer = ArrowArrayBuffer(array, buffer_i); + + switch (array_view->layout.buffer_type[buffer_i]) { + case NANOARROW_BUFFER_TYPE_VALIDITY: { + NANOARROW_RETURN_NOT_OK( + Check(value.contains("VALIDITY"), error, "missing key 'VALIDITY'")); + const auto& validity = value["VALIDITY"]; + NANOARROW_RETURN_NOT_OK( + SetBufferBitmap(validity, ArrowArrayValidityBitmap(array), error)); + break; + } + case NANOARROW_BUFFER_TYPE_TYPE_ID: { + NANOARROW_RETURN_NOT_OK( + Check(value.contains("TYPE_ID"), error, "missing key 'TYPE_ID'")); + const auto& type_id = value["TYPE_ID"]; + NANOARROW_RETURN_NOT_OK(SetBufferInt(type_id, buffer, error)); + break; + } + case NANOARROW_BUFFER_TYPE_UNION_OFFSET: { + NANOARROW_RETURN_NOT_OK( + Check(value.contains("OFFSET"), error, "missing key 'OFFSET'")); + const auto& offset = value["OFFSET"]; + NANOARROW_RETURN_NOT_OK(SetBufferInt(offset, buffer, error)); + break; + } + case NANOARROW_BUFFER_TYPE_DATA_OFFSET: { + NANOARROW_RETURN_NOT_OK( + Check(value.contains("OFFSET"), error, "missing key 'OFFSET'")); + const auto& offset = value["OFFSET"]; + + if (array_view->layout.element_size_bits[buffer_i] == 32) { + NANOARROW_RETURN_NOT_OK(SetBufferInt(offset, buffer, error)); + } else { + NANOARROW_RETURN_NOT_OK(SetBufferInt(offset, buffer, error)); + } + break; + } + + case NANOARROW_BUFFER_TYPE_DATA: { + NANOARROW_RETURN_NOT_OK( + Check(value.contains("DATA"), error, "missing key 'DATA'")); + const auto& data = value["DATA"]; + + switch (array_view->storage_type) { + case NANOARROW_TYPE_BOOL: { + nanoarrow::UniqueBitmap bitmap; + NANOARROW_RETURN_NOT_OK(SetBufferBitmap(data, bitmap.get(), error)); + ArrowBufferMove(&bitmap->buffer, buffer); + return NANOARROW_OK; + } + case NANOARROW_TYPE_INT8: + return SetBufferInt(data, buffer, error); + case NANOARROW_TYPE_UINT8: + return SetBufferInt(data, buffer, error); + case NANOARROW_TYPE_INT16: + return SetBufferInt(data, buffer, error); + case NANOARROW_TYPE_UINT16: + return SetBufferInt(data, buffer, error); + case NANOARROW_TYPE_INT32: + return SetBufferInt(data, buffer, error); + case NANOARROW_TYPE_UINT32: + return SetBufferInt(data, buffer, error); + case NANOARROW_TYPE_INT64: + return SetBufferInt(data, buffer, error); + case NANOARROW_TYPE_UINT64: + return SetBufferInt(data, buffer, error); + + case NANOARROW_TYPE_FLOAT: + return SetBufferFloatingPoint(data, buffer, error); + case NANOARROW_TYPE_DOUBLE: + return SetBufferFloatingPoint(data, buffer, error); + + case NANOARROW_TYPE_STRING: + return SetBufferString(data, ArrowArrayBuffer(array, buffer_i - 1), + buffer, error); + case NANOARROW_TYPE_LARGE_STRING: + return SetBufferString(data, ArrowArrayBuffer(array, buffer_i - 1), + buffer, error); + case NANOARROW_TYPE_BINARY: + return SetBufferBinary(data, ArrowArrayBuffer(array, buffer_i - 1), + buffer, error); + case NANOARROW_TYPE_LARGE_BINARY: + return SetBufferBinary(data, ArrowArrayBuffer(array, buffer_i - 1), + buffer, error); + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + return SetBufferFixedSizeBinary( + data, buffer, array_view->layout.element_size_bits[buffer_i] / 8, error); + + default: + ArrowErrorSet(error, "storage type %s DATA buffer not supported", + ArrowTypeString(array_view->storage_type)); + return ENOTSUP; + } + break; + } + case NANOARROW_BUFFER_TYPE_NONE: + break; + } + + return NANOARROW_OK; + } + + ArrowErrorCode SetBufferBitmap(const json& value, ArrowBitmap* bitmap, + ArrowError* error) { + NANOARROW_RETURN_NOT_OK( + Check(value.is_array(), error, "bitmap buffer must be array")); + + for (const auto& item : value) { + // Some example files write bitmaps as [true, false, true] but the documentation + // says [1, 0, 1]. Accept both for simplicity. + NANOARROW_RETURN_NOT_OK(Check(item.is_boolean() || item.is_number_integer(), error, + "bitmap item must be bool or integer")); + NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowBitmapAppend(bitmap, item.get(), 1), + error); + } + + return NANOARROW_OK; + } + + template + ArrowErrorCode SetBufferInt(const json& value, ArrowBuffer* buffer, ArrowError* error) { + NANOARROW_RETURN_NOT_OK(Check(value.is_array(), error, "int buffer must be array")); + + for (const auto& item : value) { + // NANOARROW_RETURN_NOT_OK() interacts poorly with multiple template args + ArrowErrorCode result = SetBufferIntItem(item, buffer, error); + NANOARROW_RETURN_NOT_OK(result); + } + + return NANOARROW_OK; + } + + template + ArrowErrorCode SetBufferIntItem(const json& item, ArrowBuffer* buffer, + ArrowError* error) { + if (item.is_string()) { + try { + // The JSON parser here can handle up to 2^64 - 1 + auto item_int = json::parse(item.get()); + return SetBufferIntItem(item_int, buffer, error); + } catch (json::parse_error& e) { + ArrowErrorSet(error, + "integer buffer item encoded as string must parse as integer: %s", + item.dump().c_str()); + return EINVAL; + } + } + + NANOARROW_RETURN_NOT_OK( + Check(item.is_number_integer(), error, + "integer buffer item must be integer number or string")); + NANOARROW_RETURN_NOT_OK( + Check(std::numeric_limits::is_signed || item.is_number_unsigned(), error, + "expected unsigned integer buffer item but found signed integer '" + + item.dump() + "'")); + + auto item_int = item.get(); + + NANOARROW_RETURN_NOT_OK( + Check(item_int >= std::numeric_limits::lowest() && + item_int <= std::numeric_limits::max(), + error, "integer buffer item '" + item.dump() + "' outside type limits")); + + T buffer_value = item_int; + NANOARROW_RETURN_NOT_OK_WITH_ERROR( + ArrowBufferAppend(buffer, &buffer_value, sizeof(T)), error); + + return NANOARROW_OK; + } + + template + ArrowErrorCode SetBufferFloatingPoint(const json& value, ArrowBuffer* buffer, + ArrowError* error) { + NANOARROW_RETURN_NOT_OK( + Check(value.is_array(), error, "floatingpoint buffer must be array")); + + for (const auto& item : value) { + NANOARROW_RETURN_NOT_OK( + Check(item.is_number(), error, "floatingpoint buffer item must be number")); + double item_dbl = item.get(); + + NANOARROW_RETURN_NOT_OK(Check( + item_dbl >= std::numeric_limits::lowest() && + item_dbl <= std::numeric_limits::max(), + error, "floatingpoint buffer item '" + item.dump() + "' outside type limits")); + + T buffer_value = item_dbl; + NANOARROW_RETURN_NOT_OK_WITH_ERROR( + ArrowBufferAppend(buffer, &buffer_value, sizeof(T)), error); + } + + return NANOARROW_OK; + } + + template + ArrowErrorCode SetBufferString(const json& value, ArrowBuffer* offsets, + ArrowBuffer* data, ArrowError* error) { + NANOARROW_RETURN_NOT_OK( + Check(value.is_array(), error, "utf8 data buffer must be array")); + + // Check offsets against values + const T* expected_offset = reinterpret_cast(offsets->data); + NANOARROW_RETURN_NOT_OK(Check( + offsets->size_bytes == ((value.size() + 1) * sizeof(T)), error, + "Expected offset buffer with " + std::to_string(value.size()) + " elements")); + NANOARROW_RETURN_NOT_OK( + Check(*expected_offset++ == 0, error, "first offset must be zero")); + + int64_t last_offset = 0; + + for (const auto& item : value) { + NANOARROW_RETURN_NOT_OK( + Check(item.is_string(), error, "utf8 data buffer item must be string")); + auto item_str = item.get(); + + // Append data + NANOARROW_RETURN_NOT_OK_WITH_ERROR( + ArrowBufferAppend(data, reinterpret_cast(item_str.data()), + item_str.size()), + error); + + // Check offset + last_offset += item_str.size(); + NANOARROW_RETURN_NOT_OK(Check(*expected_offset++ == last_offset, error, + "Expected offset value " + + std::to_string(last_offset) + + " at utf8 data buffer item " + item.dump())); + } + + return NANOARROW_OK; + } + + template + ArrowErrorCode SetBufferBinary(const json& value, ArrowBuffer* offsets, + ArrowBuffer* data, ArrowError* error) { + NANOARROW_RETURN_NOT_OK( + Check(value.is_array(), error, "binary data buffer must be array")); + + // Check offsets against values if not fixed size + const T* expected_offset = reinterpret_cast(offsets->data); + NANOARROW_RETURN_NOT_OK(Check( + offsets->size_bytes == ((value.size() + 1) * sizeof(T)), error, + "Expected offset buffer with " + std::to_string(value.size()) + " elements")); + NANOARROW_RETURN_NOT_OK( + Check(*expected_offset++ == 0, error, "first offset must be zero")); + + for (const auto& item : value) { + NANOARROW_RETURN_NOT_OK(AppendBinaryElement(item, data, error)); + + // Check offset + NANOARROW_RETURN_NOT_OK(Check(*expected_offset++ == data->size_bytes, error, + "Expected offset value " + + std::to_string(data->size_bytes) + + " at binary data buffer item " + item.dump())); + } + + return NANOARROW_OK; + } + + ArrowErrorCode SetBufferFixedSizeBinary(const json& value, ArrowBuffer* data, + int64_t fixed_size, ArrowError* error) { + NANOARROW_RETURN_NOT_OK( + Check(value.is_array(), error, "binary data buffer must be array")); + + int64_t last_offset = 0; + + for (const auto& item : value) { + NANOARROW_RETURN_NOT_OK(AppendBinaryElement(item, data, error)); + int64_t item_size_bytes = data->size_bytes - last_offset; + + NANOARROW_RETURN_NOT_OK(Check(item_size_bytes == fixed_size, error, + "Expected fixed size binary value of size " + + std::to_string(fixed_size) + + " at binary data buffer item " + item.dump())); + last_offset = data->size_bytes; + } + + return NANOARROW_OK; + } + + ArrowErrorCode AppendBinaryElement(const json& item, ArrowBuffer* data, + ArrowError* error) { + NANOARROW_RETURN_NOT_OK( + Check(item.is_string(), error, "binary data buffer item must be string")); + auto item_str = item.get(); + + int64_t item_size_bytes = item_str.size() / 2; + NANOARROW_RETURN_NOT_OK(Check((item_size_bytes * 2) == item_str.size(), error, + "binary data buffer item must have even size")); + + NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowBufferReserve(data, item_size_bytes), error); + for (int64_t i = 0; i < item_str.size(); i += 2) { + std::string byte_hex = item_str.substr(i, 2); + char* end_ptr; + uint8_t byte = std::strtoul(byte_hex.data(), &end_ptr, 16); + NANOARROW_RETURN_NOT_OK( + Check(end_ptr == (byte_hex.data() + 2), error, + "binary data buffer item must contain a valid hex-encoded byte string")); + + data->data[data->size_bytes] = byte; + data->size_bytes++; + } + + return NANOARROW_OK; + } + + ArrowErrorCode PrefixError(ArrowErrorCode value, ArrowError* error, + const std::string& prefix) { + if (value != NANOARROW_OK && error != nullptr) { + std::string msg = prefix + error->message; + ArrowErrorSet(error, "%s", msg.c_str()); + } + + return value; + } + + ArrowErrorCode Check(bool value, ArrowError* error, const std::string& err) { + if (value) { + return NANOARROW_OK; + } else { + ArrowErrorSet(error, "%s", err.c_str()); + return EINVAL; + } + } +}; + +/// @} + +} // namespace testing +} // namespace nanoarrow + +#endif From d96bc2827f34056d78720d0cf5abb7ead4aa061b Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 29 Nov 2023 11:49:20 -0400 Subject: [PATCH 17/19] remove testing file --- python/src/nanoarrow/nanoarrow_testing.hpp | 1508 -------------------- 1 file changed, 1508 deletions(-) delete mode 100644 python/src/nanoarrow/nanoarrow_testing.hpp diff --git a/python/src/nanoarrow/nanoarrow_testing.hpp b/python/src/nanoarrow/nanoarrow_testing.hpp deleted file mode 100644 index bfb15462d..000000000 --- a/python/src/nanoarrow/nanoarrow_testing.hpp +++ /dev/null @@ -1,1508 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include -#include -#include -#include - -#include - -#include "nanoarrow.hpp" - -#ifndef NANOARROW_TESTING_HPP_INCLUDED -#define NANOARROW_TESTING_HPP_INCLUDED - -/// \defgroup nanoarrow_testing Nanoarrow Testing Helpers -/// -/// Utilities for testing nanoarrow structures and functions. - -namespace nanoarrow { - -namespace testing { - -/// \defgroup nanoarrow_testing-json Integration test helpers -/// -/// See testing format documentation for details of the JSON representation. This -/// representation is not canonical but can be used to implement integration tests with -/// other implementations. -/// -/// @{ - -/// \brief Writer for the Arrow integration testing JSON format -class TestingJSONWriter { - public: - /// \brief Write a schema to out - /// - /// Creates output like `{"fields": [...], "metadata": [...]}`. - ArrowErrorCode WriteSchema(std::ostream& out, const ArrowSchema* schema) { - // Make sure we have a struct - if (std::string(schema->format) != "+s") { - return EINVAL; - } - - out << "{"; - - // Write fields - out << R"("fields": )"; - if (schema->n_children == 0) { - out << "[]"; - } else { - out << "["; - NANOARROW_RETURN_NOT_OK(WriteField(out, schema->children[0])); - for (int64_t i = 1; i < schema->n_children; i++) { - out << ", "; - NANOARROW_RETURN_NOT_OK(WriteField(out, schema->children[i])); - } - out << "]"; - } - - // Write metadata - out << R"(, "metadata": )"; - NANOARROW_RETURN_NOT_OK(WriteMetadata(out, schema->metadata)); - - out << "}"; - return NANOARROW_OK; - } - - /// \brief Write a field to out - /// - /// Creates output like `{"name" : "col", "type": {...}, ...}` - ArrowErrorCode WriteField(std::ostream& out, const ArrowSchema* field) { - ArrowSchemaView view; - NANOARROW_RETURN_NOT_OK(ArrowSchemaViewInit(&view, (ArrowSchema*)field, nullptr)); - - out << "{"; - - // Write schema->name (may be null) - if (field->name == nullptr) { - out << R"("name": null)"; - } else { - out << R"("name": )"; - NANOARROW_RETURN_NOT_OK(WriteString(out, ArrowCharView(field->name))); - } - - // Write nullability - if (field->flags & ARROW_FLAG_NULLABLE) { - out << R"(, "nullable": true)"; - } else { - out << R"(, "nullable": false)"; - } - - // Write type - out << R"(, "type": )"; - NANOARROW_RETURN_NOT_OK(WriteType(out, &view)); - - // Write children - out << R"(, "children": )"; - if (field->n_children == 0) { - out << "[]"; - } else { - out << "["; - NANOARROW_RETURN_NOT_OK(WriteField(out, field->children[0])); - for (int64_t i = 1; i < field->n_children; i++) { - out << ", "; - NANOARROW_RETURN_NOT_OK(WriteField(out, field->children[i])); - } - out << "]"; - } - - // TODO: Dictionary (currently fails at WriteType) - - // Write metadata - out << R"(, "metadata": )"; - NANOARROW_RETURN_NOT_OK(WriteMetadata(out, field->metadata)); - - out << "}"; - return NANOARROW_OK; - } - - /// \brief Write the type portion of a field - /// - /// Creates output like `{"name": "int", ...}` - ArrowErrorCode WriteType(std::ostream& out, const ArrowSchema* field) { - ArrowSchemaView view; - NANOARROW_RETURN_NOT_OK(ArrowSchemaViewInit(&view, (ArrowSchema*)field, nullptr)); - NANOARROW_RETURN_NOT_OK(WriteType(out, &view)); - return NANOARROW_OK; - } - - /// \brief Write a "batch" to out - /// - /// Creates output like `{"count": 123, "columns": [...]}`. - ArrowErrorCode WriteBatch(std::ostream& out, const ArrowSchema* schema, - ArrowArrayView* value) { - // Make sure we have a struct - if (std::string(schema->format) != "+s") { - return EINVAL; - } - - out << "{"; - - // Write length - out << R"("count": )" << value->length; - - // Write children - out << R"(, "columns": )"; - NANOARROW_RETURN_NOT_OK(WriteChildren(out, schema, value)); - - out << "}"; - return NANOARROW_OK; - } - - /// \brief Write a column to out - /// - /// Creates output like `{"name": "col", "count": 123, "VALIDITY": [...], ...}`. - ArrowErrorCode WriteColumn(std::ostream& out, const ArrowSchema* field, - ArrowArrayView* value) { - out << "{"; - - // Write schema->name (may be null) - if (field->name == nullptr) { - out << R"("name": null)"; - } else { - out << R"("name": )"; - NANOARROW_RETURN_NOT_OK(WriteString(out, ArrowCharView(field->name))); - } - - // Write length - out << R"(, "count": )" << value->length; - - // Write the VALIDITY element if required - switch (value->storage_type) { - case NANOARROW_TYPE_NA: - case NANOARROW_TYPE_DENSE_UNION: - case NANOARROW_TYPE_SPARSE_UNION: - break; - default: - out << R"(, "VALIDITY": )"; - WriteBitmap(out, value->buffer_views[0].data.as_uint8, value->length); - break; - } - - // Write the TYPE_ID element if required - switch (value->storage_type) { - case NANOARROW_TYPE_SPARSE_UNION: - case NANOARROW_TYPE_DENSE_UNION: - out << R"(, "TYPE_ID": )"; - NANOARROW_RETURN_NOT_OK(WriteOffsetOrTypeID(out, value->buffer_views[0])); - break; - default: - break; - } - - // Write the OFFSET element if required - switch (value->storage_type) { - case NANOARROW_TYPE_BINARY: - case NANOARROW_TYPE_STRING: - case NANOARROW_TYPE_DENSE_UNION: - case NANOARROW_TYPE_LIST: - out << R"(, "OFFSET": )"; - NANOARROW_RETURN_NOT_OK( - WriteOffsetOrTypeID(out, value->buffer_views[1])); - break; - case NANOARROW_TYPE_LARGE_LIST: - case NANOARROW_TYPE_LARGE_BINARY: - case NANOARROW_TYPE_LARGE_STRING: - out << R"(, "OFFSET": )"; - NANOARROW_RETURN_NOT_OK( - WriteOffsetOrTypeID(out, value->buffer_views[1])); - break; - default: - break; - } - - // Write the DATA element if required - switch (value->storage_type) { - case NANOARROW_TYPE_NA: - case NANOARROW_TYPE_STRUCT: - case NANOARROW_TYPE_LIST: - case NANOARROW_TYPE_LARGE_LIST: - case NANOARROW_TYPE_FIXED_SIZE_LIST: - case NANOARROW_TYPE_DENSE_UNION: - case NANOARROW_TYPE_SPARSE_UNION: - break; - default: - out << R"(, "DATA": )"; - NANOARROW_RETURN_NOT_OK(WriteData(out, value)); - break; - } - - switch (value->storage_type) { - case NANOARROW_TYPE_STRUCT: - case NANOARROW_TYPE_LIST: - case NANOARROW_TYPE_LARGE_LIST: - case NANOARROW_TYPE_FIXED_SIZE_LIST: - case NANOARROW_TYPE_DENSE_UNION: - case NANOARROW_TYPE_SPARSE_UNION: - out << R"(, "children": )"; - NANOARROW_RETURN_NOT_OK(WriteChildren(out, field, value)); - break; - default: - break; - } - - out << "}"; - return NANOARROW_OK; - } - - private: - ArrowErrorCode WriteType(std::ostream& out, const ArrowSchemaView* field) { - ArrowType type; - if (field->extension_name.data != nullptr) { - type = field->storage_type; - } else { - type = field->type; - } - - out << "{"; - - switch (field->type) { - case NANOARROW_TYPE_NA: - out << R"("name": "null")"; - break; - case NANOARROW_TYPE_BOOL: - out << R"("name": "bool")"; - break; - case NANOARROW_TYPE_INT8: - case NANOARROW_TYPE_INT16: - case NANOARROW_TYPE_INT32: - case NANOARROW_TYPE_INT64: - out << R"("name": "int", "bitWidth": )" << field->layout.element_size_bits[1] - << R"(, "isSigned": true)"; - break; - case NANOARROW_TYPE_UINT8: - case NANOARROW_TYPE_UINT16: - case NANOARROW_TYPE_UINT64: - case NANOARROW_TYPE_UINT32: - out << R"("name": "int", "bitWidth": )" << field->layout.element_size_bits[1] - << R"(, "isSigned": false)"; - break; - case NANOARROW_TYPE_HALF_FLOAT: - out << R"("name": "floatingpoint", "precision": "HALF")"; - break; - case NANOARROW_TYPE_FLOAT: - out << R"("name": "floatingpoint", "precision": "SINGLE")"; - break; - case NANOARROW_TYPE_DOUBLE: - out << R"("name": "floatingpoint", "precision": "DOUBLE")"; - break; - case NANOARROW_TYPE_STRING: - out << R"("name": "utf8")"; - break; - case NANOARROW_TYPE_LARGE_STRING: - out << R"("name": "largeutf8")"; - break; - case NANOARROW_TYPE_BINARY: - out << R"("name": "binary")"; - break; - case NANOARROW_TYPE_LARGE_BINARY: - out << R"("name": "largebinary")"; - break; - case NANOARROW_TYPE_FIXED_SIZE_BINARY: - out << R"("name": "fixedsizebinary", "byteWidth": )" << field->fixed_size; - break; - case NANOARROW_TYPE_DECIMAL128: - case NANOARROW_TYPE_DECIMAL256: - out << R"("name": "decimal", "bitWidth": )" << field->decimal_bitwidth - << R"(, "precision": )" << field->decimal_precision << R"(, "scale": )" - << field->decimal_scale; - break; - case NANOARROW_TYPE_STRUCT: - out << R"("name": "struct")"; - break; - case NANOARROW_TYPE_LIST: - out << R"("name": "list")"; - break; - case NANOARROW_TYPE_MAP: - out << R"("name": "map", "keysSorted": )"; - if (field->schema->flags & ARROW_FLAG_MAP_KEYS_SORTED) { - out << "true"; - } else { - out << "false"; - } - break; - case NANOARROW_TYPE_LARGE_LIST: - out << R"("name": "largelist")"; - break; - case NANOARROW_TYPE_FIXED_SIZE_LIST: - out << R"("name": "fixedsizelist", "listSize": )" - << field->layout.child_size_elements; - break; - case NANOARROW_TYPE_DENSE_UNION: - out << R"("name": "union", "mode": "DENSE", "typeIds": [)" - << field->union_type_ids << "]"; - break; - case NANOARROW_TYPE_SPARSE_UNION: - out << R"("name": "union", "mode": "SPARSE", "typeIds": [)" - << field->union_type_ids << "]"; - break; - - default: - // Not supported - return ENOTSUP; - } - - out << "}"; - return NANOARROW_OK; - } - - ArrowErrorCode WriteMetadata(std::ostream& out, const char* metadata) { - if (metadata == nullptr) { - out << "null"; - return NANOARROW_OK; - } - - ArrowMetadataReader reader; - NANOARROW_RETURN_NOT_OK(ArrowMetadataReaderInit(&reader, metadata)); - if (reader.remaining_keys == 0) { - out << "[]"; - return NANOARROW_OK; - } - - out << "["; - NANOARROW_RETURN_NOT_OK(WriteMetadataItem(out, &reader)); - while (reader.remaining_keys > 0) { - out << ", "; - NANOARROW_RETURN_NOT_OK(WriteMetadataItem(out, &reader)); - } - - out << "]"; - return NANOARROW_OK; - } - - ArrowErrorCode WriteMetadataItem(std::ostream& out, ArrowMetadataReader* reader) { - ArrowStringView key; - ArrowStringView value; - NANOARROW_RETURN_NOT_OK(ArrowMetadataReaderRead(reader, &key, &value)); - out << R"({"key": )"; - NANOARROW_RETURN_NOT_OK(WriteString(out, key)); - out << R"(, "value": )"; - NANOARROW_RETURN_NOT_OK(WriteString(out, value)); - out << "}"; - return NANOARROW_OK; - } - - void WriteBitmap(std::ostream& out, const uint8_t* bits, int64_t length) { - if (length == 0) { - out << "[]"; - return; - } - - out << "["; - - if (bits == nullptr) { - out << "1"; - for (int64_t i = 1; i < length; i++) { - out << ", 1"; - } - } else { - out << static_cast(ArrowBitGet(bits, 0)); - for (int64_t i = 1; i < length; i++) { - out << ", " << static_cast(ArrowBitGet(bits, i)); - } - } - - out << "]"; - } - - template - ArrowErrorCode WriteOffsetOrTypeID(std::ostream& out, ArrowBufferView content) { - if (content.size_bytes == 0) { - out << "[]"; - return NANOARROW_OK; - } - - const T* values = reinterpret_cast(content.data.data); - int64_t n_values = content.size_bytes / sizeof(T); - - out << "["; - - if (sizeof(T) == sizeof(int64_t)) { - // Ensure int64s are quoted (i.e, "123456") - out << R"(")" << values[0] << R"(")"; - for (int64_t i = 1; i < n_values; i++) { - out << R"(, ")" << values[i] << R"(")"; - } - } else { - // No need to quote smaller ints (i.e., 123456) - out << values[0]; - for (int64_t i = 1; i < n_values; i++) { - out << ", " << static_cast(values[i]); - } - } - - out << "]"; - return NANOARROW_OK; - } - - ArrowErrorCode WriteData(std::ostream& out, ArrowArrayView* value) { - if (value->length == 0) { - out << "[]"; - return NANOARROW_OK; - } - - out << "["; - - switch (value->storage_type) { - case NANOARROW_TYPE_BOOL: - case NANOARROW_TYPE_INT8: - case NANOARROW_TYPE_UINT8: - case NANOARROW_TYPE_INT16: - case NANOARROW_TYPE_UINT16: - case NANOARROW_TYPE_INT32: - case NANOARROW_TYPE_UINT32: - // Regular JSON integers (i.e., 123456) - out << ArrowArrayViewGetIntUnsafe(value, 0); - for (int64_t i = 1; i < value->length; i++) { - out << ", " << ArrowArrayViewGetIntUnsafe(value, i); - } - break; - case NANOARROW_TYPE_INT64: - // Quoted integers to avoid overflow (i.e., "123456") - out << R"(")" << ArrowArrayViewGetIntUnsafe(value, 0) << R"(")"; - for (int64_t i = 1; i < value->length; i++) { - out << R"(, ")" << ArrowArrayViewGetIntUnsafe(value, i) << R"(")"; - } - break; - case NANOARROW_TYPE_UINT64: - // Quoted integers to avoid overflow (i.e., "123456") - out << R"(")" << ArrowArrayViewGetUIntUnsafe(value, 0) << R"(")"; - for (int64_t i = 1; i < value->length; i++) { - out << R"(, ")" << ArrowArrayViewGetUIntUnsafe(value, i) << R"(")"; - } - break; - - case NANOARROW_TYPE_FLOAT: - case NANOARROW_TYPE_DOUBLE: { - // JSON number to 3 decimal places - LocalizedStream local_stream_opt(out); - local_stream_opt.SetFixed(3); - - out << ArrowArrayViewGetDoubleUnsafe(value, 0); - for (int64_t i = 1; i < value->length; i++) { - out << ", " << ArrowArrayViewGetDoubleUnsafe(value, i); - } - break; - } - - case NANOARROW_TYPE_STRING: - case NANOARROW_TYPE_LARGE_STRING: - NANOARROW_RETURN_NOT_OK( - WriteString(out, ArrowArrayViewGetStringUnsafe(value, 0))); - for (int64_t i = 1; i < value->length; i++) { - out << ", "; - NANOARROW_RETURN_NOT_OK( - WriteString(out, ArrowArrayViewGetStringUnsafe(value, i))); - } - break; - - case NANOARROW_TYPE_BINARY: - case NANOARROW_TYPE_LARGE_BINARY: - case NANOARROW_TYPE_FIXED_SIZE_BINARY: { - NANOARROW_RETURN_NOT_OK(WriteBytes(out, ArrowArrayViewGetBytesUnsafe(value, 0))); - for (int64_t i = 1; i < value->length; i++) { - out << ", "; - NANOARROW_RETURN_NOT_OK( - WriteBytes(out, ArrowArrayViewGetBytesUnsafe(value, i))); - } - break; - } - - default: - // Not supported - return ENOTSUP; - } - - out << "]"; - return NANOARROW_OK; - } - - ArrowErrorCode WriteString(std::ostream& out, ArrowStringView value) { - out << R"(")"; - - for (int64_t i = 0; i < value.size_bytes; i++) { - char c = value.data[i]; - if (c == '"') { - out << R"(\")"; - } else if (c == '\\') { - out << R"(\\)"; - } else if (c < 0) { - // Not supporting multibyte unicode yet - return ENOTSUP; - } else if (c < 20) { - // Data in the arrow-testing repo has a lot of content that requires escaping - // in this way (\uXXXX). - uint16_t utf16_bytes = static_cast(c); - - char utf16_esc[7]; - utf16_esc[6] = '\0'; - snprintf(utf16_esc, sizeof(utf16_esc), R"(\u%04x)", utf16_bytes); - out << utf16_esc; - } else { - out << c; - } - } - - out << R"(")"; - return NANOARROW_OK; - } - - ArrowErrorCode WriteBytes(std::ostream& out, ArrowBufferView value) { - out << R"(")"; - char hex[3]; - hex[2] = '\0'; - - for (int64_t i = 0; i < value.size_bytes; i++) { - snprintf(hex, sizeof(hex), "%02X", static_cast(value.data.as_uint8[i])); - out << hex; - } - out << R"(")"; - return NANOARROW_OK; - } - - ArrowErrorCode WriteChildren(std::ostream& out, const ArrowSchema* field, - ArrowArrayView* value) { - if (field->n_children == 0) { - out << "[]"; - return NANOARROW_OK; - } - - out << "["; - NANOARROW_RETURN_NOT_OK(WriteColumn(out, field->children[0], value->children[0])); - for (int64_t i = 1; i < field->n_children; i++) { - out << ", "; - NANOARROW_RETURN_NOT_OK(WriteColumn(out, field->children[i], value->children[i])); - } - out << "]"; - return NANOARROW_OK; - } - - class LocalizedStream { - public: - LocalizedStream(std::ostream& out) : out_(out) { - previous_locale_ = out.imbue(std::locale::classic()); - previous_precision_ = out.precision(); - fmt_flags_ = out.flags(); - out.setf(out.fixed); - } - - void SetFixed(int precision) { out_.precision(precision); } - - ~LocalizedStream() { - out_.flags(fmt_flags_); - out_.precision(previous_precision_); - out_.imbue(previous_locale_); - } - - private: - std::ostream& out_; - std::locale previous_locale_; - std::ios::fmtflags fmt_flags_; - std::streamsize previous_precision_; - }; -}; - -/// \brief Reader for the Arrow integration testing JSON format -class TestingJSONReader { - using json = nlohmann::json; - - public: - /// \brief Read JSON representing a Schema - /// - /// Reads a JSON object in the form `{"fields": [...], "metadata": [...]}`, - /// propagating `out` on success. - ArrowErrorCode ReadSchema(const std::string& schema_json, ArrowSchema* out, - ArrowError* error = nullptr) { - try { - auto obj = json::parse(schema_json); - nanoarrow::UniqueSchema schema; - - NANOARROW_RETURN_NOT_OK(SetSchema(schema.get(), obj, error)); - ArrowSchemaMove(schema.get(), out); - return NANOARROW_OK; - } catch (json::exception& e) { - ArrowErrorSet(error, "Exception in TestingJSONReader::ReadSchema(): %s", e.what()); - return EINVAL; - } - } - - /// \brief Read JSON representing a Field - /// - /// Read a JSON object in the form `{"name" : "col", "type": {...}, ...}`, - /// propagating `out` on success. - ArrowErrorCode ReadField(const std::string& field_json, ArrowSchema* out, - ArrowError* error = nullptr) { - try { - auto obj = json::parse(field_json); - nanoarrow::UniqueSchema schema; - - NANOARROW_RETURN_NOT_OK(SetField(schema.get(), obj, error)); - ArrowSchemaMove(schema.get(), out); - return NANOARROW_OK; - } catch (json::exception& e) { - ArrowErrorSet(error, "Exception in TestingJSONReader::ReadField(): %s", e.what()); - return EINVAL; - } - } - - /// \brief Read JSON representing a Column - /// - /// Read a JSON object in the form - /// `{"name": "col", "count": 123, "VALIDITY": [...], ...}`, propagating - /// `out` on success. - ArrowErrorCode ReadColumn(const std::string& column_json, const ArrowSchema* schema, - ArrowArray* out, ArrowError* error = nullptr) { - try { - auto obj = json::parse(column_json); - - // ArrowArrayView to enable validation - nanoarrow::UniqueArrayView array_view; - NANOARROW_RETURN_NOT_OK(ArrowArrayViewInitFromSchema( - array_view.get(), const_cast(schema), error)); - - // ArrowArray to hold memory - nanoarrow::UniqueArray array; - NANOARROW_RETURN_NOT_OK( - ArrowArrayInitFromSchema(array.get(), const_cast(schema), error)); - - // Parse the JSON into the array - NANOARROW_RETURN_NOT_OK(SetArrayColumn(obj, array_view.get(), array.get(), error)); - - // Return the result - ArrowArrayMove(array.get(), out); - return NANOARROW_OK; - } catch (json::exception& e) { - ArrowErrorSet(error, "Exception in TestingJSONReader::ReadColumn(): %s", e.what()); - return EINVAL; - } - } - - private: - ArrowErrorCode SetSchema(ArrowSchema* schema, const json& value, ArrowError* error) { - NANOARROW_RETURN_NOT_OK( - Check(value.is_object(), error, "Expected Schema to be a JSON object")); - NANOARROW_RETURN_NOT_OK( - Check(value.contains("fields"), error, "Schema missing key 'fields'")); - NANOARROW_RETURN_NOT_OK( - Check(value.contains("metadata"), error, "Schema missing key 'metadata'")); - - NANOARROW_RETURN_NOT_OK_WITH_ERROR( - ArrowSchemaInitFromType(schema, NANOARROW_TYPE_STRUCT), error); - - const auto& fields = value["fields"]; - NANOARROW_RETURN_NOT_OK( - Check(fields.is_array(), error, "Schema fields must be array")); - NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaAllocateChildren(schema, fields.size()), - error); - for (int64_t i = 0; i < schema->n_children; i++) { - NANOARROW_RETURN_NOT_OK(SetField(schema->children[i], fields[i], error)); - } - - NANOARROW_RETURN_NOT_OK(SetMetadata(schema, value["metadata"], error)); - - // Validate! - ArrowSchemaView schema_view; - NANOARROW_RETURN_NOT_OK(ArrowSchemaViewInit(&schema_view, schema, error)); - return NANOARROW_OK; - } - - ArrowErrorCode SetField(ArrowSchema* schema, const json& value, ArrowError* error) { - NANOARROW_RETURN_NOT_OK( - Check(value.is_object(), error, "Expected Field to be a JSON object")); - NANOARROW_RETURN_NOT_OK( - Check(value.contains("name"), error, "Field missing key 'name'")); - NANOARROW_RETURN_NOT_OK( - Check(value.contains("nullable"), error, "Field missing key 'nullable'")); - NANOARROW_RETURN_NOT_OK( - Check(value.contains("type"), error, "Field missing key 'type'")); - NANOARROW_RETURN_NOT_OK( - Check(value.contains("children"), error, "Field missing key 'children'")); - NANOARROW_RETURN_NOT_OK( - Check(value.contains("metadata"), error, "Field missing key 'metadata'")); - - ArrowSchemaInit(schema); - - const auto& name = value["name"]; - NANOARROW_RETURN_NOT_OK(Check(name.is_string() || name.is_null(), error, - "Field name must be string or null")); - if (name.is_string()) { - auto name_str = name.get(); - NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaSetName(schema, name_str.c_str()), - error); - } - - const auto& nullable = value["nullable"]; - NANOARROW_RETURN_NOT_OK( - Check(nullable.is_boolean(), error, "Field nullable must be boolean")); - if (nullable.get()) { - schema->flags |= ARROW_FLAG_NULLABLE; - } else { - schema->flags &= ~ARROW_FLAG_NULLABLE; - } - - NANOARROW_RETURN_NOT_OK(SetType(schema, value["type"], error)); - - const auto& children = value["children"]; - NANOARROW_RETURN_NOT_OK( - Check(children.is_array(), error, "Field children must be array")); - NANOARROW_RETURN_NOT_OK_WITH_ERROR( - ArrowSchemaAllocateChildren(schema, children.size()), error); - for (int64_t i = 0; i < schema->n_children; i++) { - NANOARROW_RETURN_NOT_OK(SetField(schema->children[i], children[i], error)); - } - - NANOARROW_RETURN_NOT_OK(SetMetadata(schema, value["metadata"], error)); - - // Validate! - ArrowSchemaView schema_view; - NANOARROW_RETURN_NOT_OK(ArrowSchemaViewInit(&schema_view, schema, error)); - return NANOARROW_OK; - } - - ArrowErrorCode SetType(ArrowSchema* schema, const json& value, ArrowError* error) { - NANOARROW_RETURN_NOT_OK(Check(value.is_object(), error, "Type must be object")); - NANOARROW_RETURN_NOT_OK( - Check(value.contains("name"), error, "Type missing key 'name'")); - - const auto& name = value["name"]; - NANOARROW_RETURN_NOT_OK(Check(name.is_string(), error, "Type name must be string")); - auto name_str = name.get(); - - if (name_str == "null") { - NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaSetType(schema, NANOARROW_TYPE_NA), - error); - } else if (name_str == "bool") { - NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaSetType(schema, NANOARROW_TYPE_BOOL), - error); - } else if (name_str == "int") { - NANOARROW_RETURN_NOT_OK(SetTypeInt(schema, value, error)); - } else if (name_str == "floatingpoint") { - NANOARROW_RETURN_NOT_OK(SetTypeFloatingPoint(schema, value, error)); - } else if (name_str == "decimal") { - NANOARROW_RETURN_NOT_OK(SetTypeDecimal(schema, value, error)); - } else if (name_str == "utf8") { - NANOARROW_RETURN_NOT_OK_WITH_ERROR( - ArrowSchemaSetType(schema, NANOARROW_TYPE_STRING), error); - } else if (name_str == "largeutf8") { - NANOARROW_RETURN_NOT_OK_WITH_ERROR( - ArrowSchemaSetType(schema, NANOARROW_TYPE_LARGE_STRING), error); - } else if (name_str == "binary") { - NANOARROW_RETURN_NOT_OK_WITH_ERROR( - ArrowSchemaSetType(schema, NANOARROW_TYPE_BINARY), error); - } else if (name_str == "largebinary") { - NANOARROW_RETURN_NOT_OK_WITH_ERROR( - ArrowSchemaSetType(schema, NANOARROW_TYPE_LARGE_BINARY), error); - } else if (name_str == "fixedsizebinary") { - NANOARROW_RETURN_NOT_OK(SetTypeFixedSizeBinary(schema, value, error)); - } else if (name_str == "list") { - NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaSetFormat(schema, "+l"), error); - } else if (name_str == "largelist") { - NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaSetFormat(schema, "+L"), error); - } else if (name_str == "fixedsizelist") { - NANOARROW_RETURN_NOT_OK(SetTypeFixedSizeList(schema, value, error)); - } else if (name_str == "map") { - NANOARROW_RETURN_NOT_OK(SetTypeMap(schema, value, error)); - } else if (name_str == "union") { - NANOARROW_RETURN_NOT_OK(SetTypeUnion(schema, value, error)); - } else if (name_str == "struct") { - NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaSetFormat(schema, "+s"), error); - } else { - ArrowErrorSet(error, "Unsupported Type name: '%s'", name_str.c_str()); - return ENOTSUP; - } - - return NANOARROW_OK; - } - - ArrowErrorCode SetTypeInt(ArrowSchema* schema, const json& value, ArrowError* error) { - NANOARROW_RETURN_NOT_OK(Check(value.contains("bitWidth"), error, - "Type[name=='int'] missing key 'bitWidth'")); - NANOARROW_RETURN_NOT_OK(Check(value.contains("isSigned"), error, - "Type[name=='int'] missing key 'isSigned'")); - - const auto& bitwidth = value["bitWidth"]; - NANOARROW_RETURN_NOT_OK(Check(bitwidth.is_number_integer(), error, - "Type[name=='int'] bitWidth must be integer")); - - const auto& issigned = value["isSigned"]; - NANOARROW_RETURN_NOT_OK(Check(issigned.is_boolean(), error, - "Type[name=='int'] isSigned must be boolean")); - - ArrowType type = NANOARROW_TYPE_UNINITIALIZED; - if (issigned.get()) { - switch (bitwidth.get()) { - case 8: - type = NANOARROW_TYPE_INT8; - break; - case 16: - type = NANOARROW_TYPE_INT16; - break; - case 32: - type = NANOARROW_TYPE_INT32; - break; - case 64: - type = NANOARROW_TYPE_INT64; - break; - default: - ArrowErrorSet(error, "Type[name=='int'] bitWidth must be 8, 16, 32, or 64"); - return EINVAL; - } - } else { - switch (bitwidth.get()) { - case 8: - type = NANOARROW_TYPE_UINT8; - break; - case 16: - type = NANOARROW_TYPE_UINT16; - break; - case 32: - type = NANOARROW_TYPE_UINT32; - break; - case 64: - type = NANOARROW_TYPE_UINT64; - break; - default: - ArrowErrorSet(error, "Type[name=='int'] bitWidth must be 8, 16, 32, or 64"); - return EINVAL; - } - } - - NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaSetType(schema, type), error); - return NANOARROW_OK; - } - - ArrowErrorCode SetTypeFloatingPoint(ArrowSchema* schema, const json& value, - ArrowError* error) { - NANOARROW_RETURN_NOT_OK(Check(value.contains("precision"), error, - "Type[name=='floatingpoint'] missing key 'precision'")); - - const auto& precision = value["precision"]; - NANOARROW_RETURN_NOT_OK(Check(precision.is_string(), error, - "Type[name=='floatingpoint'] bitWidth must be string")); - - ArrowType type = NANOARROW_TYPE_UNINITIALIZED; - auto precision_str = precision.get(); - if (precision_str == "HALF") { - type = NANOARROW_TYPE_HALF_FLOAT; - } else if (precision_str == "SINGLE") { - type = NANOARROW_TYPE_FLOAT; - } else if (precision_str == "DOUBLE") { - type = NANOARROW_TYPE_DOUBLE; - } else { - ArrowErrorSet( - error, - "Type[name=='floatingpoint'] precision must be 'HALF', 'SINGLE', or 'DOUBLE'"); - return EINVAL; - } - - NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaSetType(schema, type), error); - return NANOARROW_OK; - } - - ArrowErrorCode SetTypeFixedSizeBinary(ArrowSchema* schema, const json& value, - ArrowError* error) { - NANOARROW_RETURN_NOT_OK( - Check(value.contains("byteWidth"), error, - "Type[name=='fixedsizebinary'] missing key 'byteWidth'")); - - const auto& byteWidth = value["byteWidth"]; - NANOARROW_RETURN_NOT_OK( - Check(byteWidth.is_number_integer(), error, - "Type[name=='fixedsizebinary'] byteWidth must be integer")); - - NANOARROW_RETURN_NOT_OK_WITH_ERROR( - ArrowSchemaSetTypeFixedSize(schema, NANOARROW_TYPE_FIXED_SIZE_BINARY, - byteWidth.get()), - error); - return NANOARROW_OK; - } - - ArrowErrorCode SetTypeDecimal(ArrowSchema* schema, const json& value, - ArrowError* error) { - NANOARROW_RETURN_NOT_OK(Check(value.contains("bitWidth"), error, - "Type[name=='decimal'] missing key 'bitWidth'")); - NANOARROW_RETURN_NOT_OK(Check(value.contains("precision"), error, - "Type[name=='decimal'] missing key 'precision'")); - NANOARROW_RETURN_NOT_OK(Check(value.contains("scale"), error, - "Type[name=='decimal'] missing key 'scale'")); - - const auto& bitWidth = value["bitWidth"]; - NANOARROW_RETURN_NOT_OK(Check(bitWidth.is_number_integer(), error, - "Type[name=='decimal'] bitWidth must be integer")); - - ArrowType type; - switch (bitWidth.get()) { - case 128: - type = NANOARROW_TYPE_DECIMAL128; - break; - case 256: - type = NANOARROW_TYPE_DECIMAL256; - break; - default: - ArrowErrorSet(error, "Type[name=='decimal'] bitWidth must be 128 or 256"); - return EINVAL; - } - - const auto& precision = value["precision"]; - NANOARROW_RETURN_NOT_OK(Check(precision.is_number_integer(), error, - "Type[name=='decimal'] precision must be integer")); - - const auto& scale = value["scale"]; - NANOARROW_RETURN_NOT_OK(Check(scale.is_number_integer(), error, - "Type[name=='decimal'] scale must be integer")); - - NANOARROW_RETURN_NOT_OK_WITH_ERROR( - ArrowSchemaSetTypeDecimal(schema, type, precision.get(), scale.get()), - error); - - return NANOARROW_OK; - } - - ArrowErrorCode SetTypeMap(ArrowSchema* schema, const json& value, ArrowError* error) { - NANOARROW_RETURN_NOT_OK(Check(value.contains("keysSorted"), error, - "Type[name=='map'] missing key 'keysSorted'")); - - const auto& keys_sorted = value["keysSorted"]; - NANOARROW_RETURN_NOT_OK(Check(keys_sorted.is_boolean(), error, - "Type[name=='map'] keysSorted must be boolean")); - - if (keys_sorted.get()) { - schema->flags |= ARROW_FLAG_MAP_KEYS_SORTED; - } else { - schema->flags &= ~ARROW_FLAG_MAP_KEYS_SORTED; - } - - NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaSetFormat(schema, "+m"), error); - return NANOARROW_OK; - } - - ArrowErrorCode SetTypeFixedSizeList(ArrowSchema* schema, const json& value, - ArrowError* error) { - NANOARROW_RETURN_NOT_OK(Check(value.contains("listSize"), error, - "Type[name=='fixedsizelist'] missing key 'listSize'")); - - const auto& list_size = value["listSize"]; - NANOARROW_RETURN_NOT_OK( - Check(list_size.is_number_integer(), error, - "Type[name=='fixedsizelist'] listSize must be integer")); - - std::stringstream format_builder; - format_builder << "+w:" << list_size; - std::string format = format_builder.str(); - NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaSetFormat(schema, format.c_str()), - error); - return NANOARROW_OK; - } - - ArrowErrorCode SetTypeUnion(ArrowSchema* schema, const json& value, ArrowError* error) { - NANOARROW_RETURN_NOT_OK( - Check(value.contains("mode"), error, "Type[name=='union'] missing key 'mode'")); - NANOARROW_RETURN_NOT_OK(Check(value.contains("typeIds"), error, - "Type[name=='union'] missing key 'typeIds'")); - - const auto& mode = value["mode"]; - NANOARROW_RETURN_NOT_OK( - Check(mode.is_string(), error, "Type[name=='union'] mode must be string")); - - auto mode_str = mode.get(); - std::stringstream type_ids_format; - - if (mode_str == "DENSE") { - type_ids_format << "+ud:"; - } else if (mode_str == "SPARSE") { - type_ids_format << "+us:"; - } else { - ArrowErrorSet(error, "Type[name=='union'] mode must be 'DENSE' or 'SPARSE'"); - return EINVAL; - } - - const auto& type_ids = value["typeIds"]; - NANOARROW_RETURN_NOT_OK( - Check(type_ids.is_array(), error, "Type[name=='union'] typeIds must be array")); - - if (type_ids.size() > 0) { - for (size_t i = 0; i < type_ids.size(); i++) { - const auto& type_id = type_ids[i]; - NANOARROW_RETURN_NOT_OK( - Check(type_id.is_number_integer(), error, - "Type[name=='union'] typeIds item must be integer")); - type_ids_format << type_id; - - if ((i + 1) < type_ids.size()) { - type_ids_format << ","; - } - } - } - - std::string type_ids_format_str = type_ids_format.str(); - NANOARROW_RETURN_NOT_OK_WITH_ERROR( - ArrowSchemaSetFormat(schema, type_ids_format_str.c_str()), error); - - return NANOARROW_OK; - } - - ArrowErrorCode SetMetadata(ArrowSchema* schema, const json& value, ArrowError* error) { - NANOARROW_RETURN_NOT_OK(Check(value.is_null() || value.is_array(), error, - "Field or Schema metadata must be null or array")); - if (value.is_null()) { - return NANOARROW_OK; - } - - nanoarrow::UniqueBuffer metadata; - NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowMetadataBuilderInit(metadata.get(), nullptr), - error); - for (const auto& item : value) { - NANOARROW_RETURN_NOT_OK( - Check(item.is_object(), error, "metadata item must be object")); - NANOARROW_RETURN_NOT_OK( - Check(item.contains("key"), error, "metadata item missing key 'key'")); - NANOARROW_RETURN_NOT_OK( - Check(item.contains("value"), error, "metadata item missing key 'value'")); - - const auto& key = item["key"]; - const auto& value = item["value"]; - NANOARROW_RETURN_NOT_OK( - Check(key.is_string(), error, "metadata item key must be string")); - NANOARROW_RETURN_NOT_OK( - Check(value.is_string(), error, "metadata item value must be string")); - - auto key_str = key.get(); - auto value_str = value.get(); - NANOARROW_RETURN_NOT_OK_WITH_ERROR( - ArrowMetadataBuilderAppend(metadata.get(), ArrowCharView(key_str.c_str()), - ArrowCharView(value_str.c_str())), - error); - } - - NANOARROW_RETURN_NOT_OK_WITH_ERROR( - ArrowSchemaSetMetadata(schema, reinterpret_cast(metadata->data)), error); - return NANOARROW_OK; - } - - ArrowErrorCode SetArrayColumn(const json& value, ArrowArrayView* array_view, - ArrowArray* array, ArrowError* error, - const std::string& parent_error_prefix = "") { - NANOARROW_RETURN_NOT_OK( - Check(value.is_object(), error, "Expected Column to be a JSON object")); - - // Check + resolve name early to generate better error messages - NANOARROW_RETURN_NOT_OK( - Check(value.contains("name"), error, "Column missing key 'name'")); - - const auto& name = value["name"]; - NANOARROW_RETURN_NOT_OK(Check(name.is_null() || name.is_string(), error, - "Column name must be string or null")); - - std::string error_prefix; - if (name.is_string()) { - error_prefix = parent_error_prefix + "-> Column '" + name.get() + "' "; - } else { - error_prefix = parent_error_prefix + "-> Column "; - } - - // Check, resolve, and recurse children - NANOARROW_RETURN_NOT_OK( - Check(array_view->n_children == 0 || value.contains("children"), error, - error_prefix + "missing key children")); - - if (value.contains("children")) { - const auto& children = value["children"]; - NANOARROW_RETURN_NOT_OK( - Check(children.is_array(), error, error_prefix + "children must be array")); - NANOARROW_RETURN_NOT_OK(Check(children.size() == array_view->n_children, error, - error_prefix + "children has incorrect size")); - - for (int64_t i = 0; i < array_view->n_children; i++) { - NANOARROW_RETURN_NOT_OK(SetArrayColumn(children[i], array_view->children[i], - array->children[i], error, error_prefix)); - } - } - - // Build buffers - for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { - NANOARROW_RETURN_NOT_OK( - PrefixError(SetArrayColumnBuffers(value, array_view, array, i, error), error, - error_prefix)); - } - - // Check + resolve count - NANOARROW_RETURN_NOT_OK( - Check(value.contains("count"), error, error_prefix + "missing key 'count'")); - const auto& count = value["count"]; - NANOARROW_RETURN_NOT_OK( - Check(count.is_number_integer(), error, error_prefix + "count must be integer")); - array_view->length = count.get(); - - // Set ArrayView buffer views. This is because ArrowArrayInitFromSchema() doesn't - // support custom type ids for unions but the ArrayView does (otherwise - // ArrowArrayFinishBuilding() would work). - for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { - ArrowBuffer* buffer = ArrowArrayBuffer(array, i); - ArrowBufferView* buffer_view = array_view->buffer_views + i; - buffer_view->data.as_uint8 = buffer->data; - buffer_view->size_bytes = buffer->size_bytes; - } - - // Validate the array view - NANOARROW_RETURN_NOT_OK(PrefixError( - ArrowArrayViewValidate(array_view, NANOARROW_VALIDATION_LEVEL_FULL, error), error, - error_prefix + "failed to validate: ")); - - // Flush length and buffer pointers to the Array - array->length = array_view->length; - NANOARROW_RETURN_NOT_OK_WITH_ERROR( - ArrowArrayFinishBuilding(array, NANOARROW_VALIDATION_LEVEL_NONE, nullptr), error); - - return NANOARROW_OK; - } - - ArrowErrorCode SetArrayColumnBuffers(const json& value, ArrowArrayView* array_view, - ArrowArray* array, int buffer_i, - ArrowError* error) { - ArrowBuffer* buffer = ArrowArrayBuffer(array, buffer_i); - - switch (array_view->layout.buffer_type[buffer_i]) { - case NANOARROW_BUFFER_TYPE_VALIDITY: { - NANOARROW_RETURN_NOT_OK( - Check(value.contains("VALIDITY"), error, "missing key 'VALIDITY'")); - const auto& validity = value["VALIDITY"]; - NANOARROW_RETURN_NOT_OK( - SetBufferBitmap(validity, ArrowArrayValidityBitmap(array), error)); - break; - } - case NANOARROW_BUFFER_TYPE_TYPE_ID: { - NANOARROW_RETURN_NOT_OK( - Check(value.contains("TYPE_ID"), error, "missing key 'TYPE_ID'")); - const auto& type_id = value["TYPE_ID"]; - NANOARROW_RETURN_NOT_OK(SetBufferInt(type_id, buffer, error)); - break; - } - case NANOARROW_BUFFER_TYPE_UNION_OFFSET: { - NANOARROW_RETURN_NOT_OK( - Check(value.contains("OFFSET"), error, "missing key 'OFFSET'")); - const auto& offset = value["OFFSET"]; - NANOARROW_RETURN_NOT_OK(SetBufferInt(offset, buffer, error)); - break; - } - case NANOARROW_BUFFER_TYPE_DATA_OFFSET: { - NANOARROW_RETURN_NOT_OK( - Check(value.contains("OFFSET"), error, "missing key 'OFFSET'")); - const auto& offset = value["OFFSET"]; - - if (array_view->layout.element_size_bits[buffer_i] == 32) { - NANOARROW_RETURN_NOT_OK(SetBufferInt(offset, buffer, error)); - } else { - NANOARROW_RETURN_NOT_OK(SetBufferInt(offset, buffer, error)); - } - break; - } - - case NANOARROW_BUFFER_TYPE_DATA: { - NANOARROW_RETURN_NOT_OK( - Check(value.contains("DATA"), error, "missing key 'DATA'")); - const auto& data = value["DATA"]; - - switch (array_view->storage_type) { - case NANOARROW_TYPE_BOOL: { - nanoarrow::UniqueBitmap bitmap; - NANOARROW_RETURN_NOT_OK(SetBufferBitmap(data, bitmap.get(), error)); - ArrowBufferMove(&bitmap->buffer, buffer); - return NANOARROW_OK; - } - case NANOARROW_TYPE_INT8: - return SetBufferInt(data, buffer, error); - case NANOARROW_TYPE_UINT8: - return SetBufferInt(data, buffer, error); - case NANOARROW_TYPE_INT16: - return SetBufferInt(data, buffer, error); - case NANOARROW_TYPE_UINT16: - return SetBufferInt(data, buffer, error); - case NANOARROW_TYPE_INT32: - return SetBufferInt(data, buffer, error); - case NANOARROW_TYPE_UINT32: - return SetBufferInt(data, buffer, error); - case NANOARROW_TYPE_INT64: - return SetBufferInt(data, buffer, error); - case NANOARROW_TYPE_UINT64: - return SetBufferInt(data, buffer, error); - - case NANOARROW_TYPE_FLOAT: - return SetBufferFloatingPoint(data, buffer, error); - case NANOARROW_TYPE_DOUBLE: - return SetBufferFloatingPoint(data, buffer, error); - - case NANOARROW_TYPE_STRING: - return SetBufferString(data, ArrowArrayBuffer(array, buffer_i - 1), - buffer, error); - case NANOARROW_TYPE_LARGE_STRING: - return SetBufferString(data, ArrowArrayBuffer(array, buffer_i - 1), - buffer, error); - case NANOARROW_TYPE_BINARY: - return SetBufferBinary(data, ArrowArrayBuffer(array, buffer_i - 1), - buffer, error); - case NANOARROW_TYPE_LARGE_BINARY: - return SetBufferBinary(data, ArrowArrayBuffer(array, buffer_i - 1), - buffer, error); - case NANOARROW_TYPE_FIXED_SIZE_BINARY: - return SetBufferFixedSizeBinary( - data, buffer, array_view->layout.element_size_bits[buffer_i] / 8, error); - - default: - ArrowErrorSet(error, "storage type %s DATA buffer not supported", - ArrowTypeString(array_view->storage_type)); - return ENOTSUP; - } - break; - } - case NANOARROW_BUFFER_TYPE_NONE: - break; - } - - return NANOARROW_OK; - } - - ArrowErrorCode SetBufferBitmap(const json& value, ArrowBitmap* bitmap, - ArrowError* error) { - NANOARROW_RETURN_NOT_OK( - Check(value.is_array(), error, "bitmap buffer must be array")); - - for (const auto& item : value) { - // Some example files write bitmaps as [true, false, true] but the documentation - // says [1, 0, 1]. Accept both for simplicity. - NANOARROW_RETURN_NOT_OK(Check(item.is_boolean() || item.is_number_integer(), error, - "bitmap item must be bool or integer")); - NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowBitmapAppend(bitmap, item.get(), 1), - error); - } - - return NANOARROW_OK; - } - - template - ArrowErrorCode SetBufferInt(const json& value, ArrowBuffer* buffer, ArrowError* error) { - NANOARROW_RETURN_NOT_OK(Check(value.is_array(), error, "int buffer must be array")); - - for (const auto& item : value) { - // NANOARROW_RETURN_NOT_OK() interacts poorly with multiple template args - ArrowErrorCode result = SetBufferIntItem(item, buffer, error); - NANOARROW_RETURN_NOT_OK(result); - } - - return NANOARROW_OK; - } - - template - ArrowErrorCode SetBufferIntItem(const json& item, ArrowBuffer* buffer, - ArrowError* error) { - if (item.is_string()) { - try { - // The JSON parser here can handle up to 2^64 - 1 - auto item_int = json::parse(item.get()); - return SetBufferIntItem(item_int, buffer, error); - } catch (json::parse_error& e) { - ArrowErrorSet(error, - "integer buffer item encoded as string must parse as integer: %s", - item.dump().c_str()); - return EINVAL; - } - } - - NANOARROW_RETURN_NOT_OK( - Check(item.is_number_integer(), error, - "integer buffer item must be integer number or string")); - NANOARROW_RETURN_NOT_OK( - Check(std::numeric_limits::is_signed || item.is_number_unsigned(), error, - "expected unsigned integer buffer item but found signed integer '" + - item.dump() + "'")); - - auto item_int = item.get(); - - NANOARROW_RETURN_NOT_OK( - Check(item_int >= std::numeric_limits::lowest() && - item_int <= std::numeric_limits::max(), - error, "integer buffer item '" + item.dump() + "' outside type limits")); - - T buffer_value = item_int; - NANOARROW_RETURN_NOT_OK_WITH_ERROR( - ArrowBufferAppend(buffer, &buffer_value, sizeof(T)), error); - - return NANOARROW_OK; - } - - template - ArrowErrorCode SetBufferFloatingPoint(const json& value, ArrowBuffer* buffer, - ArrowError* error) { - NANOARROW_RETURN_NOT_OK( - Check(value.is_array(), error, "floatingpoint buffer must be array")); - - for (const auto& item : value) { - NANOARROW_RETURN_NOT_OK( - Check(item.is_number(), error, "floatingpoint buffer item must be number")); - double item_dbl = item.get(); - - NANOARROW_RETURN_NOT_OK(Check( - item_dbl >= std::numeric_limits::lowest() && - item_dbl <= std::numeric_limits::max(), - error, "floatingpoint buffer item '" + item.dump() + "' outside type limits")); - - T buffer_value = item_dbl; - NANOARROW_RETURN_NOT_OK_WITH_ERROR( - ArrowBufferAppend(buffer, &buffer_value, sizeof(T)), error); - } - - return NANOARROW_OK; - } - - template - ArrowErrorCode SetBufferString(const json& value, ArrowBuffer* offsets, - ArrowBuffer* data, ArrowError* error) { - NANOARROW_RETURN_NOT_OK( - Check(value.is_array(), error, "utf8 data buffer must be array")); - - // Check offsets against values - const T* expected_offset = reinterpret_cast(offsets->data); - NANOARROW_RETURN_NOT_OK(Check( - offsets->size_bytes == ((value.size() + 1) * sizeof(T)), error, - "Expected offset buffer with " + std::to_string(value.size()) + " elements")); - NANOARROW_RETURN_NOT_OK( - Check(*expected_offset++ == 0, error, "first offset must be zero")); - - int64_t last_offset = 0; - - for (const auto& item : value) { - NANOARROW_RETURN_NOT_OK( - Check(item.is_string(), error, "utf8 data buffer item must be string")); - auto item_str = item.get(); - - // Append data - NANOARROW_RETURN_NOT_OK_WITH_ERROR( - ArrowBufferAppend(data, reinterpret_cast(item_str.data()), - item_str.size()), - error); - - // Check offset - last_offset += item_str.size(); - NANOARROW_RETURN_NOT_OK(Check(*expected_offset++ == last_offset, error, - "Expected offset value " + - std::to_string(last_offset) + - " at utf8 data buffer item " + item.dump())); - } - - return NANOARROW_OK; - } - - template - ArrowErrorCode SetBufferBinary(const json& value, ArrowBuffer* offsets, - ArrowBuffer* data, ArrowError* error) { - NANOARROW_RETURN_NOT_OK( - Check(value.is_array(), error, "binary data buffer must be array")); - - // Check offsets against values if not fixed size - const T* expected_offset = reinterpret_cast(offsets->data); - NANOARROW_RETURN_NOT_OK(Check( - offsets->size_bytes == ((value.size() + 1) * sizeof(T)), error, - "Expected offset buffer with " + std::to_string(value.size()) + " elements")); - NANOARROW_RETURN_NOT_OK( - Check(*expected_offset++ == 0, error, "first offset must be zero")); - - for (const auto& item : value) { - NANOARROW_RETURN_NOT_OK(AppendBinaryElement(item, data, error)); - - // Check offset - NANOARROW_RETURN_NOT_OK(Check(*expected_offset++ == data->size_bytes, error, - "Expected offset value " + - std::to_string(data->size_bytes) + - " at binary data buffer item " + item.dump())); - } - - return NANOARROW_OK; - } - - ArrowErrorCode SetBufferFixedSizeBinary(const json& value, ArrowBuffer* data, - int64_t fixed_size, ArrowError* error) { - NANOARROW_RETURN_NOT_OK( - Check(value.is_array(), error, "binary data buffer must be array")); - - int64_t last_offset = 0; - - for (const auto& item : value) { - NANOARROW_RETURN_NOT_OK(AppendBinaryElement(item, data, error)); - int64_t item_size_bytes = data->size_bytes - last_offset; - - NANOARROW_RETURN_NOT_OK(Check(item_size_bytes == fixed_size, error, - "Expected fixed size binary value of size " + - std::to_string(fixed_size) + - " at binary data buffer item " + item.dump())); - last_offset = data->size_bytes; - } - - return NANOARROW_OK; - } - - ArrowErrorCode AppendBinaryElement(const json& item, ArrowBuffer* data, - ArrowError* error) { - NANOARROW_RETURN_NOT_OK( - Check(item.is_string(), error, "binary data buffer item must be string")); - auto item_str = item.get(); - - int64_t item_size_bytes = item_str.size() / 2; - NANOARROW_RETURN_NOT_OK(Check((item_size_bytes * 2) == item_str.size(), error, - "binary data buffer item must have even size")); - - NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowBufferReserve(data, item_size_bytes), error); - for (int64_t i = 0; i < item_str.size(); i += 2) { - std::string byte_hex = item_str.substr(i, 2); - char* end_ptr; - uint8_t byte = std::strtoul(byte_hex.data(), &end_ptr, 16); - NANOARROW_RETURN_NOT_OK( - Check(end_ptr == (byte_hex.data() + 2), error, - "binary data buffer item must contain a valid hex-encoded byte string")); - - data->data[data->size_bytes] = byte; - data->size_bytes++; - } - - return NANOARROW_OK; - } - - ArrowErrorCode PrefixError(ArrowErrorCode value, ArrowError* error, - const std::string& prefix) { - if (value != NANOARROW_OK && error != nullptr) { - std::string msg = prefix + error->message; - ArrowErrorSet(error, "%s", msg.c_str()); - } - - return value; - } - - ArrowErrorCode Check(bool value, ArrowError* error, const std::string& err) { - if (value) { - return NANOARROW_OK; - } else { - ArrowErrorSet(error, "%s", err.c_str()); - return EINVAL; - } - } -}; - -/// @} - -} // namespace testing -} // namespace nanoarrow - -#endif From 8350dd026738e8b9fbeb100c78213afd60e6142a Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 29 Nov 2023 11:50:13 -0400 Subject: [PATCH 18/19] ignore testing file --- python/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/python/.gitignore b/python/.gitignore index d30e19868..092798027 100644 --- a/python/.gitignore +++ b/python/.gitignore @@ -19,6 +19,7 @@ src/nanoarrow/nanoarrow.c src/nanoarrow/nanoarrow.h src/nanoarrow/nanoarrow_device.h +src/nanoarrow/nanoarrow_testing.hpp src/nanoarrow/nanoarrow_c.pxd src/nanoarrow/*.c From e1afd57ac97d5b5a6900b94f63c34505087becf2 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 29 Nov 2023 14:31:23 -0400 Subject: [PATCH 19/19] test union roundtrips --- src/nanoarrow/nanoarrow_testing.hpp | 4 ++-- src/nanoarrow/nanoarrow_testing_test.cc | 28 ++++++++++++++++++++++--- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/src/nanoarrow/nanoarrow_testing.hpp b/src/nanoarrow/nanoarrow_testing.hpp index bfb15462d..103f22e92 100644 --- a/src/nanoarrow/nanoarrow_testing.hpp +++ b/src/nanoarrow/nanoarrow_testing.hpp @@ -440,7 +440,7 @@ class TestingJSONWriter { } } else { // No need to quote smaller ints (i.e., 123456) - out << values[0]; + out << static_cast(values[0]); for (int64_t i = 1; i < n_values; i++) { out << ", " << static_cast(values[i]); } @@ -1189,7 +1189,7 @@ class TestingJSONReader { NANOARROW_RETURN_NOT_OK( Check(value.contains("TYPE_ID"), error, "missing key 'TYPE_ID'")); const auto& type_id = value["TYPE_ID"]; - NANOARROW_RETURN_NOT_OK(SetBufferInt(type_id, buffer, error)); + NANOARROW_RETURN_NOT_OK(SetBufferInt(type_id, buffer, error)); break; } case NANOARROW_BUFFER_TYPE_UNION_OFFSET: { diff --git a/src/nanoarrow/nanoarrow_testing_test.cc b/src/nanoarrow/nanoarrow_testing_test.cc index d9ed452eb..0b8f733b3 100644 --- a/src/nanoarrow/nanoarrow_testing_test.cc +++ b/src/nanoarrow/nanoarrow_testing_test.cc @@ -997,17 +997,39 @@ TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldFixedSizeList) { } TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldUnion) { + // Empty unions + TestFieldRoundtrip( + R"({"name": null, "nullable": true, "type": {"name": "union", "mode": "DENSE", "typeIds": []}, "children": [], "metadata": null})", + R"({"name": null, "count": 0, "TYPE_ID": [], "OFFSET": [], "children": []})"); + TestFieldRoundtrip( + R"({"name": null, "nullable": true, "type": {"name": "union", "mode": "SPARSE", "typeIds": []}, "children": [], "metadata": null})", + R"({"name": null, "count": 0, "TYPE_ID": [], "children": []})"); + TestFieldRoundtrip( R"({"name": null, "nullable": true, "type": {"name": "union", "mode": "DENSE", "typeIds": [10,20]}, "children": [)" R"({"name": null, "nullable": true, "type": {"name": "null"}, "children": [], "metadata": null}, )" R"({"name": null, "nullable": true, "type": {"name": "utf8"}, "children": [], "metadata": null})" R"(], "metadata": null})"); + // Non-empty unions (null, "abc") TestFieldRoundtrip( R"({"name": null, "nullable": true, "type": {"name": "union", "mode": "SPARSE", "typeIds": [10,20]}, "children": [)" - R"({"name": null, "nullable": true, "type": {"name": "null"}, "children": [], "metadata": null}, )" - R"({"name": null, "nullable": true, "type": {"name": "utf8"}, "children": [], "metadata": null})" - R"(], "metadata": null})"); + R"({"name": "nulls", "nullable": true, "type": {"name": "null"}, "children": [], "metadata": null}, )" + R"({"name": "strings", "nullable": true, "type": {"name": "utf8"}, "children": [], "metadata": null})" + R"(], "metadata": null})", + R"({"name": null, "count": 2, "TYPE_ID": [20, 10], "children": [)" + R"({"name": "nulls", "count": 2}, )" + R"({"name": "strings", "count": 2, "VALIDITY": [1, 1], "OFFSET": [0, 3, 3], "DATA": ["abc", ""]})" + R"(]})"); + TestFieldRoundtrip( + R"({"name": null, "nullable": true, "type": {"name": "union", "mode": "DENSE", "typeIds": [10,20]}, "children": [)" + R"({"name": "nulls", "nullable": true, "type": {"name": "null"}, "children": [], "metadata": null}, )" + R"({"name": "strings", "nullable": true, "type": {"name": "utf8"}, "children": [], "metadata": null})" + R"(], "metadata": null})", + R"({"name": null, "count": 2, "TYPE_ID": [20, 10], "OFFSET": [0, 0], "children": [)" + R"({"name": "nulls", "count": 1}, )" + R"({"name": "strings", "count": 1, "VALIDITY": [1], "OFFSET": [0, 3], "DATA": ["abc"]})" + R"(]})"); TestTypeError(R"({"name": "union", "mode": "NOT_A_MODE", "typeIds": []})", "Type[name=='union'] mode must be 'DENSE' or 'SPARSE'");