From e0329f4e9188d198680439cba74006173b7c02aa Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 29 Nov 2023 15:37:43 -0400 Subject: [PATCH] feat: Add integration testing reader for Column (#325) --- .../src/nanoarrow/nanoarrow_device.c | 2 +- .../src/nanoarrow/nanoarrow_ipc_decoder.c | 4 +- python/.gitignore | 1 + python/bootstrap.py | 5 + src/nanoarrow/array.c | 33 +- src/nanoarrow/array_inline.h | 6 +- src/nanoarrow/array_test.cc | 4 +- src/nanoarrow/nanoarrow_testing.hpp | 451 +++++++++++++++++- src/nanoarrow/nanoarrow_testing_test.cc | 182 +++++-- src/nanoarrow/nanoarrow_types.h | 20 +- 10 files changed, 640 insertions(+), 68 deletions(-) diff --git a/extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.c b/extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.c index 4be7a9374..c4df3d144 100644 --- a/extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.c +++ b/extensions/nanoarrow_device/src/nanoarrow/nanoarrow_device.c @@ -408,7 +408,7 @@ static ArrowErrorCode ArrowDeviceArrayViewCopyInternal(struct ArrowDevice* devic dst->offset = src->offset; dst->null_count = src->null_count; - for (int i = 0; i < 3; i++) { + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { if (src->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_NONE) { break; } diff --git a/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder.c b/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder.c index 2fac3c7d7..9e4a6c9b5 100644 --- a/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder.c +++ b/extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder.c @@ -1155,7 +1155,7 @@ static void ArrowIpcDecoderInitFields(struct ArrowIpcField* fields, field->array = array; field->buffer_offset = *n_buffers; - for (int i = 0; i < 3; i++) { + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { *n_buffers += array_view->layout.buffer_type[i] != NANOARROW_BUFFER_TYPE_NONE; } @@ -1524,7 +1524,7 @@ static int ArrowIpcDecoderWalkSetArrayView(struct ArrowIpcArraySetter* setter, array_view->null_count = ns(FieldNode_null_count(field)); setter->field_i += 1; - for (int64_t i = 0; i < 3; i++) { + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_NONE) { break; } diff --git a/python/.gitignore b/python/.gitignore index d30e19868..092798027 100644 --- a/python/.gitignore +++ b/python/.gitignore @@ -19,6 +19,7 @@ src/nanoarrow/nanoarrow.c src/nanoarrow/nanoarrow.h src/nanoarrow/nanoarrow_device.h +src/nanoarrow/nanoarrow_testing.hpp src/nanoarrow/nanoarrow_c.pxd src/nanoarrow/*.c diff --git a/python/bootstrap.py b/python/bootstrap.py index 9e54cb704..bbb5d6693 100644 --- a/python/bootstrap.py +++ b/python/bootstrap.py @@ -36,6 +36,9 @@ def generate_nanoarrow_pxd(self, file_in, file_out): # Strip comments content = self.re_comment.sub("", content) + # Replace NANOARROW_MAX_FIXED_BUFFERS with its value + content = self.re_max_buffers.sub("3", content) + # Find types and function definitions types = self._find_types(content) func_defs = self._find_func_defs(content) @@ -59,6 +62,7 @@ def generate_nanoarrow_pxd(self, file_in, file_out): output.write(b"\n") output.write(b" ctypedef int ArrowErrorCode\n") output.write(b" cdef int NANOARROW_OK\n") + output.write(b" cdef int NANOARROW_MAX_FIXED_BUFFERS\n") output.write(b"\n") for type in types_cython: @@ -71,6 +75,7 @@ def generate_nanoarrow_pxd(self, file_in, file_out): def _define_regexes(self): self.re_comment = re.compile(r"\s*//[^\n]*") + self.re_max_buffers = re.compile(r"NANOARROW_MAX_FIXED_BUFFERS") self.re_type = re.compile( r"(?Pstruct|union|enum) (?PArrow[^ ]+) {(?P[^}]*)}" ) diff --git a/src/nanoarrow/array.c b/src/nanoarrow/array.c index 1e59777ea..a0e711eca 100644 --- a/src/nanoarrow/array.c +++ b/src/nanoarrow/array.c @@ -437,7 +437,7 @@ static void ArrowArrayFlushInternalPointers(struct ArrowArray* array) { struct ArrowArrayPrivateData* private_data = (struct ArrowArrayPrivateData*)array->private_data; - for (int64_t i = 0; i < 3; i++) { + for (int64_t i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { private_data->buffer_data[i] = ArrowArrayBuffer(array, i)->data; } @@ -621,7 +621,7 @@ void ArrowArrayViewReset(struct ArrowArrayView* array_view) { } void ArrowArrayViewSetLength(struct ArrowArrayView* array_view, int64_t length) { - for (int i = 0; i < 3; i++) { + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { int64_t element_size_bytes = array_view->layout.element_size_bits[i] / 8; switch (array_view->layout.buffer_type[i]) { @@ -671,26 +671,13 @@ void ArrowArrayViewSetLength(struct ArrowArrayView* array_view, int64_t length) static int ArrowArrayViewSetArrayInternal(struct ArrowArrayView* array_view, struct ArrowArray* array, struct ArrowError* error) { - // Check length and offset - if (array->offset < 0) { - ArrowErrorSet(error, "Expected array offset >= 0 but found array offset of %ld", - (long)array->offset); - return EINVAL; - } - - if (array->length < 0) { - ArrowErrorSet(error, "Expected array length >= 0 but found array length of %ld", - (long)array->length); - return EINVAL; - } - array_view->array = array; array_view->offset = array->offset; array_view->length = array->length; array_view->null_count = array->null_count; int64_t buffers_required = 0; - for (int i = 0; i < 3; i++) { + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_NONE) { break; } @@ -749,6 +736,18 @@ static int ArrowArrayViewSetArrayInternal(struct ArrowArrayView* array_view, static int ArrowArrayViewValidateMinimal(struct ArrowArrayView* array_view, struct ArrowError* error) { + if (array_view->length < 0) { + ArrowErrorSet(error, "Expected length >= 0 but found length %ld", + (long)array_view->length); + return EINVAL; + } + + if (array_view->offset < 0) { + ArrowErrorSet(error, "Expected offset >= 0 but found offset %ld", + (long)array_view->offset); + return EINVAL; + } + // Calculate buffer sizes that do not require buffer access. If marked as // unknown, assign the buffer size; otherwise, validate it. int64_t offset_plus_length = array_view->offset + array_view->length; @@ -1103,7 +1102,7 @@ static int ArrowAssertInt8In(struct ArrowBufferView view, const int8_t* values, static int ArrowArrayViewValidateFull(struct ArrowArrayView* array_view, struct ArrowError* error) { - for (int i = 0; i < 3; i++) { + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { switch (array_view->layout.buffer_type[i]) { case NANOARROW_BUFFER_TYPE_DATA_OFFSET: if (array_view->layout.element_size_bits[i] == 32) { diff --git a/src/nanoarrow/array_inline.h b/src/nanoarrow/array_inline.h index 96fdf573b..c089d2bf0 100644 --- a/src/nanoarrow/array_inline.h +++ b/src/nanoarrow/array_inline.h @@ -140,7 +140,7 @@ static inline ArrowErrorCode ArrowArrayStartAppending(struct ArrowArray* array) } // Initialize any data offset buffer with a single zero - for (int i = 0; i < 3; i++) { + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { if (private_data->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_DATA_OFFSET && private_data->layout.element_size_bits[i] == 64) { NANOARROW_RETURN_NOT_OK(ArrowBufferAppendInt64(ArrowArrayBuffer(array, i), 0)); @@ -163,7 +163,7 @@ static inline ArrowErrorCode ArrowArrayStartAppending(struct ArrowArray* array) } static inline ArrowErrorCode ArrowArrayShrinkToFit(struct ArrowArray* array) { - for (int64_t i = 0; i < 3; i++) { + for (int64_t i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { struct ArrowBuffer* buffer = ArrowArrayBuffer(array, i); NANOARROW_RETURN_NOT_OK(ArrowBufferResize(buffer, buffer->size_bytes, 1)); } @@ -278,7 +278,7 @@ static inline ArrowErrorCode _ArrowArrayAppendEmptyInternal(struct ArrowArray* a struct ArrowBuffer* buffer; int64_t size_bytes; - for (int i = 0; i < 3; i++) { + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { buffer = ArrowArrayBuffer(array, i); size_bytes = private_data->layout.element_size_bits[i] / 8; diff --git a/src/nanoarrow/array_test.cc b/src/nanoarrow/array_test.cc index 658595781..6a2200809 100644 --- a/src/nanoarrow/array_test.cc +++ b/src/nanoarrow/array_test.cc @@ -1589,12 +1589,12 @@ TEST(ArrayTest, ArrayViewTestBasic) { // Expect error for bad offset + length array.length = -1; EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), EINVAL); - EXPECT_STREQ(error.message, "Expected array length >= 0 but found array length of -1"); + EXPECT_STREQ(error.message, "Expected length >= 0 but found length -1"); array.length = 3; array.offset = -1; EXPECT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), EINVAL); - EXPECT_STREQ(error.message, "Expected array offset >= 0 but found array offset of -1"); + EXPECT_STREQ(error.message, "Expected offset >= 0 but found offset -1"); array.offset = 0; // Expect error for the wrong number of buffers diff --git a/src/nanoarrow/nanoarrow_testing.hpp b/src/nanoarrow/nanoarrow_testing.hpp index 454a6da04..103f22e92 100644 --- a/src/nanoarrow/nanoarrow_testing.hpp +++ b/src/nanoarrow/nanoarrow_testing.hpp @@ -16,6 +16,7 @@ // under the License. #include +#include #include #include @@ -439,7 +440,7 @@ class TestingJSONWriter { } } else { // No need to quote smaller ints (i.e., 123456) - out << values[0]; + out << static_cast(values[0]); for (int64_t i = 1; i < n_values; i++) { out << ", " << static_cast(values[i]); } @@ -621,36 +622,76 @@ class TestingJSONReader { using json = nlohmann::json; public: - ArrowErrorCode ReadSchema(const std::string& value, ArrowSchema* out, + /// \brief Read JSON representing a Schema + /// + /// Reads a JSON object in the form `{"fields": [...], "metadata": [...]}`, + /// propagating `out` on success. + ArrowErrorCode ReadSchema(const std::string& schema_json, ArrowSchema* out, ArrowError* error = nullptr) { try { - auto obj = json::parse(value); + auto obj = json::parse(schema_json); nanoarrow::UniqueSchema schema; NANOARROW_RETURN_NOT_OK(SetSchema(schema.get(), obj, error)); ArrowSchemaMove(schema.get(), out); return NANOARROW_OK; - } catch (std::exception& e) { + } catch (json::exception& e) { ArrowErrorSet(error, "Exception in TestingJSONReader::ReadSchema(): %s", e.what()); return EINVAL; } } - ArrowErrorCode ReadField(const std::string& value, ArrowSchema* out, + /// \brief Read JSON representing a Field + /// + /// Read a JSON object in the form `{"name" : "col", "type": {...}, ...}`, + /// propagating `out` on success. + ArrowErrorCode ReadField(const std::string& field_json, ArrowSchema* out, ArrowError* error = nullptr) { try { - auto obj = json::parse(value); + auto obj = json::parse(field_json); nanoarrow::UniqueSchema schema; NANOARROW_RETURN_NOT_OK(SetField(schema.get(), obj, error)); ArrowSchemaMove(schema.get(), out); return NANOARROW_OK; - } catch (std::exception& e) { + } catch (json::exception& e) { ArrowErrorSet(error, "Exception in TestingJSONReader::ReadField(): %s", e.what()); return EINVAL; } } + /// \brief Read JSON representing a Column + /// + /// Read a JSON object in the form + /// `{"name": "col", "count": 123, "VALIDITY": [...], ...}`, propagating + /// `out` on success. + ArrowErrorCode ReadColumn(const std::string& column_json, const ArrowSchema* schema, + ArrowArray* out, ArrowError* error = nullptr) { + try { + auto obj = json::parse(column_json); + + // ArrowArrayView to enable validation + nanoarrow::UniqueArrayView array_view; + NANOARROW_RETURN_NOT_OK(ArrowArrayViewInitFromSchema( + array_view.get(), const_cast(schema), error)); + + // ArrowArray to hold memory + nanoarrow::UniqueArray array; + NANOARROW_RETURN_NOT_OK( + ArrowArrayInitFromSchema(array.get(), const_cast(schema), error)); + + // Parse the JSON into the array + NANOARROW_RETURN_NOT_OK(SetArrayColumn(obj, array_view.get(), array.get(), error)); + + // Return the result + ArrowArrayMove(array.get(), out); + return NANOARROW_OK; + } catch (json::exception& e) { + ArrowErrorSet(error, "Exception in TestingJSONReader::ReadColumn(): %s", e.what()); + return EINVAL; + } + } + private: ArrowErrorCode SetSchema(ArrowSchema* schema, const json& value, ArrowError* error) { NANOARROW_RETURN_NOT_OK( @@ -1053,6 +1094,402 @@ class TestingJSONReader { return NANOARROW_OK; } + ArrowErrorCode SetArrayColumn(const json& value, ArrowArrayView* array_view, + ArrowArray* array, ArrowError* error, + const std::string& parent_error_prefix = "") { + NANOARROW_RETURN_NOT_OK( + Check(value.is_object(), error, "Expected Column to be a JSON object")); + + // Check + resolve name early to generate better error messages + NANOARROW_RETURN_NOT_OK( + Check(value.contains("name"), error, "Column missing key 'name'")); + + const auto& name = value["name"]; + NANOARROW_RETURN_NOT_OK(Check(name.is_null() || name.is_string(), error, + "Column name must be string or null")); + + std::string error_prefix; + if (name.is_string()) { + error_prefix = parent_error_prefix + "-> Column '" + name.get() + "' "; + } else { + error_prefix = parent_error_prefix + "-> Column "; + } + + // Check, resolve, and recurse children + NANOARROW_RETURN_NOT_OK( + Check(array_view->n_children == 0 || value.contains("children"), error, + error_prefix + "missing key children")); + + if (value.contains("children")) { + const auto& children = value["children"]; + NANOARROW_RETURN_NOT_OK( + Check(children.is_array(), error, error_prefix + "children must be array")); + NANOARROW_RETURN_NOT_OK(Check(children.size() == array_view->n_children, error, + error_prefix + "children has incorrect size")); + + for (int64_t i = 0; i < array_view->n_children; i++) { + NANOARROW_RETURN_NOT_OK(SetArrayColumn(children[i], array_view->children[i], + array->children[i], error, error_prefix)); + } + } + + // Build buffers + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { + NANOARROW_RETURN_NOT_OK( + PrefixError(SetArrayColumnBuffers(value, array_view, array, i, error), error, + error_prefix)); + } + + // Check + resolve count + NANOARROW_RETURN_NOT_OK( + Check(value.contains("count"), error, error_prefix + "missing key 'count'")); + const auto& count = value["count"]; + NANOARROW_RETURN_NOT_OK( + Check(count.is_number_integer(), error, error_prefix + "count must be integer")); + array_view->length = count.get(); + + // Set ArrayView buffer views. This is because ArrowArrayInitFromSchema() doesn't + // support custom type ids for unions but the ArrayView does (otherwise + // ArrowArrayFinishBuilding() would work). + for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) { + ArrowBuffer* buffer = ArrowArrayBuffer(array, i); + ArrowBufferView* buffer_view = array_view->buffer_views + i; + buffer_view->data.as_uint8 = buffer->data; + buffer_view->size_bytes = buffer->size_bytes; + } + + // Validate the array view + NANOARROW_RETURN_NOT_OK(PrefixError( + ArrowArrayViewValidate(array_view, NANOARROW_VALIDATION_LEVEL_FULL, error), error, + error_prefix + "failed to validate: ")); + + // Flush length and buffer pointers to the Array + array->length = array_view->length; + NANOARROW_RETURN_NOT_OK_WITH_ERROR( + ArrowArrayFinishBuilding(array, NANOARROW_VALIDATION_LEVEL_NONE, nullptr), error); + + return NANOARROW_OK; + } + + ArrowErrorCode SetArrayColumnBuffers(const json& value, ArrowArrayView* array_view, + ArrowArray* array, int buffer_i, + ArrowError* error) { + ArrowBuffer* buffer = ArrowArrayBuffer(array, buffer_i); + + switch (array_view->layout.buffer_type[buffer_i]) { + case NANOARROW_BUFFER_TYPE_VALIDITY: { + NANOARROW_RETURN_NOT_OK( + Check(value.contains("VALIDITY"), error, "missing key 'VALIDITY'")); + const auto& validity = value["VALIDITY"]; + NANOARROW_RETURN_NOT_OK( + SetBufferBitmap(validity, ArrowArrayValidityBitmap(array), error)); + break; + } + case NANOARROW_BUFFER_TYPE_TYPE_ID: { + NANOARROW_RETURN_NOT_OK( + Check(value.contains("TYPE_ID"), error, "missing key 'TYPE_ID'")); + const auto& type_id = value["TYPE_ID"]; + NANOARROW_RETURN_NOT_OK(SetBufferInt(type_id, buffer, error)); + break; + } + case NANOARROW_BUFFER_TYPE_UNION_OFFSET: { + NANOARROW_RETURN_NOT_OK( + Check(value.contains("OFFSET"), error, "missing key 'OFFSET'")); + const auto& offset = value["OFFSET"]; + NANOARROW_RETURN_NOT_OK(SetBufferInt(offset, buffer, error)); + break; + } + case NANOARROW_BUFFER_TYPE_DATA_OFFSET: { + NANOARROW_RETURN_NOT_OK( + Check(value.contains("OFFSET"), error, "missing key 'OFFSET'")); + const auto& offset = value["OFFSET"]; + + if (array_view->layout.element_size_bits[buffer_i] == 32) { + NANOARROW_RETURN_NOT_OK(SetBufferInt(offset, buffer, error)); + } else { + NANOARROW_RETURN_NOT_OK(SetBufferInt(offset, buffer, error)); + } + break; + } + + case NANOARROW_BUFFER_TYPE_DATA: { + NANOARROW_RETURN_NOT_OK( + Check(value.contains("DATA"), error, "missing key 'DATA'")); + const auto& data = value["DATA"]; + + switch (array_view->storage_type) { + case NANOARROW_TYPE_BOOL: { + nanoarrow::UniqueBitmap bitmap; + NANOARROW_RETURN_NOT_OK(SetBufferBitmap(data, bitmap.get(), error)); + ArrowBufferMove(&bitmap->buffer, buffer); + return NANOARROW_OK; + } + case NANOARROW_TYPE_INT8: + return SetBufferInt(data, buffer, error); + case NANOARROW_TYPE_UINT8: + return SetBufferInt(data, buffer, error); + case NANOARROW_TYPE_INT16: + return SetBufferInt(data, buffer, error); + case NANOARROW_TYPE_UINT16: + return SetBufferInt(data, buffer, error); + case NANOARROW_TYPE_INT32: + return SetBufferInt(data, buffer, error); + case NANOARROW_TYPE_UINT32: + return SetBufferInt(data, buffer, error); + case NANOARROW_TYPE_INT64: + return SetBufferInt(data, buffer, error); + case NANOARROW_TYPE_UINT64: + return SetBufferInt(data, buffer, error); + + case NANOARROW_TYPE_FLOAT: + return SetBufferFloatingPoint(data, buffer, error); + case NANOARROW_TYPE_DOUBLE: + return SetBufferFloatingPoint(data, buffer, error); + + case NANOARROW_TYPE_STRING: + return SetBufferString(data, ArrowArrayBuffer(array, buffer_i - 1), + buffer, error); + case NANOARROW_TYPE_LARGE_STRING: + return SetBufferString(data, ArrowArrayBuffer(array, buffer_i - 1), + buffer, error); + case NANOARROW_TYPE_BINARY: + return SetBufferBinary(data, ArrowArrayBuffer(array, buffer_i - 1), + buffer, error); + case NANOARROW_TYPE_LARGE_BINARY: + return SetBufferBinary(data, ArrowArrayBuffer(array, buffer_i - 1), + buffer, error); + case NANOARROW_TYPE_FIXED_SIZE_BINARY: + return SetBufferFixedSizeBinary( + data, buffer, array_view->layout.element_size_bits[buffer_i] / 8, error); + + default: + ArrowErrorSet(error, "storage type %s DATA buffer not supported", + ArrowTypeString(array_view->storage_type)); + return ENOTSUP; + } + break; + } + case NANOARROW_BUFFER_TYPE_NONE: + break; + } + + return NANOARROW_OK; + } + + ArrowErrorCode SetBufferBitmap(const json& value, ArrowBitmap* bitmap, + ArrowError* error) { + NANOARROW_RETURN_NOT_OK( + Check(value.is_array(), error, "bitmap buffer must be array")); + + for (const auto& item : value) { + // Some example files write bitmaps as [true, false, true] but the documentation + // says [1, 0, 1]. Accept both for simplicity. + NANOARROW_RETURN_NOT_OK(Check(item.is_boolean() || item.is_number_integer(), error, + "bitmap item must be bool or integer")); + NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowBitmapAppend(bitmap, item.get(), 1), + error); + } + + return NANOARROW_OK; + } + + template + ArrowErrorCode SetBufferInt(const json& value, ArrowBuffer* buffer, ArrowError* error) { + NANOARROW_RETURN_NOT_OK(Check(value.is_array(), error, "int buffer must be array")); + + for (const auto& item : value) { + // NANOARROW_RETURN_NOT_OK() interacts poorly with multiple template args + ArrowErrorCode result = SetBufferIntItem(item, buffer, error); + NANOARROW_RETURN_NOT_OK(result); + } + + return NANOARROW_OK; + } + + template + ArrowErrorCode SetBufferIntItem(const json& item, ArrowBuffer* buffer, + ArrowError* error) { + if (item.is_string()) { + try { + // The JSON parser here can handle up to 2^64 - 1 + auto item_int = json::parse(item.get()); + return SetBufferIntItem(item_int, buffer, error); + } catch (json::parse_error& e) { + ArrowErrorSet(error, + "integer buffer item encoded as string must parse as integer: %s", + item.dump().c_str()); + return EINVAL; + } + } + + NANOARROW_RETURN_NOT_OK( + Check(item.is_number_integer(), error, + "integer buffer item must be integer number or string")); + NANOARROW_RETURN_NOT_OK( + Check(std::numeric_limits::is_signed || item.is_number_unsigned(), error, + "expected unsigned integer buffer item but found signed integer '" + + item.dump() + "'")); + + auto item_int = item.get(); + + NANOARROW_RETURN_NOT_OK( + Check(item_int >= std::numeric_limits::lowest() && + item_int <= std::numeric_limits::max(), + error, "integer buffer item '" + item.dump() + "' outside type limits")); + + T buffer_value = item_int; + NANOARROW_RETURN_NOT_OK_WITH_ERROR( + ArrowBufferAppend(buffer, &buffer_value, sizeof(T)), error); + + return NANOARROW_OK; + } + + template + ArrowErrorCode SetBufferFloatingPoint(const json& value, ArrowBuffer* buffer, + ArrowError* error) { + NANOARROW_RETURN_NOT_OK( + Check(value.is_array(), error, "floatingpoint buffer must be array")); + + for (const auto& item : value) { + NANOARROW_RETURN_NOT_OK( + Check(item.is_number(), error, "floatingpoint buffer item must be number")); + double item_dbl = item.get(); + + NANOARROW_RETURN_NOT_OK(Check( + item_dbl >= std::numeric_limits::lowest() && + item_dbl <= std::numeric_limits::max(), + error, "floatingpoint buffer item '" + item.dump() + "' outside type limits")); + + T buffer_value = item_dbl; + NANOARROW_RETURN_NOT_OK_WITH_ERROR( + ArrowBufferAppend(buffer, &buffer_value, sizeof(T)), error); + } + + return NANOARROW_OK; + } + + template + ArrowErrorCode SetBufferString(const json& value, ArrowBuffer* offsets, + ArrowBuffer* data, ArrowError* error) { + NANOARROW_RETURN_NOT_OK( + Check(value.is_array(), error, "utf8 data buffer must be array")); + + // Check offsets against values + const T* expected_offset = reinterpret_cast(offsets->data); + NANOARROW_RETURN_NOT_OK(Check( + offsets->size_bytes == ((value.size() + 1) * sizeof(T)), error, + "Expected offset buffer with " + std::to_string(value.size()) + " elements")); + NANOARROW_RETURN_NOT_OK( + Check(*expected_offset++ == 0, error, "first offset must be zero")); + + int64_t last_offset = 0; + + for (const auto& item : value) { + NANOARROW_RETURN_NOT_OK( + Check(item.is_string(), error, "utf8 data buffer item must be string")); + auto item_str = item.get(); + + // Append data + NANOARROW_RETURN_NOT_OK_WITH_ERROR( + ArrowBufferAppend(data, reinterpret_cast(item_str.data()), + item_str.size()), + error); + + // Check offset + last_offset += item_str.size(); + NANOARROW_RETURN_NOT_OK(Check(*expected_offset++ == last_offset, error, + "Expected offset value " + + std::to_string(last_offset) + + " at utf8 data buffer item " + item.dump())); + } + + return NANOARROW_OK; + } + + template + ArrowErrorCode SetBufferBinary(const json& value, ArrowBuffer* offsets, + ArrowBuffer* data, ArrowError* error) { + NANOARROW_RETURN_NOT_OK( + Check(value.is_array(), error, "binary data buffer must be array")); + + // Check offsets against values if not fixed size + const T* expected_offset = reinterpret_cast(offsets->data); + NANOARROW_RETURN_NOT_OK(Check( + offsets->size_bytes == ((value.size() + 1) * sizeof(T)), error, + "Expected offset buffer with " + std::to_string(value.size()) + " elements")); + NANOARROW_RETURN_NOT_OK( + Check(*expected_offset++ == 0, error, "first offset must be zero")); + + for (const auto& item : value) { + NANOARROW_RETURN_NOT_OK(AppendBinaryElement(item, data, error)); + + // Check offset + NANOARROW_RETURN_NOT_OK(Check(*expected_offset++ == data->size_bytes, error, + "Expected offset value " + + std::to_string(data->size_bytes) + + " at binary data buffer item " + item.dump())); + } + + return NANOARROW_OK; + } + + ArrowErrorCode SetBufferFixedSizeBinary(const json& value, ArrowBuffer* data, + int64_t fixed_size, ArrowError* error) { + NANOARROW_RETURN_NOT_OK( + Check(value.is_array(), error, "binary data buffer must be array")); + + int64_t last_offset = 0; + + for (const auto& item : value) { + NANOARROW_RETURN_NOT_OK(AppendBinaryElement(item, data, error)); + int64_t item_size_bytes = data->size_bytes - last_offset; + + NANOARROW_RETURN_NOT_OK(Check(item_size_bytes == fixed_size, error, + "Expected fixed size binary value of size " + + std::to_string(fixed_size) + + " at binary data buffer item " + item.dump())); + last_offset = data->size_bytes; + } + + return NANOARROW_OK; + } + + ArrowErrorCode AppendBinaryElement(const json& item, ArrowBuffer* data, + ArrowError* error) { + NANOARROW_RETURN_NOT_OK( + Check(item.is_string(), error, "binary data buffer item must be string")); + auto item_str = item.get(); + + int64_t item_size_bytes = item_str.size() / 2; + NANOARROW_RETURN_NOT_OK(Check((item_size_bytes * 2) == item_str.size(), error, + "binary data buffer item must have even size")); + + NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowBufferReserve(data, item_size_bytes), error); + for (int64_t i = 0; i < item_str.size(); i += 2) { + std::string byte_hex = item_str.substr(i, 2); + char* end_ptr; + uint8_t byte = std::strtoul(byte_hex.data(), &end_ptr, 16); + NANOARROW_RETURN_NOT_OK( + Check(end_ptr == (byte_hex.data() + 2), error, + "binary data buffer item must contain a valid hex-encoded byte string")); + + data->data[data->size_bytes] = byte; + data->size_bytes++; + } + + return NANOARROW_OK; + } + + ArrowErrorCode PrefixError(ArrowErrorCode value, ArrowError* error, + const std::string& prefix) { + if (value != NANOARROW_OK && error != nullptr) { + std::string msg = prefix + error->message; + ArrowErrorSet(error, "%s", msg.c_str()); + } + + return value; + } + ArrowErrorCode Check(bool value, ArrowError* error, const std::string& err) { if (value) { return NANOARROW_OK; diff --git a/src/nanoarrow/nanoarrow_testing_test.cc b/src/nanoarrow/nanoarrow_testing_test.cc index 73c8f79ca..0b8f733b3 100644 --- a/src/nanoarrow/nanoarrow_testing_test.cc +++ b/src/nanoarrow/nanoarrow_testing_test.cc @@ -747,26 +747,87 @@ TEST(NanoarrowTestingTest, NanoarrowTestingTestReadFieldNested) { EXPECT_STREQ(schema->children[0]->format, "n"); } -void TestFieldRoundtrip(const std::string& field_json) { +TEST(NanoarrowTestingTest, NanoarrowTestingTestReadColumnBasic) { + nanoarrow::UniqueSchema schema; + nanoarrow::UniqueArray array; + ArrowError error; + error.message[0] = '\0'; + + TestingJSONReader reader; + + ASSERT_EQ( + reader.ReadField( + R"({"name": null, "nullable": true, "type": {"name": "null"}, "children": [], "metadata": null})", + schema.get()), + NANOARROW_OK); + + ASSERT_EQ(reader.ReadColumn(R"({"name": null, "count": 2})", schema.get(), array.get(), + &error), + NANOARROW_OK) + << error.message; + EXPECT_EQ(array->length, 2); + + // Check invalid JSON + EXPECT_EQ(reader.ReadColumn(R"({)", schema.get(), array.get()), EINVAL); + + // Check at least one failed Check() + EXPECT_EQ( + reader.ReadColumn(R"("this is not a JSON object")", schema.get(), array.get()), + EINVAL); + + // Check at least one failed PrefixError() + EXPECT_EQ(reader.ReadColumn(R"({"name": "colname", "count": "not an integer"})", + schema.get(), array.get(), &error), + EINVAL); + EXPECT_STREQ(error.message, "-> Column 'colname' count must be integer"); + + // Check that field is validated + EXPECT_EQ( + reader.ReadColumn(R"({"name": null, "count": -1})", schema.get(), array.get()), + EINVAL); +} + +void TestFieldRoundtrip(const std::string& field_json, + const std::string& column_json = "") { nanoarrow::UniqueSchema schema; TestingJSONReader reader; TestingJSONWriter writer; ArrowError error; error.message[0] = '\0'; - int result = reader.ReadField(field_json, schema.get(), &error); - ASSERT_EQ(result, NANOARROW_OK) << "Error: " << error.message; + ASSERT_EQ(reader.ReadField(field_json, schema.get(), &error), NANOARROW_OK) + << "Error: " << error.message; - std::stringstream field_json_roundtrip; - ASSERT_EQ(writer.WriteField(field_json_roundtrip, schema.get()), NANOARROW_OK); - EXPECT_EQ(field_json_roundtrip.str(), field_json); + std::stringstream json_roundtrip; + ASSERT_EQ(writer.WriteField(json_roundtrip, schema.get()), NANOARROW_OK); + EXPECT_EQ(json_roundtrip.str(), field_json); + + if (column_json == "") { + return; + } + + nanoarrow::UniqueArray array; + ASSERT_EQ(reader.ReadColumn(column_json, schema.get(), array.get(), &error), + NANOARROW_OK) + << error.message; + + nanoarrow::UniqueArrayView array_view; + ASSERT_EQ(ArrowArrayViewInitFromSchema(array_view.get(), schema.get(), nullptr), + NANOARROW_OK); + ASSERT_EQ(ArrowArrayViewSetArray(array_view.get(), array.get(), nullptr), NANOARROW_OK); + + json_roundtrip.str(""); + ASSERT_EQ(writer.WriteColumn(json_roundtrip, schema.get(), array_view.get()), + NANOARROW_OK); + EXPECT_EQ(json_roundtrip.str(), column_json); } -void TestTypeRoundtrip(const std::string& type_json) { +void TestTypeRoundtrip(const std::string& type_json, + const std::string& column_json = "") { std::stringstream field_json_builder; field_json_builder << R"({"name": null, "nullable": true, "type": )" << type_json << R"(, "children": [], "metadata": null})"; - TestFieldRoundtrip(field_json_builder.str()); + TestFieldRoundtrip(field_json_builder.str(), column_json); } void TestFieldError(const std::string& field_json, const std::string& msg, @@ -788,33 +849,65 @@ void TestTypeError(const std::string& type_json, const std::string& msg, TestFieldError(field_json_builder.str(), msg, code); } -TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldPrimitive) { - TestTypeRoundtrip(R"({"name": "null"})"); - TestTypeRoundtrip(R"({"name": "bool"})"); - TestTypeRoundtrip(R"({"name": "utf8"})"); - TestTypeRoundtrip(R"({"name": "largeutf8"})"); - TestTypeRoundtrip(R"({"name": "binary"})"); - TestTypeRoundtrip(R"({"name": "largebinary"})"); +TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldNull) { + TestTypeRoundtrip(R"({"name": "null"})", R"({"name": null, "count": 2})"); TestTypeError(R"({"name": "an unsupported type"})", "Unsupported Type name: 'an unsupported type'", ENOTSUP); } +TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldBool) { + TestTypeRoundtrip( + R"({"name": "bool"})", + R"({"name": null, "count": 3, "VALIDITY": [0, 1, 1], "DATA": [0, 1, 0]})"); +} + +TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldString) { + TestTypeRoundtrip( + R"({"name": "utf8"})", + R"({"name": null, "count": 2, "VALIDITY": [1, 0], "OFFSET": [0, 3, 3], "DATA": ["abc", ""]})"); + TestTypeRoundtrip( + R"({"name": "largeutf8"})", + R"({"name": null, "count": 2, "VALIDITY": [1, 0], "OFFSET": ["0", "3", "3"], "DATA": ["abc", ""]})"); + TestTypeRoundtrip( + R"({"name": "binary"})", + R"({"name": null, "count": 2, "VALIDITY": [1, 0], "OFFSET": [0, 3, 3], "DATA": ["00FFA0", ""]})"); + TestTypeRoundtrip( + R"({"name": "largebinary"})", + R"({"name": null, "count": 2, "VALIDITY": [1, 0], "OFFSET": ["0", "3", "3"], "DATA": ["00FFA0", ""]})"); +} + TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldInt) { - TestTypeRoundtrip(R"({"name": "int", "bitWidth": 8, "isSigned": true})"); - TestTypeRoundtrip(R"({"name": "int", "bitWidth": 16, "isSigned": true})"); - TestTypeRoundtrip(R"({"name": "int", "bitWidth": 32, "isSigned": true})"); - TestTypeRoundtrip(R"({"name": "int", "bitWidth": 64, "isSigned": true})"); + TestTypeRoundtrip( + R"({"name": "int", "bitWidth": 8, "isSigned": true})", + R"({"name": null, "count": 3, "VALIDITY": [1, 1, 1], "DATA": [-128, 0, 127]})"); + TestTypeRoundtrip( + R"({"name": "int", "bitWidth": 16, "isSigned": true})", + R"({"name": null, "count": 3, "VALIDITY": [1, 1, 1], "DATA": [-129, 0, 127]})"); + TestTypeRoundtrip( + R"({"name": "int", "bitWidth": 32, "isSigned": true})", + R"({"name": null, "count": 3, "VALIDITY": [1, 1, 1], "DATA": [-130, 0, 127]})"); + TestTypeRoundtrip( + R"({"name": "int", "bitWidth": 64, "isSigned": true})", + R"({"name": null, "count": 3, "VALIDITY": [1, 1, 1], "DATA": ["-131", "0", "127"]})"); TestTypeError(R"({"name": "int", "bitWidth": 1, "isSigned": true})", "Type[name=='int'] bitWidth must be 8, 16, 32, or 64"); } TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldUInt) { - TestTypeRoundtrip(R"({"name": "int", "bitWidth": 8, "isSigned": false})"); - TestTypeRoundtrip(R"({"name": "int", "bitWidth": 16, "isSigned": false})"); - TestTypeRoundtrip(R"({"name": "int", "bitWidth": 32, "isSigned": false})"); - TestTypeRoundtrip(R"({"name": "int", "bitWidth": 64, "isSigned": false})"); + TestTypeRoundtrip( + R"({"name": "int", "bitWidth": 8, "isSigned": false})", + R"({"name": null, "count": 3, "VALIDITY": [0, 1, 1], "DATA": [0, 0, 255]})"); + TestTypeRoundtrip( + R"({"name": "int", "bitWidth": 16, "isSigned": false})", + R"({"name": null, "count": 3, "VALIDITY": [0, 1, 1], "DATA": [0, 0, 256]})"); + TestTypeRoundtrip( + R"({"name": "int", "bitWidth": 32, "isSigned": false})", + R"({"name": null, "count": 3, "VALIDITY": [0, 1, 1], "DATA": [0, 0, 257]})"); + TestTypeRoundtrip( + R"({"name": "int", "bitWidth": 64, "isSigned": false})", + R"({"name": null, "count": 3, "VALIDITY": [0, 1, 1], "DATA": ["0", "0", "258"]})"); TestTypeError(R"({"name": "int", "bitWidth": 1, "isSigned": false})", "Type[name=='int'] bitWidth must be 8, 16, 32, or 64"); @@ -822,8 +915,12 @@ TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldUInt) { TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldFloatingPoint) { TestTypeRoundtrip(R"({"name": "floatingpoint", "precision": "HALF"})"); - TestTypeRoundtrip(R"({"name": "floatingpoint", "precision": "SINGLE"})"); - TestTypeRoundtrip(R"({"name": "floatingpoint", "precision": "DOUBLE"})"); + TestTypeRoundtrip( + R"({"name": "floatingpoint", "precision": "SINGLE"})", + R"({"name": null, "count": 3, "VALIDITY": [0, 1, 1], "DATA": [0.000, 1.230, 4.560]})"); + TestTypeRoundtrip( + R"({"name": "floatingpoint", "precision": "DOUBLE"})", + R"({"name": null, "count": 3, "VALIDITY": [0, 1, 1], "DATA": [0.000, 1.230, 4.560]})"); TestTypeError( R"({"name": "floatingpoint", "precision": "NOT_A_PRECISION"})", @@ -831,7 +928,9 @@ TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldFloatingPoint) { } TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldFixedSizeBinary) { - TestTypeRoundtrip(R"({"name": "fixedsizebinary", "byteWidth": 123})"); + TestTypeRoundtrip( + R"({"name": "fixedsizebinary", "byteWidth": 3})", + R"({"name": null, "count": 2, "VALIDITY": [1, 0], "DATA": ["00FFA0", "000000"]})"); } TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldDecimal) { @@ -868,7 +967,8 @@ TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldStruct) { // Empty TestFieldRoundtrip( R"({"name": null, "nullable": true, "type": {"name": "struct"}, "children": [)" - R"(], "metadata": null})"); + R"(], "metadata": null})", + R"({"name": null, "count": 0, "VALIDITY": [], "children": []})"); // Non-empty TestFieldRoundtrip( @@ -897,17 +997,39 @@ TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldFixedSizeList) { } TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldUnion) { + // Empty unions + TestFieldRoundtrip( + R"({"name": null, "nullable": true, "type": {"name": "union", "mode": "DENSE", "typeIds": []}, "children": [], "metadata": null})", + R"({"name": null, "count": 0, "TYPE_ID": [], "OFFSET": [], "children": []})"); + TestFieldRoundtrip( + R"({"name": null, "nullable": true, "type": {"name": "union", "mode": "SPARSE", "typeIds": []}, "children": [], "metadata": null})", + R"({"name": null, "count": 0, "TYPE_ID": [], "children": []})"); + TestFieldRoundtrip( R"({"name": null, "nullable": true, "type": {"name": "union", "mode": "DENSE", "typeIds": [10,20]}, "children": [)" R"({"name": null, "nullable": true, "type": {"name": "null"}, "children": [], "metadata": null}, )" R"({"name": null, "nullable": true, "type": {"name": "utf8"}, "children": [], "metadata": null})" R"(], "metadata": null})"); + // Non-empty unions (null, "abc") TestFieldRoundtrip( R"({"name": null, "nullable": true, "type": {"name": "union", "mode": "SPARSE", "typeIds": [10,20]}, "children": [)" - R"({"name": null, "nullable": true, "type": {"name": "null"}, "children": [], "metadata": null}, )" - R"({"name": null, "nullable": true, "type": {"name": "utf8"}, "children": [], "metadata": null})" - R"(], "metadata": null})"); + R"({"name": "nulls", "nullable": true, "type": {"name": "null"}, "children": [], "metadata": null}, )" + R"({"name": "strings", "nullable": true, "type": {"name": "utf8"}, "children": [], "metadata": null})" + R"(], "metadata": null})", + R"({"name": null, "count": 2, "TYPE_ID": [20, 10], "children": [)" + R"({"name": "nulls", "count": 2}, )" + R"({"name": "strings", "count": 2, "VALIDITY": [1, 1], "OFFSET": [0, 3, 3], "DATA": ["abc", ""]})" + R"(]})"); + TestFieldRoundtrip( + R"({"name": null, "nullable": true, "type": {"name": "union", "mode": "DENSE", "typeIds": [10,20]}, "children": [)" + R"({"name": "nulls", "nullable": true, "type": {"name": "null"}, "children": [], "metadata": null}, )" + R"({"name": "strings", "nullable": true, "type": {"name": "utf8"}, "children": [], "metadata": null})" + R"(], "metadata": null})", + R"({"name": null, "count": 2, "TYPE_ID": [20, 10], "OFFSET": [0, 0], "children": [)" + R"({"name": "nulls", "count": 1}, )" + R"({"name": "strings", "count": 1, "VALIDITY": [1], "OFFSET": [0, 3], "DATA": ["abc"]})" + R"(]})"); TestTypeError(R"({"name": "union", "mode": "NOT_A_MODE", "typeIds": []})", "Type[name=='union'] mode must be 'DENSE' or 'SPARSE'"); diff --git a/src/nanoarrow/nanoarrow_types.h b/src/nanoarrow/nanoarrow_types.h index e96207ead..2ea16b2c5 100644 --- a/src/nanoarrow/nanoarrow_types.h +++ b/src/nanoarrow/nanoarrow_types.h @@ -449,6 +449,14 @@ enum ArrowBufferType { NANOARROW_BUFFER_TYPE_DATA }; +/// \brief The maximum number of buffers in an ArrowArrayView or ArrowLayout +/// \ingroup nanoarrow-array-view +/// +/// All currently supported types have 3 buffers or fewer; however, future types +/// may involve a variable number of buffers (e.g., string view). These buffers +/// will be represented by separate members of the ArrowArrayView or ArrowLayout. +#define NANOARROW_MAX_FIXED_BUFFERS 3 + /// \brief An non-owning view of a string /// \ingroup nanoarrow-utils struct ArrowStringView { @@ -561,13 +569,13 @@ struct ArrowBitmap { /// the length and offset of the array. struct ArrowLayout { /// \brief The function of each buffer - enum ArrowBufferType buffer_type[3]; + enum ArrowBufferType buffer_type[NANOARROW_MAX_FIXED_BUFFERS]; /// \brief The data type of each buffer - enum ArrowType buffer_data_type[3]; + enum ArrowType buffer_data_type[NANOARROW_MAX_FIXED_BUFFERS]; /// \brief The size of an element each buffer or 0 if this size is variable or unknown - int64_t element_size_bits[3]; + int64_t element_size_bits[NANOARROW_MAX_FIXED_BUFFERS]; /// \brief The number of elements in the child array per element in this array for a /// fixed-size list @@ -609,7 +617,7 @@ struct ArrowArrayView { struct ArrowLayout layout; /// \brief This Array's buffers as ArrowBufferView objects - struct ArrowBufferView buffer_views[3]; + struct ArrowBufferView buffer_views[NANOARROW_MAX_FIXED_BUFFERS]; /// \brief The number of children of this view int64_t n_children; @@ -637,12 +645,12 @@ struct ArrowArrayPrivateData { struct ArrowBitmap bitmap; // Holder for additional buffers as required - struct ArrowBuffer buffers[2]; + struct ArrowBuffer buffers[NANOARROW_MAX_FIXED_BUFFERS - 1]; // The array of pointers to buffers. This must be updated after a sequence // of appends to synchronize its values with the actual buffer addresses // (which may have ben reallocated uring that time) - const void* buffer_data[3]; + const void* buffer_data[NANOARROW_MAX_FIXED_BUFFERS]; // The storage data type, or NANOARROW_TYPE_UNINITIALIZED if unknown enum ArrowType storage_type;