From 8345991ad662d63feed35356171842c27fb36783 Mon Sep 17 00:00:00 2001 From: Paolo Angioletti Date: Mon, 15 Jan 2024 14:27:25 +0000 Subject: [PATCH] Result into error in case of endianness mismatches (#5301) This commit implements the Byte Order (Endianness) recommendations we could read from the Apache Arrow official specification (quoted here): > _"At first we will return an error when trying to read a Schema > with an endianness that does not match the underlying system."_ Resolves: #3459 --- arrow-integration-testing/tests/ipc_reader.rs | 31 +++++-------------- arrow-ipc/src/gen/Schema.rs | 9 ++++++ arrow-ipc/src/reader.rs | 6 ++++ 3 files changed, 23 insertions(+), 23 deletions(-) diff --git a/arrow-integration-testing/tests/ipc_reader.rs b/arrow-integration-testing/tests/ipc_reader.rs index 11b8fa84534e..88cdad64f92f 100644 --- a/arrow-integration-testing/tests/ipc_reader.rs +++ b/arrow-integration-testing/tests/ipc_reader.rs @@ -18,6 +18,7 @@ //! Tests for reading the content of [`FileReader`] and [`StreamReader`] //! in `testing/arrow-ipc-stream/integration/...` +use arrow::error::ArrowError; use arrow::ipc::reader::{FileReader, StreamReader}; use arrow::util::test_util::arrow_test_data; use arrow_integration_testing::read_gzip_json; @@ -55,26 +56,12 @@ fn read_0_1_7() { }); } -#[test] -#[should_panic(expected = "Big Endian is not supported for Decimal!")] -fn read_1_0_0_bigendian_decimal_should_panic() { - let testdata = arrow_test_data(); - verify_arrow_file(&testdata, "1.0.0-bigendian", "generated_decimal"); -} - -#[test] -#[should_panic(expected = "Last offset 687865856 of Utf8 is larger than values length 41")] -fn read_1_0_0_bigendian_dictionary_should_panic() { - // The offsets are not translated for big-endian files - // https://github.com/apache/arrow-rs/issues/859 - let testdata = arrow_test_data(); - verify_arrow_file(&testdata, "1.0.0-bigendian", "generated_dictionary"); -} - #[test] fn read_1_0_0_bigendian() { let testdata = arrow_test_data(); let paths = [ + "generated_decimal", + "generated_dictionary", "generated_interval", "generated_datetime", "generated_map", @@ -91,14 +78,12 @@ fn read_1_0_0_bigendian() { )) .unwrap(); - FileReader::try_new(file, None).unwrap(); + let reader = FileReader::try_new(file, None); - // While the the reader doesn't error but the values are not - // read correctly on little endian platforms so verifying the - // contents fails - // - // https://github.com/apache/arrow-rs/issues/3459 - //verify_arrow_file(&testdata, "1.0.0-bigendian", path); + assert!(reader.is_err()); + let err = reader.err().unwrap(); + assert!(matches!(err, ArrowError::IpcError(_))); + assert_eq!(err.to_string(), "Ipc error: the endianness of the source system does not match the endianness of the target system."); }); } diff --git a/arrow-ipc/src/gen/Schema.rs b/arrow-ipc/src/gen/Schema.rs index 282b38b67195..0dc5dccd39e7 100644 --- a/arrow-ipc/src/gen/Schema.rs +++ b/arrow-ipc/src/gen/Schema.rs @@ -1039,6 +1039,15 @@ impl Endianness { _ => None, } } + + /// Returns true if the endianness of the source system matches the endianness of the target system. + pub fn equals_to_target_endianness(self) -> bool { + match self { + Self::Little => cfg!(target_endian = "little"), + Self::Big => cfg!(target_endian = "big"), + _ => false, + } + } } impl core::fmt::Debug for Endianness { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { diff --git a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs index 8ac3a387d5bb..81b8b530734a 100644 --- a/arrow-ipc/src/reader.rs +++ b/arrow-ipc/src/reader.rs @@ -790,6 +790,12 @@ impl FileReaderBuilder { let total_blocks = blocks.len(); let ipc_schema = footer.schema().unwrap(); + if !ipc_schema.endianness().equals_to_target_endianness() { + return Err(ArrowError::IpcError( + "the endianness of the source system does not match the endianness of the target system.".to_owned() + )); + } + let schema = crate::convert::fb_to_schema(ipc_schema); let mut custom_metadata = HashMap::new();