From eeb57e348ba4b6e1eab998b578efab442798ccb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Thu, 31 Oct 2024 20:17:46 +0100 Subject: [PATCH 1/2] Faster utf8 validation --- Cargo.toml | 1 + parquet/Cargo.toml | 1 + parquet/src/arrow/array_reader/byte_view_array.rs | 7 +++++-- parquet/src/arrow/buffer/offset_buffer.rs | 8 ++++++-- 4 files changed, 13 insertions(+), 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index f210ae210012..5ecdaef31061 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -94,3 +94,4 @@ arrow-string = { version = "53.2.0", path = "./arrow-string" } parquet = { version = "53.2.0", path = "./parquet", default-features = false } chrono = { version = "0.4.34", default-features = false, features = ["clock"] } +simdutf8 = { version = "0.1.5" } \ No newline at end of file diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 32bc13b62a53..2c89b15b2ec0 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -69,6 +69,7 @@ paste = { version = "1.0" } half = { version = "2.1", default-features = false, features = ["num-traits"] } sysinfo = { version = "0.32.0", optional = true, default-features = false, features = ["system"] } crc32fast = { version = "1.4.2", optional = true, default-features = false } +simdutf8 = { workspace = true } [dev-dependencies] base64 = { version = "0.22", default-features = false, features = ["std"] } diff --git a/parquet/src/arrow/array_reader/byte_view_array.rs b/parquet/src/arrow/array_reader/byte_view_array.rs index 5845e2c08cec..2e2b55f42e89 100644 --- a/parquet/src/arrow/array_reader/byte_view_array.rs +++ b/parquet/src/arrow/array_reader/byte_view_array.rs @@ -679,9 +679,12 @@ impl ByteViewArrayDecoderDelta { /// Check that `val` is a valid UTF-8 sequence pub fn check_valid_utf8(val: &[u8]) -> Result<()> { - match std::str::from_utf8(val) { + match simdutf8::basic::from_utf8(val) { Ok(_) => Ok(()), - Err(e) => Err(general_err!("encountered non UTF-8 data: {}", e)), + Err(_) => { + let e = simdutf8::compat::from_utf8(val).unwrap_err(); + Err(general_err!("encountered non UTF-8 data: {}", e)) + } } } diff --git a/parquet/src/arrow/buffer/offset_buffer.rs b/parquet/src/arrow/buffer/offset_buffer.rs index ce9eb1142a5b..8dfb859612cb 100644 --- a/parquet/src/arrow/buffer/offset_buffer.rs +++ b/parquet/src/arrow/buffer/offset_buffer.rs @@ -117,9 +117,13 @@ impl OffsetBuffer { /// /// [`Self::try_push`] can perform this validation check on insertion pub fn check_valid_utf8(&self, start_offset: usize) -> Result<()> { - match std::str::from_utf8(&self.values.as_slice()[start_offset..]) { + match simdutf8::basic::from_utf8(&self.values.as_slice()[start_offset..]) { Ok(_) => Ok(()), - Err(e) => Err(general_err!("encountered non UTF-8 data: {}", e)), + Err(_) => { + let e = simdutf8::compat::from_utf8(&self.values.as_slice()[start_offset..]) + .unwrap_err(); + Err(general_err!("encountered non UTF-8 data: {}", e)) + } } } From adbd07aff40af8f67bda3f3aba76c966ee735c3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Thu, 31 Oct 2024 20:33:31 +0100 Subject: [PATCH 2/2] Move dependency --- Cargo.toml | 1 - parquet/Cargo.toml | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 5ecdaef31061..f210ae210012 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -94,4 +94,3 @@ arrow-string = { version = "53.2.0", path = "./arrow-string" } parquet = { version = "53.2.0", path = "./parquet", default-features = false } chrono = { version = "0.4.34", default-features = false, features = ["clock"] } -simdutf8 = { version = "0.1.5" } \ No newline at end of file diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 2c89b15b2ec0..663b50a08043 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -69,7 +69,7 @@ paste = { version = "1.0" } half = { version = "2.1", default-features = false, features = ["num-traits"] } sysinfo = { version = "0.32.0", optional = true, default-features = false, features = ["system"] } crc32fast = { version = "1.4.2", optional = true, default-features = false } -simdutf8 = { workspace = true } +simdutf8 = { version = "0.1.5"} [dev-dependencies] base64 = { version = "0.22", default-features = false, features = ["std"] }