From a7244f565e9a0ed63ef16368b1bae231ac1033c4 Mon Sep 17 00:00:00 2001 From: Matthijs Brobbel Date: Fri, 31 May 2024 14:11:30 +0200 Subject: [PATCH 1/5] Add `ExtensionType` for `uuid` and map to parquet logical type --- arrow-schema/src/datatype.rs | 76 +++++++++++++++++++++++++++++++++ arrow-schema/src/field.rs | 33 +++++++++++++- parquet/src/arrow/schema/mod.rs | 6 ++- 3 files changed, 113 insertions(+), 2 deletions(-) diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index 449d363db671..6561f5b8855b 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -672,6 +672,82 @@ impl DataType { } } +/// Canonical extension types. +/// +/// The Arrow columnar format allows defining extension types so as to extend +/// standard Arrow data types with custom semantics. Often these semantics will +/// be specific to a system or application. However, it is beneficial to share +/// the definitions of well-known extension types so as to improve +/// interoperability between different systems integrating Arrow columnar data. +/// +/// https://arrow.apache.org/docs/format/CanonicalExtensions.html +#[non_exhaustive] +#[derive(Debug, Clone)] +pub enum ExtensionType { + /// Extension name: `arrow.uuid`. + /// + /// The storage type of the extension is `FixedSizeBinary` with a length of + /// 16 bytes. + /// + /// Note: + /// A specific UUID version is not required or guaranteed. This extension + /// represents UUIDs as FixedSizeBinary(16) with big-endian notation and + /// does not interpret the bytes in any way. + Uuid, +} + +impl fmt::Display for ExtensionType { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.name()) + } +} + +impl ExtensionType { + /// The metadata key for the string name identifying the custom data type. 
+ pub const NAME_KEY: &'static str = "ARROW:extension:name"; + + /// The metadata key for a serialized representation of the ExtensionType + /// necessary to reconstruct the custom type. + pub const METADATA_KEY: &'static str = "ARROW:extension:metadata"; + + /// Returns the name of this extension type. + pub fn name(&self) -> &'static str { + match self { + ExtensionType::Uuid => "arrow.uuid", + } + } + + /// Returns the metadata of this extension type. + pub fn metadata(&self) -> Option { + match self { + ExtensionType::Uuid => None, + } + } + + /// Returns `true` iff the given [`DataType`] can be used as storage type + /// for this extension type. + pub(crate) fn supports_storage_type(&self, data_type: &DataType) -> bool { + match self { + ExtensionType::Uuid => matches!(data_type, DataType::FixedSizeBinary(16)), + } + } + + /// Extract an [`ExtensionType`] from the given [`Field`]. + /// + /// This function returns `None` if the extension type is not supported or + /// recognized. + pub(crate) fn try_from_field(field: &Field) -> Option { + let metadata = field.metadata().get(ExtensionType::METADATA_KEY); + field + .metadata() + .get(ExtensionType::NAME_KEY) + .and_then(|name| match name.as_str() { + "arrow.uuid" if metadata.is_none() => Some(ExtensionType::Uuid), + _ => None, + }) + } +} + /// The maximum precision for [DataType::Decimal128] values pub const DECIMAL128_MAX_PRECISION: u8 = 38; diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index b84a2568ed8a..02a87264a25a 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -23,7 +23,7 @@ use std::sync::Arc; use crate::datatype::DataType; use crate::schema::SchemaBuilder; -use crate::{Fields, UnionFields, UnionMode}; +use crate::{ExtensionType, Fields, UnionFields, UnionMode}; /// A reference counted [`Field`] pub type FieldRef = Arc; @@ -337,6 +337,37 @@ impl Field { self } + /// Returns the canonical [`ExtensionType`] of this [`Field`], if set. 
+ pub fn extension_type(&self) -> Option { + ExtensionType::try_from_field(self) + } + + /// Updates the metadata of this [`Field`] with the [`ExtensionType::name`] + /// and [`ExtensionType::metadata`] of the given [`ExtensionType`]. + /// + /// # Panics + /// + /// This function panics when the datatype of this field is not a valid + /// storage type for the given extension type. + pub fn with_extension_type(mut self, extension_type: ExtensionType) -> Self { + if extension_type.supports_storage_type(&self.data_type) { + self.metadata.insert( + ExtensionType::NAME_KEY.to_owned(), + extension_type.name().to_owned(), + ); + if let Some(metadata) = extension_type.metadata() { + self.metadata + .insert(ExtensionType::METADATA_KEY.to_owned(), metadata); + } + self + } else { + panic!( + "{extension_type} does not support {} as storage type", + self.data_type + ); + } + } + /// Indicates whether this [`Field`] supports null values. #[inline] pub const fn is_nullable(&self) -> bool { diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index 8c583eebac5b..61e14a40a23d 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -29,7 +29,7 @@ use std::collections::HashMap; use std::sync::Arc; use arrow_ipc::writer; -use arrow_schema::{DataType, Field, Fields, Schema, TimeUnit}; +use arrow_schema::{DataType, ExtensionType, Field, Fields, Schema, TimeUnit}; use crate::basic::{ ConvertedType, LogicalType, Repetition, TimeUnit as ParquetTimeUnit, Type as PhysicalType, @@ -468,6 +468,10 @@ fn arrow_to_parquet_type(field: &Field) -> Result { .with_repetition(repetition) .with_id(id) .with_length(*length) + .with_logical_type(match field.extension_type() { + Some(ExtensionType::Uuid) => Some(LogicalType::Uuid), + _ => None, + }) .build() } DataType::BinaryView => Type::primitive_type_builder(name, PhysicalType::BYTE_ARRAY) From 6b2e7aa8fd759ac8ff86681e4a71886fe9928679 Mon Sep 17 00:00:00 2001 From: Matthijs Brobbel Date: Fri, 
31 May 2024 15:13:44 +0200 Subject: [PATCH 2/5] Fix docs --- arrow-schema/src/datatype.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index 6561f5b8855b..7e1dafbb7e04 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -680,7 +680,7 @@ impl DataType { /// the definitions of well-known extension types so as to improve /// interoperability between different systems integrating Arrow columnar data. /// -/// https://arrow.apache.org/docs/format/CanonicalExtensions.html +/// #[non_exhaustive] #[derive(Debug, Clone)] pub enum ExtensionType { From bdeab9f47e925a9c06233e502cf743458f158b93 Mon Sep 17 00:00:00 2001 From: Matthijs Brobbel Date: Thu, 26 Sep 2024 21:10:35 +0200 Subject: [PATCH 3/5] Use an `ExtensionType` trait instead --- arrow-schema/Cargo.toml | 2 +- arrow-schema/src/datatype.rs | 311 +++++++++++++++++++++++++++----- arrow-schema/src/field.rs | 70 +++++-- parquet/src/arrow/schema/mod.rs | 28 ++- 4 files changed, 338 insertions(+), 73 deletions(-) diff --git a/arrow-schema/Cargo.toml b/arrow-schema/Cargo.toml index 628d4a683cac..711543a18677 100644 --- a/arrow-schema/Cargo.toml +++ b/arrow-schema/Cargo.toml @@ -36,6 +36,7 @@ bench = false [dependencies] serde = { version = "1.0", default-features = false, features = ["derive", "std", "rc"], optional = true } bitflags = { version = "2.0.0", default-features = false, optional = true } +serde_json = "1.0" [features] # Enable ffi support @@ -45,5 +46,4 @@ ffi = ["bitflags"] features = ["ffi"] [dev-dependencies] -serde_json = "1.0" bincode = { version = "1.3.3", default-features = false } diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index 7e1dafbb7e04..fb14fe0695ab 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -672,6 +672,76 @@ impl DataType { } } +/// The metadata key for the string name identifying the custom data type. 
+pub const EXTENSION_TYPE_NAME_KEY: &str = "ARROW:extension:name"; + +/// The metadata key for a serialized representation of the ExtensionType +/// necessary to reconstruct the custom type. +pub const EXTENSION_TYPE_METADATA_KEY: &str = "ARROW:extension:metadata"; + +/// Extension types. +/// +/// +pub trait ExtensionType: Sized { + /// The name of this extension type. + const NAME: &'static str; + + /// The supported storage types of this extension type. + fn storage_types(&self) -> &[DataType]; + + /// The metadata type of this extension type. + type Metadata; + + /// Returns a reference to the metadata of this extension type, or `None` + /// if this extension type has no metadata. + fn metadata(&self) -> Option<&Self::Metadata>; + + /// Returns the serialized representation of the metadata of this extension + /// type, or `None` if this extension type has no metadata. + fn into_serialized_metadata(&self) -> Option; + + /// Deserialize this extension type from the serialized representation of the + /// metadata of this extension. An extension type that has no metadata should + /// expect `None` for the serialized metadata. + fn from_serialized_metadata(serialized_metadata: Option<&str>) -> Option; +} + +pub(crate) trait ExtensionTypeExt: ExtensionType { + /// Returns `true` if the given data type is supported by this extension + /// type. + fn supports(&self, data_type: &DataType) -> bool { + self.storage_types().contains(data_type) + } + + /// Try to extract this extension type from the given [`Field`].
+ /// + /// This function returns `None` if extension type + /// - information is missing + /// - name does not match + /// - metadata deserialization failed + /// - does not support the data type of this field + fn try_from_field(field: &Field) -> Option { + field + .metadata() + .get(EXTENSION_TYPE_NAME_KEY) + .and_then(|name| { + (name == ::NAME) + .then(|| { + Self::from_serialized_metadata( + field + .metadata() + .get(EXTENSION_TYPE_METADATA_KEY) + .map(String::as_str), + ) + }) + .flatten() + }) + .filter(|extension_type| extension_type.supports(field.data_type())) + } +} + +impl ExtensionTypeExt for T where T: ExtensionType {} + /// Canonical extension types. /// /// The Arrow columnar format allows defining extension types so as to extend @@ -679,11 +749,90 @@ impl DataType { /// be specific to a system or application. However, it is beneficial to share /// the definitions of well-known extension types so as to improve /// interoperability between different systems integrating Arrow columnar data. -/// -/// -#[non_exhaustive] -#[derive(Debug, Clone)] -pub enum ExtensionType { +pub mod canonical_extension_types { + use serde_json::{Map, Value}; + + use super::{DataType, ExtensionType}; + + /// Canonical extension types. + #[non_exhaustive] + #[derive(Debug, Clone, PartialEq)] + pub enum CanonicalExtensionTypes { + /// The extension type for 'JSON'. + Json(Json), + /// The extension type for `UUID`. + Uuid(Uuid), + } + + impl From for CanonicalExtensionTypes { + fn from(value: Json) -> Self { + CanonicalExtensionTypes::Json(value) + } + } + + impl From for CanonicalExtensionTypes { + fn from(value: Uuid) -> Self { + CanonicalExtensionTypes::Uuid(value) + } + } + + /// The extension type for `JSON`. + /// + /// Extension name: `arrow.json`. + /// + /// The storage type of this extension is `String` or `LargeString` or + /// `StringView`. Only UTF-8 encoded JSON as specified in [rfc8259](https://datatracker.ietf.org/doc/html/rfc8259) + /// is supported. 
+ /// + /// This type does not have any parameters. + /// + /// Metadata is either an empty string or a JSON string with an empty + /// object. In the future, additional fields may be added, but they are not + /// required to interpret the array. + /// + /// + #[derive(Debug, Clone, PartialEq)] + pub struct Json(Value); + + impl Default for Json { + fn default() -> Self { + Self(Value::String("".to_owned())) + } + } + + impl ExtensionType for Json { + const NAME: &'static str = "arrow.json"; + + type Metadata = Value; + + fn storage_types(&self) -> &[DataType] { + &[DataType::Utf8, DataType::LargeUtf8, DataType::Utf8View] + } + + fn metadata(&self) -> Option<&Self::Metadata> { + Some(&self.0) + } + + fn into_serialized_metadata(&self) -> Option { + Some(self.0.to_string()) + } + + fn from_serialized_metadata(serialized_metadata: Option<&str>) -> Option { + serialized_metadata.and_then(|metadata| match metadata { + // Empty string + r#""""# => Some(Default::default()), + // Empty object + value => value + .parse::() + .ok() + .filter(|value| value.as_object().is_some_and(Map::is_empty)) + .map(Self), + }) + } + } + + /// The extension type for `UUID`. + /// /// Extension name: `arrow.uuid`. /// /// The storage type of the extension is `FixedSizeBinary` with a length of @@ -691,60 +840,128 @@ pub enum ExtensionType { /// /// Note: /// A specific UUID version is not required or guaranteed. This extension - /// represents UUIDs as FixedSizeBinary(16) with big-endian notation and + /// represents UUIDs as `FixedSizeBinary(16)` with big-endian notation and /// does not interpret the bytes in any way. 
- Uuid, -} + /// + /// + #[derive(Debug, Default, Clone, Copy, PartialEq)] + pub struct Uuid; -impl fmt::Display for ExtensionType { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{}", self.name()) - } -} + impl ExtensionType for Uuid { + const NAME: &'static str = "arrow.uuid"; -impl ExtensionType { - /// The metadata key for the string name identifying the custom data type. - pub const NAME_KEY: &'static str = "ARROW:extension:name"; + type Metadata = (); - /// The metadata key for a serialized representation of the ExtensionType - /// necessary to reconstruct the custom type. - pub const METADATA_KEY: &'static str = "ARROW:extension:metadata"; + fn storage_types(&self) -> &[DataType] { + &[DataType::FixedSizeBinary(16)] + } - /// Returns the name of this extension type. - pub fn name(&self) -> &'static str { - match self { - ExtensionType::Uuid => "arrow.uuid", + fn metadata(&self) -> Option<&Self::Metadata> { + None } - } - /// Returns the metadata of this extension type. - pub fn metadata(&self) -> Option { - match self { - ExtensionType::Uuid => None, + fn into_serialized_metadata(&self) -> Option { + None } - } - /// Returns `true` iff the given [`DataType`] can be used as storage type - /// for this extension type. - pub(crate) fn supports_storage_type(&self, data_type: &DataType) -> bool { - match self { - ExtensionType::Uuid => matches!(data_type, DataType::FixedSizeBinary(16)), + fn from_serialized_metadata(serialized_metadata: Option<&str>) -> Option { + serialized_metadata.is_none().then_some(Self) } } - /// Extract an [`ExtensionType`] from the given [`Field`]. - /// - /// This function returns `None` if the extension type is not supported or - /// recognized. 
- pub(crate) fn try_from_field(field: &Field) -> Option { - let metadata = field.metadata().get(ExtensionType::METADATA_KEY); - field - .metadata() - .get(ExtensionType::NAME_KEY) - .and_then(|name| match name.as_str() { - "arrow.uuid" if metadata.is_none() => Some(ExtensionType::Uuid), - _ => None, - }) + #[cfg(test)] + mod tests { + use std::collections::HashMap; + + use serde_json::Map; + + use crate::{ArrowError, Field, EXTENSION_TYPE_METADATA_KEY, EXTENSION_TYPE_NAME_KEY}; + + use super::*; + + #[test] + fn json() -> Result<(), ArrowError> { + let mut field = Field::new("", DataType::Utf8, false); + field.try_with_extension_type(Json::default())?; + assert_eq!( + field.metadata().get(EXTENSION_TYPE_METADATA_KEY), + Some(&r#""""#.to_owned()) + ); + assert!(field.extension_type::().is_some()); + + let mut field = Field::new("", DataType::LargeUtf8, false); + field.try_with_extension_type(Json(serde_json::Value::Object(Map::default())))?; + assert_eq!( + field.metadata().get(EXTENSION_TYPE_METADATA_KEY), + Some(&"{}".to_owned()) + ); + assert!(field.extension_type::().is_some()); + + let mut field = Field::new("", DataType::Utf8View, false); + field.try_with_extension_type(Json::default())?; + assert!(field.extension_type::().is_some()); + assert_eq!( + field.canonical_extension_type(), + Some(CanonicalExtensionTypes::Json(Json::default())) + ); + Ok(()) + } + + #[test] + #[should_panic(expected = "expected Utf8 or LargeUtf8 or Utf8View, found Boolean")] + fn json_bad_type() { + Field::new("", DataType::Boolean, false).with_extension_type(Json::default()); + } + + #[test] + fn json_bad_metadata() { + let field = Field::new("", DataType::Utf8, false).with_metadata(HashMap::from_iter([ + (EXTENSION_TYPE_NAME_KEY.to_owned(), Json::NAME.to_owned()), + (EXTENSION_TYPE_METADATA_KEY.to_owned(), "1234".to_owned()), + ])); + // This returns `None` now because this metadata is invalid. 
+ assert!(field.extension_type::().is_none()); + } + + #[test] + fn json_missing_metadata() { + let field = Field::new("", DataType::LargeUtf8, false).with_metadata( + HashMap::from_iter([(EXTENSION_TYPE_NAME_KEY.to_owned(), Json::NAME.to_owned())]), + ); + // This returns `None` now because the metadata is missing. + assert!(field.extension_type::().is_none()); + } + + #[test] + fn uuid() -> Result<(), ArrowError> { + let mut field = Field::new("", DataType::FixedSizeBinary(16), false); + field.try_with_extension_type(Uuid)?; + assert!(field.extension_type::().is_some()); + assert_eq!( + field.canonical_extension_type(), + Some(CanonicalExtensionTypes::Uuid(Uuid)) + ); + Ok(()) + } + + #[test] + #[should_panic(expected = "expected FixedSizeBinary(16), found FixedSizeBinary(8)")] + fn uuid_bad_type() { + Field::new("", DataType::FixedSizeBinary(8), false).with_extension_type(Uuid); + } + + #[test] + fn uuid_with_metadata() { + // Add metadata that's not expected for uuid. + let field = Field::new("", DataType::FixedSizeBinary(16), false) + .with_metadata(HashMap::from_iter([( + EXTENSION_TYPE_METADATA_KEY.to_owned(), + "".to_owned(), + )])) + .with_extension_type(Uuid); + // This returns `None` now because `Uuid` expects no metadata. + assert!(field.extension_type::().is_none()); + } } } diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index 02a87264a25a..04cc3665223b 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. 
+use crate::canonical_extension_types::{CanonicalExtensionTypes, Json, Uuid}; use crate::error::ArrowError; use std::cmp::Ordering; use std::collections::HashMap; @@ -23,7 +24,10 @@ use std::sync::Arc; use crate::datatype::DataType; use crate::schema::SchemaBuilder; -use crate::{ExtensionType, Fields, UnionFields, UnionMode}; +use crate::{ + ExtensionType, ExtensionTypeExt, Fields, UnionFields, UnionMode, EXTENSION_TYPE_METADATA_KEY, + EXTENSION_TYPE_NAME_KEY, +}; /// A reference counted [`Field`] pub type FieldRef = Arc; @@ -337,37 +341,63 @@ impl Field { self } - /// Returns the canonical [`ExtensionType`] of this [`Field`], if set. - pub fn extension_type(&self) -> Option { - ExtensionType::try_from_field(self) + /// Returns the given [`ExtensionType`] of this [`Field`], if set. + /// Returns `None` if this field does not have this extension type. + pub fn extension_type(&self) -> Option { + E::try_from_field(self) } - /// Updates the metadata of this [`Field`] with the [`ExtensionType::name`] + /// Returns the [`CanonicalExtensionTypes`] of this [`Field`], if set. + pub fn canonical_extension_type(&self) -> Option { + Json::try_from_field(self) + .map(Into::into) + .or(Uuid::try_from_field(self).map(Into::into)) + } + + /// Updates the metadata of this [`Field`] with the [`ExtensionType::NAME`] /// and [`ExtensionType::metadata`] of the given [`ExtensionType`]. /// - /// # Panics + /// # Errors /// - /// This function panics when the datatype of this field is not a valid - /// storage type for the given extension type. - pub fn with_extension_type(mut self, extension_type: ExtensionType) -> Self { - if extension_type.supports_storage_type(&self.data_type) { - self.metadata.insert( - ExtensionType::NAME_KEY.to_owned(), - extension_type.name().to_owned(), - ); - if let Some(metadata) = extension_type.metadata() { + /// This function returns an error if the datatype of this field does not + /// match the storage type of the given extension type.
+ pub fn try_with_extension_type( + &mut self, + extension_type: E, + ) -> Result<(), ArrowError> { + if extension_type.supports(&self.data_type) { + // Insert the name + self.metadata + .insert(EXTENSION_TYPE_NAME_KEY.to_owned(), E::NAME.to_owned()); + // Insert the metadata, if any + if let Some(metadata) = extension_type.into_serialized_metadata() { self.metadata - .insert(ExtensionType::METADATA_KEY.to_owned(), metadata); + .insert(EXTENSION_TYPE_METADATA_KEY.to_owned(), metadata); } - self + Ok(()) } else { - panic!( - "{extension_type} does not support {} as storage type", + Err(ArrowError::InvalidArgumentError(format!( + "storage type of extension type {} does not match field data type, expected {}, found {}", + ::NAME, + extension_type.storage_types().iter().map(ToString::to_string).collect::>().join(" or "), self.data_type - ); + ))) } } + /// Updates the metadata of this [`Field`] with the [`ExtensionType::NAME`] + /// and [`ExtensionType::metadata`] of the given [`ExtensionType`]. + /// + /// # Panics + /// + /// This function panics if the datatype of this field does not match the + /// storage type of the given extension type. + pub fn with_extension_type(mut self, extension_type: E) -> Self { + self.try_with_extension_type(extension_type) + .unwrap_or_else(|e| panic!("{e}")); + self + } + /// Indicates whether this [`Field`] supports null values. #[inline] pub const fn is_nullable(&self) -> bool { diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index 61e14a40a23d..3a871aaba9ef 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -23,13 +23,14 @@ //! //! The interfaces for converting arrow schema to parquet schema is coming.
+use arrow_schema::canonical_extension_types::Uuid; use base64::prelude::BASE64_STANDARD; use base64::Engine; use std::collections::HashMap; use std::sync::Arc; use arrow_ipc::writer; -use arrow_schema::{DataType, ExtensionType, Field, Fields, Schema, TimeUnit}; +use arrow_schema::{DataType, Field, Fields, Schema, TimeUnit}; use crate::basic::{ ConvertedType, LogicalType, Repetition, TimeUnit as ParquetTimeUnit, Type as PhysicalType, @@ -468,10 +469,8 @@ fn arrow_to_parquet_type(field: &Field) -> Result { .with_repetition(repetition) .with_id(id) .with_length(*length) - .with_logical_type(match field.extension_type() { - Some(ExtensionType::Uuid) => Some(LogicalType::Uuid), - _ => None, - }) + // If set, map arrow uuid extension type to parquet uuid logical type. + .with_logical_type(field.extension_type::().map(|_| LogicalType::Uuid)) .build() } DataType::BinaryView => Type::primitive_type_builder(name, PhysicalType::BYTE_ARRAY) @@ -1913,4 +1912,23 @@ mod tests { fn test_get_arrow_schema_from_metadata() { assert!(get_arrow_schema_from_metadata("").is_err()); } + + #[test] + fn arrow_uuid_to_parquet_uuid() -> Result<()> { + let arrow_schema = Schema::new(vec![Field::new( + "uuid", + DataType::FixedSizeBinary(16), + false, + ) + .with_extension_type(Uuid)]); + + let parquet_schema = arrow_to_parquet_schema(&arrow_schema)?; + + assert_eq!( + parquet_schema.column(0).logical_type(), + Some(LogicalType::Uuid) + ); + + Ok(()) + } } From 84286535639daf2d21275f81cc0d46c37619952f Mon Sep 17 00:00:00 2001 From: Matthijs Brobbel Date: Thu, 26 Sep 2024 21:24:17 +0200 Subject: [PATCH 4/5] Fix clippy warnings --- arrow-schema/src/datatype.rs | 10 +++++----- arrow-schema/src/field.rs | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index 2bf9663ef512..b7a326f605f3 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -790,7 +790,7 @@ pub trait ExtensionType: Sized { /// 
Returns the serialized representation of the metadata of this extension /// type, or `None` if this extension type has no metadata. - fn into_serialized_metadata(&self) -> Option; + fn serialized_metadata(&self) -> Option; /// Deserialize this extension type from the serialized representation of the /// metadata of this extension. An extension type that has no metadata should @@ -842,7 +842,7 @@ impl ExtensionTypeExt for T where T: ExtensionType {} /// the definitions of well-known extension types so as to improve /// interoperability between different systems integrating Arrow columnar data. pub mod canonical_extension_types { - use serde_json::{Map, Value}; + use serde_json::Value; use super::{DataType, ExtensionType}; @@ -905,7 +905,7 @@ pub mod canonical_extension_types { Some(&self.0) } - fn into_serialized_metadata(&self) -> Option { + fn serialized_metadata(&self) -> Option { Some(self.0.to_string()) } @@ -917,7 +917,7 @@ pub mod canonical_extension_types { value => value .parse::() .ok() - .filter(|value| value.as_object().is_some_and(Map::is_empty)) + .filter(|value| matches!(value.as_object(), Some(map) if map.is_empty())) .map(Self), }) } @@ -952,7 +952,7 @@ pub mod canonical_extension_types { None } - fn into_serialized_metadata(&self) -> Option { + fn serialized_metadata(&self) -> Option { None } diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index bca3257b4bd6..f16e2f9bbc05 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -370,7 +370,7 @@ impl Field { self.metadata .insert(EXTENSION_TYPE_NAME_KEY.to_owned(), E::NAME.to_owned()); // Insert the metadata, if any - if let Some(metadata) = extension_type.into_serialized_metadata() { + if let Some(metadata) = extension_type.serialized_metadata() { self.metadata .insert(EXTENSION_TYPE_METADATA_KEY.to_owned(), metadata); } From 7896455d4cbd5f084c190d60ac51c0dfc7ccc99b Mon Sep 17 00:00:00 2001 From: Matthijs Brobbel Date: Thu, 26 Sep 2024 23:06:33 +0200 Subject: 
[PATCH 5/5] Add type annotation to fix build --- arrow-select/src/dictionary.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-select/src/dictionary.rs b/arrow-select/src/dictionary.rs index 2a532600b6cc..c363b99920a7 100644 --- a/arrow-select/src/dictionary.rs +++ b/arrow-select/src/dictionary.rs @@ -315,7 +315,7 @@ mod tests { assert_eq!(merged.values.as_ref(), &expected); assert_eq!(merged.key_mappings.len(), 2); assert_eq!(&merged.key_mappings[0], &[0, 0, 0, 1, 0]); - assert_eq!(&merged.key_mappings[1], &[]); + assert_eq!(&merged.key_mappings[1], &[] as &[i32; 0]); } #[test]