From 1d308e0fc51723d64f66fd420c689fd173d41ec8 Mon Sep 17 00:00:00 2001 From: yanghua Date: Wed, 25 Dec 2024 20:23:24 +0800 Subject: [PATCH 1/4] feat(rust): introduce catalog interface for rust module --- rust/lance-arrow/src/lib.rs | 5 +- rust/lance/src/catalog.rs | 10 ++ rust/lance/src/catalog/catalog_trait.rs | 51 ++++++ rust/lance/src/catalog/dataset_identifier.rs | 171 +++++++++++++++++++ rust/lance/src/catalog/namespace.rs | 141 +++++++++++++++ rust/lance/src/lib.rs | 1 + 6 files changed, 375 insertions(+), 4 deletions(-) create mode 100644 rust/lance/src/catalog.rs create mode 100644 rust/lance/src/catalog/catalog_trait.rs create mode 100644 rust/lance/src/catalog/dataset_identifier.rs create mode 100644 rust/lance/src/catalog/namespace.rs diff --git a/rust/lance-arrow/src/lib.rs b/rust/lance-arrow/src/lib.rs index 78c2b224e9..d08992bf3b 100644 --- a/rust/lance-arrow/src/lib.rs +++ b/rust/lance-arrow/src/lib.rs @@ -930,10 +930,7 @@ mod tests { DataType::Struct(fields.clone()), false, )]); - let children = types - .iter() - .map(|ty| new_empty_array(ty)) - .collect::>(); + let children = types.iter().map(new_empty_array).collect::>(); let batch = RecordBatch::try_new( Arc::new(schema.clone()), vec![Arc::new(StructArray::new(fields, children, None)) as ArrayRef], diff --git a/rust/lance/src/catalog.rs b/rust/lance/src/catalog.rs new file mode 100644 index 0000000000..a2110cfdc9 --- /dev/null +++ b/rust/lance/src/catalog.rs @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +pub(crate) mod catalog_trait; +pub(crate) mod dataset_identifier; +pub(crate) mod namespace; + +pub use catalog_trait::Catalog; +pub use dataset_identifier::DatasetIdentifier; +pub use namespace::Namespace; diff --git a/rust/lance/src/catalog/catalog_trait.rs b/rust/lance/src/catalog/catalog_trait.rs new file mode 100644 index 0000000000..7fb65eee9c --- /dev/null +++ b/rust/lance/src/catalog/catalog_trait.rs @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use crate::catalog::dataset_identifier::DatasetIdentifier; +use crate::catalog::namespace::Namespace; +use crate::dataset::Dataset; +use std::collections::HashMap; + +pub trait Catalog { + /// List all datasets under a specified namespace. + fn list_datasets(&self, namespace: &Namespace) -> Vec; + + /// Create a new dataset in the catalog. + fn create_dataset( + &self, + identifier: &DatasetIdentifier, + location: &str, + ) -> Result; + + /// Check if a dataset exists in the catalog. + fn dataset_exists(&self, identifier: &DatasetIdentifier) -> bool; + + /// Drop a dataset from the catalog. + fn drop_dataset(&self, identifier: &DatasetIdentifier) -> Result<(), String>; + + /// Drop a dataset from the catalog and purge the metadata. + fn drop_dataset_with_purge( + &self, + identifier: &DatasetIdentifier, + purge: &bool, + ) -> Result<(), String>; + + /// Rename a dataset in the catalog. + fn rename_dataset( + &self, + from: &DatasetIdentifier, + to: &DatasetIdentifier, + ) -> Result<(), String>; + + /// Load a dataset from the catalog. + fn load_dataset(&self, name: &DatasetIdentifier) -> Result; + + /// Invalidate cached table metadata from current catalog. + fn invalidate_dataset(&self, identifier: &DatasetIdentifier) -> Result<(), String>; + + /// Register a dataset in the catalog. + fn register_dataset(&self, identifier: &DatasetIdentifier) -> Result; + + /// Initialize the catalog. + fn initialize(&self, name: &str, properties: &HashMap<&str, &str>) -> Result<(), String>; +} diff --git a/rust/lance/src/catalog/dataset_identifier.rs b/rust/lance/src/catalog/dataset_identifier.rs new file mode 100644 index 0000000000..b447298116 --- /dev/null +++ b/rust/lance/src/catalog/dataset_identifier.rs @@ -0,0 +1,171 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use crate::catalog::namespace::Namespace; +use std::fmt; +use std::hash::{Hash, Hasher}; + +#[derive(Clone, Debug)] +pub struct DatasetIdentifier { + namespace: Namespace, + name: String, +} + +impl DatasetIdentifier { + pub fn of(names: &[&str]) -> Self { + assert!( + !names.is_empty(), + "Cannot create dataset identifier without a dataset name" + ); + let namespace = Namespace::of(&names[..names.len() - 1]); + let name = names[names.len() - 1].to_string(); + Self { namespace, name } + } + + pub fn of_namespace(namespace: Namespace, name: &str) -> Self { + assert!(!name.is_empty(), "Invalid dataset name: null or empty"); + Self { + namespace, + name: name.to_string(), + } + } + + pub fn parse(identifier: &str) -> Self { + let parts: Vec<&str> = identifier.split('.').collect(); + Self::of(&parts) + } + + pub fn has_namespace(&self) -> bool { + !self.namespace.is_empty() + } + + pub fn namespace(&self) -> &Namespace { + &self.namespace + } + + pub fn name(&self) -> &str { + &self.name + } + + pub fn to_lowercase(&self) -> Self { + let new_levels: Vec = self + .namespace + .levels() + .iter() + .map(|s| s.to_lowercase()) + .collect(); + let new_name = self.name.to_lowercase(); + Self::of_namespace( + Namespace::of(&new_levels.iter().map(String::as_str).collect::>()), + &new_name, + ) + } +} + +impl PartialEq for DatasetIdentifier { + fn eq(&self, other: &Self) -> bool { + self.namespace == other.namespace && self.name == other.name + } +} + +impl Eq for DatasetIdentifier {} + +impl Hash for DatasetIdentifier { + fn hash(&self, state: &mut H) { + self.namespace.hash(state); + self.name.hash(state); + } +} + +impl fmt::Display for DatasetIdentifier { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if self.has_namespace() { + write!(f, "{}.{}", self.namespace, self.name) + } else { + write!(f, "{}", self.name) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::hash::DefaultHasher; + + #[test] + fn test_dataset_identifier_of() { + let ds_id = DatasetIdentifier::of(&["namespace1", "namespace2", "dataset"]); + assert_eq!( + ds_id.namespace().levels(), + &vec!["namespace1".to_string(), "namespace2".to_string()] + ); + assert_eq!(ds_id.name(), "dataset"); + } + + #[test] + fn test_dataset_identifier_of_namespace() { + let namespace = Namespace::of(&["namespace1", "namespace2"]); + let ds_id = DatasetIdentifier::of_namespace(namespace.clone(), "dataset"); + assert_eq!(ds_id.namespace(), &namespace); + assert_eq!(ds_id.name(), "dataset"); + } + + #[test] + fn test_dataset_identifier_parse() { + let ds_id = DatasetIdentifier::parse("namespace1.namespace2.dataset"); + assert_eq!( + ds_id.namespace().levels(), + &vec!["namespace1".to_string(), "namespace2".to_string()] + ); + assert_eq!(ds_id.name(), "dataset"); + } + + #[test] + fn test_dataset_identifier_has_namespace() { + let ds_id = DatasetIdentifier::parse("namespace1.namespace2.dataset"); + assert!(ds_id.has_namespace()); + + let ds_id_no_ns = DatasetIdentifier::of(&["dataset"]); + assert!(!ds_id_no_ns.has_namespace()); + } + + #[test] + fn test_dataset_identifier_to_lowercase() { + let ds_id = DatasetIdentifier::parse("Namespace1.Namespace2.Dataset"); + let lower_ds_id = ds_id.to_lowercase(); + assert_eq!( + lower_ds_id.namespace().levels(), + &vec!["namespace1".to_string(), "namespace2".to_string()] + ); + assert_eq!(lower_ds_id.name(), "dataset"); + } + + #[test] + fn test_dataset_identifier_equality() { + let ds_id1 = DatasetIdentifier::parse("namespace1.namespace2.dataset"); + let ds_id2 = DatasetIdentifier::parse("namespace1.namespace2.dataset"); + let ds_id3 = DatasetIdentifier::parse("namespace1.namespace2.other_dataset"); + assert_eq!(ds_id1, ds_id2); + assert_ne!(ds_id1, ds_id3); + } + + #[test] + fn test_dataset_identifier_hash() { + let ds_id1 = DatasetIdentifier::parse("namespace1.namespace2.dataset"); + let ds_id2 = DatasetIdentifier::parse("namespace1.namespace2.dataset"); + let mut hasher1 = DefaultHasher::new(); + ds_id1.hash(&mut hasher1); + let mut hasher2 = DefaultHasher::new(); + ds_id2.hash(&mut hasher2); + assert_eq!(hasher1.finish(), hasher2.finish()); + } + + #[test] + fn test_dataset_identifier_display() { + let ds_id = DatasetIdentifier::parse("namespace1.namespace2.dataset"); + assert_eq!(format!("{}", ds_id), "namespace1.namespace2.dataset"); + + let ds_id_no_ns = DatasetIdentifier::of(&["dataset"]); + assert_eq!(format!("{}", ds_id_no_ns), "dataset"); + } +} diff --git a/rust/lance/src/catalog/namespace.rs b/rust/lance/src/catalog/namespace.rs new file mode 100644 index 0000000000..960b95f81f --- /dev/null +++ b/rust/lance/src/catalog/namespace.rs @@ -0,0 +1,141 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use std::fmt; +use std::hash::{Hash, Hasher}; + +#[derive(Clone)] +pub struct Namespace { + levels: Vec, +} + +impl Namespace { + pub fn empty() -> Self { + Self { levels: Vec::new() } + } + + pub fn of(levels: &[&str]) -> Self { + assert!( + levels.iter().all(|&level| level != "\0"), + "Cannot create a namespace with the null-byte character" + ); + Self { + levels: levels.iter().map(|&s| s.to_string()).collect(), + } + } + + pub fn levels(&self) -> &[String] { + &self.levels + } + + pub fn level(&self, pos: usize) -> &str { + &self.levels[pos] + } + + pub fn is_empty(&self) -> bool { + self.levels.is_empty() + } + + pub fn length(&self) -> usize { + self.levels.len() + } +} + +impl PartialEq for Namespace { + fn eq(&self, other: &Self) -> bool { + self.levels == other.levels + } +} + +impl Eq for Namespace {} + +impl Hash for Namespace { + fn hash(&self, state: &mut H) { + self.levels.hash(state); + } +} + +impl fmt::Display for Namespace { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.levels.join(".")) + } +} + +impl fmt::Debug for Namespace { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("Namespace") + .field("levels", &self.levels) + .finish() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::hash::DefaultHasher; + + #[test] + fn test_empty_namespace() { + let ns = Namespace::empty(); + assert!(ns.is_empty()); + assert_eq!(ns.length(), 0); + assert_eq!(ns.levels().len(), 0); + } + + #[test] + fn test_namespace_of() { + let ns = Namespace::of(&["level1", "level2"]); + assert!(!ns.is_empty()); + assert_eq!(ns.length(), 2); + assert_eq!(ns.level(0), "level1"); + assert_eq!(ns.level(1), "level2"); + } + + #[test] + #[should_panic(expected = "Cannot create a namespace with the null-byte character")] + fn test_namespace_of_with_null_byte() { + Namespace::of(&["level1", "\0"]); + } + + #[test] + fn test_namespace_levels() { + let ns = Namespace::of(&["level1", "level2"]); + let levels = ns.levels(); + assert_eq!(levels, &vec!["level1".to_string(), "level2".to_string()]); + } + + #[test] + fn test_namespace_equality() { + let ns1 = Namespace::of(&["level1", "level2"]); + let ns2 = Namespace::of(&["level1", "level2"]); + let ns3 = Namespace::of(&["level1", "level3"]); + assert_eq!(ns1, ns2); + assert_ne!(ns1, ns3); + } + + #[test] + fn test_namespace_hash() { + let ns1 = Namespace::of(&["level1", "level2"]); + let ns2 = Namespace::of(&["level1", "level2"]); + let mut hasher1 = DefaultHasher::new(); + ns1.hash(&mut hasher1); + let mut hasher2 = DefaultHasher::new(); + ns2.hash(&mut hasher2); + assert_eq!(hasher1.finish(), hasher2.finish()); + } + + #[test] + fn test_namespace_display() { + let ns = Namespace::of(&["level1", "level2"]); + assert_eq!(format!("{}", ns), "level1.level2"); + } + + #[test] + fn test_namespace_debug() { + let ns = Namespace::of(&["level1", "level2"]); + assert_eq!( + format!("{:?}", ns), + "Namespace { levels: [\"level1\", \"level2\"] }" + ); + } +} diff --git a/rust/lance/src/lib.rs b/rust/lance/src/lib.rs index 706a553841..0c88f7b3c4 100644 --- a/rust/lance/src/lib.rs +++ b/rust/lance/src/lib.rs @@ -75,6 +75,7 @@ pub use lance_core::{datatypes, error}; pub use lance_core::{Error, Result}; pub mod arrow; +pub mod catalog; pub mod datafusion; pub mod dataset; pub mod index; From 98c0f0a82847ddb54e3b98842ea7f95c3eb4da06 Mon Sep 17 00:00:00 2001 From: yanghua Date: Thu, 26 Dec 2024 19:49:02 +0800 Subject: [PATCH 2/4] trigger ci From 68fee4a7c1d73f777ebc947c291366a9ec7091ff Mon Sep 17 00:00:00 2001 From: yanghua Date: Thu, 26 Dec 2024 20:01:46 +0800 Subject: [PATCH 3/4] feat(rust): introduce catalog interface for rust module --- rust/lance/src/catalog/catalog_trait.rs | 58 +++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 3 deletions(-) diff --git a/rust/lance/src/catalog/catalog_trait.rs b/rust/lance/src/catalog/catalog_trait.rs index 7fb65eee9c..734c086020 100644 --- a/rust/lance/src/catalog/catalog_trait.rs +++ b/rust/lance/src/catalog/catalog_trait.rs @@ -4,9 +4,16 @@ use crate::catalog::dataset_identifier::DatasetIdentifier; use crate::catalog::namespace::Namespace; use crate::dataset::Dataset; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; pub trait Catalog { + /// Initialize the catalog. + fn initialize(&self, name: &str, properties: &HashMap<&str, &str>) -> Result<(), String>; + + /// + /// Dataset traits + /// + /// List all datasets under a specified namespace. fn list_datasets(&self, namespace: &Namespace) -> Vec; @@ -46,6 +53,51 @@ pub trait Catalog { /// Register a dataset in the catalog. fn register_dataset(&self, identifier: &DatasetIdentifier) -> Result; - /// Initialize the catalog. - fn initialize(&self, name: &str, properties: &HashMap<&str, &str>) -> Result<(), String>; + /// + /// Namespace traits + /// + + /// Create a namespace in the catalog. + fn create_namespace( + &self, + namespace: &Namespace, + metadata: HashMap, + ) -> Result<(), String>; + + /// List top-level namespaces from the catalog. + fn list_namespaces(&self) -> Vec { + self.list_child_namespaces(&Namespace::empty()) + .unwrap_or_default() + } + + /// List child namespaces from the namespace. + fn list_child_namespaces(&self, namespace: &Namespace) -> Result, String>; + + /// Load metadata properties for a namespace. + fn load_namespace_metadata( + &self, + namespace: &Namespace, + ) -> Result, String>; + + /// Drop a namespace. + fn drop_namespace(&self, namespace: &Namespace) -> Result; + + /// Set a collection of properties on a namespace in the catalog. + fn set_properties( + &self, + namespace: &Namespace, + properties: HashMap, + ) -> Result; + + /// Remove a set of property keys from a namespace in the catalog. + fn remove_properties( + &self, + namespace: &Namespace, + properties: HashSet, + ) -> Result; + + /// Checks whether the Namespace exists. + fn namespace_exists(&self, namespace: &Namespace) -> bool { + self.load_namespace_metadata(namespace).is_ok() + } } From 23f34e30cadb94f3af77e9c9cd5b402d14fa7c0a Mon Sep 17 00:00:00 2001 From: yanghua Date: Fri, 27 Dec 2024 10:04:38 +0800 Subject: [PATCH 4/4] feat(rust): introduce catalog interface for rust module --- rust/lance/src/catalog/catalog_trait.rs | 8 -------- 1 file changed, 8 deletions(-) diff --git a/rust/lance/src/catalog/catalog_trait.rs b/rust/lance/src/catalog/catalog_trait.rs index 734c086020..9ae20a6abb 100644 --- a/rust/lance/src/catalog/catalog_trait.rs +++ b/rust/lance/src/catalog/catalog_trait.rs @@ -10,10 +10,6 @@ pub trait Catalog { /// Initialize the catalog. fn initialize(&self, name: &str, properties: &HashMap<&str, &str>) -> Result<(), String>; - /// - /// Dataset traits - /// - /// List all datasets under a specified namespace. fn list_datasets(&self, namespace: &Namespace) -> Vec; @@ -53,10 +49,6 @@ pub trait Catalog { /// Register a dataset in the catalog. fn register_dataset(&self, identifier: &DatasetIdentifier) -> Result; - /// - /// Namespace traits - /// - /// Create a namespace in the catalog. fn create_namespace( &self,