From eda7b8d0b487e9d844c00ba9a55c0f6aa70cb51d Mon Sep 17 00:00:00 2001
From: yanghua
Date: Wed, 25 Dec 2024 20:23:24 +0800
Subject: [PATCH] feat(rust): introduce catalog interface for rust module

---
Note: the hunks below were edited by hand during review, so the blob ids on
the `index` lines predate those edits.

 rust/lance/src/catalog.rs                    |  13 ++
 rust/lance/src/catalog/catalog.rs            |  54 +++++
 rust/lance/src/catalog/dataset_identifier.rs | 197 +++++++++++++++++++
 rust/lance/src/catalog/namespace.rs          | 163 +++++++++++++++
 rust/lance/src/lib.rs                        |   1 +
 5 files changed, 428 insertions(+)
 create mode 100644 rust/lance/src/catalog.rs
 create mode 100644 rust/lance/src/catalog/catalog.rs
 create mode 100644 rust/lance/src/catalog/dataset_identifier.rs
 create mode 100644 rust/lance/src/catalog/namespace.rs

diff --git a/rust/lance/src/catalog.rs b/rust/lance/src/catalog.rs
new file mode 100644
index 0000000000..8210be9742
--- /dev/null
+++ b/rust/lance/src/catalog.rs
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+//! Catalog abstractions: the [`Catalog`] trait together with the
+//! [`DatasetIdentifier`] and [`Namespace`] types it operates on.
+
+pub(crate) mod catalog;
+pub(crate) mod dataset_identifier;
+pub(crate) mod namespace;
+
+pub use catalog::Catalog;
+pub use dataset_identifier::DatasetIdentifier;
+pub use namespace::Namespace;
diff --git a/rust/lance/src/catalog/catalog.rs b/rust/lance/src/catalog/catalog.rs
new file mode 100644
index 0000000000..d4e6ed8020
--- /dev/null
+++ b/rust/lance/src/catalog/catalog.rs
@@ -0,0 +1,54 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+use crate::catalog::dataset_identifier::DatasetIdentifier;
+use crate::catalog::namespace::Namespace;
+use crate::dataset::Dataset;
+use std::collections::HashMap;
+
+/// Interface for managing datasets addressed by [`DatasetIdentifier`].
+///
+/// Fallible operations report failure as a `String` error message.
+pub trait Catalog {
+    /// List all datasets under a specified namespace.
+    fn list_datasets(&self, namespace: &Namespace) -> Vec<DatasetIdentifier>;
+
+    /// Create a new dataset in the catalog.
+    fn create_dataset(
+        &self,
+        identifier: &DatasetIdentifier,
+        location: &str,
+    ) -> Result<Dataset, String>;
+
+    /// Check if a dataset exists in the catalog.
+    fn dataset_exists(&self, identifier: &DatasetIdentifier) -> bool;
+
+    /// Drop a dataset from the catalog.
+    fn drop_dataset(&self, identifier: &DatasetIdentifier) -> Result<(), String>;
+
+    /// Drop a dataset from the catalog and purge the metadata.
+    fn drop_dataset_with_purge(
+        &self,
+        identifier: &DatasetIdentifier,
+        purge: &bool,
+    ) -> Result<(), String>;
+
+    /// Rename a dataset in the catalog.
+    fn rename_dataset(
+        &self,
+        from: &DatasetIdentifier,
+        to: &DatasetIdentifier,
+    ) -> Result<(), String>;
+
+    /// Load a dataset from the catalog.
+    fn load_dataset(&self, name: &DatasetIdentifier) -> Result<Dataset, String>;
+
+    /// Invalidate cached dataset metadata in the current catalog.
+    fn invalidate_dataset(&self, identifier: &DatasetIdentifier) -> Result<(), String>;
+
+    /// Register a dataset in the catalog.
+    fn register_dataset(&self, identifier: &DatasetIdentifier) -> Result<Dataset, String>;
+
+    /// Initialize the catalog.
+    fn initialize(&self, name: &str, properties: &HashMap<&str, &str>) -> Result<(), String>;
+}
diff --git a/rust/lance/src/catalog/dataset_identifier.rs b/rust/lance/src/catalog/dataset_identifier.rs
new file mode 100644
index 0000000000..0677883f46
--- /dev/null
+++ b/rust/lance/src/catalog/dataset_identifier.rs
@@ -0,0 +1,197 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+use crate::catalog::namespace::Namespace;
+use std::fmt;
+use std::hash::{Hash, Hasher};
+
+/// Identifies a dataset by a [`Namespace`] plus a non-empty dataset name.
+#[derive(Clone, Debug)]
+pub struct DatasetIdentifier {
+    namespace: Namespace,
+    name: String,
+}
+
+impl DatasetIdentifier {
+    /// Builds an identifier from `names`: the last element is the dataset name
+    /// and every preceding element is a namespace level.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `names` is empty, if the dataset name is empty, or if
+    /// [`Namespace::of`] rejects one of the namespace levels.
+    pub fn of(names: &[&str]) -> Self {
+        let (name, levels) = names
+            .split_last()
+            .expect("Cannot create dataset identifier without a dataset name");
+        // Delegate so both constructors enforce the same non-empty-name invariant.
+        Self::of_namespace(Namespace::of(levels), name)
+    }
+
+    /// Builds an identifier from an existing `namespace` and a dataset `name`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `name` is empty.
+    pub fn of_namespace(namespace: Namespace, name: &str) -> Self {
+        assert!(!name.is_empty(), "Invalid dataset name: null or empty");
+        DatasetIdentifier {
+            namespace,
+            name: name.to_string(),
+        }
+    }
+
+    /// Parses a dot-separated identifier such as `"ns1.ns2.dataset"`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the last segment (the dataset name) is empty.
+    pub fn parse(identifier: &str) -> Self {
+        let parts: Vec<&str> = identifier.split('.').collect();
+        DatasetIdentifier::of(&parts)
+    }
+
+    /// Returns `true` if the identifier has at least one namespace level.
+    pub fn has_namespace(&self) -> bool {
+        !self.namespace.is_empty()
+    }
+
+    /// Returns the namespace of the identifier.
+    pub fn namespace(&self) -> &Namespace {
+        &self.namespace
+    }
+
+    /// Returns the dataset name.
+    pub fn name(&self) -> &str {
+        &self.name
+    }
+
+    /// Returns a copy with every namespace level and the name lowercased.
+    pub fn to_lowercase(&self) -> Self {
+        let new_levels: Vec<String> = self
+            .namespace
+            .levels()
+            .iter()
+            .map(|s| s.to_lowercase())
+            .collect();
+        let new_name = self.name.to_lowercase();
+        DatasetIdentifier::of_namespace(
+            Namespace::of(&new_levels.iter().map(String::as_str).collect::<Vec<&str>>()),
+            &new_name,
+        )
+    }
+}
+
+impl PartialEq for DatasetIdentifier {
+    fn eq(&self, other: &Self) -> bool {
+        self.namespace == other.namespace && self.name == other.name
+    }
+}
+
+impl Eq for DatasetIdentifier {}
+
+impl Hash for DatasetIdentifier {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        self.namespace.hash(state);
+        self.name.hash(state);
+    }
+}
+
+impl fmt::Display for DatasetIdentifier {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        if self.has_namespace() {
+            write!(f, "{}.{}", self.namespace, self.name)
+        } else {
+            write!(f, "{}", self.name)
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::collections::hash_map::DefaultHasher;
+
+    #[test]
+    fn test_dataset_identifier_of() {
+        let ds_id = DatasetIdentifier::of(&["namespace1", "namespace2", "dataset"]);
+        assert_eq!(
+            ds_id.namespace().levels(),
+            &vec!["namespace1".to_string(), "namespace2".to_string()]
+        );
+        assert_eq!(ds_id.name(), "dataset");
+    }
+
+    #[test]
+    #[should_panic(expected = "Invalid dataset name: null or empty")]
+    fn test_dataset_identifier_of_empty_name() {
+        DatasetIdentifier::of(&["namespace1", ""]);
+    }
+
+    #[test]
+    fn test_dataset_identifier_of_namespace() {
+        let namespace = Namespace::of(&["namespace1", "namespace2"]);
+        let ds_id = DatasetIdentifier::of_namespace(namespace.clone(), "dataset");
+        assert_eq!(ds_id.namespace(), &namespace);
+        assert_eq!(ds_id.name(), "dataset");
+    }
+
+    #[test]
+    fn test_dataset_identifier_parse() {
+        let ds_id = DatasetIdentifier::parse("namespace1.namespace2.dataset");
+        assert_eq!(
+            ds_id.namespace().levels(),
+            &vec!["namespace1".to_string(), "namespace2".to_string()]
+        );
+        assert_eq!(ds_id.name(), "dataset");
+    }
+
+    #[test]
+    fn test_dataset_identifier_has_namespace() {
+        let ds_id = DatasetIdentifier::parse("namespace1.namespace2.dataset");
+        assert!(ds_id.has_namespace());
+
+        let ds_id_no_ns = DatasetIdentifier::of(&["dataset"]);
+        assert!(!ds_id_no_ns.has_namespace());
+    }
+
+    #[test]
+    fn test_dataset_identifier_to_lowercase() {
+        let ds_id = DatasetIdentifier::parse("Namespace1.Namespace2.Dataset");
+        let lower_ds_id = ds_id.to_lowercase();
+        assert_eq!(
+            lower_ds_id.namespace().levels(),
+            &vec!["namespace1".to_string(), "namespace2".to_string()]
+        );
+        assert_eq!(lower_ds_id.name(), "dataset");
+    }
+
+    #[test]
+    fn test_dataset_identifier_equality() {
+        let ds_id1 = DatasetIdentifier::parse("namespace1.namespace2.dataset");
+        let ds_id2 = DatasetIdentifier::parse("namespace1.namespace2.dataset");
+        let ds_id3 = DatasetIdentifier::parse("namespace1.namespace2.other_dataset");
+        assert_eq!(ds_id1, ds_id2);
+        assert_ne!(ds_id1, ds_id3);
+    }
+
+    #[test]
+    fn test_dataset_identifier_hash() {
+        let ds_id1 = DatasetIdentifier::parse("namespace1.namespace2.dataset");
+        let ds_id2 = DatasetIdentifier::parse("namespace1.namespace2.dataset");
+        let mut hasher1 = DefaultHasher::new();
+        ds_id1.hash(&mut hasher1);
+        let mut hasher2 = DefaultHasher::new();
+        ds_id2.hash(&mut hasher2);
+        assert_eq!(hasher1.finish(), hasher2.finish());
+    }
+
+    #[test]
+    fn test_dataset_identifier_display() {
+        let ds_id = DatasetIdentifier::parse("namespace1.namespace2.dataset");
+        assert_eq!(format!("{}", ds_id), "namespace1.namespace2.dataset");
+
+        let ds_id_no_ns = DatasetIdentifier::of(&["dataset"]);
+        assert_eq!(format!("{}", ds_id_no_ns), "dataset");
+    }
+}
diff --git a/rust/lance/src/catalog/namespace.rs b/rust/lance/src/catalog/namespace.rs
new file mode 100644
index 0000000000..dac2275637
--- /dev/null
+++ b/rust/lance/src/catalog/namespace.rs
@@ -0,0 +1,163 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+use std::fmt;
+use std::hash::{Hash, Hasher};
+
+/// An ordered list of levels; displayed with the levels joined by `.`.
+#[derive(Clone)]
+pub struct Namespace {
+    levels: Vec<String>,
+}
+
+impl Namespace {
+    /// Returns a namespace with no levels.
+    pub fn empty() -> Self {
+        Namespace { levels: Vec::new() }
+    }
+
+    /// Builds a namespace from `levels`, preserving their order.
+    ///
+    /// # Panics
+    ///
+    /// Panics if any level contains the null-byte character.
+    pub fn of(levels: &[&str]) -> Self {
+        // Reject a null byte anywhere in a level, not only a level equal to "\0".
+        assert!(
+            levels.iter().all(|level| !level.contains('\0')),
+            "Cannot create a namespace with the null-byte character"
+        );
+        Namespace {
+            levels: levels.iter().map(|&s| s.to_string()).collect(),
+        }
+    }
+
+    /// Returns all levels of the namespace.
+    pub fn levels(&self) -> &[String] {
+        &self.levels
+    }
+
+    /// Returns the level at position `pos`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `pos` is out of bounds.
+    pub fn level(&self, pos: usize) -> &str {
+        &self.levels[pos]
+    }
+
+    /// Returns `true` if the namespace has no levels.
+    pub fn is_empty(&self) -> bool {
+        self.levels.is_empty()
+    }
+
+    /// Returns the number of levels.
+    pub fn length(&self) -> usize {
+        self.levels.len()
+    }
+}
+
+impl PartialEq for Namespace {
+    fn eq(&self, other: &Self) -> bool {
+        self.levels == other.levels
+    }
+}
+
+impl Eq for Namespace {}
+
+impl Hash for Namespace {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        self.levels.hash(state);
+    }
+}
+
+impl fmt::Display for Namespace {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "{}", self.levels.join("."))
+    }
+}
+
+impl fmt::Debug for Namespace {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("Namespace")
+            .field("levels", &self.levels)
+            .finish()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::collections::hash_map::DefaultHasher;
+
+    #[test]
+    fn test_empty_namespace() {
+        let ns = Namespace::empty();
+        assert!(ns.is_empty());
+        assert_eq!(ns.length(), 0);
+        assert_eq!(ns.levels().len(), 0);
+    }
+
+    #[test]
+    fn test_namespace_of() {
+        let ns = Namespace::of(&["level1", "level2"]);
+        assert!(!ns.is_empty());
+        assert_eq!(ns.length(), 2);
+        assert_eq!(ns.level(0), "level1");
+        assert_eq!(ns.level(1), "level2");
+    }
+
+    #[test]
+    #[should_panic(expected = "Cannot create a namespace with the null-byte character")]
+    fn test_namespace_of_with_null_byte() {
+        Namespace::of(&["level1", "\0"]);
+    }
+
+    #[test]
+    #[should_panic(expected = "Cannot create a namespace with the null-byte character")]
+    fn test_namespace_of_with_embedded_null_byte() {
+        Namespace::of(&["level1", "lev\0el2"]);
+    }
+
+    #[test]
+    fn test_namespace_levels() {
+        let ns = Namespace::of(&["level1", "level2"]);
+        let levels = ns.levels();
+        assert_eq!(levels, &vec!["level1".to_string(), "level2".to_string()]);
+    }
+
+    #[test]
+    fn test_namespace_equality() {
+        let ns1 = Namespace::of(&["level1", "level2"]);
+        let ns2 = Namespace::of(&["level1", "level2"]);
+        let ns3 = Namespace::of(&["level1", "level3"]);
+        assert_eq!(ns1, ns2);
+        assert_ne!(ns1, ns3);
+    }
+
+    #[test]
+    fn test_namespace_hash() {
+        let ns1 = Namespace::of(&["level1", "level2"]);
+        let ns2 = Namespace::of(&["level1", "level2"]);
+        let mut hasher1 = DefaultHasher::new();
+        ns1.hash(&mut hasher1);
+        let mut hasher2 = DefaultHasher::new();
+        ns2.hash(&mut hasher2);
+        assert_eq!(hasher1.finish(), hasher2.finish());
+    }
+
+    #[test]
+    fn test_namespace_display() {
+        let ns = Namespace::of(&["level1", "level2"]);
+        assert_eq!(format!("{}", ns), "level1.level2");
+    }
+
+    #[test]
+    fn test_namespace_debug() {
+        let ns = Namespace::of(&["level1", "level2"]);
+        assert_eq!(
+            format!("{:?}", ns),
+            "Namespace { levels: [\"level1\", \"level2\"] }"
+        );
+    }
+}
diff --git a/rust/lance/src/lib.rs b/rust/lance/src/lib.rs
index 706a553841..3d3ffe805f 100644
--- a/rust/lance/src/lib.rs
+++ b/rust/lance/src/lib.rs
@@ -82,6 +82,7 @@ pub mod io;
 pub mod session;
 pub mod table;
 pub mod utils;
+pub mod catalog;
 
 pub use dataset::Dataset;
 use lance_index::vector::DIST_COL;