Skip to content

Commit

Permalink
feat(rust): introduce catalog interface for rust module
Browse files Browse the repository at this point in the history
  • Loading branch information
yanghua committed Dec 26, 2024
1 parent c6fcb31 commit 1d308e0
Show file tree
Hide file tree
Showing 6 changed files with 375 additions and 4 deletions.
5 changes: 1 addition & 4 deletions rust/lance-arrow/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -930,10 +930,7 @@ mod tests {
DataType::Struct(fields.clone()),
false,
)]);
let children = types
.iter()
.map(|ty| new_empty_array(ty))
.collect::<Vec<_>>();
let children = types.iter().map(new_empty_array).collect::<Vec<_>>();
let batch = RecordBatch::try_new(
Arc::new(schema.clone()),
vec![Arc::new(StructArray::new(fields, children, None)) as ArrayRef],
Expand Down
10 changes: 10 additions & 0 deletions rust/lance/src/catalog.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors

pub(crate) mod catalog_trait;
pub(crate) mod dataset_identifier;
pub(crate) mod namespace;

pub use catalog_trait::Catalog;
pub use dataset_identifier::DatasetIdentifier;
pub use namespace::Namespace;
51 changes: 51 additions & 0 deletions rust/lance/src/catalog/catalog_trait.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors

use crate::catalog::dataset_identifier::DatasetIdentifier;
use crate::catalog::namespace::Namespace;
use crate::dataset::Dataset;
use std::collections::HashMap;

pub trait Catalog {
/// List all datasets under a specified namespace.
fn list_datasets(&self, namespace: &Namespace) -> Vec<DatasetIdentifier>;

/// Create a new dataset in the catalog.
fn create_dataset(
&self,
identifier: &DatasetIdentifier,
location: &str,
) -> Result<Dataset, String>;

/// Check if a dataset exists in the catalog.
fn dataset_exists(&self, identifier: &DatasetIdentifier) -> bool;

/// Drop a dataset from the catalog.
fn drop_dataset(&self, identifier: &DatasetIdentifier) -> Result<(), String>;

/// Drop a dataset from the catalog and purge the metadata.
fn drop_dataset_with_purge(
&self,
identifier: &DatasetIdentifier,
purge: &bool,
) -> Result<(), String>;

/// Rename a dataset in the catalog.
fn rename_dataset(
&self,
from: &DatasetIdentifier,
to: &DatasetIdentifier,
) -> Result<(), String>;

/// Load a dataset from the catalog.
fn load_dataset(&self, name: &DatasetIdentifier) -> Result<Dataset, String>;

/// Invalidate cached table metadata from current catalog.
fn invalidate_dataset(&self, identifier: &DatasetIdentifier) -> Result<(), String>;

/// Register a dataset in the catalog.
fn register_dataset(&self, identifier: &DatasetIdentifier) -> Result<Dataset, String>;

/// Initialize the catalog.
fn initialize(&self, name: &str, properties: &HashMap<&str, &str>) -> Result<(), String>;
}
171 changes: 171 additions & 0 deletions rust/lance/src/catalog/dataset_identifier.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors

use crate::catalog::namespace::Namespace;
use std::fmt;
use std::hash::{Hash, Hasher};

#[derive(Clone, Debug)]
pub struct DatasetIdentifier {
namespace: Namespace,
name: String,
}

impl DatasetIdentifier {
pub fn of(names: &[&str]) -> Self {
assert!(
!names.is_empty(),
"Cannot create dataset identifier without a dataset name"
);
let namespace = Namespace::of(&names[..names.len() - 1]);
let name = names[names.len() - 1].to_string();
Self { namespace, name }
}

pub fn of_namespace(namespace: Namespace, name: &str) -> Self {
assert!(!name.is_empty(), "Invalid dataset name: null or empty");
Self {
namespace,
name: name.to_string(),
}
}

pub fn parse(identifier: &str) -> Self {
let parts: Vec<&str> = identifier.split('.').collect();
Self::of(&parts)
}

pub fn has_namespace(&self) -> bool {
!self.namespace.is_empty()
}

pub fn namespace(&self) -> &Namespace {
&self.namespace
}

pub fn name(&self) -> &str {
&self.name
}

pub fn to_lowercase(&self) -> Self {
let new_levels: Vec<String> = self
.namespace
.levels()
.iter()
.map(|s| s.to_lowercase())
.collect();
let new_name = self.name.to_lowercase();
Self::of_namespace(
Namespace::of(&new_levels.iter().map(String::as_str).collect::<Vec<&str>>()),
&new_name,
)
}
}

impl PartialEq for DatasetIdentifier {
fn eq(&self, other: &Self) -> bool {
self.namespace == other.namespace && self.name == other.name
}
}

impl Eq for DatasetIdentifier {}

impl Hash for DatasetIdentifier {
fn hash<H: Hasher>(&self, state: &mut H) {
self.namespace.hash(state);
self.name.hash(state);
}
}

impl fmt::Display for DatasetIdentifier {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
if self.has_namespace() {
write!(f, "{}.{}", self.namespace, self.name)
} else {
write!(f, "{}", self.name)
}
}
}

#[cfg(test)]
mod tests {
use super::*;
use std::hash::DefaultHasher;

#[test]
fn test_dataset_identifier_of() {
let ds_id = DatasetIdentifier::of(&["namespace1", "namespace2", "dataset"]);
assert_eq!(
ds_id.namespace().levels(),
&vec!["namespace1".to_string(), "namespace2".to_string()]
);
assert_eq!(ds_id.name(), "dataset");
}

#[test]
fn test_dataset_identifier_of_namespace() {
let namespace = Namespace::of(&["namespace1", "namespace2"]);
let ds_id = DatasetIdentifier::of_namespace(namespace.clone(), "dataset");
assert_eq!(ds_id.namespace(), &namespace);
assert_eq!(ds_id.name(), "dataset");
}

#[test]
fn test_dataset_identifier_parse() {
let ds_id = DatasetIdentifier::parse("namespace1.namespace2.dataset");
assert_eq!(
ds_id.namespace().levels(),
&vec!["namespace1".to_string(), "namespace2".to_string()]
);
assert_eq!(ds_id.name(), "dataset");
}

#[test]
fn test_dataset_identifier_has_namespace() {
let ds_id = DatasetIdentifier::parse("namespace1.namespace2.dataset");
assert!(ds_id.has_namespace());

let ds_id_no_ns = DatasetIdentifier::of(&["dataset"]);
assert!(!ds_id_no_ns.has_namespace());
}

#[test]
fn test_dataset_identifier_to_lowercase() {
let ds_id = DatasetIdentifier::parse("Namespace1.Namespace2.Dataset");
let lower_ds_id = ds_id.to_lowercase();
assert_eq!(
lower_ds_id.namespace().levels(),
&vec!["namespace1".to_string(), "namespace2".to_string()]
);
assert_eq!(lower_ds_id.name(), "dataset");
}

#[test]
fn test_dataset_identifier_equality() {
let ds_id1 = DatasetIdentifier::parse("namespace1.namespace2.dataset");
let ds_id2 = DatasetIdentifier::parse("namespace1.namespace2.dataset");
let ds_id3 = DatasetIdentifier::parse("namespace1.namespace2.other_dataset");
assert_eq!(ds_id1, ds_id2);
assert_ne!(ds_id1, ds_id3);
}

#[test]
fn test_dataset_identifier_hash() {
let ds_id1 = DatasetIdentifier::parse("namespace1.namespace2.dataset");
let ds_id2 = DatasetIdentifier::parse("namespace1.namespace2.dataset");
let mut hasher1 = DefaultHasher::new();
ds_id1.hash(&mut hasher1);
let mut hasher2 = DefaultHasher::new();
ds_id2.hash(&mut hasher2);
assert_eq!(hasher1.finish(), hasher2.finish());
}

#[test]
fn test_dataset_identifier_display() {
let ds_id = DatasetIdentifier::parse("namespace1.namespace2.dataset");
assert_eq!(format!("{}", ds_id), "namespace1.namespace2.dataset");

let ds_id_no_ns = DatasetIdentifier::of(&["dataset"]);
assert_eq!(format!("{}", ds_id_no_ns), "dataset");
}
}
141 changes: 141 additions & 0 deletions rust/lance/src/catalog/namespace.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors

use std::fmt;
use std::hash::{Hash, Hasher};

#[derive(Clone)]
pub struct Namespace {
levels: Vec<String>,
}

impl Namespace {
pub fn empty() -> Self {
Self { levels: Vec::new() }
}

pub fn of(levels: &[&str]) -> Self {
assert!(
levels.iter().all(|&level| level != "\0"),
"Cannot create a namespace with the null-byte character"
);
Self {
levels: levels.iter().map(|&s| s.to_string()).collect(),
}
}

pub fn levels(&self) -> &[String] {
&self.levels
}

pub fn level(&self, pos: usize) -> &str {
&self.levels[pos]
}

pub fn is_empty(&self) -> bool {
self.levels.is_empty()
}

pub fn length(&self) -> usize {
self.levels.len()
}
}

impl PartialEq for Namespace {
fn eq(&self, other: &Self) -> bool {
self.levels == other.levels
}
}

impl Eq for Namespace {}

impl Hash for Namespace {
fn hash<H: Hasher>(&self, state: &mut H) {
self.levels.hash(state);
}
}

impl fmt::Display for Namespace {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}", self.levels.join("."))
}
}

impl fmt::Debug for Namespace {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("Namespace")
.field("levels", &self.levels)
.finish()
}
}

#[cfg(test)]
mod tests {
use super::*;
use std::hash::DefaultHasher;

#[test]
fn test_empty_namespace() {
let ns = Namespace::empty();
assert!(ns.is_empty());
assert_eq!(ns.length(), 0);
assert_eq!(ns.levels().len(), 0);
}

#[test]
fn test_namespace_of() {
let ns = Namespace::of(&["level1", "level2"]);
assert!(!ns.is_empty());
assert_eq!(ns.length(), 2);
assert_eq!(ns.level(0), "level1");
assert_eq!(ns.level(1), "level2");
}

#[test]
#[should_panic(expected = "Cannot create a namespace with the null-byte character")]
fn test_namespace_of_with_null_byte() {
Namespace::of(&["level1", "\0"]);
}

#[test]
fn test_namespace_levels() {
let ns = Namespace::of(&["level1", "level2"]);
let levels = ns.levels();
assert_eq!(levels, &vec!["level1".to_string(), "level2".to_string()]);
}

#[test]
fn test_namespace_equality() {
let ns1 = Namespace::of(&["level1", "level2"]);
let ns2 = Namespace::of(&["level1", "level2"]);
let ns3 = Namespace::of(&["level1", "level3"]);
assert_eq!(ns1, ns2);
assert_ne!(ns1, ns3);
}

#[test]
fn test_namespace_hash() {
let ns1 = Namespace::of(&["level1", "level2"]);
let ns2 = Namespace::of(&["level1", "level2"]);
let mut hasher1 = DefaultHasher::new();
ns1.hash(&mut hasher1);
let mut hasher2 = DefaultHasher::new();
ns2.hash(&mut hasher2);
assert_eq!(hasher1.finish(), hasher2.finish());
}

#[test]
fn test_namespace_display() {
let ns = Namespace::of(&["level1", "level2"]);
assert_eq!(format!("{}", ns), "level1.level2");
}

#[test]
fn test_namespace_debug() {
let ns = Namespace::of(&["level1", "level2"]);
assert_eq!(
format!("{:?}", ns),
"Namespace { levels: [\"level1\", \"level2\"] }"
);
}
}
1 change: 1 addition & 0 deletions rust/lance/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ pub use lance_core::{datatypes, error};
pub use lance_core::{Error, Result};

pub mod arrow;
pub mod catalog;
pub mod datafusion;
pub mod dataset;
pub mod index;
Expand Down

0 comments on commit 1d308e0

Please sign in to comment.