Skip to content

Commit

Permalink
feat: cache miniblock metadata (#3323)
Browse files Browse the repository at this point in the history
The miniblock chunk data was always intended to be cached. This is
essential for good random access performance. This PR adds that caching.
  • Loading branch information
westonpace authored Jan 3, 2025
1 parent aad48df commit 8fe7147
Show file tree
Hide file tree
Showing 6 changed files with 194 additions and 36 deletions.
19 changes: 18 additions & 1 deletion rust/lance-core/src/cache.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,15 @@
use std::any::{Any, TypeId};
use std::sync::Arc;

use deepsize::{Context, DeepSizeOf};
use futures::Future;
use moka::sync::Cache;
use object_store::path::Path;

use crate::utils::path::LancePathExt;
use crate::Result;

pub use deepsize::{Context, DeepSizeOf};

type ArcAny = Arc<dyn Any + Send + Sync>;

#[derive(Clone)]
Expand Down Expand Up @@ -121,6 +122,12 @@ impl FileMetadataCache {
}
}

/// Fetch an item from the cache, using a str as the key
pub fn get_by_str<T: Send + Sync + 'static>(&self, path: &str) -> Option<Arc<T>> {
self.get(&Path::parse(path).unwrap())
}

/// Fetch an item from the cache
pub fn get<T: Send + Sync + 'static>(&self, path: &Path) -> Option<Arc<T>> {
let cache = self.cache.as_ref()?;
let temp: Path;
Expand All @@ -135,6 +142,7 @@ impl FileMetadataCache {
.map(|metadata| metadata.record.clone().downcast::<T>().unwrap())
}

/// Insert an item into the cache
pub fn insert<T: DeepSizeOf + Send + Sync + 'static>(&self, path: Path, metadata: Arc<T>) {
let Some(cache) = self.cache.as_ref() else {
return;
Expand All @@ -147,6 +155,15 @@ impl FileMetadataCache {
cache.insert((path, TypeId::of::<T>()), SizedRecord::new(metadata));
}

/// Insert an item into the cache, using a str as the key
pub fn insert_by_str<T: DeepSizeOf + Send + Sync + 'static>(
&self,
key: &str,
metadata: Arc<T>,
) {
self.insert(Path::parse(key).unwrap(), metadata);
}

/// Get an item
///
/// If it exists in the cache return that
Expand Down
3 changes: 3 additions & 0 deletions rust/lance-core/src/datatypes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ pub const COMPRESSION_LEVEL_META_KEY: &str = "lance-encoding:compression-level";
pub const BLOB_META_KEY: &str = "lance-encoding:blob";
pub const PACKED_STRUCT_LEGACY_META_KEY: &str = "packed";
pub const PACKED_STRUCT_META_KEY: &str = "lance-encoding:packed";
pub const STRUCTURAL_ENCODING_META_KEY: &str = "lance-encoding:structural-encoding";
pub const STRUCTURAL_ENCODING_MINIBLOCK: &str = "miniblock";
pub const STRUCTURAL_ENCODING_FULLZIP: &str = "fullzip";

lazy_static::lazy_static! {
pub static ref BLOB_DESC_FIELDS: Fields =
Expand Down
2 changes: 1 addition & 1 deletion rust/lance-core/src/utils/path.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ impl LancePathExt for Path {
fn child_path(&self, path: &Path) -> Path {
let mut new_path = self.clone();
for part in path.parts() {
new_path = path.child(part);
new_path = new_path.child(part);
}
new_path
}
Expand Down
39 changes: 38 additions & 1 deletion rust/lance-datagen/src/generator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1307,6 +1307,38 @@ impl BatchGeneratorBuilder {
}
}

/// Factory for creating a single random array
pub struct ArrayGeneratorBuilder {
generator: Box<dyn ArrayGenerator>,
seed: Option<Seed>,
}

impl ArrayGeneratorBuilder {
fn new(generator: Box<dyn ArrayGenerator>) -> Self {
Self {
generator,
seed: None,
}
}

/// Use the given seed for the generator
pub fn with_seed(mut self, seed: Seed) -> Self {
self.seed = Some(seed);
self
}

/// Generate a single array with the given length
pub fn into_array_rows(
mut self,
length: RowCount,
) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(
self.seed.map(|s| s.0).unwrap_or(DEFAULT_SEED.0),
);
self.generator.generate(length, &mut rng)
}
}

const MS_PER_DAY: i64 = 86400000;

pub mod array {
Expand Down Expand Up @@ -1858,11 +1890,16 @@ pub mod array {
}
}

/// Create a BatchGeneratorBuilder to start generating data
/// Create a BatchGeneratorBuilder to start generating batch data
pub fn gen() -> BatchGeneratorBuilder {
BatchGeneratorBuilder::default()
}

/// Create an ArrayGeneratorBuilder to start generating array data
pub fn gen_array(gen: Box<dyn ArrayGenerator>) -> ArrayGeneratorBuilder {
ArrayGeneratorBuilder::new(gen)
}

/// Create a BatchGeneratorBuilder with the given schema
///
/// You can add more columns or convert this into a reader immediately
Expand Down
2 changes: 1 addition & 1 deletion rust/lance-encoding/src/data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -928,7 +928,7 @@ impl DataBlock {
Self::Empty() => Self::Empty(),
Self::Constant(inner) => Self::Constant(inner),
Self::AllNull(_) => panic!("Cannot remove validity on all-null data"),
Self::Nullable(inner) => *inner.data,
Self::Nullable(inner) => inner.data.remove_validity(),
Self::FixedWidth(inner) => Self::FixedWidth(inner),
Self::FixedSizeList(inner) => Self::FixedSizeList(inner.remove_validity()),
Self::VariableWidth(inner) => Self::VariableWidth(inner),
Expand Down
Loading

0 comments on commit 8fe7147

Please sign in to comment.