From 2bb6763e7709a3cecdbfe0fd18bcc2cd0cd89504 Mon Sep 17 00:00:00 2001 From: angelip2303 Date: Mon, 25 Dec 2023 15:23:27 +0000 Subject: [PATCH 01/10] improving the orientations and fixing the unwraps --- Cargo.toml | 3 +- examples/http_bench.rs | 21 +++---- examples/load_bench.rs | 16 +++-- examples/ntriples/main.rs | 20 +++--- examples/query_bench.rs | 23 ++++--- examples/rdf_xml/main.rs | 20 +++--- examples/serialize_bench.rs | 29 +++++---- examples/turtle/main.rs | 20 +++--- src/dictionary.rs | 4 ++ src/engine/array.rs | 10 +-- src/engine/chunk.rs | 16 ++--- src/engine/mod.rs | 10 +-- src/error.rs | 28 +++++++++ src/io/mod.rs | 48 ++++++++++++--- src/lib.rs | 2 +- src/main.rs | 14 ++++- src/storage/layout.rs | 96 ++++++++++++++++++++--------- src/storage/matrix.rs | 111 ++++++++++++++++++++++----------- src/storage/mod.rs | 111 +++++++++++++++++---------------- src/storage/ops.rs | 109 +++++++++++++++++++++++++++++++++ src/storage/params.rs | 118 ++++++++++++++++++++++++++++++++++++ src/storage/tabular.rs | 70 ++++++++++----------- src/utils.rs | 4 +- test.sh | 3 + tests/common/mod.rs | 26 +++++--- tests/get_object_test.rs | 82 ++++++++++++++----------- tests/get_subject_test.rs | 117 +++++++++++++++++++++++------------ tests/orientation.rs | 1 + tests/write_read_test.rs | 52 ++++++++++++---- 29 files changed, 831 insertions(+), 353 deletions(-) create mode 100644 src/storage/ops.rs create mode 100644 src/storage/params.rs create mode 100755 test.sh create mode 100644 tests/orientation.rs diff --git a/Cargo.toml b/Cargo.toml index bf0c655..6588ca9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,7 +5,7 @@ version = "0.0.1" edition = "2021" [dependencies] -zarrs = { version = "0.6.0", default-features = false, features = [ "http", "gzip", "sharding" ] } +zarrs = { version = "0.7.3", default-features = false, features = [ "http", "gzip", "sharding" ] } clap = { version = "4.1.8", features = ["derive"] } serde_json = "1.0.108" thiserror = "1.0.50" @@ -16,6 +16,7 @@ rio_xml = "0.8.4" rio_api = "0.8.4" safe-transmute = "0.11.2" rayon = "1.8.0" +parking_lot = "0.12" [target.'cfg(not(target_env = "msvc"))'.dependencies] jemallocator = "0.5.0" diff --git a/examples/http_bench.rs b/examples/http_bench.rs index 9c7ee89..1d5906d 100644 --- a/examples/http_bench.rs +++ b/examples/http_bench.rs @@ -1,20 +1,19 @@ -use remote_hdt::engine::EngineStrategy; +use remote_hdt::error::RemoteHDTError; use remote_hdt::storage::matrix::MatrixLayout; +use remote_hdt::storage::ops::Ops; +use remote_hdt::storage::params::Serialization; use remote_hdt::storage::HTTPStorage; use std::time::Instant; -fn main() { - let mut remote_hdt = HTTPStorage::new(MatrixLayout); +fn main() -> Result<(), RemoteHDTError> { + let mut remote_hdt = HTTPStorage::new(MatrixLayout, Serialization::Zarr); let arr = remote_hdt - .connect("https://raw.githubusercontent.com/weso/RemoteHDT/master/resources/root.zarr") - .unwrap(); - let index = remote_hdt - .get_dictionary() - .get_subject_idx_unchecked(""); + .connect("https://raw.githubusercontent.com/weso/RemoteHDT/master/resources/root.zarr")?; let before = Instant::now(); - arr.get_subject(index).unwrap(); - let after = before.elapsed(); + arr.get_subject("")?; - println!("Elapsed time: {:.2?}", after) + println!("Elapsed time: {:.2?}", before.elapsed()); + + Ok(()) } diff --git a/examples/load_bench.rs b/examples/load_bench.rs index 667c1f4..9bd391b 100644 --- a/examples/load_bench.rs +++ b/examples/load_bench.rs @@ -1,21 +1,25 @@ +use remote_hdt::error::RemoteHDTError; +use 
remote_hdt::storage::params::Serialization; use remote_hdt::storage::tabular::TabularLayout; use remote_hdt::storage::LocalStorage; use std::env; use std::time::Instant; -fn main() { +fn main() -> Result<(), RemoteHDTError> { let args: Vec = env::args().collect(); if args.len() <= 1 { panic!("Usage: cargo run --example query_bench "); } + let number_of_universities: &String = &args[1]; let zarr_path = format!("{}-lubm", number_of_universities); let before = Instant::now(); - LocalStorage::new(TabularLayout) - .load(format!("{}.zarr", zarr_path).as_str()) - .unwrap(); - let after = before.elapsed(); - println!("Elapsed time: {:.2?}", after) + LocalStorage::new(TabularLayout, Serialization::Zarr) + .load(format!("{}.zarr", zarr_path).as_str())?; + + println!("Elapsed time: {:.2?}", before.elapsed()); + + Ok(()) } diff --git a/examples/ntriples/main.rs b/examples/ntriples/main.rs index fc6af9c..1f4fbd3 100644 --- a/examples/ntriples/main.rs +++ b/examples/ntriples/main.rs @@ -1,13 +1,15 @@ +use remote_hdt::error::RemoteHDTError; +use remote_hdt::storage::params::{ChunkingStrategy, ReferenceSystem, Serialization}; use remote_hdt::storage::tabular::TabularLayout; -use remote_hdt::storage::ChunkingStrategy; use remote_hdt::storage::LocalStorage; -pub fn main() { - LocalStorage::new(TabularLayout) - .serialize( - "root.zarr", - "examples/ntriples/rdf.nt", - ChunkingStrategy::Chunk, - ) - .unwrap(); +pub fn main() -> Result<(), RemoteHDTError> { + LocalStorage::new(TabularLayout, Serialization::Zarr).serialize( + "root.zarr", + "examples/ntriples/rdf.nt", + ChunkingStrategy::Chunk, + ReferenceSystem::SPO, + )?; + + Ok(()) } diff --git a/examples/query_bench.rs b/examples/query_bench.rs index f53499f..c2ab5f8 100644 --- a/examples/query_bench.rs +++ b/examples/query_bench.rs @@ -1,30 +1,29 @@ -use remote_hdt::engine::EngineStrategy; +use remote_hdt::error::RemoteHDTError; use remote_hdt::storage::matrix::MatrixLayout; +use remote_hdt::storage::ops::Ops; +use remote_hdt::storage::params::Serialization; use remote_hdt::storage::LocalStorage; use std::env; use std::time::Instant; const SUBJECT: &str = ""; -fn main() { +fn main() -> Result<(), RemoteHDTError> { let args: Vec = env::args().collect(); if args.len() <= 1 { panic!("Usage: cargo run --example query_bench "); } + let number_of_universities: &String = &args[1]; let zarr_path = format!("{}-lubm", number_of_universities); - let mut remote_hdt = LocalStorage::new(MatrixLayout); - let arr = remote_hdt - .load(format!("{}.zarr", zarr_path).as_str()) - .unwrap(); - let index = remote_hdt - .get_dictionary() - .get_subject_idx_unchecked(SUBJECT); + let mut remote_hdt = LocalStorage::new(MatrixLayout, Serialization::Zarr); + let arr = remote_hdt.load(format!("{}.zarr", zarr_path).as_str())?; let before = Instant::now(); - arr.get_subject(index).unwrap(); - let after = before.elapsed(); + arr.get_subject(SUBJECT)?; + + println!("Elapsed time: {:.2?}", before.elapsed()); - println!("Elapsed time: {:.2?}", after) + Ok(()) } diff --git a/examples/rdf_xml/main.rs b/examples/rdf_xml/main.rs index ef19d9a..f69935a 100644 --- a/examples/rdf_xml/main.rs +++ b/examples/rdf_xml/main.rs @@ -1,13 +1,15 @@ +use remote_hdt::error::RemoteHDTError; +use remote_hdt::storage::params::{ChunkingStrategy, ReferenceSystem, Serialization}; use remote_hdt::storage::tabular::TabularLayout; -use remote_hdt::storage::ChunkingStrategy; use remote_hdt::storage::LocalStorage; -pub fn main() { - LocalStorage::new(TabularLayout) - .serialize( - "root.zarr", - 
"examples/rdf_xml/rdf.rdf", - ChunkingStrategy::Chunk, - ) - .unwrap(); +pub fn main() -> Result<(), RemoteHDTError> { + LocalStorage::new(TabularLayout, Serialization::Zarr).serialize( + "root.zarr", + "examples/rdf_xml/rdf.rdf", + ChunkingStrategy::Chunk, + ReferenceSystem::SPO, + )?; + + Ok(()) } diff --git a/examples/serialize_bench.rs b/examples/serialize_bench.rs index d9ee85c..1ace510 100644 --- a/examples/serialize_bench.rs +++ b/examples/serialize_bench.rs @@ -1,5 +1,6 @@ +use remote_hdt::error::RemoteHDTError; use remote_hdt::storage::matrix::MatrixLayout; -use remote_hdt::storage::ChunkingStrategy; +use remote_hdt::storage::params::{ChunkingStrategy, ReferenceSystem, Serialization}; use remote_hdt::storage::LocalStorage; use std::env; use std::time::Instant; @@ -8,24 +9,26 @@ use std::time::Instant; #[global_allocator] static ALLOCATOR: jemallocator::Jemalloc = jemallocator::Jemalloc; -fn main() { +fn main() -> Result<(), RemoteHDTError> { let args: Vec = env::args().collect(); if args.len() <= 3 { panic!("Usage: cargo run --example serialize_bench "); } - let rdf_path: &String = &args[1]; - let zarr_path: &String = &args[2]; - let shard_size: &String = &args[3]; + + let rdf_path = &args[1].as_str(); + let zarr_path = &args[2].as_str(); + let shard_size = &args[3].parse::().unwrap(); let before = Instant::now(); - LocalStorage::new(MatrixLayout) - .serialize( - &zarr_path.as_str(), - &rdf_path.as_str(), - ChunkingStrategy::Sharding(shard_size.parse::().unwrap()), - ) - .unwrap(); + LocalStorage::new(MatrixLayout, Serialization::Zarr).serialize( + zarr_path, + rdf_path, + ChunkingStrategy::Sharding(*shard_size), + ReferenceSystem::SPO, + )?; + + println!("Elapsed time: {:.2?}", before.elapsed()); - println!("Elapsed time: {:.2?}", before.elapsed()) + Ok(()) } diff --git a/examples/turtle/main.rs b/examples/turtle/main.rs index 2f689ee..a03af07 100644 --- a/examples/turtle/main.rs +++ b/examples/turtle/main.rs @@ -1,13 +1,15 @@ +use remote_hdt::error::RemoteHDTError; +use remote_hdt::storage::params::{ChunkingStrategy, ReferenceSystem, Serialization}; use remote_hdt::storage::tabular::TabularLayout; -use remote_hdt::storage::ChunkingStrategy; use remote_hdt::storage::LocalStorage; -pub fn main() { - LocalStorage::new(TabularLayout) - .serialize( - "root.zarr", - "examples/turtle/rdf.ttk", - ChunkingStrategy::Chunk, - ) - .unwrap(); +pub fn main() -> Result<(), RemoteHDTError> { + LocalStorage::new(TabularLayout, Serialization::Zarr).serialize( + "root.zarr", + "examples/turtle/rdf.ttk", + ChunkingStrategy::Chunk, + ReferenceSystem::SPO, + )?; + + Ok(()) } diff --git a/src/dictionary.rs b/src/dictionary.rs index 6a74608..9a875e0 100644 --- a/src/dictionary.rs +++ b/src/dictionary.rs @@ -50,6 +50,10 @@ impl Dictionary { self.subjects.len() } + pub fn predicates_size(&self) -> usize { + self.predicates.len() + } + pub fn objects_size(&self) -> usize { self.objects.len() } diff --git a/src/engine/array.rs b/src/engine/array.rs index 5ca2f5d..d6c4ef9 100644 --- a/src/engine/array.rs +++ b/src/engine/array.rs @@ -1,22 +1,22 @@ -use sprs::{CsMat, TriMat}; +use sprs::TriMat; use crate::storage::ZarrArray; use super::{EngineResult, EngineStrategy}; -impl EngineStrategy> for ZarrArray { - fn get_subject(&self, index: usize) -> EngineResult> { +impl EngineStrategy for ZarrArray { + fn get_first_term(&self, index: usize) -> EngineResult { let mut matrix = TriMat::new((self.rows(), self.rows())); matrix.add_triplet(index, index, 1); let matrix = matrix.to_csc(); Ok(&matrix * self) } - fn 
get_predicate(&self, _value: u8) -> EngineResult> { + fn get_second_term(&self, _value: usize) -> EngineResult { unimplemented!() } - fn get_object(&self, index: usize) -> EngineResult> { + fn get_third_term(&self, index: usize) -> EngineResult { let mut matrix = TriMat::new((self.cols(), self.cols())); matrix.add_triplet(index, index, 1); let matrix = matrix.to_csc(); diff --git a/src/engine/chunk.rs b/src/engine/chunk.rs index 1761804..7a17100 100644 --- a/src/engine/chunk.rs +++ b/src/engine/chunk.rs @@ -3,19 +3,19 @@ use zarrs::array_subset::ArraySubset; use zarrs::storage::ReadableStorageTraits; use crate::error::EngineError; -use crate::utils::objects_per_chunk; -use crate::utils::subjects_per_chunk; +use crate::utils::columns_per_shard; +use crate::utils::rows_per_shard; use super::EngineResult; use super::EngineStrategy; impl EngineStrategy> for Array { - fn get_subject(&self, index: usize) -> EngineResult> { - let index_to_chunk = index as u64 / subjects_per_chunk(self); - let chunk_to_index = index % subjects_per_chunk(self) as usize; + fn get_first_term(&self, index: usize) -> EngineResult> { + let index_to_chunk = index as u64 / rows_per_shard(self); + let chunk_to_index = index % rows_per_shard(self) as usize; match self .retrieve_chunk(&[index_to_chunk, 0])? - .chunks(objects_per_chunk(self) as usize) + .chunks(columns_per_shard(self) as usize) .nth(chunk_to_index) { Some(ans) => Ok(ans.to_owned()), @@ -23,11 +23,11 @@ impl EngineStrategy> for Array { } } - fn get_predicate(&self, _index: u8) -> EngineResult> { + fn get_second_term(&self, _index: usize) -> EngineResult> { unimplemented!() } - fn get_object(&self, index: usize) -> EngineResult> { + fn get_third_term(&self, index: usize) -> EngineResult> { let start = vec![0, index as u64]; let end = vec![self.shape()[0], index as u64]; let shape = &ArraySubset::new_with_start_end_inc(start, end)?; diff --git a/src/engine/mod.rs b/src/engine/mod.rs index f4b2400..a147cec 100644 --- a/src/engine/mod.rs +++ b/src/engine/mod.rs @@ -3,10 +3,10 @@ use crate::error::EngineError; pub mod array; pub mod chunk; -pub type EngineResult = Result; +pub(crate) type EngineResult = Result; -pub trait EngineStrategy { - fn get_subject(&self, index: usize) -> EngineResult; - fn get_predicate(&self, index: u8) -> EngineResult; - fn get_object(&self, index: usize) -> EngineResult; +pub(crate) trait EngineStrategy { + fn get_first_term(&self, index: usize) -> EngineResult; + fn get_second_term(&self, index: usize) -> EngineResult; + fn get_third_term(&self, index: usize) -> EngineResult; } diff --git a/src/error.rs b/src/error.rs index 531e8ae..3623781 100644 --- a/src/error.rs +++ b/src/error.rs @@ -33,6 +33,18 @@ pub enum RemoteHDTError { GZipCompression(#[from] GzipCompressionLevelError), #[error("The Graph you are trying to serialize is empty")] EmptyGraph, + #[error(transparent)] + Ops(#[from] OpsError), + #[error("The subjects has not been serialized properly")] + SubjectsNotInJSON, + #[error("The predicates has not been serialized properly")] + PredicatesNotInJSON, + #[error("The objects has not been serialized properly")] + ObjectsNotInJSON, + #[error("The Reference System has not been serialized properly")] + ReferenceSystemNotInJSON, + #[error("Error serializing the triples of the Graph")] + TripleSerialization, } #[derive(Error, Debug)] @@ -56,3 +68,19 @@ pub enum ParserError { #[error("No format provided")] NoFormatProvided, } + +#[derive(Error, Debug)] +pub enum OpsError { + #[error(transparent)] + Engine(#[from] EngineError), + 
#[error("The provided subject could not be found")] + SubjectNotFound, + #[error("The provided predicate could not be found")] + PredicateNotFound, + #[error("The provided object could not be found")] + ObjectNotFound, + #[error("The array has not been loaded correctly")] + EmptyArray, + #[error("The sparse array has not been loaded correctly")] + EmptySparseArray, +} diff --git a/src/io/mod.rs b/src/io/mod.rs index db1ad57..4c7685f 100644 --- a/src/io/mod.rs +++ b/src/io/mod.rs @@ -6,6 +6,7 @@ use std::io::BufReader; use crate::dictionary::Dictionary; use crate::error::ParserError; +use crate::storage::params::ReferenceSystem; use self::ntriples::NTriples; use self::rdf_xml::RdfXml; @@ -19,7 +20,7 @@ pub type RdfParserResult = Result<(Graph, Dictionary), ParserError>; pub type Graph = Vec>; trait Backend::Error>> { - fn parse(path: &str) -> RdfParserResult { + fn parse(path: &str, reference_system: &ReferenceSystem) -> RdfParserResult { // We create as many HashSets as fields we will be storing; that is, one // for the subjects, another for the predicates, and one for the objects. // The idea is that we will create a Dictionary matching every Term to @@ -49,10 +50,39 @@ trait Backend::Error>> { let sidx = dictionary.get_subject_idx_unchecked(&triple.subject.to_string()); let pidx = dictionary.get_predicate_idx_unchecked(&triple.predicate.to_string()); let oidx = dictionary.get_object_idx_unchecked(&triple.object.to_string()); - graph - .get_mut(sidx) - .unwrap() - .push((pidx as u32, oidx as u32)) + + match reference_system { + ReferenceSystem::SPO => { + if let Some(subject) = graph.get_mut(sidx) { + subject.push((pidx as u32, oidx as u32)) + } + } + ReferenceSystem::SOP => { + if let Some(subject) = graph.get_mut(sidx) { + subject.push((oidx as u32, pidx as u32)) + } + } + ReferenceSystem::PSO => { + if let Some(predicate) = graph.get_mut(pidx) { + predicate.push((sidx as u32, oidx as u32)) + } + } + ReferenceSystem::POS => { + if let Some(predicate) = graph.get_mut(pidx) { + predicate.push((oidx as u32, sidx as u32)) + } + } + ReferenceSystem::OPS => { + if let Some(object) = graph.get_mut(oidx) { + object.push((pidx as u32, sidx as u32)) + } + } + ReferenceSystem::OSP => { + if let Some(object) = graph.get_mut(oidx) { + object.push((sidx as u32, pidx as u32)) + } + } + } }; Ok(()) } as Result<(), E>) @@ -94,11 +124,11 @@ trait Backend::Error>> { pub struct RdfParser; impl RdfParser { - pub fn parse(path: &str) -> RdfParserResult { + pub fn parse(path: &str, reference_system: &ReferenceSystem) -> RdfParserResult { match path.split('.').last() { - Some("nt") => NTriples::parse(path), - Some("ttl") => Turtle::parse(path), - Some("rdf") => RdfXml::parse(path), + Some("nt") => NTriples::parse(path, reference_system), + Some("ttl") => Turtle::parse(path, reference_system), + Some("rdf") => RdfXml::parse(path, reference_system), Some(format) => Err(ParserError::NotSupportedFormat(format.to_string())), None => Err(ParserError::NoFormatProvided), } diff --git a/src/lib.rs b/src/lib.rs index 8ebbc75..1968cf7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,5 @@ pub mod dictionary; -pub mod engine; +mod engine; pub mod error; mod io; pub mod storage; diff --git a/src/main.rs b/src/main.rs index 3873624..27ee27c 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,5 +1,10 @@ use clap::Parser; -use remote_hdt::storage::{tabular::TabularLayout, ChunkingStrategy, LocalStorage, StorageResult}; +use remote_hdt::storage::params::ChunkingStrategy; +use remote_hdt::storage::params::ReferenceSystem; +use 
remote_hdt::storage::params::Serialization; +use remote_hdt::storage::tabular::TabularLayout; +use remote_hdt::storage::LocalStorage; +use remote_hdt::storage::StorageResult; #[derive(Parser, Debug)] #[command(author, version, about, long_about = None)] @@ -15,6 +20,11 @@ struct Args { fn main() -> StorageResult<()> { let args: Args = Args::parse(); - LocalStorage::new(TabularLayout).serialize(&args.zarr, &args.rdf, ChunkingStrategy::Chunk)?; + LocalStorage::new(TabularLayout, Serialization::Sparse).serialize( + &args.zarr, + &args.rdf, + ChunkingStrategy::Chunk, + ReferenceSystem::SPO, + )?; Ok(()) } diff --git a/src/storage/layout.rs b/src/storage/layout.rs index 66a326f..a4bf232 100644 --- a/src/storage/layout.rs +++ b/src/storage/layout.rs @@ -12,75 +12,111 @@ use zarrs::array_subset::ArraySubset; use zarrs::storage::store::FilesystemStore; use crate::dictionary::Dictionary; +use crate::error::RemoteHDTError; use crate::io::Graph; -use crate::utils::objects_per_chunk; -use crate::utils::subjects_per_chunk; +use crate::utils::columns_per_shard; +use crate::utils::rows_per_shard; use crate::utils::value_to_term; use super::ChunkingStrategy; +use super::Dimensionality; +use super::ReferenceSystem; use super::StorageResult; use super::ZarrArray; type ArrayToBytesCodec = Box; pub trait LayoutOps { - fn retrieve_attributes(&mut self, arr: &Array) -> Dictionary { + fn retrieve_attributes( + &mut self, + arr: &Array, + ) -> StorageResult<(Dictionary, ReferenceSystem)> { // 4. We get the attributes so we can obtain some values that we will need let attributes = arr.attributes(); - let subjects = &value_to_term(attributes.get("subjects").unwrap()); - let predicates = &value_to_term(attributes.get("predicates").unwrap()); - let objects = &value_to_term(attributes.get("objects").unwrap()); + let subjects = &value_to_term(match attributes.get("subjects") { + Some(subjects) => subjects, + None => return Err(RemoteHDTError::SubjectsNotInJSON), + }); + let predicates = &value_to_term(match attributes.get("predicates") { + Some(predicates) => predicates, + None => return Err(RemoteHDTError::PredicatesNotInJSON), + }); + let objects = &value_to_term(match attributes.get("objects") { + Some(objects) => objects, + None => return Err(RemoteHDTError::ObjectsNotInJSON), + }); - Dictionary::from_vec_str(subjects, predicates, objects) + let reference_system: ReferenceSystem = match attributes.get("reference_system") { + Some(reference_system) => reference_system, + None => return Err(RemoteHDTError::ReferenceSystemNotInJSON), + } + .as_str() + .unwrap() + .into(); + + Ok(( + Dictionary::from_vec_str(subjects, predicates, objects), + reference_system, + )) } fn serialize(&mut self, arr: Array, graph: Graph) -> StorageResult<()> { - let objects_size = arr.shape()[1] as usize; + let columns = arr.shape()[1] as usize; let count = AtomicU64::new(0); let binding = self.graph_iter(graph); - let iter = binding.chunks_exact(subjects_per_chunk(&arr) as usize); + let iter = binding.chunks_exact(rows_per_shard(&arr) as usize); let remainder = iter.remainder(); - iter.for_each(|chunk| { - arr.store_chunk_elements( - &[count.load(Ordering::Relaxed), 0], - self.chunk_elements(chunk, objects_size), - ) - .unwrap(); + for chunk in iter { + if arr + .store_chunk_elements( + &[count.load(Ordering::Relaxed), 0], + self.chunk_elements(chunk, columns), + ) + .is_err() + { + return Err(RemoteHDTError::TripleSerialization); + } + count.fetch_add(1, Ordering::Relaxed); - }); + } if !remainder.is_empty() { 
arr.store_array_subset_elements( &ArraySubset::new_with_start_shape( - vec![count.load(Ordering::Relaxed) * subjects_per_chunk(&arr), 0], - vec![remainder.len() as u64, objects_per_chunk(&arr)], - ) - .unwrap(), // TODO: remove unwrap - self.chunk_elements(remainder, objects_size), - ) - .unwrap(); + vec![count.load(Ordering::Relaxed) * rows_per_shard(&arr), 0], + vec![remainder.len() as u64, columns_per_shard(&arr)], + )?, + self.chunk_elements(remainder, columns), + )?; } Ok(()) } fn graph_iter(&self, graph: Graph) -> Vec; - fn chunk_elements(&self, chunk: &[C], objects: usize) -> Vec; - fn parse(&mut self, arr: Array, dictionary: &Dictionary) -> StorageResult; - fn sharding_factor(&self, subjects: usize, objects: usize) -> usize; + fn chunk_elements(&self, chunk: &[C], columns: usize) -> Vec; + fn parse( + &mut self, + arr: &Array, + dimensionality: &Dimensionality, + ) -> StorageResult; + fn sharding_factor(&self, dimensionality: &Dimensionality) -> usize; } pub trait Layout: LayoutOps { - fn shape(&self, dictionary: &Dictionary, graph: &Graph) -> Vec; + fn shape(&self, dimensionality: &Dimensionality) -> Vec; fn data_type(&self) -> DataType; fn chunk_shape( &self, chunking_strategy: ChunkingStrategy, - dictionary: &Dictionary, + dimensionality: &Dimensionality, ) -> ChunkGrid; fn fill_value(&self) -> FillValue; - fn dimension_names(&self) -> Option>; - fn array_to_bytes_codec(&self, dictionary: &Dictionary) -> StorageResult; + fn dimension_names(&self, reference_system: &ReferenceSystem) -> Option>; + fn array_to_bytes_codec( + &self, + dimensionality: &Dimensionality, + ) -> StorageResult; } diff --git a/src/storage/matrix.rs b/src/storage/matrix.rs index 4d4cffa..1d4679f 100644 --- a/src/storage/matrix.rs +++ b/src/storage/matrix.rs @@ -1,7 +1,7 @@ +use parking_lot::Mutex; use sprs::TriMat; use std::sync::atomic::AtomicU8; use std::sync::atomic::Ordering; -use std::sync::Mutex; use zarrs::array::codec::array_to_bytes::sharding::ShardingCodecBuilder; use zarrs::array::codec::ArrayToBytesCodecTraits; use zarrs::array::codec::GzipCodec; @@ -15,12 +15,13 @@ use zarrs::storage::ReadableStorageTraits; use super::layout::Layout; use super::layout::LayoutOps; use super::ChunkingStrategy; +use super::Dimensionality; +use super::ReferenceSystem; use super::StorageResult; use super::ZarrArray; -use crate::dictionary::Dictionary; use crate::io::Graph; -use crate::utils::subjects_per_chunk; +use crate::utils::rows_per_shard; type ZarrType = u8; type Chunk = Vec<(u32, u32)>; @@ -31,10 +32,10 @@ impl Layout for MatrixLayout where R: ReadableStorageTraits + Sized, { - fn shape(&self, dictionary: &Dictionary, _graph: &Graph) -> Vec { + fn shape(&self, dimensionality: &Dimensionality) -> Vec { vec![ - dictionary.subjects_size() as u64, - dictionary.objects_size() as u64, + dimensionality.get_first_term_size(), + dimensionality.get_third_term_size(), ] } @@ -45,28 +46,54 @@ where fn chunk_shape( &self, chunking_strategy: ChunkingStrategy, - dictionary: &Dictionary, + dimensionality: &Dimensionality, ) -> ChunkGrid { - vec![chunking_strategy.into(), dictionary.objects_size() as u64].into() + vec![ + chunking_strategy.into(), + dimensionality.get_third_term_size(), + ] + .into() } fn fill_value(&self) -> FillValue { FillValue::from(0u8) } - fn dimension_names(&self) -> Option> { - Some(vec![ - DimensionName::new("Subjects"), - DimensionName::new("Objects"), - ]) + fn dimension_names(&self, reference_system: &ReferenceSystem) -> Option> { + match reference_system { + ReferenceSystem::SPO => Some(vec![ + 
DimensionName::new("Subjects"), + DimensionName::new("Objects"), + ]), + ReferenceSystem::SOP => Some(vec![ + DimensionName::new("Subjects"), + DimensionName::new("Predicates"), + ]), + ReferenceSystem::PSO => Some(vec![ + DimensionName::new("Predicates"), + DimensionName::new("Objects"), + ]), + ReferenceSystem::POS => Some(vec![ + DimensionName::new("Predicates"), + DimensionName::new("Subjects"), + ]), + ReferenceSystem::OPS => Some(vec![ + DimensionName::new("Objects"), + DimensionName::new("Subjects"), + ]), + ReferenceSystem::OSP => Some(vec![ + DimensionName::new("Objects"), + DimensionName::new("Predicates"), + ]), + } } fn array_to_bytes_codec( &self, - dictionary: &Dictionary, + dimensionality: &Dimensionality, ) -> StorageResult> { let mut sharding_codec_builder = - ShardingCodecBuilder::new(vec![1, dictionary.objects_size() as u64]); + ShardingCodecBuilder::new(vec![1, dimensionality.get_third_term_size()]); sharding_codec_builder.bytes_to_bytes_codecs(vec![Box::new(GzipCodec::new(5)?)]); Ok(Box::new(sharding_codec_builder.build())) } @@ -80,16 +107,20 @@ where graph } - fn chunk_elements(&self, chunk: &[Chunk], objects: usize) -> Vec { - let slice: Vec = vec![0u8; chunk.len() * objects] + fn chunk_elements(&self, chunk: &[Chunk], columns: usize) -> Vec { + // We create a slice that has the size of the chunk filled with 0 values + // having the size of the shard; that is, number of rows, and a given + // number of columns. This value is converted into an AtomicU8 for us to + // be able to share it among threads + let slice: Vec = vec![0u8; chunk.len() * columns] .iter() .map(|&n| AtomicU8::new(n)) .collect(); - for (i, triples) in chunk.iter().enumerate() { - triples.iter().for_each(|&(predicate, object)| { - let object_idx = object as usize + i * objects; - slice[object_idx].store(predicate as ZarrType, Ordering::Relaxed); + for (first_term, triples) in chunk.iter().enumerate() { + triples.iter().for_each(|&(second_term, third_term)| { + let third_term_idx = third_term as usize + first_term * columns; + slice[third_term_idx].store(second_term as ZarrType, Ordering::Relaxed); }); } @@ -99,28 +130,36 @@ where .collect::>() } - fn parse(&mut self, arr: Array, dictionary: &Dictionary) -> StorageResult { + fn parse( + &mut self, + arr: &Array, + dimensionality: &Dimensionality, + ) -> StorageResult { let matrix = Mutex::new(TriMat::new(( - dictionary.subjects_size(), - dictionary.objects_size(), + dimensionality.first_term_size, + dimensionality.third_term_size, ))); - (0..arr.chunk_grid_shape().unwrap()[0]).for_each(|i| { + let number_of_chunks = match arr.chunk_grid_shape() { + Some(chunk_grid) => chunk_grid[0], + None => 0, + }; + (0..number_of_chunks).for_each(|i| { // Using this chunking strategy allows us to keep RAM usage low, // as we load elements by row arr.retrieve_chunk_elements::(&[i, 0]) .unwrap() - .chunks(dictionary.objects_size()) + .chunks(dimensionality.third_term_size) .enumerate() - .for_each(|(subject_idx, chunk)| { + .for_each(|(first_term_idx, chunk)| { chunk .iter() .enumerate() - .for_each(|(object_idx, &predicate_idx)| { - if predicate_idx != 0 { - matrix.lock().unwrap().add_triplet( - subject_idx + (i * subjects_per_chunk(&arr)) as usize, - object_idx, - predicate_idx, + .for_each(|(third_term_idx, &second_term_idx)| { + if second_term_idx != 0 { + matrix.lock().add_triplet( + first_term_idx + (i * rows_per_shard(arr)) as usize, + third_term_idx, + second_term_idx, ); } }) @@ -129,11 +168,11 @@ where // We use a CSC Matrix because typically, RDF 
knowledge graphs tend to // have more rows than columns - let x = matrix.lock().unwrap(); + let x = matrix.lock(); Ok(x.to_csc()) } - fn sharding_factor(&self, subjects: usize, _: usize) -> usize { - subjects + fn sharding_factor(&self, dimensionality: &Dimensionality) -> usize { + dimensionality.first_term_size } } diff --git a/src/storage/mod.rs b/src/storage/mod.rs index d8f6a57..6def7d2 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -9,16 +9,24 @@ use zarrs::array::ArrayBuilder; use zarrs::group::GroupBuilder; use zarrs::storage::store::FilesystemStore; use zarrs::storage::store::HTTPStore; +use zarrs::storage::ReadableStorageTraits; use crate::dictionary::Dictionary; use crate::error::RemoteHDTError; +use crate::io::Graph; use crate::io::RdfParser; use crate::utils::rdf_to_value; use self::layout::Layout; +use self::params::ChunkingStrategy; +use self::params::Dimensionality; +use self::params::ReferenceSystem; +use self::params::Serialization; mod layout; pub mod matrix; +pub mod ops; +pub mod params; pub mod tabular; pub type ZarrArray = CsMat; @@ -26,45 +34,57 @@ pub type StorageResult = Result; pub type LocalStorage = Storage; pub type HTTPStorage = Storage; -const ARRAY_NAME: &str = "/group/RemoteHDT"; - -pub enum ChunkingStrategy { - Chunk, - Sharding(u64), - Best, -} - -pub enum ThreadingStrategy { - Single, - Multi, -} - -impl From for u64 { - fn from(value: ChunkingStrategy) -> Self { - match value { - ChunkingStrategy::Chunk => 1, - ChunkingStrategy::Sharding(size) => size, - ChunkingStrategy::Best => 16, // TODO: set to the number of threads - } - } -} +const ARRAY_NAME: &str = "/group/RemoteHDT"; // TODO: parameterize this pub struct Storage { dictionary: Dictionary, + dimensionality: Dimensionality, layout: Box>, + serialization: Serialization, + reference_system: ReferenceSystem, + array: Option>, + sparse_array: Option, } -impl Storage { - pub fn new(layout: impl Layout + 'static) -> Self { +impl Storage { + pub fn new(layout: impl Layout + 'static, serialization: Serialization) -> Self { Storage { dictionary: Default::default(), + dimensionality: Default::default(), layout: Box::new(layout), + serialization, + reference_system: ReferenceSystem::SPO, + array: None, + sparse_array: None, } } pub fn get_dictionary(&self) -> Dictionary { self.dictionary.to_owned() } + + pub fn get_sparse_array(&self) -> Option { + self.sparse_array.to_owned() + } + + fn process_zarr(&mut self, storage: R) -> StorageResult<&Self> { + let store = Arc::new(storage); + let arr = Array::new(store, ARRAY_NAME)?; + let (dictionary, ref_system) = self.layout.retrieve_attributes(&arr)?; + self.dictionary = dictionary; + self.reference_system = ref_system; + self.dimensionality = + Dimensionality::new(&self.reference_system, &self.dictionary, &Graph::default()); + + match self.serialization { + Serialization::Zarr => self.array = Some(arr), + Serialization::Sparse => { + self.sparse_array = Some(self.layout.parse(&arr, &self.dimensionality)?) + } + } + + Ok(self) + } } impl LocalStorage { @@ -77,6 +97,7 @@ impl LocalStorage { zarr_path: &'a str, rdf_path: &'a str, chunking_strategy: ChunkingStrategy, + reference_system: ReferenceSystem, // threading_strategy: ThreadingStrategy, ) -> StorageResult<&Self> { // 1. 
The first thing that should be done is to check whether the path @@ -96,15 +117,17 @@ impl LocalStorage { let group = GroupBuilder::new().build(store.clone(), "/group")?; group.store_metadata()?; - // rayon::ThreadPoolBuilder::new() + // TODO: rayon::ThreadPoolBuilder::new() // .num_threads(1) // .build_global() // .unwrap(); // 3. Import the RDF dump using `rdf-rs` - let graph = match RdfParser::parse(rdf_path) { + let graph = match RdfParser::parse(rdf_path, &reference_system) { Ok((graph, dictionary)) => { self.dictionary = dictionary; + self.dimensionality = + Dimensionality::new(&reference_system, &self.dictionary, &graph); graph } Err(_) => todo!(), @@ -117,18 +140,20 @@ impl LocalStorage { let predicates = self.dictionary.predicates(); let objects = self.dictionary.objects(); let arr = ArrayBuilder::new( - self.layout.shape(&self.dictionary, &graph), + self.layout.shape(&self.dimensionality), self.layout.data_type(), - self.layout.chunk_shape(chunking_strategy, &self.dictionary), + self.layout + .chunk_shape(chunking_strategy, &self.dimensionality), self.layout.fill_value(), ) - .dimension_names(self.layout.dimension_names()) - .array_to_bytes_codec(self.layout.array_to_bytes_codec(&self.dictionary)?) + .dimension_names(self.layout.dimension_names(&reference_system)) + .array_to_bytes_codec(self.layout.array_to_bytes_codec(&self.dimensionality)?) .attributes({ let mut attributes = Map::new(); attributes.insert("subjects".into(), rdf_to_value(subjects)); attributes.insert("predicates".into(), rdf_to_value(predicates)); attributes.insert("objects".into(), rdf_to_value(objects)); + attributes.insert("reference_system".into(), reference_system.as_ref().into()); attributes }) // TODO: one attribute should be the Layout .build(store, ARRAY_NAME)?; @@ -140,31 +165,13 @@ impl LocalStorage { Ok(self) } - pub fn load(&mut self, zarr_path: &str) -> StorageResult> { - let store = Arc::new(FilesystemStore::new(zarr_path)?); - let arr = Array::new(store, ARRAY_NAME)?; - self.dictionary = self.layout.retrieve_attributes(&arr); - Ok(arr) - } - - // TODO: improve this naming convention - pub fn load_sparse(&mut self, zarr_path: &str) -> StorageResult { - let arr = self.load(zarr_path)?; - self.layout.parse(arr, &self.dictionary) + pub fn load(&mut self, zarr_path: &str) -> StorageResult<&Self> { + self.process_zarr(FilesystemStore::new(zarr_path)?) } } impl HTTPStorage { - pub fn connect(&mut self, url: &str) -> StorageResult> { - let store = Arc::new(HTTPStore::new(url)?); - let arr = Array::new(store, ARRAY_NAME)?; - self.dictionary = self.layout.retrieve_attributes(&arr); - Ok(arr) - } - - // TODO: improve this naming convention - pub fn connect_sparse(&mut self, url: &str) -> StorageResult { - let arr = self.connect(url)?; - self.layout.parse(arr, &self.dictionary) + pub fn connect(&mut self, url: &str) -> StorageResult<&Self> { + self.process_zarr(HTTPStore::new(url)?) 
} } diff --git a/src/storage/ops.rs b/src/storage/ops.rs new file mode 100644 index 0000000..6aa214f --- /dev/null +++ b/src/storage/ops.rs @@ -0,0 +1,109 @@ +use safe_transmute::TriviallyTransmutable; +use zarrs::storage::ReadableStorageTraits; + +use crate::engine::EngineStrategy; +use crate::error::OpsError; + +use super::params::ReferenceSystem; +use super::params::Serialization; +use super::Storage; +use super::ZarrArray; + +pub type OpsResult = Result; + +pub enum OpsFormat { + SparseArray(ZarrArray), + Zarr(Vec), +} + +pub trait Ops { + fn get_subject(&self, subject: &str) -> OpsResult; + fn get_predicate(&self, predicate: &str) -> OpsResult; + fn get_object(&self, object: &str) -> OpsResult; +} + +impl Ops for Storage { + fn get_subject(&self, subject: &str) -> OpsResult { + let index = match self.dictionary.get_subject_idx(subject) { + Some(index) => index, + None => return Err(OpsError::SubjectNotFound), + }; + + let ans = match self.serialization { + Serialization::Zarr => match &self.array { + Some(array) => OpsFormat::Zarr(match self.reference_system { + ReferenceSystem::SPO | ReferenceSystem::SOP => array.get_first_term(index)?, + ReferenceSystem::PSO | ReferenceSystem::OSP => array.get_second_term(index)?, + ReferenceSystem::POS | ReferenceSystem::OPS => array.get_third_term(index)?, + }), + None => return Err(OpsError::EmptyArray), + }, + Serialization::Sparse => match &self.sparse_array { + Some(array) => OpsFormat::SparseArray(match self.reference_system { + ReferenceSystem::SPO | ReferenceSystem::SOP => array.get_first_term(index)?, + ReferenceSystem::PSO | ReferenceSystem::OSP => array.get_second_term(index)?, + ReferenceSystem::POS | ReferenceSystem::OPS => array.get_third_term(index)?, + }), + None => return Err(OpsError::EmptySparseArray), + }, + }; + + Ok(ans) + } + + fn get_predicate(&self, predicate: &str) -> OpsResult { + let index = match self.dictionary.get_predicate_idx(predicate) { + Some(index) => index, + None => return Err(OpsError::PredicateNotFound), + }; + + let ans = match self.serialization { + Serialization::Zarr => match &self.array { + Some(array) => OpsFormat::Zarr(match self.reference_system { + ReferenceSystem::PSO | ReferenceSystem::POS => array.get_first_term(index)?, + ReferenceSystem::SPO | ReferenceSystem::OPS => array.get_second_term(index)?, + ReferenceSystem::SOP | ReferenceSystem::OSP => array.get_third_term(index)?, + }), + None => return Err(OpsError::EmptyArray), + }, + Serialization::Sparse => match &self.sparse_array { + Some(array) => OpsFormat::SparseArray(match self.reference_system { + ReferenceSystem::PSO | ReferenceSystem::POS => array.get_first_term(index)?, + ReferenceSystem::SPO | ReferenceSystem::OPS => array.get_second_term(index)?, + ReferenceSystem::SOP | ReferenceSystem::OSP => array.get_third_term(index)?, + }), + None => return Err(OpsError::EmptySparseArray), + }, + }; + + Ok(ans) + } + + fn get_object(&self, object: &str) -> OpsResult { + let index = match self.dictionary.get_object_idx(object) { + Some(index) => index, + None => return Err(OpsError::ObjectNotFound), + }; + + let ans = match self.serialization { + Serialization::Zarr => match &self.array { + Some(array) => OpsFormat::Zarr(match self.reference_system { + ReferenceSystem::OPS | ReferenceSystem::OSP => array.get_first_term(index)?, + ReferenceSystem::SOP | ReferenceSystem::POS => array.get_second_term(index)?, + ReferenceSystem::SPO | ReferenceSystem::PSO => array.get_third_term(index)?, + }), + None => return Err(OpsError::EmptyArray), + }, + 
Serialization::Sparse => match &self.sparse_array { + Some(array) => OpsFormat::SparseArray(match self.reference_system { + ReferenceSystem::OPS | ReferenceSystem::OSP => array.get_first_term(index)?, + ReferenceSystem::SOP | ReferenceSystem::POS => array.get_second_term(index)?, + ReferenceSystem::SPO | ReferenceSystem::PSO => array.get_third_term(index)?, + }), + None => return Err(OpsError::EmptySparseArray), + }, + }; + + Ok(ans) + } +} diff --git a/src/storage/params.rs b/src/storage/params.rs new file mode 100644 index 0000000..4fb733c --- /dev/null +++ b/src/storage/params.rs @@ -0,0 +1,118 @@ +use crate::dictionary::Dictionary; +use crate::io::Graph; + +pub enum Serialization { + Zarr, + Sparse, +} + +pub enum ChunkingStrategy { + Chunk, + Sharding(u64), + Best, +} + +pub enum ThreadingStrategy { + Single, + Multi, +} + +pub enum ReferenceSystem { + SPO, + SOP, + PSO, + POS, + OSP, + OPS, +} + +#[derive(Default)] +pub struct Dimensionality { + graph_size: Option, + pub(crate) first_term_size: usize, + second_term_size: usize, + pub(crate) third_term_size: usize, +} + +impl From for u64 { + fn from(value: ChunkingStrategy) -> Self { + match value { + ChunkingStrategy::Chunk => 1, + ChunkingStrategy::Sharding(size) => size, + ChunkingStrategy::Best => 16, // TODO: set to the number of threads + } + } +} + +impl AsRef for ReferenceSystem { + fn as_ref(&self) -> &str { + match self { + ReferenceSystem::SPO => "spo", + ReferenceSystem::SOP => "sop", + ReferenceSystem::PSO => "pso", + ReferenceSystem::POS => "pos", + ReferenceSystem::OSP => "osp", + ReferenceSystem::OPS => "ops", + } + } +} + +impl From<&str> for ReferenceSystem { + fn from(value: &str) -> Self { + match value { + "spo" => ReferenceSystem::SPO, + "sop" => ReferenceSystem::SOP, + "pso" => ReferenceSystem::PSO, + "pos" => ReferenceSystem::POS, + "osp" => ReferenceSystem::OSP, + "ops" => ReferenceSystem::OPS, + _ => ReferenceSystem::SPO, + } + } +} + +impl Dimensionality { + pub(crate) fn new( + reference_system: &ReferenceSystem, + dictionary: &Dictionary, + graph: &Graph, + ) -> Self { + Dimensionality { + graph_size: graph + .iter() + .map(|triples| triples.len()) + .reduce(|acc, a| acc + a), + first_term_size: match reference_system { + ReferenceSystem::SPO | ReferenceSystem::SOP => dictionary.subjects_size(), + ReferenceSystem::POS | ReferenceSystem::PSO => dictionary.predicates_size(), + ReferenceSystem::OPS | ReferenceSystem::OSP => dictionary.objects_size(), + }, + second_term_size: match reference_system { + ReferenceSystem::PSO | ReferenceSystem::OSP => dictionary.subjects_size(), + ReferenceSystem::SPO | ReferenceSystem::OPS => dictionary.predicates_size(), + ReferenceSystem::SOP | ReferenceSystem::POS => dictionary.objects_size(), + }, + third_term_size: match reference_system { + ReferenceSystem::POS | ReferenceSystem::OPS => dictionary.subjects_size(), + ReferenceSystem::SOP | ReferenceSystem::OSP => dictionary.predicates_size(), + ReferenceSystem::SPO | ReferenceSystem::PSO => dictionary.objects_size(), + }, + } + } + + pub(crate) fn get_graph_size(&self) -> u64 { + self.graph_size.unwrap() as u64 + } + + pub(crate) fn get_first_term_size(&self) -> u64 { + self.first_term_size as u64 + } + + pub(crate) fn get_second_term_size(&self) -> u64 { + self.second_term_size as u64 + } + + pub(crate) fn get_third_term_size(&self) -> u64 { + self.third_term_size as u64 + } +} diff --git a/src/storage/tabular.rs b/src/storage/tabular.rs index 8cca3f1..0c997e1 100644 --- a/src/storage/tabular.rs +++ 
b/src/storage/tabular.rs @@ -1,5 +1,5 @@ +use parking_lot::Mutex; use sprs::TriMat; -use std::sync::Mutex; use zarrs::array::codec::array_to_bytes::sharding::ShardingCodecBuilder; use zarrs::array::codec::ArrayToBytesCodecTraits; use zarrs::array::codec::GzipCodec; @@ -10,12 +10,13 @@ use zarrs::array::DimensionName; use zarrs::array::FillValue; use zarrs::storage::ReadableStorageTraits; -use crate::dictionary::Dictionary; use crate::io::Graph; use super::layout::Layout; use super::layout::LayoutOps; -use super::ChunkingStrategy; +use super::params::ChunkingStrategy; +use super::params::Dimensionality; +use super::params::ReferenceSystem; use super::StorageResult; use super::ZarrArray; @@ -28,26 +29,15 @@ impl Layout for TabularLayout where R: ReadableStorageTraits + Sized, { - fn shape(&self, _dictionary: &Dictionary, graph: &Graph) -> Vec { - vec![ - graph - .iter() - .map(|triples| triples.len() as u64) - .reduce(|acc, a| acc + a) - .unwrap(), - 3, - ] + fn shape(&self, dimensionality: &Dimensionality) -> Vec { + vec![dimensionality.get_graph_size(), 3] } fn data_type(&self) -> DataType { DataType::UInt64 } - fn chunk_shape( - &self, - chunking_strategy: ChunkingStrategy, - _dictionary: &Dictionary, - ) -> ChunkGrid { + fn chunk_shape(&self, chunking_strategy: ChunkingStrategy, _: &Dimensionality) -> ChunkGrid { vec![chunking_strategy.into(), 3].into() // TODO: make this a constant value } @@ -55,7 +45,7 @@ where FillValue::from(0u64) } - fn dimension_names(&self) -> Option> { + fn dimension_names(&self, _: &ReferenceSystem) -> Option> { Some(vec![ DimensionName::new("Triples"), DimensionName::new("Fields"), @@ -64,7 +54,7 @@ where fn array_to_bytes_codec( &self, - _dictionary: &Dictionary, + _: &Dimensionality, ) -> StorageResult> { let mut sharding_codec_builder = ShardingCodecBuilder::new(vec![1, 3]); sharding_codec_builder.bytes_to_bytes_codecs(vec![Box::new(GzipCodec::new(5)?)]); @@ -80,10 +70,10 @@ where graph .iter() .enumerate() - .flat_map(|(subject, triples)| { + .flat_map(|(first_term, triples)| { triples .iter() - .map(|&(predicate, object)| (subject as u32, predicate, object)) + .map(|&(second_term, third_term)| (first_term as u32, second_term, third_term)) .collect::>() }) .collect::>() @@ -91,40 +81,46 @@ where fn chunk_elements(&self, chunk: &[Chunk], _: usize) -> Vec { let mut ans = Vec::new(); - for &(subject, predicate, object) in chunk { - ans.push(subject as ZarrType); - ans.push(predicate as ZarrType); - ans.push(object as ZarrType); + for &(first_term, second_term, third_term) in chunk { + ans.push(first_term as ZarrType); + ans.push(second_term as ZarrType); + ans.push(third_term as ZarrType); } ans } - fn parse(&mut self, arr: Array, dictionary: &Dictionary) -> StorageResult { + fn parse( + &mut self, + arr: &Array, + dimensionality: &Dimensionality, + ) -> StorageResult { let matrix = Mutex::new(TriMat::new(( - dictionary.subjects_size(), - dictionary.objects_size(), + dimensionality.first_term_size, + dimensionality.third_term_size, ))); - (0..arr.chunk_grid_shape().unwrap()[0] as usize).for_each(|i| { + let number_of_chunks = match arr.chunk_grid_shape() { + Some(chunk_grid) => chunk_grid[0] as usize, + None => 0, + }; + (0..number_of_chunks).for_each(|i| { // Using this chunking strategy allows us to keep RAM usage low, // as we load elements by row - arr.retrieve_chunk_elements::(&[i as ZarrType, 0]) - .unwrap() - .chunks(3) - .for_each(|triple| { + if let Ok(chunk_elements) = arr.retrieve_chunk_elements::(&[i as ZarrType, 0]) { + 
chunk_elements.chunks(3).for_each(|triple| { matrix .lock() - .unwrap() .add_triplet(triple[0], triple[2], triple[1] as u8); }) + } }); // We use a CSC Matrix because typically, RDF knowledge graphs tend to // have more rows than columns - let x = matrix.lock().unwrap(); + let x = matrix.lock(); Ok(x.to_csc()) } - fn sharding_factor(&self, subjects: usize, objects: usize) -> usize { - subjects * objects + fn sharding_factor(&self, dimensionality: &Dimensionality) -> usize { + dimensionality.first_term_size * dimensionality.third_term_size } } diff --git a/src/utils.rs b/src/utils.rs index 631ec71..6680a32 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -32,7 +32,7 @@ pub fn hash_to_set(terms: HashSet) -> Vec { vec } -pub fn subjects_per_chunk(arr: &Array) -> u64 { +pub fn rows_per_shard(arr: &Array) -> u64 { match arr.chunk_grid().chunk_shape(&[0, 0], arr.shape()) { Ok(shape) => match shape { Some(chunk_shape) => chunk_shape[0], @@ -42,7 +42,7 @@ pub fn subjects_per_chunk(arr: &Array) -> u64 { } } -pub fn objects_per_chunk(arr: &Array) -> u64 { +pub fn columns_per_shard(arr: &Array) -> u64 { match arr.chunk_grid().chunk_shape(&[0, 0], arr.shape()) { Ok(shape) => match shape { Some(chunk_shape) => chunk_shape[1], diff --git a/test.sh b/test.sh new file mode 100755 index 0000000..515468a --- /dev/null +++ b/test.sh @@ -0,0 +1,3 @@ +rm -r tests/out/* +cargo test +rm -r tests/out/* \ No newline at end of file diff --git a/tests/common/mod.rs b/tests/common/mod.rs index 3f08f76..ba42659 100644 --- a/tests/common/mod.rs +++ b/tests/common/mod.rs @@ -1,15 +1,15 @@ #![allow(dead_code)] +use remote_hdt::dictionary::Dictionary; +use remote_hdt::storage::params::ChunkingStrategy; +use remote_hdt::storage::params::ReferenceSystem; +use remote_hdt::storage::Storage; use safe_transmute::TriviallyTransmutable; -use sprs::{CsMat, TriMat}; +use sprs::CsMat; +use sprs::TriMat; use std::fs::File; use zarrs::storage::store::FilesystemStore; -use remote_hdt::{ - dictionary::Dictionary, - storage::{ChunkingStrategy, Storage}, -}; - pub const TABULAR_ZARR: &str = "tests/out/tabular.zarr"; pub const MATRIX_ZARR: &str = "tests/out/matrix.zarr"; pub const SHARDING_ZARR: &str = "tests/out/sharding.zarr"; @@ -19,10 +19,16 @@ pub fn setup( path: &str, storage: &mut Storage, chunking_strategy: ChunkingStrategy, + reference_system: ReferenceSystem, ) { if File::open(path).is_err() { storage - .serialize(path, "resources/rdf.nt", chunking_strategy) + .serialize( + path, + "resources/rdf.nt", + chunking_strategy, + reference_system, + ) .unwrap(); } else { storage.load(path).unwrap(); @@ -37,7 +43,7 @@ pub enum Subject { } impl Subject { - pub(crate) fn get_idx(self, dictionary: &Dictionary) -> usize { + fn get_idx(self, dictionary: &Dictionary) -> usize { dictionary.get_subject_idx_unchecked(self.into()) } } @@ -65,7 +71,7 @@ pub enum Predicate { } impl Predicate { - pub fn get_idx(self, dictionary: &Dictionary) -> u8 { + fn get_idx(self, dictionary: &Dictionary) -> u8 { dictionary.get_predicate_idx_unchecked(self.into()) as u8 } } @@ -98,7 +104,7 @@ pub enum Object { } impl Object { - pub fn get_idx(self, dictionary: &Dictionary) -> usize { + fn get_idx(self, dictionary: &Dictionary) -> usize { dictionary.get_object_idx_unchecked(self.into()) } } diff --git a/tests/get_object_test.rs b/tests/get_object_test.rs index 15bfd4e..ef342a3 100644 --- a/tests/get_object_test.rs +++ b/tests/get_object_test.rs @@ -1,55 +1,69 @@ -use remote_hdt::{ - engine::EngineStrategy, - storage::{matrix::MatrixLayout, tabular::TabularLayout, 
ChunkingStrategy, LocalStorage}, -}; +use remote_hdt::storage::matrix::MatrixLayout; +use remote_hdt::storage::ops::Ops; +use remote_hdt::storage::ops::OpsFormat; +use remote_hdt::storage::params::ChunkingStrategy; +use remote_hdt::storage::params::ReferenceSystem; +use remote_hdt::storage::params::Serialization; +use remote_hdt::storage::tabular::TabularLayout; +use remote_hdt::storage::LocalStorage; use sprs::TriMat; +use std::error::Error; + mod common; #[test] -fn get_object_matrix_chunk_test() { - let mut storage = LocalStorage::new(MatrixLayout); - common::setup(common::MATRIX_ZARR, &mut storage, ChunkingStrategy::Chunk); - - let actual = storage - .load(common::MATRIX_ZARR) - .unwrap() - .get_object(common::Object::Alan.get_idx(&storage.get_dictionary())) - .unwrap(); +fn get_object_matrix_sharding_test() -> Result<(), Box> { + let mut storage = LocalStorage::new(MatrixLayout, Serialization::Zarr); - assert_eq!(actual, vec![0, 3, 0, 0, 0]) -} - -#[test] -fn get_object_matrix_sharding_test() { - let mut storage = LocalStorage::new(MatrixLayout); common::setup( common::SHARDING_ZARR, &mut storage, ChunkingStrategy::Sharding(3), + ReferenceSystem::SPO, ); - let actual = storage - .load(common::SHARDING_ZARR) - .unwrap() - .get_object(0) - .unwrap(); + let actual = match storage + .load(common::SHARDING_ZARR)? + .get_object(common::Object::Date.into())? + { + OpsFormat::Zarr(actual) => actual, + _ => unreachable!(), + }; - assert_eq!(actual, vec![2, 0, 0, 0, 0]) + if actual == vec![2, 0, 0, 0, 0] { + Ok(()) + } else { + println!("{:?}", actual); + Err(String::from("Expected and actual results are not equals").into()) + } } #[test] -fn get_object_tabular_test() { - let mut storage = LocalStorage::new(TabularLayout); - common::setup(common::TABULAR_ZARR, &mut storage, ChunkingStrategy::Chunk); +fn get_object_tabular_test() -> Result<(), Box> { + let mut storage = LocalStorage::new(TabularLayout, Serialization::Sparse); - let actual = storage - .load_sparse(common::TABULAR_ZARR) - .unwrap() - .get_object(common::Object::Alan.get_idx(&storage.get_dictionary())) - .unwrap(); + common::setup( + common::TABULAR_ZARR, + &mut storage, + ChunkingStrategy::Chunk, + ReferenceSystem::SPO, + ); + + let actual = match storage + .load(common::TABULAR_ZARR)? + .get_object(common::Object::Alan.into())? 
+ { + OpsFormat::SparseArray(actual) => actual, + _ => unreachable!(), + }; let mut expected = TriMat::new((4, 9)); expected.add_triplet(1, 3, 3); let expected = expected.to_csc(); - assert_eq!(actual, expected) + + if actual == expected { + Ok(()) + } else { + Err(String::from("Expected and actual results are not equals").into()) + } } diff --git a/tests/get_subject_test.rs b/tests/get_subject_test.rs index 4a8d5fb..a291f95 100644 --- a/tests/get_subject_test.rs +++ b/tests/get_subject_test.rs @@ -1,59 +1,98 @@ -use remote_hdt::{ - engine::EngineStrategy, - storage::{matrix::MatrixLayout, tabular::TabularLayout, ChunkingStrategy, LocalStorage}, -}; +use remote_hdt::storage::matrix::MatrixLayout; +use remote_hdt::storage::ops::Ops; +use remote_hdt::storage::ops::OpsFormat; +use remote_hdt::storage::params::ChunkingStrategy; +use remote_hdt::storage::params::ReferenceSystem; +use remote_hdt::storage::params::Serialization; +use remote_hdt::storage::tabular::TabularLayout; +use remote_hdt::storage::LocalStorage; use sprs::TriMat; +use std::error::Error; + mod common; #[test] -fn get_subject_matrix_chunk_test() { - let mut storage = LocalStorage::new(MatrixLayout); - common::setup(common::MATRIX_ZARR, &mut storage, ChunkingStrategy::Chunk); +fn get_subject_matrix_chunk_test() -> Result<(), Box> { + let mut storage = LocalStorage::new(MatrixLayout, Serialization::Zarr); - let actual = storage - .load(common::MATRIX_ZARR) - .unwrap() - .get_subject(common::Subject::Alan.get_idx(&storage.get_dictionary())) - .unwrap(); + common::setup( + common::MATRIX_ZARR, + &mut storage, + ChunkingStrategy::Chunk, + ReferenceSystem::SPO, + ); - assert_eq!(actual, vec![2, 4, 5, 0, 0, 0, 0, 7, 8]) + let actual = match storage + .load(common::MATRIX_ZARR)? + .get_subject(common::Subject::Alan.into())? + { + OpsFormat::Zarr(actual) => actual, + _ => unreachable!(), + }; + + if actual == vec![2, 4, 5, 0, 0, 0, 0, 7, 8] { + Ok(()) + } else { + Err(String::from("Expected and actual results are not equals").into()) + } } #[test] -fn get_subject_matrix_sharding_test() { - let mut storage = LocalStorage::new(MatrixLayout); +fn get_subject_matrix_sharding_test() -> Result<(), Box> { + let mut storage = LocalStorage::new(MatrixLayout, Serialization::Zarr); + common::setup( common::SHARDING_ZARR, &mut storage, ChunkingStrategy::Sharding(3), + ReferenceSystem::SPO, ); - let actual = storage - .load(common::SHARDING_ZARR) - .unwrap() - .get_subject(3) - .unwrap(); + let actual = match storage + .load(common::SHARDING_ZARR)? + .get_subject(common::Subject::Wilmslow.into())? 
+ { + OpsFormat::Zarr(actual) => actual, + _ => unreachable!(), + }; - assert_eq!(actual, vec![0, 0, 0, 0, 0, 5, 1, 0, 0]) + if actual == vec![0, 0, 0, 0, 0, 5, 1, 0, 0] { + Ok(()) + } else { + Err(String::from("Expected and actual results are not equals").into()) + } } #[test] -fn get_subject_tabular_test() { - let mut storage = LocalStorage::new(TabularLayout); - common::setup(common::TABULAR_ZARR, &mut storage, ChunkingStrategy::Chunk); - - let actual = storage - .load_sparse(common::TABULAR_ZARR) - .unwrap() - .get_subject(common::Subject::Alan.get_idx(&storage.get_dictionary())) - .unwrap(); - - let mut result = TriMat::new((4, 9)); - result.add_triplet(0, 0, 2); - result.add_triplet(0, 1, 4); - result.add_triplet(0, 2, 5); - result.add_triplet(0, 7, 7); - result.add_triplet(0, 8, 8); - let result = result.to_csc(); - assert_eq!(actual, result) +fn get_subject_tabular_test() -> Result<(), Box> { + let mut storage = LocalStorage::new(TabularLayout, Serialization::Sparse); + + common::setup( + common::TABULAR_ZARR, + &mut storage, + ChunkingStrategy::Chunk, + ReferenceSystem::SPO, + ); + + let actual = match storage + .load(common::TABULAR_ZARR)? + .get_subject(common::Subject::Alan.into())? + { + OpsFormat::SparseArray(actual) => actual, + _ => unreachable!(), + }; + + let mut expected = TriMat::new((4, 9)); + expected.add_triplet(0, 0, 2); + expected.add_triplet(0, 1, 4); + expected.add_triplet(0, 2, 5); + expected.add_triplet(0, 7, 7); + expected.add_triplet(0, 8, 8); + let expected = expected.to_csc(); + + if actual == expected { + Ok(()) + } else { + Err(String::from("Expected and actual results are not equals").into()) + } } diff --git a/tests/orientation.rs b/tests/orientation.rs new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/tests/orientation.rs @@ -0,0 +1 @@ + diff --git a/tests/write_read_test.rs b/tests/write_read_test.rs index cebabe8..e2f2e3c 100644 --- a/tests/write_read_test.rs +++ b/tests/write_read_test.rs @@ -1,57 +1,83 @@ -use remote_hdt::storage::{ - matrix::MatrixLayout, tabular::TabularLayout, ChunkingStrategy, LocalStorage, -}; +use remote_hdt::storage::matrix::MatrixLayout; +use remote_hdt::storage::params::ChunkingStrategy; +use remote_hdt::storage::params::ReferenceSystem; +use remote_hdt::storage::params::Serialization; +use remote_hdt::storage::tabular::TabularLayout; +use remote_hdt::storage::LocalStorage; mod common; #[test] fn write_read_tabular_test() { - let mut storage = LocalStorage::new(TabularLayout); - common::setup(common::MATRIX_ZARR, &mut storage, ChunkingStrategy::Chunk); + let mut storage = LocalStorage::new(TabularLayout, Serialization::Sparse); + + common::setup( + common::MATRIX_ZARR, + &mut storage, + ChunkingStrategy::Chunk, + ReferenceSystem::SPO, + ); + + storage.load(common::TABULAR_ZARR).unwrap(); + assert_eq!( - storage.load_sparse(common::TABULAR_ZARR).unwrap(), + storage.get_sparse_array().unwrap(), common::Graph::new(&storage.get_dictionary()) ) } #[test] fn write_read_matrix_test() { - let mut storage = LocalStorage::new(MatrixLayout); - common::setup(common::MATRIX_ZARR, &mut storage, ChunkingStrategy::Chunk); + let mut storage = LocalStorage::new(MatrixLayout, Serialization::Sparse); + common::setup( + common::MATRIX_ZARR, + &mut storage, + ChunkingStrategy::Chunk, + ReferenceSystem::SPO, + ); + + storage.load(common::MATRIX_ZARR).unwrap(); + assert_eq!( - storage.load_sparse(common::MATRIX_ZARR).unwrap(), + storage.get_sparse_array().unwrap(), common::Graph::new(&storage.get_dictionary()) ) } #[test] fn 
write_read_matrix_sharding_test() { - let mut storage = LocalStorage::new(MatrixLayout); + let mut storage = LocalStorage::new(MatrixLayout, Serialization::Sparse); common::setup( common::SHARDING_ZARR, &mut storage, ChunkingStrategy::Sharding(3), + ReferenceSystem::SPO, ); + storage.load(common::SHARDING_ZARR).unwrap(); + assert_eq!( - storage.load_sparse(common::SHARDING_ZARR).unwrap(), + storage.get_sparse_array().unwrap(), common::Graph::new(&storage.get_dictionary()) ) } #[test] fn write_read_larger_than_triples_shard_test() { - let mut storage = LocalStorage::new(MatrixLayout); + let mut storage = LocalStorage::new(MatrixLayout, Serialization::Sparse); common::setup( common::LARGER_ZARR, &mut storage, ChunkingStrategy::Sharding(10000), + ReferenceSystem::SPO, ); + storage.load(common::LARGER_ZARR).unwrap(); + assert_eq!( - storage.load_sparse(common::LARGER_ZARR).unwrap(), + storage.get_sparse_array().unwrap(), common::Graph::new(&storage.get_dictionary()) ) } From 03bb447f9f42d02e216ed4e65fb0915a8897d33f Mon Sep 17 00:00:00 2001 From: angelip2303 Date: Tue, 26 Dec 2023 13:51:31 +0000 Subject: [PATCH 02/10] improvements --- src/dictionary.rs | 33 ++++++++- src/engine/array.rs | 3 +- src/engine/chunk.rs | 29 ++++---- src/io/mod.rs | 12 +++- src/storage/layout.rs | 13 ++-- src/storage/matrix.rs | 12 ++-- src/storage/mod.rs | 17 ++--- src/storage/ops.rs | 3 +- src/storage/params.rs | 13 ++-- src/storage/tabular.rs | 4 +- tests/common/mod.rs | 11 ++- tests/orientation.rs | 149 +++++++++++++++++++++++++++++++++++++++++ 12 files changed, 245 insertions(+), 54 deletions(-) diff --git a/src/dictionary.rs b/src/dictionary.rs index 9a875e0..c358b4a 100644 --- a/src/dictionary.rs +++ b/src/dictionary.rs @@ -2,10 +2,13 @@ use std::collections::HashSet; use fcsd::Set; +use crate::storage::params::ReferenceSystem; + use super::utils::hash_to_set; #[derive(Clone)] pub struct Dictionary { + reference_system: ReferenceSystem, subjects: Set, predicates: Set, objects: Set, @@ -14,6 +17,7 @@ pub struct Dictionary { impl Default for Dictionary { fn default() -> Self { Dictionary { + reference_system: ReferenceSystem::SPO, subjects: Set::new(vec!["PlaceHolder"]).unwrap(), predicates: Set::new(vec!["PlaceHolder"]).unwrap(), objects: Set::new(vec!["PlaceHolder"]).unwrap(), @@ -23,11 +27,13 @@ impl Default for Dictionary { impl Dictionary { pub(crate) fn from_vec_str( + reference_system: ReferenceSystem, subjects: &Vec, predicates: &Vec, objects: &Vec, ) -> Self { Dictionary { + reference_system, subjects: Set::new(subjects).unwrap(), predicates: Set::new(predicates).unwrap(), objects: Set::new(objects).unwrap(), @@ -35,11 +41,13 @@ impl Dictionary { } pub(crate) fn from_set_terms( + reference_system: ReferenceSystem, subjects: HashSet, predicates: HashSet, objects: HashSet, ) -> Self { Dictionary { + reference_system, subjects: Set::new(hash_to_set(subjects)).unwrap(), predicates: Set::new(hash_to_set(predicates)).unwrap(), objects: Set::new(hash_to_set(objects)).unwrap(), @@ -70,9 +78,18 @@ impl Dictionary { self.objects.to_owned() } + pub fn get_reference_system(&self) -> ReferenceSystem { + self.reference_system.to_owned() + } + pub fn get_subject_idx(&self, subject: &str) -> Option { let mut locator = self.subjects.locator(); - locator.run(subject) + match self.reference_system { + ReferenceSystem::PSO | ReferenceSystem::OSP => { + locator.run(subject).map(|value| value + 1) + } + _ => locator.run(subject), + } } pub fn get_subject_idx_unchecked(&self, subject: &str) -> usize { @@ -81,7 +98,12 @@ impl 
Dictionary { pub fn get_predicate_idx(&self, predicate: &str) -> Option { let mut locator = self.predicates.locator(); - locator.run(predicate).map(|value| value + 1) + match self.reference_system { + ReferenceSystem::SPO | ReferenceSystem::OPS => { + locator.run(predicate).map(|value| value + 1) + } + _ => locator.run(predicate), + } } pub fn get_predicate_idx_unchecked(&self, predicate: &str) -> usize { @@ -90,7 +112,12 @@ impl Dictionary { pub fn get_object_idx(&self, object: &str) -> Option { let mut locator = self.objects.locator(); - locator.run(object) + match self.reference_system { + ReferenceSystem::SOP | ReferenceSystem::POS => { + locator.run(object).map(|value| value + 1) + } + _ => locator.run(object), + } } pub fn get_object_idx_unchecked(&self, object: &str) -> usize { diff --git a/src/engine/array.rs b/src/engine/array.rs index d6c4ef9..3a97292 100644 --- a/src/engine/array.rs +++ b/src/engine/array.rs @@ -2,7 +2,8 @@ use sprs::TriMat; use crate::storage::ZarrArray; -use super::{EngineResult, EngineStrategy}; +use super::EngineResult; +use super::EngineStrategy; impl EngineStrategy for ZarrArray { fn get_first_term(&self, index: usize) -> EngineResult { diff --git a/src/engine/chunk.rs b/src/engine/chunk.rs index 7a17100..cd03c92 100644 --- a/src/engine/chunk.rs +++ b/src/engine/chunk.rs @@ -2,32 +2,33 @@ use zarrs::array::Array; use zarrs::array_subset::ArraySubset; use zarrs::storage::ReadableStorageTraits; -use crate::error::EngineError; +use crate::storage::ZarrType; use crate::utils::columns_per_shard; use crate::utils::rows_per_shard; use super::EngineResult; use super::EngineStrategy; -impl EngineStrategy> for Array { - fn get_first_term(&self, index: usize) -> EngineResult> { +impl EngineStrategy> for Array { + fn get_first_term(&self, index: usize) -> EngineResult> { let index_to_chunk = index as u64 / rows_per_shard(self); - let chunk_to_index = index % rows_per_shard(self) as usize; - match self - .retrieve_chunk(&[index_to_chunk, 0])? - .chunks(columns_per_shard(self) as usize) - .nth(chunk_to_index) - { - Some(ans) => Ok(ans.to_owned()), - None => Err(EngineError::Operation), - } + let chunk_to_index = index as u64 % rows_per_shard(self); + Ok(self + .retrieve_chunk_subset_elements( + &[index_to_chunk, 0], + &ArraySubset::new_with_start_end_inc( + vec![chunk_to_index, 0], + vec![chunk_to_index, columns_per_shard(self) - 1], + )?, + )? 
+ .to_vec()) } - fn get_second_term(&self, _index: usize) -> EngineResult> { + fn get_second_term(&self, _index: usize) -> EngineResult> { unimplemented!() } - fn get_third_term(&self, index: usize) -> EngineResult> { + fn get_third_term(&self, index: usize) -> EngineResult> { let start = vec![0, index as u64]; let end = vec![self.shape()[0], index as u64]; let shape = &ArraySubset::new_with_start_end_inc(start, end)?; diff --git a/src/io/mod.rs b/src/io/mod.rs index 4c7685f..7bf6a33 100644 --- a/src/io/mod.rs +++ b/src/io/mod.rs @@ -42,8 +42,16 @@ trait Backend::Error>> { return Err(ParserError::Dictionary(err)); } - let mut graph = vec![Vec::new(); subjects.len()]; - let dictionary = Dictionary::from_set_terms(subjects, predicates, objects); + let mut graph = vec![ + Vec::new(); + match reference_system { + ReferenceSystem::SPO | ReferenceSystem::SOP => subjects.len(), + ReferenceSystem::PSO | ReferenceSystem::POS => predicates.len(), + ReferenceSystem::OSP | ReferenceSystem::OPS => objects.len(), + } + ]; + let dictionary = + Dictionary::from_set_terms(reference_system.to_owned(), subjects, predicates, objects); if let Err(err) = Self::parser_fn(path, &mut |triple: Triple| { { diff --git a/src/storage/layout.rs b/src/storage/layout.rs index a4bf232..0cc6368 100644 --- a/src/storage/layout.rs +++ b/src/storage/layout.rs @@ -27,10 +27,7 @@ use super::ZarrArray; type ArrayToBytesCodec = Box; pub trait LayoutOps { - fn retrieve_attributes( - &mut self, - arr: &Array, - ) -> StorageResult<(Dictionary, ReferenceSystem)> { + fn retrieve_attributes(&mut self, arr: &Array) -> StorageResult { // 4. We get the attributes so we can obtain some values that we will need let attributes = arr.attributes(); @@ -55,16 +52,18 @@ pub trait LayoutOps { .unwrap() .into(); - Ok(( - Dictionary::from_vec_str(subjects, predicates, objects), + Ok(Dictionary::from_vec_str( reference_system, + subjects, + predicates, + objects, )) } fn serialize(&mut self, arr: Array, graph: Graph) -> StorageResult<()> { let columns = arr.shape()[1] as usize; let count = AtomicU64::new(0); - let binding = self.graph_iter(graph); + let binding = self.graph_iter(graph.to_owned()); let iter = binding.chunks_exact(rows_per_shard(&arr) as usize); let remainder = iter.remainder(); diff --git a/src/storage/matrix.rs b/src/storage/matrix.rs index 1d4679f..39d0749 100644 --- a/src/storage/matrix.rs +++ b/src/storage/matrix.rs @@ -1,6 +1,5 @@ use parking_lot::Mutex; use sprs::TriMat; -use std::sync::atomic::AtomicU8; use std::sync::atomic::Ordering; use zarrs::array::codec::array_to_bytes::sharding::ShardingCodecBuilder; use zarrs::array::codec::ArrayToBytesCodecTraits; @@ -14,16 +13,17 @@ use zarrs::storage::ReadableStorageTraits; use super::layout::Layout; use super::layout::LayoutOps; +use super::AtomicZarrType; use super::ChunkingStrategy; use super::Dimensionality; use super::ReferenceSystem; use super::StorageResult; use super::ZarrArray; +use super::ZarrType; use crate::io::Graph; use crate::utils::rows_per_shard; -type ZarrType = u8; type Chunk = Vec<(u32, u32)>; pub struct MatrixLayout; @@ -40,7 +40,7 @@ where } fn data_type(&self) -> DataType { - DataType::UInt8 + DataType::UInt64 } fn chunk_shape( @@ -56,7 +56,7 @@ where } fn fill_value(&self) -> FillValue { - FillValue::from(0u8) + FillValue::from(0 as ZarrType) } fn dimension_names(&self, reference_system: &ReferenceSystem) -> Option> { @@ -112,9 +112,9 @@ where // having the size of the shard; that is, number of rows, and a given // number of columns. 
This value is converted into an AtomicU8 for us to // be able to share it among threads - let slice: Vec = vec![0u8; chunk.len() * columns] + let slice: Vec = vec![0 as ZarrType; chunk.len() * columns] .iter() - .map(|&n| AtomicU8::new(n)) + .map(|&n| AtomicZarrType::new(n)) .collect(); for (first_term, triples) in chunk.iter().enumerate() { diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 6def7d2..31357f9 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -3,6 +3,7 @@ use serde_json::Map; use sprs::CsMat; use std::path::PathBuf; use std::str::FromStr; +use std::sync::atomic::AtomicU64; use std::sync::Arc; use zarrs::array::Array; use zarrs::array::ArrayBuilder; @@ -29,7 +30,9 @@ pub mod ops; pub mod params; pub mod tabular; -pub type ZarrArray = CsMat; +pub type ZarrArray = CsMat; +pub type ZarrType = u64; +type AtomicZarrType = AtomicU64; pub type StorageResult = Result; pub type LocalStorage = Storage; pub type HTTPStorage = Storage; @@ -70,11 +73,10 @@ impl Storage { fn process_zarr(&mut self, storage: R) -> StorageResult<&Self> { let store = Arc::new(storage); let arr = Array::new(store, ARRAY_NAME)?; - let (dictionary, ref_system) = self.layout.retrieve_attributes(&arr)?; + let dictionary = self.layout.retrieve_attributes(&arr)?; self.dictionary = dictionary; - self.reference_system = ref_system; - self.dimensionality = - Dimensionality::new(&self.reference_system, &self.dictionary, &Graph::default()); + self.reference_system = self.dictionary.get_reference_system(); + self.dimensionality = Dimensionality::new(&self.dictionary, &Graph::default()); match self.serialization { Serialization::Zarr => self.array = Some(arr), @@ -126,8 +128,7 @@ impl LocalStorage { let graph = match RdfParser::parse(rdf_path, &reference_system) { Ok((graph, dictionary)) => { self.dictionary = dictionary; - self.dimensionality = - Dimensionality::new(&reference_system, &self.dictionary, &graph); + self.dimensionality = Dimensionality::new(&self.dictionary, &graph); graph } Err(_) => todo!(), @@ -155,7 +156,7 @@ impl LocalStorage { attributes.insert("objects".into(), rdf_to_value(objects)); attributes.insert("reference_system".into(), reference_system.as_ref().into()); attributes - }) // TODO: one attribute should be the Layout + }) .build(store, ARRAY_NAME)?; arr.store_metadata()?; diff --git a/src/storage/ops.rs b/src/storage/ops.rs index 6aa214f..24dde0c 100644 --- a/src/storage/ops.rs +++ b/src/storage/ops.rs @@ -8,12 +8,13 @@ use super::params::ReferenceSystem; use super::params::Serialization; use super::Storage; use super::ZarrArray; +use super::ZarrType; pub type OpsResult = Result; pub enum OpsFormat { SparseArray(ZarrArray), - Zarr(Vec), + Zarr(Vec), } pub trait Ops { diff --git a/src/storage/params.rs b/src/storage/params.rs index 4fb733c..993a578 100644 --- a/src/storage/params.rs +++ b/src/storage/params.rs @@ -17,6 +17,7 @@ pub enum ThreadingStrategy { Multi, } +#[derive(Clone)] pub enum ReferenceSystem { SPO, SOP, @@ -72,27 +73,23 @@ impl From<&str> for ReferenceSystem { } impl Dimensionality { - pub(crate) fn new( - reference_system: &ReferenceSystem, - dictionary: &Dictionary, - graph: &Graph, - ) -> Self { + pub(crate) fn new(dictionary: &Dictionary, graph: &Graph) -> Self { Dimensionality { graph_size: graph .iter() .map(|triples| triples.len()) .reduce(|acc, a| acc + a), - first_term_size: match reference_system { + first_term_size: match dictionary.get_reference_system() { ReferenceSystem::SPO | ReferenceSystem::SOP => dictionary.subjects_size(), ReferenceSystem::POS | 
ReferenceSystem::PSO => dictionary.predicates_size(), ReferenceSystem::OPS | ReferenceSystem::OSP => dictionary.objects_size(), }, - second_term_size: match reference_system { + second_term_size: match dictionary.get_reference_system() { ReferenceSystem::PSO | ReferenceSystem::OSP => dictionary.subjects_size(), ReferenceSystem::SPO | ReferenceSystem::OPS => dictionary.predicates_size(), ReferenceSystem::SOP | ReferenceSystem::POS => dictionary.objects_size(), }, - third_term_size: match reference_system { + third_term_size: match dictionary.get_reference_system() { ReferenceSystem::POS | ReferenceSystem::OPS => dictionary.subjects_size(), ReferenceSystem::SOP | ReferenceSystem::OSP => dictionary.predicates_size(), ReferenceSystem::SPO | ReferenceSystem::PSO => dictionary.objects_size(), diff --git a/src/storage/tabular.rs b/src/storage/tabular.rs index 0c997e1..0e7e121 100644 --- a/src/storage/tabular.rs +++ b/src/storage/tabular.rs @@ -107,9 +107,10 @@ where // as we load elements by row if let Ok(chunk_elements) = arr.retrieve_chunk_elements::(&[i as ZarrType, 0]) { chunk_elements.chunks(3).for_each(|triple| { + println!("{} {} {}", triple[0], triple[2], triple[1] as ZarrType); matrix .lock() - .add_triplet(triple[0], triple[2], triple[1] as u8); + .add_triplet(triple[0], triple[2], triple[1] as ZarrType); }) } }); @@ -117,6 +118,7 @@ where // We use a CSC Matrix because typically, RDF knowledge graphs tend to // have more rows than columns let x = matrix.lock(); + Ok(x.to_csc()) } diff --git a/tests/common/mod.rs b/tests/common/mod.rs index ba42659..43b0645 100644 --- a/tests/common/mod.rs +++ b/tests/common/mod.rs @@ -4,6 +4,7 @@ use remote_hdt::dictionary::Dictionary; use remote_hdt::storage::params::ChunkingStrategy; use remote_hdt::storage::params::ReferenceSystem; use remote_hdt::storage::Storage; +use remote_hdt::storage::ZarrType; use safe_transmute::TriviallyTransmutable; use sprs::CsMat; use sprs::TriMat; @@ -14,6 +15,10 @@ pub const TABULAR_ZARR: &str = "tests/out/tabular.zarr"; pub const MATRIX_ZARR: &str = "tests/out/matrix.zarr"; pub const SHARDING_ZARR: &str = "tests/out/sharding.zarr"; pub const LARGER_ZARR: &str = "tests/out/larger.zarr"; +pub const PSO_ZARR: &str = "tests/out/pso.zarr"; +pub const OPS_ZARR: &str = "tests/out/ops.zarr"; +pub const TABULAR_PSO_ZARR: &str = "tests/out/tabular_pso.zarr"; +pub const TABULAR_OPS_ZARR: &str = "tests/out/tabular_ops.zarr"; pub fn setup( path: &str, @@ -71,8 +76,8 @@ pub enum Predicate { } impl Predicate { - fn get_idx(self, dictionary: &Dictionary) -> u8 { - dictionary.get_predicate_idx_unchecked(self.into()) as u8 + fn get_idx(self, dictionary: &Dictionary) -> ZarrType { + dictionary.get_predicate_idx_unchecked(self.into()) as ZarrType } } @@ -128,7 +133,7 @@ impl From for &str { pub struct Graph; impl Graph { - pub fn new(dictionary: &Dictionary) -> CsMat { + pub fn new(dictionary: &Dictionary) -> CsMat { let mut ans = TriMat::new((4, 9)); ans.add_triplet( diff --git a/tests/orientation.rs b/tests/orientation.rs index 8b13789..b2d13ea 100644 --- a/tests/orientation.rs +++ b/tests/orientation.rs @@ -1 +1,150 @@ +use remote_hdt::storage::matrix::MatrixLayout; +use remote_hdt::storage::ops::Ops; +use remote_hdt::storage::ops::OpsFormat; +use remote_hdt::storage::params::ChunkingStrategy; +use remote_hdt::storage::params::ReferenceSystem; +use remote_hdt::storage::params::Serialization; +use remote_hdt::storage::tabular::TabularLayout; +use remote_hdt::storage::LocalStorage; +use std::error::Error; +mod common; + +#[test] +fn 
orientation_pso_matrix_test() -> Result<(), Box> { + let mut storage = LocalStorage::new(MatrixLayout, Serialization::Zarr); + + common::setup( + common::PSO_ZARR, + &mut storage, + ChunkingStrategy::Chunk, + ReferenceSystem::PSO, + ); + + let actual = match storage + .load(common::PSO_ZARR)? + .get_predicate(common::Predicate::InstanceOf.into())? + { + OpsFormat::Zarr(actual) => actual, + _ => unreachable!(), + }; + + if actual == vec![3, 0, 1] { + Ok(()) + } else { + Err(String::from("Expected and actual results are not equals").into()) + } +} + +#[test] +fn orientation_ops_matrix_test() -> Result<(), Box> { + let mut storage = LocalStorage::new(MatrixLayout, Serialization::Zarr); + + common::setup( + common::OPS_ZARR, + &mut storage, + ChunkingStrategy::Chunk, + ReferenceSystem::OPS, + ); + + let actual = match storage + .load(common::OPS_ZARR)? + .get_object(common::Object::Alan.into())? + { + OpsFormat::Zarr(actual) => actual, + _ => unreachable!(), + }; + + if actual == vec![0, 3, 0, 0] { + Ok(()) + } else { + println!("{:?}", actual); + Err(String::from("Expected and actual results are not equals").into()) + } +} + +#[test] +fn orientation_pso_tabular_test() -> Result<(), Box> { + let mut storage = LocalStorage::new(TabularLayout, Serialization::Sparse); + + common::setup( + common::TABULAR_PSO_ZARR, + &mut storage, + ChunkingStrategy::Chunk, + ReferenceSystem::PSO, + ); + + let actual = match storage + .load(common::TABULAR_PSO_ZARR)? + .get_predicate(common::Predicate::InstanceOf.into())? + { + OpsFormat::SparseArray(actual) => actual, + _ => unreachable!(), + }; + + println!("{}", storage.get_sparse_array().unwrap().to_dense()); + + storage + .get_dictionary() + .subjects() + .iter() + .for_each(|(i, e)| println!("{} {}", i, std::str::from_utf8(&e).unwrap().to_string())); + + println!(); + + storage + .get_dictionary() + .predicates() + .iter() + .for_each(|(i, e)| println!("{} {}", i, std::str::from_utf8(&e).unwrap().to_string())); + + println!(); + + storage + .get_dictionary() + .objects() + .iter() + .for_each(|(i, e)| println!("{} {}", i, std::str::from_utf8(&e).unwrap().to_string())); + + println!( + "{:?}", + storage + .get_dictionary() + .get_subject_idx(common::Subject::Warrington.into()) + ); + + Ok(()) + + // if actual == vec![3, 1, 1] { + // Ok(()) + // } else { + // println!("{:?}", actual); + // Err(String::from("Expected and actual results are not equals").into()) + // } +} + +#[test] +fn orientation_ops_tabular_test() -> Result<(), Box> { + let mut storage = LocalStorage::new(TabularLayout, Serialization::Zarr); + + common::setup( + common::TABULAR_OPS_ZARR, + &mut storage, + ChunkingStrategy::Chunk, + ReferenceSystem::OPS, + ); + + let actual = match storage + .load(common::TABULAR_OPS_ZARR)? + .get_subject(common::Subject::Alan.into())? 
+ { + OpsFormat::Zarr(actual) => actual, + _ => unreachable!(), + }; + + if actual == vec![1, 3, 4, 0, 0, 0, 0, 6, 7] { + Ok(()) + } else { + Err(String::from("Expected and actual results are not equals").into()) + } +} From 337e2f9b63540d52eb5ee07ae6af7c451e96fde6 Mon Sep 17 00:00:00 2001 From: angelip2303 Date: Thu, 8 Feb 2024 12:31:50 +0000 Subject: [PATCH 03/10] WIP --- Cargo.toml | 3 +- examples/http_bench.rs | 11 +-- examples/load_bench.rs | 8 +-- examples/ntriples/main.rs | 8 +-- examples/query_bench.rs | 8 +-- examples/rdf_xml/main.rs | 8 +-- examples/serialize_bench.rs | 8 +-- examples/turtle/main.rs | 10 +-- src/dictionary.rs | 3 +- src/engine/chunk.rs | 26 ++++--- src/error.rs | 13 ++++ src/main.rs | 7 +- src/storage/layout.rs | 99 +++++++++++++++++++------- src/storage/matrix.rs | 95 +++++++++---------------- src/storage/mod.rs | 138 +++++++++++++++++++++--------------- src/storage/ops.rs | 8 +-- src/storage/params.rs | 15 ++-- src/storage/tabular.rs | 70 ++++++------------ src/utils.rs | 7 +- tests/common/mod.rs | 18 +++-- tests/get_object_test.rs | 11 +-- tests/get_subject_test.rs | 15 ++-- tests/orientation.rs | 19 ++--- tests/write_read_test.rs | 27 ++++--- 24 files changed, 342 insertions(+), 293 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 6588ca9..dabe10a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,7 +5,7 @@ version = "0.0.1" edition = "2021" [dependencies] -zarrs = { version = "0.7.3", default-features = false, features = [ "http", "gzip", "sharding" ] } +zarrs = { version = "0.11.6", default-features = false, features = [ "http", "gzip", "sharding", "opendal", "async", "ndarray" ] } clap = { version = "4.1.8", features = ["derive"] } serde_json = "1.0.108" thiserror = "1.0.50" @@ -14,7 +14,6 @@ sprs = "0.11.1" rio_turtle = "0.8.4" rio_xml = "0.8.4" rio_api = "0.8.4" -safe-transmute = "0.11.2" rayon = "1.8.0" parking_lot = "0.12" diff --git a/examples/http_bench.rs b/examples/http_bench.rs index 1d5906d..0a04f26 100644 --- a/examples/http_bench.rs +++ b/examples/http_bench.rs @@ -1,14 +1,15 @@ use remote_hdt::error::RemoteHDTError; use remote_hdt::storage::matrix::MatrixLayout; use remote_hdt::storage::ops::Ops; -use remote_hdt::storage::params::Serialization; -use remote_hdt::storage::HTTPStorage; +use remote_hdt::storage::params::{Backend, Serialization}; +use remote_hdt::storage::Storage; use std::time::Instant; fn main() -> Result<(), RemoteHDTError> { - let mut remote_hdt = HTTPStorage::new(MatrixLayout, Serialization::Zarr); - let arr = remote_hdt - .connect("https://raw.githubusercontent.com/weso/RemoteHDT/master/resources/root.zarr")?; + let mut binding = Storage::new(MatrixLayout, Serialization::Zarr); + let arr = binding.load(Backend::HTTP( + "https://raw.githubusercontent.com/weso/RemoteHDT/master/resources/root.zarr", + ))?; let before = Instant::now(); arr.get_subject("")?; diff --git a/examples/load_bench.rs b/examples/load_bench.rs index 9bd391b..d990658 100644 --- a/examples/load_bench.rs +++ b/examples/load_bench.rs @@ -1,7 +1,7 @@ use remote_hdt::error::RemoteHDTError; -use remote_hdt::storage::params::Serialization; +use remote_hdt::storage::params::{Backend, Serialization}; use remote_hdt::storage::tabular::TabularLayout; -use remote_hdt::storage::LocalStorage; +use remote_hdt::storage::Storage; use std::env; use std::time::Instant; @@ -16,8 +16,8 @@ fn main() -> Result<(), RemoteHDTError> { let before = Instant::now(); - LocalStorage::new(TabularLayout, Serialization::Zarr) - .load(format!("{}.zarr", zarr_path).as_str())?; + 
Storage::new(TabularLayout, Serialization::Zarr) + .load(Backend::FileSystem(format!("{}.zarr", zarr_path).as_str()))?; println!("Elapsed time: {:.2?}", before.elapsed()); diff --git a/examples/ntriples/main.rs b/examples/ntriples/main.rs index 1f4fbd3..04b51fc 100644 --- a/examples/ntriples/main.rs +++ b/examples/ntriples/main.rs @@ -1,11 +1,11 @@ use remote_hdt::error::RemoteHDTError; -use remote_hdt::storage::params::{ChunkingStrategy, ReferenceSystem, Serialization}; +use remote_hdt::storage::params::{Backend, ChunkingStrategy, ReferenceSystem, Serialization}; use remote_hdt::storage::tabular::TabularLayout; -use remote_hdt::storage::LocalStorage; +use remote_hdt::storage::Storage; pub fn main() -> Result<(), RemoteHDTError> { - LocalStorage::new(TabularLayout, Serialization::Zarr).serialize( - "root.zarr", + Storage::new(TabularLayout, Serialization::Zarr).serialize( + Backend::FileSystem("root.zarr"), "examples/ntriples/rdf.nt", ChunkingStrategy::Chunk, ReferenceSystem::SPO, diff --git a/examples/query_bench.rs b/examples/query_bench.rs index c2ab5f8..23dc992 100644 --- a/examples/query_bench.rs +++ b/examples/query_bench.rs @@ -1,8 +1,8 @@ use remote_hdt::error::RemoteHDTError; use remote_hdt::storage::matrix::MatrixLayout; use remote_hdt::storage::ops::Ops; -use remote_hdt::storage::params::Serialization; -use remote_hdt::storage::LocalStorage; +use remote_hdt::storage::params::{Backend, Serialization}; +use remote_hdt::storage::Storage; use std::env; use std::time::Instant; @@ -17,8 +17,8 @@ fn main() -> Result<(), RemoteHDTError> { let number_of_universities: &String = &args[1]; let zarr_path = format!("{}-lubm", number_of_universities); - let mut remote_hdt = LocalStorage::new(MatrixLayout, Serialization::Zarr); - let arr = remote_hdt.load(format!("{}.zarr", zarr_path).as_str())?; + let mut binding = Storage::new(MatrixLayout, Serialization::Zarr); + let arr = binding.load(Backend::FileSystem(format!("{}.zarr", zarr_path).as_str()))?; let before = Instant::now(); arr.get_subject(SUBJECT)?; diff --git a/examples/rdf_xml/main.rs b/examples/rdf_xml/main.rs index f69935a..872c428 100644 --- a/examples/rdf_xml/main.rs +++ b/examples/rdf_xml/main.rs @@ -1,11 +1,11 @@ use remote_hdt::error::RemoteHDTError; -use remote_hdt::storage::params::{ChunkingStrategy, ReferenceSystem, Serialization}; +use remote_hdt::storage::params::{Backend, ChunkingStrategy, ReferenceSystem, Serialization}; use remote_hdt::storage::tabular::TabularLayout; -use remote_hdt::storage::LocalStorage; +use remote_hdt::storage::Storage; pub fn main() -> Result<(), RemoteHDTError> { - LocalStorage::new(TabularLayout, Serialization::Zarr).serialize( - "root.zarr", + Storage::new(TabularLayout, Serialization::Zarr).serialize( + Backend::FileSystem("root.zarr"), "examples/rdf_xml/rdf.rdf", ChunkingStrategy::Chunk, ReferenceSystem::SPO, diff --git a/examples/serialize_bench.rs b/examples/serialize_bench.rs index 1ace510..aba915a 100644 --- a/examples/serialize_bench.rs +++ b/examples/serialize_bench.rs @@ -1,7 +1,7 @@ use remote_hdt::error::RemoteHDTError; use remote_hdt::storage::matrix::MatrixLayout; -use remote_hdt::storage::params::{ChunkingStrategy, ReferenceSystem, Serialization}; -use remote_hdt::storage::LocalStorage; +use remote_hdt::storage::params::{Backend, ChunkingStrategy, ReferenceSystem, Serialization}; +use remote_hdt::storage::Storage; use std::env; use std::time::Instant; @@ -21,8 +21,8 @@ fn main() -> Result<(), RemoteHDTError> { let before = Instant::now(); - LocalStorage::new(MatrixLayout, 
Serialization::Zarr).serialize( - zarr_path, + Storage::new(MatrixLayout, Serialization::Zarr).serialize( + Backend::FileSystem(zarr_path), rdf_path, ChunkingStrategy::Sharding(*shard_size), ReferenceSystem::SPO, diff --git a/examples/turtle/main.rs b/examples/turtle/main.rs index a03af07..89acf2e 100644 --- a/examples/turtle/main.rs +++ b/examples/turtle/main.rs @@ -1,12 +1,12 @@ use remote_hdt::error::RemoteHDTError; -use remote_hdt::storage::params::{ChunkingStrategy, ReferenceSystem, Serialization}; +use remote_hdt::storage::params::{Backend, ChunkingStrategy, ReferenceSystem, Serialization}; use remote_hdt::storage::tabular::TabularLayout; -use remote_hdt::storage::LocalStorage; +use remote_hdt::storage::Storage; pub fn main() -> Result<(), RemoteHDTError> { - LocalStorage::new(TabularLayout, Serialization::Zarr).serialize( - "root.zarr", - "examples/turtle/rdf.ttk", + Storage::new(TabularLayout, Serialization::Zarr).serialize( + Backend::FileSystem("root.zarr"), + "examples/turtle/rdf.ttl", ChunkingStrategy::Chunk, ReferenceSystem::SPO, )?; diff --git a/src/dictionary.rs b/src/dictionary.rs index c358b4a..d58d06c 100644 --- a/src/dictionary.rs +++ b/src/dictionary.rs @@ -1,6 +1,5 @@ -use std::collections::HashSet; - use fcsd::Set; +use std::collections::HashSet; use crate::storage::params::ReferenceSystem; diff --git a/src/engine/chunk.rs b/src/engine/chunk.rs index cd03c92..4de33fb 100644 --- a/src/engine/chunk.rs +++ b/src/engine/chunk.rs @@ -2,33 +2,31 @@ use zarrs::array::Array; use zarrs::array_subset::ArraySubset; use zarrs::storage::ReadableStorageTraits; -use crate::storage::ZarrType; use crate::utils::columns_per_shard; use crate::utils::rows_per_shard; use super::EngineResult; use super::EngineStrategy; -impl EngineStrategy> for Array { - fn get_first_term(&self, index: usize) -> EngineResult> { +impl EngineStrategy> for Array { + fn get_first_term(&self, index: usize) -> EngineResult> { let index_to_chunk = index as u64 / rows_per_shard(self); let chunk_to_index = index as u64 % rows_per_shard(self); - Ok(self - .retrieve_chunk_subset_elements( - &[index_to_chunk, 0], - &ArraySubset::new_with_start_end_inc( - vec![chunk_to_index, 0], - vec![chunk_to_index, columns_per_shard(self) - 1], - )?, - )? 
- .to_vec()) + let ans = self.retrieve_chunk_subset_elements( + &[index_to_chunk, 0], + &ArraySubset::new_with_start_end_inc( + vec![chunk_to_index, 0], + vec![chunk_to_index, columns_per_shard(self) - 1], + )?, + )?; + Ok(ans.to_vec()) } - fn get_second_term(&self, _index: usize) -> EngineResult> { + fn get_second_term(&self, _index: usize) -> EngineResult> { unimplemented!() } - fn get_third_term(&self, index: usize) -> EngineResult> { + fn get_third_term(&self, index: usize) -> EngineResult> { let start = vec![0, index as u64]; let end = vec![self.shape()[0], index as u64]; let shape = &ArraySubset::new_with_start_end_inc(start, end)?; diff --git a/src/error.rs b/src/error.rs index 3623781..7b41ac1 100644 --- a/src/error.rs +++ b/src/error.rs @@ -4,6 +4,7 @@ use zarrs::array::codec::bytes_to_bytes::gzip::GzipCompressionLevelError; use zarrs::array::ArrayCreateError; use zarrs::array::ArrayError; use zarrs::array_subset::IncompatibleDimensionalityError; +use zarrs::array_subset::IncompatibleStartEndIndicesError; use zarrs::group::GroupCreateError; use zarrs::storage::store::FilesystemStoreCreateError; use zarrs::storage::store::HTTPStoreCreateError; @@ -29,6 +30,8 @@ pub enum RemoteHDTError { HTTPCreate(#[from] HTTPStoreCreateError), #[error("The Path already exists, please provide an empty path")] PathExists, + #[error("The Path does not exist, please provide another path")] + PathDoesNotExist, #[error(transparent)] GZipCompression(#[from] GzipCompressionLevelError), #[error("The Graph you are trying to serialize is empty")] @@ -45,6 +48,14 @@ pub enum RemoteHDTError { ReferenceSystemNotInJSON, #[error("Error serializing the triples of the Graph")] TripleSerialization, + #[error("The provided path is not valid")] + OsPathToString, + #[error(transparent)] + Opendal(#[from] zarrs::opendal::Error), + #[error("The provided backend is read-only")] + ReadOnlyBackend, + #[error("Error while parsing the RDF graph")] + RdfParse, } #[derive(Error, Debug)] @@ -55,6 +66,8 @@ pub enum EngineError { Array(#[from] ArrayError), #[error("Operation error")] Operation, + #[error(transparent)] + IncompatibleStartEndIndicesError(#[from] IncompatibleStartEndIndicesError), } #[derive(Error, Debug)] diff --git a/src/main.rs b/src/main.rs index 27ee27c..db495e5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,9 +1,10 @@ use clap::Parser; +use remote_hdt::storage::params::Backend; use remote_hdt::storage::params::ChunkingStrategy; use remote_hdt::storage::params::ReferenceSystem; use remote_hdt::storage::params::Serialization; use remote_hdt::storage::tabular::TabularLayout; -use remote_hdt::storage::LocalStorage; +use remote_hdt::storage::Storage; use remote_hdt::storage::StorageResult; #[derive(Parser, Debug)] @@ -20,8 +21,8 @@ struct Args { fn main() -> StorageResult<()> { let args: Args = Args::parse(); - LocalStorage::new(TabularLayout, Serialization::Sparse).serialize( - &args.zarr, + Storage::new(TabularLayout, Serialization::Sparse).serialize( + Backend::FileSystem(&args.zarr), &args.rdf, ChunkingStrategy::Chunk, ReferenceSystem::SPO, diff --git a/src/storage/layout.rs b/src/storage/layout.rs index 0cc6368..d0385ac 100644 --- a/src/storage/layout.rs +++ b/src/storage/layout.rs @@ -1,7 +1,7 @@ +use parking_lot::Mutex; +use sprs::TriMat; use std::sync::atomic::AtomicU64; use std::sync::atomic::Ordering; - -use safe_transmute::TriviallyTransmutable; use zarrs::array::codec::ArrayToBytesCodecTraits; use zarrs::array::Array; use zarrs::array::ChunkGrid; @@ -9,7 +9,7 @@ use zarrs::array::DataType; use 
zarrs::array::DimensionName; use zarrs::array::FillValue; use zarrs::array_subset::ArraySubset; -use zarrs::storage::store::FilesystemStore; +use zarrs::storage::store::OpendalStore; use crate::dictionary::Dictionary; use crate::error::RemoteHDTError; @@ -26,8 +26,8 @@ use super::ZarrArray; type ArrayToBytesCodec = Box; -pub trait LayoutOps { - fn retrieve_attributes(&mut self, arr: &Array) -> StorageResult { +pub trait LayoutOps { + fn retrieve_attributes(&mut self, arr: &Array) -> StorageResult { // 4. We get the attributes so we can obtain some values that we will need let attributes = arr.attributes(); @@ -60,26 +60,20 @@ pub trait LayoutOps { )) } - fn serialize(&mut self, arr: Array, graph: Graph) -> StorageResult<()> { + fn serialize(&mut self, arr: Array, graph: Graph) -> StorageResult<()> { let columns = arr.shape()[1] as usize; let count = AtomicU64::new(0); let binding = self.graph_iter(graph.to_owned()); let iter = binding.chunks_exact(rows_per_shard(&arr) as usize); let remainder = iter.remainder(); - for chunk in iter { - if arr - .store_chunk_elements( - &[count.load(Ordering::Relaxed), 0], - self.chunk_elements(chunk, columns), - ) - .is_err() - { - return Err(RemoteHDTError::TripleSerialization); - } - + let _ = iter.map(|chunk| { count.fetch_add(1, Ordering::Relaxed); - } + arr.store_chunk_elements( + &[count.load(Ordering::Relaxed), 0], + self.store_chunk_elements(chunk, columns), + ) + }); if !remainder.is_empty() { arr.store_array_subset_elements( @@ -87,24 +81,81 @@ pub trait LayoutOps { vec![count.load(Ordering::Relaxed) * rows_per_shard(&arr), 0], vec![remainder.len() as u64, columns_per_shard(&arr)], )?, - self.chunk_elements(remainder, columns), + self.store_chunk_elements(remainder, columns), )?; } Ok(()) } - fn graph_iter(&self, graph: Graph) -> Vec; - fn chunk_elements(&self, chunk: &[C], columns: usize) -> Vec; fn parse( &mut self, - arr: &Array, + arr: &Array, dimensionality: &Dimensionality, - ) -> StorageResult; + ) -> StorageResult { + // First, we create the 2D matrix in such a manner that the number of + // rows is the same as the size of the first terms; i.e, in the SPO + // orientation, that will be equals to the number of subjects, while + // the number of columns is equals to the size of the third terms; i.e, + // following the same example as before, it will be equals to the number + // of objects. In our case the dimensionality abstracts the process + // of getting the size of the concrete dimension + let matrix = Mutex::new(TriMat::new(( + dimensionality.first_term_size, // we obtain the size of the first terms + dimensionality.third_term_size, // we obtain the size of the third terms + ))); + + // We compute the number of chunks; for us to achieve so, we have to obtain + // first dimension of the chunk grid + let number_of_chunks = match arr.chunk_grid_shape() { + Some(chunk_grid) => chunk_grid[0], + None => 0, + }; + + let number_of_columns = arr.shape()[1] as usize; + + // For each chunk in the Zarr array we retrieve it and parse it into a + // matrix, inserting the triplet in its corresponding position. The idea + // of parsing the array chunk-by-chunk allows us to keep the RAM usage + // low, as instead of parsing the whole array, we process smaller pieces + // of it. Once we have all the pieces processed, we will have parsed the + // whole array + for i in 0..number_of_chunks { + arr.retrieve_chunk_elements(&[i, 0])? 
+ .chunks(number_of_columns) + .enumerate() + .for_each(|(first_term_idx, chunk)| { + self.retrieve_chunk_elements( + &matrix, + i, + number_of_columns as u64, + first_term_idx, + chunk, + ); + }) + } + + // We use a CSC Matrix because typically, RDF knowledge graphs tend to + // have more rows than columns; as such, CSC matrices are optimized + // for that precise scenario + let x = matrix.lock(); + Ok(x.to_csc()) + } + + fn graph_iter(&self, graph: Graph) -> Vec; + fn store_chunk_elements(&self, chunk: &[C], columns: usize) -> Vec; + fn retrieve_chunk_elements( + &mut self, + matrix: &Mutex>, + i: u64, + number_of_columns: u64, + first_term_idx: usize, + chunk: &[usize], + ); fn sharding_factor(&self, dimensionality: &Dimensionality) -> usize; } -pub trait Layout: LayoutOps { +pub trait Layout: LayoutOps { fn shape(&self, dimensionality: &Dimensionality) -> Vec; fn data_type(&self) -> DataType; fn chunk_shape( diff --git a/src/storage/matrix.rs b/src/storage/matrix.rs index 39d0749..c2a5d7a 100644 --- a/src/storage/matrix.rs +++ b/src/storage/matrix.rs @@ -1,15 +1,14 @@ use parking_lot::Mutex; use sprs::TriMat; +use std::num::NonZeroU64; use std::sync::atomic::Ordering; use zarrs::array::codec::array_to_bytes::sharding::ShardingCodecBuilder; use zarrs::array::codec::ArrayToBytesCodecTraits; use zarrs::array::codec::GzipCodec; -use zarrs::array::Array; use zarrs::array::ChunkGrid; use zarrs::array::DataType; use zarrs::array::DimensionName; use zarrs::array::FillValue; -use zarrs::storage::ReadableStorageTraits; use super::layout::Layout; use super::layout::LayoutOps; @@ -18,20 +17,14 @@ use super::ChunkingStrategy; use super::Dimensionality; use super::ReferenceSystem; use super::StorageResult; -use super::ZarrArray; -use super::ZarrType; use crate::io::Graph; -use crate::utils::rows_per_shard; type Chunk = Vec<(u32, u32)>; pub struct MatrixLayout; -impl Layout for MatrixLayout -where - R: ReadableStorageTraits + Sized, -{ +impl Layout for MatrixLayout { fn shape(&self, dimensionality: &Dimensionality) -> Vec { vec![ dimensionality.get_first_term_size(), @@ -50,13 +43,13 @@ where ) -> ChunkGrid { vec![ chunking_strategy.into(), - dimensionality.get_third_term_size(), + NonZeroU64::new(dimensionality.get_third_term_size()).unwrap(), ] .into() } fn fill_value(&self) -> FillValue { - FillValue::from(0 as ZarrType) + FillValue::from(0u64) } fn dimension_names(&self, reference_system: &ReferenceSystem) -> Option> { @@ -92,27 +85,29 @@ where &self, dimensionality: &Dimensionality, ) -> StorageResult> { - let mut sharding_codec_builder = - ShardingCodecBuilder::new(vec![1, dimensionality.get_third_term_size()]); + let mut sharding_codec_builder = ShardingCodecBuilder::new( + vec![ + NonZeroU64::new(1).unwrap(), + NonZeroU64::new(dimensionality.get_third_term_size()).unwrap(), + ] + .into(), + ); sharding_codec_builder.bytes_to_bytes_codecs(vec![Box::new(GzipCodec::new(5)?)]); Ok(Box::new(sharding_codec_builder.build())) } } -impl LayoutOps for MatrixLayout -where - R: ReadableStorageTraits + Sized, -{ +impl LayoutOps for MatrixLayout { fn graph_iter(&self, graph: Graph) -> Vec { graph } - fn chunk_elements(&self, chunk: &[Chunk], columns: usize) -> Vec { + fn store_chunk_elements(&self, chunk: &[Chunk], columns: usize) -> Vec { // We create a slice that has the size of the chunk filled with 0 values // having the size of the shard; that is, number of rows, and a given // number of columns. 
This value is converted into an AtomicU8 for us to // be able to share it among threads - let slice: Vec = vec![0 as ZarrType; chunk.len() * columns] + let slice: Vec = vec![0u64; chunk.len() * columns] .iter() .map(|&n| AtomicZarrType::new(n)) .collect(); @@ -120,56 +115,36 @@ where for (first_term, triples) in chunk.iter().enumerate() { triples.iter().for_each(|&(second_term, third_term)| { let third_term_idx = third_term as usize + first_term * columns; - slice[third_term_idx].store(second_term as ZarrType, Ordering::Relaxed); + slice[third_term_idx].store(second_term as u64, Ordering::Relaxed); }); } slice .iter() .map(|elem| elem.load(Ordering::Relaxed)) - .collect::>() + .collect::>() } - fn parse( + fn retrieve_chunk_elements( &mut self, - arr: &Array, - dimensionality: &Dimensionality, - ) -> StorageResult { - let matrix = Mutex::new(TriMat::new(( - dimensionality.first_term_size, - dimensionality.third_term_size, - ))); - let number_of_chunks = match arr.chunk_grid_shape() { - Some(chunk_grid) => chunk_grid[0], - None => 0, - }; - (0..number_of_chunks).for_each(|i| { - // Using this chunking strategy allows us to keep RAM usage low, - // as we load elements by row - arr.retrieve_chunk_elements::(&[i, 0]) - .unwrap() - .chunks(dimensionality.third_term_size) - .enumerate() - .for_each(|(first_term_idx, chunk)| { - chunk - .iter() - .enumerate() - .for_each(|(third_term_idx, &second_term_idx)| { - if second_term_idx != 0 { - matrix.lock().add_triplet( - first_term_idx + (i * rows_per_shard(arr)) as usize, - third_term_idx, - second_term_idx, - ); - } - }) - }) - }); - - // We use a CSC Matrix because typically, RDF knowledge graphs tend to - // have more rows than columns - let x = matrix.lock(); - Ok(x.to_csc()) + matrix: &Mutex>, + i: u64, + number_of_columns: u64, + first_term_idx: usize, + chunk: &[usize], + ) { + chunk + .iter() + .enumerate() + .for_each(|(third_term_idx, &second_term_idx)| { + if second_term_idx != 0 { + matrix.lock().add_triplet( + first_term_idx + (i * number_of_columns) as usize, + third_term_idx, + second_term_idx, + ); + } + }) } fn sharding_factor(&self, dimensionality: &Dimensionality) -> usize { diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 31357f9..e3c03aa 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -1,4 +1,3 @@ -use safe_transmute::TriviallyTransmutable; use serde_json::Map; use sprs::CsMat; use std::path::PathBuf; @@ -8,9 +7,10 @@ use std::sync::Arc; use zarrs::array::Array; use zarrs::array::ArrayBuilder; use zarrs::group::GroupBuilder; -use zarrs::storage::store::FilesystemStore; -use zarrs::storage::store::HTTPStore; -use zarrs::storage::ReadableStorageTraits; +use zarrs::opendal::services::Fs; +use zarrs::opendal::services::Http; +use zarrs::opendal::Operator; +use zarrs::storage::store::OpendalStore; use crate::dictionary::Dictionary; use crate::error::RemoteHDTError; @@ -19,6 +19,7 @@ use crate::io::RdfParser; use crate::utils::rdf_to_value; use self::layout::Layout; +use self::params::Backend; use self::params::ChunkingStrategy; use self::params::Dimensionality; use self::params::ReferenceSystem; @@ -30,27 +31,24 @@ pub mod ops; pub mod params; pub mod tabular; -pub type ZarrArray = CsMat; -pub type ZarrType = u64; +pub type ZarrArray = CsMat; type AtomicZarrType = AtomicU64; pub type StorageResult = Result; -pub type LocalStorage = Storage; -pub type HTTPStorage = Storage; const ARRAY_NAME: &str = "/group/RemoteHDT"; // TODO: parameterize this -pub struct Storage { +pub struct Storage { dictionary: Dictionary, 
dimensionality: Dimensionality, - layout: Box>, + layout: Box>, serialization: Serialization, reference_system: ReferenceSystem, - array: Option>, + array: Option>, sparse_array: Option, } -impl Storage { - pub fn new(layout: impl Layout + 'static, serialization: Serialization) -> Self { +impl Storage { + pub fn new(layout: impl Layout + 'static, serialization: Serialization) -> Self { Storage { dictionary: Default::default(), dimensionality: Default::default(), @@ -70,54 +68,46 @@ impl Storage { self.sparse_array.to_owned() } - fn process_zarr(&mut self, storage: R) -> StorageResult<&Self> { - let store = Arc::new(storage); - let arr = Array::new(store, ARRAY_NAME)?; - let dictionary = self.layout.retrieve_attributes(&arr)?; - self.dictionary = dictionary; - self.reference_system = self.dictionary.get_reference_system(); - self.dimensionality = Dimensionality::new(&self.dictionary, &Graph::default()); - - match self.serialization { - Serialization::Zarr => self.array = Some(arr), - Serialization::Sparse => { - self.sparse_array = Some(self.layout.parse(&arr, &self.dimensionality)?) - } - } - - Ok(self) - } -} - -impl LocalStorage { /// # Errors /// Returns [`PathExistsError`] if the provided path already exists; that is, /// the user is trying to store the RDF dataset in an occupied storage. This /// is due to the fact that the user may incur in an undefined state. pub fn serialize<'a>( &mut self, - zarr_path: &'a str, + store: Backend<'a>, rdf_path: &'a str, chunking_strategy: ChunkingStrategy, reference_system: ReferenceSystem, - // threading_strategy: ThreadingStrategy, - ) -> StorageResult<&Self> { - // 1. The first thing that should be done is to check whether the path - // in which we are trying to store the dump already exists or not. If it - // does, we should stop the execution, preventing the user from losing - // data. Otherwise we can resume it and begin the actual proccess... - let path = PathBuf::from_str(zarr_path)?; - if path.exists() { - // the actual check occurs here !!! - return Err(RemoteHDTError::PathExists); - } + // threading_strategy: ThreadingStrategy, TODO: implement this + ) -> StorageResult<&mut Self> { + let operator = match store { + Backend::FileSystem(path) => { + let mut builder = Fs::default(); + let path = PathBuf::from_str(path)?; + + match path.exists() { + true => return Err(RemoteHDTError::PathExists), + false => { + let path = match path.into_os_string().into_string() { + Ok(string) => string, + Err(_) => return Err(RemoteHDTError::OsPathToString), + }; + builder.root(&path); + } + } + + Operator::new(builder)?.finish() + } + Backend::HTTP(_) => return Err(RemoteHDTError::ReadOnlyBackend), + }; // 2. We can create the FileSystemStore appropiately - let store = Arc::new(FilesystemStore::new(path)?); + let store = Arc::new(OpendalStore::new(operator.blocking())); // Create a group and write metadata to filesystem let group = GroupBuilder::new().build(store.clone(), "/group")?; - group.store_metadata()?; + + let _ = group.store_metadata()?; // TODO: rayon::ThreadPoolBuilder::new() // .num_threads(1) @@ -131,7 +121,7 @@ impl LocalStorage { self.dimensionality = Dimensionality::new(&self.dictionary, &graph); graph } - Err(_) => todo!(), + Err(_) => return Err(RemoteHDTError::RdfParse), }; // 4. 
Build the structure of the Array; as such, several parameters of it are @@ -160,19 +150,55 @@ impl LocalStorage { .build(store, ARRAY_NAME)?; arr.store_metadata()?; - self.layout.serialize(arr, graph)?; Ok(self) } - pub fn load(&mut self, zarr_path: &str) -> StorageResult<&Self> { - self.process_zarr(FilesystemStore::new(zarr_path)?) - } -} + pub fn load<'a>( + &mut self, + store: Backend<'a>, + // threading_strategy: ThreadingStrategy, TODO: implement this + ) -> StorageResult<&mut Self> { + let operator = match store { + Backend::FileSystem(path) => { + let mut builder = Fs::default(); + let path = PathBuf::from_str(path)?; + + match path.exists() { + false => return Err(RemoteHDTError::PathDoesNotExist), + true => { + let path = match path.into_os_string().into_string() { + Ok(string) => string, + Err(_) => return Err(RemoteHDTError::OsPathToString), + }; + builder.root(&path); + } + } + + Operator::new(builder)?.finish() + } + Backend::HTTP(path) => { + let mut builder = Http::default(); + builder.endpoint(path); + Operator::new(builder)?.finish() + } + }; -impl HTTPStorage { - pub fn connect(&mut self, url: &str) -> StorageResult<&Self> { - self.process_zarr(HTTPStore::new(url)?) + let store: Arc = Arc::new(OpendalStore::new(operator.blocking())); + let arr = Array::new(store, ARRAY_NAME)?; + let dictionary = self.layout.retrieve_attributes(&arr)?; + self.dictionary = dictionary; + self.reference_system = self.dictionary.get_reference_system(); + self.dimensionality = Dimensionality::new(&self.dictionary, &Graph::default()); + + match self.serialization { + Serialization::Zarr => self.array = Some(arr), + Serialization::Sparse => { + self.sparse_array = Some(self.layout.parse(&arr, &self.dimensionality)?) + } + } + + Ok(self) } } diff --git a/src/storage/ops.rs b/src/storage/ops.rs index 24dde0c..d74f31c 100644 --- a/src/storage/ops.rs +++ b/src/storage/ops.rs @@ -1,6 +1,3 @@ -use safe_transmute::TriviallyTransmutable; -use zarrs::storage::ReadableStorageTraits; - use crate::engine::EngineStrategy; use crate::error::OpsError; @@ -8,13 +5,12 @@ use super::params::ReferenceSystem; use super::params::Serialization; use super::Storage; use super::ZarrArray; -use super::ZarrType; pub type OpsResult = Result; pub enum OpsFormat { SparseArray(ZarrArray), - Zarr(Vec), + Zarr(Vec), } pub trait Ops { @@ -23,7 +19,7 @@ pub trait Ops { fn get_object(&self, object: &str) -> OpsResult; } -impl Ops for Storage { +impl Ops for Storage { fn get_subject(&self, subject: &str) -> OpsResult { let index = match self.dictionary.get_subject_idx(subject) { Some(index) => index, diff --git a/src/storage/params.rs b/src/storage/params.rs index 993a578..d453a66 100644 --- a/src/storage/params.rs +++ b/src/storage/params.rs @@ -1,6 +1,13 @@ +use std::num::NonZeroU64; + use crate::dictionary::Dictionary; use crate::io::Graph; +pub enum Backend<'a> { + FileSystem(&'a str), + HTTP(&'a str), +} + pub enum Serialization { Zarr, Sparse, @@ -35,12 +42,12 @@ pub struct Dimensionality { pub(crate) third_term_size: usize, } -impl From for u64 { +impl From for NonZeroU64 { fn from(value: ChunkingStrategy) -> Self { match value { - ChunkingStrategy::Chunk => 1, - ChunkingStrategy::Sharding(size) => size, - ChunkingStrategy::Best => 16, // TODO: set to the number of threads + ChunkingStrategy::Chunk => NonZeroU64::new(1).unwrap(), + ChunkingStrategy::Sharding(size) => NonZeroU64::new(size).unwrap(), + ChunkingStrategy::Best => NonZeroU64::new(16).unwrap(), // TODO: set to the number of threads } } } diff --git 
a/src/storage/tabular.rs b/src/storage/tabular.rs index 0e7e121..3dbfcb2 100644 --- a/src/storage/tabular.rs +++ b/src/storage/tabular.rs @@ -1,14 +1,14 @@ +use std::num::NonZeroU64; + use parking_lot::Mutex; use sprs::TriMat; use zarrs::array::codec::array_to_bytes::sharding::ShardingCodecBuilder; use zarrs::array::codec::ArrayToBytesCodecTraits; use zarrs::array::codec::GzipCodec; -use zarrs::array::Array; use zarrs::array::ChunkGrid; use zarrs::array::DataType; use zarrs::array::DimensionName; use zarrs::array::FillValue; -use zarrs::storage::ReadableStorageTraits; use crate::io::Graph; @@ -18,17 +18,12 @@ use super::params::ChunkingStrategy; use super::params::Dimensionality; use super::params::ReferenceSystem; use super::StorageResult; -use super::ZarrArray; -type ZarrType = u64; type Chunk = (u32, u32, u32); pub struct TabularLayout; -impl Layout for TabularLayout -where - R: ReadableStorageTraits + Sized, -{ +impl Layout for TabularLayout { fn shape(&self, dimensionality: &Dimensionality) -> Vec { vec![dimensionality.get_graph_size(), 3] } @@ -38,7 +33,7 @@ where } fn chunk_shape(&self, chunking_strategy: ChunkingStrategy, _: &Dimensionality) -> ChunkGrid { - vec![chunking_strategy.into(), 3].into() // TODO: make this a constant value + vec![chunking_strategy.into(), NonZeroU64::new(3).unwrap()].into() // TODO: make this a constant value } fn fill_value(&self) -> FillValue { @@ -56,16 +51,15 @@ where &self, _: &Dimensionality, ) -> StorageResult> { - let mut sharding_codec_builder = ShardingCodecBuilder::new(vec![1, 3]); + let mut sharding_codec_builder = ShardingCodecBuilder::new( + vec![NonZeroU64::new(1).unwrap(), NonZeroU64::new(3).unwrap()].into(), + ); sharding_codec_builder.bytes_to_bytes_codecs(vec![Box::new(GzipCodec::new(5)?)]); Ok(Box::new(sharding_codec_builder.build())) } } -impl LayoutOps for TabularLayout -where - R: ReadableStorageTraits + Sized, -{ +impl LayoutOps for TabularLayout { fn graph_iter(&self, graph: Graph) -> Vec { graph .iter() @@ -79,47 +73,27 @@ where .collect::>() } - fn chunk_elements(&self, chunk: &[Chunk], _: usize) -> Vec { + fn store_chunk_elements(&self, chunk: &[Chunk], _: usize) -> Vec { let mut ans = Vec::new(); for &(first_term, second_term, third_term) in chunk { - ans.push(first_term as ZarrType); - ans.push(second_term as ZarrType); - ans.push(third_term as ZarrType); + ans.push(first_term as u64); + ans.push(second_term as u64); + ans.push(third_term as u64); } ans } - fn parse( + fn retrieve_chunk_elements( &mut self, - arr: &Array, - dimensionality: &Dimensionality, - ) -> StorageResult { - let matrix = Mutex::new(TriMat::new(( - dimensionality.first_term_size, - dimensionality.third_term_size, - ))); - let number_of_chunks = match arr.chunk_grid_shape() { - Some(chunk_grid) => chunk_grid[0] as usize, - None => 0, - }; - (0..number_of_chunks).for_each(|i| { - // Using this chunking strategy allows us to keep RAM usage low, - // as we load elements by row - if let Ok(chunk_elements) = arr.retrieve_chunk_elements::(&[i as ZarrType, 0]) { - chunk_elements.chunks(3).for_each(|triple| { - println!("{} {} {}", triple[0], triple[2], triple[1] as ZarrType); - matrix - .lock() - .add_triplet(triple[0], triple[2], triple[1] as ZarrType); - }) - } - }); - - // We use a CSC Matrix because typically, RDF knowledge graphs tend to - // have more rows than columns - let x = matrix.lock(); - - Ok(x.to_csc()) + matrix: &Mutex>, + i: u64, + number_of_columns: u64, + first_term_idx: usize, + chunk: &[usize], + ) { + matrix + .lock() + 
.add_triplet(chunk[0], chunk[2], chunk[1] as usize); } fn sharding_factor(&self, dimensionality: &Dimensionality) -> usize { diff --git a/src/utils.rs b/src/utils.rs index 6680a32..eaf6396 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -1,7 +1,6 @@ -use std::collections::HashSet; - use fcsd::Set; use serde_json::Value; +use std::collections::HashSet; use zarrs::array::Array; pub fn rdf_to_value(terms: Set) -> Value { @@ -35,7 +34,7 @@ pub fn hash_to_set(terms: HashSet) -> Vec { pub fn rows_per_shard(arr: &Array) -> u64 { match arr.chunk_grid().chunk_shape(&[0, 0], arr.shape()) { Ok(shape) => match shape { - Some(chunk_shape) => chunk_shape[0], + Some(chunk_shape) => chunk_shape[0].into(), None => todo!(), }, Err(_) => todo!(), @@ -45,7 +44,7 @@ pub fn rows_per_shard(arr: &Array) -> u64 { pub fn columns_per_shard(arr: &Array) -> u64 { match arr.chunk_grid().chunk_shape(&[0, 0], arr.shape()) { Ok(shape) => match shape { - Some(chunk_shape) => chunk_shape[1], + Some(chunk_shape) => chunk_shape[1].into(), None => todo!(), }, Err(_) => todo!(), diff --git a/tests/common/mod.rs b/tests/common/mod.rs index 43b0645..daf67bd 100644 --- a/tests/common/mod.rs +++ b/tests/common/mod.rs @@ -1,15 +1,13 @@ #![allow(dead_code)] use remote_hdt::dictionary::Dictionary; +use remote_hdt::storage::params::Backend; use remote_hdt::storage::params::ChunkingStrategy; use remote_hdt::storage::params::ReferenceSystem; use remote_hdt::storage::Storage; -use remote_hdt::storage::ZarrType; -use safe_transmute::TriviallyTransmutable; use sprs::CsMat; use sprs::TriMat; use std::fs::File; -use zarrs::storage::store::FilesystemStore; pub const TABULAR_ZARR: &str = "tests/out/tabular.zarr"; pub const MATRIX_ZARR: &str = "tests/out/matrix.zarr"; @@ -20,23 +18,23 @@ pub const OPS_ZARR: &str = "tests/out/ops.zarr"; pub const TABULAR_PSO_ZARR: &str = "tests/out/tabular_pso.zarr"; pub const TABULAR_OPS_ZARR: &str = "tests/out/tabular_ops.zarr"; -pub fn setup( +pub fn setup( path: &str, - storage: &mut Storage, + storage: &mut Storage, chunking_strategy: ChunkingStrategy, reference_system: ReferenceSystem, ) { if File::open(path).is_err() { storage .serialize( - path, + Backend::FileSystem(path), "resources/rdf.nt", chunking_strategy, reference_system, ) .unwrap(); } else { - storage.load(path).unwrap(); + storage.load(Backend::FileSystem(path)).unwrap(); } } @@ -76,8 +74,8 @@ pub enum Predicate { } impl Predicate { - fn get_idx(self, dictionary: &Dictionary) -> ZarrType { - dictionary.get_predicate_idx_unchecked(self.into()) as ZarrType + fn get_idx(self, dictionary: &Dictionary) -> usize { + dictionary.get_predicate_idx_unchecked(self.into()) } } @@ -133,7 +131,7 @@ impl From for &str { pub struct Graph; impl Graph { - pub fn new(dictionary: &Dictionary) -> CsMat { + pub fn new(dictionary: &Dictionary) -> CsMat { let mut ans = TriMat::new((4, 9)); ans.add_triplet( diff --git a/tests/get_object_test.rs b/tests/get_object_test.rs index ef342a3..3348b4a 100644 --- a/tests/get_object_test.rs +++ b/tests/get_object_test.rs @@ -1,11 +1,12 @@ use remote_hdt::storage::matrix::MatrixLayout; use remote_hdt::storage::ops::Ops; use remote_hdt::storage::ops::OpsFormat; +use remote_hdt::storage::params::Backend; use remote_hdt::storage::params::ChunkingStrategy; use remote_hdt::storage::params::ReferenceSystem; use remote_hdt::storage::params::Serialization; use remote_hdt::storage::tabular::TabularLayout; -use remote_hdt::storage::LocalStorage; +use remote_hdt::storage::Storage; use sprs::TriMat; use std::error::Error; @@ -13,7 +14,7 @@ 
mod common; #[test] fn get_object_matrix_sharding_test() -> Result<(), Box> { - let mut storage = LocalStorage::new(MatrixLayout, Serialization::Zarr); + let mut storage = Storage::new(MatrixLayout, Serialization::Zarr); common::setup( common::SHARDING_ZARR, @@ -23,7 +24,7 @@ fn get_object_matrix_sharding_test() -> Result<(), Box> { ); let actual = match storage - .load(common::SHARDING_ZARR)? + .load(Backend::FileSystem(common::SHARDING_ZARR))? .get_object(common::Object::Date.into())? { OpsFormat::Zarr(actual) => actual, @@ -40,7 +41,7 @@ fn get_object_matrix_sharding_test() -> Result<(), Box> { #[test] fn get_object_tabular_test() -> Result<(), Box> { - let mut storage = LocalStorage::new(TabularLayout, Serialization::Sparse); + let mut storage = Storage::new(TabularLayout, Serialization::Sparse); common::setup( common::TABULAR_ZARR, @@ -50,7 +51,7 @@ fn get_object_tabular_test() -> Result<(), Box> { ); let actual = match storage - .load(common::TABULAR_ZARR)? + .load(Backend::FileSystem(common::TABULAR_ZARR))? .get_object(common::Object::Alan.into())? { OpsFormat::SparseArray(actual) => actual, diff --git a/tests/get_subject_test.rs b/tests/get_subject_test.rs index a291f95..011623d 100644 --- a/tests/get_subject_test.rs +++ b/tests/get_subject_test.rs @@ -1,11 +1,12 @@ use remote_hdt::storage::matrix::MatrixLayout; use remote_hdt::storage::ops::Ops; use remote_hdt::storage::ops::OpsFormat; +use remote_hdt::storage::params::Backend; use remote_hdt::storage::params::ChunkingStrategy; use remote_hdt::storage::params::ReferenceSystem; use remote_hdt::storage::params::Serialization; use remote_hdt::storage::tabular::TabularLayout; -use remote_hdt::storage::LocalStorage; +use remote_hdt::storage::Storage; use sprs::TriMat; use std::error::Error; @@ -13,7 +14,7 @@ mod common; #[test] fn get_subject_matrix_chunk_test() -> Result<(), Box> { - let mut storage = LocalStorage::new(MatrixLayout, Serialization::Zarr); + let mut storage = Storage::new(MatrixLayout, Serialization::Zarr); common::setup( common::MATRIX_ZARR, @@ -23,7 +24,7 @@ fn get_subject_matrix_chunk_test() -> Result<(), Box> { ); let actual = match storage - .load(common::MATRIX_ZARR)? + .load(Backend::FileSystem(common::MATRIX_ZARR))? .get_subject(common::Subject::Alan.into())? { OpsFormat::Zarr(actual) => actual, @@ -39,7 +40,7 @@ fn get_subject_matrix_chunk_test() -> Result<(), Box> { #[test] fn get_subject_matrix_sharding_test() -> Result<(), Box> { - let mut storage = LocalStorage::new(MatrixLayout, Serialization::Zarr); + let mut storage = Storage::new(MatrixLayout, Serialization::Zarr); common::setup( common::SHARDING_ZARR, @@ -49,7 +50,7 @@ fn get_subject_matrix_sharding_test() -> Result<(), Box> { ); let actual = match storage - .load(common::SHARDING_ZARR)? + .load(Backend::FileSystem(common::SHARDING_ZARR))? .get_subject(common::Subject::Wilmslow.into())? { OpsFormat::Zarr(actual) => actual, @@ -65,7 +66,7 @@ fn get_subject_matrix_sharding_test() -> Result<(), Box> { #[test] fn get_subject_tabular_test() -> Result<(), Box> { - let mut storage = LocalStorage::new(TabularLayout, Serialization::Sparse); + let mut storage = Storage::new(TabularLayout, Serialization::Sparse); common::setup( common::TABULAR_ZARR, @@ -75,7 +76,7 @@ fn get_subject_tabular_test() -> Result<(), Box> { ); let actual = match storage - .load(common::TABULAR_ZARR)? + .load(Backend::FileSystem(common::TABULAR_ZARR))? .get_subject(common::Subject::Alan.into())? 
{ OpsFormat::SparseArray(actual) => actual, diff --git a/tests/orientation.rs b/tests/orientation.rs index b2d13ea..96afc8b 100644 --- a/tests/orientation.rs +++ b/tests/orientation.rs @@ -1,18 +1,19 @@ use remote_hdt::storage::matrix::MatrixLayout; use remote_hdt::storage::ops::Ops; use remote_hdt::storage::ops::OpsFormat; +use remote_hdt::storage::params::Backend; use remote_hdt::storage::params::ChunkingStrategy; use remote_hdt::storage::params::ReferenceSystem; use remote_hdt::storage::params::Serialization; use remote_hdt::storage::tabular::TabularLayout; -use remote_hdt::storage::LocalStorage; +use remote_hdt::storage::Storage; use std::error::Error; mod common; #[test] fn orientation_pso_matrix_test() -> Result<(), Box> { - let mut storage = LocalStorage::new(MatrixLayout, Serialization::Zarr); + let mut storage = Storage::new(MatrixLayout, Serialization::Zarr); common::setup( common::PSO_ZARR, @@ -22,7 +23,7 @@ fn orientation_pso_matrix_test() -> Result<(), Box> { ); let actual = match storage - .load(common::PSO_ZARR)? + .load(Backend::FileSystem(common::PSO_ZARR))? .get_predicate(common::Predicate::InstanceOf.into())? { OpsFormat::Zarr(actual) => actual, @@ -38,7 +39,7 @@ fn orientation_pso_matrix_test() -> Result<(), Box> { #[test] fn orientation_ops_matrix_test() -> Result<(), Box> { - let mut storage = LocalStorage::new(MatrixLayout, Serialization::Zarr); + let mut storage = Storage::new(MatrixLayout, Serialization::Zarr); common::setup( common::OPS_ZARR, @@ -48,7 +49,7 @@ fn orientation_ops_matrix_test() -> Result<(), Box> { ); let actual = match storage - .load(common::OPS_ZARR)? + .load(Backend::FileSystem(common::OPS_ZARR))? .get_object(common::Object::Alan.into())? { OpsFormat::Zarr(actual) => actual, @@ -65,7 +66,7 @@ fn orientation_ops_matrix_test() -> Result<(), Box> { #[test] fn orientation_pso_tabular_test() -> Result<(), Box> { - let mut storage = LocalStorage::new(TabularLayout, Serialization::Sparse); + let mut storage = Storage::new(TabularLayout, Serialization::Sparse); common::setup( common::TABULAR_PSO_ZARR, @@ -75,7 +76,7 @@ fn orientation_pso_tabular_test() -> Result<(), Box> { ); let actual = match storage - .load(common::TABULAR_PSO_ZARR)? + .load(Backend::FileSystem(common::TABULAR_PSO_ZARR))? .get_predicate(common::Predicate::InstanceOf.into())? { OpsFormat::SparseArray(actual) => actual, @@ -125,7 +126,7 @@ fn orientation_pso_tabular_test() -> Result<(), Box> { #[test] fn orientation_ops_tabular_test() -> Result<(), Box> { - let mut storage = LocalStorage::new(TabularLayout, Serialization::Zarr); + let mut storage = Storage::new(TabularLayout, Serialization::Zarr); common::setup( common::TABULAR_OPS_ZARR, @@ -135,7 +136,7 @@ fn orientation_ops_tabular_test() -> Result<(), Box> { ); let actual = match storage - .load(common::TABULAR_OPS_ZARR)? + .load(Backend::FileSystem(common::TABULAR_OPS_ZARR))? .get_subject(common::Subject::Alan.into())? 
{ OpsFormat::Zarr(actual) => actual, diff --git a/tests/write_read_test.rs b/tests/write_read_test.rs index e2f2e3c..5ef445c 100644 --- a/tests/write_read_test.rs +++ b/tests/write_read_test.rs @@ -1,15 +1,16 @@ use remote_hdt::storage::matrix::MatrixLayout; +use remote_hdt::storage::params::Backend; use remote_hdt::storage::params::ChunkingStrategy; use remote_hdt::storage::params::ReferenceSystem; use remote_hdt::storage::params::Serialization; use remote_hdt::storage::tabular::TabularLayout; -use remote_hdt::storage::LocalStorage; +use remote_hdt::storage::Storage; mod common; #[test] fn write_read_tabular_test() { - let mut storage = LocalStorage::new(TabularLayout, Serialization::Sparse); + let mut storage = Storage::new(TabularLayout, Serialization::Sparse); common::setup( common::MATRIX_ZARR, @@ -18,7 +19,9 @@ fn write_read_tabular_test() { ReferenceSystem::SPO, ); - storage.load(common::TABULAR_ZARR).unwrap(); + storage + .load(Backend::FileSystem(common::TABULAR_ZARR)) + .unwrap(); assert_eq!( storage.get_sparse_array().unwrap(), @@ -28,7 +31,7 @@ fn write_read_tabular_test() { #[test] fn write_read_matrix_test() { - let mut storage = LocalStorage::new(MatrixLayout, Serialization::Sparse); + let mut storage = Storage::new(MatrixLayout, Serialization::Sparse); common::setup( common::MATRIX_ZARR, &mut storage, @@ -36,7 +39,9 @@ fn write_read_matrix_test() { ReferenceSystem::SPO, ); - storage.load(common::MATRIX_ZARR).unwrap(); + storage + .load(Backend::FileSystem(common::MATRIX_ZARR)) + .unwrap(); assert_eq!( storage.get_sparse_array().unwrap(), @@ -46,7 +51,7 @@ fn write_read_matrix_test() { #[test] fn write_read_matrix_sharding_test() { - let mut storage = LocalStorage::new(MatrixLayout, Serialization::Sparse); + let mut storage = Storage::new(MatrixLayout, Serialization::Sparse); common::setup( common::SHARDING_ZARR, @@ -55,7 +60,9 @@ fn write_read_matrix_sharding_test() { ReferenceSystem::SPO, ); - storage.load(common::SHARDING_ZARR).unwrap(); + storage + .load(Backend::FileSystem(common::SHARDING_ZARR)) + .unwrap(); assert_eq!( storage.get_sparse_array().unwrap(), @@ -65,7 +72,7 @@ fn write_read_matrix_sharding_test() { #[test] fn write_read_larger_than_triples_shard_test() { - let mut storage = LocalStorage::new(MatrixLayout, Serialization::Sparse); + let mut storage = Storage::new(MatrixLayout, Serialization::Sparse); common::setup( common::LARGER_ZARR, @@ -74,7 +81,9 @@ fn write_read_larger_than_triples_shard_test() { ReferenceSystem::SPO, ); - storage.load(common::LARGER_ZARR).unwrap(); + storage + .load(Backend::FileSystem(common::LARGER_ZARR)) + .unwrap(); assert_eq!( storage.get_sparse_array().unwrap(), From 3517f2dffdcd1b4d22446c3094e0518d54837510 Mon Sep 17 00:00:00 2001 From: angelip2303 Date: Mon, 12 Feb 2024 10:55:54 +0000 Subject: [PATCH 04/10] improving the project structure --- examples/http_bench.rs | 2 +- examples/load_bench.rs | 2 +- examples/ntriples/main.rs | 2 +- examples/query_bench.rs | 2 +- examples/rdf_xml/main.rs | 2 +- examples/serialize_bench.rs | 2 +- examples/turtle/main.rs | 2 +- src/main.rs | 2 +- src/storage/{ => layout}/matrix.rs | 6 +++--- src/storage/{layout.rs => layout/mod.rs} | 7 +++++-- src/storage/{ => layout}/tabular.rs | 14 +++++++------- src/storage/mod.rs | 4 +--- tests/get_object_test.rs | 4 ++-- tests/get_subject_test.rs | 4 ++-- tests/orientation.rs | 4 ++-- tests/write_read_test.rs | 4 ++-- 16 files changed, 32 insertions(+), 31 deletions(-) rename src/storage/{ => layout}/matrix.rs (97%) rename src/storage/{layout.rs => 
layout/mod.rs} (98%) rename src/storage/{ => layout}/tabular.rs (94%) diff --git a/examples/http_bench.rs b/examples/http_bench.rs index 0a04f26..0b9f4ea 100644 --- a/examples/http_bench.rs +++ b/examples/http_bench.rs @@ -1,5 +1,5 @@ use remote_hdt::error::RemoteHDTError; -use remote_hdt::storage::matrix::MatrixLayout; +use remote_hdt::storage::layout::matrix::MatrixLayout; use remote_hdt::storage::ops::Ops; use remote_hdt::storage::params::{Backend, Serialization}; use remote_hdt::storage::Storage; diff --git a/examples/load_bench.rs b/examples/load_bench.rs index d990658..d679950 100644 --- a/examples/load_bench.rs +++ b/examples/load_bench.rs @@ -1,6 +1,6 @@ use remote_hdt::error::RemoteHDTError; +use remote_hdt::storage::layout::tabular::TabularLayout; use remote_hdt::storage::params::{Backend, Serialization}; -use remote_hdt::storage::tabular::TabularLayout; use remote_hdt::storage::Storage; use std::env; use std::time::Instant; diff --git a/examples/ntriples/main.rs b/examples/ntriples/main.rs index 04b51fc..8ab2feb 100644 --- a/examples/ntriples/main.rs +++ b/examples/ntriples/main.rs @@ -1,6 +1,6 @@ use remote_hdt::error::RemoteHDTError; +use remote_hdt::storage::layout::tabular::TabularLayout; use remote_hdt::storage::params::{Backend, ChunkingStrategy, ReferenceSystem, Serialization}; -use remote_hdt::storage::tabular::TabularLayout; use remote_hdt::storage::Storage; pub fn main() -> Result<(), RemoteHDTError> { diff --git a/examples/query_bench.rs b/examples/query_bench.rs index 23dc992..f97e448 100644 --- a/examples/query_bench.rs +++ b/examples/query_bench.rs @@ -1,5 +1,5 @@ use remote_hdt::error::RemoteHDTError; -use remote_hdt::storage::matrix::MatrixLayout; +use remote_hdt::storage::layout::matrix::MatrixLayout; use remote_hdt::storage::ops::Ops; use remote_hdt::storage::params::{Backend, Serialization}; use remote_hdt::storage::Storage; diff --git a/examples/rdf_xml/main.rs b/examples/rdf_xml/main.rs index 872c428..c415701 100644 --- a/examples/rdf_xml/main.rs +++ b/examples/rdf_xml/main.rs @@ -1,6 +1,6 @@ use remote_hdt::error::RemoteHDTError; +use remote_hdt::storage::layout::tabular::TabularLayout; use remote_hdt::storage::params::{Backend, ChunkingStrategy, ReferenceSystem, Serialization}; -use remote_hdt::storage::tabular::TabularLayout; use remote_hdt::storage::Storage; pub fn main() -> Result<(), RemoteHDTError> { diff --git a/examples/serialize_bench.rs b/examples/serialize_bench.rs index aba915a..2e15eb6 100644 --- a/examples/serialize_bench.rs +++ b/examples/serialize_bench.rs @@ -1,5 +1,5 @@ use remote_hdt::error::RemoteHDTError; -use remote_hdt::storage::matrix::MatrixLayout; +use remote_hdt::storage::layout::matrix::MatrixLayout; use remote_hdt::storage::params::{Backend, ChunkingStrategy, ReferenceSystem, Serialization}; use remote_hdt::storage::Storage; use std::env; diff --git a/examples/turtle/main.rs b/examples/turtle/main.rs index 89acf2e..76fe24a 100644 --- a/examples/turtle/main.rs +++ b/examples/turtle/main.rs @@ -1,6 +1,6 @@ use remote_hdt::error::RemoteHDTError; +use remote_hdt::storage::layout::tabular::TabularLayout; use remote_hdt::storage::params::{Backend, ChunkingStrategy, ReferenceSystem, Serialization}; -use remote_hdt::storage::tabular::TabularLayout; use remote_hdt::storage::Storage; pub fn main() -> Result<(), RemoteHDTError> { diff --git a/src/main.rs b/src/main.rs index db495e5..b476eb9 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,9 +1,9 @@ use clap::Parser; +use remote_hdt::storage::layout::tabular::TabularLayout; use 
remote_hdt::storage::params::Backend; use remote_hdt::storage::params::ChunkingStrategy; use remote_hdt::storage::params::ReferenceSystem; use remote_hdt::storage::params::Serialization; -use remote_hdt::storage::tabular::TabularLayout; use remote_hdt::storage::Storage; use remote_hdt::storage::StorageResult; diff --git a/src/storage/matrix.rs b/src/storage/layout/matrix.rs similarity index 97% rename from src/storage/matrix.rs rename to src/storage/layout/matrix.rs index c2a5d7a..8d19e1b 100644 --- a/src/storage/matrix.rs +++ b/src/storage/layout/matrix.rs @@ -10,15 +10,15 @@ use zarrs::array::DataType; use zarrs::array::DimensionName; use zarrs::array::FillValue; -use super::layout::Layout; -use super::layout::LayoutOps; -use super::AtomicZarrType; use super::ChunkingStrategy; use super::Dimensionality; use super::ReferenceSystem; use super::StorageResult; use crate::io::Graph; +use crate::storage::layout::LayoutOps; +use crate::storage::AtomicZarrType; +use crate::storage::Layout; type Chunk = Vec<(u32, u32)>; diff --git a/src/storage/layout.rs b/src/storage/layout/mod.rs similarity index 98% rename from src/storage/layout.rs rename to src/storage/layout/mod.rs index d0385ac..03b5859 100644 --- a/src/storage/layout.rs +++ b/src/storage/layout/mod.rs @@ -26,7 +26,10 @@ use super::ZarrArray; type ArrayToBytesCodec = Box; -pub trait LayoutOps { +pub mod matrix; +pub mod tabular; + +pub(crate) trait LayoutOps { fn retrieve_attributes(&mut self, arr: &Array) -> StorageResult { // 4. We get the attributes so we can obtain some values that we will need let attributes = arr.attributes(); @@ -155,7 +158,7 @@ pub trait LayoutOps { fn sharding_factor(&self, dimensionality: &Dimensionality) -> usize; } -pub trait Layout: LayoutOps { +pub(crate) trait Layout: LayoutOps { fn shape(&self, dimensionality: &Dimensionality) -> Vec; fn data_type(&self) -> DataType; fn chunk_shape( diff --git a/src/storage/tabular.rs b/src/storage/layout/tabular.rs similarity index 94% rename from src/storage/tabular.rs rename to src/storage/layout/tabular.rs index 3dbfcb2..7ee9c15 100644 --- a/src/storage/tabular.rs +++ b/src/storage/layout/tabular.rs @@ -10,15 +10,15 @@ use zarrs::array::DataType; use zarrs::array::DimensionName; use zarrs::array::FillValue; -use crate::io::Graph; - -use super::layout::Layout; -use super::layout::LayoutOps; -use super::params::ChunkingStrategy; -use super::params::Dimensionality; -use super::params::ReferenceSystem; +use super::ChunkingStrategy; +use super::Dimensionality; +use super::ReferenceSystem; use super::StorageResult; +use crate::io::Graph; +use crate::storage::layout::LayoutOps; +use crate::storage::Layout; + type Chunk = (u32, u32, u32); pub struct TabularLayout; diff --git a/src/storage/mod.rs b/src/storage/mod.rs index e3c03aa..3a4bd72 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -25,11 +25,9 @@ use self::params::Dimensionality; use self::params::ReferenceSystem; use self::params::Serialization; -mod layout; -pub mod matrix; +pub mod layout; pub mod ops; pub mod params; -pub mod tabular; pub type ZarrArray = CsMat; type AtomicZarrType = AtomicU64; diff --git a/tests/get_object_test.rs b/tests/get_object_test.rs index 3348b4a..03f8e38 100644 --- a/tests/get_object_test.rs +++ b/tests/get_object_test.rs @@ -1,11 +1,11 @@ -use remote_hdt::storage::matrix::MatrixLayout; +use remote_hdt::storage::layout::matrix::MatrixLayout; +use remote_hdt::storage::layout::tabular::TabularLayout; use remote_hdt::storage::ops::Ops; use remote_hdt::storage::ops::OpsFormat; use 
remote_hdt::storage::params::Backend; use remote_hdt::storage::params::ChunkingStrategy; use remote_hdt::storage::params::ReferenceSystem; use remote_hdt::storage::params::Serialization; -use remote_hdt::storage::tabular::TabularLayout; use remote_hdt::storage::Storage; use sprs::TriMat; use std::error::Error; diff --git a/tests/get_subject_test.rs b/tests/get_subject_test.rs index 011623d..d8a44fd 100644 --- a/tests/get_subject_test.rs +++ b/tests/get_subject_test.rs @@ -1,11 +1,11 @@ -use remote_hdt::storage::matrix::MatrixLayout; +use remote_hdt::storage::layout::matrix::MatrixLayout; +use remote_hdt::storage::layout::tabular::TabularLayout; use remote_hdt::storage::ops::Ops; use remote_hdt::storage::ops::OpsFormat; use remote_hdt::storage::params::Backend; use remote_hdt::storage::params::ChunkingStrategy; use remote_hdt::storage::params::ReferenceSystem; use remote_hdt::storage::params::Serialization; -use remote_hdt::storage::tabular::TabularLayout; use remote_hdt::storage::Storage; use sprs::TriMat; use std::error::Error; diff --git a/tests/orientation.rs b/tests/orientation.rs index 96afc8b..71c05df 100644 --- a/tests/orientation.rs +++ b/tests/orientation.rs @@ -1,11 +1,11 @@ -use remote_hdt::storage::matrix::MatrixLayout; +use remote_hdt::storage::layout::matrix::MatrixLayout; +use remote_hdt::storage::layout::tabular::TabularLayout; use remote_hdt::storage::ops::Ops; use remote_hdt::storage::ops::OpsFormat; use remote_hdt::storage::params::Backend; use remote_hdt::storage::params::ChunkingStrategy; use remote_hdt::storage::params::ReferenceSystem; use remote_hdt::storage::params::Serialization; -use remote_hdt::storage::tabular::TabularLayout; use remote_hdt::storage::Storage; use std::error::Error; diff --git a/tests/write_read_test.rs b/tests/write_read_test.rs index 5ef445c..0243266 100644 --- a/tests/write_read_test.rs +++ b/tests/write_read_test.rs @@ -1,9 +1,9 @@ -use remote_hdt::storage::matrix::MatrixLayout; +use remote_hdt::storage::layout::matrix::MatrixLayout; +use remote_hdt::storage::layout::tabular::TabularLayout; use remote_hdt::storage::params::Backend; use remote_hdt::storage::params::ChunkingStrategy; use remote_hdt::storage::params::ReferenceSystem; use remote_hdt::storage::params::Serialization; -use remote_hdt::storage::tabular::TabularLayout; use remote_hdt::storage::Storage; mod common; From 1851df4ed6aeb55dc1b146a747adc12ed560e0d9 Mon Sep 17 00:00:00 2001 From: angelip2303 Date: Mon, 12 Feb 2024 13:13:49 +0000 Subject: [PATCH 05/10] fixing some tests --- src/engine/chunk.rs | 14 +++++++------- src/error.rs | 3 +++ src/storage/layout/matrix.rs | 21 ++++++--------------- src/storage/layout/mod.rs | 29 ++++++++++++++--------------- src/storage/layout/tabular.rs | 8 ++------ tests/write_read_test.rs | 3 ++- 6 files changed, 34 insertions(+), 44 deletions(-) diff --git a/src/engine/chunk.rs b/src/engine/chunk.rs index 4de33fb..0e5bad5 100644 --- a/src/engine/chunk.rs +++ b/src/engine/chunk.rs @@ -14,10 +14,10 @@ impl EngineStrategy> for Array { let chunk_to_index = index as u64 % rows_per_shard(self); let ans = self.retrieve_chunk_subset_elements( &[index_to_chunk, 0], - &ArraySubset::new_with_start_end_inc( - vec![chunk_to_index, 0], - vec![chunk_to_index, columns_per_shard(self) - 1], - )?, + &ArraySubset::new_with_ranges(&[ + chunk_to_index..chunk_to_index + 1, + 0..columns_per_shard(self), + ]), )?; Ok(ans.to_vec()) } @@ -27,9 +27,9 @@ impl EngineStrategy> for Array { } fn get_third_term(&self, index: usize) -> EngineResult> { - let start = vec![0, index 
as u64]; - let end = vec![self.shape()[0], index as u64]; - let shape = &ArraySubset::new_with_start_end_inc(start, end)?; + let last_chunk = self.shape()[0] / rows_per_shard(self); + let col = index as u64; + let shape = &ArraySubset::new_with_ranges(&[0..last_chunk, col..col + 1]); let ans = self.retrieve_array_subset_elements(shape)?; Ok(ans.to_vec()) } diff --git a/src/error.rs b/src/error.rs index 7b41ac1..ed446fe 100644 --- a/src/error.rs +++ b/src/error.rs @@ -3,6 +3,7 @@ use thiserror::Error; use zarrs::array::codec::bytes_to_bytes::gzip::GzipCompressionLevelError; use zarrs::array::ArrayCreateError; use zarrs::array::ArrayError; +use zarrs::array::NonZeroError; use zarrs::array_subset::IncompatibleDimensionalityError; use zarrs::array_subset::IncompatibleStartEndIndicesError; use zarrs::group::GroupCreateError; @@ -56,6 +57,8 @@ pub enum RemoteHDTError { ReadOnlyBackend, #[error("Error while parsing the RDF graph")] RdfParse, + #[error(transparent)] + NonZero(#[from] NonZeroError), } #[derive(Error, Debug)] diff --git a/src/storage/layout/matrix.rs b/src/storage/layout/matrix.rs index 8d19e1b..9bd5d13 100644 --- a/src/storage/layout/matrix.rs +++ b/src/storage/layout/matrix.rs @@ -85,13 +85,8 @@ impl Layout for MatrixLayout { &self, dimensionality: &Dimensionality, ) -> StorageResult> { - let mut sharding_codec_builder = ShardingCodecBuilder::new( - vec![ - NonZeroU64::new(1).unwrap(), - NonZeroU64::new(dimensionality.get_third_term_size()).unwrap(), - ] - .into(), - ); + let mut sharding_codec_builder = + ShardingCodecBuilder::new(vec![1, dimensionality.get_third_term_size()].try_into()?); sharding_codec_builder.bytes_to_bytes_codecs(vec![Box::new(GzipCodec::new(5)?)]); Ok(Box::new(sharding_codec_builder.build())) } @@ -128,9 +123,7 @@ impl LayoutOps for MatrixLayout { fn retrieve_chunk_elements( &mut self, matrix: &Mutex>, - i: u64, - number_of_columns: u64, - first_term_idx: usize, + first_term_index: usize, chunk: &[usize], ) { chunk @@ -138,11 +131,9 @@ impl LayoutOps for MatrixLayout { .enumerate() .for_each(|(third_term_idx, &second_term_idx)| { if second_term_idx != 0 { - matrix.lock().add_triplet( - first_term_idx + (i * number_of_columns) as usize, - third_term_idx, - second_term_idx, - ); + matrix + .lock() + .add_triplet(first_term_index, third_term_idx, second_term_idx); } }) } diff --git a/src/storage/layout/mod.rs b/src/storage/layout/mod.rs index 03b5859..f21281b 100644 --- a/src/storage/layout/mod.rs +++ b/src/storage/layout/mod.rs @@ -29,7 +29,7 @@ type ArrayToBytesCodec = Box; pub mod matrix; pub mod tabular; -pub(crate) trait LayoutOps { +pub trait LayoutOps { fn retrieve_attributes(&mut self, arr: &Array) -> StorageResult { // 4. 
We get the attributes so we can obtain some values that we will need let attributes = arr.attributes(); @@ -70,13 +70,13 @@ pub(crate) trait LayoutOps { let iter = binding.chunks_exact(rows_per_shard(&arr) as usize); let remainder = iter.remainder(); - let _ = iter.map(|chunk| { - count.fetch_add(1, Ordering::Relaxed); + for chunk in iter { arr.store_chunk_elements( &[count.load(Ordering::Relaxed), 0], self.store_chunk_elements(chunk, columns), - ) - }); + )?; + count.fetch_add(1, Ordering::Relaxed); + } if !remainder.is_empty() { arr.store_array_subset_elements( @@ -108,10 +108,11 @@ pub(crate) trait LayoutOps { dimensionality.third_term_size, // we obtain the size of the third terms ))); - // We compute the number of chunks; for us to achieve so, we have to obtain + // We compute the number of shards; for us to achieve so, we have to obtain // first dimension of the chunk grid - let number_of_chunks = match arr.chunk_grid_shape() { + let number_of_shards = match arr.chunk_grid_shape() { Some(chunk_grid) => chunk_grid[0], + None => 0, }; @@ -123,16 +124,16 @@ pub(crate) trait LayoutOps { // low, as instead of parsing the whole array, we process smaller pieces // of it. Once we have all the pieces processed, we will have parsed the // whole array - for i in 0..number_of_chunks { - arr.retrieve_chunk_elements(&[i, 0])? + for shard in 0..number_of_shards { + arr.retrieve_chunk_elements(&[shard, 0])? + // We divide each shard by the number of columns, as a shard is + // composed of chunks having the size of [1, number of cols] .chunks(number_of_columns) .enumerate() .for_each(|(first_term_idx, chunk)| { self.retrieve_chunk_elements( &matrix, - i, - number_of_columns as u64, - first_term_idx, + first_term_idx + (shard * rows_per_shard(arr)) as usize, chunk, ); }) @@ -150,15 +151,13 @@ pub(crate) trait LayoutOps { fn retrieve_chunk_elements( &mut self, matrix: &Mutex>, - i: u64, - number_of_columns: u64, first_term_idx: usize, chunk: &[usize], ); fn sharding_factor(&self, dimensionality: &Dimensionality) -> usize; } -pub(crate) trait Layout: LayoutOps { +pub trait Layout: LayoutOps { fn shape(&self, dimensionality: &Dimensionality) -> Vec; fn data_type(&self) -> DataType; fn chunk_shape( diff --git a/src/storage/layout/tabular.rs b/src/storage/layout/tabular.rs index 7ee9c15..b6a5455 100644 --- a/src/storage/layout/tabular.rs +++ b/src/storage/layout/tabular.rs @@ -51,9 +51,7 @@ impl Layout for TabularLayout { &self, _: &Dimensionality, ) -> StorageResult> { - let mut sharding_codec_builder = ShardingCodecBuilder::new( - vec![NonZeroU64::new(1).unwrap(), NonZeroU64::new(3).unwrap()].into(), - ); + let mut sharding_codec_builder = ShardingCodecBuilder::new(vec![1, 3].try_into()?); sharding_codec_builder.bytes_to_bytes_codecs(vec![Box::new(GzipCodec::new(5)?)]); Ok(Box::new(sharding_codec_builder.build())) } @@ -86,9 +84,7 @@ impl LayoutOps for TabularLayout { fn retrieve_chunk_elements( &mut self, matrix: &Mutex>, - i: u64, - number_of_columns: u64, - first_term_idx: usize, + first_term_index: usize, // TODO: will first_term_index instead of chunk[0] do the trick? 
chunk: &[usize], ) { matrix diff --git a/tests/write_read_test.rs b/tests/write_read_test.rs index 0243266..6ee9550 100644 --- a/tests/write_read_test.rs +++ b/tests/write_read_test.rs @@ -13,7 +13,7 @@ fn write_read_tabular_test() { let mut storage = Storage::new(TabularLayout, Serialization::Sparse); common::setup( - common::MATRIX_ZARR, + common::TABULAR_ZARR, &mut storage, ChunkingStrategy::Chunk, ReferenceSystem::SPO, @@ -32,6 +32,7 @@ fn write_read_tabular_test() { #[test] fn write_read_matrix_test() { let mut storage = Storage::new(MatrixLayout, Serialization::Sparse); + common::setup( common::MATRIX_ZARR, &mut storage, From 2d69b9a7783b4db344d37a20d3a5da4323b609d9 Mon Sep 17 00:00:00 2001 From: angelip2303 Date: Tue, 27 Feb 2024 09:53:17 +0000 Subject: [PATCH 06/10] improvements --- Cargo.toml | 2 +- src/engine/chunk.rs | 32 ++++++++++++++------------------ src/error.rs | 3 +++ src/storage/layout/matrix.rs | 12 ++++++------ src/storage/layout/mod.rs | 8 +++----- src/storage/layout/tabular.rs | 12 ++++++------ src/storage/mod.rs | 4 ++-- src/storage/ops.rs | 2 +- tests/get_subject_test.rs | 2 +- 9 files changed, 37 insertions(+), 40 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index dabe10a..ac39d2e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,7 +5,7 @@ version = "0.0.1" edition = "2021" [dependencies] -zarrs = { version = "0.11.6", default-features = false, features = [ "http", "gzip", "sharding", "opendal", "async", "ndarray" ] } +zarrs = { version = "0.12.2", default-features = false, features = [ "http", "gzip", "sharding", "opendal", "async", "ndarray" ] } clap = { version = "4.1.8", features = ["derive"] } serde_json = "1.0.108" thiserror = "1.0.50" diff --git a/src/engine/chunk.rs b/src/engine/chunk.rs index 0e5bad5..d92901e 100644 --- a/src/engine/chunk.rs +++ b/src/engine/chunk.rs @@ -8,29 +8,25 @@ use crate::utils::rows_per_shard; use super::EngineResult; use super::EngineStrategy; -impl EngineStrategy> for Array { - fn get_first_term(&self, index: usize) -> EngineResult> { - let index_to_chunk = index as u64 / rows_per_shard(self); - let chunk_to_index = index as u64 % rows_per_shard(self); - let ans = self.retrieve_chunk_subset_elements( - &[index_to_chunk, 0], - &ArraySubset::new_with_ranges(&[ - chunk_to_index..chunk_to_index + 1, - 0..columns_per_shard(self), - ]), - )?; - Ok(ans.to_vec()) +impl EngineStrategy> for Array { + fn get_first_term(&self, index: usize) -> EngineResult> { + let shard_index = index as u64 / rows_per_shard(self); + let shard = self.retrieve_chunk_elements(&[shard_index, 0])?; + let chunk_index = index as u64 % rows_per_shard(self); + let start = (chunk_index * columns_per_shard(self)) as usize; + let end = start + columns_per_shard(self) as usize; + let chunk: &[u32] = &shard[start..end]; + Ok(chunk.to_vec()) } - fn get_second_term(&self, _index: usize) -> EngineResult> { + fn get_second_term(&self, _index: usize) -> EngineResult> { unimplemented!() } - fn get_third_term(&self, index: usize) -> EngineResult> { - let last_chunk = self.shape()[0] / rows_per_shard(self); + fn get_third_term(&self, index: usize) -> EngineResult> { let col = index as u64; - let shape = &ArraySubset::new_with_ranges(&[0..last_chunk, col..col + 1]); - let ans = self.retrieve_array_subset_elements(shape)?; - Ok(ans.to_vec()) + let shape = ArraySubset::new_with_start_end_inc(vec![0, col], vec![self.shape()[0], col])?; + let ans = self.retrieve_array_subset_elements(&shape)?; + Ok(ans) } } diff --git a/src/error.rs b/src/error.rs index ed446fe..7868acb 100644 --- 
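The reworked get_first_term above no longer asks zarrs for a row subset; it retrieves the whole shard that contains the requested row and slices that row out of the flattened buffer. Below is a small sketch of just the index arithmetic, with made-up shard dimensions; locate_row is a standalone stand-in for illustration, the real values come from rows_per_shard and columns_per_shard.

    // Returns (shard to retrieve, start..end range of the row inside the flat shard buffer).
    fn locate_row(index: u64, rows_per_shard: u64, cols_per_shard: u64) -> (u64, usize, usize) {
        let shard_index = index / rows_per_shard;   // which shard holds the row
        let row_in_shard = index % rows_per_shard;  // position of the row inside that shard
        let start = (row_in_shard * cols_per_shard) as usize;
        let end = start + cols_per_shard as usize;
        (shard_index, start, end)
    }

    fn main() {
        // With 3 rows per shard and 9 columns, global row 7 is row 1 of shard 2,
        // i.e. elements 9..18 of that shard's flattened data.
        assert_eq!(locate_row(7, 3, 9), (2, 9, 18));
    }
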
a/src/error.rs +++ b/src/error.rs @@ -1,6 +1,7 @@ use std::convert::Infallible; use thiserror::Error; use zarrs::array::codec::bytes_to_bytes::gzip::GzipCompressionLevelError; +use zarrs::array::codec::CodecError; use zarrs::array::ArrayCreateError; use zarrs::array::ArrayError; use zarrs::array::NonZeroError; @@ -71,6 +72,8 @@ pub enum EngineError { Operation, #[error(transparent)] IncompatibleStartEndIndicesError(#[from] IncompatibleStartEndIndicesError), + #[error(transparent)] + Codec(#[from] CodecError), } #[derive(Error, Debug)] diff --git a/src/storage/layout/matrix.rs b/src/storage/layout/matrix.rs index 9bd5d13..69f7fd2 100644 --- a/src/storage/layout/matrix.rs +++ b/src/storage/layout/matrix.rs @@ -33,7 +33,7 @@ impl Layout for MatrixLayout { } fn data_type(&self) -> DataType { - DataType::UInt64 + DataType::UInt32 } fn chunk_shape( @@ -49,7 +49,7 @@ impl Layout for MatrixLayout { } fn fill_value(&self) -> FillValue { - FillValue::from(0u64) + FillValue::from(0u32) } fn dimension_names(&self, reference_system: &ReferenceSystem) -> Option> { @@ -97,12 +97,12 @@ impl LayoutOps for MatrixLayout { graph } - fn store_chunk_elements(&self, chunk: &[Chunk], columns: usize) -> Vec { + fn store_chunk_elements(&self, chunk: &[Chunk], columns: usize) -> Vec { // We create a slice that has the size of the chunk filled with 0 values // having the size of the shard; that is, number of rows, and a given // number of columns. This value is converted into an AtomicU8 for us to // be able to share it among threads - let slice: Vec = vec![0u64; chunk.len() * columns] + let slice: Vec = vec![0u32; chunk.len() * columns] .iter() .map(|&n| AtomicZarrType::new(n)) .collect(); @@ -110,14 +110,14 @@ impl LayoutOps for MatrixLayout { for (first_term, triples) in chunk.iter().enumerate() { triples.iter().for_each(|&(second_term, third_term)| { let third_term_idx = third_term as usize + first_term * columns; - slice[third_term_idx].store(second_term as u64, Ordering::Relaxed); + slice[third_term_idx].store(second_term, Ordering::Relaxed); }); } slice .iter() .map(|elem| elem.load(Ordering::Relaxed)) - .collect::>() + .collect::>() } fn retrieve_chunk_elements( diff --git a/src/storage/layout/mod.rs b/src/storage/layout/mod.rs index f21281b..a50e2a8 100644 --- a/src/storage/layout/mod.rs +++ b/src/storage/layout/mod.rs @@ -71,10 +71,8 @@ pub trait LayoutOps { let remainder = iter.remainder(); for chunk in iter { - arr.store_chunk_elements( - &[count.load(Ordering::Relaxed), 0], - self.store_chunk_elements(chunk, columns), - )?; + let slice = self.store_chunk_elements(chunk, columns); + arr.store_chunk_elements(&[count.load(Ordering::Relaxed), 0], slice)?; count.fetch_add(1, Ordering::Relaxed); } @@ -147,7 +145,7 @@ pub trait LayoutOps { } fn graph_iter(&self, graph: Graph) -> Vec; - fn store_chunk_elements(&self, chunk: &[C], columns: usize) -> Vec; + fn store_chunk_elements(&self, chunk: &[C], columns: usize) -> Vec; fn retrieve_chunk_elements( &mut self, matrix: &Mutex>, diff --git a/src/storage/layout/tabular.rs b/src/storage/layout/tabular.rs index b6a5455..d0e8115 100644 --- a/src/storage/layout/tabular.rs +++ b/src/storage/layout/tabular.rs @@ -29,7 +29,7 @@ impl Layout for TabularLayout { } fn data_type(&self) -> DataType { - DataType::UInt64 + DataType::UInt32 } fn chunk_shape(&self, chunking_strategy: ChunkingStrategy, _: &Dimensionality) -> ChunkGrid { @@ -37,7 +37,7 @@ impl Layout for TabularLayout { } fn fill_value(&self) -> FillValue { - FillValue::from(0u64) + FillValue::from(0u32) } fn 
dimension_names(&self, _: &ReferenceSystem) -> Option> { @@ -71,12 +71,12 @@ impl LayoutOps for TabularLayout { .collect::>() } - fn store_chunk_elements(&self, chunk: &[Chunk], _: usize) -> Vec { + fn store_chunk_elements(&self, chunk: &[Chunk], _: usize) -> Vec { let mut ans = Vec::new(); for &(first_term, second_term, third_term) in chunk { - ans.push(first_term as u64); - ans.push(second_term as u64); - ans.push(third_term as u64); + ans.push(first_term); + ans.push(second_term); + ans.push(third_term); } ans } diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 3a4bd72..75acf75 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -2,7 +2,7 @@ use serde_json::Map; use sprs::CsMat; use std::path::PathBuf; use std::str::FromStr; -use std::sync::atomic::AtomicU64; +use std::sync::atomic::AtomicU32; use std::sync::Arc; use zarrs::array::Array; use zarrs::array::ArrayBuilder; @@ -30,7 +30,7 @@ pub mod ops; pub mod params; pub type ZarrArray = CsMat; -type AtomicZarrType = AtomicU64; +type AtomicZarrType = AtomicU32; pub type StorageResult = Result; const ARRAY_NAME: &str = "/group/RemoteHDT"; // TODO: parameterize this diff --git a/src/storage/ops.rs b/src/storage/ops.rs index d74f31c..b2fb565 100644 --- a/src/storage/ops.rs +++ b/src/storage/ops.rs @@ -10,7 +10,7 @@ pub type OpsResult = Result; pub enum OpsFormat { SparseArray(ZarrArray), - Zarr(Vec), + Zarr(Vec), } pub trait Ops { diff --git a/tests/get_subject_test.rs b/tests/get_subject_test.rs index d8a44fd..171e744 100644 --- a/tests/get_subject_test.rs +++ b/tests/get_subject_test.rs @@ -45,7 +45,7 @@ fn get_subject_matrix_sharding_test() -> Result<(), Box> { common::setup( common::SHARDING_ZARR, &mut storage, - ChunkingStrategy::Sharding(3), + ChunkingStrategy::Sharding(4), ReferenceSystem::SPO, ); From 36815851c3f825911cbc1d32f361bd48babfee04 Mon Sep 17 00:00:00 2001 From: angelip2303 Date: Thu, 7 Mar 2024 11:22:06 +0000 Subject: [PATCH 07/10] get_predicate implemented --- .gitignore | 3 +- Cargo.toml | 2 +- examples/{serialize_bench.rs => serialize.rs} | 2 +- src/engine/array.rs | 10 ++- src/engine/chunk.rs | 30 +++++-- src/storage/layout/matrix.rs | 17 ++-- src/storage/layout/mod.rs | 8 +- src/storage/layout/tabular.rs | 6 +- src/storage/mod.rs | 7 +- src/storage/params.rs | 10 +-- tests/common/mod.rs | 6 +- tests/get_predicate_test.rs | 88 +++++++++++++++++++ tests/orientation.rs | 47 ++-------- 13 files changed, 161 insertions(+), 75 deletions(-) rename examples/{serialize_bench.rs => serialize.rs} (90%) create mode 100644 tests/get_predicate_test.rs diff --git a/.gitignore b/.gitignore index 562046a..7f51346 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,5 @@ benches/*/*.nt !resources/root.zarr .vscode heaptrack.* -tests/out \ No newline at end of file +tests/out +uniprotkb_* \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index ac39d2e..95475c4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,7 +5,7 @@ version = "0.0.1" edition = "2021" [dependencies] -zarrs = { version = "0.12.2", default-features = false, features = [ "http", "gzip", "sharding", "opendal", "async", "ndarray" ] } +zarrs = { version = "0.12.3", default-features = false, features = [ "http", "gzip", "sharding", "opendal", "async", "ndarray" ] } clap = { version = "4.1.8", features = ["derive"] } serde_json = "1.0.108" thiserror = "1.0.50" diff --git a/examples/serialize_bench.rs b/examples/serialize.rs similarity index 90% rename from examples/serialize_bench.rs rename to examples/serialize.rs index 2e15eb6..421c892 100644 --- 
a/examples/serialize_bench.rs +++ b/examples/serialize.rs @@ -12,7 +12,7 @@ static ALLOCATOR: jemallocator::Jemalloc = jemallocator::Jemalloc; fn main() -> Result<(), RemoteHDTError> { let args: Vec = env::args().collect(); if args.len() <= 3 { - panic!("Usage: cargo run --example serialize_bench "); + panic!("Usage: cargo run --example serialize "); } let rdf_path = &args[1].as_str(); diff --git a/src/engine/array.rs b/src/engine/array.rs index 3a97292..0470603 100644 --- a/src/engine/array.rs +++ b/src/engine/array.rs @@ -13,8 +13,14 @@ impl EngineStrategy for ZarrArray { Ok(&matrix * self) } - fn get_second_term(&self, _value: usize) -> EngineResult { - unimplemented!() + fn get_second_term(&self, value: usize) -> EngineResult { + let mut matrix = TriMat::new((self.rows(), self.cols())); + self.iter().for_each(|(&e, (row, col))| { + if e == value { + matrix.add_triplet(row, col, value); + } + }); + Ok(matrix.to_csc()) } fn get_third_term(&self, index: usize) -> EngineResult { diff --git a/src/engine/chunk.rs b/src/engine/chunk.rs index d92901e..0f37f6c 100644 --- a/src/engine/chunk.rs +++ b/src/engine/chunk.rs @@ -2,6 +2,7 @@ use zarrs::array::Array; use zarrs::array_subset::ArraySubset; use zarrs::storage::ReadableStorageTraits; +use crate::error::EngineError; use crate::utils::columns_per_shard; use crate::utils::rows_per_shard; @@ -19,14 +20,33 @@ impl EngineStrategy> for Array { Ok(chunk.to_vec()) } - fn get_second_term(&self, _index: usize) -> EngineResult> { - unimplemented!() + fn get_second_term(&self, index: usize) -> EngineResult> { + let mut ans = Vec::new(); + let number_of_shards = match self.chunk_grid_shape() { + Some(chunk_grid) => chunk_grid[0], + None => return Err(EngineError::Operation), + }; + for i in 0..number_of_shards { + let mut shard = self.retrieve_chunk_elements::(&[i, 0])?; + shard.iter_mut().for_each(|e| { + if *e != index as u32 { + *e = 0 + } + }); + ans.append(&mut shard); + } + Ok(ans) } fn get_third_term(&self, index: usize) -> EngineResult> { + let objects = self.shape()[0]; let col = index as u64; - let shape = ArraySubset::new_with_start_end_inc(vec![0, col], vec![self.shape()[0], col])?; - let ans = self.retrieve_array_subset_elements(&shape)?; - Ok(ans) + let shape = ArraySubset::new_with_ranges(&[0..objects, col..col + 1]); + let array_subset = self.retrieve_array_subset(&shape).unwrap(); + let third_term_subset = array_subset + .windows(4) + .map(|w| u32::from_ne_bytes(w.try_into().unwrap())) + .collect::>(); + Ok(third_term_subset) } } diff --git a/src/storage/layout/matrix.rs b/src/storage/layout/matrix.rs index 69f7fd2..f2bc735 100644 --- a/src/storage/layout/matrix.rs +++ b/src/storage/layout/matrix.rs @@ -85,8 +85,11 @@ impl Layout for MatrixLayout { &self, dimensionality: &Dimensionality, ) -> StorageResult> { - let mut sharding_codec_builder = - ShardingCodecBuilder::new(vec![1, dimensionality.get_third_term_size()].try_into()?); + let mut sharding_codec_builder = ShardingCodecBuilder::new( + vec![1, dimensionality.get_third_term_size()] + .as_slice() + .try_into()?, + ); sharding_codec_builder.bytes_to_bytes_codecs(vec![Box::new(GzipCodec::new(5)?)]); Ok(Box::new(sharding_codec_builder.build())) } @@ -124,16 +127,18 @@ impl LayoutOps for MatrixLayout { &mut self, matrix: &Mutex>, first_term_index: usize, - chunk: &[usize], + chunk: &[u32], ) { chunk .iter() .enumerate() .for_each(|(third_term_idx, &second_term_idx)| { if second_term_idx != 0 { - matrix - .lock() - .add_triplet(first_term_index, third_term_idx, second_term_idx); + 
matrix.lock().add_triplet( + first_term_index, + third_term_idx, + second_term_idx as usize, + ); } }) } diff --git a/src/storage/layout/mod.rs b/src/storage/layout/mod.rs index a50e2a8..11657ad 100644 --- a/src/storage/layout/mod.rs +++ b/src/storage/layout/mod.rs @@ -72,12 +72,12 @@ pub trait LayoutOps { for chunk in iter { let slice = self.store_chunk_elements(chunk, columns); - arr.store_chunk_elements(&[count.load(Ordering::Relaxed), 0], slice)?; + arr.store_chunk_elements::(&[count.load(Ordering::Relaxed), 0], slice)?; count.fetch_add(1, Ordering::Relaxed); } if !remainder.is_empty() { - arr.store_array_subset_elements( + arr.store_array_subset_elements::( &ArraySubset::new_with_start_shape( vec![count.load(Ordering::Relaxed) * rows_per_shard(&arr), 0], vec![remainder.len() as u64, columns_per_shard(&arr)], @@ -123,7 +123,7 @@ pub trait LayoutOps { // of it. Once we have all the pieces processed, we will have parsed the // whole array for shard in 0..number_of_shards { - arr.retrieve_chunk_elements(&[shard, 0])? + arr.retrieve_chunk_elements::(&[shard, 0])? // We divide each shard by the number of columns, as a shard is // composed of chunks having the size of [1, number of cols] .chunks(number_of_columns) @@ -150,7 +150,7 @@ pub trait LayoutOps { &mut self, matrix: &Mutex>, first_term_idx: usize, - chunk: &[usize], + chunk: &[u32], ); fn sharding_factor(&self, dimensionality: &Dimensionality) -> usize; } diff --git a/src/storage/layout/tabular.rs b/src/storage/layout/tabular.rs index d0e8115..be6c520 100644 --- a/src/storage/layout/tabular.rs +++ b/src/storage/layout/tabular.rs @@ -84,12 +84,12 @@ impl LayoutOps for TabularLayout { fn retrieve_chunk_elements( &mut self, matrix: &Mutex>, - first_term_index: usize, // TODO: will first_term_index instead of chunk[0] do the trick? - chunk: &[usize], + _first_term_index: usize, // TODO: will first_term_index instead of chunk[0] do the trick? 
+ chunk: &[u32], ) { matrix .lock() - .add_triplet(chunk[0], chunk[2], chunk[1] as usize); + .add_triplet(chunk[0] as usize, chunk[2] as usize, chunk[1] as usize); } fn sharding_factor(&self, dimensionality: &Dimensionality) -> usize { diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 75acf75..8035e77 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -104,8 +104,7 @@ impl Storage { // Create a group and write metadata to filesystem let group = GroupBuilder::new().build(store.clone(), "/group")?; - - let _ = group.store_metadata()?; + group.store_metadata()?; // TODO: rayon::ThreadPoolBuilder::new() // .num_threads(1) @@ -153,9 +152,9 @@ impl Storage { Ok(self) } - pub fn load<'a>( + pub fn load( &mut self, - store: Backend<'a>, + store: Backend<'_>, // threading_strategy: ThreadingStrategy, TODO: implement this ) -> StorageResult<&mut Self> { let operator = match store { diff --git a/src/storage/params.rs b/src/storage/params.rs index d453a66..a21559b 100644 --- a/src/storage/params.rs +++ b/src/storage/params.rs @@ -38,7 +38,7 @@ pub enum ReferenceSystem { pub struct Dimensionality { graph_size: Option, pub(crate) first_term_size: usize, - second_term_size: usize, + _second_term_size: usize, pub(crate) third_term_size: usize, } @@ -91,7 +91,7 @@ impl Dimensionality { ReferenceSystem::POS | ReferenceSystem::PSO => dictionary.predicates_size(), ReferenceSystem::OPS | ReferenceSystem::OSP => dictionary.objects_size(), }, - second_term_size: match dictionary.get_reference_system() { + _second_term_size: match dictionary.get_reference_system() { ReferenceSystem::PSO | ReferenceSystem::OSP => dictionary.subjects_size(), ReferenceSystem::SPO | ReferenceSystem::OPS => dictionary.predicates_size(), ReferenceSystem::SOP | ReferenceSystem::POS => dictionary.objects_size(), @@ -112,9 +112,9 @@ impl Dimensionality { self.first_term_size as u64 } - pub(crate) fn get_second_term_size(&self) -> u64 { - self.second_term_size as u64 - } + // pub(crate) fn get_second_term_size(&self) -> u64 { + // self._second_term_size as u64 + // } pub(crate) fn get_third_term_size(&self) -> u64 { self.third_term_size as u64 diff --git a/tests/common/mod.rs b/tests/common/mod.rs index daf67bd..c0d5b80 100644 --- a/tests/common/mod.rs +++ b/tests/common/mod.rs @@ -46,7 +46,7 @@ pub enum Subject { } impl Subject { - fn get_idx(self, dictionary: &Dictionary) -> usize { + pub(crate) fn get_idx(self, dictionary: &Dictionary) -> usize { dictionary.get_subject_idx_unchecked(self.into()) } } @@ -74,7 +74,7 @@ pub enum Predicate { } impl Predicate { - fn get_idx(self, dictionary: &Dictionary) -> usize { + pub(crate) fn get_idx(self, dictionary: &Dictionary) -> usize { dictionary.get_predicate_idx_unchecked(self.into()) } } @@ -107,7 +107,7 @@ pub enum Object { } impl Object { - fn get_idx(self, dictionary: &Dictionary) -> usize { + pub(crate) fn get_idx(self, dictionary: &Dictionary) -> usize { dictionary.get_object_idx_unchecked(self.into()) } } diff --git a/tests/get_predicate_test.rs b/tests/get_predicate_test.rs new file mode 100644 index 0000000..6352252 --- /dev/null +++ b/tests/get_predicate_test.rs @@ -0,0 +1,88 @@ +use remote_hdt::storage::layout::matrix::MatrixLayout; +use remote_hdt::storage::layout::tabular::TabularLayout; +use remote_hdt::storage::ops::Ops; +use remote_hdt::storage::ops::OpsFormat; +use remote_hdt::storage::params::Backend; +use remote_hdt::storage::params::ChunkingStrategy; +use remote_hdt::storage::params::ReferenceSystem; +use remote_hdt::storage::params::Serialization; +use 
remote_hdt::storage::Storage; +use sprs::TriMat; +use std::error::Error; + +mod common; + +#[test] +fn get_predicate_matrix_chunk_test() -> Result<(), Box> { + let mut storage = Storage::new(MatrixLayout, Serialization::Zarr); + + common::setup( + common::MATRIX_ZARR, + &mut storage, + ChunkingStrategy::Chunk, + ReferenceSystem::SPO, + ); + + let actual = match storage + .load(Backend::FileSystem(common::MATRIX_ZARR))? + .get_predicate(common::Predicate::InstanceOf.into())? + { + OpsFormat::Zarr(actual) => actual, + _ => unreachable!(), + }; + + if actual + == vec![ + 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 5, 0, 0, 0, + ] + { + Ok(()) + } else { + Err(String::from("Expected and actual results are not equals").into()) + } +} + +#[test] +fn get_predicate_tabular_test() -> Result<(), Box> { + let mut storage = Storage::new(TabularLayout, Serialization::Sparse); + + common::setup( + common::TABULAR_ZARR, + &mut storage, + ChunkingStrategy::Chunk, + ReferenceSystem::SPO, + ); + + let actual = match storage + .load(Backend::FileSystem(common::TABULAR_ZARR))? + .get_predicate(common::Predicate::InstanceOf.into())? + { + OpsFormat::SparseArray(actual) => actual, + _ => unreachable!(), + }; + + let mut expected = TriMat::new((4, 9)); + expected.add_triplet( + common::Subject::Alan.get_idx(&storage.get_dictionary()), + common::Object::Human.get_idx(&storage.get_dictionary()), + common::Predicate::InstanceOf.get_idx(&storage.get_dictionary()), + ); + expected.add_triplet( + common::Subject::Wilmslow.get_idx(&storage.get_dictionary()), + common::Object::Town.get_idx(&storage.get_dictionary()), + common::Predicate::InstanceOf.get_idx(&storage.get_dictionary()), + ); + expected.add_triplet( + common::Subject::Bombe.get_idx(&storage.get_dictionary()), + common::Object::Computer.get_idx(&storage.get_dictionary()), + common::Predicate::InstanceOf.get_idx(&storage.get_dictionary()), + ); + let expected = expected.to_csc(); + + if actual == expected { + Ok(()) + } else { + Err(String::from("Expected and actual results are not equals").into()) + } +} diff --git a/tests/orientation.rs b/tests/orientation.rs index 71c05df..cf6a737 100644 --- a/tests/orientation.rs +++ b/tests/orientation.rs @@ -79,49 +79,16 @@ fn orientation_pso_tabular_test() -> Result<(), Box> { .load(Backend::FileSystem(common::TABULAR_PSO_ZARR))? .get_predicate(common::Predicate::InstanceOf.into())? 
{ - OpsFormat::SparseArray(actual) => actual, + OpsFormat::Zarr(actual) => actual, _ => unreachable!(), }; - println!("{}", storage.get_sparse_array().unwrap().to_dense()); - - storage - .get_dictionary() - .subjects() - .iter() - .for_each(|(i, e)| println!("{} {}", i, std::str::from_utf8(&e).unwrap().to_string())); - - println!(); - - storage - .get_dictionary() - .predicates() - .iter() - .for_each(|(i, e)| println!("{} {}", i, std::str::from_utf8(&e).unwrap().to_string())); - - println!(); - - storage - .get_dictionary() - .objects() - .iter() - .for_each(|(i, e)| println!("{} {}", i, std::str::from_utf8(&e).unwrap().to_string())); - - println!( - "{:?}", - storage - .get_dictionary() - .get_subject_idx(common::Subject::Warrington.into()) - ); - - Ok(()) - - // if actual == vec![3, 1, 1] { - // Ok(()) - // } else { - // println!("{:?}", actual); - // Err(String::from("Expected and actual results are not equals").into()) - // } + if actual == vec![3, 1, 1] { + Ok(()) + } else { + println!("{:?}", actual); + Err(String::from("Expected and actual results are not equals").into()) + } } #[test] From a5ab4b1d51fc60b3245216a4aaf2030a24eb569f Mon Sep 17 00:00:00 2001 From: angelip2303 Date: Mon, 11 Mar 2024 12:28:24 +0000 Subject: [PATCH 08/10] improving the tests --- Cargo.toml | 2 +- src/engine/chunk.rs | 8 +--- src/storage/layout/mod.rs | 12 ++++- tests/common/mod.rs | 78 ++++++++++++++++++++++++++++++++ tests/get_object_test.rs | 30 ++++++++++--- tests/get_predicate_test.rs | 43 +++++++++++++++--- tests/get_subject_test.rs | 89 ++++++++++++++++++++++++++++++++----- tests/orientation.rs | 41 ++++++++++++++++- 8 files changed, 268 insertions(+), 35 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 95475c4..b2f7a51 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,7 +5,7 @@ version = "0.0.1" edition = "2021" [dependencies] -zarrs = { version = "0.12.3", default-features = false, features = [ "http", "gzip", "sharding", "opendal", "async", "ndarray" ] } +zarrs = { version = "0.12.4", default-features = false, features = [ "http", "gzip", "sharding", "opendal", "async", "ndarray", "crc32c" ] } clap = { version = "4.1.8", features = ["derive"] } serde_json = "1.0.108" thiserror = "1.0.50" diff --git a/src/engine/chunk.rs b/src/engine/chunk.rs index 0f37f6c..9870474 100644 --- a/src/engine/chunk.rs +++ b/src/engine/chunk.rs @@ -42,11 +42,7 @@ impl EngineStrategy> for Array { let objects = self.shape()[0]; let col = index as u64; let shape = ArraySubset::new_with_ranges(&[0..objects, col..col + 1]); - let array_subset = self.retrieve_array_subset(&shape).unwrap(); - let third_term_subset = array_subset - .windows(4) - .map(|w| u32::from_ne_bytes(w.try_into().unwrap())) - .collect::>(); - Ok(third_term_subset) + let array_subset = self.retrieve_array_subset_elements::(&shape)?; + Ok(array_subset) } } diff --git a/src/storage/layout/mod.rs b/src/storage/layout/mod.rs index 11657ad..6bb9338 100644 --- a/src/storage/layout/mod.rs +++ b/src/storage/layout/mod.rs @@ -77,10 +77,18 @@ pub trait LayoutOps { } if !remainder.is_empty() { + // first we count the number of shards that have been processed, and + // multiply it by the number of chunks in every shard. 
Hence, we will + // obtain the number of rows that have been processed + let rows_processed = count.load(Ordering::Relaxed) * rows_per_shard(&arr); + // then we obtain the size of the last shard that is going to be + // processed; it is equals to the size of the remainder + let last_shard_size = remainder.len() as u64; + // lastly, we store the elements in the provided subset arr.store_array_subset_elements::( &ArraySubset::new_with_start_shape( - vec![count.load(Ordering::Relaxed) * rows_per_shard(&arr), 0], - vec![remainder.len() as u64, columns_per_shard(&arr)], + vec![rows_processed, 0], + vec![last_shard_size, columns_per_shard(&arr)], )?, self.store_chunk_elements(remainder, columns), )?; diff --git a/tests/common/mod.rs b/tests/common/mod.rs index c0d5b80..e97dae3 100644 --- a/tests/common/mod.rs +++ b/tests/common/mod.rs @@ -192,3 +192,81 @@ impl Graph { ans.to_csc() } } + +pub fn set_expected_first_term_matrix( + expected: &mut Vec, + subject: Subject, + predicate: Predicate, + object: Object, + dictionary: &Dictionary, + reference_system: ReferenceSystem, +) { + let subject_idx = subject.get_idx(dictionary); + let predicate_idx = predicate.get_idx(dictionary); + let object_idx = object.get_idx(dictionary); + + match reference_system { + ReferenceSystem::SPO => expected[object_idx] = predicate_idx as u32, + ReferenceSystem::SOP => expected[predicate_idx] = object_idx as u32, + ReferenceSystem::PSO => expected[object_idx] = subject_idx as u32, + ReferenceSystem::POS => expected[subject_idx] = object_idx as u32, + ReferenceSystem::OSP => expected[predicate_idx] = subject_idx as u32, + ReferenceSystem::OPS => expected[subject_idx] = predicate_idx as u32, + } +} + +pub fn set_expected_second_term_matrix( + expected: &mut Vec, + subject: Subject, + predicate: Predicate, + object: Object, + dictionary: &Dictionary, + reference_system: ReferenceSystem, +) { + let subject_idx = subject.get_idx(dictionary); + let predicate_idx = predicate.get_idx(dictionary); + let object_idx = object.get_idx(dictionary); + + match reference_system { + ReferenceSystem::SPO => { + expected[subject_idx * dictionary.objects_size() + object_idx] = predicate_idx as u32 + } + ReferenceSystem::SOP => { + expected[subject_idx * dictionary.predicates_size() + predicate_idx] = object_idx as u32 + } + ReferenceSystem::PSO => { + expected[predicate_idx * dictionary.objects_size() + object_idx] = subject_idx as u32 + } + ReferenceSystem::POS => { + expected[predicate_idx * dictionary.subjects_size() + subject_idx] = object_idx as u32 + } + ReferenceSystem::OSP => { + expected[object_idx * dictionary.predicates_size() + predicate_idx] = subject_idx as u32 + } + ReferenceSystem::OPS => { + expected[object_idx * dictionary.subjects_size() + subject_idx] = predicate_idx as u32 + } + } +} + +pub fn set_expected_third_term_matrix( + expected: &mut Vec, + subject: Subject, + predicate: Predicate, + object: Object, + dictionary: &Dictionary, + reference_system: ReferenceSystem, +) { + let subject_idx = subject.get_idx(dictionary); + let predicate_idx = predicate.get_idx(dictionary); + let object_idx = object.get_idx(dictionary); + + match reference_system { + ReferenceSystem::SPO => expected[subject_idx] = predicate_idx as u32, + ReferenceSystem::SOP => expected[subject_idx] = object_idx as u32, + ReferenceSystem::PSO => expected[predicate_idx] = subject_idx as u32, + ReferenceSystem::POS => expected[predicate_idx] = object_idx as u32, + ReferenceSystem::OSP => expected[object_idx] = subject_idx as u32, + ReferenceSystem::OPS 
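The comments added around the remainder branch above describe the arithmetic for the final, partially filled shard: the rows already written equal the number of stored shards times the rows per shard, and the leftover subset starts right after them. A tiny sketch of that calculation with hypothetical sizes; the function name is illustrative only.

    // Origin row and height of the array subset that receives the leftover rows.
    fn remainder_subset(shards_stored: u64, rows_per_shard: u64, leftover_rows: u64) -> (u64, u64) {
        let rows_processed = shards_stored * rows_per_shard; // rows covered by full shards
        (rows_processed, leftover_rows)
    }

    fn main() {
        // 4 full shards of 3 rows plus 2 leftover rows: the subset starts at row 12 and is 2 rows tall.
        assert_eq!(remainder_subset(4, 3, 2), (12, 2));
    }
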
=> expected[object_idx] = predicate_idx as u32, + } +} diff --git a/tests/get_object_test.rs b/tests/get_object_test.rs index 03f8e38..a6d0882 100644 --- a/tests/get_object_test.rs +++ b/tests/get_object_test.rs @@ -1,3 +1,4 @@ +use common::set_expected_third_term_matrix; use remote_hdt::storage::layout::matrix::MatrixLayout; use remote_hdt::storage::layout::tabular::TabularLayout; use remote_hdt::storage::ops::Ops; @@ -19,7 +20,7 @@ fn get_object_matrix_sharding_test() -> Result<(), Box> { common::setup( common::SHARDING_ZARR, &mut storage, - ChunkingStrategy::Sharding(3), + ChunkingStrategy::Sharding(4), ReferenceSystem::SPO, ); @@ -31,10 +32,19 @@ fn get_object_matrix_sharding_test() -> Result<(), Box> { _ => unreachable!(), }; - if actual == vec![2, 0, 0, 0, 0] { + let mut expected = vec![0u32; storage.get_dictionary().subjects_size()]; + set_expected_third_term_matrix( + &mut expected, + common::Subject::Alan, + common::Predicate::DateOfBirth, + common::Object::Date, + &storage.get_dictionary(), + ReferenceSystem::SPO, + ); + + if actual == expected { Ok(()) } else { - println!("{:?}", actual); Err(String::from("Expected and actual results are not equals").into()) } } @@ -58,11 +68,17 @@ fn get_object_tabular_test() -> Result<(), Box> { _ => unreachable!(), }; - let mut expected = TriMat::new((4, 9)); - expected.add_triplet(1, 3, 3); - let expected = expected.to_csc(); + let mut expected = TriMat::new(( + storage.get_dictionary().subjects_size(), + storage.get_dictionary().objects_size(), + )); + expected.add_triplet( + common::Subject::Bombe.get_idx(&storage.get_dictionary()), + common::Object::Alan.get_idx(&storage.get_dictionary()), + common::Predicate::Discoverer.get_idx(&storage.get_dictionary()), + ); - if actual == expected { + if actual == expected.to_csc() { Ok(()) } else { Err(String::from("Expected and actual results are not equals").into()) diff --git a/tests/get_predicate_test.rs b/tests/get_predicate_test.rs index 6352252..be46da1 100644 --- a/tests/get_predicate_test.rs +++ b/tests/get_predicate_test.rs @@ -1,3 +1,4 @@ +use common::set_expected_second_term_matrix; use remote_hdt::storage::layout::matrix::MatrixLayout; use remote_hdt::storage::layout::tabular::TabularLayout; use remote_hdt::storage::ops::Ops; @@ -31,12 +32,37 @@ fn get_predicate_matrix_chunk_test() -> Result<(), Box> { _ => unreachable!(), }; - if actual - == vec![ - 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 5, 0, 0, 0, - ] - { + let mut expected = vec![ + 0u32; + storage.get_dictionary().subjects_size() + * storage.get_dictionary().objects_size() + ]; + set_expected_second_term_matrix( + &mut expected, + common::Subject::Alan, + common::Predicate::InstanceOf, + common::Object::Human, + &storage.get_dictionary(), + ReferenceSystem::SPO, + ); + set_expected_second_term_matrix( + &mut expected, + common::Subject::Wilmslow, + common::Predicate::InstanceOf, + common::Object::Town, + &storage.get_dictionary(), + ReferenceSystem::SPO, + ); + set_expected_second_term_matrix( + &mut expected, + common::Subject::Bombe, + common::Predicate::InstanceOf, + common::Object::Computer, + &storage.get_dictionary(), + ReferenceSystem::SPO, + ); + + if actual == expected { Ok(()) } else { Err(String::from("Expected and actual results are not equals").into()) @@ -62,7 +88,10 @@ fn get_predicate_tabular_test() -> Result<(), Box> { _ => unreachable!(), }; - let mut expected = TriMat::new((4, 9)); + let mut expected = TriMat::new(( + 
storage.get_dictionary().subjects_size(), + storage.get_dictionary().objects_size(), + )); expected.add_triplet( common::Subject::Alan.get_idx(&storage.get_dictionary()), common::Object::Human.get_idx(&storage.get_dictionary()), diff --git a/tests/get_subject_test.rs b/tests/get_subject_test.rs index 171e744..fdc8adc 100644 --- a/tests/get_subject_test.rs +++ b/tests/get_subject_test.rs @@ -1,3 +1,4 @@ +use common::set_expected_first_term_matrix; use remote_hdt::storage::layout::matrix::MatrixLayout; use remote_hdt::storage::layout::tabular::TabularLayout; use remote_hdt::storage::ops::Ops; @@ -31,7 +32,49 @@ fn get_subject_matrix_chunk_test() -> Result<(), Box> { _ => unreachable!(), }; - if actual == vec![2, 4, 5, 0, 0, 0, 0, 7, 8] { + let mut expected = vec![0u32; storage.get_dictionary().objects_size()]; + set_expected_first_term_matrix( + &mut expected, + common::Subject::Alan, + common::Predicate::InstanceOf, + common::Object::Human, + &storage.get_dictionary(), + ReferenceSystem::SPO, + ); + set_expected_first_term_matrix( + &mut expected, + common::Subject::Alan, + common::Predicate::PlaceOfBirth, + common::Object::Warrington, + &storage.get_dictionary(), + ReferenceSystem::SPO, + ); + set_expected_first_term_matrix( + &mut expected, + common::Subject::Alan, + common::Predicate::PlaceOfDeath, + common::Object::Wilmslow, + &storage.get_dictionary(), + ReferenceSystem::SPO, + ); + set_expected_first_term_matrix( + &mut expected, + common::Subject::Alan, + common::Predicate::DateOfBirth, + common::Object::Date, + &storage.get_dictionary(), + ReferenceSystem::SPO, + ); + set_expected_first_term_matrix( + &mut expected, + common::Subject::Alan, + common::Predicate::Employer, + common::Object::GCHQ, + &storage.get_dictionary(), + ReferenceSystem::SPO, + ); + + if actual == expected { Ok(()) } else { Err(String::from("Expected and actual results are not equals").into()) @@ -57,7 +100,11 @@ fn get_subject_matrix_sharding_test() -> Result<(), Box> { _ => unreachable!(), }; - if actual == vec![0, 0, 0, 0, 0, 5, 1, 0, 0] { + let mut expected = vec![0u32; storage.get_dictionary().objects_size()]; + expected[5] = common::Predicate::InstanceOf.get_idx(&storage.get_dictionary()) as u32; + expected[6] = common::Predicate::Country.get_idx(&storage.get_dictionary()) as u32; + + if actual == expected { Ok(()) } else { Err(String::from("Expected and actual results are not equals").into()) @@ -83,15 +130,37 @@ fn get_subject_tabular_test() -> Result<(), Box> { _ => unreachable!(), }; - let mut expected = TriMat::new((4, 9)); - expected.add_triplet(0, 0, 2); - expected.add_triplet(0, 1, 4); - expected.add_triplet(0, 2, 5); - expected.add_triplet(0, 7, 7); - expected.add_triplet(0, 8, 8); - let expected = expected.to_csc(); + let mut expected = TriMat::new(( + storage.get_dictionary().subjects_size(), + storage.get_dictionary().objects_size(), + )); + expected.add_triplet( + common::Subject::Alan.get_idx(&storage.get_dictionary()), + common::Object::Human.get_idx(&storage.get_dictionary()), + common::Predicate::InstanceOf.get_idx(&storage.get_dictionary()), + ); + expected.add_triplet( + common::Subject::Alan.get_idx(&storage.get_dictionary()), + common::Object::Warrington.get_idx(&storage.get_dictionary()), + common::Predicate::PlaceOfBirth.get_idx(&storage.get_dictionary()), + ); + expected.add_triplet( + common::Subject::Alan.get_idx(&storage.get_dictionary()), + common::Object::Wilmslow.get_idx(&storage.get_dictionary()), + common::Predicate::PlaceOfDeath.get_idx(&storage.get_dictionary()), + ); + 
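The expected buffers used by the matrix-layout assertions above are flat, row-major vectors, so the set_expected_*_matrix helpers address a (row, column) cell as row * number_of_columns + column; in the SPO case that is subject_idx * objects_size + object_idx, with the predicate id as the stored value. A minimal sketch with hypothetical sizes (the helper name and the stored value are illustrative):

    fn flat_index(row: usize, col: usize, n_cols: usize) -> usize {
        row * n_cols + col
    }

    fn main() {
        // 4 subjects x 9 objects: the cell (subject 2, object 5) lives at position 23.
        let mut expected = vec![0u32; 4 * 9];
        expected[flat_index(2, 5, 9)] = 7; // 7 stands in for a predicate index
        assert_eq!(expected[23], 7);
    }
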
expected.add_triplet( + common::Subject::Alan.get_idx(&storage.get_dictionary()), + common::Object::Date.get_idx(&storage.get_dictionary()), + common::Predicate::DateOfBirth.get_idx(&storage.get_dictionary()), + ); + expected.add_triplet( + common::Subject::Alan.get_idx(&storage.get_dictionary()), + common::Object::GCHQ.get_idx(&storage.get_dictionary()), + common::Predicate::Employer.get_idx(&storage.get_dictionary()), + ); - if actual == expected { + if actual == expected.to_csc() { Ok(()) } else { Err(String::from("Expected and actual results are not equals").into()) diff --git a/tests/orientation.rs b/tests/orientation.rs index cf6a737..5228fb4 100644 --- a/tests/orientation.rs +++ b/tests/orientation.rs @@ -1,3 +1,4 @@ +use common::set_expected_first_term_matrix; use remote_hdt::storage::layout::matrix::MatrixLayout; use remote_hdt::storage::layout::tabular::TabularLayout; use remote_hdt::storage::ops::Ops; @@ -30,7 +31,33 @@ fn orientation_pso_matrix_test() -> Result<(), Box> { _ => unreachable!(), }; - if actual == vec![3, 0, 1] { + let mut expected = vec![0u32; storage.get_dictionary().objects_size()]; + set_expected_first_term_matrix( + &mut expected, + common::Subject::Alan, + common::Predicate::InstanceOf, + common::Object::Human, + &storage.get_dictionary(), + ReferenceSystem::PSO, + ); + set_expected_first_term_matrix( + &mut expected, + common::Subject::Wilmslow, + common::Predicate::InstanceOf, + common::Object::Town, + &storage.get_dictionary(), + ReferenceSystem::PSO, + ); + set_expected_first_term_matrix( + &mut expected, + common::Subject::Bombe, + common::Predicate::InstanceOf, + common::Object::Computer, + &storage.get_dictionary(), + ReferenceSystem::PSO, + ); + + if actual == expected { Ok(()) } else { Err(String::from("Expected and actual results are not equals").into()) @@ -56,7 +83,17 @@ fn orientation_ops_matrix_test() -> Result<(), Box> { _ => unreachable!(), }; - if actual == vec![0, 3, 0, 0] { + let mut expected = vec![0u32; storage.get_dictionary().subjects_size()]; + set_expected_first_term_matrix( + &mut expected, + common::Subject::Bombe, + common::Predicate::Discoverer, + common::Object::Alan, + &storage.get_dictionary(), + ReferenceSystem::OPS, + ); + + if actual == expected { Ok(()) } else { println!("{:?}", actual); From 3fd018d8112583b50ab53fbd1bb08945a4ab1074 Mon Sep 17 00:00:00 2001 From: angelip2303 Date: Mon, 11 Mar 2024 15:04:01 +0000 Subject: [PATCH 09/10] errors finally fixed --- Cargo.toml | 5 +--- examples/query_bench.rs | 4 +-- examples/serialize.rs | 4 --- src/engine/chunk.rs | 2 +- src/error.rs | 2 -- src/storage/layout/mod.rs | 12 +++++--- src/storage/mod.rs | 52 ++++++++++----------------------- src/storage/ops.rs | 4 ++- src/utils.rs | 4 +-- tests/get_object_test.rs | 2 +- tests/orientation.rs | 61 +++++++++++++++++++++++++++++++++++---- 11 files changed, 90 insertions(+), 62 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index b2f7a51..ef7b0c2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,7 +5,7 @@ version = "0.0.1" edition = "2021" [dependencies] -zarrs = { version = "0.12.4", default-features = false, features = [ "http", "gzip", "sharding", "opendal", "async", "ndarray", "crc32c" ] } +zarrs = { version = "0.12.4", default-features = false, features = [ "http", "gzip", "sharding", "async", "ndarray", "crc32c" ] } clap = { version = "4.1.8", features = ["derive"] } serde_json = "1.0.108" thiserror = "1.0.50" @@ -17,9 +17,6 @@ rio_api = "0.8.4" rayon = "1.8.0" parking_lot = "0.12" -[target.'cfg(not(target_env = 
"msvc"))'.dependencies] -jemallocator = "0.5.0" - [profile.release] codegen-units = 1 opt-level = 3 diff --git a/examples/query_bench.rs b/examples/query_bench.rs index f97e448..711effc 100644 --- a/examples/query_bench.rs +++ b/examples/query_bench.rs @@ -6,7 +6,7 @@ use remote_hdt::storage::Storage; use std::env; use std::time::Instant; -const SUBJECT: &str = ""; +const SUBJECT: &str = ""; fn main() -> Result<(), RemoteHDTError> { let args: Vec = env::args().collect(); @@ -21,7 +21,7 @@ fn main() -> Result<(), RemoteHDTError> { let arr = binding.load(Backend::FileSystem(format!("{}.zarr", zarr_path).as_str()))?; let before = Instant::now(); - arr.get_subject(SUBJECT)?; + arr.get_object(SUBJECT)?; println!("Elapsed time: {:.2?}", before.elapsed()); diff --git a/examples/serialize.rs b/examples/serialize.rs index 421c892..9ca4192 100644 --- a/examples/serialize.rs +++ b/examples/serialize.rs @@ -5,10 +5,6 @@ use remote_hdt::storage::Storage; use std::env; use std::time::Instant; -#[cfg(not(target_env = "msvc"))] -#[global_allocator] -static ALLOCATOR: jemallocator::Jemalloc = jemallocator::Jemalloc; - fn main() -> Result<(), RemoteHDTError> { let args: Vec = env::args().collect(); if args.len() <= 3 { diff --git a/src/engine/chunk.rs b/src/engine/chunk.rs index 9870474..edf515c 100644 --- a/src/engine/chunk.rs +++ b/src/engine/chunk.rs @@ -9,7 +9,7 @@ use crate::utils::rows_per_shard; use super::EngineResult; use super::EngineStrategy; -impl EngineStrategy> for Array { +impl EngineStrategy> for Array { fn get_first_term(&self, index: usize) -> EngineResult> { let shard_index = index as u64 / rows_per_shard(self); let shard = self.retrieve_chunk_elements(&[shard_index, 0])?; diff --git a/src/error.rs b/src/error.rs index 7868acb..a810e88 100644 --- a/src/error.rs +++ b/src/error.rs @@ -52,8 +52,6 @@ pub enum RemoteHDTError { TripleSerialization, #[error("The provided path is not valid")] OsPathToString, - #[error(transparent)] - Opendal(#[from] zarrs::opendal::Error), #[error("The provided backend is read-only")] ReadOnlyBackend, #[error("Error while parsing the RDF graph")] diff --git a/src/storage/layout/mod.rs b/src/storage/layout/mod.rs index 6bb9338..220d673 100644 --- a/src/storage/layout/mod.rs +++ b/src/storage/layout/mod.rs @@ -9,7 +9,8 @@ use zarrs::array::DataType; use zarrs::array::DimensionName; use zarrs::array::FillValue; use zarrs::array_subset::ArraySubset; -use zarrs::storage::store::OpendalStore; +use zarrs::storage::store::FilesystemStore; +use zarrs::storage::ReadableStorageTraits; use crate::dictionary::Dictionary; use crate::error::RemoteHDTError; @@ -30,7 +31,10 @@ pub mod matrix; pub mod tabular; pub trait LayoutOps { - fn retrieve_attributes(&mut self, arr: &Array) -> StorageResult { + fn retrieve_attributes( + &mut self, + arr: &Array, + ) -> StorageResult { // 4. 
We get the attributes so we can obtain some values that we will need let attributes = arr.attributes(); @@ -63,7 +67,7 @@ pub trait LayoutOps { )) } - fn serialize(&mut self, arr: Array, graph: Graph) -> StorageResult<()> { + fn serialize(&mut self, arr: &Array, graph: Graph) -> StorageResult<()> { let columns = arr.shape()[1] as usize; let count = AtomicU64::new(0); let binding = self.graph_iter(graph.to_owned()); @@ -99,7 +103,7 @@ pub trait LayoutOps { fn parse( &mut self, - arr: &Array, + arr: &Array, dimensionality: &Dimensionality, ) -> StorageResult { // First, we create the 2D matrix in such a manner that the number of diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 8035e77..2047967 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -6,11 +6,11 @@ use std::sync::atomic::AtomicU32; use std::sync::Arc; use zarrs::array::Array; use zarrs::array::ArrayBuilder; +use zarrs::array_subset::ArraySubset; use zarrs::group::GroupBuilder; -use zarrs::opendal::services::Fs; -use zarrs::opendal::services::Http; -use zarrs::opendal::Operator; -use zarrs::storage::store::OpendalStore; +use zarrs::storage::store::FilesystemStore; +use zarrs::storage::store::HTTPStore; +use zarrs::storage::ReadableStorageTraits; use crate::dictionary::Dictionary; use crate::error::RemoteHDTError; @@ -41,7 +41,7 @@ pub struct Storage { layout: Box>, serialization: Serialization, reference_system: ReferenceSystem, - array: Option>, + array: Option>, sparse_array: Option, } @@ -78,29 +78,20 @@ impl Storage { reference_system: ReferenceSystem, // threading_strategy: ThreadingStrategy, TODO: implement this ) -> StorageResult<&mut Self> { - let operator = match store { + let path = match store { Backend::FileSystem(path) => { - let mut builder = Fs::default(); let path = PathBuf::from_str(path)?; match path.exists() { true => return Err(RemoteHDTError::PathExists), - false => { - let path = match path.into_os_string().into_string() { - Ok(string) => string, - Err(_) => return Err(RemoteHDTError::OsPathToString), - }; - builder.root(&path); - } + false => path, } - - Operator::new(builder)?.finish() } Backend::HTTP(_) => return Err(RemoteHDTError::ReadOnlyBackend), }; // 2. 
We can create the FileSystemStore appropiately - let store = Arc::new(OpendalStore::new(operator.blocking())); + let store = Arc::new(FilesystemStore::new(path)?); // Create a group and write metadata to filesystem let group = GroupBuilder::new().build(store.clone(), "/group")?; @@ -144,10 +135,13 @@ impl Storage { attributes.insert("reference_system".into(), reference_system.as_ref().into()); attributes }) - .build(store, ARRAY_NAME)?; + .build(store.clone(), ARRAY_NAME)?; arr.store_metadata()?; - self.layout.serialize(arr, graph)?; + self.layout.serialize(&arr, graph)?; + + let shape = ArraySubset::new_with_ranges(&[0..10, 1..2]); + arr.retrieve_array_subset_elements::(&shape).unwrap(); Ok(self) } @@ -157,32 +151,18 @@ impl Storage { store: Backend<'_>, // threading_strategy: ThreadingStrategy, TODO: implement this ) -> StorageResult<&mut Self> { - let operator = match store { + let store: Arc = match store { Backend::FileSystem(path) => { - let mut builder = Fs::default(); let path = PathBuf::from_str(path)?; match path.exists() { false => return Err(RemoteHDTError::PathDoesNotExist), - true => { - let path = match path.into_os_string().into_string() { - Ok(string) => string, - Err(_) => return Err(RemoteHDTError::OsPathToString), - }; - builder.root(&path); - } + true => Arc::new(FilesystemStore::new(path)?), } - - Operator::new(builder)?.finish() - } - Backend::HTTP(path) => { - let mut builder = Http::default(); - builder.endpoint(path); - Operator::new(builder)?.finish() } + Backend::HTTP(url) => Arc::new(HTTPStore::new(url)?), }; - let store: Arc = Arc::new(OpendalStore::new(operator.blocking())); let arr = Array::new(store, ARRAY_NAME)?; let dictionary = self.layout.retrieve_attributes(&arr)?; self.dictionary = dictionary; diff --git a/src/storage/ops.rs b/src/storage/ops.rs index b2fb565..afb7de2 100644 --- a/src/storage/ops.rs +++ b/src/storage/ops.rs @@ -39,7 +39,9 @@ impl Ops for Storage { Some(array) => OpsFormat::SparseArray(match self.reference_system { ReferenceSystem::SPO | ReferenceSystem::SOP => array.get_first_term(index)?, ReferenceSystem::PSO | ReferenceSystem::OSP => array.get_second_term(index)?, - ReferenceSystem::POS | ReferenceSystem::OPS => array.get_third_term(index)?, + ReferenceSystem::POS | ReferenceSystem::OPS => { + array.get_third_term(index).unwrap() + } }), None => return Err(OpsError::EmptySparseArray), }, diff --git a/src/utils.rs b/src/utils.rs index eaf6396..1c498a1 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -31,7 +31,7 @@ pub fn hash_to_set(terms: HashSet) -> Vec { vec } -pub fn rows_per_shard(arr: &Array) -> u64 { +pub fn rows_per_shard(arr: &Array) -> u64 { match arr.chunk_grid().chunk_shape(&[0, 0], arr.shape()) { Ok(shape) => match shape { Some(chunk_shape) => chunk_shape[0].into(), @@ -41,7 +41,7 @@ pub fn rows_per_shard(arr: &Array) -> u64 { } } -pub fn columns_per_shard(arr: &Array) -> u64 { +pub fn columns_per_shard(arr: &Array) -> u64 { match arr.chunk_grid().chunk_shape(&[0, 0], arr.shape()) { Ok(shape) => match shape { Some(chunk_shape) => chunk_shape[1].into(), diff --git a/tests/get_object_test.rs b/tests/get_object_test.rs index a6d0882..984ab83 100644 --- a/tests/get_object_test.rs +++ b/tests/get_object_test.rs @@ -20,7 +20,7 @@ fn get_object_matrix_sharding_test() -> Result<(), Box> { common::setup( common::SHARDING_ZARR, &mut storage, - ChunkingStrategy::Sharding(4), + ChunkingStrategy::Sharding(3), ReferenceSystem::SPO, ); diff --git a/tests/orientation.rs b/tests/orientation.rs index 5228fb4..7b56d25 100644 --- 
a/tests/orientation.rs +++ b/tests/orientation.rs @@ -8,6 +8,7 @@ use remote_hdt::storage::params::ChunkingStrategy; use remote_hdt::storage::params::ReferenceSystem; use remote_hdt::storage::params::Serialization; use remote_hdt::storage::Storage; +use sprs::TriMat; use std::error::Error; mod common; @@ -116,11 +117,31 @@ fn orientation_pso_tabular_test() -> Result<(), Box> { .load(Backend::FileSystem(common::TABULAR_PSO_ZARR))? .get_predicate(common::Predicate::InstanceOf.into())? { - OpsFormat::Zarr(actual) => actual, + OpsFormat::SparseArray(actual) => actual, _ => unreachable!(), }; - if actual == vec![3, 1, 1] { + let mut expected = TriMat::new(( + storage.get_dictionary().predicates_size(), + storage.get_dictionary().objects_size(), + )); + expected.add_triplet( + common::Predicate::InstanceOf.get_idx(&storage.get_dictionary()), + common::Object::Human.get_idx(&storage.get_dictionary()), + common::Subject::Alan.get_idx(&storage.get_dictionary()), + ); + expected.add_triplet( + common::Predicate::InstanceOf.get_idx(&storage.get_dictionary()), + common::Object::Town.get_idx(&storage.get_dictionary()), + common::Subject::Wilmslow.get_idx(&storage.get_dictionary()), + ); + expected.add_triplet( + common::Predicate::InstanceOf.get_idx(&storage.get_dictionary()), + common::Object::Computer.get_idx(&storage.get_dictionary()), + common::Subject::Bombe.get_idx(&storage.get_dictionary()), + ); + + if actual == expected.to_csc() { Ok(()) } else { println!("{:?}", actual); @@ -130,7 +151,7 @@ fn orientation_pso_tabular_test() -> Result<(), Box> { #[test] fn orientation_ops_tabular_test() -> Result<(), Box> { - let mut storage = Storage::new(TabularLayout, Serialization::Zarr); + let mut storage = Storage::new(TabularLayout, Serialization::Sparse); common::setup( common::TABULAR_OPS_ZARR, @@ -143,11 +164,41 @@ fn orientation_ops_tabular_test() -> Result<(), Box> { .load(Backend::FileSystem(common::TABULAR_OPS_ZARR))? .get_subject(common::Subject::Alan.into())? 
{ - OpsFormat::Zarr(actual) => actual, + OpsFormat::SparseArray(actual) => actual, _ => unreachable!(), }; - if actual == vec![1, 3, 4, 0, 0, 0, 0, 6, 7] { + let mut expected = TriMat::new(( + storage.get_dictionary().objects_size(), + storage.get_dictionary().subjects_size(), + )); + expected.add_triplet( + common::Object::Human.get_idx(&storage.get_dictionary()), + common::Subject::Alan.get_idx(&storage.get_dictionary()), + common::Predicate::InstanceOf.get_idx(&storage.get_dictionary()), + ); + expected.add_triplet( + common::Object::Warrington.get_idx(&storage.get_dictionary()), + common::Subject::Alan.get_idx(&storage.get_dictionary()), + common::Predicate::PlaceOfBirth.get_idx(&storage.get_dictionary()), + ); + expected.add_triplet( + common::Object::Wilmslow.get_idx(&storage.get_dictionary()), + common::Subject::Alan.get_idx(&storage.get_dictionary()), + common::Predicate::PlaceOfDeath.get_idx(&storage.get_dictionary()), + ); + expected.add_triplet( + common::Object::Date.get_idx(&storage.get_dictionary()), + common::Subject::Alan.get_idx(&storage.get_dictionary()), + common::Predicate::DateOfBirth.get_idx(&storage.get_dictionary()), + ); + expected.add_triplet( + common::Object::GCHQ.get_idx(&storage.get_dictionary()), + common::Subject::Alan.get_idx(&storage.get_dictionary()), + common::Predicate::Employer.get_idx(&storage.get_dictionary()), + ); + + if actual == expected.to_csc() { Ok(()) } else { Err(String::from("Expected and actual results are not equals").into()) From 598c20455280a684de13dccdf52ae45e1f983083 Mon Sep 17 00:00:00 2001 From: angelip2303 Date: Mon, 11 Mar 2024 15:10:49 +0000 Subject: [PATCH 10/10] clippy now fixed --- src/storage/layout/mod.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/storage/layout/mod.rs b/src/storage/layout/mod.rs index 220d673..bca148f 100644 --- a/src/storage/layout/mod.rs +++ b/src/storage/layout/mod.rs @@ -71,7 +71,7 @@ pub trait LayoutOps { let columns = arr.shape()[1] as usize; let count = AtomicU64::new(0); let binding = self.graph_iter(graph.to_owned()); - let iter = binding.chunks_exact(rows_per_shard(&arr) as usize); + let iter = binding.chunks_exact(rows_per_shard(arr) as usize); let remainder = iter.remainder(); for chunk in iter { @@ -84,7 +84,7 @@ pub trait LayoutOps { // first we count the number of shards that have been processed, and // multiply it by the number of chunks in every shard. Hence, we will // obtain the number of rows that have been processed - let rows_processed = count.load(Ordering::Relaxed) * rows_per_shard(&arr); + let rows_processed = count.load(Ordering::Relaxed) * rows_per_shard(arr); // then we obtain the size of the last shard that is going to be // processed; it is equals to the size of the remainder let last_shard_size = remainder.len() as u64; @@ -92,7 +92,7 @@ pub trait LayoutOps { arr.store_array_subset_elements::( &ArraySubset::new_with_start_shape( vec![rows_processed, 0], - vec![last_shard_size, columns_per_shard(&arr)], + vec![last_shard_size, columns_per_shard(arr)], )?, self.store_chunk_elements(remainder, columns), )?;
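
The remainder handling above hinges on one piece of arithmetic: full shards are written by the chunks_exact loop, and the trailing partial shard starts at row shards_processed * rows_per_shard with a height equal to the leftover row count. A minimal sketch of that offset calculation over plain slices, assuming an illustrative shard height of 4 and 10 rows of data (neither value comes from the patch, and no zarrs array is involved):

    fn main() {
        let rows_per_shard: usize = 4;             // illustrative shard height
        let rows: Vec<u32> = (0u32..10).collect(); // 10 rows -> 2 full shards + 2 leftover rows

        let mut shards_processed: usize = 0;
        let iter = rows.chunks_exact(rows_per_shard);
        let remainder = iter.remainder();

        for shard in iter {
            // each full shard starts at row shards_processed * rows_per_shard
            let start_row = shards_processed * rows_per_shard;
            println!("full shard of {} rows written at row {}", shard.len(), start_row);
            shards_processed += 1;
        }

        // rows already written = number of full shards * rows per shard
        let rows_processed = shards_processed * rows_per_shard;
        // the trailing partial shard holds whatever is left over
        let last_shard_size = remainder.len();
        println!("partial shard of {} rows written at row {}", last_shard_size, rows_processed);
    }
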
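The test changes earlier in this series all follow the same expected-matrix pattern: a sprs::TriMat is sized from the dictionary, each triple is added with the row indexed by the first term of the reference system, the column by the third term, and the second term's index as the stored value, and the comparison happens after converting to CSC. A minimal standalone sketch of that pattern, reusing the hard-coded shape (4, 9) and triplet (0, 0, 2) that the patch replaces with dictionary lookups (the nnz assertion is only illustrative):

    use sprs::{CsMat, TriMat};

    fn main() {
        // shape: (number of subjects, number of objects) in an SPO layout
        let mut expected = TriMat::new((4, 9));
        // subject 0 relates to object 0 through predicate 2
        expected.add_triplet(0, 0, 2usize);

        // the tests compare against the actual result in CSC form
        let expected: CsMat<usize> = expected.to_csc();
        assert_eq!(expected.nnz(), 1);
    }
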