Skip to content

Commit

Permalink
Merge pull request #14 from weso/several-stores-get-predicate-and-optimizations
Browse files Browse the repository at this point in the history

All tests pass, and I have checked the new project structure
  • Loading branch information
DiegoMfer authored Mar 11, 2024
2 parents 47dba01 + 598c204 commit f30f4f0
Show file tree
Hide file tree
Showing 35 changed files with 1,763 additions and 710 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ benches/*/*.nt
!resources/root.zarr
.vscode
heaptrack.*
tests/out
tests/out
uniprotkb_*
7 changes: 2 additions & 5 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ version = "0.0.1"
edition = "2021"

[dependencies]
zarrs = { version = "0.6.0", default-features = false, features = [ "http", "gzip", "sharding" ] }
zarrs = { version = "0.12.4", default-features = false, features = [ "http", "gzip", "sharding", "async", "ndarray", "crc32c" ] }
clap = { version = "4.1.8", features = ["derive"] }
serde_json = "1.0.108"
thiserror = "1.0.50"
Expand All @@ -14,11 +14,8 @@ sprs = "0.11.1"
rio_turtle = "0.8.4"
rio_xml = "0.8.4"
rio_api = "0.8.4"
safe-transmute = "0.11.2"
rayon = "1.8.0"

[target.'cfg(not(target_env = "msvc"))'.dependencies]
jemallocator = "0.5.0"
parking_lot = "0.12"

[profile.release]
codegen-units = 1
Expand Down
28 changes: 14 additions & 14 deletions examples/http_bench.rs
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
use remote_hdt::engine::EngineStrategy;
use remote_hdt::storage::matrix::MatrixLayout;
use remote_hdt::storage::HTTPStorage;
use remote_hdt::error::RemoteHDTError;
use remote_hdt::storage::layout::matrix::MatrixLayout;
use remote_hdt::storage::ops::Ops;
use remote_hdt::storage::params::{Backend, Serialization};
use remote_hdt::storage::Storage;
use std::time::Instant;

fn main() {
let mut remote_hdt = HTTPStorage::new(MatrixLayout);
let arr = remote_hdt
.connect("https://raw.githubusercontent.com/weso/RemoteHDT/master/resources/root.zarr")
.unwrap();
let index = remote_hdt
.get_dictionary()
.get_subject_idx_unchecked("<http://example.org/alan>");
fn main() -> Result<(), RemoteHDTError> {
let mut binding = Storage::new(MatrixLayout, Serialization::Zarr);
let arr = binding.load(Backend::HTTP(
"https://raw.githubusercontent.com/weso/RemoteHDT/master/resources/root.zarr",
))?;

let before = Instant::now();
arr.get_subject(index).unwrap();
let after = before.elapsed();
arr.get_subject("<http://example.org/alan>")?;

println!("Elapsed time: {:.2?}", after)
println!("Elapsed time: {:.2?}", before.elapsed());

Ok(())
}
20 changes: 12 additions & 8 deletions examples/load_bench.rs
Original file line number Diff line number Diff line change
@@ -1,21 +1,25 @@
use remote_hdt::storage::tabular::TabularLayout;
use remote_hdt::storage::LocalStorage;
use remote_hdt::error::RemoteHDTError;
use remote_hdt::storage::layout::tabular::TabularLayout;
use remote_hdt::storage::params::{Backend, Serialization};
use remote_hdt::storage::Storage;
use std::env;
use std::time::Instant;

fn main() {
fn main() -> Result<(), RemoteHDTError> {
let args: Vec<String> = env::args().collect();
if args.len() <= 1 {
panic!("Usage: cargo run --example query_bench <number_of_universities>");
}

let number_of_universities: &String = &args[1];
let zarr_path = format!("{}-lubm", number_of_universities);

let before = Instant::now();
LocalStorage::new(TabularLayout)
.load(format!("{}.zarr", zarr_path).as_str())
.unwrap();
let after = before.elapsed();

println!("Elapsed time: {:.2?}", after)
Storage::new(TabularLayout, Serialization::Zarr)
.load(Backend::FileSystem(format!("{}.zarr", zarr_path).as_str()))?;

println!("Elapsed time: {:.2?}", before.elapsed());

Ok(())
}
24 changes: 13 additions & 11 deletions examples/ntriples/main.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
use remote_hdt::storage::tabular::TabularLayout;
use remote_hdt::storage::ChunkingStrategy;
use remote_hdt::storage::LocalStorage;
use remote_hdt::error::RemoteHDTError;
use remote_hdt::storage::layout::tabular::TabularLayout;
use remote_hdt::storage::params::{Backend, ChunkingStrategy, ReferenceSystem, Serialization};
use remote_hdt::storage::Storage;

pub fn main() {
LocalStorage::new(TabularLayout)
.serialize(
"root.zarr",
"examples/ntriples/rdf.nt",
ChunkingStrategy::Chunk,
)
.unwrap();
pub fn main() -> Result<(), RemoteHDTError> {
Storage::new(TabularLayout, Serialization::Zarr).serialize(
Backend::FileSystem("root.zarr"),
"examples/ntriples/rdf.nt",
ChunkingStrategy::Chunk,
ReferenceSystem::SPO,
)?;

Ok(())
}
29 changes: 14 additions & 15 deletions examples/query_bench.rs
Original file line number Diff line number Diff line change
@@ -1,30 +1,29 @@
use remote_hdt::engine::EngineStrategy;
use remote_hdt::storage::matrix::MatrixLayout;
use remote_hdt::storage::LocalStorage;
use remote_hdt::error::RemoteHDTError;
use remote_hdt::storage::layout::matrix::MatrixLayout;
use remote_hdt::storage::ops::Ops;
use remote_hdt::storage::params::{Backend, Serialization};
use remote_hdt::storage::Storage;
use std::env;
use std::time::Instant;

const SUBJECT: &str = "<http://www.Department0.University0.edu/AssistantProfessor0/Publication0>";
const SUBJECT: &str = "<http://www.Department1.University0.edu/Course8>";

fn main() {
fn main() -> Result<(), RemoteHDTError> {
let args: Vec<String> = env::args().collect();
if args.len() <= 1 {
panic!("Usage: cargo run --example query_bench <number_of_universities>");
}

let number_of_universities: &String = &args[1];
let zarr_path = format!("{}-lubm", number_of_universities);

let mut remote_hdt = LocalStorage::new(MatrixLayout);
let arr = remote_hdt
.load(format!("{}.zarr", zarr_path).as_str())
.unwrap();
let index = remote_hdt
.get_dictionary()
.get_subject_idx_unchecked(SUBJECT);
let mut binding = Storage::new(MatrixLayout, Serialization::Zarr);
let arr = binding.load(Backend::FileSystem(format!("{}.zarr", zarr_path).as_str()))?;

let before = Instant::now();
arr.get_subject(index).unwrap();
let after = before.elapsed();
arr.get_object(SUBJECT)?;

println!("Elapsed time: {:.2?}", before.elapsed());

println!("Elapsed time: {:.2?}", after)
Ok(())
}
24 changes: 13 additions & 11 deletions examples/rdf_xml/main.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
use remote_hdt::storage::tabular::TabularLayout;
use remote_hdt::storage::ChunkingStrategy;
use remote_hdt::storage::LocalStorage;
use remote_hdt::error::RemoteHDTError;
use remote_hdt::storage::layout::tabular::TabularLayout;
use remote_hdt::storage::params::{Backend, ChunkingStrategy, ReferenceSystem, Serialization};
use remote_hdt::storage::Storage;

pub fn main() {
LocalStorage::new(TabularLayout)
.serialize(
"root.zarr",
"examples/rdf_xml/rdf.rdf",
ChunkingStrategy::Chunk,
)
.unwrap();
pub fn main() -> Result<(), RemoteHDTError> {
Storage::new(TabularLayout, Serialization::Zarr).serialize(
Backend::FileSystem("root.zarr"),
"examples/rdf_xml/rdf.rdf",
ChunkingStrategy::Chunk,
ReferenceSystem::SPO,
)?;

Ok(())
}
30 changes: 30 additions & 0 deletions examples/serialize.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
use remote_hdt::error::RemoteHDTError;
use remote_hdt::storage::layout::matrix::MatrixLayout;
use remote_hdt::storage::params::{Backend, ChunkingStrategy, ReferenceSystem, Serialization};
use remote_hdt::storage::Storage;
use std::env;
use std::time::Instant;

/// Serializes an RDF file into a sharded Zarr store on the file system using
/// `MatrixLayout` and the `SPO` reference system, then prints the elapsed time.
///
/// Expects three command-line arguments: `<rdf_path> <zarr_path> <shard_size>`.
///
/// # Panics
/// Panics with the usage message when fewer than three arguments are supplied,
/// and with a message naming the offending value when `<shard_size>` cannot be
/// parsed as a `u64`.
///
/// # Errors
/// Propagates any `RemoteHDTError` returned by `Storage::serialize`.
fn main() -> Result<(), RemoteHDTError> {
    let args: Vec<String> = env::args().collect();
    if args.len() <= 3 {
        panic!("Usage: cargo run --example serialize <rdf_path> <zarr_path> <shard_size>");
    }

    // Plain `&str` borrows; no extra level of reference is needed.
    let rdf_path = args[1].as_str();
    let zarr_path = args[2].as_str();

    // User input: report which value was rejected instead of a bare `unwrap` panic.
    let shard_size: u64 = args[3]
        .parse()
        .unwrap_or_else(|err| panic!("invalid <shard_size> {:?}: {}", args[3], err));

    // Only the serialization itself is timed, not argument handling.
    let before = Instant::now();

    Storage::new(MatrixLayout, Serialization::Zarr).serialize(
        Backend::FileSystem(zarr_path),
        rdf_path,
        ChunkingStrategy::Sharding(shard_size),
        ReferenceSystem::SPO,
    )?;

    println!("Elapsed time: {:.2?}", before.elapsed());

    Ok(())
}
31 changes: 0 additions & 31 deletions examples/serialize_bench.rs

This file was deleted.

24 changes: 13 additions & 11 deletions examples/turtle/main.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
use remote_hdt::storage::tabular::TabularLayout;
use remote_hdt::storage::ChunkingStrategy;
use remote_hdt::storage::LocalStorage;
use remote_hdt::error::RemoteHDTError;
use remote_hdt::storage::layout::tabular::TabularLayout;
use remote_hdt::storage::params::{Backend, ChunkingStrategy, ReferenceSystem, Serialization};
use remote_hdt::storage::Storage;

pub fn main() {
LocalStorage::new(TabularLayout)
.serialize(
"root.zarr",
"examples/turtle/rdf.ttk",
ChunkingStrategy::Chunk,
)
.unwrap();
pub fn main() -> Result<(), RemoteHDTError> {
Storage::new(TabularLayout, Serialization::Zarr).serialize(
Backend::FileSystem("root.zarr"),
"examples/turtle/rdf.ttl",
ChunkingStrategy::Chunk,
ReferenceSystem::SPO,
)?;

Ok(())
}
38 changes: 34 additions & 4 deletions src/dictionary.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
use fcsd::Set;
use std::collections::HashSet;

use fcsd::Set;
use crate::storage::params::ReferenceSystem;

use super::utils::hash_to_set;

#[derive(Clone)]
pub struct Dictionary {
reference_system: ReferenceSystem,
subjects: Set,
predicates: Set,
objects: Set,
Expand All @@ -14,6 +16,7 @@ pub struct Dictionary {
impl Default for Dictionary {
fn default() -> Self {
Dictionary {
reference_system: ReferenceSystem::SPO,
subjects: Set::new(vec!["PlaceHolder"]).unwrap(),
predicates: Set::new(vec!["PlaceHolder"]).unwrap(),
objects: Set::new(vec!["PlaceHolder"]).unwrap(),
Expand All @@ -23,23 +26,27 @@ impl Default for Dictionary {

impl Dictionary {
/// Builds a `Dictionary` for the given reference system from three term lists.
///
/// Each list is passed unchanged to `Set::new`; this function does not sort or
/// deduplicate it. The parameters are slices, so callers holding a
/// `&Vec<String>` can keep passing it as before.
///
/// # Panics
/// Panics if `Set::new` returns an error for any of the three lists.
pub(crate) fn from_vec_str(
    reference_system: ReferenceSystem,
    subjects: &[String],
    predicates: &[String],
    objects: &[String],
) -> Self {
    Dictionary {
        reference_system,
        subjects: Set::new(subjects).expect("failed to build the subjects set"),
        predicates: Set::new(predicates).expect("failed to build the predicates set"),
        objects: Set::new(objects).expect("failed to build the objects set"),
    }
}

pub(crate) fn from_set_terms(
reference_system: ReferenceSystem,
subjects: HashSet<String>,
predicates: HashSet<String>,
objects: HashSet<String>,
) -> Self {
Dictionary {
reference_system,
subjects: Set::new(hash_to_set(subjects)).unwrap(),
predicates: Set::new(hash_to_set(predicates)).unwrap(),
objects: Set::new(hash_to_set(objects)).unwrap(),
Expand All @@ -50,6 +57,10 @@ impl Dictionary {
self.subjects.len()
}

/// Returns the length reported by the predicates `Set`.
pub fn predicates_size(&self) -> usize {
self.predicates.len()
}

/// Returns the length reported by the objects `Set`.
pub fn objects_size(&self) -> usize {
self.objects.len()
}
Expand All @@ -66,9 +77,18 @@ impl Dictionary {
self.objects.to_owned()
}

/// Returns an owned copy of the reference system stored in this dictionary.
pub fn get_reference_system(&self) -> ReferenceSystem {
    self.reference_system.clone()
}

pub fn get_subject_idx(&self, subject: &str) -> Option<usize> {
let mut locator = self.subjects.locator();
locator.run(subject)
match self.reference_system {
ReferenceSystem::PSO | ReferenceSystem::OSP => {
locator.run(subject).map(|value| value + 1)
}
_ => locator.run(subject),
}
}

pub fn get_subject_idx_unchecked(&self, subject: &str) -> usize {
Expand All @@ -77,7 +97,12 @@ impl Dictionary {

pub fn get_predicate_idx(&self, predicate: &str) -> Option<usize> {
let mut locator = self.predicates.locator();
locator.run(predicate).map(|value| value + 1)
match self.reference_system {
ReferenceSystem::SPO | ReferenceSystem::OPS => {
locator.run(predicate).map(|value| value + 1)
}
_ => locator.run(predicate),
}
}

pub fn get_predicate_idx_unchecked(&self, predicate: &str) -> usize {
Expand All @@ -86,7 +111,12 @@ impl Dictionary {

pub fn get_object_idx(&self, object: &str) -> Option<usize> {
let mut locator = self.objects.locator();
locator.run(object)
match self.reference_system {
ReferenceSystem::SOP | ReferenceSystem::POS => {
locator.run(object).map(|value| value + 1)
}
_ => locator.run(object),
}
}

pub fn get_object_idx_unchecked(&self, object: &str) -> usize {
Expand Down
Loading

0 comments on commit f30f4f0

Please sign in to comment.