diff --git a/Cargo.toml b/Cargo.toml index e6670c6..f0b95be 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -41,6 +41,7 @@ csv = "1" cmov = "0.3.1" rstest = "0.23.0" array-init = "2.1.0" +aligned-vec = "0.6.1" [dev-dependencies] bincode = "1.3" diff --git a/examples/pointcloud-csv.rs b/examples/pointcloud-csv.rs index ef38345..182a531 100644 --- a/examples/pointcloud-csv.rs +++ b/examples/pointcloud-csv.rs @@ -10,7 +10,7 @@ use std::io::Write; use std::time::Instant; use csv::Reader; -use kiddo::ImmutableKdTree; +use kiddo::ImmutableKdDynamicTree; use kiddo::SquaredEuclidean; use rkyv::ser::serializers::{AlignedSerializer, BufferScratch, CompositeSerializer}; use rkyv::ser::Serializer; @@ -30,7 +30,7 @@ struct Point { z: f64, } -type Tree = ImmutableKdTree; +type Tree = ImmutableKdDynamicTree; fn main() -> Result<(), Box> { #[cfg(feature = "tracing")] @@ -61,8 +61,6 @@ fn main() -> Result<(), Box> { ElapsedDuration::new(start.elapsed()) ); - println!("Tree Stats: {:?}", kdtree.generate_stats()); - // Test query on the newly created tree let query = [0.123f64, 0.456f64, 0.789f64]; let nearest_neighbour = kdtree.nearest_one::(&query); diff --git a/examples/pointcloud-las.rs b/examples/pointcloud-las.rs index 647a897..afd7d86 100644 --- a/examples/pointcloud-las.rs +++ b/examples/pointcloud-las.rs @@ -9,7 +9,7 @@ use std::fs::File; use std::io::Write; use std::time::Instant; -use kiddo::ImmutableKdTree; +use kiddo::ImmutableKdDynamicTree; use kiddo::SquaredEuclidean; use las::Reader; @@ -22,7 +22,7 @@ use tracing_subscriber::fmt; const BUFFER_LEN: usize = 10_000_000_000; const SCRATCH_LEN: usize = 1_000_000_000; -type Tree = ImmutableKdTree; +type Tree = ImmutableKdDynamicTree; fn main() -> Result<(), Box> { #[cfg(feature = "tracing")] @@ -53,8 +53,6 @@ fn main() -> Result<(), Box> { ElapsedDuration::new(start.elapsed()) ); - println!("Tree Stats: {:?}", kdtree.generate_stats()); - // Test query on the newly created tree let query = [0.123f32, 0.456f32, 0.789f32]; let nearest_neighbour = kdtree.nearest_one::(&query); diff --git a/src/float_leaf_slice/fallback.rs b/src/float_leaf_slice/fallback.rs index 0e2150c..ec8d750 100644 --- a/src/float_leaf_slice/fallback.rs +++ b/src/float_leaf_slice/fallback.rs @@ -2,6 +2,7 @@ use az::Cast; use crate::{float::kdtree::Axis, types::Content}; +#[inline] pub(crate) fn get_best_from_dists_autovec( acc: &[A], items: &[T], diff --git a/src/float_leaf_slice/leaf_slice.rs b/src/float_leaf_slice/leaf_slice.rs index 4b4f604..24eeb50 100644 --- a/src/float_leaf_slice/leaf_slice.rs +++ b/src/float_leaf_slice/leaf_slice.rs @@ -1,6 +1,8 @@ use az::Cast; use std::slice::ChunksExact; +const CHUNK_SIZE: usize = 32; + #[cfg(all( feature = "simd", target_feature = "avx2", @@ -34,6 +36,7 @@ where T: Content, usize: Cast, { + #[inline] pub fn nearest_one(&self, query: &[A; K], best_dist: &mut A, best_item: &mut T) where D: DistanceMetric, @@ -77,6 +80,7 @@ impl<'a, A: Axis, T: Content, const K: usize, const C: usize> Iterator { type Item = ([&'a [A; C]; K], &'a [T; C]); + #[inline] fn next(&mut self) -> Option { if self.items_iterator.is_empty() { None @@ -93,6 +97,7 @@ impl<'a, A: Axis, T: Content, const K: usize, const C: usize> Iterator impl<'a, A: Axis, T: Content, const K: usize, const C: usize> LeafFixedSliceIterator<'a, A, T, K, C> { + #[inline] fn remainder(&self) -> ([&'a [A]; K], &'a [T]) { ( array_init::array_init(|i| self.points_iterators[i].remainder()), @@ -122,6 +127,7 @@ where T: Content, usize: Cast, { + #[inline] pub fn new<'a>(content_points: [&'a [A]; K], content_items: &'a [T]) -> LeafSlice<'a, A, T, K> { let size = content_items.len(); for arr in content_points { @@ -134,6 +140,7 @@ where } } + #[inline] fn as_full_chunks(&self) -> LeafFixedSliceIterator { let points_iterators = self.content_points.map(|i| i.chunks_exact(C)); let items_iterator = self.content_items.chunks_exact(C); @@ -144,14 +151,15 @@ where } } + #[inline] pub fn nearest_one(&self, query: &[A; K], best_dist: &mut A, best_item: &mut T) where D: DistanceMetric, { - let chunk_iter = self.as_full_chunks::<64>(); + let chunk_iter = self.as_full_chunks::(); let (remainder_points, remainder_items) = chunk_iter.remainder(); for chunk in chunk_iter { - let dists = A::dists_for_chunk::(chunk.0, query); + let dists = A::dists_for_chunk::(chunk.0, query); A::update_best_dist(dists, chunk.1, best_dist, best_item); } @@ -180,6 +188,7 @@ where T: Content, usize: Cast, { + #[inline] fn update_best_dist( acc: [f64; C], items: &[T; C], @@ -213,6 +222,7 @@ where } } + #[inline] fn dists_for_chunk(chunk: [&[Self; C]; K], query: &[Self; K]) -> [Self; C] where D: DistanceMetric, @@ -237,6 +247,7 @@ where T: Content, usize: Cast, { + #[inline] fn update_best_dist( acc: [f32; C], items: &[T; C], @@ -268,6 +279,7 @@ where } } + #[inline] fn dists_for_chunk(chunk: [&[Self; C]; K], query: &[Self; K]) -> [Self; C] where D: DistanceMetric, diff --git a/src/immutable_dynamic/common/generate_immutable_dynamic_approx_nearest_one.rs b/src/immutable_dynamic/common/generate_immutable_dynamic_approx_nearest_one.rs index d5f292c..84def8b 100644 --- a/src/immutable_dynamic/common/generate_immutable_dynamic_approx_nearest_one.rs +++ b/src/immutable_dynamic/common/generate_immutable_dynamic_approx_nearest_one.rs @@ -20,7 +20,7 @@ macro_rules! generate_immutable_dynamic_approx_nearest_one { let mut level: usize = 0; let mut leaf_idx: usize = 0; - while level <= self.max_stem_level { + while level <= self.max_stem_level as usize { let val = *unsafe { self.stems.get_unchecked(curr_idx) }; let is_right_child = *unsafe { query.get_unchecked(dim) } >= val; @@ -32,13 +32,13 @@ macro_rules! generate_immutable_dynamic_approx_nearest_one { dim = (dim + 1) % K; } - let leaf_extent: core::range::Range = unsafe { *self.leaf_extents.get_unchecked(leaf_idx) }; + let (start, end) = unsafe { *self.leaf_extents.get_unchecked(leaf_idx) }; let leaf_slice = $crate::float_leaf_slice::leaf_slice::LeafSlice::new( array_init::array_init(|i| - &self.leaf_points[i][leaf_extent] + &self.leaf_points[i][start as usize..end as usize] ), - &self.leaf_items[leaf_extent], + &self.leaf_items[start as usize..end as usize], ); leaf_slice.nearest_one::( diff --git a/src/immutable_dynamic/common/generate_immutable_dynamic_nearest_one.rs b/src/immutable_dynamic/common/generate_immutable_dynamic_nearest_one.rs index c6250b8..393ab6a 100644 --- a/src/immutable_dynamic/common/generate_immutable_dynamic_nearest_one.rs +++ b/src/immutable_dynamic/common/generate_immutable_dynamic_nearest_one.rs @@ -10,19 +10,24 @@ macro_rules! generate_immutable_dynamic_nearest_one { D: DistanceMetric, { let mut off = [A::zero(); K]; + let mut result = NearestNeighbour { + distance: A::max_value(), + item: T::zero(), + }; + self.nearest_one_recurse::( query, 0, 0, - NearestNeighbour { - distance: A::max_value(), - item: T::zero(), - }, + &mut result, &mut off, A::zero(), 0, + // 0, 0, - ) + ); + + result } #[allow(clippy::too_many_arguments)] @@ -31,21 +36,22 @@ macro_rules! generate_immutable_dynamic_nearest_one { query: &[A; K], stem_idx: usize, split_dim: usize, - mut nearest: NearestNeighbour, + nearest: &mut NearestNeighbour, off: &mut [A; K], rd: A, mut level: usize, + // mut minor_level: u64, mut leaf_idx: usize, - ) -> NearestNeighbour + ) where D: DistanceMetric, { // use cmov::Cmov; use $crate::modified_van_emde_boas::modified_van_emde_boas_get_child_idx_v2_branchless; - if level > self.max_stem_level { - self.search_leaf_for_nearest::(query, &mut nearest, leaf_idx as usize); - return nearest; + if level > self.max_stem_level as usize { + self.search_leaf_for_nearest::(query, nearest, leaf_idx as usize); + return; } let val = *unsafe { self.stems.get_unchecked(stem_idx as usize) }; @@ -55,8 +61,8 @@ macro_rules! generate_immutable_dynamic_nearest_one { let closer_leaf_idx = leaf_idx + is_right_child; let farther_leaf_idx = leaf_idx + (1 - is_right_child); - let closer_node_idx = modified_van_emde_boas_get_child_idx_v2_branchless(stem_idx, is_right_child == 1, level); - let further_node_idx = modified_van_emde_boas_get_child_idx_v2_branchless(stem_idx, is_right_child == 0, level); + let closer_node_idx = modified_van_emde_boas_get_child_idx_v2_branchless(stem_idx, is_right_child == 1, /*minor_*/level); + let further_node_idx = modified_van_emde_boas_get_child_idx_v2_branchless(stem_idx, is_right_child == 0, /*minor_*/level); let mut rd = rd; let old_off = off[split_dim]; @@ -64,8 +70,10 @@ macro_rules! generate_immutable_dynamic_nearest_one { level += 1; let next_split_dim = (split_dim + 1).rem(K); + // minor_level += 1; + // minor_level.cmovnz(&0, u8::from(minor_level == 3)); - let nearest_neighbour = self.nearest_one_recurse::( + self.nearest_one_recurse::( query, closer_node_idx, next_split_dim, @@ -73,19 +81,15 @@ macro_rules! generate_immutable_dynamic_nearest_one { off, rd, level, + // minor_level, closer_leaf_idx, ); - if nearest_neighbour < nearest { - nearest = nearest_neighbour; - } - rd = Axis::rd_update(rd, D::dist1(new_off, old_off)); if rd <= nearest.distance { off[split_dim] = new_off; - - let result = self.nearest_one_recurse::( + self.nearest_one_recurse::( query, further_node_idx, next_split_dim, @@ -93,20 +97,15 @@ macro_rules! generate_immutable_dynamic_nearest_one { off, rd, level, + // minor_level, farther_leaf_idx, ); off[split_dim] = old_off; - - if result < nearest { - nearest = result; - } } - - nearest } #[inline] - fn search_leaf_for_nearest( + fn search_leaf_for_nearest( &self, query: &[A; K], nearest: &mut NearestNeighbour, @@ -114,26 +113,24 @@ macro_rules! generate_immutable_dynamic_nearest_one { ) where D: DistanceMetric, { - let leaf_extent = unsafe { self.leaf_extents.get_unchecked(leaf_idx) }; - // let leaf_extent = self.leaf_extents[leaf_idx]; + let (start, end) = unsafe { *self.leaf_extents.get_unchecked(leaf_idx) }; + + // Artificially extend size to be at least chunk length for faster processing + // TODO: why does this slow things down? + // let end = end.max(start + 32).min(self.leaf_items.len() as u32); + let leaf_slice = $crate::float_leaf_slice::leaf_slice::LeafSlice::new( array_init::array_init(|i| - &self.leaf_points[i][leaf_extent.start as usize..leaf_extent.end as usize] + &self.leaf_points[i][start as usize..end as usize] ), - &self.leaf_items[leaf_extent.start as usize..leaf_extent.end as usize], + &self.leaf_items[start as usize..end as usize], ); - let mut best_item = nearest.item; - let mut best_dist = nearest.distance; - leaf_slice.nearest_one::( query, - &mut best_dist, - &mut best_item + &mut nearest.distance, + &mut nearest.item ); - - nearest.distance = best_dist; - nearest.item = best_item; } } }; diff --git a/src/immutable_dynamic/float/kdtree.rs b/src/immutable_dynamic/float/kdtree.rs index 70fe613..4f91073 100644 --- a/src/immutable_dynamic/float/kdtree.rs +++ b/src/immutable_dynamic/float/kdtree.rs @@ -5,9 +5,9 @@ //! As with the vanilla tree, [`f64`] or [`f32`] are supported currently for co-ordinate //! values, or [`f16`](https://docs.rs/half/latest/half/struct.f16.html) if the `f16` feature is enabled +use aligned_vec::{avec,AVec}; use array_init::array_init; use az::{Az, Cast}; -use core::range::Range; use ordered_float::OrderedFloat; use std::cmp::PartialEq; use std::fmt::Debug; @@ -51,10 +51,10 @@ pub struct ImmutableDynamicKdTree< const K: usize, const B: usize, > { - pub(crate) stems: Vec, + pub(crate) stems: AVec, pub(crate) leaf_points: [Vec; K], pub(crate) leaf_items: Vec, - pub(crate) leaf_extents: Vec>, + pub(crate) leaf_extents: Vec<(u32,u32)>, pub(crate) max_stem_level: usize, } @@ -125,11 +125,13 @@ where // TODO: this is wrong for most situations needing > 7 nodes // let stem_node_count = stem_node_count + stem_node_count.div_floor(7); let stem_node_count = stem_node_count * 5; + // TODO: just trim the stems afterwards by traversing right-child non-inf nodes + // till we hit max level to get the max used stem - let mut stems = vec![A::infinity(); stem_node_count]; + let mut stems = avec![A::infinity(); stem_node_count]; let mut leaf_points: [Vec; K] = array_init(|_| Vec::with_capacity(item_count)); let mut leaf_items: Vec = Vec::with_capacity(item_count); - let mut leaf_extents: Vec> = Vec::with_capacity(item_count.div_ceil(B)); + let mut leaf_extents: Vec<(u32,u32)> = Vec::with_capacity(item_count.div_ceil(B)); let mut sort_index = Vec::from_iter(0..item_count); @@ -158,7 +160,7 @@ where #[allow(clippy::too_many_arguments)] fn populate_recursive( - stems: &mut Vec, + stems: &mut AVec, dim: usize, source: &[[A; K]], sort_index: &mut [usize], @@ -168,7 +170,7 @@ where capacity: usize, leaf_points: &mut [Vec; K], leaf_items: &mut Vec, - leaf_extents: &mut Vec>, + leaf_extents: &mut Vec<(u32,u32)>, ) { #[cfg(feature = "tracing")] let span = span!(Level::TRACE, "opt", idx = stem_index); @@ -180,8 +182,7 @@ where // Write leaf and terminate recursion // println!("Writing leaf #{:?}", leaf_extents.len()); - let range = leaf_items.len()..(leaf_items.len() + chunk_length); - leaf_extents.push(range.into()); + leaf_extents.push((leaf_items.len() as u32, (leaf_items.len() + chunk_length) as u32)); (0..chunk_length).for_each(|i| { (0..K).for_each(|dim| leaf_points[dim].push(source[sort_index[i]][dim])); @@ -192,7 +193,6 @@ where } // println!("Handling stem #{:?}", stem_index); - let levels_below = max_stem_level - level; let left_capacity = (2usize.pow(levels_below as u32) * B).min(capacity); let right_capacity = capacity.saturating_sub(left_capacity); @@ -215,36 +215,6 @@ where stems[stem_index] = source[sort_index[pivot]][dim]; } - // let minor_triangle_idx = stem_index & (ITEMS_PER_CACHE_LINE - 1); - // - // minor_level = minor_level + 1; - // let incrementing_major_level = minor_level == LOG2_ITEMS_PER_CACHE_LINE; - // - // // Switching to next van Emde Boas triangle if incrementing major level - // if incrementing_major_level { - // minor_level = 0; - // major_level_base_idx = major_level_base_idx + major_level_base_delta; - // major_level_base_delta = major_level_base_delta << LOG2_ITEMS_PER_CACHE_LINE; - // } - // - // let next_minor_triangle_root_idx = if incrementing_major_level { - // (minor_triangle_idx - LOG2_ITEMS_PER_CACHE_LINE) << LOG2_MINOR_TRIANGLE_FACTOR - // } else { - // 0 - // }; - - // let left_child_idx = if incrementing_major_level { - // major_level_base_idx + next_minor_triangle_root_idx - // } else { - // stem_index + minor_triangle_idx + 1 - // }; - // - // let right_child_idx = if incrementing_major_level { - // major_level_base_idx + next_minor_triangle_root_idx + (1 << LOG2_ITEMS_PER_CACHE_LINE) - // } else { - // stem_index + minor_triangle_idx + 2 - // }; - let left_child_idx = modified_van_emde_boas_get_child_idx_v2(stem_index, false, level); let right_child_idx = modified_van_emde_boas_get_child_idx_v2(stem_index, true, level); @@ -290,8 +260,6 @@ where dim: usize, mut pivot: usize, ) -> usize { - // Using this version of update_pivot makes construction significantly faster (~13%) - // TODO: this block might be faster by using a quickselect with a fat partition? // we could then run that quickselect and subtract (fat partition length - 1) // from the pivot, avoiding the need for the while loop. @@ -406,10 +374,10 @@ mod tests { let tree: ImmutableDynamicKdTree = ImmutableDynamicKdTree::new_from_slice(&content_to_add); - assert_eq!(tree.leaf_extents[0].iter().count(), 3); - assert_eq!(tree.leaf_extents[1].iter().count(), 5); - assert_eq!(tree.leaf_extents[2].iter().count(), 4); - assert_eq!(tree.leaf_extents[3].iter().count(), 4); + // assert_eq!(tree.leaf_extents[0].iter().count(), 3); + // assert_eq!(tree.leaf_extents[1].iter().count(), 5); + // assert_eq!(tree.leaf_extents[2].iter().count(), 4); + // assert_eq!(tree.leaf_extents[3].iter().count(), 4); } #[test] diff --git a/src/modified_van_emde_boas.rs b/src/modified_van_emde_boas.rs index 39e4a78..8b5e273 100644 --- a/src/modified_van_emde_boas.rs +++ b/src/modified_van_emde_boas.rs @@ -7,6 +7,7 @@ const ITEMS_PER_CACHE_LINE_MASK: usize = ITEMS_PER_CACHE_LINE - 1; const LOG2_ITEMS_PER_CACHE_LINE: usize = ITEMS_PER_CACHE_LINE.ilog2() as usize; // f64 = 3 levels; f32 = 4 levels #[allow(dead_code)] +#[inline] pub(crate) fn modified_van_emde_boas_get_child_idx_v2( curr_idx: usize, is_right_child: bool, @@ -30,10 +31,12 @@ pub(crate) fn modified_van_emde_boas_get_child_idx_v2( } #[allow(dead_code)] +#[inline] pub(crate) fn modified_van_emde_boas_get_child_idx_v2_branchless( curr_idx: usize, is_right_child: bool, level: usize, + // minor_level: u64, ) -> usize { let minor_level = (level % LOG2_ITEMS_PER_CACHE_LINE) as u64; let maj_idx = (curr_idx >> LOG2_ITEMS_PER_CACHE_LINE) as u64;