From dc2d9b31dec5842c1845975511fcdb70ce1df7e6 Mon Sep 17 00:00:00 2001 From: Noa Date: Tue, 6 Feb 2024 23:48:57 -0600 Subject: [PATCH 1/5] Add ParallelIterator::collect_vec_list --- src/iter/collect/mod.rs | 20 ++++++++++++++++++++ src/iter/extend.rs | 13 +++++++++---- src/iter/mod.rs | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 65 insertions(+), 4 deletions(-) diff --git a/src/iter/collect/mod.rs b/src/iter/collect/mod.rs index 4044a685b..807ce6639 100644 --- a/src/iter/collect/mod.rs +++ b/src/iter/collect/mod.rs @@ -1,3 +1,5 @@ +use std::collections::LinkedList; + use super::{IndexedParallelIterator, ParallelIterator}; mod consumer; @@ -64,6 +66,24 @@ where }); } +/// Collects the iterator into a linked list of vectors. +/// +/// This is called by `ParallelIterator::collect_vec_list`. +pub(super) fn collect_vec_list<I>(pi: I) -> LinkedList<Vec<I::Item>> +where + I: ParallelIterator, +{ + match pi.opt_len() { + Some(len) => { + // Pseudo-specialization. See impl of ParallelExtend for Vec for more details. + let mut v = Vec::new(); + super::collect::special_extend(pi, len, &mut v); + LinkedList::from([v]) + } + None => super::extend::drive_list_vec(pi), + } +} + /// Create a consumer on the slice of memory we are collecting into. /// /// The consumer needs to be used inside the scope function, and the diff --git a/src/iter/extend.rs b/src/iter/extend.rs index a2645280f..3e5a51efa 100644 --- a/src/iter/extend.rs +++ b/src/iter/extend.rs @@ -12,10 +12,7 @@ use std::hash::{BuildHasher, Hash}; /// parallel, then extending the collection sequentially. macro_rules! 
extend { ($self:ident, $par_iter:ident, $extend:ident) => { - $extend( - $self, - $par_iter.into_par_iter().drive_unindexed(ListVecConsumer), - ); + $extend($self, drive_list_vec($par_iter)); }; } @@ -24,6 +21,14 @@ fn len<T>(list: &LinkedList<Vec<T>>) -> usize { list.iter().map(Vec::len).sum() } +pub(super) fn drive_list_vec<I, T>(pi: I) -> LinkedList<Vec<T>> +where + I: IntoParallelIterator<Item = T>, + T: Send, +{ + pi.into_par_iter().drive_unindexed(ListVecConsumer) +} + struct ListVecConsumer; struct ListVecFolder<T> { diff --git a/src/iter/mod.rs b/src/iter/mod.rs index 7b5a29aeb..9f0cd373e 100644 --- a/src/iter/mod.rs +++ b/src/iter/mod.rs @@ -83,6 +83,7 @@ use self::plumbing::*; use self::private::Try; pub use either::Either; use std::cmp::{self, Ordering}; +use std::collections::LinkedList; use std::iter::{Product, Sum}; use std::ops::{Fn, RangeBounds}; @@ -2339,6 +2340,41 @@ pub trait ParallelIterator: Sized + Send { SkipAnyWhile::new(self, predicate) } + /// Collects this iterator into a linked list of vectors. + /// + /// This is useful when you need to condense a parallel iterator into a collection, + /// but have no specific requirements for what that collection should be. If you + /// plan to store the collection longer-term, `Vec` is, as always, likely the + /// best default choice, despite the overhead that comes from concatenating each + /// vector. Or, if this is an `IndexedParallelIterator`, you should also prefer to + /// just collect to a `Vec`. + /// + /// Internally, most [`FromParallelIterator`]/[`ParallelExtend`] implementations + /// use this strategy; each job collecting their chunk of the iterator to a `Vec` + /// and those chunks getting merged into a `LinkedList`, before then extending the + /// collection with each vector. This is the most efficient way to collect an + /// unindexed parallel iterator (again, indexed parallel iterators can be + /// efficiently collected simply into a vector). 
+ /// + /// # Examples + /// + /// ``` + /// # use std::collections::LinkedList; + /// use rayon::prelude::*; + /// + /// let result: LinkedList<Vec<_>> = (0..=100) + /// .into_par_iter() + /// .filter(|x| x % 2 == 0) + /// .flat_map(|x| 0..x) + /// .collect_vec_list(); + /// + /// let total_len = result.iter().flatten().count(); + /// assert_eq!(total_len, 2550); + /// ``` + fn collect_vec_list(self) -> LinkedList<Vec<Self::Item>> { + collect::collect_vec_list(self) + } + /// Internal method used to define the behavior of this parallel /// iterator. You should not need to call this directly. /// From 5ed170d419b7b7470a03ffd24d248f81f49a2bf4 Mon Sep 17 00:00:00 2001 From: Noa Date: Wed, 7 Feb 2024 13:45:27 -0600 Subject: [PATCH 2/5] Update docs Co-authored-by: Josh Stone --- src/iter/mod.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/iter/mod.rs b/src/iter/mod.rs index 9f0cd373e..005cb6002 100644 --- a/src/iter/mod.rs +++ b/src/iter/mod.rs @@ -2352,9 +2352,8 @@ pub trait ParallelIterator: Sized + Send { /// Internally, most [`FromParallelIterator`]/[`ParallelExtend`] implementations /// use this strategy; each job collecting their chunk of the iterator to a `Vec` /// and those chunks getting merged into a `LinkedList`, before then extending the - /// collection with each vector. This is the most efficient way to collect an - /// unindexed parallel iterator (again, indexed parallel iterators can be - /// efficiently collected simply into a vector). + /// collection with each vector. This is a very efficient way to collect an + /// unindexed parallel iterator, without much intermediate data movement. 
/// /// # Examples /// From c8ab4210428c531f59ddb07898002f1500067106 Mon Sep 17 00:00:00 2001 From: Noa Date: Wed, 7 Feb 2024 13:48:03 -0600 Subject: [PATCH 3/5] Inline the collect_vec_list implementation --- src/iter/collect/mod.rs | 18 ------------------ src/iter/mod.rs | 10 +++++++++- 2 files changed, 9 insertions(+), 19 deletions(-) diff --git a/src/iter/collect/mod.rs b/src/iter/collect/mod.rs index 807ce6639..5f8303d12 100644 --- a/src/iter/collect/mod.rs +++ b/src/iter/collect/mod.rs @@ -66,24 +66,6 @@ where }); } -/// Collects the iterator into a linked list of vectors. -/// -/// This is called by `ParallelIterator::collect_vec_list`. -pub(super) fn collect_vec_list<I>(pi: I) -> LinkedList<Vec<I::Item>> -where - I: ParallelIterator, -{ - match pi.opt_len() { - Some(len) => { - // Pseudo-specialization. See impl of ParallelExtend for Vec for more details. - let mut v = Vec::new(); - super::collect::special_extend(pi, len, &mut v); - LinkedList::from([v]) - } - None => super::extend::drive_list_vec(pi), - } -} - /// Create a consumer on the slice of memory we are collecting into. /// /// The consumer needs to be used inside the scope function, and the diff --git a/src/iter/mod.rs b/src/iter/mod.rs index 005cb6002..437526ac3 100644 --- a/src/iter/mod.rs +++ b/src/iter/mod.rs @@ -2371,7 +2371,15 @@ pub trait ParallelIterator: Sized + Send { /// assert_eq!(total_len, 2550); /// ``` fn collect_vec_list(self) -> LinkedList<Vec<Self::Item>> { - collect::collect_vec_list(self) + match self.opt_len() { + Some(len) => { + // Pseudo-specialization. See impl of ParallelExtend for Vec for more details. 
+ let mut v = Vec::new(); + collect::special_extend(self, len, &mut v); + LinkedList::from([v]) + } + None => extend::drive_list_vec(self), + } } /// Internal method used to define the behavior of this parallel From b5a9a0e2dfe43e5ea9fa50249446603e507c19ca Mon Sep 17 00:00:00 2001 From: Noa Date: Wed, 7 Feb 2024 13:55:35 -0600 Subject: [PATCH 4/5] Add check for len of 0 --- src/iter/collect/mod.rs | 2 -- src/iter/mod.rs | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/iter/collect/mod.rs b/src/iter/collect/mod.rs index 5f8303d12..4044a685b 100644 --- a/src/iter/collect/mod.rs +++ b/src/iter/collect/mod.rs @@ -1,5 +1,3 @@ -use std::collections::LinkedList; - use super::{IndexedParallelIterator, ParallelIterator}; mod consumer; diff --git a/src/iter/mod.rs b/src/iter/mod.rs index 437526ac3..e98bc7a7a 100644 --- a/src/iter/mod.rs +++ b/src/iter/mod.rs @@ -2372,6 +2372,7 @@ pub trait ParallelIterator: Sized + Send { /// ``` fn collect_vec_list(self) -> LinkedList<Vec<Self::Item>> { match self.opt_len() { + Some(0) => LinkedList::new(), Some(len) => { // Pseudo-specialization. See impl of ParallelExtend for Vec for more details. let mut v = Vec::new(); From 38057628db45a48182d566162e76ce77debec202 Mon Sep 17 00:00:00 2001 From: Noa Date: Wed, 7 Feb 2024 14:02:37 -0600 Subject: [PATCH 5/5] Update docs --- src/iter/mod.rs | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/iter/mod.rs b/src/iter/mod.rs index e98bc7a7a..4b7289190 100644 --- a/src/iter/mod.rs +++ b/src/iter/mod.rs @@ -1962,6 +1962,9 @@ pub trait ParallelIterator: Sized + Send { /// of how many elements the iterator contains, and even allows you to reuse /// an existing vector's backing store rather than allocating a fresh vector. /// + /// See also [`collect_vec_list()`][Self::collect_vec_list] for collecting + /// into a `LinkedList<Vec<T>>`. 
+ /// /// [`IndexedParallelIterator`]: trait.IndexedParallelIterator.html /// [`collect_into_vec()`]: /// trait.IndexedParallelIterator.html#method.collect_into_vec @@ -2367,7 +2370,9 @@ pub trait ParallelIterator: Sized + Send { /// .flat_map(|x| 0..x) /// .collect_vec_list(); /// - /// let total_len = result.iter().flatten().count(); + /// // `par_iter.collect_vec_list().into_iter().flatten()` turns + /// // a parallel iterator into a serial one + /// let total_len = result.into_iter().flatten().count(); /// assert_eq!(total_len, 2550); /// ``` fn collect_vec_list(self) -> LinkedList<Vec<Self::Item>> { @@ -3232,14 +3237,15 @@ where /// /// If your collection is not naturally parallel, the easiest (and /// fastest) way to do this is often to collect `par_iter` into a - /// [`LinkedList`] or other intermediate data structure and then - /// sequentially extend your collection. However, a more 'native' - /// technique is to use the [`par_iter.fold`] or + /// [`LinkedList`] (via [`collect_vec_list`]) or another intermediate + /// data structure and then sequentially extend your collection. However, + /// a more 'native' technique is to use the [`par_iter.fold`] or /// [`par_iter.fold_with`] methods to create the collection. /// Alternatively, if your collection is 'natively' parallel, you /// can use `par_iter.for_each` to process each element in turn. /// /// [`LinkedList`]: https://doc.rust-lang.org/std/collections/struct.LinkedList.html + /// [`collect_vec_list`]: ParallelIterator::collect_vec_list /// [`par_iter.fold`]: trait.ParallelIterator.html#method.fold /// [`par_iter.fold_with`]: trait.ParallelIterator.html#method.fold_with /// [`par_iter.for_each`]: trait.ParallelIterator.html#method.for_each