From d0a1cabe12c76b58e6c79bbb2fcf16123c06ebcc Mon Sep 17 00:00:00 2001
From: Moritz Hoffmann
Date: Thu, 13 Jun 2024 16:06:22 -0400
Subject: [PATCH] Merge batcher input generic over containers (#494)

Enable flatcontainers in merge batchers.

---------

Signed-off-by: Moritz Hoffmann
---
 examples/spines.rs                              |   4 +-
 src/consolidation.rs                            | 194 +++++++++-
 src/trace/implementations/chunker.rs            | 364 ++++++++++++++++++
 src/trace/implementations/merge_batcher.rs      | 127 ++----
 .../implementations/merge_batcher_col.rs        |  87 +----
 .../implementations/merge_batcher_flat.rs       | 332 ++++++++++++++++
 src/trace/implementations/mod.rs                | 161 ++++----
 src/trace/implementations/ord_neu.rs            |  41 +-
 src/trace/implementations/rhh.rs                |   7 +-
 tests/bfs.rs                                    | 154 ++++++--
 10 files changed, 1157 insertions(+), 314 deletions(-)
 create mode 100644 src/trace/implementations/chunker.rs
 create mode 100644 src/trace/implementations/merge_batcher_flat.rs

diff --git a/examples/spines.rs b/examples/spines.rs
index 9fa407977..6720575fe 100644
--- a/examples/spines.rs
+++ b/examples/spines.rs
@@ -66,8 +66,8 @@ fn main() {
             },
             "flat" => {
                 use differential_dataflow::trace::implementations::ord_neu::FlatKeySpine;
-                let data = data.arrange::>();
-                let keys = keys.arrange::>();
+                let data = data.arrange::>();
+                let keys = keys.arrange::>();
                 keys.join_core(&data, |_k, (), ()| Option::<()>::None)
                     .probe_with(&mut probe);
             }
diff --git a/src/consolidation.rs b/src/consolidation.rs
index 47573eed8..b9495d104 100644
--- a/src/consolidation.rs
+++ b/src/consolidation.rs
@@ -10,10 +10,15 @@
 //! you need specific behavior, it may be best to defensively copy, paste, and maintain the
 //! specific behavior you require.
 
+use std::cmp::Ordering;
 use std::collections::VecDeque;
+use timely::Container;
 use timely::container::{ContainerBuilder, PushInto, SizableContainer};
+use timely::container::flatcontainer::{FlatStack, Push, Region};
+use timely::container::flatcontainer::impls::tuple::{TupleABCRegion, TupleABRegion};
 use crate::Data;
-use crate::difference::Semigroup;
+use crate::difference::{IsZero, Semigroup};
+use crate::trace::cursor::IntoOwned;
 
 /// Sorts and consolidates `vec`.
 ///
@@ -218,6 +223,136 @@ where
     }
 }
 
+/// Layout of containers and their read items to be consolidated.
+///
+/// This trait specifies behavior to extract keys and diffs from a container's read
+/// items. Consolidation accumulates the diffs per key.
+///
+/// The trait requires `Container` to have access to its `Item` GAT.
+pub trait ConsolidateLayout: Container {
+    /// Key portion of data, essentially everything minus the diff.
+    type Key<'a>: Eq where Self: 'a;
+
+    /// GAT diff type.
+    type Diff<'a>: IntoOwned<'a, Owned = Self::DiffOwned> where Self: 'a;
+
+    /// Owned diff type.
+    type DiffOwned: for<'a> Semigroup<Self::Diff<'a>>;
+
+    /// Deconstruct an item into key and diff. Must be cheap.
+    fn into_parts(item: Self::Item<'_>) -> (Self::Key<'_>, Self::Diff<'_>);
+
+    /// Push an element to a compatible container.
+    ///
+    /// This function is odd to have, so let's explain why it exists. Ideally, the container
+    /// would accept a `(key, diff)` pair and we wouldn't need this function. However, we
+    /// might never be in a position where this is true: vectors can push any `T`, which would
+    /// collide with a specific implementation for pushing tuples of mixed GATs and owned types.
+    ///
+    /// For this reason, we expose a function here that takes a GAT key and an owned diff, and
+    /// leave it to the implementation to "patch" a suitable item that can be pushed into `self`.
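+    ///
+    /// For example, the `Vec<(D, T, R)>` implementation below simply reassembles the
+    /// triple and pushes it; a sketch of the pattern (mirroring that implementation):
+    ///
+    /// ```ignore
+    /// fn push_with_diff(&mut self, (data, time): Self::Key<'_>, diff: Self::DiffOwned) {
+    ///     self.push((data, time, diff));
+    /// }
+    /// ```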
+    fn push_with_diff(&mut self, key: Self::Key<'_>, diff: Self::DiffOwned);
+
+    /// Compare two items by key to sort containers.
+    fn cmp(item1: &Self::Item<'_>, item2: &Self::Item<'_>) -> Ordering;
+}
+
+impl<D, T, R> ConsolidateLayout for Vec<(D, T, R)>
+where
+    D: Ord + Clone + 'static,
+    T: Ord + Clone + 'static,
+    for<'a> R: Semigroup + IntoOwned<'a, Owned = R> + Clone + 'static,
+{
+    type Key<'a> = (D, T) where Self: 'a;
+    type Diff<'a> = R where Self: 'a;
+    type DiffOwned = R;
+
+    fn into_parts((data, time, diff): Self::Item<'_>) -> (Self::Key<'_>, Self::Diff<'_>) {
+        ((data, time), diff)
+    }
+
+    fn cmp(item1: &Self::Item<'_>, item2: &Self::Item<'_>) -> Ordering {
+        (&item1.0, &item1.1).cmp(&(&item2.0, &item2.1))
+    }
+
+    fn push_with_diff(&mut self, (data, time): Self::Key<'_>, diff: Self::DiffOwned) {
+        self.push((data, time, diff));
+    }
+}
+
+impl<K, V, T, R> ConsolidateLayout for FlatStack<TupleABCRegion<TupleABRegion<K, V>, T, R>>
+where
+    for<'a> K: Region + Push<<K as Region>::ReadItem<'a>> + Clone + 'static,
+    for<'a> K::ReadItem<'a>: Ord + Copy,
+    for<'a> V: Region + Push<<V as Region>::ReadItem<'a>> + Clone + 'static,
+    for<'a> V::ReadItem<'a>: Ord + Copy,
+    for<'a> T: Region + Push<<T as Region>::ReadItem<'a>> + Clone + 'static,
+    for<'a> T::ReadItem<'a>: Ord + Copy,
+    R: Region + Push<<R as Region>::Owned> + Clone + 'static,
+    for<'a> R::Owned: Semigroup<R::ReadItem<'a>>,
+{
+    type Key<'a> = (K::ReadItem<'a>, V::ReadItem<'a>, T::ReadItem<'a>) where Self: 'a;
+    type Diff<'a> = R::ReadItem<'a> where Self: 'a;
+    type DiffOwned = R::Owned;
+
+    fn into_parts(((key, val), time, diff): Self::Item<'_>) -> (Self::Key<'_>, Self::Diff<'_>) {
+        ((key, val, time), diff)
+    }
+
+    fn cmp(((key1, val1), time1, _diff1): &Self::Item<'_>, ((key2, val2), time2, _diff2): &Self::Item<'_>) -> Ordering {
+        (K::reborrow(*key1), V::reborrow(*val1), T::reborrow(*time1)).cmp(&(K::reborrow(*key2), V::reborrow(*val2), T::reborrow(*time2)))
+    }
+
+    fn push_with_diff(&mut self, (key, value, time): Self::Key<'_>, diff: Self::DiffOwned) {
+        self.copy(((key, value), time, diff));
+    }
+}
+
+/// Consolidate the supplied container.
+pub fn consolidate_container<C: ConsolidateLayout>(container: &mut C, target: &mut C) {
+    // Sort input data
+    let mut permutation = Vec::new();
+    permutation.extend(container.drain());
+    permutation.sort_by(|a, b| C::cmp(a, b));
+
+    // Consolidate sorted data.
+    let mut previous: Option<(C::Key<'_>, C::DiffOwned)> = None;
+    // TODO: We should ensure that `target` has sufficient capacity, but `Container` doesn't
+    // offer a suitable API.
+    for item in permutation.drain(..) {
+        let (key, diff) = C::into_parts(item);
+        match &mut previous {
+            // Initial iteration, remember key and diff.
+            // TODO: Opportunity for GatCow for diff.
+            None => previous = Some((key, diff.into_owned())),
+            Some((prevkey, d)) => {
+                // Second and following iterations, compare and accumulate or emit.
+                if key == *prevkey {
+                    // Keys match, keep accumulating.
+                    d.plus_equals(&diff);
+                } else {
+                    // Keys don't match, write down the result if non-zero.
+                    if !d.is_zero() {
+                        // Unwrap because we checked for `Some` above.
+                        let (prevkey, diff) = previous.take().unwrap();
+                        target.push_with_diff(prevkey, diff);
+                    }
+                    // Remember current key and diff as `previous`.
+                    previous = Some((key, diff.into_owned()));
+                }
+            }
+        }
+    }
+    // Write any residual data, if non-zero.
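+    // The loop above emits a key only once it encounters the key that follows it, so the
+    // last accumulated (key, diff) pair is still held in `previous` and must be flushed here.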
+    if let Some((previtem, d)) = previous {
+        if !d.is_zero() {
+            target.push_with_diff(previtem, d);
+        }
+    }
+}
+
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -308,4 +443,61 @@ mod tests {
             assert_eq!((i, 0, 1), collected[i]);
         }
     }
+
+    #[test]
+    fn test_consolidate_container() {
+        let mut data = vec![(1, 1, 1), (2, 1, 1), (1, 1, -1)];
+        let mut target = Vec::default();
+        data.sort();
+        consolidate_container(&mut data, &mut target);
+        assert_eq!(target, [(2, 1, 1)]);
+    }
+
+    #[cfg(not(debug_assertions))]
+    const LEN: usize = 256 << 10;
+    #[cfg(not(debug_assertions))]
+    const REPS: usize = 10 << 10;
+
+    #[cfg(debug_assertions)]
+    const LEN: usize = 256 << 1;
+    #[cfg(debug_assertions)]
+    const REPS: usize = 10 << 1;
+
+    #[test]
+    fn test_consolidator_duration() {
+        let mut data = Vec::with_capacity(LEN);
+        let mut data2 = Vec::with_capacity(LEN);
+        let mut target = Vec::new();
+        let mut duration = std::time::Duration::default();
+        for _ in 0..REPS {
+            data.clear();
+            data2.clear();
+            target.clear();
+            data.extend((0..LEN).map(|i| (i/4, 1, -2isize + ((i % 4) as isize))));
+            data2.extend((0..LEN).map(|i| (i/4, 1, -2isize + ((i % 4) as isize))));
+            data.sort_by(|x,y| x.0.cmp(&y.0));
+            let start = std::time::Instant::now();
+            consolidate_container(&mut data, &mut target);
+            duration += start.elapsed();
+
+            consolidate_updates(&mut data2);
+            assert_eq!(target, data2);
+        }
+        println!("elapsed consolidator {duration:?}");
+    }
+
+    #[test]
+    fn test_consolidator_duration_vec() {
+        let mut data = Vec::with_capacity(LEN);
+        let mut duration = std::time::Duration::default();
+        for _ in 0..REPS {
+            data.clear();
+            data.extend((0..LEN).map(|i| (i/4, 1, -2isize + ((i % 4) as isize))));
+            data.sort_by(|x,y| x.0.cmp(&y.0));
+            let start = std::time::Instant::now();
+            consolidate_updates(&mut data);
+            duration += start.elapsed();
+        }
+        println!("elapsed vec {duration:?}");
+    }
 }
diff --git a/src/trace/implementations/chunker.rs b/src/trace/implementations/chunker.rs
new file mode 100644
index 000000000..527a614d0
--- /dev/null
+++ b/src/trace/implementations/chunker.rs
@@ -0,0 +1,364 @@
+//! Organize streams of data into sorted chunks.
+
+use std::collections::VecDeque;
+use timely::communication::message::RefOrMut;
+use timely::Container;
+use timely::container::columnation::{Columnation, TimelyStack};
+use timely::container::{ContainerBuilder, PushInto, SizableContainer};
+use crate::consolidation::{consolidate_updates, consolidate_container, ConsolidateLayout};
+use crate::difference::Semigroup;
+
+/// Chunk a stream of vectors into chains of vectors.
+pub struct VecChunker<T> {
+    pending: Vec<T>,
+    ready: VecDeque<Vec<T>>,
+    empty: Option<Vec<T>>,
+}
+
+impl<T> Default for VecChunker<T> {
+    fn default() -> Self {
+        Self {
+            pending: Vec::default(),
+            ready: VecDeque::default(),
+            empty: None,
+        }
+    }
+}
+
+impl<K, V, T, R> VecChunker<((K, V), T, R)>
+where
+    K: Ord,
+    V: Ord,
+    T: Ord,
+    R: Semigroup,
+{
+    const BUFFER_SIZE_BYTES: usize = 8 << 10;
+    fn chunk_capacity() -> usize {
+        let size = ::std::mem::size_of::<((K, V), T, R)>();
+        if size == 0 {
+            Self::BUFFER_SIZE_BYTES
+        } else if size <= Self::BUFFER_SIZE_BYTES {
+            Self::BUFFER_SIZE_BYTES / size
+        } else {
+            1
+        }
+    }
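+
+    // Illustrative arithmetic (hypothetical update type, not from this file): for
+    // `((u64, u64), u64, i64)` an update occupies 32 bytes, so `chunk_capacity()`
+    // yields (8 << 10) / 32 = 256 updates per chunk.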
+
+    /// Form chunks out of pending data, if needed. This function is meant to be applied to
+    /// potentially full buffers, and ensures that if the buffer was full when called it is at most
+    /// half full when the function returns.
+    ///
+    /// `form_chunk` does the following:
+    /// * If pending is full, consolidate.
+    /// * If after consolidation it's more than half full, peel off chunks,
+    ///   leaving behind any partial chunk in pending.
+    fn form_chunk(&mut self) {
+        consolidate_updates(&mut self.pending);
+        if self.pending.len() >= Self::chunk_capacity() {
+            while self.pending.len() > Self::chunk_capacity() {
+                let mut chunk = Vec::with_capacity(Self::chunk_capacity());
+                chunk.extend(self.pending.drain(..chunk.capacity()));
+                self.ready.push_back(chunk);
+            }
+        }
+    }
+}
+
+impl<'a, K, V, T, R> PushInto<RefOrMut<'a, Vec<((K, V), T, R)>>> for VecChunker<((K, V), T, R)>
+where
+    K: Ord + Clone,
+    V: Ord + Clone,
+    T: Ord + Clone,
+    R: Semigroup + Clone,
+{
+    fn push_into(&mut self, container: RefOrMut<'a, Vec<((K, V), T, R)>>) {
+        // Ensure `self.pending` has the desired capacity. We should never have a larger capacity
+        // because we don't write more than capacity elements into the buffer.
+        // Important: Consolidation requires `pending` to have twice the chunk capacity to
+        // amortize its cost. Otherwise, it risks doing quadratic work.
+        if self.pending.capacity() < Self::chunk_capacity() * 2 {
+            self.pending.reserve(Self::chunk_capacity() * 2 - self.pending.len());
+        }
+
+        // `container` is either a shared reference or an owned allocation.
+        match container {
+            RefOrMut::Ref(vec) => {
+                let mut slice = &vec[..];
+                while !slice.is_empty() {
+                    let (head, tail) = slice.split_at(std::cmp::min(self.pending.capacity() - self.pending.len(), slice.len()));
+                    slice = tail;
+                    self.pending.extend_from_slice(head);
+                    if self.pending.len() == self.pending.capacity() {
+                        self.form_chunk();
+                    }
+                }
+            }
+            RefOrMut::Mut(vec) => {
+                let mut drain = vec.drain(..).peekable();
+                while drain.peek().is_some() {
+                    self.pending.extend((&mut drain).take(self.pending.capacity() - self.pending.len()));
+                    if self.pending.len() == self.pending.capacity() {
+                        self.form_chunk();
+                    }
+                }
+            }
+        }
+    }
+}
+
+impl<K, V, T, R> ContainerBuilder for VecChunker<((K, V), T, R)>
+where
+    K: Ord + Clone + 'static,
+    V: Ord + Clone + 'static,
+    T: Ord + Clone + 'static,
+    R: Semigroup + Clone + 'static,
+{
+    type Container = Vec<((K, V), T, R)>;
+
+    fn extract(&mut self) -> Option<&mut Self::Container> {
+        if let Some(ready) = self.ready.pop_front() {
+            self.empty = Some(ready);
+            self.empty.as_mut()
+        } else {
+            None
+        }
+    }
+
+    fn finish(&mut self) -> Option<&mut Self::Container> {
+        if !self.pending.is_empty() {
+            consolidate_updates(&mut self.pending);
+            while !self.pending.is_empty() {
+                let mut chunk = Vec::with_capacity(Self::chunk_capacity());
+                chunk.extend(self.pending.drain(..std::cmp::min(self.pending.len(), chunk.capacity())));
+                self.ready.push_back(chunk);
+            }
+        }
+        self.empty = self.ready.pop_front();
+        self.empty.as_mut()
+    }
+}
+
+/// Chunk a stream of vectors into chains of timely stacks.
+pub struct ColumnationChunker<T: Columnation> {
+    pending: Vec<T>,
+    ready: VecDeque<TimelyStack<T>>,
+    empty: Option<TimelyStack<T>>,
+}
+
+impl<T: Columnation> Default for ColumnationChunker<T> {
+    fn default() -> Self {
+        Self {
+            pending: Vec::default(),
+            ready: VecDeque::default(),
+            empty: None,
+        }
+    }
+}
+
+impl<K, V, T, R> ColumnationChunker<((K, V), T, R)>
+where
+    K: Columnation + Ord,
+    V: Columnation + Ord,
+    T: Columnation + Ord,
+    R: Columnation + Semigroup,
+{
+    const BUFFER_SIZE_BYTES: usize = 64 << 10;
+    fn chunk_capacity() -> usize {
+        let size = ::std::mem::size_of::<((K, V), T, R)>();
+        if size == 0 {
+            Self::BUFFER_SIZE_BYTES
+        } else if size <= Self::BUFFER_SIZE_BYTES {
+            Self::BUFFER_SIZE_BYTES / size
+        } else {
+            1
+        }
+    }
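+
+    // Unlike `VecChunker`, ready chunks are `TimelyStack`s: consolidated updates are
+    // copied into region-allocated columnar storage (`chunk.copy(..)`) rather than moved.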
+
+    /// Form chunks out of pending data, if needed. This function is meant to be applied to
+    /// potentially full buffers, and ensures that if the buffer was full when called it is at most
+    /// half full when the function returns.
+    ///
+    /// `form_chunk` does the following:
+    /// * If pending is full, consolidate.
+    /// * If after consolidation it's more than half full, peel off chunks,
+    ///   leaving behind any partial chunk in pending.
+    fn form_chunk(&mut self) {
+        consolidate_updates(&mut self.pending);
+        if self.pending.len() >= Self::chunk_capacity() {
+            while self.pending.len() > Self::chunk_capacity() {
+                let mut chunk = TimelyStack::with_capacity(Self::chunk_capacity());
+                for item in self.pending.drain(..chunk.capacity()) {
+                    chunk.copy(&item);
+                }
+                self.ready.push_back(chunk);
+            }
+        }
+    }
+}
+
+impl<'a, K, V, T, R> PushInto<RefOrMut<'a, Vec<((K, V), T, R)>>> for ColumnationChunker<((K, V), T, R)>
+where
+    K: Columnation + Ord + Clone,
+    V: Columnation + Ord + Clone,
+    T: Columnation + Ord + Clone,
+    R: Columnation + Semigroup + Clone,
+{
+    fn push_into(&mut self, container: RefOrMut<'a, Vec<((K, V), T, R)>>) {
+        // Ensure `self.pending` has the desired capacity. We should never have a larger capacity
+        // because we don't write more than capacity elements into the buffer.
+        if self.pending.capacity() < Self::chunk_capacity() * 2 {
+            self.pending.reserve(Self::chunk_capacity() * 2 - self.pending.len());
+        }
+
+        // `container` is either a shared reference or an owned allocation.
+        match container {
+            RefOrMut::Ref(vec) => {
+                let mut slice = &vec[..];
+                while !slice.is_empty() {
+                    let (head, tail) = slice.split_at(std::cmp::min(self.pending.capacity() - self.pending.len(), slice.len()));
+                    slice = tail;
+                    self.pending.extend_from_slice(head);
+                    if self.pending.len() == self.pending.capacity() {
+                        self.form_chunk();
+                    }
+                }
+            }
+            RefOrMut::Mut(vec) => {
+                let mut drain = vec.drain(..).peekable();
+                while drain.peek().is_some() {
+                    self.pending.extend((&mut drain).take(self.pending.capacity() - self.pending.len()));
+                    if self.pending.len() == self.pending.capacity() {
+                        self.form_chunk();
+                    }
+                }
+            }
+        }
+    }
+}
+
+impl<K, V, T, R> ContainerBuilder for ColumnationChunker<((K, V), T, R)>
+where
+    K: Columnation + Ord + Clone + 'static,
+    V: Columnation + Ord + Clone + 'static,
+    T: Columnation + Ord + Clone + 'static,
+    R: Columnation + Semigroup + Clone + 'static,
+{
+    type Container = TimelyStack<((K,V),T,R)>;
+
+    fn extract(&mut self) -> Option<&mut Self::Container> {
+        if let Some(ready) = self.ready.pop_front() {
+            self.empty = Some(ready);
+            self.empty.as_mut()
+        } else {
+            None
+        }
+    }
+
+    fn finish(&mut self) -> Option<&mut Self::Container> {
+        consolidate_updates(&mut self.pending);
+        while !self.pending.is_empty() {
+            let mut chunk = TimelyStack::with_capacity(Self::chunk_capacity());
+            for item in self.pending.drain(..std::cmp::min(self.pending.len(), chunk.capacity())) {
+                chunk.copy(&item);
+            }
+            self.ready.push_back(chunk);
+        }
+        self.empty = self.ready.pop_front();
+        self.empty.as_mut()
+    }
+}
+
+/// Chunk a stream of containers into chains of output containers.
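+///
+/// Items accumulate in a `pending` output container that is consolidated in place whenever
+/// it fills up; if it remains more than half full after consolidation, the whole container
+/// moves to the ready queue. Unlike the chunkers above, emitted containers may be less than
+/// full, because partial data cannot be extracted from `pending`.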
+pub struct ContainerChunker<Output> {
+    pending: Output,
+    ready: VecDeque<Output>,
+    empty: Output,
+}
+
+impl<Output> Default for ContainerChunker<Output>
+where
+    Output: Default,
+{
+    fn default() -> Self {
+        Self {
+            pending: Output::default(),
+            ready: VecDeque::default(),
+            empty: Output::default(),
+        }
+    }
+}
+
+impl<'a, Input, Output> PushInto<RefOrMut<'a, Input>> for ContainerChunker<Output>
+where
+    Input: Container,
+    Output: SizableContainer
+        + ConsolidateLayout
+        + PushInto<Input::Item<'a>>
+        + PushInto<Input::ItemRef<'a>>,
+{
+    fn push_into(&mut self, container: RefOrMut<'a, Input>) {
+        if self.pending.capacity() < Output::preferred_capacity() {
+            self.pending.reserve(Output::preferred_capacity() - self.pending.len());
+        }
+        let form_batch = |this: &mut Self| {
+            if this.pending.len() == this.pending.capacity() {
+                consolidate_container(&mut this.pending, &mut this.empty);
+                std::mem::swap(&mut this.pending, &mut this.empty);
+                this.empty.clear();
+                if this.pending.len() > this.pending.capacity() / 2 {
+                    // Note that we're pushing non-full containers, which is a deviation from
+                    // other implementations. The reason for this is that we cannot extract
+                    // partial data from `this.pending`. We should revisit this in the future.
+                    this.ready.push_back(std::mem::take(&mut this.pending));
+                }
+            }
+        };
+        match container {
+            RefOrMut::Ref(container) => {
+                for item in container.iter() {
+                    self.pending.push(item);
+                    form_batch(self);
+                }
+            }
+            RefOrMut::Mut(container) => {
+                for item in container.drain() {
+                    self.pending.push(item);
+                    form_batch(self);
+                }
+            }
+        }
+    }
+}
+
+impl<Output> ContainerBuilder for ContainerChunker<Output>
+where
+    Output: SizableContainer + ConsolidateLayout,
+{
+    type Container = Output;
+
+    fn extract(&mut self) -> Option<&mut Self::Container> {
+        if let Some(ready) = self.ready.pop_front() {
+            self.empty = ready;
+            Some(&mut self.empty)
+        } else {
+            None
+        }
+    }
+
+    fn finish(&mut self) -> Option<&mut Self::Container> {
+        if !self.pending.is_empty() {
+            consolidate_container(&mut self.pending, &mut self.empty);
+            std::mem::swap(&mut self.pending, &mut self.empty);
+            self.empty.clear();
+            if !self.pending.is_empty() {
+                self.ready.push_back(std::mem::take(&mut self.pending));
+            }
+        }
+        if let Some(ready) = self.ready.pop_front() {
+            self.empty = ready;
+            Some(&mut self.empty)
+        } else {
+            None
+        }
+    }
+}
diff --git a/src/trace/implementations/merge_batcher.rs b/src/trace/implementations/merge_batcher.rs
index bb13cf650..cd4e7e72a 100644
--- a/src/trace/implementations/merge_batcher.rs
+++ b/src/trace/implementations/merge_batcher.rs
@@ -1,6 +1,7 @@
 //! A general purpose `Batcher` implementation based on radix sort.
 
 use std::collections::VecDeque;
+use std::marker::PhantomData;
 
 use timely::communication::message::RefOrMut;
 use timely::logging::WorkerIdentifier;
@@ -8,17 +9,18 @@ use timely::logging_core::Logger;
 use timely::progress::frontier::AntichainRef;
 use timely::progress::{frontier::Antichain, Timestamp};
 use timely::{Container, PartialOrder};
+use timely::container::{ContainerBuilder, PushInto};
 
-use crate::consolidation::consolidate_updates;
 use crate::difference::Semigroup;
 use crate::logging::{BatcherEvent, DifferentialEvent};
 use crate::trace::{Batcher, Builder};
 use crate::Data;
 
 /// Creates batches from unordered tuples.
-pub struct MergeBatcher
+pub struct MergeBatcher
 where
-    M: Merger,
+    C: ContainerBuilder + Default,
+    M: Merger