diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml
index 68a36b4d0..155d5107a 100644
--- a/.github/workflows/cmake.yml
+++ b/.github/workflows/cmake.yml
@@ -88,7 +88,7 @@ jobs:
       # and build directories, but this is only available with CMake 3.13 and higher.  
       # The CMake binaries on the Github Actions machines are (as of this writing) 3.12
       run: |
-        cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE $BUILD_CONFIG || (cat CMakeFiles/CMakeOutput.log && cat CMakeFiles/CMakeError.log)
+        cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE $BUILD_CONFIG || (cat CMakeFiles/CMakeConfigureLog.yaml)
 
     - name: Build
       working-directory: ${{github.workspace}}/build
@@ -113,11 +113,11 @@ jobs:
       working-directory: ${{github.workspace}}/build
       shell: bash
       run: |
-        cmake -S $GITHUB_WORKSPACE/doc/dox/dev/devsamp/helloworld -B test_install_devsamp_helloworld -DCMAKE_PREFIX_PATH=${{github.workspace}}/install || (cat test_install_devsamp_helloworld/CMakeFiles/CMakeOutput.log && cat test_install_devsamp_helloworld/CMakeFiles/CMakeError.log)
+        cmake -S $GITHUB_WORKSPACE/doc/dox/dev/devsamp/helloworld -B test_install_devsamp_helloworld -DCMAKE_PREFIX_PATH=${{github.workspace}}/install || (cat /home/runner/work/ttg/ttg/install/lib/cmake/ttg/ttg-config.cmake && test_install_devsamp_helloworld/CMakeFiles/CMakeConfigureLog.yaml)
         cmake --build test_install_devsamp_helloworld
         $MPIEXEC -n 2 test_install_devsamp_helloworld/helloworld-parsec
         $MPIEXEC -n 2 test_install_devsamp_helloworld/helloworld-mad
-        cmake -S $GITHUB_WORKSPACE/doc/dox/dev/devsamp/fibonacci -B test_install_devsamp_fibonacci -DCMAKE_PREFIX_PATH=${{github.workspace}}/install || (cat test_install_devsamp_fibonacci/CMakeFiles/CMakeOutput.log && cat test_install_devsamp_fibonacci/CMakeFiles/CMakeError.log)
+        cmake -S $GITHUB_WORKSPACE/doc/dox/dev/devsamp/fibonacci -B test_install_devsamp_fibonacci -DCMAKE_PREFIX_PATH=${{github.workspace}}/install || (cat /home/runner/work/ttg/ttg/install/lib/cmake/ttg/ttg-config.cmake && cat test_install_devsamp_fibonacci/CMakeFiles/CMakeConfigureLog.yaml)
         cmake --build test_install_devsamp_fibonacci
         $MPIEXEC -n 2 test_install_devsamp_fibonacci/fibonacci-parsec
         cmake -E make_directory test_install_userexamples
@@ -130,7 +130,7 @@ jobs:
         add_ttg_executable(iterative $GITHUB_WORKSPACE/doc/dox/user/examples/iterative.cc NOT_EXCLUDE_FROM_ALL)
         add_ttg_executable(distributed $GITHUB_WORKSPACE/doc/dox/user/examples/distributed.cc NOT_EXCLUDE_FROM_ALL)
         EOF
-        cmake -S test_install_userexamples -B test_install_userexamples/build -DCMAKE_PREFIX_PATH=${{github.workspace}}/install || (cat test_install_userexamples/CMakeFiles/CMakeOutput.log && cat test_install_userexamples/CMakeFiles/CMakeError.log)
+        cmake -S test_install_userexamples -B test_install_userexamples/build -DCMAKE_PREFIX_PATH=${{github.workspace}}/install || (cat /home/runner/work/ttg/ttg/install/lib/cmake/ttg/ttg-config.cmake && cat test_install_devsamp_fibonacci/CMakeFiles/CMakeConfigureLog.yaml)
         cmake --build test_install_userexamples/build
 
     - name: Build+Deploy Dox
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5406877c8..63b0d2bfa 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -60,7 +60,7 @@ option(TTG_EXAMPLES "Whether to build examples" OFF)
 option(TTG_ENABLE_ASAN "Whether to enable address sanitizer" OFF)
 
 option(TTG_ENABLE_COROUTINES "Whether to enable C++ coroutines, needed for accelerator device support" ON)
-option(TTG_FETCH_BOOST "Whether to fetch+build Boost, if missing" OFF)
+option(TTG_FETCH_BOOST "Whether to fetch+build Boost, if missing" ON)
 option(TTG_IGNORE_BUNDLED_EXTERNALS "Whether to skip installation and use of bundled external dependencies (Boost.CallableTraits)" OFF)
 option(TTG_ENABLE_TRACE "Whether to enable ttg::trace() output" OFF)
 # See https://medium.com/@alasher/colored-c-compiler-output-with-ninja-clang-gcc-10bfe7f2b949
diff --git a/INSTALL.md b/INSTALL.md
index b8974f927..b9497ac0a 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -63,14 +63,14 @@ TTG includes several examples that may require additional prerequisites. These a
 
 ## useful cmake cache variables:
 
-| Variable                             |Default             | Description                                                                                                                                                                                                                                  |
-|--------------------------------------|--------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| `TTG_ENABLE_CUDA`                    | `OFF`              | whether to enable CUDA device support                                                                                                                                                                                                        |
-| `TTG_ENABLE_HIP`                     | `OFF`              | whether to enable HIP/ROCm device support                                                                                                                                                                                                    |
-| `TTG_ENABLE_LEVEL_ZERO`              | `OFF`              | whether to enable Intel oneAPI Level Zero device support                                                                                                                                                                                     |
-| `BUILD_TESTING`                      | `ON`               | whether target `check-ttg` and its relatives will actually build and run unit tests                                                                                                                                                          |
-| `TTG_EXAMPLES`                       | `OFF`              | whether target `check-ttg` and its relatives will actually build and run examples; setting this to `ON` will cause detection of several optional prerequisites, and (if missing) building from source                                        |
-| `TTG_ENABLE_TRACE`                   | `OFF`              | setting this to `ON` will enable the ability to instrument TTG code for tracing (see `ttg::trace()`, etc.); if this is set to `OFF`, `ttg::trace()` is a no-op                                                                               |
-| `TTG_PARSEC_USE_BOOST_SERIALIZATION` | `OFF`       | whether to use Boost.Serialization for serialization for the PaRSEC backend; if this is set to `OFF`, PaRSEC backend will only be able to use trivially-copyable data types or, if MADNESS backend is available, MADNESS-serializable types. |
-| `TTG_FETCH_BOOST`                    | `OFF`              | whether to download and build Boost automatically, if missing                                                                                                                                                                                |
-| `TTG_IGNORE_BUNDLED_EXTERNALS`       | `OFF`              | whether to install and use bundled external dependencies (currently, only Boost.CallableTraits)                                                                                                                                              |
+| Variable                             | Default | Description                                                                                                                                                                                                                                  |
+|--------------------------------------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| `TTG_ENABLE_CUDA`                    | `OFF`   | whether to enable CUDA device support                                                                                                                                                                                                        |
+| `TTG_ENABLE_HIP`                     | `OFF`   | whether to enable HIP/ROCm device support                                                                                                                                                                                                    |
+| `TTG_ENABLE_LEVEL_ZERO`              | `OFF`   | whether to enable Intel oneAPI Level Zero device support                                                                                                                                                                                     |
+| `BUILD_TESTING`                      | `ON`    | whether target `check-ttg` and its relatives will actually build and run unit tests                                                                                                                                                          |
+| `TTG_EXAMPLES`                       | `OFF`   | whether target `check-ttg` and its relatives will actually build and run examples; setting this to `ON` will cause detection of several optional prerequisites, and (if missing) building from source                                        |
+| `TTG_ENABLE_TRACE`                   | `OFF`   | setting this to `ON` will enable the ability to instrument TTG code for tracing (see `ttg::trace()`, etc.); if this is set to `OFF`, `ttg::trace()` is a no-op                                                                               |
+| `TTG_PARSEC_USE_BOOST_SERIALIZATION` | `OFF`   | whether to use Boost.Serialization for serialization for the PaRSEC backend; if this is set to `OFF`, PaRSEC backend will only be able to use trivially-copyable data types or, if MADNESS backend is available, MADNESS-serializable types. |
+| `TTG_FETCH_BOOST`                    | `ON`    | whether to download and build Boost automatically, if missing                                                                                                                                                                                |
+| `TTG_IGNORE_BUNDLED_EXTERNALS`       | `OFF`   | whether to install and use bundled external dependencies (currently, only Boost.CallableTraits)                                                                                                                                              |
diff --git a/cmake/modules/ExternalDependenciesVersions.cmake b/cmake/modules/ExternalDependenciesVersions.cmake
index 1429de0bf..49f66fe0a 100644
--- a/cmake/modules/ExternalDependenciesVersions.cmake
+++ b/cmake/modules/ExternalDependenciesVersions.cmake
@@ -1,12 +1,17 @@
 # for each dependency track both current and previous id (the variable for the latter must contain PREVIOUS)
 # to be able to auto-update them
 
-set(TTG_TRACKED_VG_CMAKE_KIT_TAG 092efee765e039b02e0a9aaf013c12fc3c4e89cf)  # used to provide "real" FindOrFetchBoost
+set(TTG_TRACKED_VG_CMAKE_KIT_TAG d1b34157c349cf0a7c2f149b7704a682d53f6486)  # provides FindOrFetchLinalgPP and "real" FindOrFetchBoost
 set(TTG_TRACKED_CATCH2_VERSION 3.5.0)
-set(TTG_TRACKED_MADNESS_TAG 2eb3bcf0138127ee2dbc651f1aabd3e9b0def4e3)
+set(TTG_TRACKED_MADNESS_TAG 93a9a5cec2a8fa87fba3afe8056607e6062a9058)
 set(TTG_TRACKED_PARSEC_TAG 58f8f3089ecad2e8ee50e80a9586e05ce8873b1c)
-set(TTG_TRACKED_BTAS_TAG 4e8f5233aa7881dccdfcc37ce07128833926d3c2)
-set(TTG_TRACKED_TILEDARRAY_TAG 493c109379a1b64ddd5ef59f7e33b95633b68d73)
+set(TTG_TRACKED_BTAS_TAG c25b0a11d2a76190bfb13fa72f9e9dc3e57c3c2f)
+set(TTG_TRACKED_TILEDARRAY_TAG 5944bdba3266a3fa19f1809c8e2accf3dad4d815)
 
 # need Boost.CallableTraits (header only, part of Boost 1.66 released in Dec 2017) for wrap.h to work
-set(TTG_OLDEST_BOOST_VERSION 1.66)
+# BUT if will be building examples, inherit the oldest version from the pickiest Boost consumer (TA and/or BSPMM)
+if (TTG_EXAMPLES)
+  set(TTG_OLDEST_BOOST_VERSION 1.81)
+else()
+  set(TTG_OLDEST_BOOST_VERSION 1.66)
+endif()
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index c34e3c07e..b5d1ff622 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -20,7 +20,7 @@ if (TARGET tiledarray)
     add_ttg_executable(testing_dpoinv potrf/testing_dpoinv.cc LINK_LIBRARIES tiledarray lapackpp)
 
     if (TARGET CUDA::cublas)
-      add_ttg_executable(bspmm-cuda spmm/spmm_cuda.cc
+      add_ttg_executable(bspmm-cuda spmm/spmm.cc
                          LINK_LIBRARIES tiledarray TiledArray_Eigen BTAS CUDA::cublas
                          COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2;TTG_ENABLE_CUDA=1
                          RUNTIMES "parsec")
@@ -32,7 +32,7 @@ if (TARGET tiledarray)
                           RUNTIMES "parsec")
       endif(TARGET CUDA::cusolver)
     elseif (TARGET roc::hipblas)
-      add_ttg_executable(bspmm-hip spmm/spmm_cuda.cc
+      add_ttg_executable(bspmm-hip spmm/spmm.cc
                           LINK_LIBRARIES tiledarray TiledArray_Eigen roc::hipblas
                           COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2;TTG_ENABLE_HIP=1
                           RUNTIMES "parsec")
@@ -43,7 +43,7 @@ if (TARGET tiledarray)
                            RUNTIMES "parsec")
       endif(TARGET roc::hipsolver)
     elseif (TARGET MKL::MKL_DPCPP)
-      add_ttg_executable(bspmm-lz spmm/spmm_cuda.cc
+      add_ttg_executable(bspmm-lz spmm/spmm.cc
                           LINK_LIBRARIES tiledarray TiledArray_Eigen BTAS MKL::MKL_DPCPP level_zero::ze_loader m
                           COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2;TTG_ENABLE_LEVEL_ZERO=1
                           RUNTIMES "parsec")
diff --git a/examples/matrixtile.h b/examples/matrixtile.h
index a759b1c3c..203c58bf2 100644
--- a/examples/matrixtile.h
+++ b/examples/matrixtile.h
@@ -10,23 +10,27 @@
 #include <ttg/serialization/splitmd_data_descriptor.h>
 
 
-#include <TiledArray/device/allocators.h>
+#include <TiledArray/external/device.h>
 #if defined(TILEDARRAY_HAS_DEVICE)
-#define ALLOCATOR TiledArray::device_pinned_allocator<T>
+template<typename T>
+using Allocator = TiledArray::device_pinned_allocator<T>;
 
 inline void allocator_init(int argc, char **argv) {
   // initialize MADNESS so that TA allocators can be created
 #if defined(TTG_PARSEC_IMPORTED)
   madness::ParsecRuntime::initialize_with_existing_context(ttg::default_execution_context().impl().context());
-#endif // TTG_PARSEC_IMPORTED
   madness::initialize(argc, argv, /* nthread = */ 1, /* quiet = */ true);
+#endif // TTG_PARSEC_IMPORTED
 }
 
 inline void allocator_fini() {
+#if defined(TTG_PARSEC_IMPORTED)
   madness::finalize();
+#endif // TTG_PARSEC_IMPORTED
 }
 #else  // TILEDARRAY_HAS_DEVICE
-#define ALLOCATOR std::allocator<T>
+template<typename T>
+using Allocator = std::allocator<T>;
 
 inline void allocator_init(int argc, char **argv) { }
 
@@ -34,13 +38,13 @@ inline void allocator_fini() { }
 
 #endif // TILEDARRAY_HAS_DEVICE
 
-template <typename T, class Allocator = ALLOCATOR>
-class MatrixTile : public ttg::TTValue<MatrixTile<T, Allocator>> {
+template <typename T, class AllocatorT = Allocator<T>>
+class MatrixTile : public ttg::TTValue<MatrixTile<T, AllocatorT>> {
  public:
   using metadata_t = typename std::tuple<std::size_t, std::size_t, std::size_t>;
 
-  using buffer_t  = typename ttg::Buffer<T, Allocator>;
-  using ttvalue_type = ttg::TTValue<MatrixTile<T, Allocator>>;
+  using buffer_t  = typename ttg::Buffer<T, AllocatorT>;
+  using ttvalue_type = ttg::TTValue<MatrixTile<T, AllocatorT>>;
 
  private:
   buffer_t _buffer;
@@ -87,15 +91,15 @@ class MatrixTile : public ttg::TTValue<MatrixTile<T, Allocator>> {
   , _lda(lda)
   { }
 
-  MatrixTile(MatrixTile<T, Allocator>&& other) = default;
+  MatrixTile(MatrixTile<T, AllocatorT>&& other) = default;
 
-  MatrixTile& operator=(MatrixTile<T, Allocator>&& other) = default;
+  MatrixTile& operator=(MatrixTile<T, AllocatorT>&& other) = default;
 
   /* Deep copy ctor und op are not needed for PO since tiles will never be read
    * and written concurrently. Hence shallow copies are enough, will all
    * receiving tasks sharing tile data. Re-enable this once the PaRSEC backend
    * can handle data sharing without excessive copying */
-  MatrixTile(const MatrixTile<T, Allocator>& other)
+  MatrixTile(const MatrixTile<T, AllocatorT>& other)
   : ttvalue_type()
   , _buffer(other._lda*other._cols)
   , _rows(other._rows)
@@ -108,7 +112,7 @@ class MatrixTile : public ttg::TTValue<MatrixTile<T, Allocator>> {
     std::copy_n(other.data(), _lda * _cols, this->data());
   }
 
-  MatrixTile& operator=(const MatrixTile<T, Allocator>& other) {
+  MatrixTile& operator=(const MatrixTile<T, AllocatorT>& other) {
     this->_rows = other._rows;
     this->_cols = other._cols;
     this->_lda = other._lda;
@@ -166,7 +170,7 @@ class MatrixTile : public ttg::TTValue<MatrixTile<T, Allocator>> {
   }
 #endif // DEBUG_TILES_VALUES
 
-  friend std::ostream& operator<<(std::ostream& o, MatrixTile<T> const& tt) {
+  friend std::ostream& operator<<(std::ostream& o, MatrixTile<T, AllocatorT> const& tt) {
     auto ptr = tt.data();
     o << std::endl << " ";
     o << "MatrixTile<" << typeid(T).name() << ">{ rows=" << tt.rows() << " cols=" << tt.cols() << " ld=" << tt.lda();
diff --git a/examples/potrf/potrf.h b/examples/potrf/potrf.h
index d3a92cb2f..4e5e3c194 100644
--- a/examples/potrf/potrf.h
+++ b/examples/potrf/potrf.h
@@ -95,7 +95,7 @@ namespace potrf {
                   ttg::Edge<Key2, MatrixTile<typename MatrixT::element_type>>& output_result) {
     using T = typename MatrixT::element_type;
 #if defined(ENABLE_DEVICE_KERNEL)
-    auto iallocator = std::make_shared<TiledArray::device_pinned_allocator<int>>();
+    auto iallocator = std::make_shared<Allocator<int>>();
     //std::cout << "Creating CUDA POTRF task " << std::endl;
     auto f_dev = [=, iallocator = std::move(iallocator)]
                     (const Key1& key, MatrixTile<T>&& tile_kk,
@@ -669,7 +669,8 @@ namespace potrf {
 
   template <typename MatrixT>
   auto make_potrf_ttg(MatrixT& A, ttg::Edge<Key2, MatrixTile<typename MatrixT::element_type>>& input,
-                      ttg::Edge<Key2, MatrixTile<typename MatrixT::element_type>>& output, bool defer_write) {
+                      ttg::Edge<Key2, MatrixTile<typename MatrixT::element_type>>& output, bool defer_write,
+                      bool enable_device_map = true) {
     using T = typename MatrixT::element_type;
     auto keymap1 = [&](const Key1& key) { return A.rank_of(key[0], key[0]); };
 
@@ -705,28 +706,36 @@ namespace potrf {
     tt_potrf->set_keymap(keymap1);
     tt_potrf->set_defer_writer(defer_write);
 #ifdef ENABLE_DEVICE_KERNEL
-    tt_potrf->set_devicemap(devmap1);
+    if (enable_device_map) {
+      tt_potrf->set_devicemap(devmap1);
+    }
 #endif // 0
 
     auto tt_trsm = make_trsm(A, disp_trsm, potrf_trsm, gemm_trsm, trsm_syrk, trsm_gemm_row, trsm_gemm_col, output);
     tt_trsm->set_keymap(keymap2a);
     tt_trsm->set_defer_writer(defer_write);
 #ifdef ENABLE_DEVICE_KERNEL
-    tt_trsm->set_devicemap(devmap2a);
+    if (enable_device_map) {
+      tt_trsm->set_devicemap(devmap2a);
+    }
 #endif // 0
 
     auto tt_syrk = make_syrk(A, disp_syrk, trsm_syrk, syrk_syrk, syrk_potrf, syrk_syrk);
     tt_syrk->set_keymap(keymap2b);
     tt_syrk->set_defer_writer(defer_write);
 #ifdef ENABLE_DEVICE_KERNEL
-    tt_syrk->set_devicemap(devmap2b);
+    if (enable_device_map) {
+      tt_syrk->set_devicemap(devmap2b);
+    }
 #endif // 0
 
     auto tt_gemm = make_gemm(A, disp_gemm, trsm_gemm_row, trsm_gemm_col, gemm_gemm, gemm_trsm, gemm_gemm);
     tt_gemm->set_keymap(keymap3);
     tt_gemm->set_defer_writer(defer_write);
 #ifdef ENABLE_DEVICE_KERNEL
-    tt_gemm->set_devicemap(devmap3);
+    if (enable_device_map) {
+      tt_gemm->set_devicemap(devmap3);
+    }
 #endif // 0
 
     /* Priorities taken from DPLASMA */
diff --git a/examples/potrf/testing_dpotrf.cc b/examples/potrf/testing_dpotrf.cc
index 2a5f6dacb..781cc417c 100644
--- a/examples/potrf/testing_dpotrf.cc
+++ b/examples/potrf/testing_dpotrf.cc
@@ -61,6 +61,9 @@ int main(int argc, char **argv)
   bool check = !cmdOptionExists(argv+1, argv+argc, "-x");
   bool cow_hint = !cmdOptionExists(argv+1, argv+argc, "-w");
 
+  /* whether we set a device mapping */
+  bool enable_device_map = !cmdOptionExists(argv, argv+argc, "--default-device-map");
+
   // TODO: need to filter out our arguments to make parsec happy
   ttg::initialize(1, argv, nthreads);
 
@@ -130,7 +133,7 @@ int main(int argc, char **argv)
       init_tt->set_keymap([&]() {return world.rank();});
 
       auto plgsy_ttg = make_plgsy_ttg(A, N, random_seed, startup, topotrf, cow_hint);
-      auto potrf_ttg = potrf::make_potrf_ttg(A, topotrf, result, cow_hint);
+      auto potrf_ttg = potrf::make_potrf_ttg(A, topotrf, result, cow_hint, enable_device_map);
       auto result_ttg = make_result_ttg(A, result, cow_hint);
 
       auto connected = make_graph_executable(init_tt.get());
diff --git a/examples/spmm/devicegemm.h b/examples/spmm/devicegemm.h
new file mode 100644
index 000000000..bdfc99562
--- /dev/null
+++ b/examples/spmm/devicegemm.h
@@ -0,0 +1,81 @@
+
+#if defined(TTG_ENABLE_LEVEL_ZERO)
+#include <oneapi/mkl.hpp>
+#include <sys/time.h>
+#endif
+
+#include "../devblas_helper.h"
+
+
+template <typename Blk>
+inline void device_gemm(Blk &C, const Blk &A, const Blk &B) {
+  using blk_t = Blk;
+  using T = typename blk_t::value_type;
+  static_assert(std::is_same_v<T,double> || std::is_same_v<T,float>);
+  static const T alpha = 1.0;
+  static const T beta  = 1.0;
+  // make sure all memory is on the device
+  // TODO: A and B are read-only so the owner device will be 0. How to fix?
+  //assert(A.b.get_current_device() != 0);
+  //assert(B.b.get_current_device() != 0);
+  auto device = ttg::device::current_device();
+  assert(device.is_device());
+#if defined(TTG_ENABLE_CUDA)
+  if constexpr (std::is_same_v<T,double>) {
+      cublasDgemm(cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, C.extent(0), C.extent(1), A.extent(1),
+                  &alpha, A.b.current_device_ptr(), A.extent(0), B.b.current_device_ptr(), B.extent(0), &beta,
+                  C.b.current_device_ptr(), C.extent(0));
+  }
+  else if constexpr (std::is_same_v<T,float>) {
+      cublasSgemm(cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, C.extent(0), C.extent(1), A.extent(1),
+                  &alpha, A.b.current_device_ptr(), A.extent(0), B.b.current_device_ptr(), B.extent(0), &beta,
+                  C.b.current_device_ptr(), C.extent(0));
+  }
+#elif defined(TTG_ENABLE_HIP)
+  if constexpr (std::is_same_v<T,double>) {
+    hipblasDgemm(hipblas_handle(),
+                 HIPBLAS_OP_N, HIPBLAS_OP_N,
+                 C.extent(0), C.extent(1), A.extent(1), &alpha,
+                 A.b.current_device_ptr(), A.extent(0),
+                 B.b.current_device_ptr(), B.extent(0), &beta,
+                 C.b.current_device_ptr(), C.extent(0));
+  } else if constexpr (std::is_same_v<T,float>) {
+    hipblasSgemm(hipblas_handle(),
+                 HIPBLAS_OP_N, HIPBLAS_OP_N,
+                 C.extent(0), C.extent(1), A.extent(1), &alpha,
+                 A.b.current_device_ptr(), A.extent(0),
+                 B.b.current_device_ptr(), B.extent(0), &beta,
+                 C.b.current_device_ptr(), C.extent(0));
+  }
+#elif defined(TTG_ENABLE_LEVEL_ZERO)
+
+#if defined(DEBUG_SYNCHRONOUS)
+  try {
+#endif /* DEBUG_SYNCHRONOUS */
+    cl::sycl::event gemm_event;
+    gemm_event = oneapi::mkl::blas::gemm(ttg::device::current_stream(),
+         oneapi::mkl::transpose::N, oneapi::mkl::transpose::N,
+         C.extent(0), C.extent(1), A.extent(1),
+         alpha, A.b.current_device_ptr(), A.extent(0),
+                B.b.current_device_ptr(), B.extent(0),
+         beta,  C.b.current_device_ptr(), C.extent(0));
+#if defined(DEBUG_SYNCHRONOUS)
+    gemm_event.wait();
+  } catch (const oneapi::mkl::invalid_argument &e) {
+    std::cerr << "OneAPI MKL BLAS GEMM throws invalid argument exception" << std::endl;
+  } catch (const oneapi::mkl::unsupported_device &e) {
+    std::cerr << "OneAPI MKL BLAS GEMM throws unsuported device exception" << std::endl;
+  } catch (const oneapi::mkl::host_bad_alloc &e) {
+    std::cerr << "OneAPI MKL BLAS GEMM throws host bad allocation exception" << std::endl;
+  } catch (const oneapi::mkl::device_bad_alloc &e) {
+    std::cerr << "OneAPI MKL BLAS GEMM throws device bad allocation exception" << std::endl;
+  } catch (const oneapi::mkl::unimplemented &e) {
+    std::cerr << "OneAPI MKL BLAS GEMM throws unimplemented exception" << std::endl;
+  } catch (const std::exception& e) {
+    std::cerr << "OneAPI MKL BLAS GEMM throws unexpected exception" << std::endl;
+  } catch (...) {
+    std::cerr << "OneAPI MKL BLAS GEMM throws unexpected exception that is also badly formatted..." << std::endl;
+  }
+#endif /* DEBUG_SYNCHRONOUS */
+#endif
+}
diff --git a/examples/spmm/devicetensor.h b/examples/spmm/devicetensor.h
new file mode 100644
index 000000000..d5a0f6d9f
--- /dev/null
+++ b/examples/spmm/devicetensor.h
@@ -0,0 +1,218 @@
+#ifndef HAVE_DEVICETENSOR_H
+#define HAVE_DEVICETENSOR_H
+
+#include <ttg.h>
+
+#if __has_include(<btas/features.h>)
+#pragma message("C Preprocessor got here!")
+#include <btas/features.h>
+#ifdef BTAS_IS_USABLE
+#include <btas/btas.h>
+#include <btas/optimize/contract.h>
+#include <btas/util/mohndle.h>
+#include <TiledArray/external/device.h>
+#include "../devblas_helper.h"
+#include <madness/world/parsec.h>  // need to initialize MADNESS purely for the purposes of TA allocators
+#else
+#warning "found btas/features.h but Boost.Iterators is missing, hence BTAS is unusable ... add -I/path/to/boost"
+#endif
+#endif
+
+#if defined(BTAS_IS_USABLE)
+
+/**
+ * Derives from btas::Tensor and wraps a ttg::Buffer
+ * to enable device support in SPMM. The ttg::Buffer
+ * does not own the host memory but mananages the device
+ * memory.
+ */
+template <typename _T, class _Range, class _Storage>
+struct DeviceTensor : public ttg::TTValue<DeviceTensor<_T, _Range, _Storage>>
+                    , public btas::Tensor<_T, _Range, _Storage> {
+  using tensor_type = typename btas::Tensor<_T, _Range, _Storage>;
+  using ttvalue_type = typename ttg::TTValue<DeviceTensor<_T, _Range, _Storage>>;
+  ttg::Buffer<_T> b; // does not own the host buffer
+
+  using value_type = typename tensor_type::value_type;
+  using size_type = typename tensor_type::size_type;
+  using storage_type = typename tensor_type::storage_type;
+  using range_type = typename tensor_type::range_type;
+
+
+   public:
+    DeviceTensor() = default;
+    ~DeviceTensor() = default;
+
+    /// constructor with index extent
+    template <typename... _args>
+    explicit DeviceTensor(const size_type& first, const _args&... rest)
+    : ttvalue_type()
+    , tensor_type(first, rest...)
+    , b(this->size() ? this->data() : nullptr, this->size())
+    { }
+
+    /// construct from \c range, allocate data, but not initialized
+    template <typename Range>
+    explicit DeviceTensor(const Range& range, typename std::enable_if<btas::is_boxrange<Range>::value>::type* = 0)
+    : ttvalue_type()
+    , tensor_type(range)
+    , b(this->size() ? this->data() : nullptr, this->size())
+    { }
+
+    /// construct from \c range object, set all elements to \c v
+    template <typename Range>
+    DeviceTensor(const Range& range, value_type v, typename std::enable_if<btas::is_boxrange<Range>::value>::type* = 0)
+    : ttvalue_type()
+    , tensor_type(range)
+    , b(this->size() ? this->data() : nullptr, this->size())
+    { }
+
+    /// construct from \c range object, copy elements from \c vec
+    template <typename Range, typename U>
+    DeviceTensor(const Range& range, U* vec, typename std::enable_if<btas::is_boxrange<Range>::value>::type* = 0)
+    : ttvalue_type()
+    , tensor_type(range, vec)
+    , b(this->size() ? this->data() : nullptr, this->size())
+    { }
+
+    /// construct from \c range and \c storage
+    template <typename Range, typename Storage>
+    DeviceTensor(const Range& range, const Storage& storage,
+           typename std::enable_if<btas::is_boxrange<Range>::value & not std::is_same<Range, range_type>::value &
+                                   not std::is_same<Storage, storage_type>::value>::type* = 0)
+    : ttvalue_type()
+    , tensor_type(range, storage)
+    , b(this->size() ? this->data() : nullptr, this->size())
+    { }
+
+    /// copy-copy-construct from \c range and \c storage
+    DeviceTensor(const range_type& range, const storage_type& storage)
+    : ttvalue_type()
+    , tensor_type(range, storage)
+    , b(this->size() ? this->data() : nullptr, this->size())
+    { }
+
+    /// copy-move-construct from \c range and \c storage
+    DeviceTensor(const range_type& range, storage_type&& storage)
+    : ttvalue_type()
+    , tensor_type(range, std::forward<storage_type>(storage))
+    , b(this->size() ? this->data() : nullptr, this->size())
+    { }
+
+    /// move-construct from \c range and \c storage
+    DeviceTensor(range_type&& range, storage_type&& storage)
+    : ttvalue_type()
+    , tensor_type(std::forward<range_type>(range), std::forward<storage_type>(storage))
+    , b(this->size() ? this->data() : nullptr, this->size())
+    { }
+
+    /// Construct an evaluated tensor
+
+    /// This constructor will allocate memory for \c range.area() elements. Each element
+    /// will be initialized as:
+    /// \code
+    ///   for(auto&& idx: range)
+    ///     (*this)[idx] = op(*(it++));
+    /// \endcode
+    /// \tparam Range An input Range type.
+    /// \tparam InIter An input iterator type.
+    /// \tparam Op A unary operation type
+    /// \param range the input range type
+    /// \param first An input iterator for the argument
+    /// \param op The unary operation to be applied to the argument data
+    template <typename Range, typename InIter, typename Op>
+    DeviceTensor(const Range& range, InIter it, const Op& op,
+           typename std::enable_if<btas::is_boxrange<Range>::value>::type* = 0)
+    : ttvalue_type()
+    , tensor_type(range, it, op)
+    , b(this->size() ? this->data() : nullptr, this->size())
+    { }
+
+    /// copy constructor
+    /// It will accept Tensors and TensorViews
+    template <class _Tensor, class = typename std::enable_if<btas::is_boxtensor<_Tensor>::value>::type>
+    DeviceTensor(const _Tensor& x) noexcept
+    : ttvalue_type()
+    , tensor_type(x.clone())
+    , b(this->size() ? this->data() : nullptr, this->size())
+    {
+      //std::cout << "DeviceTensor tensor_type copy ctor" << std::endl;
+    }
+
+    /// copy constructor: devicebuf cannot be copied, so deleted
+    DeviceTensor(const DeviceTensor& x) noexcept
+    : ttvalue_type(x)
+    , tensor_type(x.clone())
+    , b(this->size() ? this->data() : nullptr, this->size())
+    {
+      //std::cout << "DeviceTensor copy ctor" << std::endl;
+    }
+
+    /// move constructor
+    DeviceTensor(tensor_type&& x) noexcept
+    : ttvalue_type()
+    , tensor_type(std::move(x))
+    , b(this->size() ? this->data() : nullptr, this->size())
+    {
+      //std::cout << "DeviceTensor tensor_type move ctor" << std::endl;
+    }
+
+    DeviceTensor(DeviceTensor&& x) noexcept
+    : ttvalue_type(std::move(x))
+    , tensor_type(static_cast<tensor_type&&>(x))
+    , b(std::move(x.b))
+    {
+      assert(this->data() == b.host_ptr());
+      //std::cout << "DeviceTensor move ctor" << std::endl;
+    }
+
+    /// copy assignment operator
+    template <class _Tensor, class = typename std::enable_if<
+                                 btas::is_boxtensor<_Tensor>::value &&
+                                 not std::is_same<typename _Tensor::storage_type, storage_type>::value>::type>
+    DeviceTensor& operator=(const _Tensor& x) noexcept {
+      tensor_type::operator=(x.clone());
+      b.reset(this->size() ? this->data() : nullptr, this->size());
+      //std::cout << "DeviceTensor tensor_type copy operator" << std::endl;
+      return *this;
+    }
+
+    /// copy assignment operator
+    template <class _Tensor, class = typename std::enable_if<btas::is_boxtensor<_Tensor>::value>::type,
+              class = typename std::enable_if<
+                  std::is_same<typename _Tensor::storage_type, storage_type>::value>::type>
+    DeviceTensor& operator=(const _Tensor& x) noexcept {
+      tensor_type::operator=(x.clone());
+      b.reset(this->size() ? this->data() : nullptr, this->size());
+      //std::cout << "DeviceTensor tensor_type copy operator" << std::endl;
+      return *this;
+    }
+
+    /// copy assignment: devicebuf cannot be copied, deleted
+    DeviceTensor& operator=(const DeviceTensor& x) noexcept {
+      ttvalue_type::operator=(x);
+      tensor_type::operator=(x.clone());
+      b.reset(this->size() ? this->data() : nullptr, this->size());
+      //std::cout << "DeviceTensor copy operator" << std::endl;
+      return *this;
+    }
+
+    /// move assignment operator
+    DeviceTensor& operator=(DeviceTensor&& x) noexcept {
+      ttvalue_type::operator=(std::move(x));
+      tensor_type::operator=(static_cast<tensor_type&&>(x));
+      b = std::move(x.b);
+      //std::cout << "DeviceTensor move ctor" << std::endl;
+      return *this;
+    }
+
+    using tensor_type::begin;
+    using tensor_type::cbegin;
+    using tensor_type::end;
+    using tensor_type::cend;
+
+};
+
+#endif // defined(BTAS_IS_USABLE)
+
+#endif // HAVE_DEVICETENSOR_H
\ No newline at end of file
diff --git a/examples/spmm/spmm.cc b/examples/spmm/spmm.cc
index 3760c9946..f29ae9088 100644
--- a/examples/spmm/spmm.cc
+++ b/examples/spmm/spmm.cc
@@ -29,8 +29,7 @@
 #endif
 
 #include "ttg.h"
-
-using namespace ttg;
+#include "../ttg_matrix.h"
 
 #include "ttg/util/future.h"
 
@@ -39,12 +38,41 @@ using namespace ttg;
 
 #include "ttg/util/bug.h"
 
+#include "devicetensor.h"
+#include "devicegemm.h"
+
+using namespace ttg;
+
+#if defined(TTG_ENABLE_CUDA)
+#define HAVE_SPMM_DEVICE 1
+static constexpr ttg::ExecutionSpace space = ttg::ExecutionSpace::CUDA;
+#elif defined(TTG_ENABLE_HIP)
+#define HAVE_SPMM_DEVICE 1
+static constexpr ttg::ExecutionSpace space = ttg::ExecutionSpace::HIP;
+#elif defined(TTG_ENABLE_LEVEL_ZERO)
+#define HAVE_SPMM_DEVICE 1
+static constexpr ttg::ExecutionSpace space = ttg::ExecutionSpace::L0;
+#else
+static constexpr ttg::ExecutionSpace space = ttg::ExecutionSpace::Host;
+#endif
+
+/* set to true to automatically release constraints
+ * this removes the ability to control the window
+ * size and is equal to a window size of 1 */
+#define USE_AUTO_CONSTRAINT false
+
 #if defined(BLOCK_SPARSE_GEMM) && defined(BTAS_IS_USABLE)
 using scalar_t = double;
+
+#if HAVE_SPMM_DEVICE
+using blk_t = DeviceTensor<scalar_t, btas::DEFAULT::range,
+                           btas::mohndle<btas::varray<scalar_t,
+                                                      TiledArray::device_pinned_allocator<scalar_t>>,
+                                         btas::Handle::shared_ptr>>;
+#else   // HAVE_SPMM_DEVICE
 using blk_t = btas::Tensor<scalar_t, btas::DEFAULT::range, btas::mohndle<btas::varray<scalar_t>, btas::Handle::shared_ptr>>;
+#endif  // HAVE_SPMM_DEVICE
 
-//#include <atomic>
-//static std::atomic<uint64_t> reduce_count = 0;
 
 #if defined(TTG_USE_PARSEC)
 namespace ttg {
@@ -101,7 +129,7 @@ using blk_t = double;
 template <typename T = blk_t>
 using SpMatrix = Eigen::SparseMatrix<T>;
 template <typename T = blk_t>
-using SpMatrixTriplet = Eigen::Triplet<T>;  // {row,col,value}
+using SpMatrixTriplet = ttg::matrix::Triplet<T>;  // {row,col,value}
 
 #if defined(BLOCK_SPARSE_GEMM) && defined(BTAS_IS_USABLE)
 
@@ -121,7 +149,7 @@ namespace btas {
   }
 
   template <typename T_, class Range_, class Store_>
-  btas::Tensor<T_, Range_, Store_> gemm(btas::Tensor<T_, Range_, Store_> &&C, const btas::Tensor<T_, Range_, Store_> &A,
+  void gemm(btas::Tensor<T_, Range_, Store_> &C, const btas::Tensor<T_, Range_, Store_> &A,
                                         const btas::Tensor<T_, Range_, Store_> &B) {
     using array = btas::DEFAULT::index<int>;
     if (C.empty()) {  // first contribution to C = allocate it and gemm with beta=0
@@ -131,11 +159,11 @@ namespace btas {
     else {   // subsequent contributions to C = gemm with beta=1
       btas::contract_222(1.0, A, array{1, 2}, B, array{2, 3}, 1.0, C, array{1, 3}, false, false);
     }
-    return std::move(C);
+    //return std::move(C);
   }
 }  // namespace btas
 #endif  // BTAS_IS_USABLE
-double gemm(double C, double A, double B) { return C + A * B; }
+inline void gemm(double& C, const double A, const double B) { C += A * B; }
 
 // template <typename _Scalar, int _Options, typename _StorageIndex>
 // struct colmajor_layout;
@@ -206,7 +234,9 @@ class Read_SpMatrix : public TT<Key<3>,
         const auto i = it.row();
         // IF the receiver uses the same keymap, these sends are local
         if (rank == this->ij_keymap_(Key<2>(std::initializer_list<long>({i, j})))) {
-          ::send<0>(Key<2>(std::initializer_list<long>({i, j})), it.value(), out);
+          ::send<0>(Key<2>(std::initializer_list<long>({i, j})),
+                           ttg::persistent(it.value()),
+                           out);
         }
       }
     }
@@ -230,14 +260,14 @@ class Write_SpMatrix : public TT<Key<2>, std::tuple<>, Write_SpMatrix<Blk>, ttg:
       , write_back_(write_back)
   { }
 
-  void op(const Key<2> &key, typename baseT::input_values_tuple_type &&elem, std::tuple<> &) {
+  void op(const Key<2> &key, typename baseT::input_refs_tuple_type &&elem, std::tuple<> &) {
     if (write_back_) {
       std::lock_guard<std::mutex> lock(mtx_);
       ttg::trace("rank =", default_execution_context().rank(),
                 "/ thread_id =", reinterpret_cast<std::uintptr_t>(pthread_self()), "spmm.cc Write_SpMatrix wrote {",
                 key[0], ",", key[1], "} = ", baseT::template get<0>(elem), " in ", static_cast<void *>(&matrix_),
                 " with mutex @", static_cast<void *>(&mtx_), " for object @", static_cast<void *>(this));
-      values_.emplace_back(key[0], key[1], baseT::template get<0>(elem));
+      values_.emplace_back(key[0], key[1], std::move(baseT::template get<0>(elem)));
     }
   }
 
@@ -269,7 +299,9 @@ class Write_SpMatrix : public TT<Key<2>, std::tuple<>, Write_SpMatrix<Blk>, ttg:
 
 /// @tparam KeyMap2 maps {i,j} to processor
 /// @tparam KeyMap3 maps {i,j,k} to processor
-template <typename Keymap2 = std::function<int(const Key<2> &)>, typename Keymap3 = std::function<int(const Key<3> &)>,
+template <ttg::ExecutionSpace Space = space,
+          typename Keymap2 = std::function<int(const Key<2> &)>,
+          typename Keymap3 = std::function<int(const Key<3> &)>,
           typename Blk = blk_t>
 class SpMM25D {
  public:
@@ -282,24 +314,31 @@ class SpMM25D {
           const std::vector<std::vector<long>> &b_cols_of_row,
           const std::vector<std::vector<long>> &b_rows_of_col, const std::vector<int> &mTiles,
           const std::vector<int> &nTiles, const std::vector<int> &kTiles, Keymap2 ij_keymap, Keymap3 ijk_keymap,
-          long R, long parallel_bcasts = 1)
+          long P, long Q, long R, long parallel_bcasts = 1, bool enable_device_map = true)
       : a_cols_of_row_(a_cols_of_row)
       , b_rows_of_col_(b_rows_of_col)
       , a_rows_of_col_(a_rows_of_col)
       , b_cols_of_row_(b_cols_of_row)
+      , k_cnt_(a_rows_of_col_.size()+1)
       , ij_keymap_(std::move(ij_keymap))
       , ijk_keymap_(std::move(ijk_keymap))
-      , parallel_bcasts_(parallel_bcasts) {
+      , parallel_bcasts_(std::max(parallel_bcasts, 1L))
+      , enable_device_map_(enable_device_map) {
     Edge<Key<2>, void> a_ctl, b_ctl;
     Edge<Key<2>, int> a_rowctl, b_colctl; // TODO: can we have multiple control inputs per TT?
-    bcast_a_ = std::make_unique<BcastA>(a, a_ctl, a_rowctl, local_a_ijk_, a_rows_of_col_, a_cols_of_row_, b_cols_of_row_,
-                                        ij_keymap_, ijk_keymap_, parallel_bcasts_);
+    auto constraint = ttg::make_shared_constraint<ttg::SequencedKeysConstraint<Key<2>>>(USE_AUTO_CONSTRAINT);
+    bcast_a_ = std::make_unique<BcastA>(a, local_a_ijk_, b_cols_of_row_, ij_keymap_, ijk_keymap_);
+    // add constraint with external mapper: key[1] represents `k`
+    bcast_a_->add_constraint(constraint, [](const Key<2>& key){ return key[1]; });
     local_bcast_a_ = std::make_unique<LocalBcastA>(local_a_ijk_, a_ijk_, b_cols_of_row_, ijk_keymap_);
-    bcast_b_ = std::make_unique<BcastB>(b, b_ctl, b_colctl, local_b_ijk_, a_rows_of_col_, b_cols_of_row_, b_rows_of_col_,
-                                        ij_keymap_, ijk_keymap_, parallel_bcasts_);
+    bcast_b_ = std::make_unique<BcastB>(b, local_b_ijk_, a_rows_of_col_, ij_keymap_, ijk_keymap_);
+    // add constraint with external mapper: key[0] represents `k`
+    bcast_b_->add_constraint(constraint, [](const Key<2>& key){ return key[0]; });
     local_bcast_b_ = std::make_unique<LocalBcastB>(local_b_ijk_, b_ijk_, a_rows_of_col_, ijk_keymap_);
-    multiplyadd_ = std::make_unique<MultiplyAdd>(a_ijk_, b_ijk_, c_ijk_, c_ij_p_, a_cols_of_row_,
-                                                 b_rows_of_col_, mTiles, nTiles, ijk_keymap_);
+    multiplyadd_ = std::make_unique<MultiplyAdd<Space>>(a_ijk_, b_ijk_, c_ijk_, c_ij_p_, a_cols_of_row_,
+                                                        b_rows_of_col_, mTiles, nTiles, ijk_keymap_, constraint,
+                                                        k_cnt_, P, Q, parallel_bcasts_, enable_device_map_);
+
     reduce_c_ = std::make_unique<ReduceC>(c_ij_p_, c, ij_keymap_);
     reduce_c_->template set_input_reducer<0>(
       [&](Blk &c_ij, const Blk &c_ij_p) {
@@ -314,6 +353,11 @@ class SpMM25D {
     auto world = ttg::default_execution_context();
     const auto my_rank = world.rank();
     std::vector<bool> c_ij_procmask(world.size(), false);
+    std::vector<unsigned long> first_k_map(world.size(), std::numeric_limits<unsigned long>::max());
+    std::size_t max_k = a_rows_of_col_.size();
+    std::vector<std::size_t> k_cnt;
+    k_cnt.resize(a_rows_of_col_.size(), 0);
+    int release_k = 0;
     for (auto i = 0ul; i != a_cols_of_row_.size(); ++i) {
       if (a_cols_of_row_[i].empty()) continue;
       for (auto j = 0ul; j != b_rows_of_col_.size(); ++j) {
@@ -326,7 +370,10 @@ class SpMM25D {
           while (have_k) {
             const auto pR = ijk_keymap_(Key<3>{i, j, k});
             assert(pR < c_ij_procmask.size());
+            k_cnt[k]++;
             c_ij_procmask[pR] = true;
+            // find the first k that is needed from us by this rank
+            first_k_map[pR] = std::min(first_k_map[pR], k);
             /* get next k */
             std::tie(k, have_k) = multiplyadd_->compute_next_k(i, j, k);
           }
@@ -338,61 +385,21 @@ class SpMM25D {
       }
     }
 
-    /* kick off the first broadcast in each row of A
-     * this is used to enforce strict ordering within a row of A */
-    for (int i = 0; i < a_cols_of_row_.size(); ++i) {
-      for (int k : a_cols_of_row_[i]) {
-        auto key = Key<2>(i, k);
-        if (world.rank() == ij_keymap_(key)) {
-          bcast_a_->template in<1>()->send(key, 0);
-          break;
-        }
-      }
-    }
-
-    /* initial ctl input for a number of bcasts for A
-     * this is used to limit the number of concurrent bcasts */
-    int to_start = parallel_bcasts;
-    for (int k = 0;
-          0 < to_start && k < a_rows_of_col_.size();
-          ++k) {
-      for (auto i : a_rows_of_col_[k]) {
-        auto key = Key<2>(i, k);
-        if (world.rank() == ij_keymap_(key)) {
-          //std::cout << "SPMM kick off BcastA " << key << std::endl;
-          bcast_a_->template in<2>()->sendk(key);
-          if (0 == --to_start) break;
-        }
-      }
-    }
-
-    /* kick off the first broadcast in each column of B
-     * this is used to enforce strict ordering within a column of B */
-    for (int j = 0; j < b_rows_of_col_.size(); ++j) {
-      for (int k : b_rows_of_col_[j]) {
-        auto key = Key<2>(k, j);
-        if (world.rank() == ij_keymap_(key)) {
-          //std::cout << "BcastB kick off " << key << std::endl;
-          bcast_b_->template in<1>()->send(key, 0);
-          break;
-        }
-      }
+    k_cnt.push_back(1); // we always want to release the last k
+    assert(k_cnt.size() == k_cnt_.size());
+    // copy into atomic counters
+    std::size_t i = 0;
+    for (auto c : k_cnt) {
+      assert(i < k_cnt_.size());
+      k_cnt_[i++].store(c, std::memory_order_relaxed);
     }
 
-    /* initial ctl input for bcasts for B */
-    to_start = parallel_bcasts;
-    for (int k = 0;
-          0 < to_start && k < b_cols_of_row_.size();
-          ++k) {
-      for (auto j : b_cols_of_row_[k]) {
-        auto key = Key<2>(k, j);
-        if (world.rank() == ij_keymap_(key)) {
-          //std::cout << "SPMM kick off BcastB " << key << std::endl;
-          bcast_b_->template in<2>()->sendk(key);
-          if (0 == --to_start) break;
-        }
-      }
-    }
+    /* release the first parallel_bcasts_ k that are non-zero */
+    auto k_cnt_iter = k_cnt.begin();
+    do {
+      k_cnt_iter = std::find_if(k_cnt_iter, k_cnt.end(), [](auto c){ return c > 0; });
+    } while (++k_cnt_iter != k_cnt.end() && std::distance(k_cnt_iter, k_cnt.end()) < parallel_bcasts_);
+    constraint->release(std::distance(k_cnt.begin(), k_cnt_iter));
 
     TTGUNUSED(bcast_a_);
     TTGUNUSED(bcast_b_);
@@ -402,7 +409,7 @@ class SpMM25D {
 
   /// Locally broadcast `A[i][k]` assigned to this processor `p` to matmul tasks `{i,j,k}` for all `j` such that
   /// `B[k][j]` exists AND `k` contribution to `C[i][j]` is assigned to this processor
-  class LocalBcastA : public TT<Key<3>, std::tuple<Out<Key<3>, Blk>>, LocalBcastA, ttg::typelist<Blk>> {
+  class LocalBcastA : public TT<Key<3>, std::tuple<Out<Key<3>, Blk>>, LocalBcastA, ttg::typelist<const Blk>> {
    public:
     using baseT = typename LocalBcastA::ttT;
 
@@ -439,32 +446,24 @@ class SpMM25D {
   };  // class LocalBcastA
 
   /// broadcast `A[i][k]` to all processors which will contain at least one `C[i][j]` such that `B[k][j]` exists
-  class BcastA : public TT<Key<2>, std::tuple<Out<Key<3>, Blk>, Out<Key<2>, int>, Out<Key<2>, void>>, BcastA, ttg::typelist<Blk, int, void>> {
+  class BcastA : public TT<Key<2>, std::tuple<Out<Key<3>, Blk>>, BcastA, ttg::typelist<const Blk>> {
    public:
     using baseT = typename BcastA::ttT;
 
-    BcastA(Edge<Key<2>, Blk> &a_ik, Edge<Key<2>, void> &ctl,
-           Edge<Key<2>, int> &rowctl, Edge<Key<3>, Blk> &a_ikp,
-           const std::vector<std::vector<long>> &a_rows_of_col,
-           const std::vector<std::vector<long>> &a_cols_of_row,
+    BcastA(Edge<Key<2>, Blk> &a_ik, Edge<Key<3>, Blk> &a_ikp,
            const std::vector<std::vector<long>> &b_cols_of_row,
-           const Keymap2 &ij_keymap, const Keymap3 &ijk_keymap,
-           const int parallel_bcasts)
-        : baseT(edges(a_ik, rowctl, ctl), edges(a_ikp, rowctl, ctl), "SpMM25D::bcast_a", {"a_ik", "rowctl", "ctl"}, {"a_ikp", "rowctl", "ctl"}, ij_keymap)
-        , a_rows_of_col_(a_rows_of_col)
-        , a_cols_of_row_(a_cols_of_row)
+           const Keymap2 &ij_keymap, const Keymap3 &ijk_keymap)
+        : baseT(edges(a_ik), edges(a_ikp), "SpMM25D::bcast_a", {"a_ik"}, {"a_ikp"}, ij_keymap)
         , b_cols_of_row_(b_cols_of_row)
-        , ijk_keymap_(ijk_keymap)
-        , ij_keymap_(ij_keymap)
-        , parallel_bcasts_(parallel_bcasts) {
+        , ijk_keymap_(ijk_keymap) {
 
       this->set_priomap([](const Key<2>& key){
         return std::numeric_limits<int>::max() - key[0];
       });
     }
 
-    void op(const Key<2> &ik, typename baseT::input_values_tuple_type &&a_ik,
-            std::tuple<Out<Key<3>, Blk>, Out<Key<2>, int>, Out<Key<2>, void>> &outs) {
+    void op(const Key<2> &ik, typename baseT::input_refs_tuple_type &&a_ik,
+            std::tuple<Out<Key<3>, Blk>> &outs) {
       const auto i = ik[0]; // row
       const auto k = ik[1]; // col
       ttg::trace("BcastA(", i, ", ", k, ")");
@@ -482,86 +481,18 @@ class SpMM25D {
           ikp_keys.emplace_back(Key<3>({i, k, p}));
           procmap[p] = true;
         }
-        // TODO: debug
-        //if (p != world.rank() && ij_keymap_(Key<2>{k, j}) != p) {
-        //  std::cout << "[" << world.rank() << "] BCAST A " << ik << " for C update " << Key<3>({i, k, p}) << " on " << p << " has B from " << ij_keymap_(Key<2>{k, j}) << std::endl;
-        //}
       }
       ::broadcast<0>(ikp_keys, std::move(baseT::template get<0>(a_ik)), outs);
-
-      /* enable the next broadcast on this row */
-      int row = i;
-      int col = k;
-      auto rowit = std::find(a_cols_of_row_[row].begin(), a_cols_of_row_[row].end(), col);
-      for (++rowit; rowit != a_cols_of_row_[row].end(); ++rowit) {
-        Key<2> key = {row, *rowit};
-        if (world.rank() == this->get_keymap()(key)) {
-          ::send<1>(key, std::move(baseT::template get<1>(a_ik)), outs);
-          break;
-        }
-      }
-
-
-      /* enable next broadcast through a control message
-       * we don't check whether this tile is in B here, this is
-       * done inside the next task (see above)
-       * we walk the matrix A column-major in an attempt to send from top to bottom, left to right */
-      long to_skip = parallel_bcasts_;
-
-      auto colit = std::find(a_rows_of_col_[col].begin(), a_rows_of_col_[col].end(), row);
-      ++colit; // skip to next row
-      do {
-        for (; colit != a_rows_of_col_[col].end(); ++colit) {
-          Key<2> key = {*colit, col};
-          if (world.rank() == this->get_keymap()(key)) {
-            if (0 == --to_skip) {
-              //std::cout << "BcastA sending to " << key << " from " << ik << std::endl;
-              ::sendk<2>(key, outs);
-              return;
-            }
-          }
-        }
-        /* nothing for us in this column, move on to the next column */
-        if (++col < a_rows_of_col_.size()) {
-          colit = a_rows_of_col_[col].begin();
-        } else {
-          break;
-        }
-      } while (1);
-
-#if 0
-      do {
-        for (; it != a_cols_of_row_[i].end(); ++it) {
-          Key<2> key = {i, *it};
-          if (world.rank() == this->get_keymap()(key)) {
-            if (0 == --to_skip) {
-              ::sendk<1>(key, outs);
-              return;
-            }
-          }
-        }
-        if ((i+1) < num_rows) {
-          it = a_cols_of_row_[++i].begin();
-        } else {
-          break;
-        }
-      } while (1);
-#endif // 0
     }
 
    private:
-    //const std::vector<std::vector<long>> &a_cols_of_row_;
-    const std::vector<std::vector<long>> &a_rows_of_col_;
-    const std::vector<std::vector<long>> &a_cols_of_row_;
     const std::vector<std::vector<long>> &b_cols_of_row_;
     const Keymap3 &ijk_keymap_;
-    const Keymap2 &ij_keymap_;
-    const int parallel_bcasts_;
   };  // class BcastA
 
   /// Locally broadcast `B[k][j]` assigned to this processor `p` to matmul tasks `{i,j,k}` for all `k` such that
   /// `A[i][k]` exists AND `k` contribution to `C[i][j]` is assigned to this processor
-  class LocalBcastB : public TT<Key<3>, std::tuple<Out<Key<3>, Blk>>, LocalBcastB, ttg::typelist<Blk>> {
+  class LocalBcastB : public TT<Key<3>, std::tuple<Out<Key<3>, Blk>>, LocalBcastB, ttg::typelist<const Blk>> {
    public:
     using baseT = typename LocalBcastB::ttT;
 
@@ -597,30 +528,24 @@ class SpMM25D {
   };  // class LocalBcastB
 
   /// broadcast `B[k][j]` to all processors which will contain at least one `C[i][j]` such that `A[i][k]` exists
-  class BcastB : public TT<Key<2>, std::tuple<Out<Key<3>, Blk>, Out<Key<2>, int>, Out<Key<2>, void>>, BcastB, ttg::typelist<Blk, int, void>> {
+  class BcastB : public TT<Key<2>, std::tuple<Out<Key<3>, Blk>>, BcastB, ttg::typelist<const Blk>> {
    public:
     using baseT = typename BcastB::ttT;
 
-    BcastB(Edge<Key<2>, Blk> &b_kj, Edge<Key<2>, void> ctl, Edge<Key<2>, int> colctl, Edge<Key<3>, Blk> &b_kjp,
+    BcastB(Edge<Key<2>, Blk> &b_kj, Edge<Key<3>, Blk> &b_kjp,
            const std::vector<std::vector<long>> &a_rows_of_col,
-           const std::vector<std::vector<long>> &b_cols_of_row,
-           const std::vector<std::vector<long>> &b_rows_of_col,
-           const Keymap2 &ij_keymap, const Keymap3 &ijk_keymap,
-           const int parallel_bcasts)
-        : baseT(edges(b_kj, colctl, ctl), edges(b_kjp, colctl, ctl), "SpMM25D::bcast_b", {"b_kj", "colctl", "ctl"}, {"b_kjp", "colctl", "ctl"}, ij_keymap)
+           const Keymap2 &ij_keymap, const Keymap3 &ijk_keymap)
+        : baseT(edges(b_kj), edges(b_kjp), "SpMM25D::bcast_b", {"b_kj"}, {"b_kjp"}, ij_keymap)
         , a_rows_of_col_(a_rows_of_col)
-        , b_cols_of_row_(b_cols_of_row)
-        , b_rows_of_col_(b_rows_of_col)
         , ijk_keymap_(ijk_keymap)
-        , parallel_bcasts_(parallel_bcasts)
     {
       this->set_priomap([](const Key<2>& key){
         return std::numeric_limits<int>::max() - key[1];
       });
     }
 
-    void op(const Key<2> &kj, typename baseT::input_values_tuple_type &&b_kj,
-            std::tuple<Out<Key<3>, Blk>, Out<Key<2>, int>, Out<Key<2>, void>> &outs) {
+    void op(const Key<2> &kj, typename baseT::input_refs_tuple_type &&b_kj,
+            std::tuple<Out<Key<3>, Blk>> &outs) {
       const auto k = kj[0]; // row
       const auto j = kj[1]; // col
       // broadcast b_kj to all processors which will contain at least one c_ij such that a_ik exists
@@ -639,100 +564,75 @@ class SpMM25D {
         }
       }
       ::broadcast<0>(kjp_keys, std::move(baseT::template get<0>(b_kj)), outs);
-
-      /* enable the next broadcast on this row */
-      int row = k;
-      int col = j;
-      auto colit = std::find(b_rows_of_col_[col].begin(), b_rows_of_col_[col].end(), row);
-      for (++colit; colit != b_rows_of_col_[col].end(); ++colit) {
-        Key<2> key = {*colit, col};
-        if (world.rank() == this->get_keymap()(key)) {
-          //std::cout << "BcastB kick off " << key << std::endl;
-          ::send<1>(key, std::move(baseT::template get<1>(b_kj)), outs);
-          break;
-        }
-      }
-
-      /* enable next broadcast through a control message
-       * we don't check whether this tile is in A here, this is
-       * done inside the next task (see above)
-       * we run across a row to enable broadcasts */
-      long to_skip = parallel_bcasts_;
-
-      // iterator over the current row
-      auto rowit = std::find(b_cols_of_row_[row].begin(), b_cols_of_row_[row].end(), col);
-      ++rowit; // skip to next col
-      do {
-        for (; rowit != b_cols_of_row_[row].end(); ++rowit) {
-          Key<2> key = {row, *rowit};
-          if (world.rank() == this->get_keymap()(key)) {
-            if (0 == --to_skip) {
-              //std::cout << "BcastB sending to " << key << " from " << kj << " pb " << parallel_bcasts_ << std::endl;
-              ::sendk<2>(key, outs);
-              return;
-            } else {
-              //std::cout << "BcastB skipping " << key << " from " << kj << " pb " << parallel_bcasts_ << std::endl;
-            }
-          }
-        }
-        /* nothing for us in this row, move on to the next row */
-        if (++row != b_cols_of_row_.size()) {
-          rowit = b_cols_of_row_[row].begin();
-        } else {
-          break;
-        }
-      } while (1);
-
-
-#if 0
-      std::size_t num_rows = b_cols_of_row_.size();
-      auto it = std::find(b_cols_of_row_[k].begin(), b_cols_of_row_[k].end(), j);
-      ++it; // skip the current tile
-      long to_skip = parallel_bcasts_;
-      do {
-        for (; it != b_cols_of_row_[k].end(); ++it) {
-          Key<2> key = {k, *it};
-          if (world.rank() == this->get_keymap()(key)) {
-            if (0 == --to_skip) {
-              ::sendk<1>(key, outs);
-              return;
-            }
-          }
-        }
-        if ((k+1) < num_rows) {
-          it = b_cols_of_row_[++k].begin();
-        } else {
-          break;
-        }
-      } while (1);
-#endif // 0
     }
 
    private:
     const std::vector<std::vector<long>> &a_rows_of_col_;
-    const std::vector<std::vector<long>> &b_cols_of_row_;
-    const std::vector<std::vector<long>> &b_rows_of_col_;
     const Keymap3 &ijk_keymap_;
-    const int parallel_bcasts_;
   };  // class BcastB
 
   /// multiply task has 3 input flows: a_ijk, b_ijk, and c_ijk, c_ijk contains the running total for this layer of the
   /// 3-D process grid only
-  class MultiplyAdd : public TT<Key<3>, std::tuple<Out<Key<2>, Blk>, Out<Key<3>, Blk>>, MultiplyAdd,
+  template<ttg::ExecutionSpace Space_>
+  class MultiplyAdd : public TT<Key<3>, std::tuple<Out<Key<2>, Blk>, Out<Key<3>, Blk>>, MultiplyAdd<Space_>,
                                 ttg::typelist<const Blk, const Blk, Blk>> {
+    static constexpr const bool is_device_space = (Space_ != ttg::ExecutionSpace::Host);
+    using task_return_type = std::conditional_t<is_device_space, ttg::device::Task, void>;
+
+    void release_next_k(long k) {
+      assert(k_cnt_.size() > k);
+      long cnt = k_cnt_[k].fetch_sub(1, std::memory_order_relaxed)-1;
+      assert(cnt >= 0);
+      if (0 == cnt) {
+        auto release_k = k;
+        auto bcasts_ahead = parallel_bcasts_;
+        // this was the last gemm in this k, find the next one to release
+        while (++release_k < k_cnt_.size() &&
+                (0 == k_cnt_[release_k].load(std::memory_order_relaxed)
+                || --bcasts_ahead > 0))
+        { }
+        constraint->release(release_k);
+      }
+    }
+
+
    public:
     using baseT = typename MultiplyAdd::ttT;
 
+    /* communicate to the runtime which device we support (if any) */
+    static constexpr bool have_cuda_op = (Space_ == ttg::ExecutionSpace::CUDA);
+    static constexpr bool have_hip_op  = (Space_ == ttg::ExecutionSpace::HIP);
+    static constexpr bool have_level_zero_op = (Space_ == ttg::ExecutionSpace::L0);
+
     MultiplyAdd(Edge<Key<3>, Blk> &a_ijk, Edge<Key<3>, Blk> &b_ijk, Edge<Key<3>, Blk> &c_ijk, Edge<Key<2>, Blk> &c,
                 const std::vector<std::vector<long>> &a_cols_of_row,
                 const std::vector<std::vector<long>> &b_rows_of_col, const std::vector<int> &mTiles,
-                const std::vector<int> &nTiles, const Keymap3 &ijk_keymap)
+                const std::vector<int> &nTiles, const Keymap3 &ijk_keymap,
+                std::shared_ptr<ttg::SequencedKeysConstraint<Key<2>>> constraint,
+                std::vector<std::atomic<std::size_t>>& k_cnt,
+                long P, long Q,
+                std::size_t parallel_bcasts, bool enable_device_map)
         : baseT(edges(a_ijk, b_ijk, c_ijk), edges(c, c_ijk), "SpMM25D::MultiplyAdd", {"a_ijk", "b_ijk", "c_ijk"},
                 {"c_ij", "c_ijk"}, ijk_keymap)
         , a_cols_of_row_(a_cols_of_row)
-        , b_rows_of_col_(b_rows_of_col) {
+        , b_rows_of_col_(b_rows_of_col)
+        , k_cnt_(k_cnt)
+        , constraint(std::move(constraint))
+        , parallel_bcasts_(parallel_bcasts) {
       this->set_priomap([=,this](const Key<3> &ijk) { return this->prio(ijk); });  // map a key to an integral priority value
-
+      if constexpr (is_device_space) {
+        if (enable_device_map) {
+          int num_devices = ttg::device::num_devices();
+          int gp = std::sqrt(num_devices);
+          int gq = num_devices / gp;
+          this->set_devicemap(
+            [P,Q,gp,gq,num_devices](const Key<3> &ijk){
+              // TODO: include the number of rows/columns in this formula
+              auto device = (((ijk[0]/P)%gp)*gq) + (ijk[1]/Q)%gq;
+              return device;
+            });
+        }
+      }
       // for each {i,j} determine first k that contributes AND belongs to this node,
       // initialize input {i,j,first_k} flow to 0
       for (auto i = 0ul; i != a_cols_of_row_.size(); ++i) {
@@ -760,8 +660,8 @@ class SpMM25D {
       }
     }
 
-    void op(const Key<3> &ijk, typename baseT::input_refs_tuple_type &&_ijk,
-            std::tuple<Out<Key<2>, Blk>, Out<Key<3>, Blk>> &result) {
+    task_return_type op(const Key<3> &ijk, typename baseT::input_refs_tuple_type &&_ijk,
+                        std::tuple<Out<Key<2>, Blk>, Out<Key<3>, Blk>> &result) {
       const auto i = ijk[0];
       const auto j = ijk[1];
       const auto k = ijk[2];  // k==l same because 000 will always be on layer 0, 001 will be accessed on layer 1
@@ -774,25 +674,64 @@ class SpMM25D {
                  " C[",
                  i, "][", j, "]  += A[", i, "][", k, "] by B[", k, "][", j, "],  next_k? ",
                  (have_next_k ? std::to_string(next_k) : "does not exist"));
+      // release the constraint on the next round of broadcasts
+      release_next_k(k);
+      const blk_t& A = baseT::template get<0>(_ijk);
+      const blk_t& B = baseT::template get<1>(_ijk);
+      blk_t& C = baseT::template get<2>(_ijk);
+
+#if defined(BLOCK_SPARSE_GEMM)
+      if (C.empty()) {
+        C = blk_t(btas::Range(A.range().extent(0), B.range().extent(1)), 0.0);
+      }
+#endif // BLOCK_SPARSE_GEMM
+
+#ifdef HAVE_SPMM_DEVICE
+      /* pull all buffers onto the device */
+      co_await ttg::device::select(A.b, B.b, C.b);
+
+      /* everything is on the device, call the gemm */
+      device_gemm(C, A, B);
+
+      // pass the running total to the next flow, if needed
+      // otherwise write to the result flow
+      if (have_next_k) {
+        co_await ttg::device::forward(ttg::device::send<1>(
+                                                Key<3>({i, j, next_k}),
+                                                std::move(C),
+                                                result));
+      } else {  // done with all local contributions to C[i][j], reduce with others on the process to which C[i][j]
+                // belongs
+        co_await ttg::device::forward(ttg::device::send<0>(
+                                                Key<2>({i, j}),
+                                                std::move(C),
+                                                result));
+      }
+#else  // HAVE_SPMM_DEVICE
+      gemm(C, A, B);
       // compute the contrib, pass the running total to the next flow, if needed
       // otherwise write to the result flow
       if (have_next_k) {
         ::send<1>(
             Key<3>({i, j, next_k}),
-            gemm(std::move(baseT::template get<2>(_ijk)), baseT::template get<0>(_ijk), baseT::template get<1>(_ijk)),
+            std::move(C),
             result);
       } else {  // done with all local contributions to C[i][j], reduce with others on the process to which C[i][j]
                 // belongs
         ::send<0>(
             Key<2>({i, j}),
-            gemm(std::move(baseT::template get<2>(_ijk)), baseT::template get<0>(_ijk), baseT::template get<1>(_ijk)),
+            std::move(C),
             result);
       }
+#endif // HAVE_SPMM_DEVICE
     }
 
    private:
     const std::vector<std::vector<long>> &a_cols_of_row_;
     const std::vector<std::vector<long>> &b_rows_of_col_;
+    std::vector<std::atomic<std::size_t>>& k_cnt_;
+    std::shared_ptr<ttg::SequencedKeysConstraint<Key<2>>> constraint;
+    std::size_t parallel_bcasts_;
 
     /* Compute the length of the remaining sequence on that tile */
     int32_t prio(const Key<3> &key) const {
@@ -936,11 +875,13 @@ class SpMM25D {
   std::unique_ptr<LocalBcastA> local_bcast_a_;
   std::unique_ptr<BcastB> bcast_b_;
   std::unique_ptr<LocalBcastB> local_bcast_b_;
-  std::unique_ptr<MultiplyAdd> multiplyadd_;
+  std::unique_ptr<MultiplyAdd<Space>> multiplyadd_;
   std::unique_ptr<ReduceC> reduce_c_;
+  std::vector<std::atomic<std::size_t>> k_cnt_;
   Keymap2 ij_keymap_;
   Keymap3 ijk_keymap_;
   long parallel_bcasts_;
+  bool enable_device_map_;
 };
 
 class Control : public TT<void, std::tuple<Out<Key<3>>>, Control> {
@@ -1139,7 +1080,7 @@ static void initSpRmat(const std::function<int(const Key<2> &)> &keymap, const c
   boost::minstd_rand gen(seed);
   boost::rmat_iterator<boost::minstd_rand, boost::directed_graph<>> rmat_it(gen, N, E, a, b, c, d);
 
-  using triplet_t = Eigen::Triplet<blk_t>;
+  using triplet_t = ttg::matrix::Triplet<blk_t>;
   std::vector<triplet_t> A_elements;
   for (int i = 0; i < N; i++) {
     nnz++;
@@ -1175,7 +1116,7 @@ static void initSpHardCoded(const std::function<int(const Key<2> &)> &keymap, Sp
   C.resize(m, n);
   // We initialize the same matrices on all the ranks, but we will use only the local part
   // following the keymap
-  using triplet_t = Eigen::Triplet<blk_t>;
+  using triplet_t = ttg::matrix::Triplet<blk_t>;
   std::vector<triplet_t> A_elements;
   A_elements.emplace_back(0, 1, 12.3);
   A_elements.emplace_back(0, 2, 10.7);
@@ -1222,7 +1163,7 @@ static void initBlSpHardCoded(const std::function<int(const Key<2> &)> &keymap,
 
   int rank = ttg::default_execution_context().rank();
 
-  using triplet_t = Eigen::Triplet<blk_t>;
+  using triplet_t = ttg::matrix::Triplet<blk_t>;
   std::vector<triplet_t> A_elements;
   std::vector<triplet_t> Aref_elements;
 #if defined(BTAS_IS_USABLE)
@@ -1388,7 +1329,7 @@ static void initBlSpRandom(const std::function<int(const Key<2> &)> &keymap, siz
   std::mt19937 genv(seed + 1);
 
   std::uniform_int_distribution<> dist(minTs, maxTs);  // randomly pick any value in the range minTs, maxTs
-  using triplet_t = Eigen::Triplet<blk_t>;
+  using triplet_t = ttg::matrix::Triplet<blk_t>;
   std::vector<triplet_t> A_elements;
   std::vector<triplet_t> B_elements;
   std::vector<triplet_t> Aref_elements;
@@ -1512,7 +1453,7 @@ static void timed_measurement(SpMatrix<> &A, SpMatrix<> &B, const std::function<
                               const std::vector<std::vector<long>> &b_cols_of_row,
                               const std::vector<std::vector<long>> &b_rows_of_col, std::vector<int> &mTiles,
                               std::vector<int> &nTiles, std::vector<int> &kTiles, int M, int N, int K, int minTs,
-                              int maxTs, int P, int Q, int R, int parallel_bcasts) {
+                              int maxTs, int P, int Q, int R, int parallel_bcasts, bool enable_device_map) {
   int MT = (int)A.rows();
   int NT = (int)B.cols();
   int KT = (int)A.cols();
@@ -1539,7 +1480,7 @@ static void timed_measurement(SpMatrix<> &A, SpMatrix<> &B, const std::function<
   assert(!has_value(c_status));
   //  SpMM25D a_times_b(world, eA, eB, eC, A, B);
   SpMM25D<> a_times_b(eA, eB, eC, A, B, a_cols_of_row, a_rows_of_col, b_cols_of_row, b_rows_of_col,
-                      mTiles, nTiles, kTiles, ij_keymap, ijk_keymap, R, parallel_bcasts);
+                      mTiles, nTiles, kTiles, ij_keymap, ijk_keymap, P, Q, R, parallel_bcasts, enable_device_map);
   TTGUNUSED(a);
   TTGUNUSED(b);
   TTGUNUSED(a_times_b);
@@ -1566,12 +1507,12 @@ static void timed_measurement(SpMatrix<> &A, SpMatrix<> &B, const std::function<
   std::string rt("Unkown???");
 #endif
   if (ttg::default_execution_context().rank() == 0) {
-    std::cout << "TTG-" << rt << " PxQxR=   " << P << " " << Q << " " << R << " 1 average_NB= " << avg_nb << " M= " << M
+    std::cout << "TTG-" << rt << " PxQxR=   " << P << " " << Q << " " << R << " " << ttg::device::num_devices()
+              << " average_NB= " << avg_nb << " M= " << M
               << " N= " << N << " K= " << K << " t= " << minTs << " T=" << maxTs << " Tiling= " << tiling_type
               << " A_density= " << Adensity << " B_density= " << Bdensity << " gflops= " << gflops << " seconds= " << tc
               << " gflops/s= " << gflops / tc << std::endl;
   }
-  //std::cout << "num reductions " << reduce_count.load() << " tiles " << MT*KT << std::endl;
 }
 
 #if !defined(BLOCK_SPARSE_GEMM)
@@ -1635,20 +1576,6 @@ int main(int argc, char **argv) {
   bool timing;
   double gflops;
 
-  // warm up silicon by calling gemm a few times
-#ifdef BTAS_IS_USABLE
-  for (int i = 0; i < 20; i++) {
-    using baseT = typename btas::Tensor<double>;
-    btas::Tensor<double, btas::Range, std::vector<double>> At(30, 30);
-    btas::Tensor<double, btas::Range, std::vector<double>> Bt(30, 30);
-    btas::Tensor<double, btas::Range, std::vector<double>> Ct(30, 30);
-    At.fill(1.0);
-    Bt.fill(2.0);
-    Ct.fill(3.0);
-    btas::gemm(std::move(Ct), Bt, At);
-  }
-#endif  // BTAS_IS_USABLE
-
   int cores = -1;
   std::string nbCoreStr(getCmdOption(argv, argv + argc, "-c"));
   cores = parseOption(nbCoreStr, cores);
@@ -1659,6 +1586,12 @@ int main(int argc, char **argv) {
     initialize(1, argv, cores);
   }
 
+#if defined(BTAS_IS_USABLE) && defined(TTG_PARSEC_IMPORTED)
+  // initialize MADNESS so that TA allocators can be created
+  madness::ParsecRuntime::initialize_with_existing_context(ttg::default_execution_context().impl().context());
+  madness::initialize(argc, argv, /* nthread = */ 1, /* quiet = */ true);
+#endif  // BTAS_IS_USABLE
+
   std::string debugStr(getCmdOption(argv, argv + argc, "-d"));
   auto debug = (unsigned int)parseOption(debugStr, 0);
 
@@ -1710,6 +1643,9 @@ int main(int argc, char **argv) {
         parallel_bcasts = std::stol(pStr);
       }
 
+      /* whether we set a device mapping */
+      bool enable_device_map = !cmdOptionExists(argv, argv+argc, "--default-device-map");
+
       std::string PStr(getCmdOption(argv, argv + argc, "-P"));
       P = parseOption(PStr, P);
       std::string QStr(getCmdOption(argv, argv + argc, "-Q"));
@@ -1842,6 +1778,9 @@ int main(int argc, char **argv) {
         minTs = parseOption(minTsStr, 32);
         std::string maxTsStr(getCmdOption(argv, argv + argc, "-T"));
         maxTs = parseOption(maxTsStr, 256);
+        if (minTs >= maxTs) {
+          maxTs = minTs;
+        }
         std::string avgStr(getCmdOption(argv, argv + argc, "-a"));
         double avg = parseOption(avgStr, 0.3);
         timing = (check == 0);
@@ -1867,9 +1806,17 @@ int main(int argc, char **argv) {
         // Start up engine
         execute();
         for (int nrun = 0; nrun < nb_runs; nrun++) {
+#if TTG_USE_PARSEC
+          /* flush all PaRSEC memory */
+          parsec_devices_release_memory();
+#endif // TTG_USE_PARSEC
           timed_measurement(A, B, ij_keymap, ijk_keymap, tiling_type, gflops, avg_nb, Adensity, Bdensity,
                             a_cols_of_row, a_rows_of_col, b_cols_of_row, b_rows_of_col, mTiles,
-                            nTiles, kTiles, M, N, K, minTs, maxTs, P, Q, R, parallel_bcasts);
+                            nTiles, kTiles, M, N, K, minTs, maxTs, P, Q, R, parallel_bcasts, enable_device_map);
+#if TTG_USE_PARSEC
+          /* reset PaRSEC's load tracking */
+          parsec_devices_reset_load(default_execution_context().impl().context());
+#endif // TTG_USE_PARSEC
         }
       } else {
         // flow graph needs to exist on every node
@@ -1890,7 +1837,7 @@ int main(int argc, char **argv) {
         assert(!has_value(c_status));
         //  SpMM25D a_times_b(world, eA, eB, eC, A, B);
         SpMM25D<> a_times_b(eA, eB, eC, A, B, a_cols_of_row, a_rows_of_col, b_cols_of_row,
-                            b_rows_of_col, mTiles, nTiles, kTiles, ij_keymap, ijk_keymap, R);
+                            b_rows_of_col, mTiles, nTiles, kTiles, ij_keymap, ijk_keymap, P, Q, R);
         TTGUNUSED(a_times_b);
         // calling the Dot constructor with 'true' argument disables the type
         if (default_execution_context().rank() == 0) std::cout << Dot{/*disable_type=*/true}(&control) << std::endl;
@@ -1940,6 +1887,10 @@ int main(int argc, char **argv) {
       }
     }
 
+#if defined(BTAS_IS_USABLE) && defined(TTG_PARSEC_IMPORTED)
+    madness::finalize();
+#endif  // BTAS_IS_USABLE
+
     ttg_finalize();
 
     return 0;
diff --git a/examples/spmm/spmm_cuda.cc b/examples/spmm/spmm_cuda.cc
deleted file mode 100644
index 469aef8f7..000000000
--- a/examples/spmm/spmm_cuda.cc
+++ /dev/null
@@ -1,1969 +0,0 @@
-
-#include <Eigen/SparseCore>
-#include <algorithm>
-#include <array>
-#include <chrono>
-#include <iostream>
-#include <random>
-#include <thread>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-
-#include <madness/world/world.h>
-
-#if __has_include(<btas/features.h>)
-#pragma message("C Preprocessor got here!")
-#include <btas/features.h>
-#ifdef BTAS_IS_USABLE
-#include <btas/btas.h>
-#include <btas/optimize/contract.h>
-#include <btas/util/mohndle.h>
-#include <TiledArray/device/allocators.h>
-#include "../devblas_helper.h"
-#include <madness/world/parsec.h>  // need to initialize MADNESS purely for the purposes of TA allocators
-#else
-#warning "found btas/features.h but Boost.Iterators is missing, hence BTAS is unusable ... add -I/path/to/boost"
-#endif
-#endif
-
-#include <sys/time.h>
-#include <boost/graph/rmat_graph_generator.hpp>
-#if !defined(BLOCK_SPARSE_GEMM)
-#include <boost/graph/directed_graph.hpp>
-#include <boost/random/linear_congruential.hpp>
-#include <unsupported/Eigen/SparseExtra>
-#endif
-
-#include "ttg.h"
-
-#include "../devblas_helper.h"
-
-using namespace ttg;
-
-#include "ttg/util/future.h"
-
-#include "ttg/util/multiindex.h"
-
-#include "ttg/util/bug.h"
-
-#include "ttg/serialization/std/pair.h"
-
-#if defined(TTG_ENABLE_LEVEL_ZERO)
-#include <oneapi/mkl.hpp>
-#include <sys/time.h>
-#endif
-
-#if defined(BLOCK_SPARSE_GEMM) && defined(BTAS_IS_USABLE)
-
-template <typename _T, class _Range, class _Storage>
-struct DeviceTensor : public ttg::TTValue<DeviceTensor<_T, _Range, _Storage>>
-                    , public btas::Tensor<_T, _Range, _Storage> {
-  using tensor_type = typename btas::Tensor<_T, _Range, _Storage>;
-  using ttvalue_type = typename ttg::TTValue<DeviceTensor<_T, _Range, _Storage>>;
-  ttg::Buffer<_T> b; // does not own the host buffer
-
-  using value_type = typename tensor_type::value_type;
-  using size_type = typename tensor_type::size_type;
-  using storage_type = typename tensor_type::storage_type;
-  using range_type = typename tensor_type::range_type;
-
-
-   public:
-    DeviceTensor() = default;
-    ~DeviceTensor() = default;
-
-    /// constructor with index extent
-    template <typename... _args>
-    explicit DeviceTensor(const size_type& first, const _args&... rest)
-    : ttvalue_type()
-    , tensor_type(first, rest...)
-    , b(this->size() ? this->data() : nullptr, this->size())
-    { }
-
-    /// construct from \c range, allocate data, but not initialized
-    template <typename Range>
-    explicit DeviceTensor(const Range& range, typename std::enable_if<btas::is_boxrange<Range>::value>::type* = 0)
-    : ttvalue_type()
-    , tensor_type(range)
-    , b(this->size() ? this->data() : nullptr, this->size())
-    { }
-
-    /// construct from \c range object, set all elements to \c v
-    template <typename Range>
-    DeviceTensor(const Range& range, value_type v, typename std::enable_if<btas::is_boxrange<Range>::value>::type* = 0)
-    : ttvalue_type()
-    , tensor_type(range)
-    , b(this->size() ? this->data() : nullptr, this->size())
-    { }
-
-    /// construct from \c range object, copy elements from \c vec
-    template <typename Range, typename U>
-    DeviceTensor(const Range& range, U* vec, typename std::enable_if<btas::is_boxrange<Range>::value>::type* = 0)
-    : ttvalue_type()
-    , tensor_type(range, vec)
-    , b(this->size() ? this->data() : nullptr, this->size())
-    { }
-
-    /// construct from \c range and \c storage
-    template <typename Range, typename Storage>
-    DeviceTensor(const Range& range, const Storage& storage,
-           typename std::enable_if<btas::is_boxrange<Range>::value & not std::is_same<Range, range_type>::value &
-                                   not std::is_same<Storage, storage_type>::value>::type* = 0)
-    : ttvalue_type()
-    , tensor_type(range, storage)
-    , b(this->size() ? this->data() : nullptr, this->size())
-    { }
-
-    /// copy-copy-construct from \c range and \c storage
-    DeviceTensor(const range_type& range, const storage_type& storage)
-    : ttvalue_type()
-    , tensor_type(range, storage)
-    , b(this->size() ? this->data() : nullptr, this->size())
-    { }
-
-    /// copy-move-construct from \c range and \c storage
-    DeviceTensor(const range_type& range, storage_type&& storage)
-    : ttvalue_type()
-    , tensor_type(range, std::forward<storage_type>(storage))
-    , b(this->size() ? this->data() : nullptr, this->size())
-    { }
-
-    /// move-construct from \c range and \c storage
-    DeviceTensor(range_type&& range, storage_type&& storage)
-    : ttvalue_type()
-    , tensor_type(std::forward<range_type>(range), std::forward<storage_type>(storage))
-    , b(this->size() ? this->data() : nullptr, this->size())
-    { }
-
-    /// Construct an evaluated tensor
-
-    /// This constructor will allocate memory for \c range.area() elements. Each element
-    /// will be initialized as:
-    /// \code
-    ///   for(auto&& idx: range)
-    ///     (*this)[idx] = op(*(it++));
-    /// \endcode
-    /// \tparam Range An input Range type.
-    /// \tparam InIter An input iterator type.
-    /// \tparam Op A unary operation type
-    /// \param range the input range type
-    /// \param first An input iterator for the argument
-    /// \param op The unary operation to be applied to the argument data
-    template <typename Range, typename InIter, typename Op>
-    DeviceTensor(const Range& range, InIter it, const Op& op,
-           typename std::enable_if<btas::is_boxrange<Range>::value>::type* = 0)
-    : ttvalue_type()
-    , tensor_type(range, it, op)
-    , b(this->size() ? this->data() : nullptr, this->size())
-    { }
-
-    /// copy constructor
-    /// It will accept Tensors and TensorViews
-    template <class _Tensor, class = typename std::enable_if<btas::is_boxtensor<_Tensor>::value>::type>
-    DeviceTensor(const _Tensor& x) noexcept
-    : ttvalue_type()
-    , tensor_type(x.clone())
-    , b(this->size() ? this->data() : nullptr, this->size())
-    {
-      //std::cout << "DeviceTensor tensor_type copy ctor" << std::endl;
-    }
-
-    /// copy constructor: devicebuf cannot be copied, so deleted
-    DeviceTensor(const DeviceTensor& x) noexcept
-    : ttvalue_type(x)
-    , tensor_type(x.clone())
-    , b(this->size() ? this->data() : nullptr, this->size())
-    {
-      //std::cout << "DeviceTensor copy ctor" << std::endl;
-    }
-
-    /// move constructor
-    DeviceTensor(tensor_type&& x) noexcept
-    : ttvalue_type()
-    , tensor_type(std::move(x))
-    , b(this->size() ? this->data() : nullptr, this->size())
-    {
-      //std::cout << "DeviceTensor tensor_type move ctor" << std::endl;
-    }
-
-    DeviceTensor(DeviceTensor&& x) noexcept
-    : ttvalue_type(std::move(x))
-    , tensor_type(std::move(x))
-    /* Grrrr, moving a Tensor does not guarantee to move the pointer */
-    , b((this->size() == 0 ||
-         this->data() == x.b.host_ptr()) ? std::move(x.b)
-                                         : ttg::Buffer<_T>(this->size() ? this->data()
-                                                                        : nullptr,
-                                                           this->size()))
-    {
-      assert(this->data() == b.host_ptr());
-      //std::cout << "DeviceTensor move ctor" << std::endl;
-    }
-
-    /// copy assignment operator
-    template <class _Tensor, class = typename std::enable_if<
-                                 btas::is_boxtensor<_Tensor>::value &&
-                                 not std::is_same<typename _Tensor::storage_type, storage_type>::value>::type>
-    DeviceTensor& operator=(const _Tensor& x) noexcept {
-      tensor_type::operator=(x.clone());
-      b.reset(this->size() ? this->data() : nullptr, this->size());
-      //std::cout << "DeviceTensor tensor_type copy operator" << std::endl;
-      return *this;
-    }
-
-    /// copy assignment operator
-    template <class _Tensor, class = typename std::enable_if<btas::is_boxtensor<_Tensor>::value>::type,
-              class = typename std::enable_if<
-                  std::is_same<typename _Tensor::storage_type, storage_type>::value>::type>
-    DeviceTensor& operator=(const _Tensor& x) noexcept {
-      tensor_type::operator=(x.clone());
-      b.reset(this->size() ? this->data() : nullptr, this->size());
-      //std::cout << "DeviceTensor tensor_type copy operator" << std::endl;
-      return *this;
-    }
-
-    /// copy assignment: devicebuf cannot be copied, deleted
-    DeviceTensor& operator=(const DeviceTensor& x) noexcept {
-      ttvalue_type::operator=(x);
-      tensor_type::operator=(x.clone());
-      b.reset(this->size() ? this->data() : nullptr, this->size());
-      //std::cout << "DeviceTensor copy operator" << std::endl;
-      return *this;
-    }
-
-    /// move assignment operator
-    DeviceTensor& operator=(DeviceTensor&& x) noexcept {
-      ttvalue_type::operator=(std::move(x));
-      tensor_type::operator=(std::move(x));
-      if (this->size() == 0 || this->data() == x.b.host_ptr()){
-        b = std::move(x.b);
-      } else  {
-        b = ttg::Buffer<_T>(this->size() ? this->data() : nullptr, this->size());
-      }
-      //std::swap(x.b, b);
-      //std::cout << "DeviceTensor move ctor" << std::endl;
-      return *this;
-    }
-
-    using tensor_type::begin;
-    using tensor_type::cbegin;
-    using tensor_type::end;
-    using tensor_type::cend;
-
-};
-
-using scalar_t = double;
-#if defined(TTG_ENABLE_CUDA) || defined(TTG_ENABLE_HIP)
-using blk_t = DeviceTensor<scalar_t, btas::DEFAULT::range,
-                           btas::mohndle<btas::varray<scalar_t, TiledArray::device_pinned_allocator<scalar_t>>,
-                                         btas::Handle::shared_ptr>>;
-#else
-using blk_t = DeviceTensor<scalar_t, btas::DEFAULT::range,
-                           btas::mohndle<btas::varray<scalar_t>, btas::Handle::shared_ptr>>;
-#endif
-
-
-//inline blk_t operator*(const blk_t &A, const blk_t &B) {
-//  blk_t::tensor_type c;
-//  btas::contract(1.0, A, {1, 2}, B, {2, 3}, 0.0, c, {1, 3});
-//  return blk_t(std::move(c));
-//}
-
-/* TODO: call CUDA gemm here */
-template <typename Blk>
-static void device_gemm(Blk &C, const Blk &A, const Blk &B) {
-  using blk_t = Blk;
-  using T = typename blk_t::value_type;
-  static_assert(std::is_same_v<T,double> || std::is_same_v<T,float>);
-  static const T alpha = 1.0;
-  static const T beta  = 1.0;
-  // make sure all memory is on the device
-  // TODO: A and B are read-only so the owner device will be 0. How to fix?
-  //assert(A.b.get_current_device() != 0);
-  //assert(B.b.get_current_device() != 0);
-  auto device = ttg::device::current_device();
-  assert(device.is_device());
-#if defined(TTG_ENABLE_CUDA)
-  if constexpr (std::is_same_v<T,double>) {
-      cublasDgemm(cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, C.extent(0), C.extent(1), A.extent(1),
-                  &alpha, A.b.current_device_ptr(), A.extent(0), B.b.current_device_ptr(), B.extent(0), &beta,
-                  C.b.current_device_ptr(), C.extent(0));
-  }
-  else if constexpr (std::is_same_v<T,float>) {
-      cublasSgemm(cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, C.extent(0), C.extent(1), A.extent(1),
-                  &alpha, A.b.current_device_ptr(), A.extent(0), B.b.current_device_ptr(), B.extent(0), &beta,
-                  C.b.current_device_ptr(), C.extent(0));
-  }
-#elif defined(TTG_ENABLE_HIP)
-  if constexpr (std::is_same_v<T,double>) {
-    hipblasDgemm(hipblas_handle(),
-                 HIPBLAS_OP_N, HIPBLAS_OP_N,
-                 C.extent(0), C.extent(1), A.extent(1), &alpha,
-                 A.b.current_device_ptr(), A.extent(0),
-                 B.b.current_device_ptr(), B.extent(0), &beta,
-                 C.b.current_device_ptr(), C.extent(0));
-  } else if constexpr (std::is_same_v<T,float>) {
-    hipblasSgemm(hipblas_handle(),
-                 HIPBLAS_OP_N, HIPBLAS_OP_N,
-                 C.extent(0), C.extent(1), A.extent(1), &alpha,
-                 A.b.current_device_ptr(), A.extent(0),
-                 B.b.current_device_ptr(), B.extent(0), &beta,
-                 C.b.current_device_ptr(), C.extent(0));
-  }
-#elif defined(TTG_ENABLE_LEVEL_ZERO)
-
-#if defined(DEBUG_SYNCHRONOUS)
-  try {
-#endif /* DEBUG_SYNCHRONOUS */
-    cl::sycl::event gemm_event;
-    gemm_event = oneapi::mkl::blas::gemm(ttg::device::current_stream(),
-         oneapi::mkl::transpose::N, oneapi::mkl::transpose::N,
-         C.extent(0), C.extent(1), A.extent(1),
-         alpha, A.b.current_device_ptr(), A.extent(0),
-                B.b.current_device_ptr(), B.extent(0),
-         beta,  C.b.current_device_ptr(), C.extent(0));
-#if defined(DEBUG_SYNCHRONOUS)
-    gemm_event.wait();
-  } catch (const oneapi::mkl::invalid_argument &e) {
-    std::cerr << "OneAPI MKL BLAS GEMM throws invalid argument exception" << std::endl;
-  } catch (const oneapi::mkl::unsupported_device &e) {
-    std::cerr << "OneAPI MKL BLAS GEMM throws unsuported device exception" << std::endl;
-  } catch (const oneapi::mkl::host_bad_alloc &e) {
-    std::cerr << "OneAPI MKL BLAS GEMM throws host bad allocation exception" << std::endl;
-  } catch (const oneapi::mkl::device_bad_alloc &e) {
-    std::cerr << "OneAPI MKL BLAS GEMM throws device bad allocation exception" << std::endl;
-  } catch (const oneapi::mkl::unimplemented &e) {
-    std::cerr << "OneAPI MKL BLAS GEMM throws unimplemented exception" << std::endl;
-  } catch (const std::exception& e) {
-    std::cerr << "OneAPI MKL BLAS GEMM throws unexpected exception" << std::endl;
-  } catch (...) {
-    std::cerr << "OneAPI MKL BLAS GEMM throws unexpected exception that is also badly formatted..." << std::endl;
-  }
-#endif /* DEBUG_SYNCHRONOUS */
-#endif
-}
-
-#if defined(TTG_USE_PARSEC)
-namespace ttg {
-  template <>
-  struct SplitMetadataDescriptor<blk_t> {
-    // TODO: this is a quick and dirty approach.
-    //   - blk_t could have any number of dimensions, this code only works for 2 dim blocks
-    //   - we use Blk{} to send a control flow in some tasks below, these blocks have only
-    //     1 dimension (of size 0), to code this, we set the second dimension to 0 in our
-    //     quick and dirty linearization, then have a case when we create the object
-    //   - when we create the object with the metadata, we use a constructor that initializes
-    //     the data to 0, which is useless: the data could be left uninitialized
-    static auto get_metadata(const blk_t &b) {
-      std::pair<int, int> dim{0, 0};
-      if (!b.empty()) {
-        assert(b.range().extent().size() == 2);
-        std::get<0>(dim) = (int)b.range().extent(0);
-        std::get<1>(dim) = (int)b.range().extent(1);
-      }
-      return dim;
-    }
-    static auto get_data(blk_t &b) {
-      using T = typename blk_t::value_type;
-      if (!b.empty())
-        return boost::container::small_vector<iovec, 1>(1, iovec{b.size() * sizeof(T), b.data()});
-      else
-        return boost::container::small_vector<iovec, 1>{};
-    }
-    static auto create_from_metadata(const std::pair<int, int> &meta) {
-      if (meta != std::pair{0, 0}) // N.B. allocate only, do not fill with zeroes
-        return blk_t(btas::Range(std::get<0>(meta), std::get<1>(meta)));
-      else
-        return blk_t{};
-    }
-  };
-}  // namespace ttg
-#endif /* TTG_USE_PARSEC */
-
-// declare btas::Tensor serializable by Boost
-#include "ttg/serialization/backends/boost.h"
-namespace ttg::detail {
-  // BTAS defines all of its Boost serializers in boost::serialization namespace ... as explained in
-  // ttg/serialization/boost.h such functions are not detectable via SFINAE, so must explicitly define serialization
-  // traits here
-  template <typename Archive>
-  inline static constexpr bool is_boost_serializable_v<Archive, blk_t> = is_boost_archive_v<Archive>;
-  template <typename Archive>
-  inline static constexpr bool is_boost_serializable_v<Archive, const blk_t> = is_boost_archive_v<Archive>;
-}  // namespace ttg::detail
-
-#else
-using blk_t = double;
-#endif
-template <typename T = blk_t>
-using SpMatrix = Eigen::SparseMatrix<T>;
-template <typename T = blk_t>
-using SpMatrixTriplet = Eigen::Triplet<T>;  // {row,col,value}
-
-#if defined(BLOCK_SPARSE_GEMM) && defined(BTAS_IS_USABLE)
-
-#if __has_include(<madness/world/archive.h>)
-
-#include <madness/world/archive.h>
-
-#endif  // __has_include(<madness/world/archive.h>)
-
-namespace btas {
-  template <typename T_, class Range_, class Store_>
-  inline btas::Tensor<T_, Range_, Store_> operator*(const btas::Tensor<T_, Range_, Store_> &A,
-                                                    const btas::Tensor<T_, Range_, Store_> &B) {
-    btas::Tensor<T_, Range_, Store_> C;
-    btas::contract(1.0, A, {1, 2}, B, {2, 3}, 0.0, C, {1, 3});
-    return C;
-  }
-
-  template <typename T_, class Range_, class Store_>
-  btas::Tensor<T_, Range_, Store_> gemm(btas::Tensor<T_, Range_, Store_> &&C, const btas::Tensor<T_, Range_, Store_> &A,
-                                        const btas::Tensor<T_, Range_, Store_> &B) {
-    using array = btas::DEFAULT::index<int>;
-    if (C.empty()) {  // first contribution to C = allocate it and gemm with beta=0
-      C = btas::Tensor<T_, Range_, Store_>(btas::Range(A.range().extent(0), B.range().extent(1)));
-      btas::contract_222(1.0, A, array{1, 2}, B, array{2, 3}, 0.0, C, array{1, 3}, false, false);
-    }
-    else {   // subsequent contributions to C = gemm with beta=1
-      btas::contract_222(1.0, A, array{1, 2}, B, array{2, 3}, 1.0, C, array{1, 3}, false, false);
-    }
-    return std::move(C);
-  }
-}  // namespace btas
-#endif  // BTAS_IS_USABLE
-double gemm(double C, double A, double B) { return C + A * B; }
-
-// template <typename _Scalar, int _Options, typename _StorageIndex>
-// struct colmajor_layout;
-// template <typename _Scalar, typename _StorageIndex>
-// struct colmajor_layout<_Scalar, Eigen::ColMajor, _StorageIndex> : public std::true_type {};
-// template <typename _Scalar, typename _StorageIndex>
-// struct colmajor_layout<_Scalar, Eigen::RowMajor, _StorageIndex> : public std::false_type {};
-
-template <std::size_t Rank>
-using Key = MultiIndex<Rank>;
-
-/// maps {i,j} to rank within first (R=0) layer of the 3-d process grid
-inline int ij2rank(int i, int j, int P, int Q) {
-  std::vector<int> vec;
-  int p = (i % P);
-  int q = (j % Q);
-  int rank = (q * P) + p;
-  return rank;
-}
-
-/// maps {i,j,k} to rank within a 3-d process grid
-inline int ijk2rank(int i, int j, int k, int P, int Q, int R) {
-  std::vector<int> vec;
-  int p = (i % P);
-  int q = (j % Q);
-  int l = (k % R);
-  int rank = (l * P * Q) + (q * P) + p;
-  return rank;
-}
-
-// flow data from an existing SpMatrix on rank 0
-template <typename Blk = blk_t, typename Keymap = std::function<int(const Key<2> &)>>
-class Read_SpMatrix : public TT<Key<2>, std::tuple<Out<Key<2>, Blk>>, Read_SpMatrix<Blk, Keymap>, ttg::typelist<void>> {
- public:
-  using baseT = typename Read_SpMatrix::ttT;
-  Read_SpMatrix(const char *label, const SpMatrix<Blk> &matrix, Edge<Key<2>> &ctl, Edge<Key<2>, Blk> &out,
-                Keymap &ij_keymap)
-      : baseT(edges(ctl), edges(out), std::string("read_spmatrix(") + label + ")", {"ctl"}, {std::string(label) + "ij"},
-              ij_keymap)
-      , matrix_(matrix) {}
-
-  void op(const Key<2> &, std::tuple<Out<Key<2>, Blk>> &out) {
-    auto rank = ttg::default_execution_context().rank();
-    for (int k = 0; k < matrix_.outerSize(); ++k) {
-      for (typename SpMatrix<Blk>::InnerIterator it(matrix_, k); it; ++it) {
-        if (rank == this->get_keymap()(Key<2>(std::initializer_list<long>({it.row(), it.col()}))))
-          ::send<0>(Key<2>(std::initializer_list<long>({it.row(), it.col()})), ttg::persistent(it.value()), out);
-      }
-    }
-  }
-
- private:
-  const SpMatrix<Blk> &matrix_;
-};
-
-// flow (move?) data into an existing SpMatrix on rank 0
-template <typename Blk = blk_t>
-class Write_SpMatrix : public TT<Key<2>, std::tuple<>, Write_SpMatrix<Blk>, ttg::typelist<Blk>> {
- public:
-  using baseT = typename Write_SpMatrix::ttT;
-
-  template <typename Keymap2>
-  Write_SpMatrix(SpMatrix<Blk> &matrix, Edge<Key<2>, Blk> &in, Keymap2 &&ij_keymap, bool write_back = false)
-      : baseT(edges(in), edges(), "write_spmatrix", {"Cij"}, {}, ij_keymap)
-      , matrix_(matrix)
-      , write_back(write_back)
-  { }
-
-  void op(const Key<2> &key, typename baseT::input_refs_tuple_type &&elem, std::tuple<> &) {
-
-    if (write_back) {
-      std::lock_guard<std::mutex> lock(mtx_);
-      ttg::trace("rank =", default_execution_context().rank(),
-                 "/ thread_id =", reinterpret_cast<std::uintptr_t>(pthread_self()), "spmm.cc Write_SpMatrix wrote {",
-                 key[0], ",", key[1], "} = ", baseT::template get<0>(elem), " in ", static_cast<void *>(&matrix_),
-                 " with mutex @", static_cast<void *>(&mtx_), " for object @", static_cast<void *>(this));
-      values_.emplace_back(key[0], key[1], std::move(baseT::template get<0>(elem)));
-    }
-  }
-
-  /// grab completion status as a future<void>
-  /// \note cannot be called once this is executable
-  const std::shared_future<void> &status() const {
-    assert(!this->is_executable());
-    if (!completion_status_) {  // if not done yet, register completion work with the world
-      auto promise = std::make_shared<std::promise<void>>();
-      completion_status_ = std::make_shared<std::shared_future<void>>(promise->get_future());
-      ttg_register_status(this->get_world(), std::move(promise));
-      ttg_register_callback(this->get_world(),
-                            [this]() { this->matrix_.setFromTriplets(this->values_.begin(), this->values_.end()); });
-    } else {  // if done already, commit the result
-      this->matrix_.setFromTriplets(this->values_.begin(), this->values_.end());
-    }
-    return *completion_status_;
-  }
-
- private:
-  std::mutex mtx_;
-  SpMatrix<Blk> &matrix_;
-  std::vector<SpMatrixTriplet<Blk>> values_;
-  mutable std::shared_ptr<std::shared_future<void>> completion_status_;
-  bool write_back = false;
-};
-
-/// sparse mm via 2.5D SUMMA
-
-/// @tparam KeyMap2 maps {i,j} to processor
-/// @tparam KeyMap3 maps {i,j,k} to processor
-template <typename Keymap2 = std::function<int(const Key<2> &)>, typename Keymap3 = std::function<int(const Key<3> &)>,
-          typename Blk = blk_t>
-class SpMM25D {
- public:
-  /// @param ij_keymap maps {i,j} to process, specifies distribution of tiles of A, B, and C
-  /// @param ijk_keymap maps {i,j,k} to process, controls distribution of tasks performing C[i][j] += A[i][k]*B[k][j]
-  /// @param R the number of "layers" in the 3-D process grid
-  SpMM25D(Edge<Key<2>, Blk> &a, Edge<Key<2>, Blk> &b, Edge<Key<2>, Blk> &c, const SpMatrix<Blk> &a_mat,
-          const SpMatrix<Blk> &b_mat, const std::vector<std::vector<long>> &a_rowidx_to_colidx,
-          const std::vector<std::vector<long>> &a_colidx_to_rowidx,
-          const std::vector<std::vector<long>> &b_rowidx_to_colidx,
-          const std::vector<std::vector<long>> &b_colidx_to_rowidx, const std::vector<int> &mTiles,
-          const std::vector<int> &nTiles, const std::vector<int> &kTiles, Keymap2 ij_keymap, Keymap3 ijk_keymap, long R)
-      : a_rowidx_to_colidx_(a_rowidx_to_colidx)
-      , b_colidx_to_rowidx_(b_colidx_to_rowidx)
-      , a_colidx_to_rowidx_(a_colidx_to_rowidx)
-      , b_rowidx_to_colidx_(b_rowidx_to_colidx)
-      , ij_keymap_(std::move(ij_keymap))
-      , ijk_keymap_(std::move(ijk_keymap)) {
-    bcast_a_ = std::make_unique<BcastA>(a, local_a_ijk_, b_rowidx_to_colidx_, ij_keymap_, ijk_keymap_);
-    local_bcast_a_ = std::make_unique<LocalBcastA>(local_a_ijk_, a_ijk_, b_rowidx_to_colidx_, ijk_keymap_);
-    bcast_b_ = std::make_unique<BcastB>(b, local_b_ijk_, a_colidx_to_rowidx_, ij_keymap_, ijk_keymap_);
-    local_bcast_b_ = std::make_unique<LocalBcastB>(local_b_ijk_, b_ijk_, a_colidx_to_rowidx_, ijk_keymap_);
-    multiplyadd_ = std::make_unique<MultiplyAdd>(a_ijk_, b_ijk_, c_ijk_, c_ij_p_, a_rowidx_to_colidx_,
-                                                 b_colidx_to_rowidx_, mTiles, nTiles, ijk_keymap_);
-    reduce_c_ = std::make_unique<ReduceC>(c_ij_p_, c, ij_keymap_);
-    reduce_c_->template set_input_reducer<0>([](Blk &c_ij, const Blk &c_ij_p) { c_ij = c_ij + c_ij_p; });
-    // compute how many contributions each C[i][j] should expect ... MultiplyAdd already does this, but need a way to
-    // send message from each process p to the process owning C[i][j] to expect a contribution from it for now replicate
-    // this logic ...
-    // TODO: do this in MultiplyAdd (need to allreduce this info so that everyone has it)
-    // N.B. only need to set stream size on the rank that will accumulate the C[i][j] contribution
-    const auto my_rank = ttg::default_execution_context().rank();
-    for (auto i = 0ul; i != a_rowidx_to_colidx_.size(); ++i) {
-      if (a_rowidx_to_colidx_[i].empty()) continue;
-      for (auto j = 0ul; j != b_colidx_to_rowidx_.size(); ++j) {
-        if (b_colidx_to_rowidx_[j].empty()) continue;
-
-        if (ij_keymap_(Key<2>{i, j}) == my_rank) {
-          decltype(i) k;
-          bool have_k;
-          std::tie(k, have_k) = multiplyadd_->compute_first_k(i, j);
-          std::vector<bool> c_ij_procmask(R, false);
-          if (have_k) {
-            const auto pR = k % R;  // k values are distributed round-robin among the layers of the 3-D grid
-            assert(pR < c_ij_procmask.size());
-            c_ij_procmask[pR] = true;
-            while (have_k) {
-              std::tie(k, have_k) = multiplyadd_->compute_next_k(i, j, k);
-              if (have_k) {
-                const auto pR = k % R;
-                assert(pR < c_ij_procmask.size());
-                c_ij_procmask[pR] = true;
-              }
-            }
-          }
-          const auto c_ij_nprocs = std::count_if(c_ij_procmask.begin(), c_ij_procmask.end(), [](bool b) { return b; });
-          if (c_ij_nprocs > 0) reduce_c_->template set_argstream_size<0>(Key<2>{i, j}, c_ij_nprocs);
-        }
-      }
-    }
-
-    TTGUNUSED(bcast_a_);
-    TTGUNUSED(bcast_b_);
-    TTGUNUSED(multiplyadd_);
-    TTGUNUSED(reduce_c_);
-  }
-
-  /// Locally broadcast `A[i][k]` assigned to this processor `p` to matmul tasks `{i,j,k}` for all `j` such that
-  /// `B[k][j]` exists AND `k` contribution to `C[i][j]` is assigned to this processor
-  class LocalBcastA : public TT<Key<3>, std::tuple<Out<Key<3>, Blk>>, LocalBcastA, ttg::typelist<const Blk>> {
-   public:
-    using baseT = typename LocalBcastA::ttT;
-
-    LocalBcastA(Edge<Key<3>, Blk> &a, Edge<Key<3>, Blk> &a_ijk,
-                const std::vector<std::vector<long>> &b_rowidx_to_colidx, const Keymap3 &ijk_keymap)
-        : baseT(edges(a), edges(a_ijk), "SpMM25D::local_bcast_a", {"a_ikp"}, {"a_ijk"},
-                [](const Key<3> &ikp) { return ikp[2]; })
-        , b_rowidx_to_colidx_(b_rowidx_to_colidx)
-        , ijk_keymap_(ijk_keymap) {}
-
-    void op(const Key<3> &ikp, typename baseT::input_refs_tuple_type &&a_ik, std::tuple<Out<Key<3>, Blk>> &a_ijk) {
-      const auto i = ikp[0];
-      const auto k = ikp[1];
-      const auto p = ikp[2];
-
-      auto world = default_execution_context();
-      assert(p == world.rank());
-      ttg::trace("LocalBcastA(", i, ", ", k, ", ", p, ")");
-      if (k >= b_rowidx_to_colidx_.size()) return;
-      // local broadcast a_ik to all {i,j,k} such that b_kj exists
-      std::vector<Key<3>> ijk_keys;
-      for (auto &j : b_rowidx_to_colidx_[k]) {
-        if (ijk_keymap_(Key<3>({i, j, k})) == world.rank()) {
-          ttg::trace("Broadcasting A[", i, "][", k, "] on proc ", p, " to j=", j);
-          ijk_keys.emplace_back(Key<3>({i, j, k}));
-        }
-      }
-      ::broadcast<0>(ijk_keys, std::move(baseT::template get<0>(a_ik)), a_ijk);
-    }
-
-   private:
-    const std::vector<std::vector<long>> &b_rowidx_to_colidx_;
-    const Keymap3 &ijk_keymap_;
-  };  // class LocalBcastA
-
-  /// broadcast `A[i][k]` to all processors which will contain at least one `C[i][j]` such that `B[k][j]` exists
-  class BcastA : public TT<Key<2>, std::tuple<Out<Key<3>, Blk>>, BcastA, ttg::typelist<const Blk>> {
-   public:
-    using baseT = typename BcastA::ttT;
-
-    BcastA(Edge<Key<2>, Blk> &a_ik, Edge<Key<3>, Blk> &a_ikp, const std::vector<std::vector<long>> &b_rowidx_to_colidx,
-           const Keymap2 &ij_keymap, const Keymap3 &ijk_keymap)
-        : baseT(edges(a_ik), edges(a_ikp), "SpMM25D::bcast_a", {"a_ik"}, {"a_ikp"}, ij_keymap)
-        , b_rowidx_to_colidx_(b_rowidx_to_colidx)
-        , ijk_keymap_(ijk_keymap) {}
-
-    void op(const Key<2> &ik, typename baseT::input_refs_tuple_type &&a_ik, std::tuple<Out<Key<3>, Blk>> &a_ikp) {
-      const auto i = ik[0];
-      const auto k = ik[1];
-      ttg::trace("BcastA(", i, ", ", k, ")");
-      std::vector<Key<3>> ikp_keys;
-
-      if (k >= b_rowidx_to_colidx_.size()) return;
-      auto world = default_execution_context();
-      std::vector<bool> procmap(world.size());
-      for (auto &j : b_rowidx_to_colidx_[k]) {
-        const long p = ijk_keymap_(Key<3>(
-            {i, j, k}));  // N.B. in 2.5D SUMMA different k contributions to C[i][j] are computed on different nodes
-        if (!procmap[p]) {
-          ttg::trace("Broadcasting A[", i, "][", k, "] to proc ", p);
-          ikp_keys.emplace_back(Key<3>({i, k, p}));
-          procmap[p] = true;
-        }
-      }
-      ::broadcast<0>(ikp_keys, std::move(baseT::template get<0>(a_ik)), a_ikp);
-    }
-
-   private:
-    const std::vector<std::vector<long>> &b_rowidx_to_colidx_;
-    const Keymap3 &ijk_keymap_;
-  };  // class BcastA
-
-  /// Locally broadcast `B[k][j]` assigned to this processor `p` to matmul tasks `{i,j,k}` for all `k` such that
-  /// `A[i][k]` exists AND `k` contribution to `C[i][j]` is assigned to this processor
-  class LocalBcastB : public TT<Key<3>, std::tuple<Out<Key<3>, Blk>>, LocalBcastB, ttg::typelist<const Blk>> {
-   public:
-    using baseT = typename LocalBcastB::ttT;
-
-    LocalBcastB(Edge<Key<3>, Blk> &b_kjp, Edge<Key<3>, Blk> &b_ijk,
-                const std::vector<std::vector<long>> &a_colidx_to_rowidx, const Keymap3 &ijk_keymap)
-        : baseT(edges(b_kjp), edges(b_ijk), "SpMM25D::local_bcast_b", {"b_kjp"}, {"b_ijk"},
-                [](const Key<3> &kjp) { return kjp[2]; })
-        , a_colidx_to_rowidx_(a_colidx_to_rowidx)
-        , ijk_keymap_(ijk_keymap) {}
-
-    void op(const Key<3> &kjp, typename baseT::input_refs_tuple_type &&b_kj, std::tuple<Out<Key<3>, Blk>> &b_ijk) {
-      const auto k = kjp[0];
-      const auto j = kjp[1];
-      const auto p = kjp[2];
-      auto world = default_execution_context();
-      assert(p == world.rank());
-      ttg::trace("BcastB(", k, ", ", j, ", ", p, ")");
-      if (k >= a_colidx_to_rowidx_.size()) return;
-      // broadcast b_kj to all ijk for which c_ij is on this processor and a_ik exists
-      std::vector<Key<3>> ijk_keys;
-      for (auto &i : a_colidx_to_rowidx_[k]) {
-        if (ijk_keymap_(Key<3>({i, j, k})) == world.rank()) {
-          ttg::trace("Broadcasting B[", k, "][", j, "] on proc ", p, " to i=", i);
-          ijk_keys.emplace_back(Key<3>({i, j, k}));
-        }
-      }
-      ::broadcast<0>(ijk_keys, std::move(baseT::template get<0>(b_kj)), b_ijk);
-    }
-
-   private:
-    const std::vector<std::vector<long>> &a_colidx_to_rowidx_;
-    const Keymap3 &ijk_keymap_;
-  };  // class LocalBcastB
-
-  /// broadcast `B[k][j]` to all processors which will contain at least one `C[i][j]` such that `A[i][k]` exists
-  class BcastB : public TT<Key<2>, std::tuple<Out<Key<3>, Blk>>, BcastB, ttg::typelist<const Blk>> {
-   public:
-    using baseT = typename BcastB::ttT;
-
-    BcastB(Edge<Key<2>, Blk> &b_kj, Edge<Key<3>, Blk> &b_kjp, const std::vector<std::vector<long>> &a_colidx_to_rowidx,
-           const Keymap2 &ij_keymap, const Keymap3 &ijk_keymap)
-        : baseT(edges(b_kj), edges(b_kjp), "SpMM25D::bcast_b", {"b_kj"}, {"b_kjp"}, ij_keymap)
-        , a_colidx_to_rowidx_(a_colidx_to_rowidx)
-        , ijk_keymap_(ijk_keymap) {}
-
-    void op(const Key<2> &kj, typename baseT::input_refs_tuple_type &&b_kj, std::tuple<Out<Key<3>, Blk>> &b_kjp) {
-      const auto k = kj[0];
-      const auto j = kj[1];
-      // broadcast b_kj to all processors which will contain at least one c_ij such that a_ik exists
-      std::vector<Key<3>> kjp_keys;
-      ttg::trace("BcastB(", k, ", ", j, ")");
-      if (k >= a_colidx_to_rowidx_.size()) return;
-      auto world = default_execution_context();
-      std::vector<bool> procmap(world.size());
-      for (auto &i : a_colidx_to_rowidx_[k]) {
-        long p = ijk_keymap_(Key<3>({i, j, k}));
-        if (!procmap[p]) {
-          ttg::trace("Broadcasting B[", k, "][", j, "] to proc ", p);
-          kjp_keys.emplace_back(Key<3>({k, j, p}));
-          procmap[p] = true;
-        }
-      }
-      ::broadcast<0>(kjp_keys, std::move(baseT::template get<0>(b_kj)), b_kjp);
-    }
-
-   private:
-    const std::vector<std::vector<long>> &a_colidx_to_rowidx_;
-    const Keymap3 &ijk_keymap_;
-  };  // class BcastB
-
-  /// multiply task has 3 input flows: a_ijk, b_ijk, and c_ijk, c_ijk contains the running total for this kayer of the
-  /// 3-D process grid only
-  class MultiplyAdd : public TT<Key<3>, std::tuple<Out<Key<2>, Blk>, Out<Key<3>, Blk>>, MultiplyAdd,
-                                ttg::typelist<const Blk, const Blk, Blk>> {
-   public:
-    using baseT = typename MultiplyAdd::ttT;
-
-#if defined(TTG_ENABLE_CUDA)
-    static constexpr bool have_cuda_op = true;
-#warning SPMM using CUDA implementation
-#elif defined(TTG_ENABLE_HIP)
-    static constexpr bool have_hip_op  = true;
-#warning SPMM using HIP implementation
-#elif defined(TTG_ENABLE_LEVEL_ZERO)
-    static constexpr bool have_level_zero_op = true;
-#warning SPMM using LEVEL_ZERO implementation
-#else
-#error No valid device implementation found!
-#endif
-
-    MultiplyAdd(Edge<Key<3>, Blk> &a_ijk, Edge<Key<3>, Blk> &b_ijk, Edge<Key<3>, Blk> &c_ijk, Edge<Key<2>, Blk> &c,
-                const std::vector<std::vector<long>> &a_rowidx_to_colidx,
-                const std::vector<std::vector<long>> &b_colidx_to_rowidx, const std::vector<int> &mTiles,
-                const std::vector<int> &nTiles, const Keymap3 &ijk_keymap)
-        : baseT(edges(a_ijk, b_ijk, c_ijk), edges(c, c_ijk), "SpMM25D::MultiplyAdd", {"a_ijk", "b_ijk", "c_ijk"},
-                {"c_ij", "c_ijk"}, ijk_keymap)
-        , a_rowidx_to_colidx_(a_rowidx_to_colidx)
-        , b_colidx_to_rowidx_(b_colidx_to_rowidx) {
-      this->set_priomap([this](const Key<3> &ijk) { return this->prio(ijk); });  // map a key to an integral priority value
-      auto num_devices = ttg::device::num_devices();
-      this->set_devicemap(
-        [num_devices](const Key<3> &ijk){
-          return ((((uint64_t)ijk[0]) << 32) + ijk[1]) % num_devices;
-        });
-      // for each {i,j} determine first k that contributes AND belongs to this node,
-      // initialize input {i,j,first_k} flow to 0
-      for (auto i = 0ul; i != a_rowidx_to_colidx_.size(); ++i) {
-        if (a_rowidx_to_colidx_[i].empty()) continue;
-        for (auto j = 0ul; j != b_colidx_to_rowidx_.size(); ++j) {
-          if (b_colidx_to_rowidx_[j].empty()) continue;
-
-          const auto p = ttg::default_execution_context().rank();
-          decltype(i) k;
-          bool have_k;
-          std::tie(k, have_k) = compute_first_k(i, j, p);
-          if (have_k) {
-            ttg::trace("Initializing C[", i, "][", j, "] on process ", p, " to zero");
-#if BLOCK_SPARSE_GEMM
-            Blk zero(btas::Range(mTiles[i], nTiles[j]), 0.0);
-#else
-            Blk zero{0.0};
-#endif
-            this->template in<2>()->send(Key<3>({i, j, k}), zero);
-          } else {
-            if (tracing() && a_rowidx_to_colidx_.size() * b_colidx_to_rowidx_.size() < 400)
-              ttg::print("C[", i, "][", j, "] is empty");
-          }
-        }
-      }
-    }
-
-    ttg::device::Task op(const Key<3> &ijk, typename baseT::input_refs_tuple_type &&_ijk,
-            std::tuple<Out<Key<2>, Blk>, Out<Key<3>, Blk>> &result) {
-      const auto i = ijk[0];
-      const auto j = ijk[1];
-      const auto k = ijk[2];  // k==l same because 000 will always be on layer 0, 001 will be accessed on layer 1
-      const auto p = ttg::default_execution_context().rank();
-      long next_k;
-      bool have_next_k;
-
-      const blk_t& A = baseT::template get<0>(_ijk);
-      const blk_t& B = baseT::template get<1>(_ijk);
-      blk_t& C = baseT::template get<2>(_ijk);
-
-      if (C.empty()) {
-        C = blk_t(btas::Range(A.range().extent(0), B.range().extent(1)), 0.0);
-      }
-
-      /* pull all buffers onto the device */
-      co_await ttg::device::select(A.b, B.b, C.b);
-
-      /* everything is on the device, call the gemm */
-      device_gemm(C, A, B);
-
-      /* compute next k while the kernel is running */
-      std::tie(next_k, have_next_k) = compute_next_k(i, j, k, p);
-      ttg::trace("Rank ", ttg::default_execution_context().rank(),
-                 " :"
-                 " C[",
-                 i, "][", j, "]  += A[", i, "][", k, "] by B[", k, "][", j, "],  next_k? ",
-                 (have_next_k ? std::to_string(next_k) : "does not exist"));
-
-      /* wait for the kernel to complete */
-      co_await ttg::device::wait();
-
-
-      // compute the contrib, pass the running total to the next flow, if needed
-      // otherwise write to the result flow
-      if (have_next_k) {
-        co_await ttg::device::forward(ttg::device::send<1>(
-                                                Key<3>({i, j, next_k}),
-                                                std::move(C),
-                                                result));
-      } else {  // done with all local contributions to C[i][j], reduce with others on the process to which C[i][j]
-                // belongs
-        co_await ttg::device::forward(ttg::device::send<0>(
-                                                Key<2>({i, j}),
-                                                std::move(C),
-                                                result));
-      }
-    }
-
-   private:
-    const std::vector<std::vector<long>> &a_rowidx_to_colidx_;
-    const std::vector<std::vector<long>> &b_colidx_to_rowidx_;
-
-    /* Compute the length of the remaining sequence on that tile */
-    int32_t prio(const Key<3> &key) const {
-      const auto i = key[0];
-      const auto j = key[1];
-      const auto k = key[2];
-      int32_t len = -1;  // will be incremented at least once
-      long next_k = k;
-      bool have_next_k;
-      do {
-        std::tie(next_k, have_next_k) = compute_next_k(i, j, next_k);  // here I know how many 'k' I have with same ij
-        ++len;
-      } while (have_next_k);
-      return len;
-    }
-
-   public:  // to be able to reuse this logic in SpMM25D
-    // given {i,j} return first k such that A[i][k] and B[k][j] exist
-    std::tuple<long, bool> compute_first_k(long i, long j) const {
-      const auto &a_k_range = a_rowidx_to_colidx_.at(i);
-      auto a_iter = a_k_range.begin();
-      auto a_iter_fence = a_k_range.end();
-      if (a_iter == a_iter_fence) return std::make_tuple(-1, false);
-      const auto &b_k_range = b_colidx_to_rowidx_.at(j);
-      auto b_iter = b_k_range.begin();
-      auto b_iter_fence = b_k_range.end();
-      if (b_iter == b_iter_fence) return std::make_tuple(-1, false);
-
-      {
-        auto a_colidx = *a_iter;  // pointing to next kth element
-        auto b_rowidx = *b_iter;
-        while (a_colidx != b_rowidx) {
-          if (a_colidx < b_rowidx) {
-            ++a_iter;
-            if (a_iter == a_iter_fence) return std::make_tuple(-1, false);
-            a_colidx = *a_iter;
-          } else {
-            ++b_iter;
-            if (b_iter == b_iter_fence) return std::make_tuple(-1, false);
-            b_rowidx = *b_iter;
-          }
-        }
-        return std::make_tuple(a_colidx, true);  // returned true for kth element exist and also returns next k since
-                                                 // a_colidx points to ++a_iter,  if not reaches to fence
-      }
-      assert(false);
-    }
-
-    // given {i,j,k} such that A[i][k] and B[k][j] exist
-    // return next k such that this condition holds
-    std::tuple<long, bool> compute_next_k(long i, long j, long k) const {
-      const auto &a_k_range = a_rowidx_to_colidx_.at(i);
-      auto a_iter_fence = a_k_range.end();
-      auto a_iter = std::find(a_k_range.begin(), a_iter_fence, k);
-      assert(a_iter != a_iter_fence);
-      const auto &b_k_range = b_colidx_to_rowidx_.at(j);
-      auto b_iter_fence = b_k_range.end();
-      auto b_iter = std::find(b_k_range.begin(), b_iter_fence, k);
-      assert(b_iter != b_iter_fence);
-      while (a_iter != a_iter_fence && b_iter != b_iter_fence) {
-        ++a_iter;
-        ++b_iter;
-        if (a_iter == a_iter_fence || b_iter == b_iter_fence) return std::make_tuple(-1, false);
-        auto a_colidx = *a_iter;
-        auto b_rowidx = *b_iter;
-        while (a_colidx != b_rowidx) {
-          if (a_colidx < b_rowidx) {
-            ++a_iter;
-            if (a_iter == a_iter_fence) return std::make_tuple(-1, false);
-            a_colidx = *a_iter;
-          } else {
-            ++b_iter;
-            if (b_iter == b_iter_fence) return std::make_tuple(-1, false);
-            b_rowidx = *b_iter;
-          }
-        }
-        return std::make_tuple(a_colidx, true);
-      }
-      ttg::abort();  // unreachable
-      return std::make_tuple(0, false);
-    }
-
-    // given {i,j} return first k such that A[i][k] and B[k][j] exist AND ijk_keymap_(i,j,k) == p
-    std::tuple<long, bool> compute_first_k(long i, long j, long p) const {
-      long first_k = 0;
-      bool have_k = false;
-      std::tie(first_k, have_k) = compute_first_k(i, j);
-      while (have_k) {
-        if (this->get_keymap()(Key<3>{i, j, first_k}) == p)
-          return {first_k, true};
-        else
-          std::tie(first_k, have_k) = compute_next_k(i, j, first_k);
-      }
-      return {0, false};
-    }
-
-    // given {i,j,k} such that A[i][k] and B[k][j] exist
-    // return next k such that this condition holds AND ijk_keymap_(i,j,k) == p
-    std::tuple<long, bool> compute_next_k(long i, long j, long k, long p) const {
-      long next_k = 0;
-      bool have_k = false;
-      std::tie(next_k, have_k) = compute_next_k(i, j, k);
-      while (have_k) {
-        if (this->get_keymap()(Key<3>{i, j, next_k}) == p)
-          return {next_k, true};
-        else
-          std::tie(next_k, have_k) = compute_next_k(i, j, next_k);
-      }
-      return {0, false};
-    }
-
-  };  // MultiplyAdd
-
-  /// reduces contributions to `C[i][j]` produced on different layers of the 3-d process grid
-  class ReduceC : public TT<Key<2>, std::tuple<Out<Key<2>, Blk>>, ReduceC, ttg::typelist<Blk>> {
-   public:
-    using baseT = typename ReduceC::ttT;
-
-    ReduceC(Edge<Key<2>, Blk> &c_ij_p, Edge<Key<2>, Blk> &c_ij, const Keymap2 &ij_keymap)
-        : baseT(edges(c_ij_p), edges(c_ij), "SpMM25D::reduce_c", {"c_ij(p)"}, {"c_ij"}, ij_keymap) {}
-
-    void op(const Key<2> &ij, typename baseT::input_refs_tuple_type &&c_ij_p, std::tuple<Out<Key<2>, Blk>> &c_ij) {
-      ttg::trace("ReduceC(", ij[0], ", ", ij[1], ")");
-      ::send<0>(ij, std::move(baseT::template get<0>(c_ij_p)), c_ij);
-    }
-  };  // class ReduceC
-
- private:
-  Edge<Key<3>, Blk> a_ijk_;
-  Edge<Key<3>, Blk> local_a_ijk_;
-  Edge<Key<3>, Blk> b_ijk_;
-  Edge<Key<3>, Blk> local_b_ijk_;
-  Edge<Key<3>, Blk> c_ijk_;
-  Edge<Key<2>, Blk> c_ij_p_;
-  const std::vector<std::vector<long>> &a_rowidx_to_colidx_;
-  const std::vector<std::vector<long>> &b_colidx_to_rowidx_;
-  const std::vector<std::vector<long>> &a_colidx_to_rowidx_;
-  const std::vector<std::vector<long>> &b_rowidx_to_colidx_;
-  std::unique_ptr<BcastA> bcast_a_;
-  std::unique_ptr<LocalBcastA> local_bcast_a_;
-  std::unique_ptr<BcastB> bcast_b_;
-  std::unique_ptr<LocalBcastB> local_bcast_b_;
-  std::unique_ptr<MultiplyAdd> multiplyadd_;
-  std::unique_ptr<ReduceC> reduce_c_;
-  Keymap2 ij_keymap_;
-  Keymap3 ijk_keymap_;
-};
-
-class Control : public TT<void, std::tuple<Out<Key<2>>>, Control> {
-  using baseT = typename Control::ttT;
-  int P;
-  int Q;
-
- public:
-  explicit Control(Edge<Key<2>> &ctl) : baseT(edges(), edges(ctl), "Control", {}, {"ctl"}), P(0), Q(0) {}
-
-  void op(std::tuple<Out<Key<2>>> &out) const {
-    for (int p = 0; p < P; p++) {
-      for (int q = 0; q < Q; q++) {
-        ttg::trace("Control: start computing on process {", p, ", ", q, "}");
-        ::sendk<0>(Key<2>{p, q}, out);
-      }
-    }
-  }
-
-  void start(const int _p, const int _q) {
-    P = _p;
-    Q = _q;
-    invoke();
-  }
-};
-
-#ifdef BTAS_IS_USABLE
-template <typename T_, class Range_, class Store_>
-std::tuple<T_, T_> norms(const btas::Tensor<T_, Range_, Store_> &t) {
-  T_ norm_2_square = 0.0;
-  T_ norm_inf = 0.0;
-  for (auto k : t) {
-    norm_2_square += k * k;
-    norm_inf = std::max(norm_inf, std::abs(k));
-  }
-  return std::make_tuple(norm_2_square, norm_inf);
-}
-#endif
-
-std::tuple<double, double> norms(double t) { return std::make_tuple(t * t, std::abs(t)); }
-
-template <typename Blk = blk_t>
-std::tuple<double, double> norms(const SpMatrix<Blk> &A) {
-  double norm_2_square = 0.0;
-  double norm_inf = 0.0;
-  for (int i = 0; i < A.outerSize(); ++i) {
-    for (typename SpMatrix<Blk>::InnerIterator it(A, i); it; ++it) {
-      //  cout << 1+it.row() << "\t"; // row index
-      //  cout << 1+it.col() << "\t"; // col index (here it is equal to k)
-      //  cout << it.value() << endl;
-      auto& elem = it.value();
-      double elem_norm_2_square, elem_norm_inf;
-      std::tie(elem_norm_2_square, elem_norm_inf) = norms(elem);
-      norm_2_square += elem_norm_2_square;
-      norm_inf = std::max(norm_inf, elem_norm_inf);
-    }
-  }
-  return std::make_tuple(norm_2_square, norm_inf);
-}
-
-char *getCmdOption(char **begin, char **end, const std::string &option) {
-  static char *empty = (char *)"";
-  char **itr = std::find(begin, end, option);
-  if (itr != end && ++itr != end) return *itr;
-  return empty;
-}
-
-bool cmdOptionExists(char **begin, char **end, const std::string &option) {
-  return std::find(begin, end, option) != end;
-}
-
-int cmdOptionIndex(char **begin, char **end, const std::string &option) {
-  char **itr = std::find(begin, end, option);
-  if (itr != end) return (int)(itr - begin);
-  return -1;
-}
-
-static int parseOption(std::string &option, int default_value) {
-  size_t pos;
-  std::string token;
-  int N = default_value;
-  if (option.length() == 0) return N;
-  pos = option.find(':');
-  if (pos == std::string::npos) {
-    pos = option.length();
-  }
-  token = option.substr(0, pos);
-  N = std::stoi(token);
-  option.erase(0, pos + 1);
-  return N;
-}
-
-static long parseOption(std::string &option, long default_value) {
-  size_t pos;
-  std::string token;
-  long N = default_value;
-  if (option.length() == 0) return N;
-  pos = option.find(':');
-  if (pos == std::string::npos) {
-    pos = option.length();
-  }
-  token = option.substr(0, pos);
-  N = std::stol(token);
-  option.erase(0, pos + 1);
-  return N;
-}
-
-static double parseOption(std::string &option, double default_value = 0.25) {
-  size_t pos;
-  std::string token;
-  double N = default_value;
-  if (option.length() == 0) return N;
-  pos = option.find(':');
-  if (pos == std::string::npos) {
-    pos = option.length();
-  }
-  token = option.substr(0, pos);
-  N = std::stod(token);
-  option.erase(0, pos + 1);
-  return N;
-}
-
-#if !defined(BLOCK_SPARSE_GEMM)
-static void initSpMatrixMarket(const std::function<int(const Key<2> &)> &keymap, const char *filename, SpMatrix<> &A,
-                               SpMatrix<> &B, SpMatrix<> &C, int &M, int &N, int &K) {
-  std::vector<int> sizes;
-  // We load the entire matrix on each rank, but we only use the local part for the GEMM
-  // loadMarket() is the eigan fuction to load matrix from a file
-  if (!loadMarket(A, filename)) {
-    std::cerr << "Failed to load " << filename << ", bailing out..." << std::endl;
-    ttg::ttg_abort();
-  }
-  if (0 == ttg::default_execution_context().rank()) {
-    std::cout << "##MatrixMarket file " << filename << " -- " << A.rows() << " x " << A.cols() << " -- " << A.nonZeros()
-              << " nnz (density: " << (float)A.nonZeros() / (float)A.rows() / (float)A.cols() << ")" << std::endl;
-  }
-  if (A.rows() != A.cols()) {
-    B = A.transpose();
-  } else {
-    B = A;
-  }
-
-  C.resize(A.rows(), B.cols());
-  M = (int)A.rows();
-  N = (int)C.cols();
-  K = (int)A.cols();
-}
-
-static void initSpRmat(const std::function<int(const Key<2> &)> &keymap, const char *opt, SpMatrix<> &A, SpMatrix<> &B,
-                       SpMatrix<> &C, int &M, int &N, int &K, unsigned long seed) {
-  int E;
-  double a = 0.25, b = 0.25, c = 0.25, d = 0.25;
-  size_t nnz = 0;
-
-  if (nullptr == opt) {
-    std::cerr << "Usage: -rmat <#nodes>[:<#edges>[:<a>[:<b>:[<c>[:<d>]]]]]" << std::endl;
-    exit(1);
-  }
-  std::string token;
-  std::string option = std::string(opt);
-  N = parseOption(option, -1);
-  K = N;
-  M = N;
-
-  // We build the entire sparse matrix on each rank, but use only the local part
-  // on a given rank, according to keymap
-  A.resize(N, N);
-
-  E = parseOption(option, (int)(0.01 * N * N));
-  a = parseOption(option, a);
-  b = parseOption(option, b);
-  c = parseOption(option, c);
-  d = parseOption(option, d);
-
-  if (ttg::default_execution_context().rank() == 0) {
-    std::cout << "#R-MAT: " << N << " nodes, " << E << " edges, a/b/c/d = " << a << "/" << b << "/" << c << "/" << d
-              << std::endl;
-  }
-
-  boost::minstd_rand gen(seed);
-  boost::rmat_iterator<boost::minstd_rand, boost::directed_graph<>> rmat_it(gen, N, E, a, b, c, d);
-
-  using triplet_t = Eigen::Triplet<blk_t>;
-  std::vector<triplet_t> A_elements;
-  for (int i = 0; i < N; i++) {
-    nnz++;
-    A_elements.emplace_back(i, i, 1.0);
-  }
-  for (int i = 0; i < E; i++) {
-    auto x = *rmat_it++;
-    if (x.first != x.second) {
-      A_elements.emplace_back(x.first, x.second, 1.0);
-      nnz++;
-    }
-  }
-  A.setFromTriplets(A_elements.begin(), A_elements.end());
-
-  B = A;
-  C.resize(N, N);
-
-  if (ttg::default_execution_context().rank() == 0) {
-    std::cout << "#R-MAT: " << E << " nonzero elements, density: " << (double)nnz / (double)N / (double)N << std::endl;
-  }
-}
-
-static void initSpHardCoded(const std::function<int(const Key<2> &)> &keymap, SpMatrix<> &A, SpMatrix<> &B,
-                            SpMatrix<> &C, int &m, int &n, int &k) {
-  m = 2;
-  n = 3;
-  k = 4;
-
-  std::cout << "#HardCoded A, B, C" << std::endl;
-  A.resize(m, k);
-  B.resize(k, n);
-  C.resize(m, n);
-  // We initialize the same matrices on all the ranks, but we will use only the local part
-  // following the keymap
-  using triplet_t = Eigen::Triplet<blk_t>;
-  std::vector<triplet_t> A_elements;
-  A_elements.emplace_back(0, 1, 12.3);
-  A_elements.emplace_back(0, 2, 10.7);
-  A_elements.emplace_back(0, 3, -2.3);
-  A_elements.emplace_back(1, 0, -0.3);
-  A_elements.emplace_back(1, 2, 1.2);
-  A.setFromTriplets(A_elements.begin(), A_elements.end());
-
-  std::vector<triplet_t> B_elements;
-  B_elements.emplace_back(0, 0, 12.3);
-  B_elements.emplace_back(1, 0, 10.7);
-  B_elements.emplace_back(3, 0, -2.3);
-  B_elements.emplace_back(1, 1, -0.3);
-  B_elements.emplace_back(1, 2, 1.2);
-  B_elements.emplace_back(2, 2, 7.2);
-  B_elements.emplace_back(3, 2, 0.2);
-  B.setFromTriplets(B_elements.begin(), B_elements.end());
-}
-
-#else
-static void initBlSpHardCoded(const std::function<int(const Key<2> &)> &keymap, SpMatrix<> &A, SpMatrix<> &B,
-                              SpMatrix<> &C, SpMatrix<> &Aref, SpMatrix<> &Bref, bool buildRefs,
-                              std::vector<int> &mTiles, std::vector<int> &nTiles, std::vector<int> &kTiles,
-                              std::vector<std::vector<long>> &a_rowidx_to_colidx,
-                              std::vector<std::vector<long>> &a_colidx_to_rowidx,
-                              std::vector<std::vector<long>> &b_rowidx_to_colidx,
-                              std::vector<std::vector<long>> &b_colidx_to_rowidx, int &m, int &n, int &k) {
-  m = 2;
-  n = 3;
-  k = 4;
-
-  std::cout << "#HardCoded A, B, C" << std::endl;
-  A.resize(m, k);
-  B.resize(k, n);
-  C.resize(m, n);
-  if (buildRefs) {
-    Aref.resize(m, k);
-    Bref.resize(k, n);
-  }
-
-  for (int mt = 0; mt < m; mt++) mTiles.push_back(128);
-  for (int nt = 0; nt < n; nt++) nTiles.push_back(196);
-  for (int kt = 0; kt < k; kt++) kTiles.push_back(256);
-
-  int rank = ttg::default_execution_context().rank();
-
-  using triplet_t = Eigen::Triplet<blk_t>;
-  std::vector<triplet_t> A_elements;
-  std::vector<triplet_t> Aref_elements;
-#if defined(BTAS_IS_USABLE)
-  if (keymap({0, 1}) == rank) {
-    A_elements.emplace_back(0, 1, blk_t(btas::Range(128, 256), 12.3));
-  }
-  if (keymap({0, 2}) == rank) {
-    A_elements.emplace_back(0, 2, blk_t(btas::Range(128, 256), 10.7));
-  }
-  if (keymap({0, 3}) == rank) {
-    A_elements.emplace_back(0, 3, blk_t(btas::Range(128, 256), -2.3));
-  }
-  if (keymap({1, 0}) == rank) {
-    A_elements.emplace_back(1, 0, blk_t(btas::Range(128, 256), -0.3));
-  }
-  if (keymap({1, 2}) == rank) {
-    A_elements.emplace_back(1, 2, blk_t(btas::Range(128, 256), 1.2));
-  }
-  if (buildRefs && rank == 0) {
-    Aref_elements.emplace_back(0, 1, blk_t(btas::Range(128, 256), 12.3));
-    Aref_elements.emplace_back(0, 2, blk_t(btas::Range(128, 256), 10.7));
-    Aref_elements.emplace_back(0, 3, blk_t(btas::Range(128, 256), -2.3));
-    Aref_elements.emplace_back(1, 0, blk_t(btas::Range(128, 256), -0.3));
-    Aref_elements.emplace_back(1, 2, blk_t(btas::Range(128, 256), 1.2));
-  }
-#else
-  if ((buildRefs && rank == 0) || keymap({0, 1}) == rank) {
-    A_elements.emplace_back(0, 1, 12.3);
-  }
-  if ((buildRefs && rank == 0) || keymap({0, 2}) == rank) {
-    A_elements.emplace_back(0, 2, 10.7);
-  }
-  if ((buildRefs && rank == 0) || keymap({0, 3}) == rank) {
-    A_elements.emplace_back(0, 3, -2.3);
-  }
-  if ((buildRefs && rank == 0) || keymap({1, 0}) == rank) {
-    A_elements.emplace_back(1, 0, -0.3);
-  }
-  if ((buildRefs && rank == 0) || keymap({1, 2}) == rank) {
-    A_elements.emplace_back(1, 2, .2);
-  }
-  if (buildRefs && rank == 0) {
-    Aref_elements.emplace_back(0, 1, 12.3);
-    Aref_elements.emplace_back(0, 2, 10.7);
-    Aref_elements.emplace_back(0, 3, -2.3);
-    Aref_elements.emplace_back(1, 0, -0.3);
-    Aref_elements.emplace_back(1, 2, .2);
-  }
-#endif
-  a_rowidx_to_colidx.resize(2);
-  a_rowidx_to_colidx[0].emplace_back(1);  // A[0][1]
-  a_rowidx_to_colidx[0].emplace_back(2);  // A[0][2]
-  a_rowidx_to_colidx[0].emplace_back(3);  // A[0][3]
-  a_rowidx_to_colidx[1].emplace_back(0);  // A[1][0]
-  a_rowidx_to_colidx[1].emplace_back(2);  // A[1][2]
-
-  a_colidx_to_rowidx.resize(4);
-  a_colidx_to_rowidx[0].emplace_back(1);  // A[1][0]
-  a_colidx_to_rowidx[1].emplace_back(0);  // A[0][1]
-  a_colidx_to_rowidx[2].emplace_back(0);  // A[0][2]
-  a_colidx_to_rowidx[2].emplace_back(1);  // A[1][2]
-  a_colidx_to_rowidx[3].emplace_back(0);  // A[0][3]
-
-  A.setFromTriplets(A_elements.begin(), A_elements.end());
-
-  if (buildRefs && 0 == rank) {
-    Aref.setFromTriplets(Aref_elements.begin(), Aref_elements.end());
-  }
-
-  std::vector<triplet_t> B_elements;
-  std::vector<triplet_t> Bref_elements;
-#if defined(BTAS_IS_USABLE)
-  if (keymap({0, 0}) == rank) {
-    B_elements.emplace_back(0, 0, blk_t(btas::Range(256, 196), 12.3));
-  }
-  if (keymap({1, 0}) == rank) {
-    B_elements.emplace_back(1, 0, blk_t(btas::Range(256, 196), 10.7));
-  }
-  if (keymap({3, 0}) == rank) {
-    B_elements.emplace_back(3, 0, blk_t(btas::Range(256, 196), -2.3));
-  }
-  if (keymap({1, 1}) == rank) {
-    B_elements.emplace_back(1, 1, blk_t(btas::Range(256, 196), -0.3));
-  }
-  if (keymap({1, 2}) == rank) {
-    B_elements.emplace_back(1, 2, blk_t(btas::Range(256, 196), 1.2));
-  }
-  if (keymap({2, 2}) == rank) {
-    B_elements.emplace_back(2, 2, blk_t(btas::Range(256, 196), 7.2));
-  }
-  if (keymap({3, 2}) == rank) {
-    B_elements.emplace_back(3, 2, blk_t(btas::Range(256, 196), 0.2));
-  }
-  if (buildRefs && rank == 0) {
-    Bref_elements.emplace_back(0, 0, blk_t(btas::Range(256, 196), 12.3));
-    Bref_elements.emplace_back(1, 0, blk_t(btas::Range(256, 196), 10.7));
-    Bref_elements.emplace_back(3, 0, blk_t(btas::Range(256, 196), -2.3));
-    Bref_elements.emplace_back(1, 1, blk_t(btas::Range(256, 196), -0.3));
-    Bref_elements.emplace_back(1, 2, blk_t(btas::Range(256, 196), 1.2));
-    Bref_elements.emplace_back(2, 2, blk_t(btas::Range(256, 196), 7.2));
-    Bref_elements.emplace_back(3, 2, blk_t(btas::Range(256, 196), 0.2));
-  }
-#else
-  if (keymap({0, 0}) == rank) {
-    B_elements.emplace_back(0, 0, 12.3);
-  }
-  if (keymap({1, 0}) == rank) {
-    B_elements.emplace_back(1, 0, 10.7);
-  }
-  if (keymap({3, 0}) == rank) {
-    B_elements.emplace_back(3, 0, -2.3);
-  }
-  if (keymap({1, 1}) == rank) {
-    B_elements.emplace_back(1, 1, -0.3);
-  }
-  if (keymap({1, 2}) == rank) {
-    B_elements.emplace_back(1, 2, 1.2);
-  }
-  if (keymap({2, 2}) == rank) {
-    B_elements.emplace_back(2, 2, 7.2);
-  }
-  if (keymap({3, 2}) == rank) {
-    B_elements.emplace_back(3, 2, 0.2);
-  }
-#endif
-  b_rowidx_to_colidx.resize(4);
-  b_rowidx_to_colidx[0].emplace_back(0);  // B[0][0]
-  b_rowidx_to_colidx[1].emplace_back(0);  // B[1][0]
-  b_rowidx_to_colidx[1].emplace_back(1);  // B[1][1]
-  b_rowidx_to_colidx[1].emplace_back(2);  // B[1][2]
-  b_rowidx_to_colidx[2].emplace_back(2);  // B[2][2]
-  b_rowidx_to_colidx[3].emplace_back(0);  // B[3][0]
-  b_rowidx_to_colidx[3].emplace_back(2);  // B[3][2]
-
-  b_colidx_to_rowidx.resize(3);
-  b_colidx_to_rowidx[0].emplace_back(0);  // B[0][0]
-  b_colidx_to_rowidx[0].emplace_back(1);  // B[1][0]
-  b_colidx_to_rowidx[0].emplace_back(3);  // B[3][0]
-  b_colidx_to_rowidx[1].emplace_back(1);  // B[1][1]
-  b_colidx_to_rowidx[2].emplace_back(1);  // B[1][2]
-  b_colidx_to_rowidx[2].emplace_back(2);  // B[2][2]
-  b_colidx_to_rowidx[2].emplace_back(3);  // A[3][2]
-
-  B.setFromTriplets(B_elements.begin(), B_elements.end());
-  if (buildRefs && 0 == rank) {
-    Bref.setFromTriplets(Bref_elements.begin(), Bref_elements.end());
-  }
-}
-
-#if defined(BTAS_IS_USABLE)
-static void initBlSpRandom(const std::function<int(const Key<2> &)> &keymap, size_t M, size_t N, size_t K, int minTs,
-                           int maxTs, double avgDensity, SpMatrix<> &A, SpMatrix<> &B, SpMatrix<> &Aref,
-                           SpMatrix<> &Bref, bool buildRefs, std::vector<int> &mTiles, std::vector<int> &nTiles,
-                           std::vector<int> &kTiles, std::vector<std::vector<long>> &a_rowidx_to_colidx,
-                           std::vector<std::vector<long>> &a_colidx_to_rowidx,
-                           std::vector<std::vector<long>> &b_rowidx_to_colidx,
-                           std::vector<std::vector<long>> &b_colidx_to_rowidx, double &average_tile_size,
-                           double &Adensity, double &Bdensity, unsigned int seed) {
-  int rank = ttg::default_execution_context().rank();
-
-  int ts;
-  std::mt19937 gen(seed);
-  std::mt19937 genv(seed + 1);
-
-  std::uniform_int_distribution<> dist(minTs, maxTs);  // randomly pick any value in the range minTs, maxTs
-  using triplet_t = Eigen::Triplet<blk_t>;
-  std::vector<triplet_t> A_elements;
-  std::vector<triplet_t> B_elements;
-  std::vector<triplet_t> Aref_elements;
-  std::vector<triplet_t> Bref_elements;
-
-  for (int m = 0; m < M; m += ts) {
-    ts = dist(gen);
-    if (ts > M - m) ts = M - m;
-    mTiles.push_back(ts);
-  }
-  for (int n = 0; n < N; n += ts) {
-    ts = dist(gen);
-    if (ts > N - n) ts = N - n;
-    nTiles.push_back(ts);
-  }
-  for (int k = 0; k < K; k += ts) {
-    ts = dist(gen);
-    if (ts > K - k) ts = K - k;
-    kTiles.push_back(ts);
-  }
-
-  A.resize(mTiles.size(), kTiles.size());
-  B.resize(kTiles.size(), nTiles.size());
-  if (buildRefs) {
-    Aref.resize(mTiles.size(), kTiles.size());
-    Bref.resize(kTiles.size(), nTiles.size());
-  }
-
-  std::uniform_int_distribution<> mDist(0, mTiles.size() - 1);
-  std::uniform_int_distribution<> nDist(0, nTiles.size() - 1);
-  std::uniform_int_distribution<> kDist(0, kTiles.size() - 1);
-  std::uniform_real_distribution<> vDist(-1.0, 1.0);
-
-  size_t filling = 0;
-  size_t avg_nb = 0;
-  int avg_nb_nb = 0;
-
-  struct tuple_hash : public std::unary_function<std::tuple<int, int>, std::size_t> {
-    std::size_t operator()(const std::tuple<int, int> &k) const {
-      return static_cast<size_t>(std::get<0>(k)) | (static_cast<size_t>(std::get<1>(k)) << 32);
-    }
-  };
-
-  std::unordered_set<std::tuple<int, int>, tuple_hash> fills;
-
-  fills.clear();
-  while ((double)filling / (double)(M * K) < avgDensity) {
-    int mt = mDist(gen);
-    int kt = kDist(gen);
-
-    if (fills.find({mt, kt}) != fills.end()) continue;
-    fills.insert({mt, kt});
-
-    if (mt >= a_rowidx_to_colidx.size()) a_rowidx_to_colidx.resize(mt + 1);
-    a_rowidx_to_colidx[mt].emplace_back(kt);
-    if (kt >= a_colidx_to_rowidx.size()) a_colidx_to_rowidx.resize(kt + 1);
-    a_colidx_to_rowidx[kt].emplace_back(mt);
-
-    filling += mTiles[mt] * kTiles[kt];
-    avg_nb += mTiles[mt] * kTiles[kt];
-    avg_nb_nb++;
-    double value = vDist(genv);
-    if (0 == rank && buildRefs) Aref_elements.emplace_back(mt, kt, blk_t(btas::Range(mTiles[mt], kTiles[kt]), value));
-    if (rank != keymap({mt, kt})) continue;
-    A_elements.emplace_back(mt, kt, blk_t(btas::Range(mTiles[mt], kTiles[kt]), value));
-  }
-  for (auto &row : a_rowidx_to_colidx) {
-    std::sort(row.begin(), row.end());
-  }
-  for (auto &col : a_colidx_to_rowidx) {
-    std::sort(col.begin(), col.end());
-  }
-  A.setFromTriplets(A_elements.begin(), A_elements.end());
-  Adensity = (double)filling / (double)(M * K);
-  if (0 == rank && buildRefs) Aref.setFromTriplets(Aref_elements.begin(), Aref_elements.end());
-
-  filling = 0;
-  fills.clear();
-  while ((double)filling / (double)(K * N) < avgDensity) {
-    int nt = nDist(gen);
-    int kt = kDist(gen);
-
-    if (fills.find({kt, nt}) != fills.end()) continue;
-    fills.insert({kt, nt});
-
-    if (kt >= b_rowidx_to_colidx.size()) b_rowidx_to_colidx.resize(kt + 1);
-    b_rowidx_to_colidx[kt].emplace_back(nt);
-    if (nt >= b_colidx_to_rowidx.size()) b_colidx_to_rowidx.resize(nt + 1);
-    b_colidx_to_rowidx[nt].emplace_back(kt);
-
-    filling += kTiles[kt] * nTiles[nt];
-    avg_nb += kTiles[kt] * nTiles[nt];
-    avg_nb_nb++;
-    double value = vDist(genv);
-    if (0 == rank && buildRefs) Bref_elements.emplace_back(kt, nt, blk_t(btas::Range(kTiles[kt], nTiles[nt]), value));
-    if (rank != keymap({kt, nt})) continue;
-    B_elements.emplace_back(kt, nt, blk_t(btas::Range(kTiles[kt], nTiles[nt]), value));
-  }
-  for (auto &row : b_rowidx_to_colidx) {
-    std::sort(row.begin(), row.end());
-  }
-  for (auto &col : b_colidx_to_rowidx) {
-    std::sort(col.begin(), col.end());
-  }
-  B.setFromTriplets(B_elements.begin(), B_elements.end());
-  Bdensity = (double)filling / (double)(K * N);
-  if (0 == rank && buildRefs) Bref.setFromTriplets(Bref_elements.begin(), Bref_elements.end());
-  fills.clear();
-
-  average_tile_size = (double)avg_nb / avg_nb_nb;
-}
-#endif
-
-#endif
-
-static void timed_measurement(SpMatrix<> &A, SpMatrix<> &B, const std::function<int(const Key<2> &)> &ij_keymap,
-                              const std::function<int(const Key<3> &)> &ijk_keymap, const std::string &tiling_type,
-                              double gflops, double avg_nb, double Adensity, double Bdensity,
-                              const std::vector<std::vector<long>> &a_rowidx_to_colidx,
-                              const std::vector<std::vector<long>> &a_colidx_to_rowidx,
-                              const std::vector<std::vector<long>> &b_rowidx_to_colidx,
-                              const std::vector<std::vector<long>> &b_colidx_to_rowidx, std::vector<int> &mTiles,
-                              std::vector<int> &nTiles, std::vector<int> &kTiles, int M, int N, int K, int minTs,
-                              int maxTs, int P, int Q, int R) {
-  int MT = (int)A.rows();
-  int NT = (int)B.cols();
-  int KT = (int)A.cols();
-  assert(KT == B.rows());
-
-  SpMatrix<> C;
-  C.resize(MT, NT);
-
-  // flow graph needs to exist on every node
-  Edge<Key<2>> ctl("control");
-  Control control(ctl);
-  Edge<Key<2>, blk_t> eA, eB;
-  Edge<Key<2>, blk_t> eC;
-
-  Read_SpMatrix a("A", A, ctl, eA, ij_keymap);
-  Read_SpMatrix b("B", B, ctl, eB, ij_keymap);
-  Write_SpMatrix<> c(C, eC, ij_keymap);
-  auto &c_status = c.status();
-  assert(!has_value(c_status));
-  //  SpMM25D a_times_b(world, eA, eB, eC, A, B);
-  SpMM25D<> a_times_b(eA, eB, eC, A, B, a_rowidx_to_colidx, a_colidx_to_rowidx, b_rowidx_to_colidx, b_colidx_to_rowidx,
-                      mTiles, nTiles, kTiles, ij_keymap, ijk_keymap, R);
-  TTGUNUSED(a);
-  TTGUNUSED(b);
-  TTGUNUSED(a_times_b);
-
-  auto connected = make_graph_executable(&control);
-  assert(connected);
-  TTGUNUSED(connected);
-
-  struct timeval start {
-    0
-  }, end{0}, diff{0};
-  gettimeofday(&start, nullptr);
-  // ready, go! need only 1 kick, so must be done by 1 thread only
-  if (ttg::default_execution_context().rank() == 0) control.start(P, Q);
-  fence();
-  gettimeofday(&end, nullptr);
-  timersub(&end, &start, &diff);
-  double tc = (double)diff.tv_sec + (double)diff.tv_usec / 1e6;
-#if defined(TTG_USE_MADNESS)
-  std::string rt("MAD");
-#elif defined(TTG_USE_PARSEC)
-  std::string rt("PARSEC");
-#else
-  std::string rt("Unkown???");
-#endif
-  if (ttg::default_execution_context().rank() == 0) {
-    std::cout << "TTG-" << rt << " PxQxR=   " << P << " " << Q << " " << R << " 1 average_NB= " << avg_nb << " M= " << M
-              << " N= " << N << " K= " << K << " t= " << minTs << " T=" << maxTs << " Tiling= " << tiling_type
-              << " A_density= " << Adensity << " B_density= " << Bdensity << " gflops= " << gflops << " seconds= " << tc
-              << " gflops/s= " << gflops / tc << std::endl;
-  }
-}
-
-#if !defined(BLOCK_SPARSE_GEMM)
-static void make_rowidx_to_colidx_from_eigen(const SpMatrix<> &mat, std::vector<std::vector<long>> &r2c) {
-  for (int k = 0; k < mat.outerSize(); ++k) {  // cols, if col-major, rows otherwise
-    for (typename SpMatrix<blk_t>::InnerIterator it(mat, k); it; ++it) {
-      const long row = it.row();
-      const long col = it.col();
-      if (row >= r2c.size()) r2c.resize(row + 1);
-      r2c[row].push_back(col);
-    }
-  }
-  // Sort each vector of column indices, as we pushed them in an arbitrary order
-  for (auto &row : r2c) {
-    std::sort(row.begin(), row.end());
-  }
-}
-
-static void make_colidx_to_rowidx_from_eigen(const SpMatrix<> &mat, std::vector<std::vector<long>> &c2r) {
-  for (int k = 0; k < mat.outerSize(); ++k) {  // cols, if col-major, rows otherwise
-    for (typename SpMatrix<blk_t>::InnerIterator it(mat, k); it; ++it) {
-      const long row = it.row();
-      const long col = it.col();
-
-      if (col >= c2r.size()) c2r.resize(col + 1);
-      c2r[col].push_back(row);
-    }
-    // Sort each vector of row indices, as we pushed them in an arbitrary order
-    for (auto &col : c2r) {
-      std::sort(col.begin(), col.end());
-    }
-  }
-}
-#endif
-
-static double compute_gflops(const std::vector<std::vector<long>> &a_r2c, const std::vector<std::vector<long>> &b_r2c,
-                             const std::vector<int> &mTiles, const std::vector<int> &nTiles,
-                             const std::vector<int> &kTiles) {
-  unsigned long flops = 0;
-  for (auto i = 0; i < a_r2c.size(); i++) {
-    for (auto kk = 0; kk < a_r2c[i].size(); kk++) {
-      auto k = a_r2c[i][kk];
-      if (k > b_r2c.size()) continue;
-      for (auto jj = 0; jj < b_r2c[k].size(); jj++) {
-        auto j = b_r2c[k][jj];
-        flops += static_cast<long>(mTiles[i]) * nTiles[j] * kTiles[k];
-      }
-    }
-  }
-  return 2.0 * (double)flops / 1e9;
-}
-
-int main(int argc, char **argv) {
-  bool timing;
-  double gflops;
-
-  // warm up silicon by calling gemm a few times
-#ifdef BTAS_IS_USABLE
-  for (int i = 0; i < 20; i++) {
-    using baseT = typename btas::Tensor<double>;
-    btas::Tensor<double, btas::Range, std::vector<double>> At(30, 30);
-    btas::Tensor<double, btas::Range, std::vector<double>> Bt(30, 30);
-    btas::Tensor<double, btas::Range, std::vector<double>> Ct(30, 30);
-    At.fill(1.0);
-    Bt.fill(2.0);
-    Ct.fill(3.0);
-    btas::gemm(std::move(Ct), Bt, At);
-  }
-#endif  // BTAS_IS_USABLE
-
-//  static volatile int debug_signal = 0;
-//  std::cout << "Waiting on debug signal (int*)" << &debug_signal << std::endl;
-//  while (!debug_signal) {}
-
-
-  int cores = -1;
-  std::string nbCoreStr(getCmdOption(argv, argv + argc, "-c"));
-  cores = parseOption(nbCoreStr, cores);
-
-  if (int dashdash = cmdOptionIndex(argv, argv + argc, "--") > -1) {
-    initialize(argc - dashdash, argv + dashdash, cores);
-  } else {
-    initialize(1, argv, cores);
-  }
-
-#ifdef BTAS_IS_USABLE
-  // initialize MADNESS so that TA allocators can be created
-  madness::ParsecRuntime::initialize_with_existing_context(ttg::default_execution_context().impl().context());
-  madness::initialize(argc, argv, /* nthread = */ 1, /* quiet = */ true);
-#endif  // BTAS_IS_USABLE
-
-  std::string debugStr(getCmdOption(argv, argv + argc, "-d"));
-  auto debug = (unsigned int)parseOption(debugStr, 0);
-
-  if (debug & (1 << 1)) {
-    using ttg::Debugger;
-    auto debugger = std::make_shared<Debugger>();
-    Debugger::set_default_debugger(debugger);
-    debugger->set_exec(argv[0]);
-    debugger->set_prefix(ttg::default_execution_context().rank());
-    // debugger->set_cmd("lldb_xterm");
-    debugger->set_cmd("gdb_xterm");
-  }
-
-  int mpi_size = ttg::default_execution_context().size();
-  int mpi_rank = ttg::default_execution_context().rank();
-  int best_pqc = mpi_size;
-  int P, Q, R;
-  for (int c = 1; c <= (int)cbrt(mpi_size); c++) {
-    for (int p = 1; p <= (int)sqrt(mpi_size / c); p++) {
-      if ((mpi_size % (p * c)) == 0) {
-        int q = mpi_size / (p * c);
-        if (abs(c - p - q) <= best_pqc) {
-          best_pqc = abs(c - p - q);
-          P = p;
-          Q = q;
-          R = c;
-        }
-      }
-    }
-    // ttg::launch_lldb(ttg::default_execution_context().rank(), argv[0]);
-
-    {
-      if (debug & (1 << 0)) {
-        ttg::trace_on();
-        TTBase::set_trace_all(true);
-      }
-
-      SpMatrix<> A, B, C, Aref, Bref;
-      std::string tiling_type;
-      int M = 0, N = 0, K = 0;
-      int minTs = 0, maxTs = 0;
-
-      double avg_nb = nan("undefined");
-      double Adensity = nan("undefined");
-      double Bdensity = nan("undefined");
-
-      std::string PStr(getCmdOption(argv, argv + argc, "-P"));
-      P = parseOption(PStr, P);
-      std::string QStr(getCmdOption(argv, argv + argc, "-Q"));
-      Q = parseOption(QStr, Q);
-      // to make code behave like 2D summa if R not given
-      std::string RStr(getCmdOption(argv, argv + argc, "-R"));
-      R = parseOption(RStr, 1);
-
-      if (P * Q * R != mpi_size) {
-        if (!cmdOptionExists(argv, argv + argc, "-Q") && (mpi_size % (P * R) == 0))
-          Q = mpi_size / (P * R);
-        else if (!cmdOptionExists(argv, argv + argc, "-P") && (mpi_size % (Q * R)) == 0)
-          P = mpi_size / (Q * R);
-        else if (!cmdOptionExists(argv, argv + argc, "-R") && (mpi_size % (Q * P)) == 0)
-          R = mpi_size / (Q * P);
-        else {
-          if (0 == mpi_rank) {
-            std::cerr << P << "x" << Q << "x" << R << " is not a valid process grid -- bailing out" << std::endl;
-            MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
-          }
-        }
-      }
-
-      auto ij_keymap = [P, Q](const Key<2> &ij) {
-        int i = (int)ij[0];
-        int j = (int)ij[1];
-        int r = ij2rank(i, j, P, Q);
-        return r;
-      };
-
-      auto ijk_keymap = [P, Q, R](const Key<3> &ijk) {
-        int i = (int)ijk[0];
-        int j = (int)ijk[1];
-        int k = (int)ijk[2];
-        int r = ijk2rank(i, j, k, P, Q, R);
-        return r;
-      };
-
-      std::string seedStr(getCmdOption(argv, argv + argc, "-s"));
-      unsigned int seed = parseOption(seedStr, 0);
-      if (seed == 0) {
-        std::random_device rd;
-        seed = rd();
-        if (0 == ttg::default_execution_context().rank()) std::cerr << "#Random seeded with " << seed << std::endl;
-      }
-      ttg_broadcast(ttg::default_execution_context(), seed, 0);
-
-      std::vector<int> mTiles;
-      std::vector<int> nTiles;
-      std::vector<int> kTiles;
-      std::vector<std::vector<long>> a_rowidx_to_colidx;
-      std::vector<std::vector<long>> a_colidx_to_rowidx;
-      std::vector<std::vector<long>> b_rowidx_to_colidx;
-      std::vector<std::vector<long>> b_colidx_to_rowidx;
-
-      std::string checkStr(getCmdOption(argv, argv + argc, "-x"));
-      int check = parseOption(checkStr, !(argc >= 2));
-      timing = (check == 0);
-
-#if !defined(BLOCK_SPARSE_GEMM)
-      if (cmdOptionExists(argv, argv + argc, "-mm")) {
-        char *filename = getCmdOption(argv, argv + argc, "-mm");
-        tiling_type = filename;
-        initSpMatrixMarket(ij_keymap, filename, A, B, C, M, N, K);
-      } else if (cmdOptionExists(argv, argv + argc, "-rmat")) {
-        char *opt = getCmdOption(argv, argv + argc, "-rmat");
-        tiling_type = "RandomSparseMatrix";
-        initSpRmat(ij_keymap, opt, A, B, C, M, N, K, seed);
-      } else {
-        tiling_type = "HardCodedSparseMatrix";
-        initSpHardCoded(ij_keymap, A, B, C, M, N, K);
-      }
-
-      if (check) {
-        // We don't generate the sparse matrices in distributed, so Aref and Bref can
-        // just point to the same matrix, or be a local copy.
-        Aref = A;
-        Bref = B;
-      }
-
-      // We still need to build the metadata from the  matrices.
-      make_rowidx_to_colidx_from_eigen(A, a_rowidx_to_colidx);
-      make_colidx_to_rowidx_from_eigen(A, a_colidx_to_rowidx);
-      make_rowidx_to_colidx_from_eigen(B, b_rowidx_to_colidx);
-      make_colidx_to_rowidx_from_eigen(B, b_colidx_to_rowidx);
-      // This is only needed to compute the flops
-      for (int mt = 0; mt < M; mt++) mTiles.emplace_back(1);
-      for (int nt = 0; nt < N; nt++) nTiles.emplace_back(1);
-      for (int kt = 0; kt < K; kt++) kTiles.emplace_back(1);
-#else
-      if (argc >= 2) {
-        std::string Mstr(getCmdOption(argv, argv + argc, "-M"));
-        M = parseOption(Mstr, 1200);
-        std::string Nstr(getCmdOption(argv, argv + argc, "-N"));
-        N = parseOption(Nstr, M);
-        std::string Kstr(getCmdOption(argv, argv + argc, "-K"));
-        K = parseOption(Kstr, N);
-        std::string minTsStr(getCmdOption(argv, argv + argc, "-t"));
-        minTs = parseOption(minTsStr, 64);
-        std::string maxTsStr(getCmdOption(argv, argv + argc, "-T"));
-        maxTs = parseOption(maxTsStr, minTs);
-        std::string avgStr(getCmdOption(argv, argv + argc, "-a"));
-        double avg = parseOption(avgStr, 0.3);
-        timing = (check == 0);
-        tiling_type = "RandomIrregularTiling";
-        initBlSpRandom(ij_keymap, M, N, K, minTs, maxTs, avg, A, B, Aref, Bref, check, mTiles, nTiles, kTiles,
-                       a_rowidx_to_colidx, a_colidx_to_rowidx, b_rowidx_to_colidx, b_colidx_to_rowidx, avg_nb, Adensity,
-                       Bdensity, seed);
-
-        C.resize(mTiles.size(), nTiles.size());
-      } else {
-        tiling_type = "HardCodedBlockSparseMatrix";
-        initBlSpHardCoded(ij_keymap, A, B, C, Aref, Bref, true, mTiles, nTiles, kTiles, a_rowidx_to_colidx,
-                          a_colidx_to_rowidx, b_rowidx_to_colidx, b_colidx_to_rowidx, M, N, K);
-      }
-#endif  // !defined(BLOCK_SPARSE_GEMM)
-
-      gflops = compute_gflops(a_rowidx_to_colidx, b_rowidx_to_colidx, mTiles, nTiles, kTiles);
-
-      std::string nbrunStr(getCmdOption(argv, argv + argc, "-n"));
-      int nb_runs = parseOption(nbrunStr, 1);
-
-      if (timing) {
-        // Start up engine
-        execute();
-        for (int nrun = 0; nrun < nb_runs; nrun++) {
-          parsec_devices_release_memory();
-          timed_measurement(A, B, ij_keymap, ijk_keymap, tiling_type, gflops, avg_nb, Adensity, Bdensity,
-                            a_rowidx_to_colidx, a_colidx_to_rowidx, b_rowidx_to_colidx, b_colidx_to_rowidx, mTiles,
-                            nTiles, kTiles, M, N, K, minTs, maxTs, P, Q, R);
-          parsec_devices_reset_load(default_execution_context().impl().context());
-        }
-      } else {
-        // flow graph needs to exist on every node
-        // N.B. to validate C we need it on node 0!
-        auto keymap_write = [](const Key<2> &key) { return 0; };
-        Edge<Key<2>> ctl("control");
-        Control control(ctl);
-        Edge<Key<2>, blk_t> eA, eB, eC;
-        Read_SpMatrix a("A", A, ctl, eA, ij_keymap);
-        Read_SpMatrix b("B", B, ctl, eB, ij_keymap);
-        Write_SpMatrix<> c(C, eC, keymap_write, true);
-        auto &c_status = c.status();
-        assert(!has_value(c_status));
-        //  SpMM25D a_times_b(world, eA, eB, eC, A, B);
-        SpMM25D<> a_times_b(eA, eB, eC, A, B, a_rowidx_to_colidx, a_colidx_to_rowidx, b_rowidx_to_colidx,
-                            b_colidx_to_rowidx, mTiles, nTiles, kTiles, ij_keymap, ijk_keymap, R);
-        TTGUNUSED(a_times_b);
-        // calling the Dot constructor with 'true' argument disables the type
-        if (default_execution_context().rank() == 0) std::cout << Dot{/*disable_type=*/true}(&control) << std::endl;
-
-        // ready to run!
-        auto connected = make_graph_executable(&control);
-        assert(connected);
-        TTGUNUSED(connected);
-
-        // ready, go! need only 1 kick, so must be done by 1 thread only
-        if (ttg::default_execution_context().rank() == 0) control.start(P, Q);
-
-        execute();
-        fence();
-
-        // validate C=A*B against the reference output
-        assert(has_value(c_status));
-        if (ttg::default_execution_context().rank() == 0) {
-          SpMatrix<> Cref = Aref * Bref;
-
-          double norm_2_square, norm_inf;
-          std::tie(norm_2_square, norm_inf) = norms<blk_t>(Cref - C);
-          std::cout << "||Cref - C||_2      = " << std::sqrt(norm_2_square) << std::endl;
-          std::cout << "||Cref - C||_\\infty = " << norm_inf << std::endl;
-          if (norm_inf > 1e-9) {
-            std::cout << "Cref:\n" << Cref << std::endl;
-            std::cout << "C:\n" << C << std::endl;
-            ttg_abort();
-          }
-        }
-
-        // validate Acopy=A against the reference output
-        //      assert(has_value(copy_status));
-        //      if (ttg::default_execution_context().rank() == 0) {
-        //        double norm_2_square, norm_inf;
-        //        std::tie(norm_2_square, norm_inf) = norms<blk_t>(Acopy - A);
-        //        std::cout << "||Acopy - A||_2      = " << std::sqrt(norm_2_square) << std::endl;
-        //        std::cout << "||Acopy - A||_\\infty = " << norm_inf << std::endl;
-        //        if (::ttg::tracing()) {
-        //          std::cout << "Acopy (" << static_cast<void *>(&Acopy) << "):\n" << Acopy << std::endl;
-        //          std::cout << "A (" << static_cast<void *>(&A) << "):\n" << A << std::endl;
-        //        }
-        //        if (norm_inf != 0) {
-        //          ttg_abort();
-        //        }
-        //      }
-      }
-    }
-
-#ifdef BTAS_IS_USABLE
-    madness::finalize();
-#endif  // BTAS_IS_USABLE
-    ttg_finalize();
-    return 0;
-  }
-}
diff --git a/examples/ttg_matrix.h b/examples/ttg_matrix.h
index 3478e2aa4..e0d6cb3f6 100644
--- a/examples/ttg_matrix.h
+++ b/examples/ttg_matrix.h
@@ -8,11 +8,53 @@
 #include <vector>
 
 #include "ttg/serialization/std/vector.h"
+#include "ttg/util/multiindex.h"
+
+#include <Eigen/SparseCore>
 
 namespace ttg {
 
   namespace matrix {
 
+    enum class ReadOnceTriplet { yes, no };
+
+    /// element of a sparse matrix = {row index, col index, value}
+
+    /// move-capable replacement for Eigen::Triplet
+    template<typename Value, ReadOnceTriplet ReadOnce = ReadOnceTriplet::yes, typename StorageIndex=typename Eigen::SparseMatrix<Value>::StorageIndex >
+    class Triplet {
+     public:
+      Triplet() = default;
+      Triplet(const Triplet&) = default;
+      Triplet(Triplet&&) = default;
+      Triplet& operator=(const Triplet&) = default;
+      Triplet& operator=(Triplet&&) = default;
+
+      Triplet(StorageIndex r, const StorageIndex c, const Value& v)
+          : m_row(r), m_col(c), m_value(v)
+      {}
+      Triplet(StorageIndex r, const StorageIndex c, Value&& v = Value{})
+          : m_row(r), m_col(c), m_value(std::move(v))
+      {}
+
+      /** \returns the row index of the element */
+      const StorageIndex& row() const { return m_row; }
+
+      /** \returns the column index of the element */
+      const StorageIndex& col() const { return m_col; }
+
+      /** \returns the value of the element */
+      std::conditional_t<ReadOnce == ReadOnceTriplet::yes, Value&&, const Value&> value() const {
+        if constexpr (ReadOnce == ReadOnceTriplet::yes)
+          return std::move(m_value);
+        else
+          return m_value;
+      }
+     protected:
+      StorageIndex m_row = -1, m_col = -1;
+      mutable Value m_value;
+    };
+
     // matrix shape = maps {column,row} index to {row,column} indices
     class Shape : public std::vector<std::vector<long>> {
       using base_t = std::vector<std::vector<long>>;
@@ -192,22 +234,22 @@ namespace ttg {
     }
 
     // compute shape of an existing SpMatrix on rank 0
-    template <typename Blk = blk_t>
+    template <typename Blk>
     class ReadShape : public TT<void, std::tuple<Out<void, Shape>>, ReadShape<Blk>, ttg::typelist<void>> {
      public:
       using baseT = typename ReadShape::ttT;
       static constexpr const int owner = 0;  // where data resides
 
-      ReadShape(const char *label, const SpMatrix<Blk> &matrix, Edge<void, void> &in, Edge<void, Shape> &out)
+      ReadShape(const char *label, const Eigen::SparseMatrix<Blk> &matrix, Edge<void, void> &in, Edge<void, Shape> &out)
           : baseT(edges(in), edges(out), std::string("read_spmatrix_shape(") + label + ")", {"ctl"},
                   {std::string("shape[") + label + "]"},
                   /* keymap */ []() { return owner; })
           , matrix_(matrix) {}
 
-      void op(std::tuple<Out<void, Shape>> &out) { ::sendv<0>(Shape(matrix_), out); }
+      void op(std::tuple<Out<void, Shape>> &out) { ttg::sendv<0>(Shape(matrix_), out); }
 
      private:
-      const SpMatrix<Blk> &matrix_;
+      const Eigen::SparseMatrix<Blk> &matrix_;
     };
 
     // flow data from an existing SpMatrix on rank 0
@@ -217,38 +259,38 @@ namespace ttg {
     //   but will only be efficient if can do random access (slow with CSC format used by Eigen matrices)
     // - this could be generalized to read efficiently from a distributed data structure
     // Use Read_SpMatrix if need to read all data from a data structure localized on 1 process
-    template <typename Blk = blk_t>
-    class Read : public TT<Key<2>, std::tuple<Out<Key<2>, Blk>>, Read<Blk>, ttg::typelist<void>> {
+    template <typename Blk>
+    class Read : public TT<MultiIndex<2>, std::tuple<Out<MultiIndex<2>, Blk>>, Read<Blk>, ttg::typelist<void>> {
      public:
-      using baseT = TT<Key<2>, std::tuple<Out<Key<2>, Blk>>, Read<Blk>, void>;
+      using baseT = TT<MultiIndex<2>, std::tuple<Out<MultiIndex<2>, Blk>>, Read<Blk>, void>;
       static constexpr const int owner = 0;  // where data resides
 
-      Read(const char *label, const SpMatrix<Blk> &matrix, Edge<Key<2>, void> &in, Edge<Key<2>, Blk> &out)
+      Read(const char *label, const Eigen::SparseMatrix<Blk> &matrix, Edge<MultiIndex<2>, void> &in, Edge<MultiIndex<2>, Blk> &out)
           : baseT(edges(in), edges(out), std::string("read_spmatrix(") + label + ")", {"ctl[ij]"},
                   {std::string(label) + "[ij]"},
                   /* keymap */ [](auto key) { return owner; })
           , matrix_(matrix) {}
 
-      void op(const Key<2> &key, std::tuple<Out<Key<2>, Blk>> &out) {
+      void op(const MultiIndex<2> &key, std::tuple<Out<MultiIndex<2>, Blk>> &out) {
         // random access in CSC format is inefficient, this is only to demonstrate the way to go for hash-based storage
         // for whatever reason coeffRef does not work on a const SpMatrix&
-        ::send<0>(key, static_cast<const Blk &>(const_cast<SpMatrix<Blk> &>(matrix_).coeffRef(key[0], key[1])), out);
+        ttg::send<0>(key, static_cast<const Blk &>(const_cast<Eigen::SparseMatrix<Blk> &>(matrix_).coeffRef(key[0], key[1])), out);
       }
 
      private:
-      const SpMatrix<Blk> &matrix_;
+      const Eigen::SparseMatrix<Blk> &matrix_;
     };
 
     // WriteShape commits shape to an existing SpMatrix on rank 0 and sends it on
     // since SpMatrix supports random inserts there is no need to commit the shape into the matrix, other than get the
     // dimensions
-    template <typename Blk = blk_t>
+    template <typename Blk>
     class WriteShape : public TT<void, std::tuple<Out<void, Shape>>, WriteShape<Blk>, ttg::typelist<Shape>> {
      public:
       using baseT = typename WriteShape::ttT;
       static constexpr const int owner = 0;  // where data resides
 
-      WriteShape(const char *label, SpMatrix<Blk> &matrix, Edge<void, Shape> &in, Edge<void, Shape> &out)
+      WriteShape(const char *label, Eigen::SparseMatrix<Blk> &matrix, Edge<void, Shape> &in, Edge<void, Shape> &out)
           : baseT(edges(in), edges(out), std::string("write_spmatrix_shape(") + label + ")",
                   {std::string("shape_in[") + label + "]"}, {std::string("shape_out[") + label + "]"},
                   /* keymap */ []() { return owner; })
@@ -256,28 +298,28 @@ namespace ttg {
 
       void op(typename baseT::input_values_tuple_type &&ins, std::tuple<Out<void, Shape>> &out) {
         const auto &shape = baseT::template get<0>(ins);
-        ::ttg::trace("Resizing ", static_cast<void *>(&matrix_));
+        ttg::trace("Resizing ", static_cast<void *>(&matrix_));
         matrix_.resize(shape.nrows(), shape.ncols());
-        ::sendv<0>(shape, out);
+        ttg::sendv<0>(shape, out);
       }
 
      private:
-      SpMatrix<Blk> &matrix_;
+      Eigen::SparseMatrix<Blk> &matrix_;
     };
 
     // flow (move?) data into an existing SpMatrix on rank 0
-    template <typename Blk = blk_t>
-    class Write : public TT<Key<2>, std::tuple<>, Write<Blk>, Blk, ttg::typelist<void>> {
+    template <typename Blk>
+    class Write : public TT<MultiIndex<2>, std::tuple<>, Write<Blk>, ttg::typelist<void>> {
      public:
       using baseT = typename Write::ttT;
 
-      Write(const char *label, SpMatrix<Blk> &matrix, Edge<Key<2>, Blk> &data_in, Edge<Key<2>, void> &ctl_in)
+      Write(const char *label, Eigen::SparseMatrix<Blk> &matrix, Edge<MultiIndex<2>, Blk> &data_in, Edge<MultiIndex<2>, void> &ctl_in)
           : baseT(edges(data_in, ctl_in), edges(), std::string("write_spmatrix(") + label + ")",
                   {std::string(label) + "[ij]", std::string("ctl[ij]")}, {},
                   /* keymap */ [](auto key) { return 0; })
           , matrix_(matrix) {}
 
-      void op(const Key<2> &key, typename baseT::input_values_tuple_type &&elem, std::tuple<> &) {
+      void op(const MultiIndex<2> &key, typename baseT::input_values_tuple_type &&elem, std::tuple<> &) {
         std::lock_guard<std::mutex> lock(mtx_);
         ttg::trace("rank =", default_execution_context().rank(),
                    "/ thread_id =", reinterpret_cast<std::uintptr_t>(pthread_self()),
@@ -309,8 +351,8 @@ namespace ttg {
 
      private:
       std::mutex mtx_;
-      SpMatrix<Blk> &matrix_;
-      std::vector<SpMatrixTriplet<Blk>> values_;
+      Eigen::SparseMatrix<Blk> &matrix_;
+      std::vector<Triplet<Blk>> values_;
       mutable std::shared_ptr<std::shared_future<void>> completion_status_;
     };
 
@@ -325,28 +367,28 @@ namespace ttg {
                   /* keymap */ []() { return owner; }) {}
 
       void op(typename baseT::input_values_tuple_type &&ins, std::tuple<Out<void, Shape>> &out) {
-        ::sendv<0>(Shape::add(baseT::template get<0>(ins), baseT::template get<1>(ins)), out);
+        ttg::sendv<0>(Shape::add(baseT::template get<0>(ins), baseT::template get<1>(ins)), out);
       }
     };
 
     // pushes all blocks given by the shape
-    class Push : public TT<void, std::tuple<Out<Key<2>, void>>, Push, ttg::typelist<Shape>> {
+    class Push : public TT<void, std::tuple<Out<MultiIndex<2>, void>>, Push, ttg::typelist<Shape>> {
      public:
       using baseT = typename Push::ttT;
       static constexpr const int owner = 0;  // where data resides
 
-      Push(const char *label, Edge<void, Shape> &in, Edge<Key<2>, void> &out)
+      Push(const char *label, Edge<void, Shape> &in, Edge<MultiIndex<2>, void> &out)
           : baseT(edges(in), edges(out), std::string("push_spmatrix(") + label + ")",
                   {std::string("shape[") + label + "]"}, {"ctl[ij]"},
                   /* keymap */ []() { return owner; }) {}
 
-      void op(typename baseT::input_values_tuple_type &&ins, std::tuple<Out<Key<2>, void>> &out) {
+      void op(typename baseT::input_values_tuple_type &&ins, std::tuple<Out<MultiIndex<2>, void>> &out) {
         const auto &shape = baseT::get<0>(ins);
         if (shape.type() == Shape::Type::col2row) {
           long colidx = 0;
           for (const auto &col : shape) {
             for (const auto rowidx : col) {
-              ::sendk<0>(Key<2>({rowidx, colidx}), out);
+              ttg::sendk<0>(MultiIndex<2>({rowidx, colidx}), out);
             }
             ++colidx;
           }
@@ -354,7 +396,7 @@ namespace ttg {
           long rowidx = 0;
           for (const auto &row : shape) {
             for (const auto colidx : row) {
-              ::sendk<0>(Key<2>({rowidx, colidx}), out);
+              ttg::sendk<0>(MultiIndex<2>({rowidx, colidx}), out);
             }
             ++rowidx;
           }
@@ -372,9 +414,9 @@ namespace ttg {
   class Matrix {
    public:
     using shape_t = matrix::Shape;
-    using data_edge_t = Edge<Key<2>, T>;
+    using data_edge_t = Edge<MultiIndex<2>, T>;
     using shape_edge_t = Edge<void, shape_t>;
-    using ctl_edge_t = Edge<Key<2>, void>;
+    using ctl_edge_t = Edge<MultiIndex<2>, void>;
 
     Matrix() = default;
 
@@ -403,7 +445,7 @@ namespace ttg {
     /// @return an std::future<void> object indicating the status; @c destination_matrix is ready if calling has_value()
     /// on the return value of this function is true.
     /// @note up to the user to ensure completion before reading destination_matrix
-    auto operator>>(SpMatrix<T> &destination_matrix) {
+    auto operator>>(Eigen::SparseMatrix<T> &destination_matrix) {
       // shape writer writes shape to destination_matrix
       ttg_register_ptr(world_, std::make_shared<matrix::WriteShape<T>>("Matrix.WriteShape", destination_matrix,
                                                                        shape_edge_, shape_writer_push_edge_));
diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index e1fb7d685..a33bc3c94 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -21,6 +21,8 @@ if (CXXStdCoroutine_FOUND)
     list(APPEND ut_libs std::coroutine)
 endif(CXXStdCoroutine_FOUND)
 
+list(APPEND ut_src constraints.cc)
+
 add_ttg_executable(core-unittests-ttg "${ut_src}" LINK_LIBRARIES "${ut_libs}" COMPILE_DEFINITIONS "CATCH_CONFIG_NO_POSIX_SIGNALS=1" )
 
 # serialization test: probes serialization via all supported serialization methods (MADNESS, Boost::serialization) that are available
diff --git a/tests/unit/constraints.cc b/tests/unit/constraints.cc
new file mode 100644
index 000000000..70003b945
--- /dev/null
+++ b/tests/unit/constraints.cc
@@ -0,0 +1,113 @@
+#include <catch2/catch_all.hpp>
+#include <ctime>
+
+#include "ttg.h"
+
+#include "ttg/serialization/std/pair.h"
+#include "ttg/util/hash/std/pair.h"
+#include "ttg/util/multiindex.h"
+
+using Key = ttg::MultiIndex<2>;
+
+TEST_CASE("constraints", "") {
+
+  SECTION("manual") {
+    ttg::Edge<Key, int> e;
+    auto world = ttg::default_execution_context();
+    std::atomic<int> check_ord = 1;
+    std::atomic<int> cnt = 10;
+    auto constraint = ttg::make_shared_constraint<ttg::SequencedKeysConstraint>([](const Key& k){ return k[1]; });
+    auto tt = ttg::make_tt([&](const Key& key, const int& value){
+      std::cout << "key " << key[0] << ", " << key[1] << " check_ord " << check_ord << " cnt " << cnt << std::endl;
+      CHECK((key[1] == check_ord));
+      if (--cnt == 0) {
+        cnt = 10;
+        check_ord++;
+        std::cout << "key " << key[0] << " releasing next ord " << check_ord << std::endl;
+        constraint->release(check_ord);
+      }
+    }, ttg::edges(e), ttg::edges());
+    // every process executes all tasks
+    tt->set_keymap([&](const Key&){ return world.rank(); });
+    tt->add_constraint(constraint);
+    constraint->stop();
+    constraint->release(1);
+
+    auto bcast = ttg::make_tt([&](){
+      std::vector<Key> keys;
+      // loop iteration order intentionally reversed
+      for (int i = 10; i > 0; --i) {
+        for (int j = 10; j > 0; --j) {
+          keys.push_back(Key{i, j});
+        }
+      }
+      ttg::broadcast<0>(std::move(keys), 0);
+
+      // explicit start here to ensure absolute order
+      constraint->start();
+    }, ttg::edges(), ttg::edges(e));
+    bcast->set_keymap([&](){ return world.rank(); });
+
+    /**
+     * Constraints are currently only implemented in the PaRSEC backend.
+     * Codes using constraints will still compile but they will not
+     * affect the execution order in other backends.
+     */
+#ifdef TTG_USE_PARSEC
+    make_graph_executable(bcast);
+    ttg::execute(ttg::default_execution_context());
+    bcast->invoke();
+
+    ttg::ttg_fence(ttg::default_execution_context());
+#endif // TTG_USE_PARSEC
+
+  }
+
+
+  SECTION("automatic") {
+    ttg::Edge<Key, int> e;
+    auto world = ttg::default_execution_context();
+    std::atomic<int> last_ord = 0;
+    auto constraint = ttg::make_shared_constraint<ttg::SequencedKeysConstraint>(
+                                                    [](const Key& k){ return k[1]; }, true);
+    auto tt = ttg::make_tt([&](const Key& key, const int& value){
+      int check_ord = last_ord;
+      std::cout << "key " << key[0] << ", " << key[1] << " check_ord " << check_ord << std::endl;
+      CHECK(((key[1] == check_ord) || (key[1] == check_ord+1)));
+      last_ord = key[1];
+    }, ttg::edges(e), ttg::edges());
+    // every process executes all tasks
+    tt->set_keymap([&](const Key&){ return world.rank(); });
+    tt->add_constraint(constraint);
+    constraint->stop();
+
+    auto bcast = ttg::make_tt([&](){
+      std::vector<Key> keys;
+      // loop iteration order intentionally reversed
+      for (int i = 10; i > 0; --i) {
+        for (int j = 10; j > 0; --j) {
+          keys.push_back(Key{i, j});
+        }
+      }
+      ttg::broadcast<0>(std::move(keys), 0);
+
+      // explicit start here to ensure absolute order
+      constraint->start();
+    }, ttg::edges(), ttg::edges(e));
+    bcast->set_keymap([&](){ return world.rank(); });
+
+    /**
+     * Constraints are currently only implemented in the PaRSEC backend.
+     * Codes using constraints will still compile but they will not
+     * affect the execution order in other backends.
+     */
+#ifdef TTG_USE_PARSEC
+    make_graph_executable(bcast);
+    ttg::execute(ttg::default_execution_context());
+    bcast->invoke();
+
+    ttg::ttg_fence(ttg::default_execution_context());
+#endif // TTG_USE_PARSEC
+
+  }
+}  // TEST_CASE("streams")
diff --git a/ttg/CMakeLists.txt b/ttg/CMakeLists.txt
index 272f005ab..329697ed2 100644
--- a/ttg/CMakeLists.txt
+++ b/ttg/CMakeLists.txt
@@ -46,6 +46,7 @@ configure_file(
 set(ttg-impl-headers
         ${CMAKE_CURRENT_SOURCE_DIR}/ttg/broadcast.h
         ${CMAKE_CURRENT_SOURCE_DIR}/ttg/buffer.h
+        ${CMAKE_CURRENT_SOURCE_DIR}/ttg/constraint.h
         ${CMAKE_CURRENT_SOURCE_DIR}/ttg/devicescope.h
         ${CMAKE_CURRENT_SOURCE_DIR}/ttg/devicescratch.h
         ${CMAKE_CURRENT_SOURCE_DIR}/ttg/edge.h
diff --git a/ttg/ttg.h b/ttg/ttg.h
index 50a891c1d..8dd86a636 100644
--- a/ttg/ttg.h
+++ b/ttg/ttg.h
@@ -26,6 +26,7 @@
 #include "ttg/util/print.h"
 #include "ttg/world.h"
 
+#include "ttg/constraint.h"
 #include "ttg/edge.h"
 
 #include "ttg/ptr.h"
diff --git a/ttg/ttg/constraint.h b/ttg/ttg/constraint.h
new file mode 100644
index 000000000..ab3671601
--- /dev/null
+++ b/ttg/ttg/constraint.h
@@ -0,0 +1,415 @@
+#ifndef TTG_CONSTRAINT_H
+#define TTG_CONSTRAINT_H
+
+#include <functional>
+#include <atomic>
+#include <cstdint>
+#include <mutex>
+#include <map>
+
+#include "ttg/util/span.h"
+
+#ifdef TTG_USE_BUNDLED_BOOST_CALLABLE_TRAITS
+#include <ttg/external/boost/callable_traits.hpp>
+#else
+#include <boost/callable_traits.hpp>
+#endif
+
+namespace ttg {
+
+  template<typename Key>
+  struct ConstraintBase {
+    using key_type = Key;
+    using listener_t = std::function<void(const ttg::span<key_type>&)>;
+
+    ConstraintBase()
+    { }
+
+    ConstraintBase(ConstraintBase&& cb)
+    : m_listeners(std::move(cb.m_listeners))
+    { }
+
+    ConstraintBase(const ConstraintBase& cb)
+    : m_listeners(cb.m_listeners)
+    {}
+
+    ConstraintBase& operator=(ConstraintBase&& cb) {
+      m_listeners = std::move(cb.m_listeners);
+    }
+    ConstraintBase& operator=(const ConstraintBase& cb) {
+      m_listeners = cb.m_listeners;
+    }
+
+    virtual ~ConstraintBase() = default;
+
+    void add_listener(listener_t l, ttg::TTBase *tt) {
+      auto g = this->lock_guard();
+      m_listeners.insert_or_assign(tt, std::move(l));
+    }
+
+    void notify_listener(const ttg::span<key_type>& keys, ttg::TTBase* tt) {
+      auto& release = m_listeners[tt];
+      release(keys);
+    }
+
+  protected:
+
+    auto lock_guard() {
+      return std::lock_guard{m_mtx};
+    }
+
+  private:
+    std::map<ttg::TTBase*, listener_t> m_listeners;
+    std::mutex m_mtx;
+  };
+
+  template<typename Key,
+           typename Ordinal = std::size_t,
+           typename Compare = std::less<Ordinal>,
+           typename Mapper = ttg::Void>
+  struct SequencedKeysConstraint : public ConstraintBase<Key> {
+
+    using key_type = std::conditional_t<ttg::meta::is_void_v<Key>, ttg::Void, Key>;
+    using ordinal_type = Ordinal;
+    using keymap_t = std::function<Ordinal(const key_type&)>;
+    using compare_t = Compare;
+    using base_t = ConstraintBase<Key>;
+
+    static_assert((!Compare{}(Ordinal{}, Ordinal{})), "Comparator must provide strict ordering.");
+
+  protected:
+    struct sequence_elem_t {
+      std::map<ttg::TTBase*, std::vector<key_type>> m_keys;
+
+      sequence_elem_t() = default;
+      sequence_elem_t(sequence_elem_t&&) = default;
+      sequence_elem_t(const sequence_elem_t&) = default;
+      sequence_elem_t& operator=(sequence_elem_t&&) = default;
+      sequence_elem_t& operator=(const sequence_elem_t&) = default;
+
+      void add_key(const key_type& key, ttg::TTBase* tt) {
+        auto it = m_keys.find(tt);
+        if (it == m_keys.end()) {
+          m_keys.insert(std::make_pair(tt, std::vector<key_type>{key}));
+        } else {
+          it->second.push_back(key);
+        }
+      }
+    };
+
+    bool comp_equal(const Ordinal& a, const Ordinal& b) const {
+      return (!m_order(a, b) && !m_order(b, a));
+    }
+
+    bool eligible(const Ordinal& ord) const {
+      return m_order(ord, m_current) || comp_equal(ord, m_current);
+    }
+
+    bool check_key_impl(const key_type& key, Ordinal ord, ttg::TTBase *tt) {
+      if (!m_stopped) {
+        if (eligible(ord)) {
+          // key should be executed
+          if (m_auto_release) { // only needed for auto-release
+            m_active.fetch_add(1, std::memory_order_relaxed);
+            // revert the current ordinal to the lower ordinal
+            m_current = ord;
+          }
+          return true;
+        } else if (m_sequence.empty() && m_auto_release && 0 == m_active.load(std::memory_order_relaxed)) {
+          // there are no keys (active or blocked) so we execute to avoid a deadlock
+          // we don't change the current ordinal because there may be lower ordinals coming in later
+          // NOTE: there is a race condition between the check here and the increment above.
+          //       This is mostly benign as it can lead to out-of-sequence released tasks.
+          //       Avoiding this race would incur significant overheads.
+          m_active.fetch_add(1, std::memory_order_relaxed);
+          return true;
+        }
+      }
+      // key should be deferred
+      auto g = this->lock_guard();
+      if (!m_stopped && eligible(ord)) {
+        // someone released this ordinal while we took the lock
+        return true;
+      }
+      auto it = m_sequence.find(ord);
+      if (it == m_sequence.end()) {
+        auto [iter, success] = m_sequence.insert(std::make_pair(ord, sequence_elem_t{}));
+        assert(success);
+        it = iter;
+      }
+      it->second.add_key(key, tt);
+      return false;
+    }
+
+
+    void complete_key_impl() {
+      if (m_auto_release) {
+        auto active = m_active.fetch_sub(1, std::memory_order_relaxed) - 1;
+        if (0 == active) {
+          release_next();
+        }
+      }
+    }
+
+    // used in the auto case
+    void release_next() {
+      if (this->m_stopped) {
+        // don't release tasks if we're stopped
+        return;
+      }
+      // trigger the next sequence
+      sequence_elem_t elem;
+      {
+        // extract the next sequence
+        auto g = this->lock_guard();
+        auto it = this->m_sequence.begin(); // sequence is ordered by ordinal
+        if (it == this->m_sequence.end()) {
+          return; // nothing to be done
+        }
+        this->m_current = it->first;
+        elem = std::move(it->second);
+        this->m_sequence.erase(it);
+      }
+
+      for (auto& seq : elem.m_keys) {
+        // account for the newly active keys
+        this->m_active.fetch_add(seq.second.size(), std::memory_order_relaxed);
+        this->notify_listener(ttg::span<key_type>(seq.second.data(), seq.second.size()), seq.first);
+      }
+    }
+
+
+    // used in the non-auto case
+    void release_next(ordinal_type ord, bool force_check = false) {
+      if (this->m_stopped) {
+        // don't release tasks if we're stopped but remember that this was released
+        this->m_current = ord;
+        return;
+      }
+      if (!force_check && eligible(ord)) {
+        return; // already at the provided ordinal, nothing to be done
+      }
+      // trigger the next sequence(s) (m_sequence is ordered by ordinal)
+      std::vector<sequence_elem_t> seqs;
+      {
+        auto g = this->lock_guard();
+        // set current ordinal
+        this->m_current = ord;
+        for (auto it = this->m_sequence.begin(); it != this->m_sequence.end();) {
+          if (!eligible(it->first)) break;
+          // extract the next sequence
+          this->m_current = it->first;
+          seqs.push_back(std::move(it->second));
+          it = this->m_sequence.erase(it);
+        }
+      }
+      for (auto& elem : seqs) {
+        for (auto& e : elem.m_keys) {
+          // account for the newly active keys
+          this->notify_listener(ttg::span<key_type>(e.second.data(), e.second.size()), e.first);
+        }
+      }
+    }
+
+  public:
+
+    /**
+     * Used for external key mapper.
+     */
+    SequencedKeysConstraint(bool auto_release = false)
+    : base_t()
+    , m_auto_release(auto_release)
+    { }
+
+    template<typename Mapper_>
+    requires(std::is_invocable_v<Mapper_, Key>)
+    SequencedKeysConstraint(Mapper_&& map, bool auto_release = false)
+    : base_t()
+    , m_map(std::forward<Mapper_>(map))
+    , m_auto_release(auto_release)
+    { }
+
+    SequencedKeysConstraint(SequencedKeysConstraint&& skc) = default;
+
+    SequencedKeysConstraint(const SequencedKeysConstraint& skc) = default;
+
+    SequencedKeysConstraint& operator=(SequencedKeysConstraint&& skc) = default;
+
+    SequencedKeysConstraint& operator=(const SequencedKeysConstraint& skc) = default;
+
+    virtual ~SequencedKeysConstraint() = default;
+
+    /* Check whether the key may be executed.
+     * Returns true if the key may be executed.
+     * Otherwise, returns false and  */
+    template<typename Key_ = key_type, typename Mapper_ = Mapper>
+    std::enable_if_t<!ttg::meta::is_void_v<Key_> && !ttg::meta::is_void_v<Mapper_>, bool>
+    check(const key_type& key, ttg::TTBase *tt) {
+      ordinal_type ord = m_map(key);
+      return this->check_key_impl(key, ord, tt);
+    }
+
+    template<typename Key_ = key_type, typename Mapper_ = Mapper>
+    std::enable_if_t<!ttg::meta::is_void_v<Key_> && ttg::meta::is_void_v<Mapper_>, bool>
+    check(const key_type& key, Ordinal ord, ttg::TTBase *tt) {
+      return this->check_key_impl(key, ord, tt);
+    }
+
+    template<typename Key_ = key_type, typename Mapper_ = Mapper>
+    std::enable_if_t<ttg::meta::is_void_v<Key_> && !ttg::meta::is_void_v<Mapper_>, bool>
+    check(ttg::TTBase *tt) {
+      return this->check_key_impl(ttg::Void{}, m_map(), tt);
+    }
+
+    template<typename Key_ = key_type, typename Mapper_ = Mapper>
+    std::enable_if_t<ttg::meta::is_void_v<Key_> && ttg::meta::is_void_v<Mapper_>, bool>
+    check(ordinal_type ord, ttg::TTBase *tt) {
+      return this->check_key_impl(ttg::Void{}, ord, tt);
+    }
+
+    template<typename Key_ = key_type, typename Mapper_ = Mapper>
+    std::enable_if_t<!ttg::meta::is_void_v<Key_> && !ttg::meta::is_void_v<Mapper_>>
+    complete(const key_type& key, ttg::TTBase *tt) {
+      this->complete_key_impl();
+    }
+
+    template<typename Key_ = key_type, typename Mapper_ = Mapper>
+    std::enable_if_t<!ttg::meta::is_void_v<Key_> && ttg::meta::is_void_v<Mapper_>>
+    complete(const key_type& key, Ordinal ord, ttg::TTBase *tt) {
+      this->complete_key_impl();
+    }
+
+    template<typename Key_ = key_type, typename Mapper_ = Mapper>
+    std::enable_if_t<!ttg::meta::is_void_v<Key_> && ttg::meta::is_void_v<Mapper_>>
+    complete(Ordinal ord, ttg::TTBase *tt) {
+      this->complete_key_impl();
+    }
+
+    template<typename Key_ = key_type, typename Mapper_ = Mapper>
+    std::enable_if_t<!ttg::meta::is_void_v<Key_> && !ttg::meta::is_void_v<Mapper_>>
+    complete(ttg::TTBase *tt) {
+      this->complete_key_impl();
+    }
+
+    /**
+     * Stop all execution. Call \c start to resume.
+     * This constraint is not stopped by default so calls to \c start
+     * are only necessary if explictily stopped.
+     */
+    void stop() {
+      m_stopped = true;
+    }
+
+    /**
+     * Start execution.
+     * This constraint is not stopped by default so calls to \c start
+     * are only necessary if explictily stopped.
+     */
+    void start() {
+      if (m_stopped) {
+        m_stopped = false;
+        if (m_auto_release) {
+          release_next();
+        } else {
+          auto ord = m_current;
+          // release the first set of available keys if none were set explicitly
+          if (ord == std::numeric_limits<ordinal_type>::min() &&
+              this->m_sequence.begin() != this->m_sequence.end()) {
+            ord = this->m_sequence.begin()->first;
+          }
+          release_next(ord, true); // force the check for a next release even if the current ordinal hasn't changed
+        }
+      }
+    }
+
+    /**
+     * Release tasks up to the ordinal. The provided ordinal is ignored if `auto_release` is enabled.
+     */
+    void release(ordinal_type ord = 0) {
+      if (m_auto_release) {
+        // last key for this ordinal, release the next
+        // the provided ordinal is ignored
+        release_next();
+      } else {
+        release_next(ord);
+      }
+    }
+
+    bool is_auto() const {
+      return m_auto_release;
+    }
+
+
+  protected:
+    std::map<ordinal_type, sequence_elem_t, compare_t> m_sequence;
+    ordinal_type m_current = std::numeric_limits<ordinal_type>::min();
+    [[no_unique_address]]
+    Mapper m_map;
+    [[no_unique_address]]
+    compare_t m_order;
+    std::atomic<std::size_t> m_active;
+    bool m_stopped = false;
+    bool m_auto_release = false;
+  };
+
+  // deduction guides: take type of first argument to Mapper as the key type
+  // TODO: can we use the TTG callable_args instead?
+  template<typename Mapper, typename = std::enable_if_t<std::is_invocable_v<Mapper, std::decay_t<std::tuple_element_t<0, boost::callable_traits::args_t<Mapper>>>>>>
+  SequencedKeysConstraint(Mapper&&)
+    -> SequencedKeysConstraint<
+          std::decay_t<std::tuple_element_t<0, boost::callable_traits::args_t<Mapper>>>,
+          std::decay_t<boost::callable_traits::return_type_t<Mapper>>,
+          std::less<std::decay_t<boost::callable_traits::return_type_t<Mapper>>>,
+          std::enable_if_t<std::is_invocable_v<Mapper, std::decay_t<std::tuple_element_t<0, boost::callable_traits::args_t<Mapper>>>>, Mapper>
+          >;
+
+  template<typename Mapper, typename = std::enable_if_t<std::is_invocable_v<Mapper, std::decay_t<std::tuple_element_t<0, boost::callable_traits::args_t<Mapper>>>>>>
+  SequencedKeysConstraint(Mapper&&, bool)
+    -> SequencedKeysConstraint<
+          std::decay_t<std::tuple_element_t<0, boost::callable_traits::args_t<Mapper>>>,
+          std::decay_t<boost::callable_traits::return_type_t<Mapper>>,
+          std::less<std::decay_t<boost::callable_traits::return_type_t<Mapper>>>,
+          std::enable_if_t<std::is_invocable_v<Mapper, std::decay_t<std::tuple_element_t<0, boost::callable_traits::args_t<Mapper>>>>, Mapper>
+          >;
+
+  template<typename Key, typename Ordinal, typename Compare, typename Mapper>
+  SequencedKeysConstraint(SequencedKeysConstraint<Key, Ordinal, Compare, Mapper>&&)
+    -> SequencedKeysConstraint<Key, Ordinal, Compare, Mapper>;
+
+  template<typename Key, typename Ordinal, typename Compare, typename Mapper>
+  SequencedKeysConstraint(const SequencedKeysConstraint<Key, Ordinal, Compare, Mapper>&)
+    -> SequencedKeysConstraint<Key, Ordinal, Compare, Mapper>;
+
+  /**
+   * Make a constraint that can be shared between multiple TT instances.
+   * Overload for incomplete templated constraint types.
+   *
+   * Example:
+   * // SequencedKeysConstraint is incomplete
+   * auto c = ttg::make_shared_constraint<SequencedKeysConstraint>([](Key& k){ return k[0]; });
+   * auto tt_a = ttg::make_tt<Key>(...);
+   * tt_a->add_constraint(c);
+   * auto tt_b = ttg::make_tt<Key>(...);
+   * tt_b->add_constraint(c);
+   *
+   * -> the constraint will handle keys from both tt_a and tt_b. Both TTs must have the same key type.
+   */
+  template<template<typename...> typename Constraint, typename... Args>
+  auto make_shared_constraint(Args&&... args) {
+    return std::make_shared<decltype(Constraint(std::forward<Args>(args)...))>(std::forward<Args>(args)...);
+  }
+
+  /**
+   * Make a constraint that can be shared between multiple TT instances.
+   * Overload for complete constraint types.
+   */
+  template<typename Constraint, typename... Args>
+  auto make_shared_constraint(Args&&... args) {
+    return std::make_shared<Constraint>(std::forward<Args>(args)...);
+  }
+
+
+
+} // namespace ttg
+
+#endif // TTG_CONSTRAINT_H
\ No newline at end of file
diff --git a/ttg/ttg/madness/ttg.h b/ttg/ttg/madness/ttg.h
index 85dee437f..d84f29dcc 100644
--- a/ttg/ttg/madness/ttg.h
+++ b/ttg/ttg/madness/ttg.h
@@ -1206,6 +1206,23 @@ namespace ttg_madness {
       priomap = std::forward<Priomap>(pm);
     }
 
+    /// add a constraint
+    /// the constraint must provide a valid override of `check_key(key)`
+    template<typename Constraint>
+    void add_constraint(Constraint&& c) {
+      /* currently a noop */
+    }
+
+    template<typename Constraint, typename Mapper>
+    void add_constraint(std::shared_ptr<Constraint> c, Mapper&& map) {
+      /* currently a noop */
+    }
+
+    template<typename Constraint, typename Mapper>
+    void add_constraint(Constraint c, Mapper&& map) {
+      /* currently a noop */
+    }
+
     /// implementation of TTBase::make_executable()
     void make_executable() override {
       TTBase::make_executable();
diff --git a/ttg/ttg/madness/ttvalue.h b/ttg/ttg/madness/ttvalue.h
index ad53ee5f8..a171c3c71 100644
--- a/ttg/ttg/madness/ttvalue.h
+++ b/ttg/ttg/madness/ttvalue.h
@@ -9,6 +9,11 @@ namespace ttg_madness {
     /* empty */
   };
 
+  template<typename ValueT>
+  inline auto persistent(ValueT&& value) {
+    return std::forward<ValueT>(value);
+  }
+
 } // namespace ttg_madness
 
   #endif // TTG_MADNESS_TTVALUE_H
diff --git a/ttg/ttg/parsec/task.h b/ttg/ttg/parsec/task.h
index 58ee528ad..f29ca8ecb 100644
--- a/ttg/ttg/parsec/task.h
+++ b/ttg/ttg/parsec/task.h
@@ -202,7 +202,7 @@ namespace ttg_parsec {
     struct parsec_ttg_task_t : public parsec_ttg_task_base_t {
       using key_type = typename TT::key_type;
       static constexpr size_t num_streams = TT::numins;
-      /* device tasks may have to store more copies than it's inputs as their sends are aggregated */
+      /* device tasks may have to store more copies than # of its inputs as their sends are aggregated */
       static constexpr size_t num_copies  = TT::derived_has_device_op() ? static_cast<size_t>(MAX_PARAM_COUNT)
                                                                       : (num_streams+1);
       TT* tt = nullptr;
@@ -215,8 +215,9 @@ namespace ttg_parsec {
       device_state_t<TT::derived_has_device_op()> dev_state;
       ttg_data_copy_t *copies[num_copies] = { nullptr };  // the data copies tracked by this task
 
-      parsec_ttg_task_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class)
-          : parsec_ttg_task_base_t(mempool, task_class, num_streams, copies) {
+      parsec_ttg_task_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class, TT *tt_ptr)
+          : parsec_ttg_task_base_t(mempool, task_class, num_streams, copies)
+          , tt(tt_ptr) {
         tt_ht_item.key = pkey();
         this->dev_ptr = this->dev_state.dev_ptr();
         // We store the hash of the key and the address where it can be found in locals considered as a scratchpad
@@ -282,8 +283,9 @@ namespace ttg_parsec {
       ttg_data_copy_t *copies[num_streams+1] = { nullptr };  // the data copies tracked by this task
                                                              // +1 for the copy needed during send/bcast
 
-      parsec_ttg_task_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class)
-          : parsec_ttg_task_base_t(mempool, task_class, num_streams, copies) {
+      parsec_ttg_task_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class, TT *tt_ptr)
+          : parsec_ttg_task_base_t(mempool, task_class, num_streams, copies)
+          , tt(tt_ptr) {
         tt_ht_item.key = pkey();
         this->dev_ptr = this->dev_state.dev_ptr();
       }
diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h
index 3009891e1..47ab79068 100644
--- a/ttg/ttg/parsec/ttg.h
+++ b/ttg/ttg/parsec/ttg.h
@@ -22,6 +22,7 @@
 #include "ttg/base/keymap.h"
 #include "ttg/base/tt.h"
 #include "ttg/base/world.h"
+#include "ttg/constraint.h"
 #include "ttg/edge.h"
 #include "ttg/execution.h"
 #include "ttg/func.h"
@@ -1173,6 +1174,7 @@ namespace ttg_parsec {
      protected:
       //  static std::map<int, ParsecBaseTT*> function_id_to_instance;
       parsec_hash_table_t tasks_table;
+      parsec_hash_table_t task_constraint_table;
       parsec_task_class_t self;
     };
 
@@ -1342,6 +1344,9 @@ namespace ttg_parsec {
 
     bool m_defer_writer = TTG_PARSEC_DEFER_WRITER;
 
+    std::vector<ttg::meta::detail::constraint_callback_t<keyT>> constraints_check;
+    std::vector<ttg::meta::detail::constraint_callback_t<keyT>> constraints_complete;
+
    public:
     ttg::World get_world() const override final { return world; }
 
@@ -1992,7 +1997,7 @@ namespace ttg_parsec {
       task_t *dummy;
       parsec_execution_stream_s *es = world.impl().execution_stream();
       parsec_thread_mempool_t *mempool = get_task_mempool();
-      dummy = new (parsec_thread_mempool_allocate(mempool)) task_t(mempool, &this->self);
+      dummy = new (parsec_thread_mempool_allocate(mempool)) task_t(mempool, &this->self, this);
       dummy->set_dummy(true);
       // TODO: do we need to copy static_stream_goal in dummy?
 
@@ -2568,6 +2573,83 @@ namespace ttg_parsec {
       }
     }
 
+    bool check_constraints(task_t *task) {
+      bool constrained = false;
+      if (constraints_check.size() > 0) {
+        if constexpr (ttg::meta::is_void_v<keyT>) {
+          constrained = !constraints_check[0]();
+        } else {
+          constrained = !constraints_check[0](task->key);
+        }
+      }
+      if (constrained) {
+        // store the task so we can later access it once it is released
+        parsec_hash_table_insert(&task_constraint_table, &task->tt_ht_item);
+      }
+      return !constrained;
+    }
+
+    template<typename Key = keyT>
+    std::enable_if_t<ttg::meta::is_void_v<Key>, void> release_constraint(std::size_t cid) {
+      // check the next constraint, if any
+      assert(cid < constraints_check.size());
+      bool release = true;
+      for (std::size_t i = cid+1; i < constraints_check.size(); i++) {
+        if (!constraints_check[i]()) {
+          release = false;
+          break;
+        }
+      }
+      if (release) {
+        // no constraint blocked us
+        task_t *task;
+        parsec_key_t hk = 0;
+        task = (task_t*)parsec_hash_table_remove(&task_constraint_table, hk);
+        assert(task != nullptr);
+        auto &world_impl = world.impl();
+        parsec_execution_stream_t *es = world_impl.execution_stream();
+        parsec_task_t *vp_task_rings[1] = { &task->parsec_task };
+        __parsec_schedule_vp(es, vp_task_rings, 0);
+      }
+    }
+
+    template<typename Key = keyT>
+    std::enable_if_t<!ttg::meta::is_void_v<Key>, void> release_constraint(std::size_t cid, const std::span<Key>& keys) {
+      assert(cid < constraints_check.size());
+      parsec_task_t *task_ring = nullptr;
+      for (auto& key : keys) {
+        task_t *task;
+        bool release = true;
+        for (std::size_t i = cid+1; i < constraints_check.size(); i++) {
+          if (!constraints_check[i](key)) {
+            release = false;
+            break;
+          }
+        }
+
+        if (release) {
+          // no constraint blocked this task, so go ahead and release
+          auto hk = reinterpret_cast<parsec_key_t>(&key);
+          task = (task_t*)parsec_hash_table_remove(&task_constraint_table, hk);
+          assert(task != nullptr);
+          if (task_ring == nullptr) {
+            /* the first task is set directly */
+            task_ring = &task->parsec_task;
+          } else {
+            /* push into the ring */
+            parsec_list_item_ring_push_sorted(&task_ring->super, &task->parsec_task.super,
+                                              offsetof(parsec_task_t, priority));
+          }
+        }
+      }
+      if (nullptr != task_ring) {
+        auto &world_impl = world.impl();
+        parsec_execution_stream_t *es = world_impl.execution_stream();
+        parsec_task_t *vp_task_rings[1] = { task_ring };
+        __parsec_schedule_vp(es, vp_task_rings, 0);
+      }
+    }
+
     void release_task(task_t *task,
                       parsec_task_t **task_ring = nullptr) {
       constexpr const bool keyT_is_Void = ttg::meta::is_void_v<keyT>;
@@ -2597,16 +2679,19 @@ namespace ttg_parsec {
           }
         }
         if (task->remove_from_hash) parsec_hash_table_remove(&tasks_table, hk);
-        if (nullptr == task_ring) {
-          parsec_task_t *vp_task_rings[1] = { &task->parsec_task };
-          __parsec_schedule_vp(es, vp_task_rings, 0);
-        } else if (*task_ring == nullptr) {
-          /* the first task is set directly */
-          *task_ring = &task->parsec_task;
-        } else {
-          /* push into the ring */
-          parsec_list_item_ring_push_sorted(&(*task_ring)->super, &task->parsec_task.super,
-                                            offsetof(parsec_task_t, priority));
+
+        if (check_constraints(task)) {
+          if (nullptr == task_ring) {
+            parsec_task_t *vp_task_rings[1] = { &task->parsec_task };
+            __parsec_schedule_vp(es, vp_task_rings, 0);
+          } else if (*task_ring == nullptr) {
+            /* the first task is set directly */
+            *task_ring = &task->parsec_task;
+          } else {
+            /* push into the ring */
+            parsec_list_item_ring_push_sorted(&(*task_ring)->super, &task->parsec_task.super,
+                                              offsetof(parsec_task_t, priority));
+          }
         }
       } else if constexpr (!ttg::meta::is_void_v<keyT>) {
         if ((baseobj->num_pullins + count == numins) && baseobj->is_lazy_pull()) {
@@ -3766,6 +3851,14 @@ namespace ttg_parsec {
         detail::release_data_copy(copy);
         task->copies[i] = nullptr;
       }
+
+      for (auto& c : task->tt->constraints_complete) {
+        if constexpr(std::is_void_v<keyT>) {
+          c();
+        } else {
+          c(task->key);
+        }
+      }
       return PARSEC_HOOK_RETURN_DONE;
     }
 
@@ -3920,6 +4013,9 @@ namespace ttg_parsec {
 
       parsec_hash_table_init(&tasks_table, offsetof(detail::parsec_ttg_task_base_t, tt_ht_item), 8, tasks_hash_fcts,
                              NULL);
+
+      parsec_hash_table_init(&task_constraint_table, offsetof(detail::parsec_ttg_task_base_t, tt_ht_item), 8, tasks_hash_fcts,
+                             NULL);
     }
 
     template <typename keymapT = ttg::detail::default_keymap<keyT>,
@@ -4289,6 +4385,57 @@ namespace ttg_parsec {
     /// @return the device map
     auto get_devicemap() { return devicemap; }
 
+    /// add a shared constraint
+    /// the constraint must provide a valid override of `check_key(key)`
+    template<typename Constraint>
+    void add_constraint(std::shared_ptr<Constraint> c) {
+      std::size_t cid = constraints_check.size();
+      if constexpr(ttg::meta::is_void_v<keyT>) {
+        c->add_listener([this, cid](){ this->release_constraint(cid); }, this);
+        constraints_check.push_back([c, this](){ return c->check(this); });
+        constraints_complete.push_back([c, this](const keyT& key){ c->complete(this); return true; });
+      } else {
+        c->add_listener([this, cid](const std::span<keyT>& keys){ this->release_constraint(cid, keys); }, this);
+        constraints_check.push_back([c, this](const keyT& key){ return c->check(key, this); });
+        constraints_complete.push_back([c, this](const keyT& key){ c->complete(key, this); return true; });
+      }
+    }
+
+    /// add a constraint
+    /// the constraint must provide a valid override of `check_key(key)`
+    template<typename Constraint>
+    void add_constraint(Constraint&& c) {
+      // need to make this a shared_ptr since it's shared between different callbacks
+      this->add_constraint(std::make_shared<Constraint>(std::forward<Constraint>(c)));
+    }
+
+    /// add a shared constraint
+    /// the constraint must provide a valid override of `check_key(key, map(key))`
+    /// ths overload can be used to provide different key mapping functions for each TT
+    template<typename Constraint, typename Mapper>
+    void add_constraint(std::shared_ptr<Constraint> c, Mapper&& map) {
+      static_assert(std::is_same_v<typename Constraint::key_type, keyT>);
+      std::size_t cid = constraints_check.size();
+      if constexpr(ttg::meta::is_void_v<keyT>) {
+        c->add_listener([this, cid](){ this->release_constraint(cid); }, this);
+        constraints_check.push_back([map, c, this](){ return c->check(map(), this); });
+        constraints_complete.push_back([map, c, this](){ c->complete(map(), this); return true; });
+      } else {
+        c->add_listener([this, cid](const std::span<keyT>& keys){ this->release_constraint(cid, keys); }, this);
+        constraints_check.push_back([map, c, this](const keyT& key){ return c->check(key, map(key), this); });
+        constraints_complete.push_back([map, c, this](const keyT& key){ c->complete(key, map(key), this); return true; });
+      }
+    }
+
+    /// add a shared constraint
+    /// the constraint must provide a valid override of `check_key(key, map(key))`
+    /// ths overload can be used to provide different key mapping functions for each TT
+    template<typename Constraint, typename Mapper>
+    void add_constraint(Constraint c, Mapper&& map) {
+      // need to make this a shared_ptr since it's shared between different callbacks
+      this->add_constraint(std::make_shared<Constraint>(std::forward<Constraint>(c)), std::forward<Mapper>(map));
+    }
+
     // Register the static_op function to associate it to instance_id
     void register_static_op_function(void) {
       int rank;
@@ -4425,8 +4572,13 @@ struct ttg::detail::value_copy_handler<ttg::Runtime::PaRSEC> {
       bool inserted = ttg_parsec::detail::add_copy_to_task(copy, caller);
       assert(inserted);
       copy_to_remove = copy; // we want to remove the copy from the task once done sending
-      do_release = false; // we don't release the copy since we didn't allocate it
+      do_release = true; // we don't release the copy since we didn't allocate it
       copy->add_ref(); // add a reference so that TTG does not attempt to delete this object
+      copy->add_ref(); // add another reference so that TTG never attempts to free this copy
+      if (copy->num_readers() == 0) {
+        /* add at least one reader (the current task) */
+        copy->increment_readers<false>();
+      }
     }
     return vref.value_ref;
   }
diff --git a/ttg/ttg/parsec/ttg_data_copy.h b/ttg/ttg/parsec/ttg_data_copy.h
index 351bc7a2c..fbd58c64e 100644
--- a/ttg/ttg/parsec/ttg_data_copy.h
+++ b/ttg/ttg/parsec/ttg_data_copy.h
@@ -572,15 +572,12 @@ namespace ttg_parsec {
     inline
     ttg_parsec_data_wrapper_t& ttg_parsec_data_wrapper_t::operator=(ttg_parsec_data_wrapper_t&& other) {
       m_data = std::move(other.m_data);
-      /* check whether the owning ttg_data_copy has already moved us */
-      if (other.m_ttg_copy != m_ttg_copy) {
-        /* remove from old ttg copy */
-        other.remove_from_owner();
-
-        if (nullptr != m_ttg_copy) {
-          /* register with the new ttg_copy */
-          m_ttg_copy->add_device_data(this);
-        }
+      /* remove from old ttg copy */
+      other.remove_from_owner();
+
+      if (nullptr != m_ttg_copy) {
+        /* register with the new ttg_copy */
+        m_ttg_copy->add_device_data(this);
       }
       return *this;
     }
diff --git a/ttg/ttg/parsec/ttvalue.h b/ttg/ttg/parsec/ttvalue.h
index b93f1687f..b5b6aa982 100644
--- a/ttg/ttg/parsec/ttvalue.h
+++ b/ttg/ttg/parsec/ttvalue.h
@@ -91,9 +91,11 @@ namespace ttg_parsec {
 
   template<typename ValueT>
   inline auto persistent(ValueT&& value) {
-    static_assert(std::is_base_of_v<TTValue<std::decay_t<ValueT>>, std::decay_t<ValueT>>,
-                  "ttg::persistent can only be used on types derived from ttg::TTValue");
-    return detail::persistent_value_ref<ValueT>{value};
+    if constexpr (std::is_base_of_v<TTValue<std::decay_t<ValueT>>, std::decay_t<ValueT>>) {
+      return detail::persistent_value_ref<ValueT>{value};
+    } else {
+      return std::forward<ValueT>(value);
+    }
   }
 
 } // namespace ttg_parsec
diff --git a/ttg/ttg/util/meta.h b/ttg/ttg/util/meta.h
index f3af03152..c595369b0 100644
--- a/ttg/ttg/util/meta.h
+++ b/ttg/ttg/util/meta.h
@@ -912,6 +912,21 @@ namespace ttg {
       template <typename Key, typename Value>
       using prepare_send_callback_t = typename prepare_send_callback<Key, Value>::type;
 
+      template<typename Key, typename Enabler = void>
+      struct constraint_callback;
+
+      template<typename Key>
+      struct constraint_callback<Key, std::enable_if_t<!is_void_v<Key>>> {
+        using type = std::function<bool(const Key&)>;
+      };
+
+      template<typename Key>
+      struct constraint_callback<Key, std::enable_if_t<is_void_v<Key>>> {
+        using type = std::function<bool()>;
+      };
+
+      template<typename Key>
+      using constraint_callback_t = typename constraint_callback<Key>::type;
 
     }  // namespace detail