diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index bd9281d0b..68933c61c 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -10,11 +10,13 @@ jobs: fail-fast: false matrix: build_type : [ Release, Debug ] - os : [ macos-latest, ubuntu-20.04 ] + os : [ macos-latest, ubuntu-22.04 ] include: - - os: ubuntu-20.04 - cxx: /usr/bin/g++-9 + - os: ubuntu-22.04 + cc: /usr/bin/gcc-12 + cxx: /usr/bin/g++-12 - os: macos-latest + cc: clang cxx: clang++ name: "${{ matrix.os }}: ${{ matrix.cxx }} ${{ matrix.build_type }}" @@ -36,6 +38,7 @@ jobs: -DMPIEXEC_PREFLAGS='--bind-to;none;--allow-run-as-root' -DCMAKE_INSTALL_PREFIX=${{github.workspace}}/install -DTTG_EXAMPLES=ON + -DCMAKE_CXX_STANDARD=20 steps: - uses: actions/checkout@v2 @@ -45,12 +48,12 @@ jobs: run: brew install ninja gcc@10 boost eigen open-mpi bison ccache - name: Install prerequisites Ubuntu packages - if: ${{ matrix.os == 'ubuntu-20.04' }} + if: ${{ matrix.os == 'ubuntu-22.04' }} run: | wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | sudo tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null sudo apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" sudo apt-get update - sudo apt-get -y install ninja-build g++-9 liblapack-dev libboost-dev libboost-serialization-dev libeigen3-dev openmpi-bin libopenmpi-dev libtbb-dev ccache flex bison cmake + sudo apt-get -y install ninja-build g++-12 liblapack-dev libboost-dev libboost-serialization-dev libboost-random-dev libeigen3-dev openmpi-bin libopenmpi-dev libtbb-dev ccache flex bison cmake doxygen - name: Create Build Environment # Some projects don't allow in-source building, so create a separate build directory @@ -58,24 +61,6 @@ jobs: run: | cmake -E make_directory ${{github.workspace}}/build - - - name: Install doxygen for Release test - if: ${{ matrix.os == 'ubuntu-20.04' }} - run: | - if [ "${{matrix.build_type}}" = "Release" ]; then - sudo apt-get -y install libclang1-9 libclang-cpp9 graphviz fonts-liberation - cd ${{github.workspace}}/build - # If we fail getting doxygen-${DOXYGEN_VERSION}.linux.bin.tar.gz from sourceforge, - # use EFV's gdrive mirror of 1.9.2 to work around the unreliable sourceforge - # the sharing link: https://drive.google.com/file/d/16GXpH4YOEUxGXQrXOKdAIibhdfzATY0d/view?usp=sharing - wget https://downloads.sourceforge.net/project/doxygen/rel-${DOXYGEN_VERSION}/doxygen-${DOXYGEN_VERSION}.linux.bin.tar.gz || wget -4 --no-check-certificate -O doxygen-${DOXYGEN_VERSION}.linux.bin.tar.gz "https://drive.google.com/uc?export=download&id=16GXpH4YOEUxGXQrXOKdAIibhdfzATY0d" - tar xzf ./doxygen-${DOXYGEN_VERSION}.linux.bin.tar.gz - export DOXYGEN_DIR=${{github.workspace}}/build/doxygen-${DOXYGEN_VERSION} - ${DOXYGEN_DIR}/bin/doxygen --version - # doxygen should be in PATH in subsequent steps - echo "${DOXYGEN_DIR}/bin" >> $GITHUB_PATH - fi - - name: Prepare ccache timestamp id: ccache_cache_timestamp shell: cmake -P {0} @@ -99,7 +84,8 @@ jobs: # Note the current convention is to use the -S and -B options here to specify source # and build directories, but this is only available with CMake 3.13 and higher. 
# The CMake binaries on the Github Actions machines are (as of this writing) 3.12 - run: cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE $BUILD_CONFIG + run: | + cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE $BUILD_CONFIG || (cat CMakeFiles/CMakeOutput.log && cat CMakeFiles/CMakeError.log) - name: Build working-directory: ${{github.workspace}}/build @@ -124,7 +110,7 @@ working-directory: ${{github.workspace}}/build shell: bash run: | - cmake -S $GITHUB_WORKSPACE/doc/dox/dev/devsamp/main -B test_install_devsamp -DCMAKE_PREFIX_PATH=${{github.workspace}}/install + cmake -S $GITHUB_WORKSPACE/doc/dox/dev/devsamp/main -B test_install_devsamp -DCMAKE_PREFIX_PATH=${{github.workspace}}/install || (cat test_install_devsamp/CMakeFiles/CMakeOutput.log && cat test_install_devsamp/CMakeFiles/CMakeError.log) cmake --build test_install_devsamp cmake -E make_directory test_install_userexamples cat > test_install_userexamples/CMakeLists.txt <<EOF ... diff --git a/CMakeLists.txt b/CMakeLists.txt ... if (FORCE_COLORED_OUTPUT) if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") - add_compile_options (-fdiagnostics-color=always) + add_compile_options ($<$<COMPILE_LANGUAGE:CXX>:-fdiagnostics-color=always>) elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - add_compile_options (-fcolor-diagnostics) + add_compile_options ($<$<COMPILE_LANGUAGE:CXX>:-fcolor-diagnostics>) endif () endif (FORCE_COLORED_OUTPUT) +if (TTG_ENABLE_ASAN) + add_compile_options(-fsanitize=address) + add_link_options(-fsanitize=address) +endif (TTG_ENABLE_ASAN) + +set(TTG_HIP_PLATFORM "__HIP_PLATFORM_AMD__" CACHE STRING "Which platform to use when compiling HIP-related code (default: __HIP_PLATFORM_AMD__)") ########################## #### prerequisites ########################## @@ -75,9 +93,73 @@ endif (BUILD_TESTING) #### optional prerequisites ########################### # Boost -include(FindOrFetchBoost) -# Cereal -#include(FindOrFetchCereal) +include("${PROJECT_SOURCE_DIR}/cmake/modules/FindOrFetchBoost.cmake") +# C++ coroutines +find_package(CXXStdCoroutine MODULE REQUIRED COMPONENTS Final Experimental) + + +########################## +#### CUDA: must come before PaRSEC +########################## +if (TTG_ENABLE_CUDA) + include(CheckLanguage) + check_language(CUDA) + if(CMAKE_CUDA_COMPILER) + enable_language(CUDA) + endif(CMAKE_CUDA_COMPILER) + set(TTG_HAVE_CUDA ${CMAKE_CUDA_COMPILER} CACHE BOOL "True if TTG supports compiling .cu files") + + find_package(CUDAToolkit) + if (TARGET CUDA::cudart) + set(TTG_HAVE_CUDART True CACHE BOOL "TTG supports execution on CUDA devices") + endif() +endif(TTG_ENABLE_CUDA) + +if (TTG_ENABLE_HIP) + # HIP LANGUAGE introduced in 3.21 + cmake_minimum_required(VERSION 3.21) + include(CheckLanguage) + check_language(HIP) + if(CMAKE_HIP_COMPILER) + enable_language(HIP) + endif(CMAKE_HIP_COMPILER) + set(TTG_HAVE_HIP ${CMAKE_HIP_COMPILER} CACHE BOOL "True if TTG supports compiling .hip files") + + find_package(hipblas) + if (TARGET roc::hipblas) + set(TTG_HAVE_HIPBLAS True CACHE BOOL "TTG detected support for hipBLAS") + endif() + + find_package(hipsolver) + if (TARGET roc::hipsolver) + set(TTG_HAVE_HIPSOLVER True CACHE BOOL "TTG detected support for hipSolver") + endif() + add_compile_definitions(${TTG_HIP_PLATFORM}) +endif(TTG_ENABLE_HIP) + +if (TTG_ENABLE_LEVEL_ZERO) + find_package(level-zero) + set(TTG_HAVE_LEVEL_ZERO ${LEVEL_ZERO_FOUND} CACHE BOOL "True if TTG provides support for Intel Level Zero") + if(TTG_HAVE_LEVEL_ZERO) + include_directories("${LEVEL_ZERO_INCLUDE_DIR}/level_zero/") + find_package(DPCPP) + if(DPCPP_EXECUTABLE) + set(TTG_HAVE_DPCPP TRUE CACHE BOOL "True if TTG knows how to compile DPCPP code") + message(STATUS "Found Intel level-zero ${LEVEL_ZERO_VERSION} in -I${LEVEL_ZERO_INCLUDE_DIR} /
-L${LEVEL_ZERO_LIBRARY_DIR}") + message(STATUS "Found dpcpp in ${DPCPP_EXECUTABLE}") + + find_package(MKL) + else(DPCPP_EXECUTABLE) + set(TTG_HAVE_DPCPP FALSE CACHE BOOL "True if TTG knows how to compile DPCPP code") + endif(DPCPP_EXECUTABLE) + endif(TTG_HAVE_LEVEL_ZERO) +endif(TTG_ENABLE_LEVEL_ZERO) + +set(_ttg_have_device FALSE) +if (TTG_HAVE_CUDA OR TTG_HAVE_HIP OR TTG_HAVE_LEVEL_ZERO) + set(_ttg_have_device TRUE) +endif() +set(TTG_HAVE_DEVICE ${_ttg_have_device} CACHE BOOL "True if TTG has support for any device programming model") ########################## #### prerequisite runtimes @@ -93,21 +175,28 @@ if (TARGET MADworld) message(STATUS "MADNESS_FOUND=1") endif(TARGET MADworld) +#################################################### +#### Check for MPIX_Query_[cuda|rocm]_support +#### Open MPI provides mpi-ext.h for such extensions +#### so check for that first. +#################################################### +find_package(MPI) +set(TTG_HAVE_MPI ${MPI_FOUND}) +if (MPI_FOUND) + include(CheckIncludeFiles) + set(CMAKE_REQUIRED_INCLUDES ${MPI_C_INCLUDE_DIRS}) + check_include_files("mpi-ext.h" TTG_HAVE_MPIEXT) +else(MPI_FOUND) + set(TTG_HAVE_MPIEXT FALSE) +endif(MPI_FOUND) + ########################## #### Examples ########################## # N.B. discover last so that we do not affect core dependencies if (TTG_EXAMPLES) - # sparse tensor algorithms need Eigen - find_package(Eigen3) - message (STATUS "EIGEN3_FOUND=${EIGEN3_FOUND}") - if (EIGEN3_FOUND) - add_library(eigen3 INTERFACE IMPORTED) - set_property(TARGET eigen3 PROPERTY - INTERFACE_INCLUDE_DIRECTORIES ${EIGEN3_INCLUDE_DIR}) - endif (EIGEN3_FOUND) - # BTAS brings in linear algebra (BLAS++/LAPACK++) - include(FindOrFetchBTAS) + # TiledArray brings in BTAS AND linear algebra (BLAS++/LAPACK++) + include(FindOrFetchTiledArray) # OpenMP may also be used by some examples find_package(OpenMP COMPONENTS CXX) # std::execution may also be used by some examples @@ -154,6 +243,7 @@ configure_package_config_file(cmake/ttg-config.cmake.in install(FILES "${PROJECT_SOURCE_DIR}/cmake/modules/AddTTGExecutable.cmake" "${PROJECT_SOURCE_DIR}/cmake/modules/AddTTGTestExecutable.cmake" + "${PROJECT_SOURCE_DIR}/cmake/modules/FindCXXStdCoroutine.cmake" DESTINATION "${CMAKE_INSTALL_CMAKEDIR}/modules" COMPONENT ttg-config) diff --git a/INSTALL.md b/INSTALL.md index 7a2470f7b..bb459f618 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -13,12 +13,12 @@ $ cmake --build ttg/build --target install TTG is usable only on POSIX systems. ## mandatory prerequisites -- [CMake](https://cmake.org/), version 3.14 or higher -- C++ compiler with support for the [C++17 standard](http://www.iso.org/standard/68564.html), or a more recent standard. This includes the following compilers: - - [GNU C++](https://gcc.gnu.org/), version 7.0 or higher - - [Clang](https://clang.llvm.org/), version 5 or higher - - [Apple Clang](https://en.wikipedia.org/wiki/Xcode), version 9.3 or higher - - [Intel C++ compiler](https://software.intel.com/en-us/c-compilers), version 19 or higher +- [CMake](https://cmake.org/), version 3.14 or higher; version 3.21 or higher is required to support execution on HIP/ROCm-capable devices. +- C++ compiler with support for the [C++20 standard](http://www.iso.org/standard/68564.html), or a more recent standard. This includes the following compilers: + - [GNU C++](https://gcc.gnu.org/), version 10.0 or higher; GCC is the only compiler that can be used for accelerator programming.
+ - [Clang](https://clang.llvm.org/), version 10 or higher + - [Apple Clang](https://en.wikipedia.org/wiki/Xcode), version 10.0 or higher + - [Intel C++ compiler](https://software.intel.com/en-us/c-compilers), version 2021.1 or higher - one or more of the following runtimes: - [PaRSEC](https://bitbucket.org/icldistcomp/parsec): this distributed-memory runtime is the primary runtime intended for high-performance implementation of TTG - [MADNESS](https://github.org/m-a-d-n-e-s-s/madness): this distributed-memory runtime is to be used primarily for developmental purposes @@ -27,12 +27,15 @@ While the list of prerequisites is short, note that the runtimes have many more Also: it is _strongly_ recommended that the runtimes are built as parts of the TTG build process (this requires some of the optional prerequisites, listed below). This will make sure that the correct versions of the runtimes are used. ## optional prerequisites -- [Git](https://git-scm.com) 1.8 or later: needed to obtain the source code for PaRSEC or MADNESS runtimes -- [Boost](https://boost.org/) version 1.66 or later: needed to use TTG with classes serializable by the [Boost.Serialization](https://www.boost.org/doc/libs/master/libs/serialization/doc/index.html) library. - - The [Boost.Serialization](https://www.boost.org/doc/libs/master/libs/serialization/doc/index.html) library is not header-only, i.e., it must be compiled. - - If the Boost package is not detected TTG can download and build Boost as part of its build process; to do that configure TTG with the CMake cache variable `TTG_FETCH_BOOST` set to `ON` (e.g., by adding `-DTTG_FETCH_BOOST=ON` to the CMake executable command line) - - *Note to package maintainers*: TTG also requires Boost.CallableTraits; if Boost is not found or built, TTG installs and uses a bundled copy of Boost.CallableTraits. To avoid the installation and use of the bundled Boost.CallableTraits configure TTG with the CMake cache variable `TTG_IGNORE_BUNDLED_EXTERNALS` set to `ON`. -- ([Doxygen](http://www.doxygen.nl/), version 1.8.12 or later: needed for building documentation +- [Git](https://git-scm.com): needed to obtain the source code for any prerequisite built from source as part of TTG, such as the PaRSEC or MADNESS runtimes +- [Boost](https://boost.org/) version 1.81 or later. If the Boost package is not detected, TTG can download and build Boost as part of its build process, but this is NOT recommended; you should obtain Boost via the system or a third-party package manager. Experts may try to build Boost from source as part of TTG by configuring it with the CMake cache variable `TTG_FETCH_BOOST` set to `ON` (e.g., by adding `-DTTG_FETCH_BOOST=ON` to the CMake executable command line). The following primary Boost libraries/modules (and their transitive dependents) are used: + - (required) [Boost.CallableTraits](https://github.com/boostorg/callable_traits): used to introspect generic callables given to `make_tt`. P.S. TTG has a bundled copy of `Boost.CallableTraits` which is used and installed if Boost is not found or built from source. To avoid the installation and use of the bundled Boost.CallableTraits configure TTG with the CMake cache variable `TTG_IGNORE_BUNDLED_EXTERNALS` set to `ON`. + - (optional) [Boost.Serialization](https://www.boost.org/doc/libs/master/libs/serialization/doc/index.html): needed to use TTG with classes serializable by the [Boost.Serialization](https://www.boost.org/doc/libs/master/libs/serialization/doc/index.html) library. Note that `Boost.Serialization` is not header-only, i.e., it must be compiled.
This is only required if TTG is configured with the CMake cache variable `TTG_PARSEC_USE_BOOST_SERIALIZATION` set to `ON`. +- [Doxygen](http://www.doxygen.nl/), version 1.8.12 or later: needed for building documentation. +- for execution on GPGPUs and other accelerators, the following are required: + - [CUDA compiler and runtime](https://developer.nvidia.com/cuda-zone) -- for execution on NVIDIA's CUDA-enabled accelerators. CUDA 11 or later is required. + - [HIP/ROCm compiler and runtime](https://rocm.docs.amd.com/) -- for execution on AMD's ROCm-enabled accelerators. + - [oneAPI DPC++/SYCL/LevelZero compiler and runtime](https://github.com/intel/llvm) -- for execution on Intel accelerators. ## transitive prerequisites @@ -60,10 +63,14 @@ TTG includes several examples that may require additional prerequisites. These a ## useful cmake cache variables: -| Variable |Default | Description | -|--------------------------------|--------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `BUILD_TESTING` | `ON` | whether target `check-ttg` and its relatives will actually build and run unit tests | -| `TTG_EXAMPLES` | `OFF` | whether target `check-ttg` and its relatives will actually build and run examples; setting this to `ON` will cause detection of several optional prerequisites, and (if missing) building from source | -| `TTG_ENABLE_TRACE` | `OFF` | setting this to `ON` will enable the ability to instrument TTG code for tracing (see `ttg::trace()`, etc.); if this is set to `OFF`, `ttg::trace()` is a no-op | -| `TTG_FETCH_BOOST` | `OFF` | whether to download and build Boost automatically, if missing | -| `TTG_IGNORE_BUNDLED_EXTERNALS` | `OFF` | whether to install and use bundled external dependencies (currently, only Boost.CallableTraits) | +| Variable |Default | Description | +|--------------------------------------|--------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `TTG_ENABLE_CUDA` | `OFF` | whether to enable CUDA device support | +| `TTG_ENABLE_HIP` | `OFF` | whether to enable HIP/ROCm device support | +| `TTG_ENABLE_LEVEL_ZERO` | `OFF` | whether to enable Intel oneAPI Level Zero device support | +| `BUILD_TESTING` | `ON` | whether target `check-ttg` and its relatives will actually build and run unit tests | +| `TTG_EXAMPLES` | `OFF` | whether target `check-ttg` and its relatives will actually build and run examples; setting this to `ON` will cause detection of several optional prerequisites, and (if missing) building from source | +| `TTG_ENABLE_TRACE` | `OFF` | setting this to `ON` will enable the ability to instrument TTG code for tracing (see `ttg::trace()`, etc.); if this is set to `OFF`, `ttg::trace()` is a no-op | +| `TTG_PARSEC_USE_BOOST_SERIALIZATION` | `OFF` | whether to use Boost.Serialization for serialization with the PaRSEC backend; if this is set to `OFF`, the PaRSEC backend will only be able to use trivially-copyable data types or, if the MADNESS backend is available, MADNESS-serializable types.
| +| `TTG_FETCH_BOOST` | `OFF` | whether to download and build Boost automatically, if missing | +| `TTG_IGNORE_BUNDLED_EXTERNALS` | `OFF` | whether to install and use bundled external dependencies (currently, only Boost.CallableTraits) | diff --git a/cmake/modules/AddCUDAToolkit.cmake b/cmake/modules/AddCUDAToolkit.cmake new file mode 100644 index 000000000..5d682345c --- /dev/null +++ b/cmake/modules/AddCUDAToolkit.cmake @@ -0,0 +1,10 @@ +find_package(CUDAToolkit) +set(TTG_HAVE_CUDA ${CUDAToolkit_FOUND} CACHE BOOL "True if TTG supports CUDA") +if (TTG_HAVE_CUDA) + include(CheckLanguage) + check_language(CUDA) + if(CMAKE_CUDA_COMPILER) + enable_language(CUDA) + endif(CMAKE_CUDA_COMPILER) + set(TTG_HAVE_CU_COMPILER ${CMAKE_CUDA_COMPILER} CACHE BOOL "True if TTG supports compiling .cu files") +endif(TTG_HAVE_CUDA) \ No newline at end of file diff --git a/cmake/modules/AddTTGLibrary.cmake b/cmake/modules/AddTTGLibrary.cmake index 29ce0f8df..5c8c2d560 100644 --- a/cmake/modules/AddTTGLibrary.cmake +++ b/cmake/modules/AddTTGLibrary.cmake @@ -106,7 +106,14 @@ macro(add_ttg_library) # set_target_properties(${_library} PROPERTIES PUBLIC_HEADER "${ADD_TTG_LIBRARY_PUBLIC_HEADER}") # install manually foreach ( file ${ADD_TTG_LIBRARY_PUBLIC_HEADER} ) - file(RELATIVE_PATH _rel_file_path "${PROJECT_SOURCE_DIR}/ttg" "${file}") + # N.B. some files are in the build tree + if ("${file}" MATCHES "^${PROJECT_SOURCE_DIR}/ttg") + file(RELATIVE_PATH _rel_file_path "${PROJECT_SOURCE_DIR}/ttg" "${file}") + elseif("${file}" MATCHES "^${PROJECT_BINARY_DIR}/ttg") + file(RELATIVE_PATH _rel_file_path "${PROJECT_BINARY_DIR}/ttg" "${file}") + else() + message(FATAL_ERROR "AddTTGLibrary: could not deduce install location for public header ${file} of component ${_library}") + endif() get_filename_component( dir "${_rel_file_path}" DIRECTORY ) install( FILES ${file} DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${dir}" COMPONENT ${_library}) endforeach() diff --git a/cmake/modules/ExternalDependenciesVersions.cmake b/cmake/modules/ExternalDependenciesVersions.cmake index b0c10abf0..fd96e5816 100644 --- a/cmake/modules/ExternalDependenciesVersions.cmake +++ b/cmake/modules/ExternalDependenciesVersions.cmake @@ -1,10 +1,12 @@ # for each dependency track both current and previous id (the variable for the latter must contain PREVIOUS) # to be able to auto-update them +set(TTG_TRACKED_VG_CMAKE_KIT_TAG 7ea2d4d3f8854b9e417f297fd74d6fc49aa13fd5) # used to provide "real" FindOrFetchBoost +set(TTG_TRACKED_CATCH2_VERSION 3.5.0) +set(TTG_TRACKED_MADNESS_TAG 2eb3bcf0138127ee2dbc651f1aabd3e9b0def4e3) +set(TTG_TRACKED_PARSEC_TAG 0b3140f58ad9dc78a3d64da9fd73ecc7f443ece7) +set(TTG_TRACKED_BTAS_TAG 4e8f5233aa7881dccdfcc37ce07128833926d3c2) +set(TTG_TRACKED_TILEDARRAY_TAG 493c109379a1b64ddd5ef59f7e33b95633b68d73) + # need Boost.CallableTraits (header only, part of Boost 1.66 released in Dec 2017) for wrap.h to work -set(TTG_TRACKED_BOOST_VERSION 1.66) -set(TTG_TRACKED_CATCH2_VERSION 2.13.1) -set(TTG_TRACKED_CEREAL_VERSION 1.3.0) -set(TTG_TRACKED_MADNESS_TAG 31b2470ca722a6a2d84d4de08d32fb72ae8fdeda) -set(TTG_TRACKED_PARSEC_TAG 9fc74b6f165605a133125d8a5b62cf55642c1907) -set(TTG_TRACKED_BTAS_TAG d73153ad9bc41a177e441ef04eceff7fab0c766d) +set(TTG_OLDEST_BOOST_VERSION 1.66) diff --git a/cmake/modules/FindCXXStdCoroutine.cmake b/cmake/modules/FindCXXStdCoroutine.cmake new file mode 100644 index 000000000..dbe0b394f --- /dev/null +++ b/cmake/modules/FindCXXStdCoroutine.cmake @@ -0,0 +1,195 @@
See accompanying +# file Copyright.txt or https://cmake.org/licensing for details. + +# This is copied from: +# https://github.com/vector-of-bool/CMakeCM/blob/master/modules/FindFilesystem.cmake + +#[=======================================================================[.rst: + +FindCXXStdCoroutine +############## + +This module supports the C++20 standard library's coroutine utilities. Link your target to the +:imp-target:`std::coroutine` imported target to provide standard C++ coroutine API. + +Options +******* + +The ``COMPONENTS`` argument to this module supports the following values: + +.. find-component:: Experimental + :name: coro.Experimental + + Allows the module to find the "experimental" version of the + Coroutine library. This is the library that should be used with the + ``std::experimental::coroutine`` namespace. + +.. find-component:: Final + :name: coro.Final + + Finds the final C++20 standard version of the coroutine library. + +If no components are provided, behaves as if the +:find-component:`coro.Final` component was specified. + +If both :find-component:`coro.Experimental` and :find-component:`coro.Final` are +provided, first looks for ``Final``, and falls back to ``Experimental`` in case +of failure. If ``Final`` is found, :imp-target:`std::coroutine` and all +:ref:`variables ` will refer to the ``Final`` version. + + +Imported Targets +**************** + +.. imp-target:: std::coroutine + + The ``std::coroutine`` imported target is defined when any requested + version of the C++ coroutine library has been found, whether it is + *Experimental* or *Final*. + + If no version of the coroutine library is available, this target will not + be defined. + + .. note:: + This target has ``cxx_std_20`` as an ``INTERFACE`` + :ref:`compile language standard feature `. Linking + to this target will automatically enable C++20 if no later standard + version is already required on the linking target. + + +.. coro.variables: + +Variables +********* + +.. variable:: CXX_COROUTINE_COMPONENT + + Set to ``Final`` when the :find-component:`coro.Final` version of C++ + coroutine library was found, ``Experimental`` when + the :find-component:`coro.Experimental` version of C++ + coroutine library was found, otherwise not defined. + +.. variable:: CXX_COROUTINE_HAVE_CORO + + Set to ``TRUE`` when a coroutine header was found. + +.. variable:: CXX_COROUTINE_HEADER + + Set to either ``coroutine`` or ``experimental/coroutine`` depending on + whether :find-component:`coro.Final` or :find-component:`coro.Experimental` was + found. + +.. variable:: CXX_COROUTINE_NAMESPACE + + Set to either ``std::coroutine`` or ``std::experimental::coroutine`` + depending on whether :find-component:`coro.Final` or + :find-component:`coro.Experimental` was found. + + +Examples +******** + +Using `find_package(Coroutine)` with no component arguments: + +.. code-block:: cmake + + find_package(Coroutine REQUIRED) + + add_executable(my-program main.cpp) + target_link_libraries(my-program PRIVATE std::coroutine) + + +#]=======================================================================] + + +if(TARGET std::coroutine) + # This module has already been processed. Don't do it again. 
+ return() +endif() include(CMakePushCheckState) include(CheckIncludeFileCXX) include(CheckCXXSourceCompiles) cmake_push_check_state() set(CMAKE_REQUIRED_QUIET ${CXXStdCoroutine_FIND_QUIETLY}) # Normalize and check the component list we were given set(CXXStdCoroutines_want_components ${CXXStdCoroutine_FIND_COMPONENTS}) if(CXXStdCoroutine_FIND_COMPONENTS STREQUAL "") set(CXXStdCoroutines_want_components Final) endif() # Warn on any unrecognized components set(CXXStdCoroutines_extra_components ${CXXStdCoroutines_want_components}) list(REMOVE_ITEM CXXStdCoroutines_extra_components Final Experimental) foreach(component IN LISTS CXXStdCoroutines_extra_components) message(WARNING "Extraneous find_package component for CXXStdCoroutine: ${component}") endforeach() # clang may need to use -stdlib=libc++ to have coroutines # gcc/libstdc++ needs -fcoroutines set(CXXStdCoroutines_find_options "" "-stdlib=libc++" "-fcoroutines") set(CXXStdCoroutines_std_options "" "-std=c++20" "-std=c++2a") set(CXXStdCoroutines_want_components_ordered "${CXXStdCoroutines_want_components}") list(SORT CXXStdCoroutines_want_components_ordered ORDER DESCENDING) # Final before Experimental foreach(component IN LISTS CXXStdCoroutines_want_components_ordered) if(component STREQUAL "Final") set(_coro_header coroutine) set(_coro_namespace std) else() set(_coro_header experimental/coroutine) set(_coro_namespace std::experimental) endif() foreach(option IN LISTS CXXStdCoroutines_find_options) foreach(stdoption IN LISTS CXXStdCoroutines_std_options) cmake_push_check_state() set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${option} ${stdoption}") string(CONFIGURE [[ #include <@_coro_header@> int main() { auto x = @_coro_namespace@::suspend_always{}; return 0; } ]] code @ONLY) check_cxx_source_compiles("${code}" HAVE_USABLE_${_coro_header}) mark_as_advanced(HAVE_USABLE_${_coro_header}) cmake_pop_check_state() if(HAVE_USABLE_${_coro_header}) add_library(std::coroutine INTERFACE IMPORTED GLOBAL) target_compile_features(std::coroutine INTERFACE cxx_std_20) if (option) target_compile_options(std::coroutine INTERFACE "$<$<COMPILE_LANGUAGE:CXX>:${option}>") endif() set(CXX_COROUTINE_COMPONENT "${component}" CACHE STRING "The component of CXXStdCoroutine package found") # break out of this loop break() else() unset(HAVE_USABLE_${_coro_header} CACHE) endif() endforeach() # stdoption if (TARGET std::coroutine) break() endif() endforeach() # option if (TARGET std::coroutine) break() endif() endforeach() # components set(CXX_COROUTINE_HAVE_CORO ${HAVE_USABLE_${_coro_header}} CACHE BOOL "TRUE if we have usable C++ coroutine headers") set(CXX_COROUTINE_HEADER ${_coro_header} CACHE STRING "The header that should be included to obtain the coroutine APIs") set(CXX_COROUTINE_NAMESPACE ${_coro_namespace} CACHE STRING "The C++ namespace that contains the coroutine APIs") cmake_pop_check_state() set(CXXStdCoroutine_FOUND ${HAVE_USABLE_${_coro_header}} CACHE BOOL "TRUE if we have usable C++ coroutine headers" FORCE) if(CXXStdCoroutine_FIND_REQUIRED AND NOT TARGET std::coroutine) message(FATAL_ERROR "Cannot discover std::coroutine headers and/or compile a simple program using std::coroutine") endif() diff --git a/cmake/modules/FindCXXStdExecution.cmake b/cmake/modules/FindCXXStdExecution.cmake index 059368d1f..acb6b1730 100644 --- a/cmake/modules/FindCXXStdExecution.cmake +++ b/cmake/modules/FindCXXStdExecution.cmake @@ -9,8 +9,8 @@
FindCXXStdExecution ################### -This module supports the C++17 standard library's execution utilities. Use the -:imp-target:`std::execution` imported target to +This module supports the C++17 standard library's execution utilities. Link your target to the +:imp-target:`std::execution` imported target to provide the standard C++ execution API. Imported Targets **************** @@ -69,9 +69,6 @@ cmake_push_check_state() set(CMAKE_REQUIRED_QUIET ${CXXStdExecution_FIND_QUIETLY}) -# All of our tests required C++17 or later -set(CMAKE_CXX_STANDARD 17) - set(CXXStdExecution_FOUND FALSE) # We have execution header, but how do we use it? Do link checks @@ -87,7 +84,7 @@ string(CONFIGURE [[ } ]] code @ONLY) -# Try to compile a simple filesystem program without any linker flags +# Try to compile a simple execution program without any linker flags check_cxx_source_compiles("${code}" CXX_EXECUTION_NO_LINK_NEEDED) set(CXXStdExecution_CAN_LINK ${CXX_EXECUTION_NO_LINK_NEEDED}) diff --git a/cmake/modules/FindDPCPP.cmake b/cmake/modules/FindDPCPP.cmake new file mode 100644 index 000000000..42ebe2135 --- /dev/null +++ b/cmake/modules/FindDPCPP.cmake @@ -0,0 +1,43 @@ +if(DPCPP_EXECUTABLE) + get_filename_component( _dpcpp_path ${DPCPP_EXECUTABLE} PATH ) +else(DPCPP_EXECUTABLE) + set(_dpcpp_path "") +endif(DPCPP_EXECUTABLE) +find_program( DPCPP_EXECUTABLE NAMES dpcpp HINTS ${_dpcpp_path} ENV PATH ) +mark_as_advanced( DPCPP_EXECUTABLE ) + +if( DPCPP_EXECUTABLE ) + execute_process(COMMAND ${DPCPP_EXECUTABLE} --version + RESULT_VARIABLE _res + OUTPUT_VARIABLE _out + ERROR_VARIABLE _err) + if( _res EQUAL 0 ) + string(REGEX MATCH "([^\n]+)" _ ${_out}) + message(STATUS "Found dpcpp: ${DPCPP_EXECUTABLE} version ${CMAKE_MATCH_1}") + + get_filename_component( _dpcpp_dir ${DPCPP_EXECUTABLE} DIRECTORY ) + get_filename_component( _dpcpp_lib_dir "${_dpcpp_dir}/../lib" ABSOLUTE ) + + find_library(SYCL_LIBRARY sycl HINTS "${_dpcpp_lib_dir}" "${SYCL_LIBRARY_DIR}" ENV "SYCL_LIBRARY_DIR") + if( SYCL_LIBRARY ) + message(STATUS "SYCL library: ${SYCL_LIBRARY}") + + get_filename_component( _dpcpp_inc_dir "${_dpcpp_dir}/../include" ABSOLUTE ) + find_file(_sycl_include_file "sycl.hpp" HINTS "${_dpcpp_inc_dir}" "${_dpcpp_inc_dir}/sycl/CL" "${SYCL_INCLUDE_DIR}" ENV "SYCL_INCLUDE_DIR") + if(_sycl_include_file) + get_filename_component(SYCL_INCLUDE_DIR "${_sycl_include_file}" DIRECTORY) + message(STATUS "SYCL include dir: ${SYCL_INCLUDE_DIR}") + else(_sycl_include_file) + message(WARNING "Found DPC++ and the SYCL library, but could not find the SYCL include directory. Define SYCL_INCLUDE_DIR to enable DPC++ support") + set(DPCPP_EXECUTABLE OFF) + endif(_sycl_include_file) + else( SYCL_LIBRARY ) + message(WARNING "Found DPC++, but did not find the SYCL library.
Define SYCL_LIBRARY_DIR to enable DPC++ support") + set(DPCPP_EXECUTABLE OFF) + endif( SYCL_LIBRARY ) + else( _res EQUAL 0 ) + message(WARNING "${DPCPP_EXECUTABLE} does not work: 'dpcpp --version' returned ${_res}, with error ${_err}") + message(WARNING "dpc++ support is disabled, set DPCPP_EXECUTABLE to the path of dpcpp to enable DPC++ support, and/or fix your environment to run dpcpp") + set(DPCPP_EXECUTABLE OFF) + endif( _res EQUAL 0 ) +endif( DPCPP_EXECUTABLE ) diff --git a/cmake/modules/FindOrFetchBoost.cmake b/cmake/modules/FindOrFetchBoost.cmake index e1733e925..1e133eee0 100644 --- a/cmake/modules/FindOrFetchBoost.cmake +++ b/cmake/modules/FindOrFetchBoost.cmake @@ -1,50 +1,81 @@ -if (NOT TARGET Boost::boost) - find_package(Boost ${TTG_TRACKED_BOOST_VERSION} QUIET CONFIG OPTIONAL_COMPONENTS serialization) -endif(NOT TARGET Boost::boost) - -if (TARGET Boost::boost) - set(_msg "Found Boost at ${Boost_CONFIG}") - if (TARGET Boost::serialization) - list(APPEND _msg " includes Boost::serialization") - endif(TARGET Boost::serialization) - message(STATUS "${_msg}") - - # Boost::* targets by default are not GLOBAL, so to allow users of TTG to safely use them we need to make them global - # more discussion here: https://gitlab.kitware.com/cmake/cmake/-/issues/17256 - foreach(tgt boost;headers;${Boost_BTAS_DEPS_LIBRARIES}) - if (TARGET Boost::${tgt}) - get_target_property(_boost_tgt_${tgt}_is_imported_global Boost::${tgt} IMPORTED_GLOBAL) - if (NOT _boost_tgt_${tgt}_is_imported_global) - set_target_properties(Boost::${tgt} PROPERTIES IMPORTED_GLOBAL TRUE) - endif() - unset(_boost_tgt_${tgt}_is_imported_global) +# update the Boost version that we can tolerate +if (NOT DEFINED Boost_OLDEST_BOOST_VERSION) + set(Boost_OLDEST_BOOST_VERSION ${TTG_OLDEST_BOOST_VERSION}) +else() + if (${Boost_OLDEST_BOOST_VERSION} VERSION_LESS ${TTG_OLDEST_BOOST_VERSION}) + if (DEFINED CACHE{Boost_OLDEST_BOOST_VERSION}) + set(Boost_OLDEST_BOOST_VERSION "${TTG_OLDEST_BOOST_VERSION}" CACHE STRING "Oldest Boost version to use" FORCE) + else() + set(Boost_OLDEST_BOOST_VERSION ${TTG_OLDEST_BOOST_VERSION}) + endif() endif() - endforeach() +endif() -elseif (TTG_FETCH_BOOST) +# Boost can be discovered by every (sub)package but only the top package can *build* it ... +# in either case must declare the components used by TTG +set(required_components + headers + callable_traits +) +set(optional_components +) +if (TTG_PARSEC_USE_BOOST_SERIALIZATION) + list(APPEND optional_components + serialization + iostreams + ) +endif() - FetchContent_Declare( - CMAKEBOOST - GIT_REPOSITORY https://github.com/Orphis/boost-cmake - ) - FetchContent_MakeAvailable(CMAKEBOOST) - FetchContent_GetProperties(CMAKEBOOST - SOURCE_DIR CMAKEBOOST_SOURCE_DIR - BINARY_DIR CMAKEBOOST_BINARY_DIR - ) +# if not allowed to fetch Boost make all Boost optional +if (NOT DEFINED Boost_FETCH_IF_MISSING AND TTG_FETCH_BOOST) + set(Boost_FETCH_IF_MISSING 1) +endif() +if (NOT Boost_FETCH_IF_MISSING) + foreach(__component IN LISTS required_components) + list(APPEND optional_components + ${__component} + ) + endforeach() + set(required_components ) +endif() - # current boost-cmake/master does not install boost correctly, so warn that installed TTG will not be usable - # boost-cmake/install_rules https://github.com/Orphis/boost-cmake/pull/45 is supposed to fix it but is inactive - message(WARNING "Building Boost from source makes TTG unusable from the install location!
Install Boost using package manager or manually and reconfigure/reinstall TTG to fix this") +if (DEFINED Boost_REQUIRED_COMPONENTS) + list(APPEND Boost_REQUIRED_COMPONENTS + ${required_components}) + list(REMOVE_DUPLICATES Boost_REQUIRED_COMPONENTS) +else() + set(Boost_REQUIRED_COMPONENTS "${required_components}" CACHE STRING "Components of Boost to be discovered or built") +endif() +if (DEFINED Boost_OPTIONAL_COMPONENTS) + list(APPEND Boost_OPTIONAL_COMPONENTS + ${optional_components} + ) + list(REMOVE_DUPLICATES Boost_OPTIONAL_COMPONENTS) +else() + set(Boost_OPTIONAL_COMPONENTS "${optional_components}" CACHE STRING "Optional components of Boost to be discovered or built") +endif() - if (TARGET Boost::serialization AND TARGET Boost_serialization) - install(TARGETS Boost_serialization EXPORT boost) - export(EXPORT boost - FILE "${PROJECT_BINARY_DIR}/boost-targets.cmake") - install(EXPORT boost - FILE "boost-targets.cmake" - DESTINATION "${CMAKE_INSTALL_CMAKEDIR}" - COMPONENT boost-libs) - endif() +# Bring ValeevGroup cmake toolkit, if not yet available +if (NOT DEFINED vg_cmake_kit_SOURCE_DIR) + include(FetchContent) + if (DEFINED PROJECT_BINARY_DIR) + set(VG_CMAKE_KIT_PREFIX_DIR PROJECT_BINARY_DIR) + else () + set(VG_CMAKE_KIT_PREFIX_DIR CMAKE_CURRENT_BINARY_DIR) + endif() + FetchContent_Declare( + vg_cmake_kit + QUIET + GIT_REPOSITORY https://github.com/ValeevGroup/kit-cmake.git + GIT_TAG ${TTG_TRACKED_VG_CMAKE_KIT_TAG} + SOURCE_DIR ${${VG_CMAKE_KIT_PREFIX_DIR}}/cmake/vg + BINARY_DIR ${${VG_CMAKE_KIT_PREFIX_DIR}}/cmake/vg-build + SUBBUILD_DIR ${${VG_CMAKE_KIT_PREFIX_DIR}}/cmake/vg-subbuild + ) + FetchContent_MakeAvailable(vg_cmake_kit) +endif() +include(${vg_cmake_kit_SOURCE_DIR}/modules/FindOrFetchBoost.cmake) +if (TARGET Boost::headers) + set(TTG_HAS_BOOST 1) endif() diff --git a/cmake/modules/FindOrFetchCereal.cmake b/cmake/modules/FindOrFetchCereal.cmake deleted file mode 100644 index b1e6ab1c0..000000000 --- a/cmake/modules/FindOrFetchCereal.cmake +++ /dev/null @@ -1,32 +0,0 @@ -if (NOT TARGET cereal::cereal) - # find_package(cereal ${TTG_TRACKED_CEREAL_VERSION} QUIET) - # homebrew on macos provides cereal-config with version "unknown" - find_package(cereal) - if (cereal_FOUND AND NOT TARGET cereal::cereal) - if (TARGET cereal) - add_library(cereal::cereal ALIAS cereal) - else () - message(FATAL_ERROR "cereal_FOUND=TRUE but no cereal target") - endif() - endif() -endif(NOT TARGET cereal::cereal) - -if (TARGET cereal::cereal) - message(STATUS "Found cereal at ${cereal_CONFIG}") -else (TARGET cereal::cereal) - # going hungry today -endif() - -# fetchcontent is disabled for now -if (FALSE) - FetchContent_Declare( - cereal - GIT_REPOSITORY https://github.com/USCiLab/cereal - GIT_TAG v${TTG_TRACKED_CEREAL_VERSION}) - FetchContent_MakeAvailable(cereal) - FetchContent_GetProperties(cereal - SOURCE_DIR CEREAL_SOURCE_DIR - BINARY_DIR CEREAL_BINARY_DIR - ) - -endif(FALSE) diff --git a/cmake/modules/FindOrFetchMADNESS.cmake b/cmake/modules/FindOrFetchMADNESS.cmake index f112e4ff7..24f0bc798 100644 --- a/cmake/modules/FindOrFetchMADNESS.cmake +++ b/cmake/modules/FindOrFetchMADNESS.cmake @@ -10,7 +10,7 @@ if (NOT TARGET MADworld) set(MADNESS_TASK_BACKEND PaRSEC CACHE STRING "The task backend to use for MADNESS tasks") FetchContent_Declare( MADNESS - GIT_REPOSITORY https://github.com/therault/madness.git + GIT_REPOSITORY https://github.com/m-a-d-n-e-s-s/madness.git GIT_TAG ${TTG_TRACKED_MADNESS_TAG} ) FetchContent_MakeAvailable(MADNESS) diff --git a/cmake/modules/FindOrFetchPARSEC.cmake
b/cmake/modules/FindOrFetchPARSEC.cmake index 82d6ac0d9..7b164019f 100644 --- a/cmake/modules/FindOrFetchPARSEC.cmake +++ b/cmake/modules/FindOrFetchPARSEC.cmake @@ -13,10 +13,11 @@ if (NOT TARGET PaRSEC::parsec) set(CMAKE_SYSTEM_PROCESSOR ${CMAKE_HOST_SYSTEM_PROCESSOR}) set(PARSEC_WITH_DEVEL_HEADERS ON CACHE BOOL "Install PaRSEC headers") set(BUILD_TOOLS ON CACHE BOOL "Do not build PaRSEC tools") + set(PARSEC_GPU_WITH_CUDA ${TTG_ENABLE_CUDA} CACHE BOOL "Enable CUDA support in PaRSEC runtime?") FetchContent_Declare( PARSEC - GIT_REPOSITORY https://github.com/ICLDisco/parsec.git + GIT_REPOSITORY https://github.com/devreal/parsec-1.git GIT_TAG ${TTG_TRACKED_PARSEC_TAG} ) FetchContent_MakeAvailable(PARSEC) diff --git a/cmake/modules/FindOrFetchTiledArray.cmake b/cmake/modules/FindOrFetchTiledArray.cmake new file mode 100644 index 000000000..27ead4479 --- /dev/null +++ b/cmake/modules/FindOrFetchTiledArray.cmake @@ -0,0 +1,56 @@ +if (NOT TARGET tiledarray AND NOT MPQC_BUILD_DEPENDENCIES_FROM_SOURCE) + if(TiledArray_INSTALL_DIR) + set(TiledArray_DIR ${TiledArray_INSTALL_DIR}/lib/cmake/tiledarray) + endif() + find_package(TiledArray CONFIG QUIET COMPONENTS tiledarray) +endif () + +set(TA_PYTHON OFF) + +if (TARGET tiledarray) + message(STATUS "Found TiledArray CONFIG at ${TiledArray_CONFIG}") + + if ((NOT TA_ASSUMES_ASLR_DISABLED AND MPQC_ASSUMES_ASLR_DISABLED) OR (TA_ASSUMES_ASLR_DISABLED AND NOT MPQC_ASSUMES_ASLR_DISABLED)) + message(FATAL_ERROR "Found TiledArray configured with TA_ASSUMES_ASLR_DISABLED=${TA_ASSUMES_ASLR_DISABLED} but MPQC is configured with MPQC_ASSUMES_ASLR_DISABLED=${MPQC_ASSUMES_ASLR_DISABLED}; MPQC_ASSUMES_ASLR_DISABLED and TA_ASSUMES_ASLR_DISABLED should be the same") + endif() + +else (TARGET tiledarray) + + # enable CUDA if TTG has it + set(ENABLE_CUDA ${TTG_HAVE_CUDA} CACHE BOOL "Enable CUDA") + + # update CMake cache for TA + if (DEFINED MADNESS_CMAKE_EXTRA_ARGS) + set(MADNESS_CMAKE_EXTRA_ARGS "${MADNESS_CMAKE_EXTRA_ARGS};-DENABLE_DQ_PREBUF=OFF" CACHE STRING "Extra CMake arguments to MADNESS" FORCE) + else(DEFINED MADNESS_CMAKE_EXTRA_ARGS) + set(MADNESS_CMAKE_EXTRA_ARGS "-DENABLE_DQ_PREBUF=OFF" CACHE STRING "Extra CMake arguments to MADNESS") + endif(DEFINED MADNESS_CMAKE_EXTRA_ARGS) + if (NOT DEFINED TA_ASSUMES_ASLR_DISABLED) + set(TA_ASSUMES_ASLR_DISABLED ${MPQC_ASSUMES_ASLR_DISABLED} CACHE BOOL "TA assumes the Address Space Layout Randomization (ASLR) to be disabled") + endif(NOT DEFINED TA_ASSUMES_ASLR_DISABLED) + + include(FetchContent) + FetchContent_Declare( + TILEDARRAY + GIT_REPOSITORY https://github.com/ValeevGroup/tiledarray.git + GIT_TAG ${TTG_TRACKED_TILEDARRAY_TAG} + ) + FetchContent_MakeAvailable(TILEDARRAY) + FetchContent_GetProperties(TILEDARRAY + SOURCE_DIR TILEDARRAY_SOURCE_DIR + BINARY_DIR TILEDARRAY_BINARY_DIR + ) + # TA includes dependencies that are built manually, not using FetchContent, hence make sure we build them before building any MPQC code + # add_dependencies(deps-mpqc External-tiledarray) + + set(TTG_DOWNLOADED_TILEDARRAY ON CACHE BOOL "Whether TTG downloaded TiledArray") + + include("${TILEDARRAY_BINARY_DIR}/cmake/modules/ReimportTargets.cmake") + if (NOT TARGET MADworld) + message(FATAL_ERROR "did not find re-imported target MADworld") + endif(NOT TARGET MADworld) + + # this is where tiledarray-config.cmake will end up + # must be in sync with the "install(FILES ...tiledarray-config.cmake" statement in https://github.com/ValeevGroup/tiledarray/blob/${MPQC_TRACKED_TILEDARRAY_TAG}/CMakeLists.txt + set(TiledArray_CONFIG 
"${CMAKE_INSTALL_PREFIX}/${TILEDARRAY_INSTALL_CMAKEDIR}" CACHE INTERNAL "The location of installed tiledarray-config.cmake file") +endif(TARGET tiledarray) diff --git a/cmake/modules/Findlevel-zero.cmake b/cmake/modules/Findlevel-zero.cmake new file mode 100644 index 000000000..0a13a32fe --- /dev/null +++ b/cmake/modules/Findlevel-zero.cmake @@ -0,0 +1,81 @@ +if(LEVEL_ZERO_FOUND) + if(TARGET level_zero::ze_loader) + message(STATUS "level-zero is found and TARGET level_zero::ze_loader is defined") + return() + endif(TARGET level_zero::ze_loader) + + # If the user defines LEVEL_ZERO_INCLUDE_DIR and LEVEL_ZERO_LIBRARY_DIR, CMake's find_package declares that LEVEL_ZERO_FOUND is 1, but does not define the target. + check_library_exists("ze_loader" "zeInit" "${LEVEL_ZERO_LIBRARY_DIR}" LEVEL_ZERO_HAVE_ZE_LOADER) + check_include_file("${LEVEL_ZERO_INCLUDE_DIR}/level_zero/ze_api.h" LEVEL_ZERO_HAVE_ZE_API_H) + if(LEVEL_ZERO_HAVE_ZE_LOADER AND LEVEL_ZERO_HAVE_ZE_API_H) + message(STATUS "Defining level_zero::ze_loader target with interface ${LEVEL_ZERO_INCLUDE_DIR}/level_zero/ze_api.h and library ${LEVEL_ZERO_LIBRARY_DIR}/libze_loader.so") + add_library(level_zero::ze_loader UNKNOWN IMPORTED GLOBAL) + set_property(TARGET level_zero::ze_loader PROPERTY IMPORTED_LOCATION "${LEVEL_ZERO_LIBRARY_DIR}/libze_loader.so") + set_property(TARGET level_zero::ze_loader PROPERTY INTERFACE "${LEVEL_ZERO_INCLUDE_DIR}/level_zero/ze_api.h") + include_directories("${LEVEL_ZERO_INCLUDE_DIR}/") + return() + else(LEVEL_ZERO_HAVE_ZE_LOADER AND LEVEL_ZERO_HAVE_ZE_API_H) + if(NOT LEVEL_ZERO_HAVE_ZE_LOADER) + message(STATUS "LEVEL_ZERO_FOUND is set, but could not find ze_loader library in ${LEVEL_ZERO_LIBRARY_DIR}") + endif(NOT LEVEL_ZERO_HAVE_ZE_LOADER) + if(NOT LEVEL_ZERO_HAVE_ZE_API_H) + message(STATUS "LEVEL_ZERO_FOUND is set, but could not find level_zero/ze_api.h in ${LEVEL_ZERO_INCLUDE_DIR}") + endif(NOT LEVEL_ZERO_HAVE_ZE_API_H) + endif(LEVEL_ZERO_HAVE_ZE_LOADER AND LEVEL_ZERO_HAVE_ZE_API_H) +endif(LEVEL_ZERO_FOUND) + +if(LEVEL_ZERO_ROOT_DIR) + message(STATUS "Trying to locate level-zero library and headers under ${LEVEL_ZERO_ROOT_DIR}") + find_library(ZE_LOADER_LIBRARY "ze_loader" HINTS "${LEVEL_ZERO_ROOT_DIR}/lib" "${LEVEL_ZERO_ROOT_DIR}/lib64" NO_DEFAULT_PATH) + find_path (LEVEL_ZERO_INCLUDE_DIR NAMES "level_zero/ze_api.h" PATHS "${LEVEL_ZERO_ROOT_DIR}/include" NO_DEFAULT_PATH) + + if(ZE_LOADER_LIBRARY AND LEVEL_ZERO_INCLUDE_DIR) + get_filename_component(ZE_LOADER_LIBRARY_DIR ${ZE_LOADER_LIBRARY} DIRECTORY) + include(CheckLibraryExists) + check_library_exists("ze_loader" "zeInit" ${ZE_LOADER_LIBRARY_DIR} ZE_LOADER_HAVE_ZEINIT) + + if(ZE_LOADER_HAVE_ZEINIT) + message(STATUS "Found ze_loader library in ${ZE_LOADER_LIBRARY} and level_zero/ze_api.h in ${LEVEL_ZERO_INCLUDE_DIR}") + add_library(level_zero::ze_loader UNKNOWN IMPORTED GLOBAL) + set_property(TARGET level_zero::ze_loader PROPERTY IMPORTED_LOCATION "${ZE_LOADER_LIBRARY}") + set_property(TARGET level_zero::ze_loader PROPERTY INTERFACE "${LEVEL_ZERO_INCLUDE_DIR}/level_zero/ze_api.h") + include_directories("${LEVEL_ZERO_INCLUDE_DIR}/") + set(LEVEL_ZERO_FOUND TRUE) + else(ZE_LOADER_HAVE_ZEINIT) + if(NOT ZE_LOADER_HAVE_ZEINIT) + message(WARNING "Found ze_loader library under ${ZE_LOADER_LIBRARY}, but could not find symbol zeInit in this library -- falling back to package config search") + endif(NOT ZE_LOADER_HAVE_ZEINIT) + endif(ZE_LOADER_HAVE_ZEINIT) + else(ZE_LOADER_LIBRARY AND LEVEL_ZERO_INCLUDE_DIR) + if(NOT ZE_LOADER_LIBRARY) + message(WARNING "Could not find 
ze_loader library under provided LEVEL_ZERO_ROOT_DIR='${LEVEL_ZERO_ROOT_DIR}' (tried subdirectories lib/ and lib64/) -- falling back to package config search") + endif(NOT ZE_LOADER_LIBRARY) + if(NOT LEVEL_ZERO_INCLUDE_DIR) + message(WARNING "Could not find level_zero/ze_api.h under provided LEVEL_ZERO_ROOT_DIR='${LEVEL_ZERO_ROOT_DIR}' (tried subdirectory include) -- falling back to package config search") + endif(NOT LEVEL_ZERO_INCLUDE_DIR) + endif(ZE_LOADER_LIBRARY AND LEVEL_ZERO_INCLUDE_DIR) +endif(LEVEL_ZERO_ROOT_DIR) + +if(NOT LEVEL_ZERO_FOUND) + find_package(PkgConfig QUIET) + + if(PKG_CONFIG_FOUND) + pkg_check_modules(LEVEL_ZERO level-zero) + if(LEVEL_ZERO_FOUND) + pkg_get_variable(LEVEL_ZERO_LIBRARY_DIR level-zero libdir) + pkg_get_variable(LEVEL_ZERO_INCLUDE_DIR level-zero includedir) + # We double-check that the level-zero library is indeed where find_package claims it is + find_library(_PARSEC_ZE_LOADER_LIBRARY_FOUND ze_loader PATHS "${LEVEL_ZERO_LIBRARY_DIR}/libze_loader.so" + NO_DEFAULT_PATH NO_PACKAGE_ROOT_PATH NO_CMAKE_PATH NO_CMAKE_ENVIRONMENT_PATH NO_SYSTEM_ENVIRONMENT_PATH NO_CMAKE_SYSTEM_PATH NO_CMAKE_INSTALL_PREFIX) + if( _PARSEC_ZE_LOADER_LIBRARY_FOUND ) + add_library(level_zero::ze_loader UNKNOWN IMPORTED GLOBAL) + set_property(TARGET level_zero::ze_loader PROPERTY IMPORTED_LOCATION "${LEVEL_ZERO_LIBRARY_DIR}/libze_loader.so") + set_property(TARGET level_zero::ze_loader PROPERTY INTERFACE "${LEVEL_ZERO_INCLUDE_DIR}/level_zero/ze_api.h") + include_directories("${LEVEL_ZERO_INCLUDE_DIR}/") + else() + message(WARNING "level-zero was found in `${LEVEL_ZERO_LIBRARY_DIR}` and `${LEVEL_ZERO_INCLUDE_DIR}` according to pkg-config, but ze_loader library is not in `${LEVEL_ZERO_LIBRARY_DIR}/libze_loader.so`. Deactivating level-zero. Provide -DLEVEL_ZERO_ROOT_DIR to specify the location of level zero manually.") + set(LEVEL_ZERO_FOUND OFF CACHE BOOL "if level-zero was found" FORCE) + endif() + endif(LEVEL_ZERO_FOUND) + endif(PKG_CONFIG_FOUND) +endif(NOT LEVEL_ZERO_FOUND) diff --git a/cmake/ttg-config.cmake.in b/cmake/ttg-config.cmake.in index b00a9c692..82f7c2ae6 100644 --- a/cmake/ttg-config.cmake.in +++ b/cmake/ttg-config.cmake.in @@ -6,14 +6,22 @@ set(TTG_EXT_VERSION "@TTG_EXT_VERSION@") set(PaRSEC_CONFIG "@PaRSEC_CONFIG@") set(MADNESS_CONFIG "@MADNESS_CONFIG@") -set(Boost_CONFIG "@Boost_CONFIG@") +set(CXX_COROUTINE_COMPONENT "@CXX_COROUTINE_COMPONENT@") set(TTG_TRACKED_BOOST_VERSION "@TTG_TRACKED_BOOST_VERSION@") - +set(TTG_HAS_BOOST @TTG_HAS_BOOST@) set(TTG_IGNORE_BUNDLED_EXTERNALS @TTG_IGNORE_BUNDLED_EXTERNALS@) +# make TTG CMake modules discoverable + load AddTTGExecutable by default +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/modules") +include(AddTTGExecutable) + @PACKAGE_INIT@ +if (TTG_HAS_BOOST) +@Boost_CONFIG_FILE_CONTENTS@ +endif(TTG_HAS_BOOST) + if (NOT TARGET MADworld AND MADNESS_CONFIG) get_filename_component(MADNESS_CONFIG_DIR "${MADNESS_CONFIG}" DIRECTORY) find_package(MADNESS 0.10.1 CONFIG QUIET REQUIRED COMPONENTS world PATHS "${MADNESS_CONFIG_DIR}" NO_DEFAULT_PATH) @@ -24,10 +32,9 @@ if (NOT TARGET PaRSEC::parsec AND PaRSEC_CONFIG) find_package(PaRSEC CONFIG QUIET REQUIRED COMPONENTS parsec PATHS "${PaRSEC_CONFIG_DIR}" NO_DEFAULT_PATH) endif() -# N.B.
load Boost -if (NOT TARGET Boost::boost AND Boost_CONFIG) - get_filename_component(Boost_CONFIG_DIR "${Boost_CONFIG}" DIRECTORY) - find_package(Boost ${TTG_TRACKED_BOOST_VERSION} CONFIG QUIET REQUIRED OPTIONAL_COMPONENTS serialization PATHS "${Boost_CONFIG_DIR}" NO_DEFAULT_PATH) +# if C++ coroutines were used, discover the same version of them +if (NOT TARGET std::coroutine AND CXX_COROUTINE_COMPONENT) + find_package(CXXStdCoroutine MODULE QUIET REQUIRED COMPONENTS "${CXX_COROUTINE_COMPONENT}") endif() # Include library IMPORT targets @@ -38,17 +45,4 @@ if(NOT TARGET ttg) endif() endif() -# if don't have Boost, use bundled Boost.CallableTraits -if (NOT TARGET Boost::boost) - if (TTG_IGNORE_BUNDLED_EXTERNALS) - find_package(Boost ${TTG_TRACKED_BOOST_VERSION} QUIET REQUIRED) - else() - target_compile_definitions(ttg INTERFACE TTG_USE_BUNDLED_BOOST_CALLABLE_TRAITS=1) - endif() -endif() - -# load CMake modules -list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/modules") -include(AddTTGExecutable) - set(TTG_FOUND TRUE) diff --git a/doc/dox/config/Doxyfile.in b/doc/dox/config/Doxyfile.in index a1154935d..581310a05 100644 --- a/doc/dox/config/Doxyfile.in +++ b/doc/dox/config/Doxyfile.in @@ -399,7 +399,7 @@ DISTRIBUTE_GROUP_DOC = YES # is disabled and one has to add nested compounds explicitly via \ingroup. # The default value is: NO. -GROUP_NESTED_COMPOUNDS = NO +GROUP_NESTED_COMPOUNDS = YES # Set the SUBGROUPING tag to YES to allow class member groups of the same type # (for instance a group of public functions) to be put as a subgroup of that @@ -2145,7 +2145,7 @@ SEARCH_INCLUDES = YES # preprocessor. # This tag requires that the tag SEARCH_INCLUDES is set to YES. -INCLUDE_PATH = +INCLUDE_PATH = @PROJECT_SOURCE_DIR@/ttg @PROJECT_BINARY_DIR@/ttg # You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard # patterns (like *.h and *.hpp) to filter out the header-files in the diff --git a/doc/dox/contrib/CI-Administration-Notes.md b/doc/dox/contrib/CI-Administration-Notes.md index 32542b970..c5dd9430a 100644 --- a/doc/dox/contrib/CI-Administration-Notes.md +++ b/doc/dox/contrib/CI-Administration-Notes.md @@ -4,7 +4,7 @@ * TTG uses GitHub Actions (GHA) for its CI service * GHA CI configuration is in file `.github/workflows/cmake.yml`. Only Linux and MacOS builds are currently supported. * Unlike earlier CI setups, there is no need to cache TTG prerequisites; default system-wide packages are used for most prerequisites, and the rest is compiled from source every time. -* Doxygen documentation deployment uses a Github token that is defined as variable `GH_TTG_TOKEN` in GHA's TTG repo settings' [secrets](https://github.com/TESSEorg/ttg/settings/secrets/actions). +* Doxygen documentation deployment uses a GitHub token that is defined as variable `GH_TTG_TOKEN` in GHA's TTG repo settings' [secrets](https://github.com/TESSEorg/ttg/settings/secrets/actions). # Debugging GitHub Actions jobs diff --git a/doc/dox/contrib/Design-Device.md b/doc/dox/contrib/Design-Device.md new file mode 100644 index 000000000..2c3c576f3 --- /dev/null +++ b/doc/dox/contrib/Design-Device.md @@ -0,0 +1,175 @@ +# Device Task Design {#Design-Device} + +## problem statement +TTG must be able to execute general user-defined graphs on machines with heterogeneous execution and address spaces, e.g., using multiple processes each having multiple CPU threads + device streams, with each thread/stream preferring or limited to a specific address range.
+ +## key concerns The key issues are how to manage: - the asynchrony of the device programming models, and - the heterogeneity of the address space. + +There are multiple "solutions" to each issue, hence there are many possible designs. I'll discuss each issue first, then outline the aggregate designs we are pursuing. + +### Memory: +- *Unified Memory (UM)*: where available, use a single address space (unified memory visible to both host and device executors; it is also possible to use pinned host memory for device calls) + - pro: simplifies memory management by removing the capacity limitation + - con: still needs user cooperation: all compute data must be allocated on the UM heap; this impacts the design of + user data types, e.g., making them allocator-aware, etc. + - con: the user will likely need to use pooled memory management for efficiency reasons (e.g., TiledArray uses Umpire) + - con: still necessary to provide hints to the kernel driver managing UM. + - con: reasoning about UM driver performance is difficult; its details are opaque and platform-dependent. +- *Device Memory (DM)*: using "native" device memory. + - pro: simpler performance model due to the greatest amount of control (by the runtime) over execution + - pro: can work with stack-capable data types + - con: the amount of device memory is limited, hence this memory must be explicitly managed (akin to how a cache is managed). + +Additional memory-related concerns common to both models: +- only partial state needs to be transferred to/from the device + - which part of the state will differ from algorithm to algorithm, hence encoding/constructing such a representation cannot use constexpr code (such as traits) + - the need for _explicit_ handling of an object's partial state is shared by both models + - UM: such optimization may seem automatic (only the pages of the data actually used on the device are transferred) but in practice the data must be explicitly prefetched, hence partial state transfers are not automatic; furthermore, the unit of UM transfer is a page (4k or more), which is too coarse for many applications + - DM: serialization of an entire object (which can leverage standard RDMA-like serialization); transferring partial state requires explicit annotation + - hence it makes sense to make the representation of an object's partial state (`View`) a first-class concept in both models. + +### Asynchrony +- *Continuations/stages*: decompose tasks into _continuations_ (stages), with runtime-managed scheduling of continuations for managing the asynchrony of the actions initiated by each continuation + - pro: most explicit, easier to reason about, fewest performance implications + - con: most verbose; device-capable tasks look very different from host tasks + - con: limited composability + - difficult to support general computation patterns (e.g.,
generator continuations) +- *"Threads"*: use threads to deal with the asynchrony (in principle could use user-space threads = fibers) + - pro: least host/device dichotomy + - tasks are ordinary (synchronous) functions + - fully composable + - con: performance implications + - due to the need to context switch to "yield" to other tasks + - thus even fully synchronous computations will suffer + - con: asynchrony artifacts still appear + - asynchronous calls must in general be annotated (to force synchronous execution and/or to provide hints to the thread scheduler) +- *"Coroutines"*: use C++20 coroutines + - pro: less host/device dichotomy compared to continuations + - task functions "look" like ordinary functions (and can be made almost like normal functions using macros) but return a custom return object (containing return status + handle to the coroutine) instead of void + - fully composable + - performance implications + - pro: no impact on synchronous tasks + - con: coroutine implementation details are complex and usually involve heap allocation + - pro: custom allocators can be introduced to elide heap allocation (at the cost of limited generality) + - con: asynchrony artifacts still appear + - `co_await` annotates the spots where execution may need to be suspended + - con: less mature due to the need for C++20 + - GCC (10+), LLVM (8+) support coroutines + - TTG and all of its dependencies will be impacted by the raised standard requirement + +### other considerations + +- it's not possible to manage memory from the device code, hence all program logic, including _device-capable_ tasks, must execute on host executors. In principle if we restricted ourselves to a single-source language (SYCL-extended C++) we could write device-capable tasks directly as device code, but current language limitations mandate wrapping everything into host code. +- the runtime is still responsible for managing the executor-space heterogeneity (controlling where to launch a task) and asynchrony (events/host callbacks). + +## Current designs +- *UM+threads*: use UM for memory management + threads for asynchrony +- *DM+stages*: use PaRSEC's device memory pool manager + stage-decomposed tasks +- *?M+coroutines*: UM/DM for memory + C++20 coroutines for handling the asynchrony + +### Example code: threads vs continuations vs coroutines + +How should we map the following host task onto the device? +```cpp +make_tt([](auto& key, auto& data1, auto& data2) -> void { + double data3 = blas::dot(data1.data(), data2.data()); + if (data3 >= 0.) + send<0>(data1); + else + send<0>(data2); +}); +``` + +Ideally the task will receive `data1` and `data2` already transferred to the memory space(s) accessible from the device execution space: +```cpp +make_device_tt([](auto& key, auto& data1, auto& data2) -> void { + double data3 = blas::device_dot(data1.data(), data2.data()); + if (data3 >= 0.) + send<0>(data1); + else + send<0>(data2); +}); +``` +But now `data3` lives in host memory, so in general we must manage its transfer from the device. Hence either: +- all intermediate data must be managed explicitly within the task, or +- user types must be made aware of multiple memory spaces (but this makes the state of such types asynchronous). + +Here are the tentative device versions of this task in each of the 3 approaches (the memory details are omitted).
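All three sketches below manipulate device-side data through `ConstView`/`View` objects and a `sync_views` helper that this note does not define anywhere. Purely for orientation, here is a minimal sketch of the interface the examples appear to assume; every name and semantic detail in it is hypothetical, not an actual TTG API:

```cpp
#include <cstddef>

// Hypothetical creation/synchronization flags (invented for illustration):
// NewView marks data with no device-side copy yet; SyncView_D2H requests a
// device-to-host transfer when the view is synchronized.
enum ViewFlags { DefaultView = 0, NewView = 1, SyncView_D2H = 2 };

// A View names the part of a host object that must be mirrored in device
// memory: a (host pointer, size) pair plus a runtime-managed device copy.
template <typename T>
class View {
 public:
  explicit View(T& host_obj, int flags = DefaultView)
      : host_ptr_(&host_obj), size_(sizeof(T)), flags_(flags) {}

  T* host_ptr() const { return host_ptr_; }
  // valid only after the runtime has performed the host-to-device transfer
  T* device_ptr() const { return device_ptr_; }
  std::size_t size() const { return size_; }
  int flags() const { return flags_; }

 private:
  T* host_ptr_;
  T* device_ptr_ = nullptr;  // filled in by the runtime
  std::size_t size_;
  int flags_;
};

// Read-only variant: the device copy is never synchronized back to the host.
template <typename T>
using ConstView = View<const T>;
```

Under this reading, `sync_views(view1, view2, ...)` would hand the runtime the list of pending transfers (host-to-device, plus device-to-host for `SyncView_D2H` views); the three variants differ only in how the task waits for those transfers to complete.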
+
+#### Threads
+```cpp
+make_tt([](auto& key, auto& data1, auto& data2) -> void {
+  // stage 1
+  ConstView view1(data1);
+  ConstView view2(data2);
+  double data3;
+  View view3(data3, NewView | SyncView_D2H);
+  // depending on the memory model we may need to wait here for the transfers to complete;
+  // the waits could be built into the View ctors, or an explicit await() may be needed
+
+  // stage 2
+  cublasDdot(view1.device_ptr(), view2.device_ptr(), view3.device_ptr());  // schematic; the actual cuBLAS arguments (handle, sizes, increments) are elided
+  // if an async function was called, an explicit await() is needed here
+  // also: who will synchronize view3, and how?
+
+  if (data3 >= 0.)
+    send<0>(data1);
+  else
+    send<0>(data2);
+});
+```
+N.B. `make_tt`: this is a regular task.
+
+#### Continuations
+```cpp
+make_device_tt(
+  // stage 1
+  [](auto& key, auto& data1, auto& data2) {
+    ConstView view1(data1);
+    ConstView view2(data2);
+    double data3;
+    View view3(data3, NewView | SyncView_D2H);
+    return std::make_tuple(view1, view2, view3);
+  },
+  // stage 2
+  [](auto& key, auto& views) {
+    auto& [view1, view2, view3] = views;
+    cublasDdot(view1.device_ptr(), view2.device_ptr(), view3.device_ptr());
+  },
+  // stage 3
+  [](auto& key, auto& views) {
+    auto& [view1, view2, view3] = views;
+    // note: the original data1/data2 are reachable here only via their views
+    if (*view3.host_ptr() >= 0.)
+      send<0>(*view1.host_ptr());
+    else
+      send<0>(*view2.host_ptr());
+  }
+);
+```
+N.B. `make_device_tt` vs `make_tt`: this is a special task.
+
+#### Coroutines
+```cpp
+make_tt([](auto& key, auto& data1, auto& data2) -> ttg::resumable_task {
+  // stage 1
+  ConstView view1(data1);
+  ConstView view2(data2);
+  double data3;
+  View view3(data3, NewView | SyncView_D2H);
+  co_await sync_views(view1, view2, view3);  // creates the list of transfers to be fulfilled by the runtime
+
+  // stage 2
+  cublasDdot(view1.device_ptr(), view2.device_ptr(), view3.device_ptr());
+  co_await sync_views(view3);  // syncs view3; since transfers and kernels execute in different streams, the runtime will sync the kernel stream, then launch the transfers, then resume here
+
+  if (data3 >= 0.)
+    send<0>(data1);
+  else
+    send<0>(data2);
+  co_return;  // processes the sends and destroys the coroutine
+}, ...);
+```
+N.B. `make_tt` and `ttg::resumable_task`: this is a regular task, but with a special return type.
diff --git a/doc/Pull-terminal-design-doc.md b/doc/dox/contrib/Pull-terminal-design-doc.md
similarity index 98%
rename from doc/Pull-terminal-design-doc.md
rename to doc/dox/contrib/Pull-terminal-design-doc.md
index 377c42b22..1af4ed11b 100644
--- a/doc/Pull-terminal-design-doc.md
+++ b/doc/dox/contrib/Pull-terminal-design-doc.md
@@ -1,4 +1,4 @@
-# Pull Terminals - Design Notes
+# Pull Terminals Design Notes {#Design-Pull}
 
 ### Motivation
 
@@ -49,7 +49,3 @@
 - Should Pull Op be able to send data to multiple successors? Use cases?
   - Cholesky
 - why pull ops are needed?
-
-
-
-
diff --git a/doc/dox/contrib/top.md b/doc/dox/contrib/top.md
index 87fd1edd8..3cf224b2f 100644
--- a/doc/dox/contrib/top.md
+++ b/doc/dox/contrib/top.md
@@ -1,7 +1,12 @@
 # Contributor Guide {#contribguide}
-* [TTG Build Intrastructure](@ref TTG-Build-Infrastructure)
+* Development
+  - [TTG Build Infrastructure](@ref TTG-Build-Infrastructure)
+  - [Documenting TTG](@ref Documenting-TTG)
+  - [Recommended Workflow Elements](@ref Recommended-Workflow-Elements)
+  - [Coding Standard](@ref Coding-Standard)
+* Design Notes
+  - [Pull Terminals](@ref Design-Pull)
+  - [Device Tasks](@ref Design-Device)
+* Maintenance
   - [Managing Continuous Integration (CI)](@ref CI-Administration-Notes)
-* [Documenting TTG](@ref Documenting-TTG)
-* [Recommended Workflow Elements](@ref Recommended-Workflow-Elements)
-* [CodingStandard](@ref Coding-Standard)
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 4fc51ce3d..10b808540 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -4,16 +4,57 @@ add_ttg_executable(test test/test.cc)
 add_ttg_executable(t9 t9/t9.cc)
 add_ttg_executable(t9-streaming t9/t9_streaming.cc)
 
-# sparse matmul
-if (TARGET eigen3)
+# sparse matmul needs Eigen ... it's always provided by TA
+if (TARGET tiledarray)
   # MADworld used for MADNESS serialization
-  add_ttg_executable(spmm spmm/spmm.cc LINK_LIBRARIES eigen3)
-  # block-sparse needs BTAS
-  if (TARGET BTAS::BTAS)
-    # since only need to use matrices, limit BTAS_TARGET_MAX_INDEX_RANK to 2
-    add_ttg_executable(bspmm spmm/spmm.cc LINK_LIBRARIES eigen3 BTAS Boost::boost COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2)
-  endif (TARGET BTAS::BTAS)
-endif(TARGET eigen3)
+  add_ttg_executable(spmm spmm/spmm.cc LINK_LIBRARIES TiledArray_Eigen)
+  # block-sparse needs BTAS ... it's always provided by TA
+  # since we only need matrices, limit BTAS_TARGET_MAX_INDEX_RANK to 2
+  add_ttg_executable(bspmm spmm/spmm.cc LINK_LIBRARIES tiledarray TiledArray_Eigen BTAS COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2)
+
+  add_ttg_executable(testing_dpotrf potrf/testing_dpotrf.cc LINK_LIBRARIES tiledarray lapackpp)
+  add_ttg_executable(testing_dtrtri potrf/testing_dtrtri.cc LINK_LIBRARIES tiledarray lapackpp)
+  add_ttg_executable(testing_dlauum potrf/testing_dlauum.cc LINK_LIBRARIES tiledarray lapackpp)
+  add_ttg_executable(testing_dpoinv potrf/testing_dpoinv.cc LINK_LIBRARIES tiledarray lapackpp)
+
+  if (TARGET CUDA::cublas)
+    add_ttg_executable(bspmm-cuda spmm/spmm_cuda.cc
+                       LINK_LIBRARIES tiledarray TiledArray_Eigen BTAS CUDA::cublas
+                       COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2
+                       RUNTIMES "parsec")
+
+    if (TARGET CUDA::cusolver)
+      add_ttg_executable(testing_dpotrf_cuda potrf/testing_dpotrf.cc
+                         LINK_LIBRARIES lapackpp tiledarray CUDA::cublas CUDA::cusolver
+                         COMPILE_DEFINITIONS TTG_ENABLE_CUDA=1 #;DEBUG_TILES_VALUES=1
+                         RUNTIMES "parsec")
+    endif(TARGET CUDA::cusolver)
+  elseif (TARGET roc::hipblas)
+    add_ttg_executable(bspmm-hip spmm/spmm_cuda.cc
+                       LINK_LIBRARIES tiledarray TiledArray_Eigen roc::hipblas
+                       COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2
+                       RUNTIMES "parsec")
+    if (TARGET roc::hipsolver)
+      add_ttg_executable(testing_dpotrf_hip potrf/testing_dpotrf.cc
+                         LINK_LIBRARIES lapackpp tiledarray roc::hipblas roc::hipsolver
+                         COMPILE_DEFINITIONS TTG_ENABLE_HIP=1;DEBUG_TILES_VALUES=1
+                         RUNTIMES "parsec")
+    endif(TARGET roc::hipsolver)
+  elseif (TARGET MKL::MKL_DPCPP)
+    add_ttg_executable(bspmm-lz spmm/spmm_cuda.cc
+                       LINK_LIBRARIES tiledarray TiledArray_Eigen BTAS MKL::MKL_DPCPP level_zero::ze_loader m
+                       COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2
+                       RUNTIMES "parsec")
+  endif()
+
+  if (TTG_HAVE_CUDA)
+    add_ttg_executable(chain-ttg-cuda task-benchmarks/chain-ttg-dev.cc LINK_LIBRARIES tiledarray RUNTIMES "parsec")
+  endif(TTG_HAVE_CUDA)
+
+  if (TTG_HAVE_HIP)
+    add_ttg_executable(chain-ttg-hip task-benchmarks/chain-ttg-dev.cc LINK_LIBRARIES tiledarray RUNTIMES "parsec")
+  endif(TTG_HAVE_HIP)
+endif()
 
 if (TARGET MADworld)
   add_ttg_executable(madness-1d madness/madness-1d/madness-1d.cc RUNTIMES "mad")
@@ -32,11 +73,6 @@ add_ttg_executable(fw-apsp floyd-warshall/floyd_warshall.cc LINK_LIBRARIES MADwo
 add_ttg_executable(helloworld helloworld/helloworld.cpp)
 add_ttg_executable(simplegenerator simplegenerator/simplegenerator.cc RUNTIMES "mad")
 
-add_ttg_executable(testing_dpotrf potrf/testing_dpotrf.cc LINK_LIBRARIES lapackpp)
-add_ttg_executable(testing_dtrtri potrf/testing_dtrtri.cc LINK_LIBRARIES lapackpp)
-add_ttg_executable(testing_dlauum potrf/testing_dlauum.cc LINK_LIBRARIES lapackpp)
-add_ttg_executable(testing_dpoinv potrf/testing_dpoinv.cc LINK_LIBRARIES lapackpp)
-
 if (OpenMP_CXX_FOUND AND TARGET std::execution)
   add_ttg_executable(fw-apsp-df floyd-warshall/floyd_warshall_df.cc LINK_LIBRARIES OpenMP::OpenMP_CXX std::execution MADworld)
 endif ()
@@ -50,3 +86,4 @@ add_ttg_executable(sw sw/sw.cc)
 if (TARGET MADworld)
   add_ttg_executable(randomaccess randomaccess/randomaccess.cc RUNTIMES "mad")
 endif (TARGET MADworld)
+
diff --git a/examples/devblas_helper.h b/examples/devblas_helper.h
new file mode 100644
index 000000000..5f2ad3e89
--- /dev/null
+++ b/examples/devblas_helper.h
@@ -0,0 +1,165 @@
+#pragma once
+
+#include "ttg/config.h"
+
+#include "ttg/device/device.h"
+
+#include <map>
+#include <mutex>
+#include <stdexcept>
+#include <iostream>
+#include <utility>
+
+#ifdef TTG_HAVE_CUDART
+
+#include <cuda_runtime.h>
+#include <cublas_v2.h>
+#include <cusolverDn.h>
+
+/// \brief Returns the cuBLAS handle to be used for launching cuBLAS kernels from the current thread
+/// \return the cuBLAS handle for the current thread
+template <typename T = int>
+inline const cublasHandle_t& cublas_handle(T _ = 0) {
+  using map_type = std::map<std::pair<int, cudaStream_t>, cublasHandle_t>;
+  static map_type handles;
+  static std::mutex handle_mtx;
+
+  auto d = ttg::device::current_device();
+  int device = 0;  // assume 0 if we don't have a device
+  if (d.is_device()) {
+    device = d;
+  }
+
+  cudaStream_t stream = ttg::device::current_stream();
+
+  std::lock_guard<std::mutex> g(handle_mtx);
+  map_type::iterator it;
+  if ((it = handles.find({device, stream})) == handles.end()) {
+    cublasHandle_t handle;
+    auto status = cublasCreate_v2(&handle);
+    if (CUBLAS_STATUS_SUCCESS != status) {
+      std::cerr << "cublasCreate_v2 failed: " << status << std::endl;
+      throw std::runtime_error("cublasCreate_v2 failed");
+    }
+    status = cublasSetStream_v2(handle, stream);
+    if (CUBLAS_STATUS_SUCCESS != status) {
+      std::cerr << "cublasSetStream_v2 failed: " << status << std::endl;
+      throw std::runtime_error("cublasSetStream_v2 failed");
+    }
+    auto [iterator, success] = handles.insert({{device, stream}, handle});
+    it = iterator;
+  }
+  return it->second;
+}
+
+/// \brief Returns the cuSOLVER handle to be used for launching cuSOLVER kernels from the current thread
+/// \return the cuSOLVER handle for the current thread
+template <typename T = int>
+inline const cusolverDnHandle_t& cusolver_handle(T _ = 0) {
+
+  using map_type = std::map<std::pair<int, cudaStream_t>, cusolverDnHandle_t>;
+  static map_type handles;
+  static std::mutex handle_mtx;
+
+  auto d = ttg::device::current_device();
+  int device = 0;  // assume 0 if we don't have a device
+  if (d.is_device()) {
+    device = d;
+  }
+  cudaStream_t stream = ttg::device::current_stream();
+
+  std::lock_guard<std::mutex> g(handle_mtx);
+  map_type::iterator it;
+  if ((it = handles.find({device, stream})) == handles.end()) {
+    cusolverDnHandle_t handle;
+    auto status = cusolverDnCreate(&handle);
+    if (CUSOLVER_STATUS_SUCCESS != status) {
+      std::cerr << "cusolverDnCreate failed: " << status << std::endl;
+      throw std::runtime_error("cusolverDnCreate failed");
+    }
+    status = cusolverDnSetStream(handle, stream);
+    if (CUSOLVER_STATUS_SUCCESS != status) {
+      std::cerr << "cusolverDnSetStream failed: " << status << std::endl;
+      throw std::runtime_error("cusolverDnSetStream failed");
+    }
+
+    auto [iterator, success] = handles.insert({{device, stream}, handle});
+    it = iterator;
+  }
+
+  return it->second;
+}
+#endif // TTG_HAVE_CUDART
+
+#ifdef TTG_HAVE_HIPBLAS
+
+#include <hip/hip_runtime.h>
+#include <hipblas/hipblas.h>
+#include <hipsolver/hipsolver.h>
+
+/// \brief Returns the hipBLAS handle to be used for launching hipBLAS kernels from the current thread
+/// \return the hipBLAS handle for the current thread
+template <typename T = int>
+inline const hipblasHandle_t& hipblas_handle(T _ = 0) {
+  using map_type = std::map<std::pair<int, hipStream_t>, hipblasHandle_t>;
+  static map_type handles;
+  static std::mutex handle_mtx;
+
+  auto d = ttg::device::current_device();
+  int device = 0;  // assume 0 if we don't have a device
+  if (d.is_device()) {
+    device = d;
+  }
+
+  hipStream_t stream = ttg::device::current_stream();
+
+  std::lock_guard<std::mutex> g(handle_mtx);
+  map_type::iterator it;
+  if ((it = handles.find({device, stream})) == handles.end()) {
+    hipblasHandle_t handle;
+    auto status = hipblasCreate(&handle);
+    if (HIPBLAS_STATUS_SUCCESS != status) {
+      throw std::runtime_error("hipblasCreate failed");
+    }
+    status = hipblasSetStream(handle, stream);
+    if (HIPBLAS_STATUS_SUCCESS != status) {
+      throw std::runtime_error("hipblasSetStream failed");
+    }
+    auto [iterator, success] = handles.insert({{device, stream}, handle});
+    it = iterator;
+  }
+
+  return it->second;
+}
+
+/// \brief Returns the hipSOLVER handle to be used for launching hipSOLVER kernels from the current thread
+/// \return the hipSOLVER handle for the current thread
+template <typename T = int>
+inline const hipsolverDnHandle_t& hipsolver_handle(T _ = 0) {
+  using map_type = std::map<std::pair<int, hipStream_t>, hipsolverDnHandle_t>;
+  static map_type handles;
+  static std::mutex handle_mtx;
+  auto d = ttg::device::current_device();
+  int device = 0;  // assume 0 if we don't have a device
+  if (d.is_device()) {
+    device = d;
+  }
+
+  hipStream_t stream = ttg::device::current_stream();
+
+  std::lock_guard<std::mutex> g(handle_mtx);
+  map_type::iterator it;
+  if ((it = handles.find({device, stream})) == handles.end()) {
+    hipsolverDnHandle_t handle;
+    auto status = hipsolverDnCreate(&handle);
+    if (HIPSOLVER_STATUS_SUCCESS != status) {
+      throw std::runtime_error("hipsolverDnCreate failed");
+    }
+    status = hipsolverDnSetStream(handle, stream);
+    if (HIPSOLVER_STATUS_SUCCESS != status) {
+      throw std::runtime_error("hipsolverDnSetStream failed");
+    }
+    auto [iterator, success] = handles.insert({{device, stream}, handle});
+    it = iterator;
+  }
+  return it->second;
+}
+#endif // TTG_HAVE_HIPBLAS
diff --git a/examples/madness/mrattg.cc b/examples/madness/mrattg.cc
index 47bce1ce9..2341ba864 100644
--- a/examples/madness/mrattg.cc
+++ b/examples/madness/mrattg.cc
@@ -124,6 +124,47 @@ auto make_project(functorT& f,
   return ttg::make_tt(F, edges(fuse(refine, ctl)), edges(refine, result), name, {"control"}, {"refine", "result"});
 }
 
+/* below is a preliminary attempt at a device version, needs revisiting */
+#if 0
+/// Returns a std::unique_ptr to the object
+template <typename functorT, typename T, std::size_t K, Dimension NDIM>
+auto make_project_device(functorT& f,
+                         const T thresh,  /// should be a scalar value, not complex
+                         ctlEdge& ctl, rnodeEdge& result, const std::string& name = "project") {
+  auto F = [f, thresh](const Key& key, std::tuple<ctlOut, rnodeOut>& out) {
+    FunctionReconstructedNode node(key);  // Our eventual result
+    auto& coeffs = node.coeffs;  // Need to clean up OO design
+    bool is_leaf;
+
+    if (key.level() < initial_level(f)) {
+      for (auto child : children(key)) ttg::sendk<0>(child, out);
+      coeffs = T(1e7);  // set to an obviously bad value to detect incorrect use
+      is_leaf = false;
+    } else if (is_negligible(f, Domain::template bounding_box(key),
+                             truncate_tol(key, thresh))) {
+      coeffs = T(0.0);
+      is_leaf = true;
+    } else {
+      auto node_view = ttg::make_view(node, ttg::ViewScope::Out);  // no need to move node onto the device
+      auto is_leaf_view = ttg::make_view(is_leaf, ttg::ViewScope::Out);
+      co_await ttg::device::wait_views{};
+      fcoeffs(f, key, thresh,
+              node_view.get_device_ptr<0>(),
+              is_leaf_view.get_device_ptr<0>());  // cannot deduce K
+      co_await ttg::device::wait_kernel{};
+      if (!is_leaf) {
+        for (auto child : children(key)) ttg::sendk<0>(child, out);  // should be broadcast?
+ } + } + node.is_leaf = is_leaf; + ttg::send<1>(key, node, out); // always produce a result + }; + ctlEdge refine("refine"); + return ttg::make_tt(F, edges(fuse(refine, ctl)), edges(refine, result), name, {"control"}, {"refine", "result"}); +} +#endif // 0 + + namespace detail { template struct tree_types {}; diff --git a/examples/matrixtile.h b/examples/matrixtile.h index e23007803..a2080f3ed 100644 --- a/examples/matrixtile.h +++ b/examples/matrixtile.h @@ -3,80 +3,112 @@ #include #include +#include + +#include #include -template -class MatrixTile { + +#include +#if defined(TILEDARRAY_HAS_DEVICE) +#define ALLOCATOR TiledArray::device_pinned_allocator + +inline void allocator_init() { + // initialize MADNESS so that TA allocators can be created +#if defined(TTG_PARSEC_IMPORTED) + madness::ParsecRuntime::initialize_with_existing_context(ttg::default_execution_context().impl().context()); +#endif // TTG_PARSEC_IMPORTED + madness::initialize(argc, argv, /* nthread = */ 1, /* quiet = */ true); +} + +inline void allocator_fini() { + madness::finalize(); +} +#else // TILEDARRAY_HAS_DEVICE +#define ALLOCATOR std::allocator + +inline void allocator_init() { } + +inline void allocator_fini() { } + +#endif // TILEDARRAY_HAS_DEVICE + +template +class MatrixTile : public ttg::TTValue> { public: - using metadata_t = typename std::tuple; + using metadata_t = typename std::tuple; - using pointer_t = typename std::shared_ptr; + using buffer_t = typename ttg::Buffer; + using ttvalue_type = ttg::TTValue>; private: - pointer_t _data; - int _rows = 0, _cols = 0, _lda = 0; + buffer_t _buffer; + std::size_t _rows = 0, _cols = 0, _lda = 0; +#ifdef DEBUG_TILES_VALUES + mutable std::optional _norm; +#endif // DEBUG_TILES_VALUES // (Re)allocate the tile memory void realloc() { // std::cout << "Reallocating new tile" << std::endl; - _data = std::shared_ptr(new T[_lda * _cols], [](T* p) { delete[] p; }); + _buffer.reset(_lda * _cols); +#ifdef DEBUG_TILES_VALUES + std::fill(_buffer.host_ptr(), _lda * _cols, T{}); +#endif // DEBUG_TILES_VALUES } public: MatrixTile() {} - MatrixTile(int rows, int cols, int lda) : _rows(rows), _cols(cols), _lda(lda) { realloc(); } + MatrixTile(std::size_t rows, std::size_t cols, std::size_t lda) + : ttvalue_type() + , _buffer(lda*cols) + , _rows(rows) + , _cols(cols) + , _lda(lda) + { } MatrixTile(const metadata_t& metadata) : MatrixTile(std::get<0>(metadata), std::get<1>(metadata), std::get<2>(metadata)) {} - MatrixTile(int rows, int cols, pointer_t data, int lda) : _data(data), _rows(rows), _cols(cols), _lda(lda) {} - - MatrixTile(const metadata_t& metadata, pointer_t data) + MatrixTile(const metadata_t& metadata, T* data) : MatrixTile(std::get<0>(metadata), std::get<1>(metadata), std::forward(data), std::get<2>(metadata)) {} /** * Constructor with outside memory. The tile will *not* delete this memory * upon destruction. 
*/ - MatrixTile(int rows, int cols, T* data, int lda) : _data(data, [](T*) {}), _rows(rows), _cols(cols), _lda(lda) {} - - MatrixTile(const metadata_t& metadata, T* data) - : MatrixTile(std::get<0>(metadata), std::get<1>(metadata), data, std::get<2>(metadata)) {} - -#if 0 - /* Copy dtor and operator with a static_assert to catch unexpected copying */ - MatrixTile(const MatrixTile& other) { - static_assert("Oops, copy ctor called?!"); - } - - MatrixTile& operator=(const MatrixTile& other) { - static_assert("Oops, copy ctor called?!"); - } -#endif + MatrixTile(std::size_t rows, std::size_t cols, T* data, std::size_t lda) + : ttvalue_type() + , _buffer(data, lda*cols) + , _rows(rows) + , _cols(cols) + , _lda(lda) + { } - MatrixTile(MatrixTile&& other) = default; + MatrixTile(MatrixTile&& other) = default; - MatrixTile& operator=(MatrixTile&& other) = default; + MatrixTile& operator=(MatrixTile&& other) = default; -#if 0 - /* Defaulted copy ctor and op for shallow copies, see comment below */ - MatrixTile(const MatrixTile& other) = default; - - MatrixTile& operator=(const MatrixTile& other) = default; -#endif // 0 /* Deep copy ctor und op are not needed for PO since tiles will never be read * and written concurrently. Hence shallow copies are enough, will all * receiving tasks sharing tile data. Re-enable this once the PaRSEC backend * can handle data sharing without excessive copying */ -#if 1 - MatrixTile(const MatrixTile& other) : _rows(other._rows), _cols(other._cols), _lda(other._lda) { - this->realloc(); + MatrixTile(const MatrixTile& other) + : ttvalue_type() + , _buffer(other._lda*other._cols) + , _rows(other._rows) + , _cols(other._cols) + , _lda(other._lda) +#ifdef DEBUG_TILES_VALUES + , _norm(other._norm) +#endif // DEBUG_TILES_VALUES + { std::copy_n(other.data(), _lda * _cols, this->data()); } - MatrixTile& operator=(const MatrixTile& other) { + MatrixTile& operator=(const MatrixTile& other) { this->_rows = other._rows; this->_cols = other._cols; this->_lda = other._lda; @@ -84,62 +116,71 @@ class MatrixTile { std::copy_n(other.data(), _lda * _cols, this->data()); return *this; } -#endif // 1 void set_metadata(metadata_t meta) { _rows = std::get<0>(meta); _cols = std::get<1>(meta); _lda = std::get<2>(meta); + this->realloc(); } metadata_t get_metadata(void) const { return metadata_t{_rows, _cols, _lda}; } // Accessing the raw data - T* data() { return _data.get(); } + T* data() { return _buffer.host_ptr(); } - const T* data() const { return _data.get(); } + const T* data() const { return _buffer.host_ptr(); } - /// @return shared_ptr to data - pointer_t data_shared() & { return _data; } + size_t size() const { return _cols * _lda; } - /// @return shared_ptr to data - pointer_t data_shared() const& { return _data; } + std::size_t rows() const { return _rows; } - /// yields data and resets this object to a default-constucted state - pointer_t yield_data() && { - pointer_t result = _data; - *this = MatrixTile(); - return std::move(result); - } + std::size_t cols() const { return _cols; } - size_t size() const { return _cols * _lda; } + std::size_t lda() const { return _lda; } - int rows() const { return _rows; } - - int cols() const { return _cols; } + buffer_t& buffer() { + return _buffer; + } - int lda() const { return _lda; } + const buffer_t& buffer() const { + return _buffer; + } auto& fill(T value) { - std::fill(_data.get(), _data.get() + size(), value); + std::fill(data().get(), data().get() + size(), value); + _buffer.set_current_device(0); return *this; } +#ifdef 
DEBUG_TILES_VALUES + /* Only available if debugging is enabled. Norm must be + * set by application and is not computed automatically. */ + T norm() const { + if (!_norm) _norm = blas::nrm2(size(), data(), 1); + return _norm.value(); + } + + void set_norm(T norm) { + _norm = norm; + } +#endif // DEBUG_TILES_VALUES + friend std::ostream& operator<<(std::ostream& o, MatrixTile const& tt) { auto ptr = tt.data(); o << std::endl << " "; o << "MatrixTile<" << typeid(T).name() << ">{ rows=" << tt.rows() << " cols=" << tt.cols() << " ld=" << tt.lda(); -#if DEBUG_TILES_VALUES - o << " data=[ " for (int i = 0; i < tt.rows(); i++) { - for (int j = 0; j < tt.cols(); j++) { +#if DEBUG_TILES_VALUES && 0 + o << " data=[ "; + for (std::size_t i = 0; i < tt.rows(); i++) { + for (std::size_t j = 0; j < tt.cols(); j++) { o << ptr[i + j * tt.lda()] << " "; } o << std::endl << " "; } - o << " ] " + o << " ] "; #endif - o - << " } "; + o << " } "; return o; } }; @@ -171,7 +212,7 @@ namespace madness { template struct ArchiveLoadImpl> { static inline void load(const Archive& ar, MatrixTile& tile) { - int rows, cols, lda; + std::size_t rows, cols, lda; ar >> rows >> cols >> lda; tile = MatrixTile(rows, cols, lda); ar >> wrap(tile.data(), tile.rows() * tile.cols()); // MatrixTile(bm.rows(), bm.cols()); diff --git a/examples/potrf/lauum.h b/examples/potrf/lauum.h index d30c011ce..dba40ef5b 100644 --- a/examples/potrf/lauum.h +++ b/examples/potrf/lauum.h @@ -44,7 +44,7 @@ auto make_lauum(const MatrixT& A, ttg::send<1>(Key2{K, K}, std::move(tile_kk), out); } }; - return ttg::make_tt(f, ttg::edges(input_disp), ttg::edges(to_syrk_C, output_result), "LAUUM", + return ttg::make_tt(f, ttg::edges(input_disp), ttg::edges(to_syrk_C, output_result), "LAUUM", {"tile_kk"}, {"to_syrk_C", "output_result"}); } @@ -85,7 +85,7 @@ auto make_trmm(const MatrixT& A, ttg::send<1>(Key2{K, N}, std::move(tile_kn), out); } }; - return ttg::make_tt(f, ttg::edges(input_kk, input_kn), ttg::edges(to_gemm_C, output_result), "TRMM", + return ttg::make_tt(f, ttg::edges(input_kk, input_kn), ttg::edges(to_gemm_C, output_result), "TRMM", {"tile_kk", "tile_kn"}, {"to_GEMM_C", "output_result"}); } @@ -93,7 +93,7 @@ auto make_trmm(const MatrixT& A, template auto make_syrk(const MatrixT& A, ttg::Edge>& input_kn, // will from the dispatcher - ttg::Edge>& input_nn, + ttg::Edge>& input_nn, ttg::Edge>& to_syrk_nn, ttg::Edge>& output_result) { @@ -116,7 +116,7 @@ auto make_syrk(const MatrixT& A, blas::syrk(blas::Layout::ColMajor, lapack::Uplo::Lower, blas::Op::Trans, - m, k, + m, k, 1.0, tile_kn.data(), tile_kn.rows(), 1.0, tile_nn.data(), tile_nn.rows()); @@ -127,8 +127,8 @@ auto make_syrk(const MatrixT& A, ttg::send<1>(Key2{N, N}, tile_kn, out); } }; - return ttg::make_tt(f, ttg::edges(input_kn, input_nn), - ttg::edges(to_syrk_nn, output_result), "SYRK", + return ttg::make_tt(f, ttg::edges(input_kn, input_nn), + ttg::edges(to_syrk_nn, output_result), "SYRK", {"tile_kn", "tile_nn"}, {"SYRK_nn", "output_result"}); } @@ -162,7 +162,7 @@ auto make_gemm(const MatrixT& A, blas::Op::NoTrans, input_A.rows(), input_B.cols(), input_A.cols(), 1.0, input_A.data(), input_A.rows(), - input_B.data(), input_B.rows(), + input_B.data(), input_B.rows(), 1.0, input_C.data(), input_C.rows()); if(K < A.rows()-1) { @@ -172,8 +172,8 @@ auto make_gemm(const MatrixT& A, ttg::send<1>(Key2{M, N}, std::move(input_C), out); } }; - return ttg::make_tt(f, ttg::edges(input_A, input_B, input_C), - ttg::edges(to_gemm_C, output_result), "GEMM", + return ttg::make_tt(f, ttg::edges(input_A, 
input_B, input_C), + ttg::edges(to_gemm_C, output_result), "GEMM", {"A", "B", "C"}, {"GEMM_C", "output result"}); } @@ -189,7 +189,7 @@ auto make_dispatcher(const MatrixT& A, ttg::Edge>& to_gemm_B) { auto f = [=](const Key2& key, - MatrixTile&&tile, + const MatrixTile& tile, std::tuple>, ttg::Out>, ttg::Out>, @@ -215,7 +215,7 @@ auto make_dispatcher(const MatrixT& A, keylist_trmm_A.push_back(Key2{key[1], n}); } } - } + } if(key[0] > key[1]) { keylist_syrk.reserve(1); if(ttg::tracing()) ttg::print("LAUUM_Dispatch(", key, ") sending to SYRK(", key, ")"); @@ -227,7 +227,7 @@ auto make_dispatcher(const MatrixT& A, keylist_gemm_A.reserve(key[1]); for(int n = 0; n < key[1]; n++) { if(ttg::tracing()) ttg::print("LAUUM_Dispatch(", key, ") sending to A of GEMM(", Key3{key[0], n, key[1]}, ")"); - keylist_gemm_A.push_back(Key3{key[0], n, key[1]}); + keylist_gemm_A.push_back(Key3{key[0], n, key[1]}); } } } @@ -238,10 +238,16 @@ auto make_dispatcher(const MatrixT& A, keylist_gemm_B.push_back(Key3{key[0], key[1], n}); } } - ttg::broadcast<0, 1, 2, 3, 4, 5>(std::make_tuple(std::move(keylist_lauum), std::move(keylist_syrk), std::move(keylist_trmm_A), std::move(keylist_trmm_B), std::move(keylist_gemm_A), std::move(keylist_gemm_B)), std::move(tile), out); + ttg::broadcast<0, 1, 2, 3, 4, 5>(std::make_tuple(std::move(keylist_lauum), + std::move(keylist_syrk), + std::move(keylist_trmm_A), + std::move(keylist_trmm_B), + std::move(keylist_gemm_A), + std::move(keylist_gemm_B)), + tile, out); }; - return ttg::make_tt(f, ttg::edges(input), ttg::edges(to_lauum, to_syrk, to_trmm_A, to_trmm_B, to_gemm_A, to_gemm_B), + return ttg::make_tt(f, ttg::edges(input), ttg::edges(to_lauum, to_syrk, to_trmm_A, to_trmm_B, to_gemm_A, to_gemm_B), "LAUUM Dispatch", {"Input"}, {"LAUUM", "SYRK", "TRMM_A", "TRMM_B", "GEMM_A", "GEMM_B"}); } diff --git a/examples/potrf/plgsy.h b/examples/potrf/plgsy.h index cfbc4192a..a29336712 100644 --- a/examples/potrf/plgsy.h +++ b/examples/potrf/plgsy.h @@ -13,7 +13,8 @@ auto make_plgsy(MatrixT& A, unsigned long bump, unsigned long random_seed, tt if(ttg::tracing()) ttg::print("PLGSY( ", key, ") on rank ", A.rank_of(key[0], key[1])); assert(A.is_local(I, J)); - T *a = A(I, J).data(); + auto tile = A(I, J); + T *a = tile.data(); int tempmm, tempnn, ldam; tempmm = (I==A.rows()-1) ? A.rows_in_matrix()-I*A.rows_in_tile() : A.rows_in_tile(); @@ -22,8 +23,11 @@ auto make_plgsy(MatrixT& A, unsigned long bump, unsigned long random_seed, tt CORE_plgsy((double)bump, tempmm, tempnn, a, ldam, A.rows_in_matrix(), I*A.rows_in_tile(), J*A.cols_in_tile(), random_seed); +#ifdef DEBUG_TILES_VALUES + tile.set_norm(blas::nrm2(tile.size(), a, 1)); +#endif // DEBUG_TILES_VALUES - ttg::send<0>(key, std::move(A(I, J)), out); + ttg::send<0>(key, std::move(tile), out); }; return ttg::make_tt(f, ttg::edges(input), ttg::edges(output), "PLGSY", {"startup"}, {"output"}); diff --git a/examples/potrf/pmw.h b/examples/potrf/pmw.h index 076f1fffc..0f8d75d7d 100644 --- a/examples/potrf/pmw.h +++ b/examples/potrf/pmw.h @@ -49,54 +49,55 @@ class PaRSECMatrixWrapper { //} } - MatrixTile operator()(int row, int col) const { + MatrixTile operator()(std::size_t row, std::size_t col) const { ValueT* ptr = static_cast(parsec_data_copy_get_ptr( parsec_data_get_copy(pm->super.super.data_of(&pm->super.super, row, col), 0))); auto mb = (row < pm->super.mt - 1) ? pm->super.mb : pm->super.m - row * pm->super.mb; auto nb = (col < pm->super.nt - 1) ? 
pm->super.nb : pm->super.n - col * pm->super.nb; - return MatrixTile{mb, nb, ptr, pm->super.mb}; + std::size_t lda = pm->super.mb; + return MatrixTile{mb, nb, ptr, lda}; } /** Number of tiled rows **/ - int rows(void) const { + std::size_t rows(void) const { return pm->super.mt; } /** Number of rows in tile */ - int rows_in_tile(void) const { + std::size_t rows_in_tile(void) const { return pm->super.mb; } /** Number of rows in the matrix */ - int rows_in_matrix(void) const { + std::size_t rows_in_matrix(void) const { return pm->super.m; } /** Number of tiled columns **/ - int cols(void) const { + std::size_t cols(void) const { return pm->super.nt; } /** Number of columns in tile */ - int cols_in_tile(void) const { + std::size_t cols_in_tile(void) const { return pm->super.nb; } /** Number of columns in the matrix */ - int cols_in_matrix(void) const { + std::size_t cols_in_matrix(void) const { return pm->super.n; } /* The rank storing the tile at {row, col} */ - int rank_of(int row, int col) const { + std::size_t rank_of(std::size_t row, std::size_t col) const { return pm->super.super.rank_of(&pm->super.super, row, col); } - bool is_local(int row, int col) const { + bool is_local(std::size_t row, std::size_t col) const { return ttg::default_execution_context().rank() == rank_of(row, col); } - bool in_matrix(int row, int col) const { + bool in_matrix(std::size_t row, std::size_t col) const { return (pm->uplo == PARSEC_MATRIX_LOWER && col <= row) || (pm->uplo == PARSEC_MATRIX_UPPER && col >= row); } @@ -112,8 +113,8 @@ class PaRSECMatrixWrapper { /* Copy entire input matrix (which is local) into a single LAPACK format matrix */ ValueT *getLAPACKMatrix() const { ValueT *ret = new ValueT[rows_in_matrix()*cols_in_matrix()]; - for(auto i = 0; i < rows_in_matrix(); i++) { - for(auto j = 0; j < cols_in_matrix(); j++) { + for(std::size_t i = 0; i < rows_in_matrix(); i++) { + for(std::size_t j = 0; j < cols_in_matrix(); j++) { if( in_matrix(i/rows_in_tile(), j/cols_in_tile()) ) { auto m = i/rows_in_tile(); auto n = j/cols_in_tile(); @@ -136,8 +137,8 @@ using MatrixT = PaRSECMatrixWrapper; static auto make_load_tt(MatrixT &A, ttg::Edge> &toop, bool defer_write) { auto load_tt = ttg::make_tt([&](std::tuple>>& out) { - for(int i = 0; i < A.rows(); i++) { - for(int j = 0; j < A.cols() && A.in_matrix(i, j); j++) { + for(std::size_t i = 0; i < A.rows(); i++) { + for(std::size_t j = 0; j < A.cols() && A.in_matrix(i, j); j++) { if(A.is_local(i, j)) { if(ttg::tracing()) ttg::print("load(", Key2{i, j}, ")"); ttg::send<0>(Key2{i, j}, std::move(A(i, j)), out); @@ -154,9 +155,9 @@ static auto make_load_tt(MatrixT &A, ttg::Edge> static void print_LAPACK_matrix( const double *A, int N, const char *label) { std::cout << label << std::endl; - for(int i = 0; i < N; i++) { + for(std::size_t i = 0; i < N; i++) { std::cout << " "; - for(int j = 0; j < N; j++) { + for(std::size_t j = 0; j < N; j++) { std::cout << std::setw(11) << std::setprecision(5) << A[i+j*N] << " "; } std::cout << std::endl; diff --git a/examples/potrf/potrf.h b/examples/potrf/potrf.h index 6079bc6bc..f6ba6e147 100644 --- a/examples/potrf/potrf.h +++ b/examples/potrf/potrf.h @@ -1,10 +1,29 @@ #pragma once #include +#include #include "lapack.hh" #include "pmw.h" +#include "util.h" +#include "../devblas_helper.h" -#undef DEBUG_TILES_VALUES +#if (defined(TTG_ENABLE_CUDA) || defined(TTG_ENABLE_HIP)) +#define ENABLE_DEVICE_KERNEL 1 +#endif + +#if defined(TTG_HAVE_CUDART) +#define ES ttg::ExecutionSpace::CUDA +#define TASKRET -> ttg::device::Task +#include 
+#elif defined(TTG_HAVE_HIP) +#define ES ttg::ExecutionSpace::HIP +#define TASKRET -> ttg::device::Task +#include +#include +#else +#define ES ttg::ExecutionSpace::Host +#define TASKRET -> void +#endif namespace potrf { @@ -13,6 +32,61 @@ namespace potrf { inline double FADDS_POTRF(double __n) { return (__n * (((1. / 6.) * __n) * __n - (1. / 6.))); } inline double FLOPS_DPOTRF(double __n) { return FMULS_POTRF(__n) + FADDS_POTRF(__n); } +#if defined(ENABLE_DEVICE_KERNEL) + static int device_potrf_workspace_size(MatrixTile &A) { + int Lwork; + #if defined(TTG_HAVE_CUDA) + cusolverDnDpotrf_bufferSize(cusolver_handle(), + CUBLAS_FILL_MODE_LOWER, A.cols(), + nullptr, A.lda(), + &Lwork); + return Lwork; + #elif defined(TTG_HAVE_HIPBLAS) + hipsolverDnDpotrf_bufferSize(hipsolver_handle(), + HIPSOLVER_FILL_MODE_LOWER, A.cols(), + nullptr, A.lda(), + &Lwork); + return Lwork; + #else + return 0; + #endif + } + + static void device_potrf(MatrixTile &A, double *workspace, int Lwork, int *devInfo) { + int device = ttg::device::current_device(); + assert(device >= 0); +#if defined(TTG_HAVE_CUDA) + //std::cout << "POTRF A " << A.buffer().device_ptr_on(device) << " device " << device << " cols " << A.cols() << " lda " << A.lda() << " Lwork " << Lwork << " WS " << workspace << " devInfo " << devInfo << std::endl; + auto handle = cusolver_handle(); + //std::cout << "POTRF handle " << handle << " device " << device << " stream " << ttg::device::current_stream() << std::endl; + cusolverDnDpotrf(handle, + CUBLAS_FILL_MODE_LOWER, A.cols(), + A.buffer().current_device_ptr(), A.lda(), + workspace, Lwork, + devInfo); + #elif defined(TTG_HAVE_HIPBLAS) + hipsolverDpotrf(hipsolver_handle(), + HIPSOLVER_FILL_MODE_LOWER, A.cols(), + A.buffer().current_device_ptr(), A.lda(), + workspace, Lwork, + devInfo); + #endif + } + + static void device_norm(const MatrixTile &A, double *norm) { + auto size = A.size(); + auto buffer = A.buffer().current_device_ptr(); + //std::cout << "device_norm ptr " << buffer << " device " << ttg::device::current_device() << std::endl; +#if defined(TTG_HAVE_CUDA) + auto handle = cublas_handle(); + //double n = 1.0; + cublasDnrm2(handle, size, buffer, 1, norm); + #elif defined(TTG_HAVE_HIPBLAS) + hipblasDnrm2(hipblas_handle(), size, buffer, 1, norm); + #endif + } +#endif // ENABLE_DEVICE_KERNEL + template auto make_potrf(MatrixT& A, ttg::Edge>& input_disp, // from the dispatcher @@ -20,20 +94,115 @@ namespace potrf { ttg::Edge>& output_trsm, ttg::Edge>& output_result) { using T = typename MatrixT::element_type; +#if defined(ENABLE_DEVICE_KERNEL) + auto iallocator = std::make_shared>(); + //std::cout << "Creating CUDA POTRF task " << std::endl; + auto f_dev = [=, iallocator = std::move(iallocator)] + (const Key1& key, MatrixTile&& tile_kk, + std::tuple>, ttg::Out>>& out) TASKRET { + const auto K = key[0]; + + /* compute successors before submitting the kernel + * TODO: this is parsec specific since this code is still executing on the worker threads + */ + std::vector keylist; + keylist.reserve(A.rows() - K); + /* TODO: reverse order of arrays */ + //std::cout << "POTRF K " << K << " A.rows " << A.rows() << std::endl; + for (int m = K + 1; m < A.rows(); ++m) { + /* send tile to trsm */ + //std::cout << "POTRF successor " << Key2(m, K) << std::endl; + keylist.push_back(Key2(m, K)); + } + + /* pull the matrix onto the device, as computing the workspace size might in theory depend on the data */ + //TODO: extend MatrixTile to be heterogeneous-aware. 
Look at spmm-cuda.cc 50-253 + // Need to include a ttg::buffer _data instead of a shared_ptr; + // Check pmw.h: when we generate the MatrixTile + // Also check pinned allocator at the end of DeviceTensor (250-253) + + int Lwork = device_potrf_workspace_size(tile_kk); + + // Instead of using scratch here, we should have hostWS and hostInfo globals and use to_device + // this would reduce the number of I/O operations to devices + double *hostWS = new double[Lwork]; + ttg::devicescratch devWS = ttg::make_scratch(hostWS, ttg::scope::Allocate, Lwork); + int *hostInfo = iallocator->allocate(1); + ttg::devicescratch devInfo = ttg::make_scratch(hostInfo, ttg::scope::Allocate); + + *hostInfo = -32; + +#ifdef DEBUG_TILES_VALUES + std::array norms; + //auto norms_s = ttg::make_scratch(norms.data(), ttg::scope::Allocate, norms.size()); + /* the workspace and the devInfo must be device-level pointers */ + //co_await ttg::to_device(tile_kk.buffer(), devWS, devInfo, norms_s); + co_await ttg::device::select(tile_kk.buffer(), devWS, devInfo); + + /* compute the norm at input */ + static_assert(std::is_same_v, "Norm debugging only implementation for T=double"); + device_norm(tile_kk, &norms[0]); +#else + /* the workspace and the devInfo must be device-level pointers */ + co_await ttg::device::select(tile_kk.buffer(), devWS, devInfo); +#endif // DEBUG_TILES_VALUES + + int device = ttg::device::current_device(); + //std::cout << "POTRF [" << K << "] on " << device << std::endl; + + + //std::cout << "devWS host ptr " << hostWS << " device ptr " << devWS.device_ptr() << " size " << devWS.size() + // << " devInfo host ptr " << hostInfo << " device ptr " << devInfo.device_ptr() << "size " << devInfo.size() << std::endl; + + /* everything is on the device, call the POTRF */ + device_potrf(tile_kk, devWS.device_ptr(), Lwork, devInfo.device_ptr()); + +#ifdef DEBUG_TILES_VALUES + /* compute the norm at input */ + static_assert(std::is_same_v, "Verification only implementation for T=double"); + device_norm(tile_kk, &norms[1]); + /* wait for the kernel to complete */ + co_await ttg::device::wait(devInfo); + // check that we got the input tile we expected + assert(check_norm(tile_kk.norm(), norms[0])); + // set the new norm + tile_kk.set_norm(norms[1]); +#else + /* wait for the kernel to complete */ + co_await ttg::device::wait(devInfo); +#endif // DEBUG_TILES_VALUES + + delete[] hostWS; + int info = *hostInfo; + assert(info == 0); + iallocator->deallocate(hostInfo, 1); + if( info == 0 ) { + co_await ttg::device::forward(ttg::device::broadcast<0, 1>(std::make_tuple(Key2(K, K), std::move(keylist)), std::move(tile_kk), out)); + // Anything after this co_await is never executed + // co_return would look better, but co_return would destroy keylist before the runtime can handle it + } else { + // Well... Here we should interrupt the DAG of tasks, there is an error. Raise? 
+ std::cerr << "Factorization of tile " << K << " failed: " << info << std::endl; + ttg::abort(); + } + }; + return ttg::make_tt(f_dev, ttg::edges(ttg::fuse(input, input_disp)), ttg::edges(output_result, output_trsm), "POTRF", + {"tile_kk/dispatcher"}, {"output_result", "output_trsm"}); +#else /* defined(ENABLE_DEVICE_KERNEL) */ auto f = [=](const Key1& key, MatrixTile&& tile_kk, std::tuple>, ttg::Out>>& out) { const int K = key[0]; if (ttg::tracing()) ttg::print("POTRF(", key, ")"); #if defined(DEBUG_TILES_VALUES) - std::cout << "Before POTRF(" << key << "), A(" << K << ", " << K << ") is " << tile_kk; + //std::cout << "Before POTRF(" << key << "), A(" << K << ", " << K << ") is " << tile_kk; #endif auto info = lapack::potrf(lapack::Uplo::Lower, tile_kk.rows(), tile_kk.data(), tile_kk.lda()); assert(info == 0); #if defined(DEBUG_TILES_VALUES) - std::cout << "After POTRF(" << key << "), A(" << K << ", " << K << ") is " << tile_kk << std::endl; + //std::cout << "After POTRF(" << key << "), A(" << K << ", " << K << ") is " << tile_kk << std::endl; #endif /* send the tile to outputs */ @@ -49,6 +218,7 @@ namespace potrf { }; return ttg::make_tt(f, ttg::edges(ttg::fuse(input, input_disp)), ttg::edges(output_result, output_trsm), "POTRF", {"tile_kk/dispatcher"}, {"output_result", "output_trsm"}); +#endif // defined(ENABLE_DEVICE_KERNEL) } template @@ -61,6 +231,99 @@ namespace potrf { ttg::Edge>& output_col, // to GEMM ttg::Edge>& output_result) { using T = typename MatrixT::element_type; +#if defined(ENABLE_DEVICE_KERNEL) + auto f = [=](const Key2& key, const MatrixTile& tile_kk, MatrixTile&& tile_mk, + std::tuple>, ttg::Out>, ttg::Out>, + ttg::Out>>& out) TASKRET { + const int M = key[0]; + const int K = key[1]; // the column equals the outer most look K (same as PO) + + auto mb = tile_mk.rows(); + auto nb = tile_mk.cols(); + + /* in trsm, tile_mk is mb x nb, and tile_kk needs to be lda x nb because side = Right */ + assert(nb == tile_kk.rows()); + + if (ttg::tracing()) ttg::print("TRSM(", key, ")"); + + /* populate successor keys while we're on the worker thread */ + std::vector keylist_row; + keylist_row.reserve(M - K); + std::vector keylist_col; + keylist_col.reserve(A.rows() - M - 1); + + /* send tile to syrk on diagonal */ + if (ttg::tracing()) ttg::print("TRSM(", key, "): sending output to syrk(", Key2{K, M}, ")"); + + /* send the tile to all gemms across in row i */ + for (int n = K + 1; n < M; ++n) { + if (ttg::tracing()) ttg::print("TRSM(", key, "): sending output to gemm( ", Key3{M, n, K}, ")"); + keylist_row.push_back(Key3(M, n, K)); + } + + /* send the tile to all gemms down in column i */ + for (int m = M + 1; m < A.rows(); ++m) { + if (ttg::tracing()) ttg::print("TRSM(", key, "): sending output to gemm( ", Key3{m, M, K}, ")"); + keylist_col.push_back(Key3(m, M, K)); + } + + +#ifdef DEBUG_TILES_VALUES + std::array norms; // input for tile_kk & tile_mk and output + //auto norms_s = ttg::make_scratch(norms.data(), ttg::scope::Allocate, norms.size()); + co_await ttg::device::select(tile_kk.buffer(), tile_mk.buffer()); +#else + co_await ttg::device::select(tile_kk.buffer(), tile_mk.buffer()); +#endif // DEBUG_TILES_VALUES + + int device = ttg::device::current_device(); + double alpha = 1.0; + +#ifdef DEBUG_TILES_VALUES + /* compute the norms at input */ + device_norm(tile_kk, &norms[0]); + device_norm(tile_mk, &norms[1]); +#endif // DEBUG_TILES_VALUES + + + //std::cout << "TRSM [" << K << ", " << M << "] on " << device << std::endl; + +#if defined(TTG_HAVE_CUDA) + 
cublasDtrsm(cublas_handle(), + CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, + CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, + mb, nb, &alpha, + tile_kk.buffer().current_device_ptr(), tile_kk.lda(), + tile_mk.buffer().current_device_ptr(), tile_mk.lda()); +#elif defined(TTG_HAVE_HIPBLAS) + hipblasDtrsm(hipblas_handle(), + HIPBLAS_SIDE_RIGHT, HIPBLAS_FILL_MODE_LOWER, + HIPBLAS_OP_T, HIPBLAS_DIAG_NON_UNIT, + mb, nb, &alpha, + tile_kk.buffer().current_device_ptr(), tile_kk.lda(), + tile_mk.buffer().current_device_ptr(), tile_mk.lda()); + +#endif + +#ifdef DEBUG_TILES_VALUES + /* compute the norms at input */ + device_norm(tile_mk, &norms[2]); + /* wait for the kernel to complete */ + co_await ttg::device::wait(); + // check that we got the input tiles we expected + assert(check_norm(tile_kk.norm(), norms[0])); + assert(check_norm(tile_mk.norm(), norms[1])); + // set the new norm + tile_mk.set_norm(norms[2]); +#endif // DEBUG_TILES_VALUES + + co_await ttg::device::forward(ttg::device::broadcast<0, 1, 2, 3>(std::make_tuple(key, Key2(K, M), keylist_row, keylist_col), + std::move(tile_mk), out)); + }; + return ttg::make_tt(f, ttg::edges(input_kk, ttg::fuse(input_mk, input_disp)), + ttg::edges(output_result, output_diag, output_row, output_col), "TRSM", + {"tile_kk", "tile_mk/dispatcher"}, {"output_result", "tile_mk", "output_row", "output_col"}); +#else // defined(ENABLE_DEVICE_KERNEL) auto f = [=](const Key2& key, const MatrixTile& tile_kk, MatrixTile&& tile_mk, std::tuple>, ttg::Out>, ttg::Out>, ttg::Out>>& out) { @@ -75,15 +338,15 @@ namespace potrf { if (ttg::tracing()) ttg::print("TRSM(", key, ")"); #if defined(DEBUG_TILES_VALUES) - std::cout << "Before TRSM(" << key << "), A(" << K << ", " << K << ") is " << tile_kk << " and A(" << M << ", " - << K << ") is " << tile_mk; + //std::cout << "Before TRSM(" << key << "), A(" << K << ", " << K << ") is " << tile_kk << " and A(" << M << ", " + // << K << ") is " << tile_mk; #endif blas::trsm(blas::Layout::ColMajor, blas::Side::Right, lapack::Uplo::Lower, blas::Op::Trans, blas::Diag::NonUnit, mb, nb, 1.0, tile_kk.data(), tile_kk.lda(), tile_mk.data(), tile_mk.lda()); #if defined(DEBUG_TILES_VALUES) - std::cout << "After TRSM(" << key << "), A(" << K << ", " << K << ") is " << tile_mk << std::endl; + //std::cout << "After TRSM(" << key << "), A(" << K << ", " << K << ") is " << tile_mk << std::endl; #endif std::vector keylist_row; @@ -111,6 +374,7 @@ namespace potrf { return ttg::make_tt(f, ttg::edges(input_kk, ttg::fuse(input_mk, input_disp)), ttg::edges(output_result, output_diag, output_row, output_col), "TRSM", {"tile_kk", "tile_mk/dispatcher"}, {"output_result", "tile_mk", "output_row", "output_col"}); +#endif // defined(ENABLE_DEVICE_KERNEL) } template @@ -121,6 +385,80 @@ namespace potrf { ttg::Edge>& output_potrf, // to POTRF ttg::Edge>& output_syrk) { using T = typename MatrixT::element_type; +#if defined(ENABLE_DEVICE_KERNEL) + auto f = [=](const Key2& key, const MatrixTile& tile_mk, MatrixTile&& tile_kk, + std::tuple>, ttg::Out>>& out) TASKRET { + const int K = key[0]; + const int M = key[1]; + + /* tile_kk is mb x mb and tile_mk is mb x nb */ + assert(tile_kk.rows() == tile_kk.cols()); + assert(tile_mk.rows() == tile_kk.rows()); + + auto mb = tile_mk.rows(); + auto nb = tile_mk.cols(); + + if (ttg::tracing()) ttg::print("SYRK(", key, ")"); + +#ifdef DEBUG_TILES_VALUES + std::array norms; // input for tile_kk & tile_mk and output + //auto norms_s = ttg::make_scratch(norms.data(), ttg::scope::Allocate, norms.size()); + co_await 
ttg::device::select(tile_kk.buffer(), tile_mk.buffer()); + /* compute the norms at input */ + device_norm(tile_mk, &norms[0]); + device_norm(tile_kk, &norms[1]); +#else + co_await ttg::device::select(tile_kk.buffer(), tile_mk.buffer()); +#endif // DEBUG_TILES_VALUES + + int device = ttg::device::current_device(); + + double alpha = -1.0; + double beta = 1.0; + + //std::cout << "SYRK [" << K << ", " << M << "] on " << device << std::endl; + +#if defined(TTG_HAVE_CUDA) + cublasDsyrk(cublas_handle(), + CUBLAS_FILL_MODE_LOWER, + CUBLAS_OP_N, + mb, nb, &alpha, + tile_mk.buffer().current_device_ptr(), tile_mk.lda(), &beta, + tile_kk.buffer().current_device_ptr(), tile_kk.lda()); +#elif defined(TTG_HAVE_HIPBLAS) + hipblasDsyrk(hipblas_handle(), + HIPBLAS_FILL_MODE_LOWER, + HIPBLAS_OP_N, + mb, nb, &alpha, + tile_mk.buffer().current_device_ptr(), tile_mk.lda(), &beta, + tile_kk.buffer().current_device_ptr(), tile_kk.lda()); +#endif + +#ifdef DEBUG_TILES_VALUES + /* compute the norm at output */ + device_norm(tile_kk, &norms[2]); + /* wait for the kernel to complete */ + co_await ttg::device::wait(); + // check that we got the input tiles we expected + assert(check_norm(tile_mk.norm(), norms[0])); + assert(check_norm(tile_kk.norm(), norms[1])); + // set the new norm + tile_kk.set_norm(norms[2]); +#endif // DEBUG_TILES_VALUES + + if (M == K + 1) { + /* send the tile to potrf */ + if (ttg::tracing()) ttg::print("SYRK(", key, "): sending output to POTRF(", Key1{K + 1}, ")"); + co_await ttg::device::send<0>(Key1(K + 1), std::move(tile_kk), out); + } else { + /* send output to next syrk */ + if (ttg::tracing()) ttg::print("SYRK(", key, "): sending output to SYRK(", Key2{K + 1, M}, ")"); + co_await ttg::device::send<1>(Key2(K + 1, M), std::move(tile_kk), out); + } + }; + return ttg::make_tt(f, ttg::edges(input_mk, ttg::fuse(input_kk, input_disp)), ttg::edges(output_potrf, output_syrk), + "SYRK", {"tile_mk", "tile_kk/dispatcher"}, {"output_potrf", "output_syrk"}); +#else // defined(ENABLE_DEVICE_KERNEL) auto f = [=](const Key2& key, const MatrixTile& tile_mk, MatrixTile&& tile_kk, std::tuple>, ttg::Out>>& out) { const int K = key[0]; @@ -135,15 +473,15 @@ namespace potrf { if (ttg::tracing()) ttg::print("SYRK(", key, ")"); #if defined(DEBUG_TILES_VALUES) - std::cout << "Before SYRK(" << key << "), A(" << M << ", " << K << ") is " << tile_mk << " and A(" << K << ", " - << K << ") is " << tile_kk; + //std::cout << "Before SYRK(" << key << "), A(" << M << ", " << K << ") is " << tile_mk << " and A(" << K << ", " + // << K << ") is " << tile_kk; #endif blas::syrk(blas::Layout::ColMajor, lapack::Uplo::Lower, blas::Op::NoTrans, mb, nb, -1.0, tile_mk.data(), tile_mk.lda(), 1.0, tile_kk.data(), tile_kk.lda()); #if defined(DEBUG_TILES_VALUES) - std::cout << "After SYRK(" << key << "), A(" << K << ", " << K << ") is " << tile_kk << std::endl; + //std::cout << "After SYRK(" << key << "), A(" << K << ", " << K << ") is " << tile_kk << std::endl; #endif if (M == K + 1) { @@ -158,7 +496,8 @@ namespace potrf { }; return ttg::make_tt(f, ttg::edges(input_mk, ttg::fuse(input_kk, input_disp)), ttg::edges(output_potrf, output_syrk), "SYRK", {"tile_mk", "tile_kk/dispatcher"}, {"output_potrf", "output_syrk"}); - } +#endif + } // defined(ENABLE_DEVICE_KERNEL) template auto make_gemm(MatrixT& A, @@ -169,6 +508,87 @@ namespace potrf { ttg::Edge>& output_trsm, // to TRSM ttg::Edge>& output_gemm) { using T = typename MatrixT::element_type; +#if defined(ENABLE_DEVICE_KERNEL) + auto f = [=](const Key3& key, const MatrixTile& tile_mk, 
const MatrixTile& tile_nk, MatrixTile&& tile_mn, + std::tuple>, ttg::Out>>& out) TASKRET { + const int M = key[0]; + const int N = key[1]; + const int K = key[2]; + assert(M != N && M > K && N > K); + + assert(tile_mk.cols() == tile_nk.cols()); + assert(tile_mk.rows() == tile_mn.rows()); + assert(tile_nk.rows() == tile_mn.cols()); + + if (ttg::tracing()) ttg::print("GEMM(", key, ")"); +#if defined(DEBUG_TILES_VALUES) && 0 + //std::cout << "Before GEMM(" << key << "), A(" << M << ", " << K << ") is " << tile_mk << " and A(" << K << ", " + // << N << ") is " << tile_nk << " and A(" << M << ", " << N << ") is " << tile_mn; +#endif + +#ifdef DEBUG_TILES_VALUES + std::array norms; // input for tile_mk & tile_nk & tile_mn and output + //auto norms_s = ttg::make_scratch(norms.data(), ttg::scope::Allocate, norms.size()); + co_await ttg::device::select(tile_mk.buffer(), tile_nk.buffer(), tile_mn.buffer()); + + /* compute the norms at input */ + device_norm(tile_mk, &norms[0]); + device_norm(tile_nk, &norms[1]); + device_norm(tile_mn, &norms[2]); +#else + co_await ttg::device::select(tile_mk.buffer(), tile_nk.buffer(), tile_mn.buffer()); +#endif // DEBUG_TILES_VALUES + + int device = ttg::device::current_device(); + double alpha = -1.0; + double beta = 1.0; + +#if defined(TTG_HAVE_CUDA) + cublasDgemm(cublas_handle(), + CUBLAS_OP_N, CUBLAS_OP_T, + tile_mk.rows(), tile_nk.rows(), + tile_nk.cols(), &alpha, + tile_mk.buffer().current_device_ptr(), tile_mk.lda(), + tile_nk.buffer().current_device_ptr(), tile_nk.lda(), &beta, + tile_mn.buffer().current_device_ptr(), tile_mn.lda()); +#elif defined(TTG_HAVE_HIPBLAS) + hipblasDgemm(hipblas_handle(), + HIPBLAS_OP_N, HIPBLAS_OP_T, + tile_mk.rows(), tile_nk.rows(), + tile_nk.cols(), &alpha, + tile_mk.buffer().current_device_ptr(), tile_mk.lda(), + tile_nk.buffer().current_device_ptr(), tile_nk.lda(), &beta, + tile_mn.buffer().current_device_ptr(), tile_mn.lda()); +#endif + + +#ifdef DEBUG_TILES_VALUES + /* compute the norm at output */ + device_norm(tile_mn, &norms[3]); + /* wait for the kernel to complete */ + co_await ttg::device::wait(); + // check that we got the input tiles we expected + assert(check_norm(tile_mk.norm(), norms[0])); + assert(check_norm(tile_nk.norm(), norms[1])); + assert(check_norm(tile_mn.norm(), norms[2])); + // set the new norm + tile_mn.set_norm(norms[3]); +#endif // DEBUG_TILES_VALUES + + if (N == K + 1) { + /* send the tile to trsm */ + if (ttg::tracing()) ttg::print("GEMM(", key, "): sending output to TRSM(", Key2{M, N}, ")"); + co_await ttg::device::send<0>(Key2(M, N), std::move(tile_mn), out); + } else { + /* send the tile to the next gemm */ + if (ttg::tracing()) ttg::print("GEMM(", key, "): sending output to GEMM(", Key3{M, N, K + 1}, ")"); + co_await ttg::device::send<1>(Key3(M, N, K + 1), std::move(tile_mn), out); + } + }; + return ttg::make_tt(f, ttg::edges(input_mk, input_nk, ttg::fuse(input_disp, input_mn)), + ttg::edges(output_trsm, output_gemm), "GEMM", {"input_mk", "input_kn", "input_mn/dispatcher"}, + {"output_trsm", "outout_gemm"}); +#else // defined(ENABLE_DEVICE_KERNEL) auto f = [=](const Key3& key, const MatrixTile& tile_mk, const MatrixTile& tile_nk, MatrixTile&& tile_mn, std::tuple>, ttg::Out>>& out) { const int M = key[0]; @@ -182,8 +602,8 @@ namespace potrf { if (ttg::tracing()) ttg::print("GEMM(", key, ")"); #if defined(DEBUG_TILES_VALUES) - std::cout << "Before GEMM(" << key << "), A(" << M << ", " << K << ") is " << tile_mk << " and A(" << K << ", " - << N << ") is " << tile_nk << " and A(" << M << ", " << N 
<< ") is " << tile_mn; + //std::cout << "Before GEMM(" << key << "), A(" << M << ", " << K << ") is " << tile_mk << " and A(" << K << ", " + // << N << ") is " << tile_nk << " and A(" << M << ", " << N << ") is " << tile_mn; #endif blas::gemm(blas::Layout::ColMajor, blas::Op::NoTrans, blas::Op::Trans, tile_mk.rows(), tile_nk.rows(), @@ -191,7 +611,7 @@ namespace potrf { tile_mn.data(), tile_mn.lda()); #if defined(DEBUG_TILES_VALUES) - std::cout << "After GEMM(" << key << "), A(" << M << ", " << N << ") is " << tile_mn << std::endl; + //std::cout << "After GEMM(" << key << "), A(" << M << ", " << N << ") is " << tile_mn << std::endl; #endif if (N == K + 1) { @@ -207,26 +627,27 @@ namespace potrf { return ttg::make_tt(f, ttg::edges(input_mk, input_nk, ttg::fuse(input_disp, input_mn)), ttg::edges(output_trsm, output_gemm), "GEMM", {"input_mk", "input_kn", "input_mn/dispatcher"}, {"output_trsm", "outout_gemm"}); +#endif // defined(ENABLE_DEVICE_KERNEL) } template auto make_dispatcher(ttg::Edge>& input, ttg::Edge>& to_potrf, ttg::Edge>& to_trsm, ttg::Edge>& to_syrk, ttg::Edge>& to_gemm) { - auto f = [=](const Key2& key, MatrixTile&& tile, + auto f = [=](const Key2& key, const MatrixTile& tile, std::tuple>, ttg::Out>, ttg::Out>, ttg::Out>>& out) { if (ttg::tracing()) ttg::print("POTRF_Dispatch(", key, ")"); if (0 == key[0] && 0 == key[1]) { // First element goes to POTRF if (ttg::tracing()) ttg::print("POTRF_Dispatch(", key, ") sending to POTRF(", Key1{key[0]}, ")"); - ttg::send<0>(Key1{key[0]}, std::move(tile), out); + ttg::send<0>(Key1{key[0]}, tile, out); return; } if (key[0] == key[1]) { // Other diagonal elements go to SYRK if (ttg::tracing()) ttg::print("POTRF_Dispatch(", key, ") sending to SYRK(", Key2{0, key[0]}, ")"); - ttg::send<2>(Key2{0, key[0]}, std::move(tile), out); + ttg::send<2>(Key2{0, key[0]}, tile, out); return; } // We only consider the lower triangular @@ -234,12 +655,12 @@ namespace potrf { if (0 == key[1]) { // First column goes to TRSM if (ttg::tracing()) ttg::print("POTRF_Dispatch(", key, ") sending to TRSM(", key, ")"); - ttg::send<1>(key, std::move(tile), out); + ttg::send<1>(key, tile, out); return; } // Rest goes to GEMM if (ttg::tracing()) ttg::print("POTRF_Dispatch(", key, ") sending to GEMM(", Key3{key[0], key[1], 0}, ")"); - ttg::send<3>(Key3{key[0], key[1], 0}, std::move(tile), out); + ttg::send<3>(Key3{key[0], key[1], 0}, tile, out); }; return ttg::make_tt(f, ttg::edges(input), ttg::edges(to_potrf, to_trsm, to_syrk, to_gemm), "POTRF Dispatch", diff --git a/examples/potrf/result.h b/examples/potrf/result.h index caafd5795..257f1fd5a 100644 --- a/examples/potrf/result.h +++ b/examples/potrf/result.h @@ -2,6 +2,7 @@ #include #include "pmw.h" +#include "util.h" template auto make_result(MatrixT& A, ttg::Edge>& result) { @@ -10,9 +11,14 @@ auto make_result(MatrixT& A, ttg::Edge>& result) { const int I = key[0]; const int J = key[1]; if (ttg::tracing()) ttg::print("RESULT( ", key, ") on rank ", A.rank_of(key[0], key[1])); - if (A(I, J).data() != tile.data()) { +#if defined(DEBUG_TILES_VALUES) + T norm = blas::nrm2(tile.size(), tile.data(), 1); + assert(check_norm(norm, tile.norm())); +#endif //defined(DEBUG_TILES_VALUES) + auto atile = A(I, J); + if (atile.data() != tile.data()) { if (ttg::tracing()) ttg::print("Writing back tile {", I, ",", J, "} "); - std::copy_n(tile.data(), tile.rows() * tile.cols(), A(I, J).data()); + std::copy_n(tile.data(), tile.rows() * tile.cols(), atile.data()); } #ifdef TTG_USE_USER_TERMDET if (I == A.cols() - 1 && J == A.rows() - 1) { 
diff --git a/examples/potrf/testing_dlauum.cc b/examples/potrf/testing_dlauum.cc index c25e36763..df87c63a6 100644 --- a/examples/potrf/testing_dlauum.cc +++ b/examples/potrf/testing_dlauum.cc @@ -6,6 +6,7 @@ #endif // TTG_USE_PARSEC #include +#include #include "lauum.h" #include "plgsy.h" @@ -57,6 +58,9 @@ int main(int argc, char **argv) ttg::initialize(argc, argv, nthreads); + /* set up TA to get the allocator */ + allocator_init(); + auto world = ttg::default_execution_context(); if( prof_filename != nullptr ) { @@ -139,6 +143,39 @@ int main(int argc, char **argv) world.dag_off(); world.profile_off(); + allocator_fini(); ttg::finalize(); return ret; } + +static void +dplasma_dprint_tile( int m, int n, + const parsec_tiled_matrix_t* descA, + const double *M ) +{ + int tempmm = ( m == descA->mt-1 ) ? descA->m - m*descA->mb : descA->mb; + int tempnn = ( n == descA->nt-1 ) ? descA->n - n*descA->nb : descA->nb; + int ldam = BLKLDD( descA, m ); + + int ii, jj; + + fflush(stdout); + for(ii=0; ii +#include #include "plgsy.h" #include "pmw.h" @@ -93,10 +94,13 @@ int main(int argc, char **argv) ttg::initialize(ttg_argc, ttg_argv, nthreads); delete[] ttg_argv; + /* set up TA to get the allocator */ + allocator_init(); + ttg::trace_on(); auto world = ttg::default_execution_context(); - + if(nullptr != prof_filename) { world.profile_on(); world.dag_on(prof_filename); @@ -117,12 +121,12 @@ int main(int argc, char **argv) parsec_matrix_sym_block_cyclic_t dcA; parsec_matrix_sym_block_cyclic_init(&dcA, parsec_matrix_type_t::PARSEC_MATRIX_DOUBLE, - world.rank(), - NB, NB, + world.rank(), + NB, NB, + N, M, + 0, 0, N, M, - 0, 0, - N, M, - P, Q, + P, Q, PARSEC_MATRIX_LOWER); dcA.mat = parsec_data_allocate((size_t)dcA.super.nb_local_tiles * (size_t)dcA.super.bsiz * @@ -247,15 +251,15 @@ int main(int argc, char **argv) end = endpotri; auto elapsed = (std::chrono::duration_cast(endpotrf - begpotrf).count()); - std::cout << "POINV (POTRF+POTRI) (" << (defer_cow_hint ? "with" : "without") << " defer writer) -- N= " << N << " NB= " << NB << " P= " << P << " Q= " << Q << " nthreads= " << nthreads + std::cout << "POINV (POTRF+POTRI) (" << (defer_cow_hint ? "with" : "without") << " defer writer) -- N= " << N << " NB= " << NB << " P= " << P << " Q= " << Q << " nthreads= " << nthreads << " POTRF TTG Execution Time (milliseconds) : " << elapsed / 1E3 << " : Flops " << (potrf::FLOPS_DPOTRF(N)) << " " << (potrf::FLOPS_DPOTRF(N)/1e9)/(elapsed/1e6) << " GF/s" << std::endl; - + elapsed = (std::chrono::duration_cast(endpotri - begpotri).count()); - std::cout << "POINV (POTRF+POTRI) (" << (defer_cow_hint ? "with" : "without") << " defer writer) -- N= " << N << " NB= " << NB << " P= " << P << " Q= " << Q << " nthreads= " << nthreads + std::cout << "POINV (POTRF+POTRI) (" << (defer_cow_hint ? "with" : "without") << " defer writer) -- N= " << N << " NB= " << NB << " P= " << P << " Q= " << Q << " nthreads= " << nthreads << " POTRI TTG Execution Time (milliseconds) : " << elapsed / 1E3 << " : Flops " << (potri::FLOPS_DPOTRI(N)) << " " << (potri::FLOPS_DPOTRI(N)/1e9)/(elapsed/1e6) << " GF/s" << std::endl; elapsed = (std::chrono::duration_cast(end - beg).count()); - std::cout << "POINV (POTRF+POTRI) (" << (defer_cow_hint ? "with" : "without") << " defer writer) N= " << N << " NB= " << NB << " P= " << P << " Q= " << Q << " nthreads= " << nthreads + std::cout << "POINV (POTRF+POTRI) (" << (defer_cow_hint ? 
"with" : "without") << " defer writer) N= " << N << " NB= " << NB << " P= " << P << " Q= " << Q << " nthreads= " << nthreads << " TTG Execution Time (milliseconds) : " << elapsed / 1E3 << " : Flops " << (potrf::FLOPS_DPOTRF(N) + potri::FLOPS_DPOTRI(N)) << " " << ((potrf::FLOPS_DPOTRF(N)+potri::FLOPS_DPOTRI(N))/1e9)/(elapsed/1e6) << " GF/s" << std::endl; } } else { @@ -327,20 +331,20 @@ int main(int argc, char **argv) end = endlauum; auto elapsed = (std::chrono::duration_cast(endpotrf - begpotrf).count()); - std::cout << "POINV (POTRF+TRTRI+LAUUM) (" << (defer_cow_hint ? "with" : "without") << " defer writer) -- N= " << N << " NB= " << NB << " P= " << P << " Q= " << Q << " nthreads= " << nthreads + std::cout << "POINV (POTRF+TRTRI+LAUUM) (" << (defer_cow_hint ? "with" : "without") << " defer writer) -- N= " << N << " NB= " << NB << " P= " << P << " Q= " << Q << " nthreads= " << nthreads << " POTRF TTG Execution Time (milliseconds) : " << elapsed / 1E3 << " : Flops " << (potrf::FLOPS_DPOTRF(N)) << " " << (potrf::FLOPS_DPOTRF(N)/1e9)/(elapsed/1e6) << " GF/s" << std::endl; - + elapsed = (std::chrono::duration_cast(endtrtri - begtrtri).count()); - std::cout << "POINV (POTRF+TRTRI+LAUUM) (" << (defer_cow_hint ? "with" : "without") << " defer writer) -- N= " << N << " NB= " << NB << " P= " << P << " Q= " << Q << " nthreads= " << nthreads + std::cout << "POINV (POTRF+TRTRI+LAUUM) (" << (defer_cow_hint ? "with" : "without") << " defer writer) -- N= " << N << " NB= " << NB << " P= " << P << " Q= " << Q << " nthreads= " << nthreads << " TRTRI TTG Execution Time (milliseconds) : " << elapsed / 1E3 << " : Flops " << (trtri_LOWER::FLOPS_DTRTRI(N)) << " " << (trtri_LOWER::FLOPS_DTRTRI(N)/1e9)/(elapsed/1e6) << " GF/s" << std::endl; elapsed = (std::chrono::duration_cast(endlauum - beglauum).count()); - std::cout << "POINV (POTRF+TRTRI+LAUUM) (" << (defer_cow_hint ? "with" : "without") << " defer writer) -- N= " << N << " NB= " << NB << " P= " << P << " Q= " << Q << " nthreads= " << nthreads + std::cout << "POINV (POTRF+TRTRI+LAUUM) (" << (defer_cow_hint ? "with" : "without") << " defer writer) -- N= " << N << " NB= " << NB << " P= " << P << " Q= " << Q << " nthreads= " << nthreads << " LAUUM TTG Execution Time (milliseconds) : " << elapsed / 1E3 << " : Flops " << (lauum::FLOPS_DLAUUM(N)) << " " << (lauum::FLOPS_DLAUUM(N)/1e9)/(elapsed/1e6) << " GF/s" << std::endl; elapsed = (std::chrono::duration_cast(end - beg).count()); - std::cout << "POINV (POTRF+TRTRI+LAUUM) (" << (defer_cow_hint ? "with" : "without") << " defer writer) N= " << N << " NB= " << NB << " P= " << P << " Q= " << Q << " nthreads= " << nthreads - << " TTG Execution Time (milliseconds) : " << elapsed / 1E3 + std::cout << "POINV (POTRF+TRTRI+LAUUM) (" << (defer_cow_hint ? "with" : "without") << " defer writer) N= " << N << " NB= " << NB << " P= " << P << " Q= " << Q << " nthreads= " << nthreads + << " TTG Execution Time (milliseconds) : " << elapsed / 1E3 << " : Flops " << (potrf::FLOPS_DPOTRF(N) + potri::FLOPS_DPOTRI(N)) << " " << ((potrf::FLOPS_DPOTRF(N)+potri::FLOPS_DPOTRI(N))/1e9)/(elapsed/1e6) << " GF/s" << std::endl; } } @@ -378,8 +382,8 @@ int main(int argc, char **argv) end = std::chrono::high_resolution_clock::now(); auto elapsed = (std::chrono::duration_cast(end - beg).count()); end = std::chrono::high_resolution_clock::now(); - std::cout << "POINV (POINV) (" << (defer_cow_hint ? 
"with" : "without") << " defer writer) N= " << N << " NB= " << NB << " P= " << P << " Q= " << Q << " nthreads= " << nthreads - << " TTG Execution Time (milliseconds) : " << elapsed / 1E3 + std::cout << "POINV (POINV) (" << (defer_cow_hint ? "with" : "without") << " defer writer) N= " << N << " NB= " << NB << " P= " << P << " Q= " << Q << " nthreads= " << nthreads + << " TTG Execution Time (milliseconds) : " << elapsed / 1E3 << " : Flops " << (potrf::FLOPS_DPOTRF(N) + potri::FLOPS_DPOTRI(N)) << " " << ((potrf::FLOPS_DPOTRF(N)+potri::FLOPS_DPOTRI(N))/1e9)/(elapsed/1e6) << " GF/s" << std::endl; } } @@ -394,6 +398,7 @@ int main(int argc, char **argv) parsec_data_free(dcA.mat); dcA.mat = NULL; parsec_tiled_matrix_destroy( (parsec_tiled_matrix_t*)&dcA); + allocator_fini(); ttg::finalize(); return ret; } diff --git a/examples/potrf/testing_dpotrf.cc b/examples/potrf/testing_dpotrf.cc index 36ea59b16..1658dbed3 100644 --- a/examples/potrf/testing_dpotrf.cc +++ b/examples/potrf/testing_dpotrf.cc @@ -1,4 +1,5 @@ #include +#include #include "plgsy.h" #include "pmw.h" @@ -35,6 +36,7 @@ int main(int argc, char **argv) const char* prof_filename = nullptr; char *opt = nullptr; int ret = EXIT_SUCCESS; + int niter = 3; if( (opt = getCmdOption(argv+1, argv+argc, "-N")) != nullptr ) { N = M = atoi(opt); @@ -52,10 +54,18 @@ int main(int argc, char **argv) prof_filename = opt; } + if( (opt = getCmdOption(argv+1, argv+argc, "-n")) != nullptr) { + niter = atoi(opt); + } + bool check = !cmdOptionExists(argv+1, argv+argc, "-x"); bool cow_hint = !cmdOptionExists(argv+1, argv+argc, "-w"); - ttg::initialize(argc, argv, nthreads); + // TODO: need to filter out our arguments to make parsec happy + ttg::initialize(1, argv, nthreads); + + /* set up TA to get the allocator */ + allocator_init(); auto world = ttg::default_execution_context(); if(nullptr != prof_filename) { @@ -85,53 +95,64 @@ int main(int argc, char **argv) parsec_data_collection_set_key((parsec_data_collection_t*)&dcA, (char*)"Matrix A"); if(!check) { - ttg::Edge startup("startup"); - ttg::Edge> topotrf("To POTRF"); - ttg::Edge> result("To result"); - - //Matrix* A = new Matrix(n_rows, n_cols, NB, NB); - MatrixT A{&dcA}; - /* TODO: initialize the matrix */ - /* This works only with the parsec backend! */ - int random_seed = 3872; - - auto init_tt = ttg::make_tt([&](std::tuple>& out) { - for(int i = 0; i < A.rows(); i++) { - for(int j = 0; j <= i && j < A.cols(); j++) { - if(A.is_local(i, j)) { - if(ttg::tracing()) ttg::print("init(", Key2{i, j}, ")"); - ttg::sendk<0>(Key2{i, j}, out); + for (int i = 0; i < niter; ++i) { + parsec_devices_release_memory(); + ttg::Edge startup("startup"); + ttg::Edge> topotrf("To POTRF"); + ttg::Edge> result("To result"); + + //Matrix* A = new Matrix(n_rows, n_cols, NB, NB); + MatrixT A{&dcA}; + /* TODO: initialize the matrix */ + /* This works only with the parsec backend! 
*/ + int random_seed = 3872; + + auto init_tt = ttg::make_tt([&](std::tuple>& out) { + for(int i = 0; i < A.rows(); i++) { + for(int j = 0; j <= i && j < A.cols(); j++) { + if(A.is_local(i, j)) { + if(ttg::tracing()) ttg::print("init(", Key2{i, j}, ")"); + ttg::sendk<0>(Key2{i, j}, out); + } } } + }, ttg::edges(), ttg::edges(startup), "Startup Trigger", {}, {"startup"}); + init_tt->set_keymap([&]() {return world.rank();}); + + auto plgsy_ttg = make_plgsy_ttg(A, N, random_seed, startup, topotrf, cow_hint); + auto potrf_ttg = potrf::make_potrf_ttg(A, topotrf, result, cow_hint); + auto result_ttg = make_result_ttg(A, result, cow_hint); + + auto connected = make_graph_executable(init_tt.get()); + assert(connected); + TTGUNUSED(connected); + std::cout << "Graph is connected: " << connected << std::endl; + + if (world.rank() == 0) { + std::cout << "==== begin dot ====\n"; + std::cout << ttg::Dot()(init_tt.get()) << std::endl; + std::cout << "==== end dot ====\n"; + beg = std::chrono::high_resolution_clock::now(); } - }, ttg::edges(), ttg::edges(startup), "Startup Trigger", {}, {"startup"}); - init_tt->set_keymap([&]() {return world.rank();}); - - auto plgsy_ttg = make_plgsy_ttg(A, N, random_seed, startup, topotrf, cow_hint); - auto potrf_ttg = potrf::make_potrf_ttg(A, topotrf, result, cow_hint); - auto result_ttg = make_result_ttg(A, result, cow_hint); - auto connected = make_graph_executable(init_tt.get()); - assert(connected); - TTGUNUSED(connected); - std::cout << "Graph is connected: " << connected << std::endl; + if (world.rank() == 0) { + beg = std::chrono::high_resolution_clock::now(); + } - if (world.rank() == 0) { - std::cout << "==== begin dot ====\n"; - std::cout << ttg::Dot()(init_tt.get()) << std::endl; - std::cout << "==== end dot ====\n"; - beg = std::chrono::high_resolution_clock::now(); - } - init_tt->invoke(); + init_tt->invoke(); + ttg::execute(world); + ttg::fence(world); - ttg::execute(world); - ttg::fence(world); - if (world.rank() == 0) { - end = std::chrono::high_resolution_clock::now(); - auto elapsed = (std::chrono::duration_cast(end - beg).count()); - end = std::chrono::high_resolution_clock::now(); - std::cout << "TTG Execution Time (milliseconds) : " - << elapsed / 1E3 << " : Flops " << (potrf::FLOPS_DPOTRF(N)) << " " << (potrf::FLOPS_DPOTRF(N)/1e9)/(elapsed/1e6) << " GF/s" << std::endl; + if (world.rank() == 0) { + end = std::chrono::high_resolution_clock::now(); + auto elapsed = (std::chrono::duration_cast(end - beg).count()); + end = std::chrono::high_resolution_clock::now(); + std::cout << "TTG Execution Time (milliseconds) : " + << elapsed / 1E3 << " : Flops " << (potrf::FLOPS_DPOTRF(N)) << " " << (potrf::FLOPS_DPOTRF(N)/1e9)/(elapsed/1e6) << " GF/s" << std::endl; + } +#if defined(TTG_PARSEC_IMPORTED) + parsec_devices_reset_load(ttg::default_execution_context().impl().context()); +#endif // TTG_PARSEC_IMPORTED } world.dag_off(); @@ -206,10 +227,43 @@ int main(int argc, char **argv) world.profile_off(); + allocator_fini(); ttg::finalize(); return ret; } +static void +dplasma_dprint_tile( int m, int n, + const parsec_tiled_matrix_t* descA, + const double *M ) +{ + int tempmm = ( m == descA->mt-1 ) ? descA->m - m*descA->mb : descA->mb; + int tempnn = ( n == descA->nt-1 ) ? 
descA->n - n*descA->nb : descA->nb; + int ldam = BLKLDD( descA, m ); + + int ii, jj; + + fflush(stdout); + for(ii=0; ii +#include #include "plgsy.h" #include "pmw.h" @@ -73,6 +74,9 @@ int main(int argc, char **argv) ttg::initialize(argc, argv, nthreads); + /* set up TA to get the allocator */ + allocator_init(); + auto world = ttg::default_execution_context(); if(nullptr != prof_filename) { @@ -227,10 +231,43 @@ int main(int argc, char **argv) world.dag_off(); world.profile_off(); + allocator_fini(); ttg::finalize(); return ret; } +static void +dplasma_dprint_tile( int m, int n, + const parsec_tiled_matrix_t* descA, + const double *M ) +{ + int tempmm = ( m == descA->mt-1 ) ? descA->m - m*descA->mb : descA->mb; + int tempnn = ( n == descA->nt-1 ) ? descA->n - n*descA->nb : descA->nb; + int ldam = BLKLDD( descA, m ); + + int ii, jj; + + fflush(stdout); + for(ii=0; ii #ifdef BTAS_IS_USABLE #include -#include #include +#include #else #warning "found btas/features.h but Boost.Iterators is missing, hence BTAS is unusable ... add -I/path/to/boost" #endif @@ -36,11 +35,16 @@ using namespace ttg; #include "ttg/util/future.h" #include "ttg/util/multiindex.h" +#include "ttg/serialization/std/pair.h" #include "ttg/util/bug.h" #if defined(BLOCK_SPARSE_GEMM) && defined(BTAS_IS_USABLE) -using blk_t = btas::Tensor, btas::Handle::shared_ptr>>; +using scalar_t = double; +using blk_t = btas::Tensor, btas::Handle::shared_ptr>>; + +//#include +//static std::atomic reduce_count = 0; #if defined(TTG_USE_PARSEC) namespace ttg { @@ -69,8 +73,8 @@ namespace ttg { return boost::container::small_vector{}; } static auto create_from_metadata(const std::pair &meta) { - if (meta != std::pair{0, 0}) - return blk_t(btas::Range(std::get<0>(meta), std::get<1>(meta)), 0.0); + if (meta != std::pair{0, 0}) // N.B. 
allocate only, do not fill with zeroes + return blk_t(btas::Range(std::get<0>(meta), std::get<1>(meta))); else return blk_t{}; } @@ -91,6 +95,7 @@ namespace ttg::detail { } // namespace ttg::detail #else +using scalar_t = double; using blk_t = double; #endif template @@ -119,10 +124,13 @@ namespace btas { btas::Tensor gemm(btas::Tensor &&C, const btas::Tensor &A, const btas::Tensor &B) { using array = btas::DEFAULT::index; - if (C.empty()) { - C = btas::Tensor(btas::Range(A.range().extent(0), B.range().extent(1)), 0.0); + if (C.empty()) { // first contribution to C = allocate it and gemm with beta=0 + C = btas::Tensor(btas::Range(A.range().extent(0), B.range().extent(1))); + btas::contract_222(1.0, A, array{1, 2}, B, array{2, 3}, 0.0, C, array{1, 3}, false, false); + } + else { // subsequent contributions to C = gemm with beta=1 + btas::contract_222(1.0, A, array{1, 2}, B, array{2, 3}, 1.0, C, array{1, 3}, false, false); } - btas::contract_222(1.0, A, array{1, 2}, B, array{2, 3}, 1.0, C, array{1, 3}, false, false); return std::move(C); } } // namespace btas @@ -140,17 +148,21 @@ template using Key = MultiIndex; /// maps {i,j} to rank within first (R=0) layer of the 3-d process grid -inline int ij2rank(int i, int j, int P, int Q) { - std::vector vec; +inline int ij2rank(int i, int j, int P, int Q, int R) { int p = (i % P); int q = (j % Q); - int rank = (q * P) + p; + //int rank = (q * P) + p; + //int pq = (q * P) + p; + int l = (i*j) % R; + int rank = (l * P * Q) + (q * P) + p; +// size_t hash = Key<2>{i, j}.hash(); +// int rank = hash%(P*Q*R); + //std::cout << "ij2rank " << Key<2>{i, j} << " rank " << rank << std::endl; return rank; } /// maps {i,j,k} to rank within a 3-d process grid inline int ijk2rank(int i, int j, int k, int P, int Q, int R) { - std::vector vec; int p = (i % P); int q = (j % Q); int l = (k % R); @@ -158,29 +170,51 @@ inline int ijk2rank(int i, int j, int k, int P, int Q, int R) { return rank; } -// flow data from an existing SpMatrix on rank 0 -template &)>> -class Read_SpMatrix : public TT, std::tuple, Blk>>, Read_SpMatrix, ttg::typelist> { +/// Pushes out data from an existing SpMatrix whose data is distributed on a 2-d grid. + +/// Data is pushed in the order of the appearance of the data in the container, without any tailoring to +/// the order in which the data is consumed; thus this is likely to generate tasks in a suboptimal order. +/// \note Reading should in general occur in the same order as the data will be consumed. +/// If all consuming tasks can execute concurrently this should be OK, albeit the runtime will likely throttle +/// sends, thus task dependencies further "down" the DAG may result in some reading orders being better than others +template &)>, + typename OutKeymap = std::function &)>> +class Read_SpMatrix : public TT, + std::tuple, Blk>>, + Read_SpMatrix, + ttg::typelist> { public: using baseT = typename Read_SpMatrix::ttT; - Read_SpMatrix(const char *label, const SpMatrix &matrix, Edge> &ctl, Edge, Blk> &out, - Keymap &ij_keymap) + Read_SpMatrix(const char *label, const SpMatrix &matrix, Edge> &ctl, Edge, Blk> &out, + Keymap &pqr_keymap, std::function &)> ij_keymap) : baseT(edges(ctl), edges(out), std::string("read_spmatrix(") + label + ")", {"ctl"}, {std::string(label) + "ij"}, - ij_keymap) - , matrix_(matrix) {} - - void op(const Key<2> &, std::tuple, Blk>> &out) { + pqr_keymap) + , matrix_(matrix) + , ij_keymap_(ij_keymap) {} + + // key is this process' coordinate in the 2-d grid of processes (managed by ij_keymap) ... 
+ // but it's not used at all since all this TT does is generate consuming tasks that use local tiles ... + // the consumers better use same keymap (ij_keymap) as this TT to avoid for the data flow from this to be local + void op(const Key<3> & /* pqr */, std::tuple, Blk>> &out) { auto rank = ttg::default_execution_context().rank(); - for (int k = 0; k < matrix_.outerSize(); ++k) { - for (typename SpMatrix::InnerIterator it(matrix_, k); it; ++it) { - if (rank == this->get_keymap()(Key<2>(std::initializer_list({it.row(), it.col()})))) - ::send<0>(Key<2>(std::initializer_list({it.row(), it.col()})), it.value(), out); + // this code assumes col-major layout + static_assert(SpMatrix::IsRowMajor == false, "SpMatrix must be col-major"); + for (int j = 0; j < matrix_.outerSize(); ++j) { + for (typename SpMatrix::InnerIterator it(matrix_, j); it; ++it) { + assert(j == it.col()); + const auto i = it.row(); + // IF the receiver uses the same keymap, these sends are local + if (rank == this->ij_keymap_(Key<2>(std::initializer_list({i, j})))) { + ::send<0>(Key<2>(std::initializer_list({i, j})), it.value(), out); + } } } } private: const SpMatrix &matrix_; + std::function &)> ij_keymap_; }; // flow (move?) data into an existing SpMatrix on rank 0 @@ -190,16 +224,21 @@ class Write_SpMatrix : public TT, std::tuple<>, Write_SpMatrix, ttg: using baseT = typename Write_SpMatrix::ttT; template - Write_SpMatrix(SpMatrix &matrix, Edge, Blk> &in, Keymap2 &&ij_keymap) - : baseT(edges(in), edges(), "write_spmatrix", {"Cij"}, {}, ij_keymap), matrix_(matrix) {} - - void op(const Key<2> &key, typename baseT::input_refs_tuple_type &&elem, std::tuple<> &) { - std::lock_guard lock(mtx_); - ttg::trace("rank =", default_execution_context().rank(), - "/ thread_id =", reinterpret_cast(pthread_self()), "spmm.cc Write_SpMatrix wrote {", - key[0], ",", key[1], "} = ", baseT::template get<0>(elem), " in ", static_cast(&matrix_), - " with mutex @", static_cast(&mtx_), " for object @", static_cast(this)); - values_.emplace_back(key[0], key[1], baseT::template get<0>(elem)); + Write_SpMatrix(SpMatrix &matrix, Edge, Blk> &in, Keymap2 &&ij_keymap, bool write_back = true) + : baseT(edges(in), edges(), "write_spmatrix", {"Cij"}, {}, ij_keymap) + , matrix_(matrix) + , write_back_(write_back) + { } + + void op(const Key<2> &key, typename baseT::input_values_tuple_type &&elem, std::tuple<> &) { + if (write_back_) { + std::lock_guard lock(mtx_); + ttg::trace("rank =", default_execution_context().rank(), + "/ thread_id =", reinterpret_cast(pthread_self()), "spmm.cc Write_SpMatrix wrote {", + key[0], ",", key[1], "} = ", baseT::template get<0>(elem), " in ", static_cast(&matrix_), + " with mutex @", static_cast(&mtx_), " for object @", static_cast(this)); + values_.emplace_back(key[0], key[1], baseT::template get<0>(elem)); + } } /// grab completion status as a future @@ -223,6 +262,7 @@ class Write_SpMatrix : public TT, std::tuple<>, Write_SpMatrix, ttg: SpMatrix &matrix_; std::vector> values_; mutable std::shared_ptr> completion_status_; + bool write_back_; }; /// sparse mm via 2.5D SUMMA @@ -237,56 +277,119 @@ class SpMM25D { /// @param ijk_keymap maps {i,j,k} to process, controls distribution of tasks performing C[i][j] += A[i][k]*B[k][j] /// @param R the number of "layers" in the 3-D process grid SpMM25D(Edge, Blk> &a, Edge, Blk> &b, Edge, Blk> &c, const SpMatrix &a_mat, - const SpMatrix &b_mat, const std::vector> &a_rowidx_to_colidx, - const std::vector> &a_colidx_to_rowidx, - const std::vector> &b_rowidx_to_colidx, - const 
std::vector> &b_colidx_to_rowidx, const std::vector &mTiles, - const std::vector &nTiles, const std::vector &kTiles, Keymap2 ij_keymap, Keymap3 ijk_keymap, long R) - : a_rowidx_to_colidx_(a_rowidx_to_colidx) - , b_colidx_to_rowidx_(b_colidx_to_rowidx) - , a_colidx_to_rowidx_(a_colidx_to_rowidx) - , b_rowidx_to_colidx_(b_rowidx_to_colidx) + const SpMatrix &b_mat, const std::vector> &a_cols_of_row, + const std::vector> &a_rows_of_col, + const std::vector> &b_cols_of_row, + const std::vector> &b_rows_of_col, const std::vector &mTiles, + const std::vector &nTiles, const std::vector &kTiles, Keymap2 ij_keymap, Keymap3 ijk_keymap, + long R, long parallel_bcasts = 1) + : a_cols_of_row_(a_cols_of_row) + , b_rows_of_col_(b_rows_of_col) + , a_rows_of_col_(a_rows_of_col) + , b_cols_of_row_(b_cols_of_row) , ij_keymap_(std::move(ij_keymap)) - , ijk_keymap_(std::move(ijk_keymap)) { - bcast_a_ = std::make_unique(a, local_a_ijk_, b_rowidx_to_colidx_, ij_keymap_, ijk_keymap_); - local_bcast_a_ = std::make_unique(local_a_ijk_, a_ijk_, b_rowidx_to_colidx_, ijk_keymap_); - bcast_b_ = std::make_unique(b, local_b_ijk_, a_colidx_to_rowidx_, ij_keymap_, ijk_keymap_); - local_bcast_b_ = std::make_unique(local_b_ijk_, b_ijk_, a_colidx_to_rowidx_, ijk_keymap_); - multiplyadd_ = std::make_unique(a_ijk_, b_ijk_, c_ijk_, c_ij_p_, a_rowidx_to_colidx_, - b_colidx_to_rowidx_, mTiles, nTiles, ijk_keymap_); + , ijk_keymap_(std::move(ijk_keymap)) + , parallel_bcasts_(parallel_bcasts) { + Edge, void> a_ctl, b_ctl; + Edge, int> a_rowctl, b_colctl; // TODO: can we have multiple control inputs per TT? + bcast_a_ = std::make_unique(a, a_ctl, a_rowctl, local_a_ijk_, a_rows_of_col_, a_cols_of_row_, b_cols_of_row_, + ij_keymap_, ijk_keymap_, parallel_bcasts_); + local_bcast_a_ = std::make_unique(local_a_ijk_, a_ijk_, b_cols_of_row_, ijk_keymap_); + bcast_b_ = std::make_unique(b, b_ctl, b_colctl, local_b_ijk_, a_rows_of_col_, b_cols_of_row_, b_rows_of_col_, + ij_keymap_, ijk_keymap_, parallel_bcasts_); + local_bcast_b_ = std::make_unique(local_b_ijk_, b_ijk_, a_rows_of_col_, ijk_keymap_); + multiplyadd_ = std::make_unique(a_ijk_, b_ijk_, c_ijk_, c_ij_p_, a_cols_of_row_, + b_rows_of_col_, mTiles, nTiles, ijk_keymap_); reduce_c_ = std::make_unique(c_ij_p_, c, ij_keymap_); - reduce_c_->template set_input_reducer<0>([](Blk &c_ij, const Blk &c_ij_p) { c_ij = c_ij + c_ij_p; }); + reduce_c_->template set_input_reducer<0>( + [&](Blk &c_ij, const Blk &c_ij_p) { + //reduce_count++; + c_ij = c_ij + c_ij_p; + }); // compute how many contributions each C[i][j] should expect ... MultiplyAdd already does this, but need a way to // send message from each process p to the process owning C[i][j] to expect a contribution from it for now replicate // this logic ... // TODO: do this in MultiplyAdd (need to allreduce this info so that everyone has it) // N.B. 
only need to set stream size on the rank that will accumulate the C[i][j] contribution - const auto my_rank = ttg::default_execution_context().rank(); - for (auto i = 0ul; i != a_rowidx_to_colidx_.size(); ++i) { - if (a_rowidx_to_colidx_[i].empty()) continue; - for (auto j = 0ul; j != b_colidx_to_rowidx_.size(); ++j) { - if (b_colidx_to_rowidx_[j].empty()) continue; + auto world = ttg::default_execution_context(); + const auto my_rank = world.rank(); + std::vector c_ij_procmask(world.size(), false); + for (auto i = 0ul; i != a_cols_of_row_.size(); ++i) { + if (a_cols_of_row_[i].empty()) continue; + for (auto j = 0ul; j != b_rows_of_col_.size(); ++j) { + if (b_rows_of_col_[j].empty()) continue; if (ij_keymap_(Key<2>{i, j}) == my_rank) { decltype(i) k; bool have_k; std::tie(k, have_k) = multiplyadd_->compute_first_k(i, j); - std::vector c_ij_procmask(R, false); - if (have_k) { - const auto pR = k % R; // k values are distributed round-robin among the layers of the 3-D grid + while (have_k) { + const auto pR = ijk_keymap_(Key<3>{i, j, k}); assert(pR < c_ij_procmask.size()); c_ij_procmask[pR] = true; - while (have_k) { - std::tie(k, have_k) = multiplyadd_->compute_next_k(i, j, k); - if (have_k) { - const auto pR = k % R; - assert(pR < c_ij_procmask.size()); - c_ij_procmask[pR] = true; - } - } + /* get next k */ + std::tie(k, have_k) = multiplyadd_->compute_next_k(i, j, k); } const auto c_ij_nprocs = std::count_if(c_ij_procmask.begin(), c_ij_procmask.end(), [](bool b) { return b; }); if (c_ij_nprocs > 0) reduce_c_->template set_argstream_size<0>(Key<2>{i, j}, c_ij_nprocs); + /* reset the map */ + std::fill(c_ij_procmask.begin(), c_ij_procmask.end(), false); + } + } + } + + /* kick off the first broadcast in each row of A + * this is used to enforce strict ordering within a row of A */ + for (int i = 0; i < a_cols_of_row_.size(); ++i) { + for (int k : a_cols_of_row_[i]) { + auto key = Key<2>(i, k); + if (world.rank() == ij_keymap_(key)) { + bcast_a_->template in<1>()->send(key, 0); + break; + } + } + } + + /* initial ctl input for a number of bcasts for A + * this is used to limit the number of concurrent bcasts */ + int to_start = parallel_bcasts; + for (int k = 0; + 0 < to_start && k < a_rows_of_col_.size(); + ++k) { + for (auto i : a_rows_of_col_[k]) { + auto key = Key<2>(i, k); + if (world.rank() == ij_keymap_(key)) { + //std::cout << "SPMM kick off BcastA " << key << std::endl; + bcast_a_->template in<2>()->sendk(key); + if (0 == --to_start) break; + } + } + } + + /* kick off the first broadcast in each column of B + * this is used to enforce strict ordering within a column of B */ + for (int j = 0; j < b_rows_of_col_.size(); ++j) { + for (int k : b_rows_of_col_[j]) { + auto key = Key<2>(k, j); + if (world.rank() == ij_keymap_(key)) { + //std::cout << "BcastB kick off " << key << std::endl; + bcast_b_->template in<1>()->send(key, 0); + break; + } + } + } + + /* initial ctl input for bcasts for B */ + to_start = parallel_bcasts; + for (int k = 0; + 0 < to_start && k < b_cols_of_row_.size(); + ++k) { + for (auto j : b_cols_of_row_[k]) { + auto key = Key<2>(k, j); + if (world.rank() == ij_keymap_(key)) { + //std::cout << "SPMM kick off BcastB " << key << std::endl; + bcast_b_->template in<2>()->sendk(key); + if (0 == --to_start) break; } } } @@ -304,10 +407,10 @@ class SpMM25D { using baseT = typename LocalBcastA::ttT; LocalBcastA(Edge, Blk> &a, Edge, Blk> &a_ijk, - const std::vector> &b_rowidx_to_colidx, const Keymap3 &ijk_keymap) + const std::vector> &b_cols_of_row, const Keymap3 
&ijk_keymap) : baseT(edges(a), edges(a_ijk), "SpMM25D::local_bcast_a", {"a_ikp"}, {"a_ijk"}, [](const Key<3> &ikp) { return ikp[2]; }) - , b_rowidx_to_colidx_(b_rowidx_to_colidx) + , b_cols_of_row_(b_cols_of_row) , ijk_keymap_(ijk_keymap) {} void op(const Key<3> &ikp, typename baseT::input_refs_tuple_type &&a_ik, std::tuple, Blk>> &a_ijk) { @@ -318,10 +421,10 @@ class SpMM25D { auto world = default_execution_context(); assert(p == world.rank()); ttg::trace("LocalBcastA(", i, ", ", k, ", ", p, ")"); - if (k >= b_rowidx_to_colidx_.size()) return; + if (k >= b_cols_of_row_.size()) return; // local broadcast a_ik to all {i,j,k} such that b_kj exists std::vector> ijk_keys; - for (auto &j : b_rowidx_to_colidx_[k]) { + for (auto &j : b_cols_of_row_[k]) { if (ijk_keymap_(Key<3>({i, j, k})) == world.rank()) { ttg::trace("Broadcasting A[", i, "][", k, "] on proc ", p, " to j=", j); ijk_keys.emplace_back(Key<3>({i, j, k})); @@ -331,45 +434,129 @@ class SpMM25D { } private: - const std::vector> &b_rowidx_to_colidx_; + const std::vector> &b_cols_of_row_; const Keymap3 &ijk_keymap_; }; // class LocalBcastA /// broadcast `A[i][k]` to all processors which will contain at least one `C[i][j]` such that `B[k][j]` exists - class BcastA : public TT, std::tuple, Blk>>, BcastA, ttg::typelist> { + class BcastA : public TT, std::tuple, Blk>, Out, int>, Out, void>>, BcastA, ttg::typelist> { public: using baseT = typename BcastA::ttT; - BcastA(Edge, Blk> &a_ik, Edge, Blk> &a_ikp, const std::vector> &b_rowidx_to_colidx, - const Keymap2 &ij_keymap, const Keymap3 &ijk_keymap) - : baseT(edges(a_ik), edges(a_ikp), "SpMM25D::bcast_a", {"a_ik"}, {"a_ikp"}, ij_keymap) - , b_rowidx_to_colidx_(b_rowidx_to_colidx) - , ijk_keymap_(ijk_keymap) {} + BcastA(Edge, Blk> &a_ik, Edge, void> &ctl, + Edge, int> &rowctl, Edge, Blk> &a_ikp, + const std::vector> &a_rows_of_col, + const std::vector> &a_cols_of_row, + const std::vector> &b_cols_of_row, + const Keymap2 &ij_keymap, const Keymap3 &ijk_keymap, + const int parallel_bcasts) + : baseT(edges(a_ik, rowctl, ctl), edges(a_ikp, rowctl, ctl), "SpMM25D::bcast_a", {"a_ik", "rowctl", "ctl"}, {"a_ikp", "rowctl", "ctl"}, ij_keymap) + , a_rows_of_col_(a_rows_of_col) + , a_cols_of_row_(a_cols_of_row) + , b_cols_of_row_(b_cols_of_row) + , ijk_keymap_(ijk_keymap) + , ij_keymap_(ij_keymap) + , parallel_bcasts_(parallel_bcasts) { + + this->set_priomap([](const Key<2>& key){ + return std::numeric_limits::max() - key[0]; + }); + } - void op(const Key<2> &ik, typename baseT::input_refs_tuple_type &&a_ik, std::tuple, Blk>> &a_ikp) { - const auto i = ik[0]; - const auto k = ik[1]; + void op(const Key<2> &ik, typename baseT::input_values_tuple_type &&a_ik, + std::tuple, Blk>, Out, int>, Out, void>> &outs) { + const auto i = ik[0]; // row + const auto k = ik[1]; // col ttg::trace("BcastA(", i, ", ", k, ")"); std::vector> ikp_keys; - if (k >= b_rowidx_to_colidx_.size()) return; + if (k >= b_cols_of_row_.size()) return; auto world = default_execution_context(); std::vector procmap(world.size()); - for (auto &j : b_rowidx_to_colidx_[k]) { - const long p = ijk_keymap_(Key<3>( + for (auto &j : b_cols_of_row_[k]) { + const int p = ijk_keymap_(Key<3>( {i, j, k})); // N.B. 
in 2.5D SUMMA different k contributions to C[i][j] are computed on different nodes if (!procmap[p]) { ttg::trace("Broadcasting A[", i, "][", k, "] to proc ", p); + //std::cout << "[" << world.rank() << "] BcastA key " << ik << " op " << Key<3>({i, j, k}) << " to proc " << p << std::endl; ikp_keys.emplace_back(Key<3>({i, k, p})); procmap[p] = true; } + // TODO: debug + //if (p != world.rank() && ij_keymap_(Key<2>{k, j}) != p) { + // std::cout << "[" << world.rank() << "] BCAST A " << ik << " for C update " << Key<3>({i, k, p}) << " on " << p << " has B from " << ij_keymap_(Key<2>{k, j}) << std::endl; + //} + } + ::broadcast<0>(ikp_keys, std::move(baseT::template get<0>(a_ik)), outs); + + /* enable the next broadcast on this row */ + int row = i; + int col = k; + auto rowit = std::find(a_cols_of_row_[row].begin(), a_cols_of_row_[row].end(), col); + for (++rowit; rowit != a_cols_of_row_[row].end(); ++rowit) { + Key<2> key = {row, *rowit}; + if (world.rank() == this->get_keymap()(key)) { + ::send<1>(key, std::move(baseT::template get<1>(a_ik)), outs); + break; + } } - ::broadcast<0>(ikp_keys, std::move(baseT::template get<0>(a_ik)), a_ikp); + + + /* enable next broadcast through a control message + * we don't check whether this tile is in B here, this is + * done inside the next task (see above) + * we walk the matrix A column-major in an attempt to send from top to bottom, left to right */ + long to_skip = parallel_bcasts_; + + auto colit = std::find(a_rows_of_col_[col].begin(), a_rows_of_col_[col].end(), row); + ++colit; // skip to next row + do { + for (; colit != a_rows_of_col_[col].end(); ++colit) { + Key<2> key = {*colit, col}; + if (world.rank() == this->get_keymap()(key)) { + if (0 == --to_skip) { + //std::cout << "BcastA sending to " << key << " from " << ik << std::endl; + ::sendk<2>(key, outs); + return; + } + } + } + /* nothing for us in this column, move on to the next column */ + if (++col < a_rows_of_col_.size()) { + colit = a_rows_of_col_[col].begin(); + } else { + break; + } + } while (1); + +#if 0 + do { + for (; it != a_cols_of_row_[i].end(); ++it) { + Key<2> key = {i, *it}; + if (world.rank() == this->get_keymap()(key)) { + if (0 == --to_skip) { + ::sendk<1>(key, outs); + return; + } + } + } + if ((i+1) < num_rows) { + it = a_cols_of_row_[++i].begin(); + } else { + break; + } + } while (1); +#endif // 0 } private: - const std::vector> &b_rowidx_to_colidx_; + //const std::vector> &a_cols_of_row_; + const std::vector> &a_rows_of_col_; + const std::vector> &a_cols_of_row_; + const std::vector> &b_cols_of_row_; const Keymap3 &ijk_keymap_; + const Keymap2 &ij_keymap_; + const int parallel_bcasts_; }; // class BcastA /// Locally broadcast `B[k][j]` assigned to this processor `p` to matmul tasks `{i,j,k}` for all `k` such that @@ -379,10 +566,10 @@ class SpMM25D { using baseT = typename LocalBcastB::ttT; LocalBcastB(Edge, Blk> &b_kjp, Edge, Blk> &b_ijk, - const std::vector> &a_colidx_to_rowidx, const Keymap3 &ijk_keymap) + const std::vector> &a_rows_of_col, const Keymap3 &ijk_keymap) : baseT(edges(b_kjp), edges(b_ijk), "SpMM25D::local_bcast_b", {"b_kjp"}, {"b_ijk"}, [](const Key<3> &kjp) { return kjp[2]; }) - , a_colidx_to_rowidx_(a_colidx_to_rowidx) + , a_rows_of_col_(a_rows_of_col) , ijk_keymap_(ijk_keymap) {} void op(const Key<3> &kjp, typename baseT::input_refs_tuple_type &&b_kj, std::tuple, Blk>> &b_ijk) { @@ -392,10 +579,10 @@ class SpMM25D { auto world = default_execution_context(); assert(p == world.rank()); ttg::trace("BcastB(", k, ", ", j, ", ", p, ")"); - if (k >= 
a_colidx_to_rowidx_.size()) return; + if (k >= a_rows_of_col_.size()) return; // broadcast b_kj to all ijk for which c_ij is on this processor and a_ik exists std::vector> ijk_keys; - for (auto &i : a_colidx_to_rowidx_[k]) { + for (auto &i : a_rows_of_col_[k]) { if (ijk_keymap_(Key<3>({i, j, k})) == world.rank()) { ttg::trace("Broadcasting B[", k, "][", j, "] on proc ", p, " to i=", i); ijk_keys.emplace_back(Key<3>({i, j, k})); @@ -405,47 +592,131 @@ class SpMM25D { } private: - const std::vector> &a_colidx_to_rowidx_; + const std::vector> &a_rows_of_col_; const Keymap3 &ijk_keymap_; }; // class LocalBcastB /// broadcast `B[k][j]` to all processors which will contain at least one `C[i][j]` such that `A[i][k]` exists - class BcastB : public TT, std::tuple, Blk>>, BcastB, ttg::typelist> { + class BcastB : public TT, std::tuple, Blk>, Out, int>, Out, void>>, BcastB, ttg::typelist> { public: using baseT = typename BcastB::ttT; - BcastB(Edge, Blk> &b_kj, Edge, Blk> &b_kjp, const std::vector> &a_colidx_to_rowidx, - const Keymap2 &ij_keymap, const Keymap3 &ijk_keymap) - : baseT(edges(b_kj), edges(b_kjp), "SpMM25D::bcast_b", {"b_kj"}, {"b_kjp"}, ij_keymap) - , a_colidx_to_rowidx_(a_colidx_to_rowidx) - , ijk_keymap_(ijk_keymap) {} + BcastB(Edge, Blk> &b_kj, Edge, void> ctl, Edge, int> colctl, Edge, Blk> &b_kjp, + const std::vector> &a_rows_of_col, + const std::vector> &b_cols_of_row, + const std::vector> &b_rows_of_col, + const Keymap2 &ij_keymap, const Keymap3 &ijk_keymap, + const int parallel_bcasts) + : baseT(edges(b_kj, colctl, ctl), edges(b_kjp, colctl, ctl), "SpMM25D::bcast_b", {"b_kj", "colctl", "ctl"}, {"b_kjp", "colctl", "ctl"}, ij_keymap) + , a_rows_of_col_(a_rows_of_col) + , b_cols_of_row_(b_cols_of_row) + , b_rows_of_col_(b_rows_of_col) + , ijk_keymap_(ijk_keymap) + , parallel_bcasts_(parallel_bcasts) + { + this->set_priomap([](const Key<2>& key){ + return std::numeric_limits::max() - key[1]; + }); + } - void op(const Key<2> &kj, typename baseT::input_refs_tuple_type &&b_kj, std::tuple, Blk>> &b_kjp) { - const auto k = kj[0]; - const auto j = kj[1]; + void op(const Key<2> &kj, typename baseT::input_values_tuple_type &&b_kj, + std::tuple, Blk>, Out, int>, Out, void>> &outs) { + const auto k = kj[0]; // row + const auto j = kj[1]; // col // broadcast b_kj to all processors which will contain at least one c_ij such that a_ik exists std::vector> kjp_keys; ttg::trace("BcastB(", k, ", ", j, ")"); - if (k >= a_colidx_to_rowidx_.size()) return; + if (k >= a_rows_of_col_.size()) return; auto world = default_execution_context(); std::vector procmap(world.size()); - for (auto &i : a_colidx_to_rowidx_[k]) { + for (auto &i : a_rows_of_col_[k]) { long p = ijk_keymap_(Key<3>({i, j, k})); if (!procmap[p]) { ttg::trace("Broadcasting B[", k, "][", j, "] to proc ", p); + //std::cout << "[" << world.rank() << "] BcastB key " << kj << " op " << Key<3>({i, j, k}) << " to proc " << p << std::endl; kjp_keys.emplace_back(Key<3>({k, j, p})); procmap[p] = true; } } - ::broadcast<0>(kjp_keys, std::move(baseT::template get<0>(b_kj)), b_kjp); + ::broadcast<0>(kjp_keys, std::move(baseT::template get<0>(b_kj)), outs); + + /* enable the next broadcast on this row */ + int row = k; + int col = j; + auto colit = std::find(b_rows_of_col_[col].begin(), b_rows_of_col_[col].end(), row); + for (++colit; colit != b_rows_of_col_[col].end(); ++colit) { + Key<2> key = {*colit, col}; + if (world.rank() == this->get_keymap()(key)) { + //std::cout << "BcastB kick off " << key << std::endl; + ::send<1>(key, std::move(baseT::template 
get<1>(b_kj)), outs); + break; + } + } + + /* enable next broadcast through a control message + * we don't check whether this tile is in A here, this is + * done inside the next task (see above) + * we run across a row to enable broadcasts */ + long to_skip = parallel_bcasts_; + + // iterator over the current row + auto rowit = std::find(b_cols_of_row_[row].begin(), b_cols_of_row_[row].end(), col); + ++rowit; // skip to next col + do { + for (; rowit != b_cols_of_row_[row].end(); ++rowit) { + Key<2> key = {row, *rowit}; + if (world.rank() == this->get_keymap()(key)) { + if (0 == --to_skip) { + //std::cout << "BcastB sending to " << key << " from " << kj << " pb " << parallel_bcasts_ << std::endl; + ::sendk<2>(key, outs); + return; + } else { + //std::cout << "BcastB skipping " << key << " from " << kj << " pb " << parallel_bcasts_ << std::endl; + } + } + } + /* nothing for us in this row, move on to the next row */ + if (++row != b_cols_of_row_.size()) { + rowit = b_cols_of_row_[row].begin(); + } else { + break; + } + } while (1); + + +#if 0 + std::size_t num_rows = b_cols_of_row_.size(); + auto it = std::find(b_cols_of_row_[k].begin(), b_cols_of_row_[k].end(), j); + ++it; // skip the current tile + long to_skip = parallel_bcasts_; + do { + for (; it != b_cols_of_row_[k].end(); ++it) { + Key<2> key = {k, *it}; + if (world.rank() == this->get_keymap()(key)) { + if (0 == --to_skip) { + ::sendk<1>(key, outs); + return; + } + } + } + if ((k+1) < num_rows) { + it = b_cols_of_row_[++k].begin(); + } else { + break; + } + } while (1); +#endif // 0 } private: - const std::vector> &a_colidx_to_rowidx_; + const std::vector> &a_rows_of_col_; + const std::vector> &b_cols_of_row_; + const std::vector> &b_rows_of_col_; const Keymap3 &ijk_keymap_; + const int parallel_bcasts_; }; // class BcastB - /// multiply task has 3 input flows: a_ijk, b_ijk, and c_ijk, c_ijk contains the running total for this kayer of the + /// multiply task has 3 input flows: a_ijk, b_ijk, and c_ijk, c_ijk contains the running total for this layer of the /// 3-D process grid only class MultiplyAdd : public TT, std::tuple, Blk>, Out, Blk>>, MultiplyAdd, ttg::typelist> { @@ -453,21 +724,21 @@ class SpMM25D { using baseT = typename MultiplyAdd::ttT; MultiplyAdd(Edge, Blk> &a_ijk, Edge, Blk> &b_ijk, Edge, Blk> &c_ijk, Edge, Blk> &c, - const std::vector> &a_rowidx_to_colidx, - const std::vector> &b_colidx_to_rowidx, const std::vector &mTiles, + const std::vector> &a_cols_of_row, + const std::vector> &b_rows_of_col, const std::vector &mTiles, const std::vector &nTiles, const Keymap3 &ijk_keymap) : baseT(edges(a_ijk, b_ijk, c_ijk), edges(c, c_ijk), "SpMM25D::MultiplyAdd", {"a_ijk", "b_ijk", "c_ijk"}, {"c_ij", "c_ijk"}, ijk_keymap) - , a_rowidx_to_colidx_(a_rowidx_to_colidx) - , b_colidx_to_rowidx_(b_colidx_to_rowidx) { - this->set_priomap([=](const Key<3> &ijk) { return this->prio(ijk); }); // map a key to an integral priority value + , a_cols_of_row_(a_cols_of_row) + , b_rows_of_col_(b_rows_of_col) { + this->set_priomap([=,this](const Key<3> &ijk) { return this->prio(ijk); }); // map a key to an integral priority value // for each {i,j} determine first k that contributes AND belongs to this node, // initialize input {i,j,first_k} flow to 0 - for (auto i = 0ul; i != a_rowidx_to_colidx_.size(); ++i) { - if (a_rowidx_to_colidx_[i].empty()) continue; - for (auto j = 0ul; j != b_colidx_to_rowidx_.size(); ++j) { - if (b_colidx_to_rowidx_[j].empty()) continue; + for (auto i = 0ul; i != a_cols_of_row_.size(); ++i) { + if 
(a_cols_of_row_[i].empty()) continue; + for (auto j = 0ul; j != b_rows_of_col_.size(); ++j) { + if (b_rows_of_col_[j].empty()) continue; const auto p = ttg::default_execution_context().rank(); decltype(i) k; @@ -482,7 +753,7 @@ class SpMM25D { #endif this->template in<2>()->send(Key<3>({i, j, k}), zero); } else { - if (tracing() && a_rowidx_to_colidx_.size() * b_colidx_to_rowidx_.size() < 400) + if (tracing() && a_cols_of_row_.size() * b_rows_of_col_.size() < 400) ttg::print("C[", i, "][", j, "] is empty"); } } @@ -520,8 +791,8 @@ class SpMM25D { } private: - const std::vector> &a_rowidx_to_colidx_; - const std::vector> &b_colidx_to_rowidx_; + const std::vector> &a_cols_of_row_; + const std::vector> &b_rows_of_col_; /* Compute the length of the remaining sequence on that tile */ int32_t prio(const Key<3> &key) const { @@ -541,11 +812,11 @@ class SpMM25D { public: // to be able to reuse this logic in SpMM25D // given {i,j} return first k such that A[i][k] and B[k][j] exist std::tuple compute_first_k(long i, long j) const { - const auto &a_k_range = a_rowidx_to_colidx_.at(i); + const auto &a_k_range = a_cols_of_row_.at(i); auto a_iter = a_k_range.begin(); auto a_iter_fence = a_k_range.end(); if (a_iter == a_iter_fence) return std::make_tuple(-1, false); - const auto &b_k_range = b_colidx_to_rowidx_.at(j); + const auto &b_k_range = b_rows_of_col_.at(j); auto b_iter = b_k_range.begin(); auto b_iter_fence = b_k_range.end(); if (b_iter == b_iter_fence) return std::make_tuple(-1, false); @@ -573,11 +844,11 @@ class SpMM25D { // given {i,j,k} such that A[i][k] and B[k][j] exist // return next k such that this condition holds std::tuple compute_next_k(long i, long j, long k) const { - const auto &a_k_range = a_rowidx_to_colidx_.at(i); + const auto &a_k_range = a_cols_of_row_.at(i); auto a_iter_fence = a_k_range.end(); auto a_iter = std::find(a_k_range.begin(), a_iter_fence, k); assert(a_iter != a_iter_fence); - const auto &b_k_range = b_colidx_to_rowidx_.at(j); + const auto &b_k_range = b_rows_of_col_.at(j); auto b_iter_fence = b_k_range.end(); auto b_iter = std::find(b_k_range.begin(), b_iter_fence, k); assert(b_iter != b_iter_fence); @@ -656,10 +927,11 @@ class SpMM25D { Edge, Blk> local_b_ijk_; Edge, Blk> c_ijk_; Edge, Blk> c_ij_p_; - const std::vector> &a_rowidx_to_colidx_; - const std::vector> &b_colidx_to_rowidx_; - const std::vector> &a_colidx_to_rowidx_; - const std::vector> &b_rowidx_to_colidx_; + Edge, void> a_bcast_ctl_, b_bcast_ctl_; + const std::vector> &a_cols_of_row_; + const std::vector> &b_rows_of_col_; + const std::vector> &a_rows_of_col_; + const std::vector> &b_cols_of_row_; std::unique_ptr bcast_a_; std::unique_ptr local_bcast_a_; std::unique_ptr bcast_b_; @@ -668,58 +940,74 @@ class SpMM25D { std::unique_ptr reduce_c_; Keymap2 ij_keymap_; Keymap3 ijk_keymap_; + long parallel_bcasts_; }; -class Control : public TT>>, Control> { +class Control : public TT>>, Control> { using baseT = typename Control::ttT; - int P; - int Q; + int P = 0; + int Q = 0; + int R = 0; public: - explicit Control(Edge> &ctl) : baseT(edges(), edges(ctl), "Control", {}, {"ctl"}), P(0), Q(0) {} + explicit Control(Edge> &ctl) : baseT(edges(), edges(ctl), "Control", {}, {"ctl"}) {} - void op(std::tuple>> &out) const { + void op(std::tuple>> &out) const { for (int p = 0; p < P; p++) { for (int q = 0; q < Q; q++) { - ttg::trace("Control: start computing on process {", p, ", ", q, "}"); - ::sendk<0>(Key<2>{p, q}, out); + for (int r = 0; r < R; r++) { + ttg::trace("Control: start computing on process {", p, ", ", 
q, ", ", r, "}"); + ::sendk<0>(Key<3>{p, q, r}, out); + } } } } - void start(const int _p, const int _q) { + void start(const int _p, const int _q, const int _r) { P = _p; Q = _q; + R = _r; invoke(); } }; +std::tuple norms(float t) { return std::make_tuple(t * t, std::abs(t)); } +std::tuple norms(double t) { return std::make_tuple(t * t, std::abs(t)); } + +template +std::tuple norms(std::complex t) { + auto abs_t = std::abs(t); + return std::make_tuple(abs_t * abs_t, abs_t); +} + #ifdef BTAS_IS_USABLE template -std::tuple norms(const btas::Tensor &t) { - T_ norm_2_square = 0.0; - T_ norm_inf = 0.0; - for (auto k : t) { - norm_2_square += k * k; - norm_inf = std::max(norm_inf, std::abs(k)); +auto norms(const btas::Tensor &t) { + using T = decltype(std::abs(std::declval())); + T norm_2_square = 0.0; + T norm_inf = 0.0; + for (auto elem : t) { + T elem_norm_2_square, elem_norm_inf; + std::tie(elem_norm_2_square, elem_norm_inf) = norms(elem); + norm_2_square += elem_norm_2_square; + norm_inf = std::max(norm_inf, elem_norm_inf); } return std::make_tuple(norm_2_square, norm_inf); } #endif -std::tuple norms(double t) { return std::make_tuple(t * t, std::abs(t)); } - -template -std::tuple norms(const SpMatrix &A) { - double norm_2_square = 0.0; - double norm_inf = 0.0; +template +auto norms(const SpMatrix &A) { + using T = scalar_t; + T norm_2_square = 0.0; + T norm_inf = 0.0; for (int i = 0; i < A.outerSize(); ++i) { for (typename SpMatrix::InnerIterator it(A, i); it; ++it) { // cout << 1+it.row() << "\t"; // row index // cout << 1+it.col() << "\t"; // col index (here it is equal to k) // cout << it.value() << endl; auto elem = it.value(); - double elem_norm_2_square, elem_norm_inf; + T elem_norm_2_square, elem_norm_inf; std::tie(elem_norm_2_square, elem_norm_inf) = norms(elem); norm_2_square += elem_norm_2_square; norm_inf = std::max(norm_inf, elem_norm_inf); @@ -909,10 +1197,10 @@ static void initSpHardCoded(const std::function &)> &keymap, Sp static void initBlSpHardCoded(const std::function &)> &keymap, SpMatrix<> &A, SpMatrix<> &B, SpMatrix<> &C, SpMatrix<> &Aref, SpMatrix<> &Bref, bool buildRefs, std::vector &mTiles, std::vector &nTiles, std::vector &kTiles, - std::vector> &a_rowidx_to_colidx, - std::vector> &a_colidx_to_rowidx, - std::vector> &b_rowidx_to_colidx, - std::vector> &b_colidx_to_rowidx, int &m, int &n, int &k) { + std::vector> &a_cols_of_row, + std::vector> &a_rows_of_col, + std::vector> &b_cols_of_row, + std::vector> &b_rows_of_col, int &m, int &n, int &k) { m = 2; n = 3; k = 4; @@ -982,19 +1270,19 @@ static void initBlSpHardCoded(const std::function &)> &keymap, Aref_elements.emplace_back(1, 2, .2); } #endif - a_rowidx_to_colidx.resize(2); - a_rowidx_to_colidx[0].emplace_back(1); // A[0][1] - a_rowidx_to_colidx[0].emplace_back(2); // A[0][2] - a_rowidx_to_colidx[0].emplace_back(3); // A[0][3] - a_rowidx_to_colidx[1].emplace_back(0); // A[1][0] - a_rowidx_to_colidx[1].emplace_back(2); // A[1][2] - - a_colidx_to_rowidx.resize(4); - a_colidx_to_rowidx[0].emplace_back(1); // A[1][0] - a_colidx_to_rowidx[1].emplace_back(0); // A[0][1] - a_colidx_to_rowidx[2].emplace_back(0); // A[0][2] - a_colidx_to_rowidx[2].emplace_back(1); // A[1][2] - a_colidx_to_rowidx[3].emplace_back(0); // A[0][3] + a_cols_of_row.resize(2); + a_cols_of_row[0].emplace_back(1); // A[0][1] + a_cols_of_row[0].emplace_back(2); // A[0][2] + a_cols_of_row[0].emplace_back(3); // A[0][3] + a_cols_of_row[1].emplace_back(0); // A[1][0] + a_cols_of_row[1].emplace_back(2); // A[1][2] + + a_rows_of_col.resize(4); + 
a_rows_of_col[0].emplace_back(1); // A[1][0] + a_rows_of_col[1].emplace_back(0); // A[0][1] + a_rows_of_col[2].emplace_back(0); // A[0][2] + a_rows_of_col[2].emplace_back(1); // A[1][2] + a_rows_of_col[3].emplace_back(0); // A[0][3] A.setFromTriplets(A_elements.begin(), A_elements.end()); std::cout << "A_elements.begin()" << A_elements.begin() << "A_elements.end()" << A_elements.end() << "\n"; @@ -1059,23 +1347,23 @@ static void initBlSpHardCoded(const std::function &)> &keymap, B_elements.emplace_back(3, 2, 0.2); } #endif - b_rowidx_to_colidx.resize(4); - b_rowidx_to_colidx[0].emplace_back(0); // B[0][0] - b_rowidx_to_colidx[1].emplace_back(0); // B[1][0] - b_rowidx_to_colidx[1].emplace_back(1); // B[1][1] - b_rowidx_to_colidx[1].emplace_back(2); // B[1][2] - b_rowidx_to_colidx[2].emplace_back(2); // B[2][2] - b_rowidx_to_colidx[3].emplace_back(0); // B[3][0] - b_rowidx_to_colidx[3].emplace_back(2); // B[3][2] - - b_colidx_to_rowidx.resize(3); - b_colidx_to_rowidx[0].emplace_back(0); // B[0][0] - b_colidx_to_rowidx[0].emplace_back(1); // B[1][0] - b_colidx_to_rowidx[0].emplace_back(3); // B[3][0] - b_colidx_to_rowidx[1].emplace_back(1); // B[1][1] - b_colidx_to_rowidx[2].emplace_back(1); // B[1][2] - b_colidx_to_rowidx[2].emplace_back(2); // B[2][2] - b_colidx_to_rowidx[2].emplace_back(3); // A[3][2] + b_cols_of_row.resize(4); + b_cols_of_row[0].emplace_back(0); // B[0][0] + b_cols_of_row[1].emplace_back(0); // B[1][0] + b_cols_of_row[1].emplace_back(1); // B[1][1] + b_cols_of_row[1].emplace_back(2); // B[1][2] + b_cols_of_row[2].emplace_back(2); // B[2][2] + b_cols_of_row[3].emplace_back(0); // B[3][0] + b_cols_of_row[3].emplace_back(2); // B[3][2] + + b_rows_of_col.resize(3); + b_rows_of_col[0].emplace_back(0); // B[0][0] + b_rows_of_col[0].emplace_back(1); // B[1][0] + b_rows_of_col[0].emplace_back(3); // B[3][0] + b_rows_of_col[1].emplace_back(1); // B[1][1] + b_rows_of_col[2].emplace_back(1); // B[1][2] + b_rows_of_col[2].emplace_back(2); // B[2][2] + b_rows_of_col[2].emplace_back(3); // A[3][2] B.setFromTriplets(B_elements.begin(), B_elements.end()); if (buildRefs && 0 == rank) { @@ -1087,10 +1375,10 @@ static void initBlSpHardCoded(const std::function &)> &keymap, static void initBlSpRandom(const std::function &)> &keymap, size_t M, size_t N, size_t K, int minTs, int maxTs, double avgDensity, SpMatrix<> &A, SpMatrix<> &B, SpMatrix<> &Aref, SpMatrix<> &Bref, bool buildRefs, std::vector &mTiles, std::vector &nTiles, - std::vector &kTiles, std::vector> &a_rowidx_to_colidx, - std::vector> &a_colidx_to_rowidx, - std::vector> &b_rowidx_to_colidx, - std::vector> &b_colidx_to_rowidx, double &average_tile_size, + std::vector &kTiles, std::vector> &a_cols_of_row, + std::vector> &a_rows_of_col, + std::vector> &b_cols_of_row, + std::vector> &b_rows_of_col, double &average_tile_size, double &Adensity, double &Bdensity, unsigned int seed) { int rank = ttg::default_execution_context().rank(); @@ -1137,7 +1425,7 @@ static void initBlSpRandom(const std::function &)> &keymap, siz size_t avg_nb = 0; int avg_nb_nb = 0; - struct tuple_hash : public std::unary_function, std::size_t> { + struct tuple_hash { std::size_t operator()(const std::tuple &k) const { return static_cast(std::get<0>(k)) | (static_cast(std::get<1>(k)) << 32); } @@ -1153,10 +1441,10 @@ static void initBlSpRandom(const std::function &)> &keymap, siz if (fills.find({mt, kt}) != fills.end()) continue; fills.insert({mt, kt}); - if (mt >= a_rowidx_to_colidx.size()) a_rowidx_to_colidx.resize(mt + 1); - 
a_rowidx_to_colidx[mt].emplace_back(kt); - if (kt >= a_colidx_to_rowidx.size()) a_colidx_to_rowidx.resize(kt + 1); - a_colidx_to_rowidx[kt].emplace_back(mt); + if (mt >= a_cols_of_row.size()) a_cols_of_row.resize(mt + 1); + a_cols_of_row[mt].emplace_back(kt); + if (kt >= a_rows_of_col.size()) a_rows_of_col.resize(kt + 1); + a_rows_of_col[kt].emplace_back(mt); filling += mTiles[mt] * kTiles[kt]; avg_nb += mTiles[mt] * kTiles[kt]; @@ -1166,10 +1454,10 @@ static void initBlSpRandom(const std::function &)> &keymap, siz if (rank != keymap({mt, kt})) continue; A_elements.emplace_back(mt, kt, blk_t(btas::Range(mTiles[mt], kTiles[kt]), value)); } - for (auto &row : a_rowidx_to_colidx) { + for (auto &row : a_cols_of_row) { std::sort(row.begin(), row.end()); } - for (auto &col : a_colidx_to_rowidx) { + for (auto &col : a_rows_of_col) { std::sort(col.begin(), col.end()); } A.setFromTriplets(A_elements.begin(), A_elements.end()); @@ -1185,10 +1473,10 @@ static void initBlSpRandom(const std::function &)> &keymap, siz if (fills.find({kt, nt}) != fills.end()) continue; fills.insert({kt, nt}); - if (kt >= b_rowidx_to_colidx.size()) b_rowidx_to_colidx.resize(kt + 1); - b_rowidx_to_colidx[kt].emplace_back(nt); - if (nt >= b_colidx_to_rowidx.size()) b_colidx_to_rowidx.resize(nt + 1); - b_colidx_to_rowidx[nt].emplace_back(kt); + if (kt >= b_cols_of_row.size()) b_cols_of_row.resize(kt + 1); + b_cols_of_row[kt].emplace_back(nt); + if (nt >= b_rows_of_col.size()) b_rows_of_col.resize(nt + 1); + b_rows_of_col[nt].emplace_back(kt); filling += kTiles[kt] * nTiles[nt]; avg_nb += kTiles[kt] * nTiles[nt]; @@ -1198,10 +1486,10 @@ static void initBlSpRandom(const std::function &)> &keymap, siz if (rank != keymap({kt, nt})) continue; B_elements.emplace_back(kt, nt, blk_t(btas::Range(kTiles[kt], nTiles[nt]), value)); } - for (auto &row : b_rowidx_to_colidx) { + for (auto &row : b_cols_of_row) { std::sort(row.begin(), row.end()); } - for (auto &col : b_colidx_to_rowidx) { + for (auto &col : b_rows_of_col) { std::sort(col.begin(), col.end()); } B.setFromTriplets(B_elements.begin(), B_elements.end()); @@ -1218,12 +1506,12 @@ static void initBlSpRandom(const std::function &)> &keymap, siz static void timed_measurement(SpMatrix<> &A, SpMatrix<> &B, const std::function &)> &ij_keymap, const std::function &)> &ijk_keymap, const std::string &tiling_type, double gflops, double avg_nb, double Adensity, double Bdensity, - const std::vector> &a_rowidx_to_colidx, - const std::vector> &a_colidx_to_rowidx, - const std::vector> &b_rowidx_to_colidx, - const std::vector> &b_colidx_to_rowidx, std::vector &mTiles, + const std::vector> &a_cols_of_row, + const std::vector> &a_rows_of_col, + const std::vector> &b_cols_of_row, + const std::vector> &b_rows_of_col, std::vector &mTiles, std::vector &nTiles, std::vector &kTiles, int M, int N, int K, int minTs, - int maxTs, int P, int Q, int R) { + int maxTs, int P, int Q, int R, int parallel_bcasts) { int MT = (int)A.rows(); int NT = (int)B.cols(); int KT = (int)A.cols(); @@ -1232,20 +1520,25 @@ static void timed_measurement(SpMatrix<> &A, SpMatrix<> &B, const std::function< SpMatrix<> C; C.resize(MT, NT); + /* the Read_SpMatrix tasks get process coordinates, not tile coordinates */ + auto read_keymap = [&](const Key<3>& key){ + return ijk2rank(key[0], key[1], key[2], P, Q, R); + }; + // flow graph needs to exist on every node - Edge> ctl("control"); + Edge> ctl("control"); Control control(ctl); Edge, blk_t> eA, eB; Edge, blk_t> eC; - Read_SpMatrix a("A", A, ctl, eA, ij_keymap); - Read_SpMatrix 
b("B", B, ctl, eB, ij_keymap); - Write_SpMatrix<> c(C, eC, ij_keymap); + Read_SpMatrix a("A", A, ctl, eA, read_keymap, ij_keymap); + Read_SpMatrix b("B", B, ctl, eB, read_keymap, ij_keymap); + Write_SpMatrix<> c(C, eC, ij_keymap, false); auto &c_status = c.status(); assert(!has_value(c_status)); // SpMM25D a_times_b(world, eA, eB, eC, A, B); - SpMM25D<> a_times_b(eA, eB, eC, A, B, a_rowidx_to_colidx, a_colidx_to_rowidx, b_rowidx_to_colidx, b_colidx_to_rowidx, - mTiles, nTiles, kTiles, ij_keymap, ijk_keymap, R); + SpMM25D<> a_times_b(eA, eB, eC, A, B, a_cols_of_row, a_rows_of_col, b_cols_of_row, b_rows_of_col, + mTiles, nTiles, kTiles, ij_keymap, ijk_keymap, R, parallel_bcasts); TTGUNUSED(a); TTGUNUSED(b); TTGUNUSED(a_times_b); @@ -1259,7 +1552,7 @@ static void timed_measurement(SpMatrix<> &A, SpMatrix<> &B, const std::function< }, end{0}, diff{0}; gettimeofday(&start, nullptr); // ready, go! need only 1 kick, so must be done by 1 thread only - if (ttg::default_execution_context().rank() == 0) control.start(P, Q); + if (ttg::default_execution_context().rank() == 0) control.start(P, Q, R); fence(); gettimeofday(&end, nullptr); timersub(&end, &start, &diff); @@ -1277,10 +1570,11 @@ static void timed_measurement(SpMatrix<> &A, SpMatrix<> &B, const std::function< << " A_density= " << Adensity << " B_density= " << Bdensity << " gflops= " << gflops << " seconds= " << tc << " gflops/s= " << gflops / tc << std::endl; } + //std::cout << "num reductions " << reduce_count.load() << " tiles " << MT*KT << std::endl; } #if !defined(BLOCK_SPARSE_GEMM) -static void make_rowidx_to_colidx_from_eigen(const SpMatrix<> &mat, std::vector> &r2c) { +static void make_cols_of_row_from_eigen(const SpMatrix<> &mat, std::vector> &r2c) { for (int k = 0; k < mat.outerSize(); ++k) { // cols, if col-major, rows otherwise for (typename SpMatrix::InnerIterator it(mat, k); it; ++it) { const long row = it.row(); @@ -1295,7 +1589,7 @@ static void make_rowidx_to_colidx_from_eigen(const SpMatrix<> &mat, std::vector< } } -static void make_colidx_to_rowidx_from_eigen(const SpMatrix<> &mat, std::vector> &c2r) { +static void make_rows_of_col_from_eigen(const SpMatrix<> &mat, std::vector> &c2r) { for (int k = 0; k < mat.outerSize(); ++k) { // cols, if col-major, rows otherwise for (typename SpMatrix::InnerIterator it(mat, k); it; ++it) { const long row = it.row(); @@ -1312,6 +1606,13 @@ static void make_colidx_to_rowidx_from_eigen(const SpMatrix<> &mat, std::vector< } #endif +/* where to distribute the work to */ +enum class WORKDIST { + A = 0, // distribute work based on A's distribution + B = 1, // distribute work based on B's distribution + C = 2, // distribute work based on C's distribution +}; + static double compute_gflops(const std::vector> &a_r2c, const std::vector> &b_r2c, const std::vector &mTiles, const std::vector &nTiles, const std::vector &kTiles) { @@ -1398,10 +1699,15 @@ int main(int argc, char **argv) { std::string tiling_type; int M = 0, N = 0, K = 0; int minTs = 0, maxTs = 0; + int parallel_bcasts = std::numeric_limits::max(); double avg_nb = nan("undefined"); double Adensity = nan("undefined"); double Bdensity = nan("undefined"); + if (cmdOptionExists(argv, argv + argc, "-b")) { + std::string pStr = getCmdOption(argv, argv + argc, "-b"); + parallel_bcasts = std::stol(pStr); + } std::string PStr(getCmdOption(argv, argv + argc, "-P")); P = parseOption(PStr, P); @@ -1426,23 +1732,52 @@ int main(int argc, char **argv) { } } - auto ij_keymap = [P, Q](const Key<2> &ij) { + WORKDIST dist = WORKDIST::C; + if 
(cmdOptionExists(argv, argv + argc, "-D")) { + std::string DStr(getCmdOption(argv, argv+argc, "-D")); + if (DStr == "a") { + dist = WORKDIST::A; + } else if (DStr == "b") { + dist = WORKDIST::B; + } else if (DStr == "c") { + dist = WORKDIST::C; + } + } + + auto ij_keymap = [P, Q, R](const Key<2> &ij) { int i = (int)ij[0]; int j = (int)ij[1]; - int r = ij2rank(i, j, P, Q); + int r = ij2rank(i, j, P, Q, R); return r; }; - auto ijk_keymap = [P, Q, R](const Key<3> &ijk) { - int i = (int)ijk[0]; - int j = (int)ijk[1]; - int k = (int)ijk[2]; - int r = ijk2rank(i, j, k, P, Q, R); - return r; - }; + std::function &ijk)> ijk_keymap; + + if (dist == WORKDIST::A) { + ijk_keymap = [&](const Key<3> &ijk) { + int i = ijk[0], j = ijk[1], k = ijk[2]; + return ij2rank(i, k, P, Q, R); + }; + } else if (dist == WORKDIST::B) { + ijk_keymap = [&](const Key<3> &ijk) { + int i = ijk[0], j = ijk[1], k = ijk[2]; + return ij2rank(k, j, P, Q, R); + }; + } else if (dist == WORKDIST::C) { + ijk_keymap = [&](const Key<3> &ijk) { + int i = ijk[0], j = ijk[1], k = ijk[2]; + return ij2rank(i, j, P, Q, R); + }; + } else { + ijk_keymap = [&](const Key<3> &ijk) { + int i = ijk[0], j = ijk[1], k = ijk[2]; + int r = ijk2rank(i, j, k, P, Q, R); + return r; + }; + } std::string seedStr(getCmdOption(argv, argv + argc, "-s")); - unsigned int seed = parseOption(seedStr, 0); + unsigned long seed = parseOption(seedStr, 0L); if (seed == 0) { std::random_device rd; seed = rd(); @@ -1453,10 +1788,10 @@ int main(int argc, char **argv) { std::vector mTiles; std::vector nTiles; std::vector kTiles; - std::vector> a_rowidx_to_colidx; - std::vector> a_colidx_to_rowidx; - std::vector> b_rowidx_to_colidx; - std::vector> b_colidx_to_rowidx; + std::vector> a_cols_of_row; + std::vector> a_rows_of_col; + std::vector> b_cols_of_row; + std::vector> b_rows_of_col; std::string checkStr(getCmdOption(argv, argv + argc, "-x")); int check = parseOption(checkStr, !(argc >= 2)); @@ -1484,10 +1819,10 @@ int main(int argc, char **argv) { } // We still need to build the metadata from the matrices. 
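// For orientation: the metadata built below is just a pair of adjacency lists
// over the nonzero tiles (for each row, its nonzero columns, and vice versa).
// A self-contained sketch of the row-to-columns direction for a col-major
// Eigen sparse matrix; illustrative only, the real helpers are the
// make_cols_of_row_from_eigen / make_rows_of_col_from_eigen functions defined
// earlier in this file:
#include <Eigen/SparseCore>
#include <algorithm>
#include <vector>

using Mat = Eigen::SparseMatrix<double>;  // col-major by default

std::vector<std::vector<long>> cols_of_row(const Mat &mat) {
  std::vector<std::vector<long>> r2c;
  for (int k = 0; k < mat.outerSize(); ++k) {        // outer index = column
    for (Mat::InnerIterator it(mat, k); it; ++it) {  // inner index = row
      const long row = it.row();
      if (row >= static_cast<long>(r2c.size())) r2c.resize(row + 1);
      r2c[row].push_back(it.col());
    }
  }
  // col-major traversal already yields sorted column lists; sort anyway to
  // stay layout-agnostic, as the random-matrix initializer does
  for (auto &cols : r2c) std::sort(cols.begin(), cols.end());
  return r2c;
}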
- make_rowidx_to_colidx_from_eigen(A, a_rowidx_to_colidx); - make_colidx_to_rowidx_from_eigen(A, a_colidx_to_rowidx); - make_rowidx_to_colidx_from_eigen(B, b_rowidx_to_colidx); - make_colidx_to_rowidx_from_eigen(B, b_colidx_to_rowidx); + make_cols_of_row_from_eigen(A, a_cols_of_row); + make_rows_of_col_from_eigen(A, a_rows_of_col); + make_cols_of_row_from_eigen(B, b_cols_of_row); + make_rows_of_col_from_eigen(B, b_rows_of_col); // This is only needed to compute the flops for (int mt = 0; mt < M; mt++) mTiles.emplace_back(1); for (int nt = 0; nt < N; nt++) nTiles.emplace_back(1); @@ -1509,18 +1844,18 @@ int main(int argc, char **argv) { timing = (check == 0); tiling_type = "RandomIrregularTiling"; initBlSpRandom(ij_keymap, M, N, K, minTs, maxTs, avg, A, B, Aref, Bref, check, mTiles, nTiles, kTiles, - a_rowidx_to_colidx, a_colidx_to_rowidx, b_rowidx_to_colidx, b_colidx_to_rowidx, avg_nb, Adensity, + a_cols_of_row, a_rows_of_col, b_cols_of_row, b_rows_of_col, avg_nb, Adensity, Bdensity, seed); C.resize(mTiles.size(), nTiles.size()); } else { tiling_type = "HardCodedBlockSparseMatrix"; - initBlSpHardCoded(ij_keymap, A, B, C, Aref, Bref, true, mTiles, nTiles, kTiles, a_rowidx_to_colidx, - a_colidx_to_rowidx, b_rowidx_to_colidx, b_colidx_to_rowidx, M, N, K); + initBlSpHardCoded(ij_keymap, A, B, C, Aref, Bref, true, mTiles, nTiles, kTiles, a_cols_of_row, + a_rows_of_col, b_cols_of_row, b_rows_of_col, M, N, K); } #endif // !defined(BLOCK_SPARSE_GEMM) - gflops = compute_gflops(a_rowidx_to_colidx, b_rowidx_to_colidx, mTiles, nTiles, kTiles); + gflops = compute_gflops(a_cols_of_row, b_cols_of_row, mTiles, nTiles, kTiles); std::string nbrunStr(getCmdOption(argv, argv + argc, "-n")); int nb_runs = parseOption(nbrunStr, 1); @@ -1530,24 +1865,29 @@ int main(int argc, char **argv) { execute(); for (int nrun = 0; nrun < nb_runs; nrun++) { timed_measurement(A, B, ij_keymap, ijk_keymap, tiling_type, gflops, avg_nb, Adensity, Bdensity, - a_rowidx_to_colidx, a_colidx_to_rowidx, b_rowidx_to_colidx, b_colidx_to_rowidx, mTiles, - nTiles, kTiles, M, N, K, minTs, maxTs, P, Q, R); + a_cols_of_row, a_rows_of_col, b_cols_of_row, b_rows_of_col, mTiles, + nTiles, kTiles, M, N, K, minTs, maxTs, P, Q, R, parallel_bcasts); } } else { // flow graph needs to exist on every node // N.B. to validate C we need it on node 0! 
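      // A worked example (hypothetical grid, for illustration only): assuming the
      // read_keymap below uses the layout of ijk2rank defined in these examples
      // (rank = l*P*Q + q*P + p), then with P = 2, Q = 2, R = 2 the reader task for
      // process coordinate {p=1, q=0, l=1} runs on rank 1*4 + 0*2 + 1 = 5.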
auto keymap_write = [](const Key<2> &key) { return 0; }; - Edge> ctl("control"); + + /* the Read_SpMatrix tasks get process coordinates, not tile coordinates */ + auto read_keymap = [&](const Key<3>& key){ + return ijk2rank(key[0], key[1], key[2], P, Q, R); + }; + Edge> ctl("control"); Control control(ctl); Edge, blk_t> eA, eB, eC; - Read_SpMatrix a("A", A, ctl, eA, ij_keymap); - Read_SpMatrix b("B", B, ctl, eB, ij_keymap); + Read_SpMatrix a("A", A, ctl, eA, read_keymap, ij_keymap); + Read_SpMatrix b("B", B, ctl, eB, read_keymap, ij_keymap); Write_SpMatrix<> c(C, eC, keymap_write); auto &c_status = c.status(); assert(!has_value(c_status)); // SpMM25D a_times_b(world, eA, eB, eC, A, B); - SpMM25D<> a_times_b(eA, eB, eC, A, B, a_rowidx_to_colidx, a_colidx_to_rowidx, b_rowidx_to_colidx, - b_colidx_to_rowidx, mTiles, nTiles, kTiles, ij_keymap, ijk_keymap, R); + SpMM25D<> a_times_b(eA, eB, eC, A, B, a_cols_of_row, a_rows_of_col, b_cols_of_row, + b_rows_of_col, mTiles, nTiles, kTiles, ij_keymap, ijk_keymap, R); TTGUNUSED(a_times_b); // calling the Dot constructor with 'true' argument disables the type if (default_execution_context().rank() == 0) std::cout << Dot{/*disable_type=*/true}(&control) << std::endl; @@ -1558,7 +1898,7 @@ int main(int argc, char **argv) { TTGUNUSED(connected); // ready, go! need only 1 kick, so must be done by 1 thread only - if (ttg::default_execution_context().rank() == 0) control.start(P, Q); + if (ttg::default_execution_context().rank() == 0) control.start(P, Q, R); execute(); fence(); diff --git a/examples/spmm/spmm_cuda.cc b/examples/spmm/spmm_cuda.cc new file mode 100644 index 000000000..90cca96f8 --- /dev/null +++ b/examples/spmm/spmm_cuda.cc @@ -0,0 +1,1966 @@ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#if __has_include() +#pragma message("C Preprocessor got here!") +#include +#ifdef BTAS_IS_USABLE +#include +#include +#include +#include +#include "../devblas_helper.h" +#include // need to initialize MADNESS purely for the purposes of TA allocators +#else +#warning "found btas/features.h but Boost.Iterators is missing, hence BTAS is unusable ... add -I/path/to/boost" +#endif +#endif + +#include +#include +#if !defined(BLOCK_SPARSE_GEMM) +#include +#include +#include +#endif + +#include "ttg.h" + +#include "../devblas_helper.h" + +using namespace ttg; + +#include "ttg/util/future.h" + +#include "ttg/util/multiindex.h" + +#include "ttg/util/bug.h" + +#include "ttg/serialization/std/pair.h" + +#if defined(TTG_HAVE_LEVEL_ZERO) +#include +#include +#endif + +#if defined(BLOCK_SPARSE_GEMM) && defined(BTAS_IS_USABLE) + +template +struct DeviceTensor : public ttg::TTValue> + , public btas::Tensor<_T, _Range, _Storage> { + using tensor_type = typename btas::Tensor<_T, _Range, _Storage>; + using ttvalue_type = typename ttg::TTValue>; + ttg::Buffer<_T> b; // does not own the host buffer + + using value_type = typename tensor_type::value_type; + using size_type = typename tensor_type::size_type; + using storage_type = typename tensor_type::storage_type; + using range_type = typename tensor_type::range_type; + + + public: + DeviceTensor() = default; + ~DeviceTensor() = default; + + /// constructor with index extent + template + explicit DeviceTensor(const size_type& first, const _args&... rest) + : ttvalue_type() + , tensor_type(first, rest...) + , b(this->size() ? 
this->data() : nullptr, this->size()) + { } + + /// construct from \c range, allocate data, but not initialized + template + explicit DeviceTensor(const Range& range, typename std::enable_if::value>::type* = 0) + : ttvalue_type() + , tensor_type(range) + , b(this->size() ? this->data() : nullptr, this->size()) + { } + + /// construct from \c range object, set all elements to \c v + template + DeviceTensor(const Range& range, value_type v, typename std::enable_if::value>::type* = 0) + : ttvalue_type() + , tensor_type(range) + , b(this->size() ? this->data() : nullptr, this->size()) + { } + + /// construct from \c range object, copy elements from \c vec + template + DeviceTensor(const Range& range, U* vec, typename std::enable_if::value>::type* = 0) + : ttvalue_type() + , tensor_type(range, vec) + , b(this->size() ? this->data() : nullptr, this->size()) + { } + + /// construct from \c range and \c storage + template + DeviceTensor(const Range& range, const Storage& storage, + typename std::enable_if::value & not std::is_same::value & + not std::is_same::value>::type* = 0) + : ttvalue_type() + , tensor_type(range, storage) + , b(this->size() ? this->data() : nullptr, this->size()) + { } + + /// copy-copy-construct from \c range and \c storage + DeviceTensor(const range_type& range, const storage_type& storage) + : ttvalue_type() + , tensor_type(range, storage) + , b(this->size() ? this->data() : nullptr, this->size()) + { } + + /// copy-move-construct from \c range and \c storage + DeviceTensor(const range_type& range, storage_type&& storage) + : ttvalue_type() + , tensor_type(range, std::forward(storage)) + , b(this->size() ? this->data() : nullptr, this->size()) + { } + + /// move-construct from \c range and \c storage + DeviceTensor(range_type&& range, storage_type&& storage) + : ttvalue_type() + , tensor_type(std::forward(range), std::forward(storage)) + , b(this->size() ? this->data() : nullptr, this->size()) + { } + + /// Construct an evaluated tensor + + /// This constructor will allocate memory for \c range.area() elements. Each element + /// will be initialized as: + /// \code + /// for(auto&& idx: range) + /// (*this)[idx] = op(*(it++)); + /// \endcode + /// \tparam Range An input Range type. + /// \tparam InIter An input iterator type. + /// \tparam Op A unary operation type + /// \param range the input range type + /// \param first An input iterator for the argument + /// \param op The unary operation to be applied to the argument data + template + DeviceTensor(const Range& range, InIter it, const Op& op, + typename std::enable_if::value>::type* = 0) + : ttvalue_type() + , tensor_type(range, it, op) + , b(this->size() ? this->data() : nullptr, this->size()) + { } + + /// copy constructor + /// It will accept Tensors and TensorViews + template ::value>::type> + DeviceTensor(const _Tensor& x) noexcept + : ttvalue_type() + , tensor_type(x.clone()) + , b(this->size() ? this->data() : nullptr, this->size()) + { + //std::cout << "DeviceTensor tensor_type copy ctor" << std::endl; + } + + /// copy constructor: devicebuf cannot be copied, so deleted + DeviceTensor(const DeviceTensor& x) noexcept + : ttvalue_type(x) + , tensor_type(x.clone()) + , b(this->size() ? this->data() : nullptr, this->size()) + { + //std::cout << "DeviceTensor copy ctor" << std::endl; + } + + /// move constructor + DeviceTensor(tensor_type&& x) noexcept + : ttvalue_type() + , tensor_type(std::move(x)) + , b(this->size() ? 
this->data() : nullptr, this->size()) + { + //std::cout << "DeviceTensor tensor_type move ctor" << std::endl; + } + + DeviceTensor(DeviceTensor&& x) noexcept + : ttvalue_type(std::move(x)) + , tensor_type(std::move(x)) + /* Grrrr, moving a Tensor does not guarantee to move the pointer */ + , b((this->size() == 0 || + this->data() == x.b.host_ptr()) ? std::move(x.b) + : ttg::Buffer<_T>(this->size() ? this->data() + : nullptr, + this->size())) + { + assert(this->data() == b.host_ptr()); + //std::cout << "DeviceTensor move ctor" << std::endl; + } + + /// copy assignment operator + template ::value && + not std::is_same::value>::type> + DeviceTensor& operator=(const _Tensor& x) noexcept { + tensor_type::operator=(x.clone()); + b.reset(this->size() ? this->data() : nullptr, this->size()); + //std::cout << "DeviceTensor tensor_type copy operator" << std::endl; + return *this; + } + + /// copy assignment operator + template ::value>::type, + class = typename std::enable_if< + std::is_same::value>::type> + DeviceTensor& operator=(const _Tensor& x) noexcept { + tensor_type::operator=(x.clone()); + b.reset(this->size() ? this->data() : nullptr, this->size()); + //std::cout << "DeviceTensor tensor_type copy operator" << std::endl; + return *this; + } + + /// copy assignment: devicebuf cannot be copied, deleted + DeviceTensor& operator=(const DeviceTensor& x) noexcept { + ttvalue_type::operator=(x); + tensor_type::operator=(x.clone()); + b.reset(this->size() ? this->data() : nullptr, this->size()); + //std::cout << "DeviceTensor copy operator" << std::endl; + return *this; + } + + /// move assignment operator + DeviceTensor& operator=(DeviceTensor&& x) noexcept { + ttvalue_type::operator=(std::move(x)); + tensor_type::operator=(std::move(x)); + if (this->size() == 0 || this->data() == x.b.host_ptr()){ + b = std::move(x.b); + } else { + b = ttg::Buffer<_T>(this->size() ? this->data() : nullptr, this->size()); + } + //std::swap(x.b, b); + //std::cout << "DeviceTensor move ctor" << std::endl; + return *this; + } + + using tensor_type::begin; + using tensor_type::cbegin; + using tensor_type::end; + using tensor_type::cend; + +}; + +using scalar_t = double; +#if defined(TTG_HAVE_CUDA) || defined(TTG_HAVE_HIPBLAS) +using blk_t = DeviceTensor>, + btas::Handle::shared_ptr>>; +#else +using blk_t = DeviceTensor, btas::Handle::shared_ptr>>; +#endif + + +//inline blk_t operator*(const blk_t &A, const blk_t &B) { +// blk_t::tensor_type c; +// btas::contract(1.0, A, {1, 2}, B, {2, 3}, 0.0, c, {1, 3}); +// return blk_t(std::move(c)); +//} + +/* TODO: call CUDA gemm here */ +template +static void device_gemm(Blk &C, const Blk &A, const Blk &B) { + using blk_t = Blk; + using T = typename blk_t::value_type; + static_assert(std::is_same_v || std::is_same_v); + static const T alpha = 1.0; + static const T beta = 1.0; + // make sure all memory is on the device + // TODO: A and B are read-only so the owner device will be 0. How to fix? 
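+  // The vendor calls below all compute C = alpha*A*B + beta*C on column-major tiles:
+  // m = C.extent(0), n = C.extent(1), k = A.extent(1), and each leading dimension is
+  // the corresponding row extent; since alpha = beta = 1, every call accumulates this
+  // tile product into the running total held in C.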
+  //assert(A.b.get_current_device() != 0);
+  //assert(B.b.get_current_device() != 0);
+  auto device = ttg::device::current_device();
+  assert(device.is_device());
+#if defined(TTG_HAVE_CUDA)
+  if constexpr (std::is_same_v<T, double>) {
+    cublasDgemm(cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, C.extent(0), C.extent(1), A.extent(1),
+                &alpha, A.b.current_device_ptr(), A.extent(0), B.b.current_device_ptr(), B.extent(0), &beta,
+                C.b.current_device_ptr(), C.extent(0));
+  }
+  else if constexpr (std::is_same_v<T, float>) {
+    cublasSgemm(cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, C.extent(0), C.extent(1), A.extent(1),
+                &alpha, A.b.current_device_ptr(), A.extent(0), B.b.current_device_ptr(), B.extent(0), &beta,
+                C.b.current_device_ptr(), C.extent(0));
+  }
+#elif defined(TTG_HAVE_HIPBLAS)
+  if constexpr (std::is_same_v<T, double>) {
+    hipblasDgemm(hipblas_handle(),
+                 HIPBLAS_OP_N, HIPBLAS_OP_N,
+                 C.extent(0), C.extent(1), A.extent(1), &alpha,
+                 A.b.current_device_ptr(), A.extent(0),
+                 B.b.current_device_ptr(), B.extent(0), &beta,
+                 C.b.current_device_ptr(), C.extent(0));
+  } else if constexpr (std::is_same_v<T, float>) {
+    hipblasSgemm(hipblas_handle(),
+                 HIPBLAS_OP_N, HIPBLAS_OP_N,
+                 C.extent(0), C.extent(1), A.extent(1), &alpha,
+                 A.b.current_device_ptr(), A.extent(0),
+                 B.b.current_device_ptr(), B.extent(0), &beta,
+                 C.b.current_device_ptr(), C.extent(0));
+  }
+#elif defined(TTG_HAVE_LEVEL_ZERO)
+
+#if defined(DEBUG_SYNCHRONOUS)
+  try {
+#endif /* DEBUG_SYNCHRONOUS */
+    cl::sycl::event gemm_event;
+    gemm_event = oneapi::mkl::blas::gemm(ttg::device::current_stream(),
+                                         oneapi::mkl::transpose::N, oneapi::mkl::transpose::N,
+                                         C.extent(0), C.extent(1), A.extent(1),
+                                         alpha, A.b.current_device_ptr(), A.extent(0),
+                                         B.b.current_device_ptr(), B.extent(0),
+                                         beta, C.b.current_device_ptr(), C.extent(0));
+#if defined(DEBUG_SYNCHRONOUS)
+    gemm_event.wait();
+  } catch (const oneapi::mkl::invalid_argument &e) {
+    std::cerr << "OneAPI MKL BLAS GEMM throws invalid argument exception" << std::endl;
+  } catch (const oneapi::mkl::unsupported_device &e) {
+    std::cerr << "OneAPI MKL BLAS GEMM throws unsupported device exception" << std::endl;
+  } catch (const oneapi::mkl::host_bad_alloc &e) {
+    std::cerr << "OneAPI MKL BLAS GEMM throws host bad allocation exception" << std::endl;
+  } catch (const oneapi::mkl::device_bad_alloc &e) {
+    std::cerr << "OneAPI MKL BLAS GEMM throws device bad allocation exception" << std::endl;
+  } catch (const oneapi::mkl::unimplemented &e) {
+    std::cerr << "OneAPI MKL BLAS GEMM throws unimplemented exception" << std::endl;
+  } catch (const std::exception& e) {
+    std::cerr << "OneAPI MKL BLAS GEMM throws unexpected exception" << std::endl;
+  } catch (...) {
+    std::cerr << "OneAPI MKL BLAS GEMM throws unexpected exception that is also badly formatted..." << std::endl;
+  }
+#endif /* DEBUG_SYNCHRONOUS */
+#endif
+}
+
+#if defined(TTG_USE_PARSEC)
+namespace ttg {
+  template <>
+  struct SplitMetadataDescriptor<blk_t> {
+    // TODO: this is a quick and dirty approach.
+ // - blk_t could have any number of dimensions, this code only works for 2 dim blocks + // - we use Blk{} to send a control flow in some tasks below, these blocks have only + // 1 dimension (of size 0), to code this, we set the second dimension to 0 in our + // quick and dirty linearization, then have a case when we create the object + // - when we create the object with the metadata, we use a constructor that initializes + // the data to 0, which is useless: the data could be left uninitialized + static auto get_metadata(const blk_t &b) { + std::pair dim{0, 0}; + if (!b.empty()) { + assert(b.range().extent().size() == 2); + std::get<0>(dim) = (int)b.range().extent(0); + std::get<1>(dim) = (int)b.range().extent(1); + } + return dim; + } + static auto get_data(blk_t &b) { + using T = typename blk_t::value_type; + if (!b.empty()) + return boost::container::small_vector(1, iovec{b.size() * sizeof(T), b.data()}); + else + return boost::container::small_vector{}; + } + static auto create_from_metadata(const std::pair &meta) { + if (meta != std::pair{0, 0}) // N.B. allocate only, do not fill with zeroes + return blk_t(btas::Range(std::get<0>(meta), std::get<1>(meta))); + else + return blk_t{}; + } + }; +} // namespace ttg +#endif /* TTG_USE_PARSEC */ + +// declare btas::Tensor serializable by Boost +#include "ttg/serialization/backends/boost.h" +namespace ttg::detail { + // BTAS defines all of its Boost serializers in boost::serialization namespace ... as explained in + // ttg/serialization/boost.h such functions are not detectable via SFINAE, so must explicitly define serialization + // traits here + template + inline static constexpr bool is_boost_serializable_v = is_boost_archive_v; + template + inline static constexpr bool is_boost_serializable_v = is_boost_archive_v; +} // namespace ttg::detail + +#else +using blk_t = double; +#endif +template +using SpMatrix = Eigen::SparseMatrix; +template +using SpMatrixTriplet = Eigen::Triplet; // {row,col,value} + +#if defined(BLOCK_SPARSE_GEMM) && defined(BTAS_IS_USABLE) + +#if __has_include() + +#include + +#endif // __has_include() + +namespace btas { + template + inline btas::Tensor operator*(const btas::Tensor &A, + const btas::Tensor &B) { + btas::Tensor C; + btas::contract(1.0, A, {1, 2}, B, {2, 3}, 0.0, C, {1, 3}); + return C; + } + + template + btas::Tensor gemm(btas::Tensor &&C, const btas::Tensor &A, + const btas::Tensor &B) { + using array = btas::DEFAULT::index; + if (C.empty()) { // first contribution to C = allocate it and gemm with beta=0 + C = btas::Tensor(btas::Range(A.range().extent(0), B.range().extent(1))); + btas::contract_222(1.0, A, array{1, 2}, B, array{2, 3}, 0.0, C, array{1, 3}, false, false); + } + else { // subsequent contributions to C = gemm with beta=1 + btas::contract_222(1.0, A, array{1, 2}, B, array{2, 3}, 1.0, C, array{1, 3}, false, false); + } + return std::move(C); + } +} // namespace btas +#endif // BTAS_IS_USABLE +double gemm(double C, double A, double B) { return C + A * B; } + +// template +// struct colmajor_layout; +// template +// struct colmajor_layout<_Scalar, Eigen::ColMajor, _StorageIndex> : public std::true_type {}; +// template +// struct colmajor_layout<_Scalar, Eigen::RowMajor, _StorageIndex> : public std::false_type {}; + +template +using Key = MultiIndex; + +/// maps {i,j} to rank within first (R=0) layer of the 3-d process grid +inline int ij2rank(int i, int j, int P, int Q) { + std::vector vec; + int p = (i % P); + int q = (j % Q); + int rank = (q * P) + p; + return rank; +} + +/// maps {i,j,k} 
to rank within a 3-d process grid +inline int ijk2rank(int i, int j, int k, int P, int Q, int R) { + std::vector vec; + int p = (i % P); + int q = (j % Q); + int l = (k % R); + int rank = (l * P * Q) + (q * P) + p; + return rank; +} + +// flow data from an existing SpMatrix on rank 0 +template &)>> +class Read_SpMatrix : public TT, std::tuple, Blk>>, Read_SpMatrix, ttg::typelist> { + public: + using baseT = typename Read_SpMatrix::ttT; + Read_SpMatrix(const char *label, const SpMatrix &matrix, Edge> &ctl, Edge, Blk> &out, + Keymap &ij_keymap) + : baseT(edges(ctl), edges(out), std::string("read_spmatrix(") + label + ")", {"ctl"}, {std::string(label) + "ij"}, + ij_keymap) + , matrix_(matrix) {} + + void op(const Key<2> &, std::tuple, Blk>> &out) { + auto rank = ttg::default_execution_context().rank(); + for (int k = 0; k < matrix_.outerSize(); ++k) { + for (typename SpMatrix::InnerIterator it(matrix_, k); it; ++it) { + if (rank == this->get_keymap()(Key<2>(std::initializer_list({it.row(), it.col()})))) + ::send<0>(Key<2>(std::initializer_list({it.row(), it.col()})), ttg::persistent(it.value()), out); + } + } + } + + private: + const SpMatrix &matrix_; +}; + +// flow (move?) data into an existing SpMatrix on rank 0 +template +class Write_SpMatrix : public TT, std::tuple<>, Write_SpMatrix, ttg::typelist> { + public: + using baseT = typename Write_SpMatrix::ttT; + + template + Write_SpMatrix(SpMatrix &matrix, Edge, Blk> &in, Keymap2 &&ij_keymap, bool write_back = false) + : baseT(edges(in), edges(), "write_spmatrix", {"Cij"}, {}, ij_keymap) + , matrix_(matrix) + , write_back(write_back) + { } + + void op(const Key<2> &key, typename baseT::input_refs_tuple_type &&elem, std::tuple<> &) { + + if (write_back) { + std::lock_guard lock(mtx_); + ttg::trace("rank =", default_execution_context().rank(), + "/ thread_id =", reinterpret_cast(pthread_self()), "spmm.cc Write_SpMatrix wrote {", + key[0], ",", key[1], "} = ", baseT::template get<0>(elem), " in ", static_cast(&matrix_), + " with mutex @", static_cast(&mtx_), " for object @", static_cast(this)); + values_.emplace_back(key[0], key[1], std::move(baseT::template get<0>(elem))); + } + } + + /// grab completion status as a future + /// \note cannot be called once this is executable + const std::shared_future &status() const { + assert(!this->is_executable()); + if (!completion_status_) { // if not done yet, register completion work with the world + auto promise = std::make_shared>(); + completion_status_ = std::make_shared>(promise->get_future()); + ttg_register_status(this->get_world(), std::move(promise)); + ttg_register_callback(this->get_world(), + [this]() { this->matrix_.setFromTriplets(this->values_.begin(), this->values_.end()); }); + } else { // if done already, commit the result + this->matrix_.setFromTriplets(this->values_.begin(), this->values_.end()); + } + return *completion_status_; + } + + private: + std::mutex mtx_; + SpMatrix &matrix_; + std::vector> values_; + mutable std::shared_ptr> completion_status_; + bool write_back = false; +}; + +/// sparse mm via 2.5D SUMMA + +/// @tparam KeyMap2 maps {i,j} to processor +/// @tparam KeyMap3 maps {i,j,k} to processor +template &)>, typename Keymap3 = std::function &)>, + typename Blk = blk_t> +class SpMM25D { + public: + /// @param ij_keymap maps {i,j} to process, specifies distribution of tiles of A, B, and C + /// @param ijk_keymap maps {i,j,k} to process, controls distribution of tasks performing C[i][j] += A[i][k]*B[k][j] + /// @param R the number of "layers" in the 3-D process grid + 
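+  /// A sketch of the dataflow assembled in this constructor (names match the members below):
+  ///   BcastA      : {i,k}   -> {i,k,p}   one copy of A[i][k] per process p that needs it
+  ///   LocalBcastA : {i,k,p} -> {i,j,k}   fan-out to the local multiply tasks
+  ///   (BcastB and LocalBcastB mirror the above for B[k][j])
+  ///   MultiplyAdd : c_ijk += A[i][k] * B[k][j], chaining the running total over the local k's
+  ///   ReduceC     : sums the per-layer partials of C[i][j] across the R layers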
SpMM25D(Edge, Blk> &a, Edge, Blk> &b, Edge, Blk> &c, const SpMatrix &a_mat, + const SpMatrix &b_mat, const std::vector> &a_rowidx_to_colidx, + const std::vector> &a_colidx_to_rowidx, + const std::vector> &b_rowidx_to_colidx, + const std::vector> &b_colidx_to_rowidx, const std::vector &mTiles, + const std::vector &nTiles, const std::vector &kTiles, Keymap2 ij_keymap, Keymap3 ijk_keymap, long R) + : a_rowidx_to_colidx_(a_rowidx_to_colidx) + , b_colidx_to_rowidx_(b_colidx_to_rowidx) + , a_colidx_to_rowidx_(a_colidx_to_rowidx) + , b_rowidx_to_colidx_(b_rowidx_to_colidx) + , ij_keymap_(std::move(ij_keymap)) + , ijk_keymap_(std::move(ijk_keymap)) { + bcast_a_ = std::make_unique(a, local_a_ijk_, b_rowidx_to_colidx_, ij_keymap_, ijk_keymap_); + local_bcast_a_ = std::make_unique(local_a_ijk_, a_ijk_, b_rowidx_to_colidx_, ijk_keymap_); + bcast_b_ = std::make_unique(b, local_b_ijk_, a_colidx_to_rowidx_, ij_keymap_, ijk_keymap_); + local_bcast_b_ = std::make_unique(local_b_ijk_, b_ijk_, a_colidx_to_rowidx_, ijk_keymap_); + multiplyadd_ = std::make_unique(a_ijk_, b_ijk_, c_ijk_, c_ij_p_, a_rowidx_to_colidx_, + b_colidx_to_rowidx_, mTiles, nTiles, ijk_keymap_); + reduce_c_ = std::make_unique(c_ij_p_, c, ij_keymap_); + reduce_c_->template set_input_reducer<0>([](Blk &c_ij, const Blk &c_ij_p) { c_ij = c_ij + c_ij_p; }); + // compute how many contributions each C[i][j] should expect ... MultiplyAdd already does this, but need a way to + // send message from each process p to the process owning C[i][j] to expect a contribution from it for now replicate + // this logic ... + // TODO: do this in MultiplyAdd (need to allreduce this info so that everyone has it) + // N.B. only need to set stream size on the rank that will accumulate the C[i][j] contribution + const auto my_rank = ttg::default_execution_context().rank(); + for (auto i = 0ul; i != a_rowidx_to_colidx_.size(); ++i) { + if (a_rowidx_to_colidx_[i].empty()) continue; + for (auto j = 0ul; j != b_colidx_to_rowidx_.size(); ++j) { + if (b_colidx_to_rowidx_[j].empty()) continue; + + if (ij_keymap_(Key<2>{i, j}) == my_rank) { + decltype(i) k; + bool have_k; + std::tie(k, have_k) = multiplyadd_->compute_first_k(i, j); + std::vector c_ij_procmask(R, false); + if (have_k) { + const auto pR = k % R; // k values are distributed round-robin among the layers of the 3-D grid + assert(pR < c_ij_procmask.size()); + c_ij_procmask[pR] = true; + while (have_k) { + std::tie(k, have_k) = multiplyadd_->compute_next_k(i, j, k); + if (have_k) { + const auto pR = k % R; + assert(pR < c_ij_procmask.size()); + c_ij_procmask[pR] = true; + } + } + } + const auto c_ij_nprocs = std::count_if(c_ij_procmask.begin(), c_ij_procmask.end(), [](bool b) { return b; }); + if (c_ij_nprocs > 0) reduce_c_->template set_argstream_size<0>(Key<2>{i, j}, c_ij_nprocs); + } + } + } + + TTGUNUSED(bcast_a_); + TTGUNUSED(bcast_b_); + TTGUNUSED(multiplyadd_); + TTGUNUSED(reduce_c_); + } + + /// Locally broadcast `A[i][k]` assigned to this processor `p` to matmul tasks `{i,j,k}` for all `j` such that + /// `B[k][j]` exists AND `k` contribution to `C[i][j]` is assigned to this processor + class LocalBcastA : public TT, std::tuple, Blk>>, LocalBcastA, ttg::typelist> { + public: + using baseT = typename LocalBcastA::ttT; + + LocalBcastA(Edge, Blk> &a, Edge, Blk> &a_ijk, + const std::vector> &b_rowidx_to_colidx, const Keymap3 &ijk_keymap) + : baseT(edges(a), edges(a_ijk), "SpMM25D::local_bcast_a", {"a_ikp"}, {"a_ijk"}, + [](const Key<3> &ikp) { return ikp[2]; }) + , b_rowidx_to_colidx_(b_rowidx_to_colidx) + 
, ijk_keymap_(ijk_keymap) {} + + void op(const Key<3> &ikp, typename baseT::input_refs_tuple_type &&a_ik, std::tuple, Blk>> &a_ijk) { + const auto i = ikp[0]; + const auto k = ikp[1]; + const auto p = ikp[2]; + + auto world = default_execution_context(); + assert(p == world.rank()); + ttg::trace("LocalBcastA(", i, ", ", k, ", ", p, ")"); + if (k >= b_rowidx_to_colidx_.size()) return; + // local broadcast a_ik to all {i,j,k} such that b_kj exists + std::vector> ijk_keys; + for (auto &j : b_rowidx_to_colidx_[k]) { + if (ijk_keymap_(Key<3>({i, j, k})) == world.rank()) { + ttg::trace("Broadcasting A[", i, "][", k, "] on proc ", p, " to j=", j); + ijk_keys.emplace_back(Key<3>({i, j, k})); + } + } + ::broadcast<0>(ijk_keys, std::move(baseT::template get<0>(a_ik)), a_ijk); + } + + private: + const std::vector> &b_rowidx_to_colidx_; + const Keymap3 &ijk_keymap_; + }; // class LocalBcastA + + /// broadcast `A[i][k]` to all processors which will contain at least one `C[i][j]` such that `B[k][j]` exists + class BcastA : public TT, std::tuple, Blk>>, BcastA, ttg::typelist> { + public: + using baseT = typename BcastA::ttT; + + BcastA(Edge, Blk> &a_ik, Edge, Blk> &a_ikp, const std::vector> &b_rowidx_to_colidx, + const Keymap2 &ij_keymap, const Keymap3 &ijk_keymap) + : baseT(edges(a_ik), edges(a_ikp), "SpMM25D::bcast_a", {"a_ik"}, {"a_ikp"}, ij_keymap) + , b_rowidx_to_colidx_(b_rowidx_to_colidx) + , ijk_keymap_(ijk_keymap) {} + + void op(const Key<2> &ik, typename baseT::input_refs_tuple_type &&a_ik, std::tuple, Blk>> &a_ikp) { + const auto i = ik[0]; + const auto k = ik[1]; + ttg::trace("BcastA(", i, ", ", k, ")"); + std::vector> ikp_keys; + + if (k >= b_rowidx_to_colidx_.size()) return; + auto world = default_execution_context(); + std::vector procmap(world.size()); + for (auto &j : b_rowidx_to_colidx_[k]) { + const long p = ijk_keymap_(Key<3>( + {i, j, k})); // N.B. 
in 2.5D SUMMA different k contributions to C[i][j] are computed on different nodes
+        if (!procmap[p]) {
+          ttg::trace("Broadcasting A[", i, "][", k, "] to proc ", p);
+          ikp_keys.emplace_back(Key<3>({i, k, p}));
+          procmap[p] = true;
+        }
+      }
+      ::broadcast<0>(ikp_keys, std::move(baseT::template get<0>(a_ik)), a_ikp);
+    }
+
+   private:
+    const std::vector<std::vector<long>> &b_rowidx_to_colidx_;
+    const Keymap3 &ijk_keymap_;
+  };  // class BcastA
+
+  /// Locally broadcast `B[k][j]` assigned to this processor `p` to matmul tasks `{i,j,k}` for all `k` such that
+  /// `A[i][k]` exists AND `k` contribution to `C[i][j]` is assigned to this processor
+  class LocalBcastB : public TT<Key<3>, std::tuple<Out<Key<3>, Blk>>, LocalBcastB, ttg::typelist<Blk>> {
+   public:
+    using baseT = typename LocalBcastB::ttT;
+
+    LocalBcastB(Edge<Key<3>, Blk> &b_kjp, Edge<Key<3>, Blk> &b_ijk,
+                const std::vector<std::vector<long>> &a_colidx_to_rowidx, const Keymap3 &ijk_keymap)
+        : baseT(edges(b_kjp), edges(b_ijk), "SpMM25D::local_bcast_b", {"b_kjp"}, {"b_ijk"},
+                [](const Key<3> &kjp) { return kjp[2]; })
+        , a_colidx_to_rowidx_(a_colidx_to_rowidx)
+        , ijk_keymap_(ijk_keymap) {}
+
+    void op(const Key<3> &kjp, typename baseT::input_refs_tuple_type &&b_kj, std::tuple<Out<Key<3>, Blk>> &b_ijk) {
+      const auto k = kjp[0];
+      const auto j = kjp[1];
+      const auto p = kjp[2];
+      auto world = default_execution_context();
+      assert(p == world.rank());
+      ttg::trace("LocalBcastB(", k, ", ", j, ", ", p, ")");
+      if (k >= a_colidx_to_rowidx_.size()) return;
+      // broadcast b_kj to all ijk for which c_ij is on this processor and a_ik exists
+      std::vector<Key<3>> ijk_keys;
+      for (auto &i : a_colidx_to_rowidx_[k]) {
+        if (ijk_keymap_(Key<3>({i, j, k})) == world.rank()) {
+          ttg::trace("Broadcasting B[", k, "][", j, "] on proc ", p, " to i=", i);
+          ijk_keys.emplace_back(Key<3>({i, j, k}));
+        }
+      }
+      ::broadcast<0>(ijk_keys, std::move(baseT::template get<0>(b_kj)), b_ijk);
+    }
+
+   private:
+    const std::vector<std::vector<long>> &a_colidx_to_rowidx_;
+    const Keymap3 &ijk_keymap_;
+  };  // class LocalBcastB
+
+  /// broadcast `B[k][j]` to all processors which will contain at least one `C[i][j]` such that `A[i][k]` exists
+  class BcastB : public TT<Key<2>, std::tuple<Out<Key<3>, Blk>>, BcastB, ttg::typelist<Blk>> {
+   public:
+    using baseT = typename BcastB::ttT;
+
+    BcastB(Edge<Key<2>, Blk> &b_kj, Edge<Key<3>, Blk> &b_kjp, const std::vector<std::vector<long>> &a_colidx_to_rowidx,
+           const Keymap2 &ij_keymap, const Keymap3 &ijk_keymap)
+        : baseT(edges(b_kj), edges(b_kjp), "SpMM25D::bcast_b", {"b_kj"}, {"b_kjp"}, ij_keymap)
+        , a_colidx_to_rowidx_(a_colidx_to_rowidx)
+        , ijk_keymap_(ijk_keymap) {}
+
+    void op(const Key<2> &kj, typename baseT::input_refs_tuple_type &&b_kj, std::tuple<Out<Key<3>, Blk>> &b_kjp) {
+      const auto k = kj[0];
+      const auto j = kj[1];
+      // broadcast b_kj to all processors which will contain at least one c_ij such that a_ik exists
+      std::vector<Key<3>> kjp_keys;
+      ttg::trace("BcastB(", k, ", ", j, ")");
+      if (k >= a_colidx_to_rowidx_.size()) return;
+      auto world = default_execution_context();
+      std::vector<bool> procmap(world.size());
+      for (auto &i : a_colidx_to_rowidx_[k]) {
+        long p = ijk_keymap_(Key<3>({i, j, k}));
+        if (!procmap[p]) {
+          ttg::trace("Broadcasting B[", k, "][", j, "] to proc ", p);
+          kjp_keys.emplace_back(Key<3>({k, j, p}));
+          procmap[p] = true;
+        }
+      }
+      ::broadcast<0>(kjp_keys, std::move(baseT::template get<0>(b_kj)), b_kjp);
+    }
+
+   private:
+    const std::vector<std::vector<long>> &a_colidx_to_rowidx_;
+    const Keymap3 &ijk_keymap_;
+  };  // class BcastB
+
+  /// the multiply task has 3 input flows: a_ijk, b_ijk, and c_ijk; c_ijk carries the running total for this layer of
+  /// the 3-D process grid only
+  class MultiplyAdd :
public TT, std::tuple, Blk>, Out, Blk>>, MultiplyAdd, + ttg::typelist> { + public: + using baseT = typename MultiplyAdd::ttT; + +#if defined(TTG_HAVE_CUDA) + static constexpr bool have_cuda_op = true; +#warning SPMM using CUDA implementation +#elif defined(TTG_HAVE_HIPBLAS) + static constexpr bool have_hip_op = true; +#warning SPMM using HIP implementation +#elif defined(TTG_HAVE_LEVEL_ZERO) + static constexpr bool have_level_zero_op = true; +#warning SPMM using LEVEL_ZERO implementation +#else +#error No valid device implementation found! +#endif + + MultiplyAdd(Edge, Blk> &a_ijk, Edge, Blk> &b_ijk, Edge, Blk> &c_ijk, Edge, Blk> &c, + const std::vector> &a_rowidx_to_colidx, + const std::vector> &b_colidx_to_rowidx, const std::vector &mTiles, + const std::vector &nTiles, const Keymap3 &ijk_keymap) + : baseT(edges(a_ijk, b_ijk, c_ijk), edges(c, c_ijk), "SpMM25D::MultiplyAdd", {"a_ijk", "b_ijk", "c_ijk"}, + {"c_ij", "c_ijk"}, ijk_keymap) + , a_rowidx_to_colidx_(a_rowidx_to_colidx) + , b_colidx_to_rowidx_(b_colidx_to_rowidx) { + this->set_priomap([this](const Key<3> &ijk) { return this->prio(ijk); }); // map a key to an integral priority value + + // for each {i,j} determine first k that contributes AND belongs to this node, + // initialize input {i,j,first_k} flow to 0 + for (auto i = 0ul; i != a_rowidx_to_colidx_.size(); ++i) { + if (a_rowidx_to_colidx_[i].empty()) continue; + for (auto j = 0ul; j != b_colidx_to_rowidx_.size(); ++j) { + if (b_colidx_to_rowidx_[j].empty()) continue; + + const auto p = ttg::default_execution_context().rank(); + decltype(i) k; + bool have_k; + std::tie(k, have_k) = compute_first_k(i, j, p); + if (have_k) { + ttg::trace("Initializing C[", i, "][", j, "] on process ", p, " to zero"); +#if BLOCK_SPARSE_GEMM + Blk zero(btas::Range(mTiles[i], nTiles[j]), 0.0); +#else + Blk zero{0.0}; +#endif + this->template in<2>()->send(Key<3>({i, j, k}), zero); + } else { + if (tracing() && a_rowidx_to_colidx_.size() * b_colidx_to_rowidx_.size() < 400) + ttg::print("C[", i, "][", j, "] is empty"); + } + } + } + } + + ttg::device::Task op(const Key<3> &ijk, typename baseT::input_refs_tuple_type &&_ijk, + std::tuple, Blk>, Out, Blk>> &result) { + const auto i = ijk[0]; + const auto j = ijk[1]; + const auto k = ijk[2]; // k==l same because 000 will always be on layer 0, 001 will be accessed on layer 1 + const auto p = ttg::default_execution_context().rank(); + long next_k; + bool have_next_k; + + const blk_t& A = baseT::template get<0>(_ijk); + const blk_t& B = baseT::template get<1>(_ijk); + blk_t& C = baseT::template get<2>(_ijk); + + if (C.empty()) { + C = blk_t(btas::Range(A.range().extent(0), B.range().extent(1)), 0.0); + } + + /* pull all buffers onto the device */ + co_await ttg::device::select(A.b, B.b, C.b); + + /* everything is on the device, call the gemm */ + device_gemm(C, A, B); + + /* compute next k while the kernel is running */ + std::tie(next_k, have_next_k) = compute_next_k(i, j, k, p); + ttg::trace("Rank ", ttg::default_execution_context().rank(), + " :" + " C[", + i, "][", j, "] += A[", i, "][", k, "] by B[", k, "][", j, "], next_k? ", + (have_next_k ? 
std::to_string(next_k) : "does not exist")); + + /* wait for the kernel to complete */ + co_await ttg::device::wait(); + + + // compute the contrib, pass the running total to the next flow, if needed + // otherwise write to the result flow + if (have_next_k) { + co_await ttg::device::forward(ttg::device::send<1>( + Key<3>({i, j, next_k}), + std::move(C), + result)); + } else { // done with all local contributions to C[i][j], reduce with others on the process to which C[i][j] + // belongs + co_await ttg::device::forward(ttg::device::send<0>( + Key<2>({i, j}), + std::move(C), + result)); + } + } + + private: + const std::vector> &a_rowidx_to_colidx_; + const std::vector> &b_colidx_to_rowidx_; + + /* Compute the length of the remaining sequence on that tile */ + int32_t prio(const Key<3> &key) const { + const auto i = key[0]; + const auto j = key[1]; + const auto k = key[2]; + int32_t len = -1; // will be incremented at least once + long next_k = k; + bool have_next_k; + do { + std::tie(next_k, have_next_k) = compute_next_k(i, j, next_k); // here I know how many 'k' I have with same ij + ++len; + } while (have_next_k); + return len; + } + + public: // to be able to reuse this logic in SpMM25D + // given {i,j} return first k such that A[i][k] and B[k][j] exist + std::tuple compute_first_k(long i, long j) const { + const auto &a_k_range = a_rowidx_to_colidx_.at(i); + auto a_iter = a_k_range.begin(); + auto a_iter_fence = a_k_range.end(); + if (a_iter == a_iter_fence) return std::make_tuple(-1, false); + const auto &b_k_range = b_colidx_to_rowidx_.at(j); + auto b_iter = b_k_range.begin(); + auto b_iter_fence = b_k_range.end(); + if (b_iter == b_iter_fence) return std::make_tuple(-1, false); + + { + auto a_colidx = *a_iter; // pointing to next kth element + auto b_rowidx = *b_iter; + while (a_colidx != b_rowidx) { + if (a_colidx < b_rowidx) { + ++a_iter; + if (a_iter == a_iter_fence) return std::make_tuple(-1, false); + a_colidx = *a_iter; + } else { + ++b_iter; + if (b_iter == b_iter_fence) return std::make_tuple(-1, false); + b_rowidx = *b_iter; + } + } + return std::make_tuple(a_colidx, true); // returned true for kth element exist and also returns next k since + // a_colidx points to ++a_iter, if not reaches to fence + } + assert(false); + } + + // given {i,j,k} such that A[i][k] and B[k][j] exist + // return next k such that this condition holds + std::tuple compute_next_k(long i, long j, long k) const { + const auto &a_k_range = a_rowidx_to_colidx_.at(i); + auto a_iter_fence = a_k_range.end(); + auto a_iter = std::find(a_k_range.begin(), a_iter_fence, k); + assert(a_iter != a_iter_fence); + const auto &b_k_range = b_colidx_to_rowidx_.at(j); + auto b_iter_fence = b_k_range.end(); + auto b_iter = std::find(b_k_range.begin(), b_iter_fence, k); + assert(b_iter != b_iter_fence); + while (a_iter != a_iter_fence && b_iter != b_iter_fence) { + ++a_iter; + ++b_iter; + if (a_iter == a_iter_fence || b_iter == b_iter_fence) return std::make_tuple(-1, false); + auto a_colidx = *a_iter; + auto b_rowidx = *b_iter; + while (a_colidx != b_rowidx) { + if (a_colidx < b_rowidx) { + ++a_iter; + if (a_iter == a_iter_fence) return std::make_tuple(-1, false); + a_colidx = *a_iter; + } else { + ++b_iter; + if (b_iter == b_iter_fence) return std::make_tuple(-1, false); + b_rowidx = *b_iter; + } + } + return std::make_tuple(a_colidx, true); + } + ttg::abort(); // unreachable + return std::make_tuple(0, false); + } + + // given {i,j} return first k such that A[i][k] and B[k][j] exist AND ijk_keymap_(i,j,k) == p + 
std::tuple compute_first_k(long i, long j, long p) const { + long first_k = 0; + bool have_k = false; + std::tie(first_k, have_k) = compute_first_k(i, j); + while (have_k) { + if (this->get_keymap()(Key<3>{i, j, first_k}) == p) + return {first_k, true}; + else + std::tie(first_k, have_k) = compute_next_k(i, j, first_k); + } + return {0, false}; + } + + // given {i,j,k} such that A[i][k] and B[k][j] exist + // return next k such that this condition holds AND ijk_keymap_(i,j,k) == p + std::tuple compute_next_k(long i, long j, long k, long p) const { + long next_k = 0; + bool have_k = false; + std::tie(next_k, have_k) = compute_next_k(i, j, k); + while (have_k) { + if (this->get_keymap()(Key<3>{i, j, next_k}) == p) + return {next_k, true}; + else + std::tie(next_k, have_k) = compute_next_k(i, j, next_k); + } + return {0, false}; + } + + }; // MultiplyAdd + + /// reduces contributions to `C[i][j]` produced on different layers of the 3-d process grid + class ReduceC : public TT, std::tuple, Blk>>, ReduceC, ttg::typelist> { + public: + using baseT = typename ReduceC::ttT; + + ReduceC(Edge, Blk> &c_ij_p, Edge, Blk> &c_ij, const Keymap2 &ij_keymap) + : baseT(edges(c_ij_p), edges(c_ij), "SpMM25D::reduce_c", {"c_ij(p)"}, {"c_ij"}, ij_keymap) {} + + void op(const Key<2> &ij, typename baseT::input_refs_tuple_type &&c_ij_p, std::tuple, Blk>> &c_ij) { + ttg::trace("ReduceC(", ij[0], ", ", ij[1], ")"); + ::send<0>(ij, std::move(baseT::template get<0>(c_ij_p)), c_ij); + } + }; // class ReduceC + + private: + Edge, Blk> a_ijk_; + Edge, Blk> local_a_ijk_; + Edge, Blk> b_ijk_; + Edge, Blk> local_b_ijk_; + Edge, Blk> c_ijk_; + Edge, Blk> c_ij_p_; + const std::vector> &a_rowidx_to_colidx_; + const std::vector> &b_colidx_to_rowidx_; + const std::vector> &a_colidx_to_rowidx_; + const std::vector> &b_rowidx_to_colidx_; + std::unique_ptr bcast_a_; + std::unique_ptr local_bcast_a_; + std::unique_ptr bcast_b_; + std::unique_ptr local_bcast_b_; + std::unique_ptr multiplyadd_; + std::unique_ptr reduce_c_; + Keymap2 ij_keymap_; + Keymap3 ijk_keymap_; +}; + +class Control : public TT>>, Control> { + using baseT = typename Control::ttT; + int P; + int Q; + + public: + explicit Control(Edge> &ctl) : baseT(edges(), edges(ctl), "Control", {}, {"ctl"}), P(0), Q(0) {} + + void op(std::tuple>> &out) const { + for (int p = 0; p < P; p++) { + for (int q = 0; q < Q; q++) { + ttg::trace("Control: start computing on process {", p, ", ", q, "}"); + ::sendk<0>(Key<2>{p, q}, out); + } + } + } + + void start(const int _p, const int _q) { + P = _p; + Q = _q; + invoke(); + } +}; + +#ifdef BTAS_IS_USABLE +template +std::tuple norms(const btas::Tensor &t) { + T_ norm_2_square = 0.0; + T_ norm_inf = 0.0; + for (auto k : t) { + norm_2_square += k * k; + norm_inf = std::max(norm_inf, std::abs(k)); + } + return std::make_tuple(norm_2_square, norm_inf); +} +#endif + +std::tuple norms(double t) { return std::make_tuple(t * t, std::abs(t)); } + +template +std::tuple norms(const SpMatrix &A) { + double norm_2_square = 0.0; + double norm_inf = 0.0; + for (int i = 0; i < A.outerSize(); ++i) { + for (typename SpMatrix::InnerIterator it(A, i); it; ++it) { + // cout << 1+it.row() << "\t"; // row index + // cout << 1+it.col() << "\t"; // col index (here it is equal to k) + // cout << it.value() << endl; + auto& elem = it.value(); + double elem_norm_2_square, elem_norm_inf; + std::tie(elem_norm_2_square, elem_norm_inf) = norms(elem); + norm_2_square += elem_norm_2_square; + norm_inf = std::max(norm_inf, elem_norm_inf); + } + } + return 
std::make_tuple(norm_2_square, norm_inf);
+}
+
+char *getCmdOption(char **begin, char **end, const std::string &option) {
+  static char *empty = (char *)"";
+  char **itr = std::find(begin, end, option);
+  if (itr != end && ++itr != end) return *itr;
+  return empty;
+}
+
+bool cmdOptionExists(char **begin, char **end, const std::string &option) {
+  return std::find(begin, end, option) != end;
+}
+
+int cmdOptionIndex(char **begin, char **end, const std::string &option) {
+  char **itr = std::find(begin, end, option);
+  if (itr != end) return (int)(itr - begin);
+  return -1;
+}
+
+static int parseOption(std::string &option, int default_value) {
+  size_t pos;
+  std::string token;
+  int N = default_value;
+  if (option.length() == 0) return N;
+  pos = option.find(':');
+  if (pos == std::string::npos) {
+    pos = option.length();
+  }
+  token = option.substr(0, pos);
+  N = std::stoi(token);
+  option.erase(0, pos + 1);
+  return N;
+}
+
+static long parseOption(std::string &option, long default_value) {
+  size_t pos;
+  std::string token;
+  long N = default_value;
+  if (option.length() == 0) return N;
+  pos = option.find(':');
+  if (pos == std::string::npos) {
+    pos = option.length();
+  }
+  token = option.substr(0, pos);
+  N = std::stol(token);
+  option.erase(0, pos + 1);
+  return N;
+}
+
+static double parseOption(std::string &option, double default_value = 0.25) {
+  size_t pos;
+  std::string token;
+  double N = default_value;
+  if (option.length() == 0) return N;
+  pos = option.find(':');
+  if (pos == std::string::npos) {
+    pos = option.length();
+  }
+  token = option.substr(0, pos);
+  N = std::stod(token);
+  option.erase(0, pos + 1);
+  return N;
+}
+
+#if !defined(BLOCK_SPARSE_GEMM)
+static void initSpMatrixMarket(const std::function<int(const Key<2> &)> &keymap, const char *filename, SpMatrix<> &A,
+                               SpMatrix<> &B, SpMatrix<> &C, int &M, int &N, int &K) {
+  std::vector<int> sizes;
+  // We load the entire matrix on each rank, but we only use the local part for the GEMM
+  // loadMarket() is the Eigen function that loads a matrix from a file
+  if (!loadMarket(A, filename)) {
+    std::cerr << "Failed to load " << filename << ", bailing out..."
<< std::endl; + ttg::ttg_abort(); + } + if (0 == ttg::default_execution_context().rank()) { + std::cout << "##MatrixMarket file " << filename << " -- " << A.rows() << " x " << A.cols() << " -- " << A.nonZeros() + << " nnz (density: " << (float)A.nonZeros() / (float)A.rows() / (float)A.cols() << ")" << std::endl; + } + if (A.rows() != A.cols()) { + B = A.transpose(); + } else { + B = A; + } + + C.resize(A.rows(), B.cols()); + M = (int)A.rows(); + N = (int)C.cols(); + K = (int)A.cols(); +} + +static void initSpRmat(const std::function &)> &keymap, const char *opt, SpMatrix<> &A, SpMatrix<> &B, + SpMatrix<> &C, int &M, int &N, int &K, unsigned long seed) { + int E; + double a = 0.25, b = 0.25, c = 0.25, d = 0.25; + size_t nnz = 0; + + if (nullptr == opt) { + std::cerr << "Usage: -rmat <#nodes>[:<#edges>[:[::[[:]]]]]" << std::endl; + exit(1); + } + std::string token; + std::string option = std::string(opt); + N = parseOption(option, -1); + K = N; + M = N; + + // We build the entire sparse matrix on each rank, but use only the local part + // on a given rank, according to keymap + A.resize(N, N); + + E = parseOption(option, (int)(0.01 * N * N)); + a = parseOption(option, a); + b = parseOption(option, b); + c = parseOption(option, c); + d = parseOption(option, d); + + if (ttg::default_execution_context().rank() == 0) { + std::cout << "#R-MAT: " << N << " nodes, " << E << " edges, a/b/c/d = " << a << "/" << b << "/" << c << "/" << d + << std::endl; + } + + boost::minstd_rand gen(seed); + boost::rmat_iterator> rmat_it(gen, N, E, a, b, c, d); + + using triplet_t = Eigen::Triplet; + std::vector A_elements; + for (int i = 0; i < N; i++) { + nnz++; + A_elements.emplace_back(i, i, 1.0); + } + for (int i = 0; i < E; i++) { + auto x = *rmat_it++; + if (x.first != x.second) { + A_elements.emplace_back(x.first, x.second, 1.0); + nnz++; + } + } + A.setFromTriplets(A_elements.begin(), A_elements.end()); + + B = A; + C.resize(N, N); + + if (ttg::default_execution_context().rank() == 0) { + std::cout << "#R-MAT: " << E << " nonzero elements, density: " << (double)nnz / (double)N / (double)N << std::endl; + } +} + +static void initSpHardCoded(const std::function &)> &keymap, SpMatrix<> &A, SpMatrix<> &B, + SpMatrix<> &C, int &m, int &n, int &k) { + m = 2; + n = 3; + k = 4; + + std::cout << "#HardCoded A, B, C" << std::endl; + A.resize(m, k); + B.resize(k, n); + C.resize(m, n); + // We initialize the same matrices on all the ranks, but we will use only the local part + // following the keymap + using triplet_t = Eigen::Triplet; + std::vector A_elements; + A_elements.emplace_back(0, 1, 12.3); + A_elements.emplace_back(0, 2, 10.7); + A_elements.emplace_back(0, 3, -2.3); + A_elements.emplace_back(1, 0, -0.3); + A_elements.emplace_back(1, 2, 1.2); + A.setFromTriplets(A_elements.begin(), A_elements.end()); + + std::vector B_elements; + B_elements.emplace_back(0, 0, 12.3); + B_elements.emplace_back(1, 0, 10.7); + B_elements.emplace_back(3, 0, -2.3); + B_elements.emplace_back(1, 1, -0.3); + B_elements.emplace_back(1, 2, 1.2); + B_elements.emplace_back(2, 2, 7.2); + B_elements.emplace_back(3, 2, 0.2); + B.setFromTriplets(B_elements.begin(), B_elements.end()); +} + +#else +static void initBlSpHardCoded(const std::function &)> &keymap, SpMatrix<> &A, SpMatrix<> &B, + SpMatrix<> &C, SpMatrix<> &Aref, SpMatrix<> &Bref, bool buildRefs, + std::vector &mTiles, std::vector &nTiles, std::vector &kTiles, + std::vector> &a_rowidx_to_colidx, + std::vector> &a_colidx_to_rowidx, + std::vector> &b_rowidx_to_colidx, + std::vector> 
&b_colidx_to_rowidx, int &m, int &n, int &k) {
+  m = 2;
+  n = 3;
+  k = 4;
+
+  std::cout << "#HardCoded A, B, C" << std::endl;
+  A.resize(m, k);
+  B.resize(k, n);
+  C.resize(m, n);
+  if (buildRefs) {
+    Aref.resize(m, k);
+    Bref.resize(k, n);
+  }
+
+  for (int mt = 0; mt < m; mt++) mTiles.push_back(128);
+  for (int nt = 0; nt < n; nt++) nTiles.push_back(196);
+  for (int kt = 0; kt < k; kt++) kTiles.push_back(256);
+
+  int rank = ttg::default_execution_context().rank();
+
+  using triplet_t = Eigen::Triplet<blk_t>;
+  std::vector<triplet_t> A_elements;
+  std::vector<triplet_t> Aref_elements;
+#if defined(BTAS_IS_USABLE)
+  if (keymap({0, 1}) == rank) {
+    A_elements.emplace_back(0, 1, blk_t(btas::Range(128, 256), 12.3));
+  }
+  if (keymap({0, 2}) == rank) {
+    A_elements.emplace_back(0, 2, blk_t(btas::Range(128, 256), 10.7));
+  }
+  if (keymap({0, 3}) == rank) {
+    A_elements.emplace_back(0, 3, blk_t(btas::Range(128, 256), -2.3));
+  }
+  if (keymap({1, 0}) == rank) {
+    A_elements.emplace_back(1, 0, blk_t(btas::Range(128, 256), -0.3));
+  }
+  if (keymap({1, 2}) == rank) {
+    A_elements.emplace_back(1, 2, blk_t(btas::Range(128, 256), 1.2));
+  }
+  if (buildRefs && rank == 0) {
+    Aref_elements.emplace_back(0, 1, blk_t(btas::Range(128, 256), 12.3));
+    Aref_elements.emplace_back(0, 2, blk_t(btas::Range(128, 256), 10.7));
+    Aref_elements.emplace_back(0, 3, blk_t(btas::Range(128, 256), -2.3));
+    Aref_elements.emplace_back(1, 0, blk_t(btas::Range(128, 256), -0.3));
+    Aref_elements.emplace_back(1, 2, blk_t(btas::Range(128, 256), 1.2));
+  }
+#else
+  if ((buildRefs && rank == 0) || keymap({0, 1}) == rank) {
+    A_elements.emplace_back(0, 1, 12.3);
+  }
+  if ((buildRefs && rank == 0) || keymap({0, 2}) == rank) {
+    A_elements.emplace_back(0, 2, 10.7);
+  }
+  if ((buildRefs && rank == 0) || keymap({0, 3}) == rank) {
+    A_elements.emplace_back(0, 3, -2.3);
+  }
+  if ((buildRefs && rank == 0) || keymap({1, 0}) == rank) {
+    A_elements.emplace_back(1, 0, -0.3);
+  }
+  if ((buildRefs && rank == 0) || keymap({1, 2}) == rank) {
+    A_elements.emplace_back(1, 2, .2);
+  }
+  if (buildRefs && rank == 0) {
+    Aref_elements.emplace_back(0, 1, 12.3);
+    Aref_elements.emplace_back(0, 2, 10.7);
+    Aref_elements.emplace_back(0, 3, -2.3);
+    Aref_elements.emplace_back(1, 0, -0.3);
+    Aref_elements.emplace_back(1, 2, .2);
+  }
+#endif
+  a_rowidx_to_colidx.resize(2);
+  a_rowidx_to_colidx[0].emplace_back(1);  // A[0][1]
+  a_rowidx_to_colidx[0].emplace_back(2);  // A[0][2]
+  a_rowidx_to_colidx[0].emplace_back(3);  // A[0][3]
+  a_rowidx_to_colidx[1].emplace_back(0);  // A[1][0]
+  a_rowidx_to_colidx[1].emplace_back(2);  // A[1][2]
+
+  a_colidx_to_rowidx.resize(4);
+  a_colidx_to_rowidx[0].emplace_back(1);  // A[1][0]
+  a_colidx_to_rowidx[1].emplace_back(0);  // A[0][1]
+  a_colidx_to_rowidx[2].emplace_back(0);  // A[0][2]
+  a_colidx_to_rowidx[2].emplace_back(1);  // A[1][2]
+  a_colidx_to_rowidx[3].emplace_back(0);  // A[0][3]
+
+  A.setFromTriplets(A_elements.begin(), A_elements.end());
+
+  if (buildRefs && 0 == rank) {
+    Aref.setFromTriplets(Aref_elements.begin(), Aref_elements.end());
+  }
+
+  std::vector<triplet_t> B_elements;
+  std::vector<triplet_t> Bref_elements;
+#if defined(BTAS_IS_USABLE)
+  if (keymap({0, 0}) == rank) {
+    B_elements.emplace_back(0, 0, blk_t(btas::Range(256, 196), 12.3));
+  }
+  if (keymap({1, 0}) == rank) {
+    B_elements.emplace_back(1, 0, blk_t(btas::Range(256, 196), 10.7));
+  }
+  if (keymap({3, 0}) == rank) {
+    B_elements.emplace_back(3, 0, blk_t(btas::Range(256, 196), -2.3));
+  }
+  if
(keymap({1, 1}) == rank) {
+    B_elements.emplace_back(1, 1, blk_t(btas::Range(256, 196), -0.3));
+  }
+  if (keymap({1, 2}) == rank) {
+    B_elements.emplace_back(1, 2, blk_t(btas::Range(256, 196), 1.2));
+  }
+  if (keymap({2, 2}) == rank) {
+    B_elements.emplace_back(2, 2, blk_t(btas::Range(256, 196), 7.2));
+  }
+  if (keymap({3, 2}) == rank) {
+    B_elements.emplace_back(3, 2, blk_t(btas::Range(256, 196), 0.2));
+  }
+  if (buildRefs && rank == 0) {
+    Bref_elements.emplace_back(0, 0, blk_t(btas::Range(256, 196), 12.3));
+    Bref_elements.emplace_back(1, 0, blk_t(btas::Range(256, 196), 10.7));
+    Bref_elements.emplace_back(3, 0, blk_t(btas::Range(256, 196), -2.3));
+    Bref_elements.emplace_back(1, 1, blk_t(btas::Range(256, 196), -0.3));
+    Bref_elements.emplace_back(1, 2, blk_t(btas::Range(256, 196), 1.2));
+    Bref_elements.emplace_back(2, 2, blk_t(btas::Range(256, 196), 7.2));
+    Bref_elements.emplace_back(3, 2, blk_t(btas::Range(256, 196), 0.2));
+  }
+#else
+  if (keymap({0, 0}) == rank) {
+    B_elements.emplace_back(0, 0, 12.3);
+  }
+  if (keymap({1, 0}) == rank) {
+    B_elements.emplace_back(1, 0, 10.7);
+  }
+  if (keymap({3, 0}) == rank) {
+    B_elements.emplace_back(3, 0, -2.3);
+  }
+  if (keymap({1, 1}) == rank) {
+    B_elements.emplace_back(1, 1, -0.3);
+  }
+  if (keymap({1, 2}) == rank) {
+    B_elements.emplace_back(1, 2, 1.2);
+  }
+  if (keymap({2, 2}) == rank) {
+    B_elements.emplace_back(2, 2, 7.2);
+  }
+  if (keymap({3, 2}) == rank) {
+    B_elements.emplace_back(3, 2, 0.2);
+  }
+#endif
+  b_rowidx_to_colidx.resize(4);
+  b_rowidx_to_colidx[0].emplace_back(0);  // B[0][0]
+  b_rowidx_to_colidx[1].emplace_back(0);  // B[1][0]
+  b_rowidx_to_colidx[1].emplace_back(1);  // B[1][1]
+  b_rowidx_to_colidx[1].emplace_back(2);  // B[1][2]
+  b_rowidx_to_colidx[2].emplace_back(2);  // B[2][2]
+  b_rowidx_to_colidx[3].emplace_back(0);  // B[3][0]
+  b_rowidx_to_colidx[3].emplace_back(2);  // B[3][2]
+
+  b_colidx_to_rowidx.resize(3);
+  b_colidx_to_rowidx[0].emplace_back(0);  // B[0][0]
+  b_colidx_to_rowidx[0].emplace_back(1);  // B[1][0]
+  b_colidx_to_rowidx[0].emplace_back(3);  // B[3][0]
+  b_colidx_to_rowidx[1].emplace_back(1);  // B[1][1]
+  b_colidx_to_rowidx[2].emplace_back(1);  // B[1][2]
+  b_colidx_to_rowidx[2].emplace_back(2);  // B[2][2]
+  b_colidx_to_rowidx[2].emplace_back(3);  // B[3][2]
+
+  B.setFromTriplets(B_elements.begin(), B_elements.end());
+  if (buildRefs && 0 == rank) {
+    Bref.setFromTriplets(Bref_elements.begin(), Bref_elements.end());
+  }
+}
+
+#if defined(BTAS_IS_USABLE)
+static void initBlSpRandom(const std::function<int(const Key<2> &)> &keymap, size_t M, size_t N, size_t K, int minTs,
+                           int maxTs, double avgDensity, SpMatrix<> &A, SpMatrix<> &B, SpMatrix<> &Aref,
+                           SpMatrix<> &Bref, bool buildRefs, std::vector<int> &mTiles, std::vector<int> &nTiles,
+                           std::vector<int> &kTiles, std::vector<std::vector<long>> &a_rowidx_to_colidx,
+                           std::vector<std::vector<long>> &a_colidx_to_rowidx,
+                           std::vector<std::vector<long>> &b_rowidx_to_colidx,
+                           std::vector<std::vector<long>> &b_colidx_to_rowidx, double &average_tile_size,
+                           double &Adensity, double &Bdensity, unsigned int seed) {
+  int rank = ttg::default_execution_context().rank();
+
+  int ts;
+  std::mt19937 gen(seed);
+  std::mt19937 genv(seed + 1);
+
+  std::uniform_int_distribution<> dist(minTs, maxTs);  // randomly pick any value in the range minTs, maxTs
+  using triplet_t = Eigen::Triplet<blk_t>;
+  std::vector<triplet_t> A_elements;
+  std::vector<triplet_t> B_elements;
+  std::vector<triplet_t> Aref_elements;
+  std::vector<triplet_t> Bref_elements;
+
+  for (int m = 0; m < M; m += ts) {
+    ts = dist(gen);
+    if (ts > M - m) ts = M - m;
+    mTiles.push_back(ts);
+  }
+  for (int n = 0; n < N; n += ts) {
+    ts = dist(gen);
+    if (ts > N - n)
ts = N - n; + nTiles.push_back(ts); + } + for (int k = 0; k < K; k += ts) { + ts = dist(gen); + if (ts > K - k) ts = K - k; + kTiles.push_back(ts); + } + + A.resize(mTiles.size(), kTiles.size()); + B.resize(kTiles.size(), nTiles.size()); + if (buildRefs) { + Aref.resize(mTiles.size(), kTiles.size()); + Bref.resize(kTiles.size(), nTiles.size()); + } + + std::uniform_int_distribution<> mDist(0, mTiles.size() - 1); + std::uniform_int_distribution<> nDist(0, nTiles.size() - 1); + std::uniform_int_distribution<> kDist(0, kTiles.size() - 1); + std::uniform_real_distribution<> vDist(-1.0, 1.0); + + size_t filling = 0; + size_t avg_nb = 0; + int avg_nb_nb = 0; + + struct tuple_hash : public std::unary_function, std::size_t> { + std::size_t operator()(const std::tuple &k) const { + return static_cast(std::get<0>(k)) | (static_cast(std::get<1>(k)) << 32); + } + }; + + std::unordered_set, tuple_hash> fills; + + fills.clear(); + while ((double)filling / (double)(M * K) < avgDensity) { + int mt = mDist(gen); + int kt = kDist(gen); + + if (fills.find({mt, kt}) != fills.end()) continue; + fills.insert({mt, kt}); + + if (mt >= a_rowidx_to_colidx.size()) a_rowidx_to_colidx.resize(mt + 1); + a_rowidx_to_colidx[mt].emplace_back(kt); + if (kt >= a_colidx_to_rowidx.size()) a_colidx_to_rowidx.resize(kt + 1); + a_colidx_to_rowidx[kt].emplace_back(mt); + + filling += mTiles[mt] * kTiles[kt]; + avg_nb += mTiles[mt] * kTiles[kt]; + avg_nb_nb++; + double value = vDist(genv); + if (0 == rank && buildRefs) Aref_elements.emplace_back(mt, kt, blk_t(btas::Range(mTiles[mt], kTiles[kt]), value)); + if (rank != keymap({mt, kt})) continue; + A_elements.emplace_back(mt, kt, blk_t(btas::Range(mTiles[mt], kTiles[kt]), value)); + } + for (auto &row : a_rowidx_to_colidx) { + std::sort(row.begin(), row.end()); + } + for (auto &col : a_colidx_to_rowidx) { + std::sort(col.begin(), col.end()); + } + A.setFromTriplets(A_elements.begin(), A_elements.end()); + Adensity = (double)filling / (double)(M * K); + if (0 == rank && buildRefs) Aref.setFromTriplets(Aref_elements.begin(), Aref_elements.end()); + + filling = 0; + fills.clear(); + while ((double)filling / (double)(K * N) < avgDensity) { + int nt = nDist(gen); + int kt = kDist(gen); + + if (fills.find({kt, nt}) != fills.end()) continue; + fills.insert({kt, nt}); + + if (kt >= b_rowidx_to_colidx.size()) b_rowidx_to_colidx.resize(kt + 1); + b_rowidx_to_colidx[kt].emplace_back(nt); + if (nt >= b_colidx_to_rowidx.size()) b_colidx_to_rowidx.resize(nt + 1); + b_colidx_to_rowidx[nt].emplace_back(kt); + + filling += kTiles[kt] * nTiles[nt]; + avg_nb += kTiles[kt] * nTiles[nt]; + avg_nb_nb++; + double value = vDist(genv); + if (0 == rank && buildRefs) Bref_elements.emplace_back(kt, nt, blk_t(btas::Range(kTiles[kt], nTiles[nt]), value)); + if (rank != keymap({kt, nt})) continue; + B_elements.emplace_back(kt, nt, blk_t(btas::Range(kTiles[kt], nTiles[nt]), value)); + } + for (auto &row : b_rowidx_to_colidx) { + std::sort(row.begin(), row.end()); + } + for (auto &col : b_colidx_to_rowidx) { + std::sort(col.begin(), col.end()); + } + B.setFromTriplets(B_elements.begin(), B_elements.end()); + Bdensity = (double)filling / (double)(K * N); + if (0 == rank && buildRefs) Bref.setFromTriplets(Bref_elements.begin(), Bref_elements.end()); + fills.clear(); + + average_tile_size = (double)avg_nb / avg_nb_nb; +} +#endif + +#endif + +static void timed_measurement(SpMatrix<> &A, SpMatrix<> &B, const std::function &)> &ij_keymap, + const std::function &)> &ijk_keymap, const std::string &tiling_type, + double 
+static void timed_measurement(SpMatrix<> &A, SpMatrix<> &B, const std::function<int(const Key<2> &)> &ij_keymap,
+                              const std::function<int(const Key<3> &)> &ijk_keymap, const std::string &tiling_type,
+                              double gflops, double avg_nb, double Adensity, double Bdensity,
+                              const std::vector<std::vector<long>> &a_rowidx_to_colidx,
+                              const std::vector<std::vector<long>> &a_colidx_to_rowidx,
+                              const std::vector<std::vector<long>> &b_rowidx_to_colidx,
+                              const std::vector<std::vector<long>> &b_colidx_to_rowidx, std::vector<int> &mTiles,
+                              std::vector<int> &nTiles, std::vector<int> &kTiles, int M, int N, int K, int minTs,
+                              int maxTs, int P, int Q, int R) {
+  int MT = (int)A.rows();
+  int NT = (int)B.cols();
+  int KT = (int)A.cols();
+  assert(KT == B.rows());
+
+  SpMatrix<> C;
+  C.resize(MT, NT);
+
+  // flow graph needs to exist on every node
+  Edge<Key<2>> ctl("control");
+  Control control(ctl);
+  Edge<Key<2>, blk_t> eA, eB;
+  Edge<Key<2>, blk_t> eC;
+
+  Read_SpMatrix a("A", A, ctl, eA, ij_keymap);
+  Read_SpMatrix b("B", B, ctl, eB, ij_keymap);
+  Write_SpMatrix<> c(C, eC, ij_keymap);
+  auto &c_status = c.status();
+  assert(!has_value(c_status));
+  //  SpMM25D a_times_b(world, eA, eB, eC, A, B);
+  SpMM25D<> a_times_b(eA, eB, eC, A, B, a_rowidx_to_colidx, a_colidx_to_rowidx, b_rowidx_to_colidx, b_colidx_to_rowidx,
+                      mTiles, nTiles, kTiles, ij_keymap, ijk_keymap, R);
+  TTGUNUSED(a);
+  TTGUNUSED(b);
+  TTGUNUSED(a_times_b);
+
+  auto connected = make_graph_executable(&control);
+  assert(connected);
+  TTGUNUSED(connected);
+
+  struct timeval start {
+    0
+  }, end{0}, diff{0};
+  gettimeofday(&start, nullptr);
+  // ready, go! need only 1 kick, so must be done by 1 thread only
+  if (ttg::default_execution_context().rank() == 0) control.start(P, Q);
+  fence();
+  gettimeofday(&end, nullptr);
+  timersub(&end, &start, &diff);
+  double tc = (double)diff.tv_sec + (double)diff.tv_usec / 1e6;
+#if defined(TTG_USE_MADNESS)
+  std::string rt("MAD");
+#elif defined(TTG_USE_PARSEC)
+  std::string rt("PARSEC");
+#else
+  std::string rt("Unknown???");
+#endif
+  if (ttg::default_execution_context().rank() == 0) {
+    std::cout << "TTG-" << rt << " PxQxR= " << P << " " << Q << " " << R << " 1 average_NB= " << avg_nb << " M= " << M
+              << " N= " << N << " K= " << K << " t= " << minTs << " T=" << maxTs << " Tiling= " << tiling_type
+              << " A_density= " << Adensity << " B_density= " << Bdensity << " gflops= " << gflops << " seconds= " << tc
+              << " gflops/s= " << gflops / tc << std::endl;
+  }
+}
+
+#if !defined(BLOCK_SPARSE_GEMM)
+static void make_rowidx_to_colidx_from_eigen(const SpMatrix<> &mat, std::vector<std::vector<long>> &r2c) {
+  for (int k = 0; k < mat.outerSize(); ++k) {  // cols, if col-major, rows otherwise
+    for (typename SpMatrix<blk_t>::InnerIterator it(mat, k); it; ++it) {
+      const long row = it.row();
+      const long col = it.col();
+      if (row >= r2c.size()) r2c.resize(row + 1);
+      r2c[row].push_back(col);
+    }
+  }
+  // Sort each vector of column indices, as we pushed them in an arbitrary order
+  for (auto &row : r2c) {
+    std::sort(row.begin(), row.end());
+  }
+}
+
+static void make_colidx_to_rowidx_from_eigen(const SpMatrix<> &mat, std::vector<std::vector<long>> &c2r) {
+  for (int k = 0; k < mat.outerSize(); ++k) {  // cols, if col-major, rows otherwise
+    for (typename SpMatrix<blk_t>::InnerIterator it(mat, k); it; ++it) {
+      const long row = it.row();
+      const long col = it.col();
+
+      if (col >= c2r.size()) c2r.resize(col + 1);
+      c2r[col].push_back(row);
+    }
+  }
+  // Sort each vector of row indices, as we pushed them in an arbitrary order
+  for (auto &col : c2r) {
+    std::sort(col.begin(), col.end());
+  }
+}
+#endif
+
+static double compute_gflops(const std::vector<std::vector<long>> &a_r2c, const std::vector<std::vector<long>> &b_r2c,
+                             const std::vector<int> &mTiles, const std::vector<int> &nTiles,
+                             const std::vector<int> &kTiles) {
+  unsigned long flops = 0;
+  for (auto i = 0; i < a_r2c.size(); i++) {
+    for (auto kk = 0; kk < a_r2c[i].size(); kk++) {
+      auto k = a_r2c[i][kk];
+      if (k >= b_r2c.size()) continue;
+      for (auto jj = 0; jj < b_r2c[k].size(); jj++) {
+        auto j = b_r2c[k][jj];
+        flops += static_cast<unsigned long>(mTiles[i]) * nTiles[j] * kTiles[k];
+      }
+    }
+  }
+  return 2.0 * (double)flops / 1e9;
+}
+
+int main(int argc, char **argv) {
+  bool timing;
+  double gflops;
+
+  // warm up silicon by calling gemm a few times
+#ifdef BTAS_IS_USABLE
+  for (int i = 0; i < 20; i++) {
+    using baseT = typename btas::Tensor;
+    btas::Tensor> At(30, 30);
+    btas::Tensor> Bt(30, 30);
+    btas::Tensor> Ct(30, 30);
+    At.fill(1.0);
+    Bt.fill(2.0);
+    Ct.fill(3.0);
+    btas::gemm(std::move(Ct), Bt, At);
+  }
+#endif  // BTAS_IS_USABLE
+
+  // static volatile int debug_signal = 0;
+  // std::cout << "Waiting on debug signal (int*)" << &debug_signal << std::endl;
+  // while (!debug_signal) {}
+
+  int cores = -1;
+  std::string nbCoreStr(getCmdOption(argv, argv + argc, "-c"));
+  cores = parseOption(nbCoreStr, cores);
+
+  int dashdash = cmdOptionIndex(argv, argv + argc, "--");
+  if (dashdash > -1) {
+    initialize(argc - dashdash, argv + dashdash, cores);
+  } else {
+    initialize(1, argv, cores);
+  }
+
+#ifdef BTAS_IS_USABLE
+  // initialize MADNESS so that TA allocators can be created
+  madness::ParsecRuntime::initialize_with_existing_context(ttg::default_execution_context().impl().context());
+  madness::initialize(argc, argv, /* nthread = */ 1, /* quiet = */ true);
+#endif  // BTAS_IS_USABLE
+
+  std::string debugStr(getCmdOption(argv, argv + argc, "-d"));
+  auto debug = (unsigned int)parseOption(debugStr, 0);
+
+  if (debug & (1 << 1)) {
+    using ttg::Debugger;
+    auto debugger = std::make_shared<Debugger>();
+    Debugger::set_default_debugger(debugger);
+    debugger->set_exec(argv[0]);
+    debugger->set_prefix(ttg::default_execution_context().rank());
+    // debugger->set_cmd("lldb_xterm");
+    debugger->set_cmd("gdb_xterm");
+  }
+
+  int mpi_size = ttg::default_execution_context().size();
+  int mpi_rank = ttg::default_execution_context().rank();
+  int best_pqc = mpi_size;
+  int P, Q, R;
+  for (int c = 1; c <= (int)cbrt(mpi_size); c++) {
+    for (int p = 1; p <= (int)sqrt(mpi_size / c); p++) {
+      if ((mpi_size % (p * c)) == 0) {
+        int q = mpi_size / (p * c);
+        if (abs(c - p - q) <= best_pqc) {
+          best_pqc = abs(c - p - q);
+          P = p;
+          Q = q;
+          R = c;
+        }
+      }
+    }
+  }
+  // ttg::launch_lldb(ttg::default_execution_context().rank(), argv[0]);
+
+  {
+    if (debug & (1 << 0)) {
+      ttg::trace_on();
+      TTBase::set_trace_all(true);
+    }
+
+    SpMatrix<> A, B, C, Aref, Bref;
+    std::string tiling_type;
+    int M = 0, N = 0, K = 0;
+    int minTs = 0, maxTs = 0;
+
+    double avg_nb = nan("undefined");
+    double Adensity = nan("undefined");
+    double Bdensity = nan("undefined");
+
+    std::string PStr(getCmdOption(argv, argv + argc, "-P"));
+    P = parseOption(PStr, P);
+    std::string QStr(getCmdOption(argv, argv + argc, "-Q"));
+    Q = parseOption(QStr, Q);
+    // to make code behave like 2D summa if R not given
+    std::string RStr(getCmdOption(argv, argv + argc, "-R"));
+    R = parseOption(RStr, 1);
+
+    if (P * Q * R != mpi_size) {
+      if (!cmdOptionExists(argv, argv + argc, "-Q") && (mpi_size % (P * R) == 0))
+        Q = mpi_size / (P * R);
+      else if (!cmdOptionExists(argv, argv + argc, "-P") && (mpi_size % (Q * R)) == 0)
+        P = mpi_size / (Q * R);
+      else if (!cmdOptionExists(argv, argv + argc, "-R") && (mpi_size % (Q * P)) == 0)
+        R = mpi_size / (Q * P);
+      else {
+        if (0 == mpi_rank) {
+          std::cerr << P << "x" << Q << "x" << R << " is not a valid process grid -- bailing out" << std::endl;
+          MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
+        }
+      }
+    }
+
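+    // ij2rank places tiles of A, B and C on the P x Q process grid; ijk2rank
+    // additionally spreads the summation index k over the replication factor
+    // R (R = 1 degenerates to plain 2D SUMMA)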
+    auto ij_keymap = [P, Q](const Key<2> &ij) {
+      int i = (int)ij[0];
+      int j = (int)ij[1];
+      int r = ij2rank(i, j, P, Q);
+      return r;
+    };
+
+    auto ijk_keymap = [P, Q, R](const Key<3> &ijk) {
+      int i = (int)ijk[0];
+      int j = (int)ijk[1];
+      int k = (int)ijk[2];
+      int r = ijk2rank(i, j, k, P, Q, R);
+      return r;
+    };
+
+    std::string seedStr(getCmdOption(argv, argv + argc, "-s"));
+    unsigned int seed = parseOption(seedStr, 0);
+    if (seed == 0) {
+      std::random_device rd;
+      seed = rd();
+      if (0 == ttg::default_execution_context().rank()) std::cerr << "#Random seeded with " << seed << std::endl;
+    }
+    ttg_broadcast(ttg::default_execution_context(), seed, 0);
+
+    std::vector<int> mTiles;
+    std::vector<int> nTiles;
+    std::vector<int> kTiles;
+    std::vector<std::vector<long>> a_rowidx_to_colidx;
+    std::vector<std::vector<long>> a_colidx_to_rowidx;
+    std::vector<std::vector<long>> b_rowidx_to_colidx;
+    std::vector<std::vector<long>> b_colidx_to_rowidx;
+
+    std::string checkStr(getCmdOption(argv, argv + argc, "-x"));
+    int check = parseOption(checkStr, !(argc >= 2));
+    timing = (check == 0);
+
+#if !defined(BLOCK_SPARSE_GEMM)
+    if (cmdOptionExists(argv, argv + argc, "-mm")) {
+      char *filename = getCmdOption(argv, argv + argc, "-mm");
+      tiling_type = filename;
+      initSpMatrixMarket(ij_keymap, filename, A, B, C, M, N, K);
+    } else if (cmdOptionExists(argv, argv + argc, "-rmat")) {
+      char *opt = getCmdOption(argv, argv + argc, "-rmat");
+      tiling_type = "RandomSparseMatrix";
+      initSpRmat(ij_keymap, opt, A, B, C, M, N, K, seed);
+    } else {
+      tiling_type = "HardCodedSparseMatrix";
+      initSpHardCoded(ij_keymap, A, B, C, M, N, K);
+    }
+
+    if (check) {
+      // We don't generate the sparse matrices in distributed, so Aref and Bref can
+      // just point to the same matrix, or be a local copy.
+      Aref = A;
+      Bref = B;
+    }
+
+    // We still need to build the metadata from the matrices.
+    make_rowidx_to_colidx_from_eigen(A, a_rowidx_to_colidx);
+    make_colidx_to_rowidx_from_eigen(A, a_colidx_to_rowidx);
+    make_rowidx_to_colidx_from_eigen(B, b_rowidx_to_colidx);
+    make_colidx_to_rowidx_from_eigen(B, b_colidx_to_rowidx);
+    // This is only needed to compute the flops
+    for (int mt = 0; mt < M; mt++) mTiles.emplace_back(1);
+    for (int nt = 0; nt < N; nt++) nTiles.emplace_back(1);
+    for (int kt = 0; kt < K; kt++) kTiles.emplace_back(1);
+#else
+    if (argc >= 2) {
+      std::string Mstr(getCmdOption(argv, argv + argc, "-M"));
+      M = parseOption(Mstr, 1200);
+      std::string Nstr(getCmdOption(argv, argv + argc, "-N"));
+      N = parseOption(Nstr, M);
+      std::string Kstr(getCmdOption(argv, argv + argc, "-K"));
+      K = parseOption(Kstr, N);
+      std::string minTsStr(getCmdOption(argv, argv + argc, "-t"));
+      minTs = parseOption(minTsStr, 64);
+      std::string maxTsStr(getCmdOption(argv, argv + argc, "-T"));
+      maxTs = parseOption(maxTsStr, minTs);
+      std::string avgStr(getCmdOption(argv, argv + argc, "-a"));
+      double avg = parseOption(avgStr, 0.3);
+      timing = (check == 0);
+      tiling_type = "RandomIrregularTiling";
+      initBlSpRandom(ij_keymap, M, N, K, minTs, maxTs, avg, A, B, Aref, Bref, check, mTiles, nTiles, kTiles,
+                     a_rowidx_to_colidx, a_colidx_to_rowidx, b_rowidx_to_colidx, b_colidx_to_rowidx, avg_nb, Adensity,
+                     Bdensity, seed);
+
+      C.resize(mTiles.size(), nTiles.size());
+    } else {
+      tiling_type = "HardCodedBlockSparseMatrix";
+      initBlSpHardCoded(ij_keymap, A, B, C, Aref, Bref, true, mTiles, nTiles, kTiles, a_rowidx_to_colidx,
+                        a_colidx_to_rowidx, b_rowidx_to_colidx, b_colidx_to_rowidx, M, N, K);
+    }
+#endif  // !defined(BLOCK_SPARSE_GEMM)
+
+    gflops = compute_gflops(a_rowidx_to_colidx, b_rowidx_to_colidx, mTiles, nTiles, kTiles);
+
+    std::string nbrunStr(getCmdOption(argv, argv + argc, "-n"));
+    int nb_runs = parseOption(nbrunStr, 1);
+
+    if (timing) {
+      // Start up engine
+      execute();
+      for (int nrun = 0; nrun < nb_runs; nrun++) {
+        parsec_devices_release_memory();
+        timed_measurement(A, B, ij_keymap, ijk_keymap, tiling_type, gflops, avg_nb, Adensity, Bdensity,
+                          a_rowidx_to_colidx, a_colidx_to_rowidx, b_rowidx_to_colidx, b_colidx_to_rowidx, mTiles,
+                          nTiles, kTiles, M, N, K, minTs, maxTs, P, Q, R);
+        parsec_devices_reset_load(default_execution_context().impl().context());
+      }
+    } else {
+      // flow graph needs to exist on every node
+      // N.B. to validate C we need it on node 0!
+      auto keymap_write = [](const Key<2> &key) { return 0; };
+      Edge<Key<2>> ctl("control");
+      Control control(ctl);
+      Edge<Key<2>, blk_t> eA, eB, eC;
+      Read_SpMatrix a("A", A, ctl, eA, ij_keymap);
+      Read_SpMatrix b("B", B, ctl, eB, ij_keymap);
+      Write_SpMatrix<> c(C, eC, keymap_write, true);
+      auto &c_status = c.status();
+      assert(!has_value(c_status));
+      //  SpMM25D a_times_b(world, eA, eB, eC, A, B);
+      SpMM25D<> a_times_b(eA, eB, eC, A, B, a_rowidx_to_colidx, a_colidx_to_rowidx, b_rowidx_to_colidx,
+                          b_colidx_to_rowidx, mTiles, nTiles, kTiles, ij_keymap, ijk_keymap, R);
+      TTGUNUSED(a_times_b);
+      // calling the Dot constructor with 'true' argument disables printing the types in the DOT output
+      if (default_execution_context().rank() == 0) std::cout << Dot{/*disable_type=*/true}(&control) << std::endl;
+
+      // ready to run!
+      auto connected = make_graph_executable(&control);
+      assert(connected);
+      TTGUNUSED(connected);
+
+      // ready, go! need only 1 kick, so must be done by 1 thread only
+      if (ttg::default_execution_context().rank() == 0) control.start(P, Q);
+
+      execute();
+      fence();
+
+      // validate C=A*B against the reference output
+      assert(has_value(c_status));
+      if (ttg::default_execution_context().rank() == 0) {
+        SpMatrix<> Cref = Aref * Bref;
+
+        double norm_2_square, norm_inf;
+        std::tie(norm_2_square, norm_inf) = norms(Cref - C);
+        std::cout << "||Cref - C||_2 = " << std::sqrt(norm_2_square) << std::endl;
+        std::cout << "||Cref - C||_\\infty = " << norm_inf << std::endl;
+        if (norm_inf > 1e-9) {
+          std::cout << "Cref:\n" << Cref << std::endl;
+          std::cout << "C:\n" << C << std::endl;
+          ttg_abort();
+        }
+      }
+
+      // validate Acopy=A against the reference output
+      // assert(has_value(copy_status));
+      // if (ttg::default_execution_context().rank() == 0) {
+      //   double norm_2_square, norm_inf;
+      //   std::tie(norm_2_square, norm_inf) = norms(Acopy - A);
+      //   std::cout << "||Acopy - A||_2 = " << std::sqrt(norm_2_square) << std::endl;
+      //   std::cout << "||Acopy - A||_\\infty = " << norm_inf << std::endl;
+      //   if (::ttg::tracing()) {
+      //     std::cout << "Acopy (" << static_cast<void *>(&Acopy) << "):\n" << Acopy << std::endl;
+      //     std::cout << "A (" << static_cast<void *>(&A) << "):\n" << A << std::endl;
+      //   }
+      //   if (norm_inf != 0) {
+      //     ttg_abort();
+      //   }
+      // }
+    }
+  }
+
+#ifdef BTAS_IS_USABLE
+  madness::finalize();
+#endif  // BTAS_IS_USABLE
+  ttg_finalize();
+  return 0;
+}
diff --git a/examples/t9/t9_streaming.cc b/examples/t9/t9_streaming.cc
index f79b80321..8511e1bd2 100644
--- a/examples/t9/t9_streaming.cc
+++ b/examples/t9/t9_streaming.cc
@@ -281,7 +281,7 @@ auto make_reconstruct(const nodeEdge& in, nodeEdge& out, const std::string& name
 }
 
 // cannot easily replace this with wrapper due to persistent state
-class Norm2 : public TT, Norm2, ttg::typelist> {
+class Norm2 : public TT, Norm2, ttg::typelist> {
   using baseT = typename Norm2::ttT;
   double sumsq;
   std::mutex charon;
diff --git a/examples/task-benchmarks/chain-ttg-dev.cc b/examples/task-benchmarks/chain-ttg-dev.cc
new file mode 100644
index 000000000..80f14bff4
--- /dev/null
+++ b/examples/task-benchmarks/chain-ttg-dev.cc
@@ -0,0 +1,251 @@
+//#define TTG_USE_USER_TERMDET 1
+#include "ttg.h"
+
+#include "chrono.h"
+
+#if defined(TTG_HAVE_CUDA)
+#define ES ttg::ExecutionSpace::CUDA
+#elif defined(TTG_HAVE_HIP)
+#define ES ttg::ExecutionSpace::HIP
+#else
+#error "Either CUDA OR HIP is required to build this test!"
+#endif
+
+#define NUM_TASKS 100000
+
+using namespace ttg;
+
+std::atomic<int> task_counter = 0;
+
+struct A : public ttg::TTValue<A> {
+  // TODO: allocate pinned memory
+  int v = 0;
+  ttg::Buffer<int> b;
+  A() : b(&v, 1) { }
+
+  A(A&& a) = default;
+  A(const A& a) : v(a.v), b(&v, 1) { }
+
+  template <typename Archive>
+  void serialize(Archive& ar) {
+    ttg_abort();
+  }
+  template <typename Archive>
+  void serialize(Archive& ar, const unsigned int) {
+    ttg_abort();
+  }
+
+};
+
+template <int num_flows>
+auto make_ttg(bool do_move);
+
+// flows task ids via values
+template <>
+auto make_ttg<1>(bool do_move) {
+  Edge<int, A> I2N, N2N;
+  Edge<int, A> N2S;
+
+  auto init = make_tt(
+      []() {
+        ++task_counter;
+        std::cout << "init 1 " << std::endl;
+        send<0>(0, A{});
+      }, edges(), edges(I2N));
+
+  auto next = make_tt([=](const int &key, auto&& value) -> ttg::device::Task {
+    //++task_counter;
+    co_await ttg::device::select(value.b);
+    if (key < NUM_TASKS) {
+      if (do_move) {
+        co_await ttg::device::forward(ttg::device::send<0>(key+1, std::move(value)));
+      } else {
+        co_await ttg::device::forward(ttg::device::send<0>(key+1, value));
+      }
+    } else {
+    }
+  }, edges(fuse(I2N, N2N)), edges(N2N));
+
+  return std::make_tuple(std::move(init), std::move(next));
+}
+
+template <>
+auto make_ttg<2>(bool do_move) {
+  Edge<int, A> I2N1, I2N2;
+  Edge<int, A> N2N1, N2N2;
+  Edge<int, A> N2S1, N2S2;
+
+  auto init = make_tt([]() {
+    send<0>(0, A{});
+    send<1>(0, A{});
+  }, edges(), edges(I2N1, I2N2));
+
+  auto next = make_tt([=](const int &key, A&& v1, A&& v2) -> ttg::device::Task {
+    co_await ttg::device::select(v1.b, v2.b);
+    if (key < NUM_TASKS) {
+      if (do_move) {
+        co_await ttg::device::forward(ttg::device::send<0>(key+1, std::move(v1)),
+                                      ttg::device::send<1>(key+1, std::move(v2)));
+      } else {
+        co_await ttg::device::forward(ttg::device::send<0>(key+1, v1),
+                                      ttg::device::send<1>(key+1, v2));
+      }
+    }
+  }, edges(fuse(I2N1, N2N1), fuse(I2N2, N2N2)), edges(N2N1, N2N2));
+
+  return std::make_tuple(std::move(init), std::move(next));
+}
+
+template <>
+auto make_ttg<4>(bool do_move) {
+  Edge<int, A> I2N1, I2N2, I2N3, I2N4;
+  Edge<int, A> N2N1, N2N2, N2N3, N2N4;
+  Edge<int, A> N2S1, N2S2, N2S3, N2S4;
+
+  auto init = make_tt(
+      []() {
+        send<0>(0, A{});
+        send<1>(0, A{});
+        send<2>(0, A{});
+        send<3>(0, A{});
+      }, edges(), edges(I2N1, I2N2, I2N3, I2N4));
+
+  auto next = make_tt([=](const int &key, A&& v1, A&& v2, A&& v3, A&& v4) -> ttg::device::Task {
+    co_await ttg::device::select(v1.b, v2.b, v3.b, v4.b);
+    if (key < NUM_TASKS) {
+      if (do_move) {
+        co_await ttg::device::forward(ttg::device::send<0>(key+1, std::move(v1)),
+                                      ttg::device::send<1>(key+1, std::move(v2)),
+                                      ttg::device::send<2>(key+1, std::move(v3)),
+                                      ttg::device::send<3>(key+1, std::move(v4)));
+      } else {
+        co_await ttg::device::forward(ttg::device::send<0>(key+1, v1),
+                                      ttg::device::send<1>(key+1, v2),
+                                      ttg::device::send<2>(key+1, v3),
+                                      ttg::device::send<3>(key+1, v4));
+      }
+    }
+  }, edges(fuse(I2N1, N2N1), fuse(I2N2, N2N2),
+           fuse(I2N3, N2N3), fuse(I2N4, N2N4)),
+     edges(N2N1, N2N2, N2N3, N2N4));
+
+  return std::make_tuple(std::move(init), std::move(next));
+}
+
+template <>
+auto make_ttg<8>(bool do_move) {
+  Edge<int, A> I2N1, I2N2, I2N3, I2N4, I2N5, I2N6, I2N7, I2N8;
+  Edge<int, A> N2N1, N2N2, N2N3, N2N4, N2N5, N2N6, N2N7, N2N8;
+  Edge<int, A> N2S1, N2S2, N2S3, N2S4, N2S5, N2S6, N2S7, N2S8;
+
+  auto init = make_tt(
+      []() {
+        send<0>(0, A{});
+        send<1>(0, A{});
+        send<2>(0, A{});
+        send<3>(0, A{});
+        send<4>(0, A{});
+        send<5>(0, A{});
+        send<6>(0, A{});
+        send<7>(0, A{});
+      }, edges(), edges(I2N1, I2N2, I2N3, I2N4, I2N5, I2N6, I2N7, I2N8));
+
+  auto next = make_tt([=](const int &key, auto&& v1, auto&& v2, auto&& v3, auto&& v4, auto&& v5, auto&& v6, auto&& v7, auto&& v8) -> ttg::device::Task {
+    co_await ttg::device::select(v1.b, v2.b, v3.b, v4.b, v5.b, v6.b, v7.b, v8.b);
+    if (key < NUM_TASKS) {
+      if (do_move) {
+        co_await ttg::device::forward(ttg::device::send<0>(key+1, std::move(v1)),
+                                      ttg::device::send<1>(key+1, std::move(v2)),
+                                      ttg::device::send<2>(key+1, std::move(v3)),
+                                      ttg::device::send<3>(key+1, std::move(v4)),
+                                      ttg::device::send<4>(key+1, std::move(v5)),
+                                      ttg::device::send<5>(key+1, std::move(v6)),
+                                      ttg::device::send<6>(key+1, std::move(v7)),
+                                      ttg::device::send<7>(key+1, std::move(v8)));
+      } else {
+        co_await ttg::device::forward(ttg::device::send<0>(key+1, v1),
+                                      ttg::device::send<1>(key+1, v2),
+                                      ttg::device::send<2>(key+1, v3),
+                                      ttg::device::send<3>(key+1, v4),
+                                      ttg::device::send<4>(key+1, v5),
+                                      ttg::device::send<5>(key+1, v6),
+                                      ttg::device::send<6>(key+1, v7),
+                                      ttg::device::send<7>(key+1, v8));
+      }
+    }
+  }, edges(fuse(I2N1, N2N1), fuse(I2N2, N2N2), fuse(I2N3, N2N3), fuse(I2N4, N2N4), fuse(I2N5, N2N5), fuse(I2N6, N2N6), fuse(I2N7, N2N7), fuse(I2N8, N2N8)),
+     edges(N2N1, N2N2, N2N3, N2N4, N2N5, N2N6, N2N7, N2N8));
+
+  return std::make_tuple(std::move(init), std::move(next));
+}
+
+// flows task ids via keys
+template <>
+auto make_ttg<0>(bool do_move) {
+  Edge<int, void> I2N, N2N;
+  Edge<int, void> N2S;
+
+  auto init = make_tt([](std::tuple<ttg::Out<int, void>> &outs) { sendk<0>(0, outs); }, edges(), edges(I2N));
+
+  auto next = make_tt([](const int& key) -> ttg::device::Task {
+    co_await ttg::device::select();
+    if (key < NUM_TASKS) {
+      co_await ttg::device::forward(ttg::device::sendk<0>(key+1));
+    }
+  }, edges(fuse(I2N, N2N)), edges(N2N));
+
+  return std::make_tuple(std::move(init), std::move(next));
+}
+
+template <int num_flows>
+void run_bench(bool do_move)
+{
+  auto [init, next] = make_ttg<num_flows>(do_move);
+
+  auto connected = make_graph_executable(init.get());
+  assert(connected);
+  std::cout << "Graph " << num_flows << " is connected.\n";
+
+  if (ttg::default_execution_context().rank() == 0) init->invoke();
+
+  ttg_execute(ttg_default_execution_context());
+  ttg_fence(ttg_default_execution_context());
+
+  auto t0 = now();
+  if (ttg::default_execution_context().rank() == 0) init->invoke();
+
+  ttg_execute(ttg_default_execution_context());
+  ttg_fence(ttg_default_execution_context());
+  auto t1 = now();
+
+  std::cout << "# of tasks = " << NUM_TASKS << std::endl;
+  std::cout << "time elapsed (microseconds) = " << duration_in_mus(t0, t1) << ", avg " << duration_in_mus(t0, t1) / (double)NUM_TASKS << std::endl;
+}
+
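+// usage: chain-ttg-dev [num_flows] [do_move]
+//   num_flows in {0, 1, 2, 4, 8} selects the chain variant (0 flows task ids
+//   via keys only); do_move != 0 forwards values via std::move, 0 copies them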
+int main(int argc, char* argv[]) {
+
+  int num_flows = 0;
+  int do_move = 1;
+  ttg_initialize(argc, argv, -1);
+
+  if (argc > 1) {
+    num_flows = std::atoi(argv[1]);
+  }
+
+  if (argc > 2) {
+    do_move = std::atoi(argv[2]);
+  }
+
+  switch(num_flows) {
+    case 0: run_bench<0>(do_move); break;
+    case 1: run_bench<1>(do_move); break;
+    case 2: run_bench<2>(do_move); break;
+    case 4: run_bench<4>(do_move); break;
+    case 8: run_bench<8>(do_move); break;
+    default: std::cout << "Unsupported number of flows: " << num_flows << std::endl;
+  }
+
+  ttg_finalize();
+  return 0;
+}
+
diff --git a/examples/task-benchmarks/chrono.h b/examples/task-benchmarks/chrono.h
new file mode 100644
index 000000000..358d6dcc4
--- /dev/null
+++ b/examples/task-benchmarks/chrono.h
@@ -0,0 +1,22 @@
+//
+// Created by Eduard Valeyev on 10/24/21.
+//
+
+#ifndef TEST_BENCHMARKS_CHRONO_H
+#define TEST_BENCHMARKS_CHRONO_H
+
+#include <chrono>
+
+using time_point = std::chrono::high_resolution_clock::time_point;
+
+inline time_point now() { return std::chrono::high_resolution_clock::now(); }
+
+inline std::chrono::system_clock::time_point system_now() {
+  return std::chrono::system_clock::now();
+}
+
+inline int64_t duration_in_mus(time_point const &t0, time_point const &t1) {
+  return std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count();
+}
+
+#endif  // TEST_BENCHMARKS_CHRONO_H
diff --git a/examples/test/test.cc b/examples/test/test.cc
index 61fbecbd7..f739881b4 100644
--- a/examples/test/test.cc
+++ b/examples/test/test.cc
@@ -22,7 +22,9 @@ class A : public TT, Out>, A, ttg::ty
       const std::string &name) : baseT(inedges, outedges, name, {"inputA"}, {"resultA", "iterateA"}) {}
 
+#if defined(TTG_HAVE_CUDA)
   static constexpr const bool have_cuda_op = true;
+#endif // TTG_HAVE_CUDA
 
   void op(const keyT &key, const baseT::input_refs_tuple_type &t, baseT::output_terminals_type &out) {
     // int& value = baseT::get<0>(t); // !! ERROR, trying to get int& from const int
diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index 2016872e2..275f3fdd8 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -1,15 +1,35 @@
 include(AddTTGExecutable)
 
 # TT unit test: core TTG ops
-add_ttg_executable(core-unittests-ttg "fibonacci.cc;ranges.cc;tt.cc;unit_main.cpp" LINK_LIBRARIES "Catch2::Catch2")
+set(ut_src
+    fibonacci.cc
+    ranges.cc
+    tt.cc
+    unit_main.cpp
+    streams.cc
+    )
+set(ut_libs Catch2::Catch2)
 
-# serialization test: probes serialization via all supported serialization methods (MADNESS, Boost::serialization, cereal) that are available
-add_executable(serialization "serialization.cc;unit_main.cpp")
-target_link_libraries(serialization "Catch2::Catch2;ttg-serialization")
-if (TARGET BTAS::BTAS)
-  target_link_libraries(serialization BTAS::BTAS)
-  target_compile_definitions(serialization PRIVATE TTG_HAS_BTAS=1)
-endif (TARGET BTAS::BTAS)
+# coroutine tests
+# we definitely have TARGET std::coroutine
+list(APPEND ut_src fibonacci-coro.cc)
+list(APPEND ut_src device_coro.cc)
+if (TTG_HAVE_CUDA)
+  list(APPEND ut_src cuda_kernel.cu)
+endif(TTG_HAVE_CUDA)
+list(APPEND ut_libs std::coroutine)
+
+add_ttg_executable(core-unittests-ttg "${ut_src}" LINK_LIBRARIES "${ut_libs}" COMPILE_DEFINITIONS "CATCH_CONFIG_NO_POSIX_SIGNALS=1" )
+
+# serialization test: probes serialization via all supported serialization methods (MADNESS, Boost::serialization) that are available
+add_ttg_executable(serialization serialization.cc unit_main.cpp
+                   LINK_LIBRARIES Catch2::Catch2 ttg-serialization $<TARGET_NAME_IF_EXISTS:BTAS::BTAS>
+                   COMPILE_DEFINITIONS $<$<TARGET_EXISTS:BTAS::BTAS>:TTG_HAS_BTAS=1>)
+#target_link_libraries(serialization "Catch2::Catch2;ttg-serialization")
+#if (TARGET BTAS::BTAS)
+#  target_link_libraries(serialization BTAS::BTAS)
+#  target_compile_definitions(serialization PRIVATE TTG_HAS_BTAS=1)
+#endif (TARGET BTAS::BTAS)
 
 # TODO: convert into unit test
 #if (TARGET MADworld)
@@ -17,4 +37,5 @@ endif (TARGET BTAS::BTAS)
 #endif(TARGET MADworld)
 
-catch_discover_tests(serialization TEST_PREFIX "ttg/test/unit/")
+catch_discover_tests(serialization-parsec TEST_PREFIX "ttg/test/unit/")
+catch_discover_tests(serialization-mad TEST_PREFIX "ttg/test/unit/")
diff --git a/tests/unit/cuda_kernel.cu b/tests/unit/cuda_kernel.cu
new file mode 100644
index 000000000..f6f00d172
--- /dev/null
+++ b/tests/unit/cuda_kernel.cu
@@ -0,0 +1,22 @@
+
+#include "cuda_kernel.h"
+
+#ifdef TTG_HAVE_CUDA
+
+static __global__ void cu_increment_buffer(double* buffer, double* scratch) {
+  // Thread index
+  int tx = threadIdx.x;
+
+  buffer[tx] += 1.0;
+  if (tx == 0 && scratch != nullptr) {
+    *scratch += 1.0;
+  }
+}
+
+void increment_buffer(double* buffer, std::size_t buffer_size, double* scratch, std::size_t scratch_size) {
+
+  cu_increment_buffer<<<1, buffer_size>>>(buffer, scratch);
+
+}
+
+#endif // TTG_HAVE_CUDA
\ No newline at end of file
diff --git a/tests/unit/cuda_kernel.h b/tests/unit/cuda_kernel.h
new file mode 100644
index 000000000..4fec87a99
--- /dev/null
+++ b/tests/unit/cuda_kernel.h
@@ -0,0 +1,4 @@
+#include "ttg/config.h"
+#include <cstddef>
+
+void increment_buffer(double* buffer, std::size_t buffer_size, double* scratch, std::size_t scratch_size);
\ No newline at end of file
diff --git a/tests/unit/device_coro.cc b/tests/unit/device_coro.cc
new file mode 100644
index 000000000..60581d232
--- /dev/null
+++ b/tests/unit/device_coro.cc
@@ -0,0 +1,438 @@
+#include <catch2/catch_all.hpp>
+
+#include "ttg.h"
+
+#include "ttg/serialization.h"
+
+#include "cuda_kernel.h"
+
+struct value_t {
+  ttg::Buffer<double> db;  // TODO: rename
+  int quark;
+
+  template <typename Archive>
+  void serialize(Archive& ar, const unsigned int version) {
+    ar& quark;
+    ar& db;  // input:
+  }
+};
+
+#ifdef TTG_SERIALIZATION_SUPPORTS_MADNESS
+/* devicebuf is non-POD so provide serialization
+ * information for members not a devicebuf */
+namespace madness::archive {
+  template <class Archive>
+  struct ArchiveSerializeImpl<Archive, value_t> {
+    static inline void serialize(const Archive& ar, value_t& obj) { ar& obj.quark & obj.db; };
+  };
+}  // namespace madness::archive
+#endif  // TTG_SERIALIZATION_SUPPORTS_MADNESS
+
+#if defined(TTG_HAVE_DEVICE) && defined(TTG_IMPL_DEVICE_SUPPORT)
+
+TEST_CASE("Device", "coro") {
+
+  SECTION("devicebuf") {
+
+    ttg::Edge<int, value_t> edge;
+    auto fn = [&](const int& key, value_t&& val) -> ttg::device::Task {
+      //ttg::print("device_task key ", key);
+
+      /* wait for the view to be available on the device */
+      co_await ttg::device::select(val.db);
+      /* once we're back here the data has been transferred */
+      CHECK(val.db.current_device_ptr() != nullptr);
+
+      /* NO KERNEL */
+
+      /* here we suspend to wait for a kernel to complete */
+      co_await ttg::device::wait();
+
+      /* we're back, the kernel executed and we can send */
+      if (key < 10) {
+        /* TODO: should we move the view in here if we want to get the device side data */
+        //ttg::send<0>(key+1, std::move(val));
+        co_await ttg::device::forward(ttg::device::send<0>(key+1, std::move(val)));
+      }
+    };
+
+    //ptr.get_view(device_id);
+
+    auto tt = ttg::make_tt(fn, ttg::edges(edge), ttg::edges(edge),
+                           "device_task", {"edge_in"}, {"edge_out"});
+    ttg::make_graph_executable(tt);
+    if (ttg::default_execution_context().rank() == 0) tt->invoke(0, value_t{});
+    std::cout << "Entering fence" << std::endl;
+    ttg::ttg_fence(ttg::default_execution_context());
+  }
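+
+  /* the sections below all follow the same protocol: co_await device::select(...)
+   * stages the listed buffers/scratches onto a device, a kernel may then be
+   * launched on them, and co_await device::wait(...) suspends the task until
+   * the kernel (and any requested transfers back to the host) have completed */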
+
+  SECTION("devicebuf-inc") {
+
+    ttg::Edge<int, value_t> edge;
+    auto fn = [&](const int& key, value_t&& val) -> ttg::device::Task {
+      //ttg::print("device_task key ", key);
+
+      /* wait for the view to be available on the device */
+      co_await ttg::device::select(val.db);
+      /* once we're back here the data has been transferred */
+      CHECK(val.db.current_device_ptr() != nullptr);
+
+      std::cout << "KEY " << key << " VAL IN DEV " << *val.db.current_device_ptr() << " VAL IN HOST " << *val.db.host_ptr() << std::endl;
+
+      /* call a kernel */
+#ifdef TTG_HAVE_CUDA
+      increment_buffer(val.db.current_device_ptr(), val.db.size(), nullptr, 0);
+#endif // TTG_HAVE_CUDA
+
+      /* here we suspend to wait for a kernel to complete */
+      co_await ttg::device::wait(val.db);
+
+      std::cout << "KEY " << key << " VAL OUT DEV " << *val.db.current_device_ptr() << " VAL OUT HOST " << *val.db.host_ptr() << std::endl;
+
+#ifdef TTG_HAVE_CUDA
+      /* buffer is incremented once per task, so after the kernel it equals key+1 */
+      CHECK(static_cast<int>(*val.db.host_ptr()) == key+1);
+#endif // TTG_HAVE_CUDA
+
+      /* we're back, the kernel executed and we can send */
+      if (key < 10) {
+        /* TODO: should we move the view in here if we want to get the device side data */
+        //ttg::send<0>(key+1, std::move(val));
+        co_await ttg::device::forward(ttg::device::send<0>(key+1, std::move(val)));
+      }
+    };
+
+    //ptr.get_view(device_id);
+
+    auto tt = ttg::make_tt(fn, ttg::edges(edge), ttg::edges(edge),
+                           "device_task", {"edge_in"}, {"edge_out"});
+    ttg::make_graph_executable(tt);
+    value_t v;
+    *v.db.host_ptr() = 2.0;  // start from non-zero value
+    if (ttg::default_execution_context().rank() == 0) tt->invoke(2, std::move(v));
+    std::cout << "Entering fence" << std::endl;
+    ttg::ttg_fence(ttg::default_execution_context());
+  }
+
+  SECTION("scratch") {
+
+    ttg::Edge<int, value_t> edge;
+    auto fn = [&](const int& key, value_t&& val) -> ttg::device::Task {
+      double scratch = 0.0;
+      ttg::devicescratch<double> ds = ttg::make_scratch(&scratch, ttg::scope::Allocate);
+
+      /* wait for the view to be available on the device */
+      co_await ttg::device::select(ds, val.db);
+      /* once we're back here the data has been transferred */
+      CHECK(ds.device_ptr() != nullptr);
+
+      /* call a kernel */
+#ifdef TTG_HAVE_CUDA
+      increment_buffer(val.db.current_device_ptr(), val.db.size(), ds.device_ptr(), ds.size());
+#endif // TTG_HAVE_CUDA
+
+      /* here we suspend to wait for a kernel to complete */
+      co_await ttg::device::wait(ds);
+
+#ifdef TTG_HAVE_CUDA
+      /* the scratch is allocated but no data is transferred in; it's incremented once */
+      CHECK((static_cast<int>(scratch)-1) == 0);
+#endif // TTG_HAVE_CUDA
+
+      /* we're back, the kernel executed and we can send */
+      if (key < 10) {
+        /* TODO: should we move the view in here if we want to get the device side data */
+        //ttg::send<0>(key+1, std::move(val));
+        /* NOTE: we use co_await here instead of co_return because co_return destroys all local variables first;
+         * we will not return from this co_await! */
+        co_await ttg::device::forward(ttg::device::send<0>(key+1, std::move(val)));
+      }
+    };
+
+    auto tt = ttg::make_tt(fn, ttg::edges(edge), ttg::edges(edge),
+                           "device_task", {"edge_in"}, {"edge_out"});
+    ttg::make_graph_executable(tt);
+    if (ttg::default_execution_context().rank() == 0) tt->invoke(0, value_t{});
+    ttg::ttg_fence(ttg::default_execution_context());
+  }
+
+  SECTION("scratch-syncin") {
+
+    ttg::Edge<int, value_t> edge;
+    auto fn = [&](const int& key, value_t&& val) -> ttg::device::Task {
+      double scratch = key;
+      ttg::devicescratch<double> ds = ttg::make_scratch(&scratch, ttg::scope::SyncIn);
+
+      /* wait for the view to be available on the device */
+      co_await ttg::device::select(ds, val.db);
+      /* once we're back here the data has been transferred */
+      CHECK(ds.device_ptr() != nullptr);
+
+      /* call a kernel */
+#ifdef TTG_HAVE_CUDA
+      increment_buffer(val.db.current_device_ptr(), val.db.size(), ds.device_ptr(), ds.size());
+#endif // TTG_HAVE_CUDA
+
+      /* here we suspend to wait for a kernel to complete */
+      co_await ttg::device::wait(ds);
+
+#ifdef TTG_HAVE_CUDA
+      /* scratch is synced in and incremented once per task, so it should come back as key+1 */
+      CHECK((static_cast<int>(scratch))-1 == key);
+#endif // TTG_HAVE_CUDA
+
+      /* we're back, the kernel executed and we can send */
+      if (key < 10) {
+        /* TODO: should we move the view in here if we want to get the device side data */
+        //ttg::send<0>(key+1, std::move(val));
+        /* NOTE: we use co_await here instead of co_return because co_return destroys all local variables first;
+         * we will not return from this co_await! */
+        co_await ttg::device::forward(ttg::device::send<0>(key+1, std::move(val)));
+      }
+    };
+
+    auto tt = ttg::make_tt(fn, ttg::edges(edge), ttg::edges(edge),
+                           "device_task", {"edge_in"}, {"edge_out"});
+    ttg::make_graph_executable(tt);
+    if (ttg::default_execution_context().rank() == 0) tt->invoke(0, value_t{});
+    ttg::ttg_fence(ttg::default_execution_context());
+  }
+
+  SECTION("scratch-value-out") {
+
+    ttg::Edge<int, value_t> edge;
+    auto fn = [&](const int& key, value_t&& val) -> ttg::device::Task {
+      double scratch = 0.0;
+      ttg::devicescratch<double> ds = ttg::make_scratch(&scratch, ttg::scope::Allocate);
+
+      /* wait for the view to be available on the device */
+      co_await ttg::device::select(ds, val.db);
+      /* once we're back here the data has been transferred */
+      CHECK(ds.device_ptr() != nullptr);
+
+      /* call a kernel */
+#ifdef TTG_HAVE_CUDA
+      increment_buffer(val.db.current_device_ptr(), val.db.size(), ds.device_ptr(), ds.size());
+#endif // TTG_HAVE_CUDA
+
+      /* here we suspend to wait for a kernel to complete */
+      co_await ttg::device::wait(ds, val.db);
+
+#ifdef TTG_HAVE_CUDA
+      /* buffer and scratch are incremented once per task, so the scratch should come back as 1 */
+      CHECK((static_cast<int>(scratch)-1) == 0);
+#endif // TTG_HAVE_CUDA
+
+      /* we're back, the kernel executed and we can send */
+      if (key < 10) {
+        /* TODO: should we move the view in here if we want to get the device side data */
+        //ttg::send<0>(key+1, std::move(val));
+        /* NOTE: we use co_await here instead of co_return because co_return destroys all local variables first;
+         * we will not return from this co_await! */
+        co_await ttg::device::forward(ttg::device::send<0>(key+1, std::move(val)));
+      }
+    };
+
+    auto tt = ttg::make_tt(fn, ttg::edges(edge), ttg::edges(edge),
+                           "device_task", {"edge_in"}, {"edge_out"});
+    ttg::make_graph_executable(tt);
+    if (ttg::default_execution_context().rank() == 0) tt->invoke(0, value_t{});
+    ttg::ttg_fence(ttg::default_execution_context());
+  }
+
+  SECTION("ptr") {
+
+    ttg::Edge<int, value_t> edge;
+    ttg::Ptr<value_t> ptr;
+    int last_key = 0;
+    constexpr const int num_iter = 10;
+    auto fn = [&](const int& key, value_t&& val) -> ttg::device::Task {
+      double scratch = key;
+      ttg::devicescratch<double> ds = ttg::make_scratch(&scratch, ttg::scope::SyncIn);
+
+      /* wait for the view to be available on the device */
+      co_await ttg::device::select(ds, val.db);
+      /* once we're back here the data has been transferred */
+      CHECK(ds.device_ptr() != nullptr);
+
+      /* KERNEL */
+#ifdef TTG_HAVE_CUDA
+      increment_buffer(val.db.current_device_ptr(), val.db.size(), ds.device_ptr(), ds.size());
+#endif // TTG_HAVE_CUDA
+
+      /* here we suspend to wait for a kernel and the out-transfer to complete */
+      co_await ttg::device::wait(val.db, ds);
+
+#ifdef TTG_HAVE_CUDA
+      /* buffer and scratch are incremented once per task, so they should equal key+1 */
+      CHECK(static_cast<int>(scratch) == key+1);
+      CHECK(static_cast<int>(*val.db.host_ptr()) == key+1);
+#endif // TTG_HAVE_CUDA
+
+      /* we're back, the kernel executed and we can send */
+      if (key < num_iter) {
+        //ttg::send<0>(key+1, std::move(val));
+        co_await ttg::device::forward(ttg::device::send<0>(key+1, std::move(val)));
+      } else {
+        /* exfiltrate the value */
+        /* TODO: what consistency do we expect from get_ptr? */
+        ptr = ttg::get_ptr(val);
+        last_key = key;
+      }
+    };
+
+    //ptr.get_view(device_id);
+
+    auto tt = ttg::make_tt(fn, ttg::edges(edge), ttg::edges(edge),
+                           "device_task", {"edge_in"}, {"edge_out"});
+    ttg::make_graph_executable(tt);
+    if (ttg::default_execution_context().rank() == 0) tt->invoke(0, value_t{});
+    ttg::ttg_fence(ttg::default_execution_context());
+    if (num_iter == last_key) {
+      CHECK(ptr.is_valid());
+      assert(ptr.is_valid());
+    }
+
+    /* feed the ptr back into a graph */
+    if (ttg::default_execution_context().rank() == 0) tt->invoke(last_key+1, ptr);
+    ttg::ttg_fence(ttg::default_execution_context());
+
+    ptr.reset();
+  }
+
+  /* TODO: enable this test once we control the PaRSEC state machine! */
+  SECTION("device-host-tasks") {
+
+    ttg::Edge<int, value_t> h2d, d2h;
+
+    auto host_fn = [&](const int& key, value_t&& val) {
+      /* check that the data has been synced back */
+#ifdef TTG_HAVE_CUDA
+      CHECK(static_cast<int>(*val.db.host_ptr()) == key);
+#endif // TTG_HAVE_CUDA
+
+      /* modify the data */
+      *val.db.host_ptr() += 1.0;
+#ifdef TTG_HAVE_CUDA
+      CHECK(static_cast<int>(*val.db.host_ptr()) == key+1);
+#endif // TTG_HAVE_CUDA
+
+      /* send back to the device */
+      ttg::send<0>(key+1, std::move(val));
+    };
+    auto htt = ttg::make_tt(host_fn, ttg::edges(d2h), ttg::edges(h2d),
+                            "host_task", {"d2h"}, {"h2d"});
+
+    auto device_fn = [&](const int& key, value_t&& val) -> ttg::device::Task {
+
+      /* wait for the view to be available on the device */
+      co_await ttg::device::select(val.db);
+
+      /* call a kernel */
+#ifdef TTG_HAVE_CUDA
+      increment_buffer(val.db.current_device_ptr(), val.db.size(), nullptr, 0);
+#endif // TTG_HAVE_CUDA
+
+      /* here we suspend to wait for a kernel to complete */
+      //co_await ttg::device::wait(val.db);
+      co_await ttg::device::wait();
+
+      /* we're back, the kernel executed and we can send */
+      if (key < 10) {
+        /* TODO: should we move the view in here if we want to get the device side data */
+        std::cout << "Sending to host key " << key+1 << std::endl;
+        co_await ttg::device::forward(ttg::device::send<0>(key+1, std::move(val)));
+      }
+    };
+
+    auto dtt = ttg::make_tt(device_fn, ttg::edges(h2d), ttg::edges(d2h),
+                            "device_task", {"h2d"}, {"d2h"});
+    ttg::make_graph_executable(dtt);
+    if (ttg::default_execution_context().rank() == 0) htt->invoke(0, value_t{});
+    ttg::ttg_fence(ttg::default_execution_context());
+  }
+
+  SECTION("loop") {
+
+    ttg::Edge<int, value_t> edge;
+    auto fn = [&](int key, value_t&& val) -> ttg::device::Task {
+      double scratch = 1.0;
+      ttg::devicescratch<double> ds = ttg::make_scratch(&scratch, ttg::scope::Allocate);
+
+      /* wait for the view to be available on the device */
+      co_await ttg::device::select(ds, val.db);
+      /* once we're back here the data has been transferred */
+      CHECK(ds.device_ptr() != nullptr);
+
+      for (int i = 0; i < 10; ++i) {
+
+        CHECK(ds.device_ptr() != nullptr);
+        CHECK(val.db.current_device_ptr() != nullptr);
+
+        /* KERNEL */
+#ifdef TTG_HAVE_CUDA
+        increment_buffer(val.db.current_device_ptr(), val.db.size(), ds.device_ptr(), ds.size());
+        //increment_buffer(val.db.current_device_ptr(), val.db.size(), 0, 0);
+#endif // TTG_HAVE_CUDA
+
+        /* here we suspend to wait for a kernel and the out-transfer to complete */
+        co_await ttg::device::wait(val.db);
+
+#ifdef TTG_HAVE_CUDA
+        /* buffer is incremented once per loop iteration, so after iteration i it holds i+1 */
+        //CHECK(static_cast<int>(scratch) == i);
+        CHECK(static_cast<int>(*val.db.host_ptr()) == i+1);
+#endif // TTG_HAVE_CUDA
+      }
+    };
+
+    auto tt = ttg::make_tt(fn, ttg::edges(edge), ttg::edges(edge),
+                           "device_task", {"edge_in"}, {"edge_out"});
+    ttg::make_graph_executable(tt);
+    if (ttg::default_execution_context().rank() == 0) tt->invoke(0, value_t{});
+    ttg::ttg_fence(ttg::default_execution_context());
+  }
+
+  SECTION("loop-scratchout") {
+
+    ttg::Edge<int, value_t> edge;
+    auto fn = [&](int key, value_t&& val) -> ttg::device::Task {
+      double scratch = -10.0;
+      ttg::devicescratch<double> ds = ttg::make_scratch(&scratch, ttg::scope::SyncIn);
+
+      /* wait for the view to be available on the device */
+      co_await ttg::device::select(ds, val.db);
+      /* once we're back here the data has been transferred */
+      CHECK(ds.device_ptr() != nullptr);
+
+      for (int i = 0; i < 10; ++i) {
+
+        CHECK(ds.device_ptr() != nullptr);
+        CHECK(val.db.current_device_ptr() != nullptr);
+
+        /* KERNEL */
+#ifdef TTG_HAVE_CUDA
+        increment_buffer(val.db.current_device_ptr(), val.db.size(), ds.device_ptr(), ds.size());
+        //increment_buffer(val.db.current_device_ptr(), val.db.size(), 0, 0);
+#endif // TTG_HAVE_CUDA
+
+        /* here we suspend to wait for a kernel and the out-transfer to complete */
+        co_await ttg::device::wait(val.db, ds);
+
+#ifdef TTG_HAVE_CUDA
+        /* buffer and scratch are incremented once per loop iteration */
+        CHECK(static_cast<int>(scratch) == (-10+i+1));
+        CHECK(static_cast<int>(*val.db.host_ptr()) == i+1);
+#endif // TTG_HAVE_CUDA
+      }
+    };
+
+    auto tt = ttg::make_tt(fn, ttg::edges(edge), ttg::edges(edge),
+                           "device_task", {"edge_in"}, {"edge_out"});
+    ttg::make_graph_executable(tt);
+    if (ttg::default_execution_context().rank() == 0) tt->invoke(0, value_t{});
+    ttg::ttg_fence(ttg::default_execution_context());
+  }
+}
+
+#endif // TTG_IMPL_DEVICE_SUPPORT
diff --git a/tests/unit/fibonacci-coro.cc b/tests/unit/fibonacci-coro.cc
new file mode 100644
index 000000000..a464750f5
--- /dev/null
+++ b/tests/unit/fibonacci-coro.cc
@@ -0,0 +1,120 @@
+#include <catch2/catch_all.hpp>
+
+#include "ttg.h"
+
+#include "ttg/serialization/std/pair.h"
+#include "ttg/util/hash/std/pair.h"
+
+constexpr int64_t N = 1000;
+
+TEST_CASE("Fibonacci-coroutines", "[fib][core]") {
+  // compute the reference result
+  int reference_result = 0;
+  {
+    // recursive lambda pattern from http://pedromelendez.com/blog/2015/07/16/recursive-lambdas-in-c14/
+    auto compute_reference_result = [&reference_result](int f_np1, int f_n) {
+      auto impl = [&reference_result](int f_np1, int f_n, const auto &impl_ref) -> void {
+        assert(f_n < N);
+        reference_result += f_n;
+        if (f_np1 < N) {
+          const auto f_np2 = f_np1 + f_n;
+          impl_ref(f_np2, f_np1, impl_ref);
+        }
+      };
+      impl(f_np1, f_n, impl);
+    };
+    compute_reference_result(1, 0);
+  }
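+  // for N = 1000 this sums F_0 + F_1 + ... + F_16 = 0 + 1 + 1 + 2 + ... + 987,
+  // i.e. reference_result == F_18 - 1 == 2583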
that can potentially call + // finalize<1> in the other clause + // - reversing the order of sends will create a race between wc->set_arg->send<1> executing on this thread + // and wa->set_arg->finalize<1> executing in thread pool + // - there is no way to detect the "undesired" outcome of the race without keeping expired TTArgs from the + // cache there is no way currently to avoid race if there is more than 1 process ... need to track the + // number of messages that the reducing terminal will receive, that's what distributed example demonstrates. + // The order of operations will still matter. + if (F_n_plus_1 < N) { + const auto F_n_plus_2 = F_n_plus_1 + F_n; + // cool, if there are no events to wait for co_await is no-op + co_await ttg::resumable_task_events{}; + ttg::sendv<1>(F_n_plus_1); + ttg::send<0>(F_n_plus_2, F_n_plus_1); + } else + ttg::finalize<1>(); + + // to test coro-based task lifecycle introduce fake events + ttg::event null_event; + co_await ttg::resumable_task_events{null_event}; + + // N.B. return void just as normal TT op + co_return; + }, + ttg::edges(F2F), ttg::edges(F2F, F2P)); + auto print_op = ttg::make_tt( + [reference_result](const int &value, std::tuple<> &out) { + ttg::print("sum of Fibonacci numbers up to ", N, " = ", value); + CHECK(value == reference_result); + }, + ttg::edges(F2P), ttg::edges()); + print_op->set_input_reducer<0>([](int &a, const int &b) { a = a + b; }); + make_graph_executable(fib_op); + if (ttg::default_execution_context().rank() == 0) fib_op->invoke(1, 0); + ttg::ttg_fence(ttg::default_execution_context()); + } + } + + // in distributed memory we must count how many messages the reducer will receive + SECTION("distributed-memory") { + ttg::Edge> F2F; + ttg::Edge F2P; + const auto nranks = ttg::default_execution_context().size(); + + auto fib_op = ttg::make_tt( + // computes next value: F_{n+2} = F_{n+1} + F_{n}, seeded by F_1 = 1, F_0 = 0 + [](const int &n, const std::pair &F_np1_n) { + const auto &[F_n_plus_1, F_n] = F_np1_n; + if (F_n_plus_1 < N) { + const auto F_n_plus_2 = F_n_plus_1 + F_n; + ttg::print("sent ", F_n_plus_1, " to fib reducer"); + ttg::sendv<1>(F_n_plus_1); + ttg::send<0>(n + 1, std::make_pair(F_n_plus_2, F_n_plus_1)); + } else { + // how many messages the reducer should expect to receive + ttg::set_size<1>(n); + ttg::print("fib reducer will expect ", n, " messages"); + } + }, + ttg::edges(F2F), ttg::edges(F2F, F2P)); + auto print_op = ttg::make_tt( + [reference_result](const int &value, std::tuple<> &out) { + ttg::print("sum of Fibonacci numbers up to ", N, " = ", value); + CHECK(value == reference_result); + }, + ttg::edges(F2P), ttg::edges()); + // move all fib tasks to last rank, all reductions will happen on 0 => for some reason no reductions occur! 
+ fib_op->set_keymap([=](const auto &key) { return nranks - 1; }); + fib_op->set_trace_instance(true); + print_op->set_input_reducer<0>([](int &a, const int &b) { + ttg::print("fib reducer: current value = ", a, ", incremented by ", b, " set to ", a + b); + a = a + b; + }); + make_graph_executable(fib_op); + ttg::ttg_fence(ttg::default_execution_context()); + if (ttg::default_execution_context().rank() == 0) fib_op->invoke(0, std::make_pair(1, 0)); + ttg::ttg_fence(ttg::default_execution_context()); + } +} // TEST_CAST("Fibonacci") diff --git a/tests/unit/fibonacci.cc b/tests/unit/fibonacci.cc index 6434aaf55..37d333e07 100644 --- a/tests/unit/fibonacci.cc +++ b/tests/unit/fibonacci.cc @@ -1,4 +1,4 @@ -#include +#include #include "ttg.h" @@ -49,8 +49,9 @@ TEST_CASE("Fibonacci", "[fib][core]") { const auto F_n_plus_2 = F_n_plus_1 + F_n; ttg::sendv<1>(F_n_plus_1, outs); ttg::send<0>(F_n_plus_2, F_n_plus_1, outs); - } else + } else { ttg::finalize<1>(outs); + } }, ttg::edges(F2F), ttg::edges(F2F, F2P)); auto print_op = ttg::make_tt( diff --git a/tests/unit/ranges.cc b/tests/unit/ranges.cc index cd423a2bd..f532805f5 100644 --- a/tests/unit/ranges.cc +++ b/tests/unit/ranges.cc @@ -1,4 +1,4 @@ -#include +#include #include "ttg.h" diff --git a/tests/unit/serialization.cc b/tests/unit/serialization.cc index 434c828b7..e12607de3 100644 --- a/tests/unit/serialization.cc +++ b/tests/unit/serialization.cc @@ -40,14 +40,7 @@ class POD { bool operator==(const POD& other) const { return value == other.value; } }; static_assert(std::is_trivially_copyable_v); - -#ifdef TTG_SERIALIZATION_SUPPORTS_CEREAL -// WTF?! std::array of non-Serializable is Serializable -static_assert(!cereal::traits::is_input_serializable::value); -static_assert(!cereal::traits::is_output_serializable::value); -static_assert(!cereal::traits::is_input_serializable, cereal::BinaryInputArchive>::value); -static_assert(!cereal::traits::is_output_serializable, cereal::BinaryOutputArchive>::value); -#endif // TTG_SERIALIZATION_SUPPORTS_CEREAL +static_assert(ttg::detail::is_memcpyable_v); static_assert(!ttg::detail::is_madness_user_buffer_serializable_v); #ifdef TTG_SERIALIZATION_SUPPORTS_BOOST @@ -55,10 +48,8 @@ static_assert(!ttg::detail::is_boost_serializable_v); #endif // TTG_SERIALIZATION_SUPPORTS_BOOST static_assert(!ttg::detail::is_boost_user_buffer_serializable_v); -static_assert(!ttg::detail::is_cereal_user_buffer_serializable_v); static_assert(!ttg::detail::is_madness_user_buffer_serializable_v>); static_assert(!ttg::detail::is_boost_user_buffer_serializable_v>); -static_assert(!ttg::detail::is_cereal_user_buffer_serializable_v>); std::ostream& operator<<(std::ostream& s, const POD& f) { s << "POD(" << f.get() << ")"; @@ -91,7 +82,17 @@ class NonPOD { int get() const { return value; } }; +// non-default ctor breaks trivial copyability static_assert(!std::is_trivially_copyable_v); +static_assert(!ttg::detail::is_memcpyable_v); + +// but can allow use of std::memcpy on type +class MemcpyableNonPOD : public NonPOD {}; +namespace ttg::detail { + template<> inline constexpr bool is_memcpyable_override_v = true; +} // namespace ttg::detail +static_assert(!std::is_trivially_copyable_v); +static_assert(ttg::detail::is_memcpyable_v); namespace intrusive::symmetric::mc { @@ -150,7 +151,7 @@ namespace intrusive::symmetric::bc_v { int get() const { return value; } - // boost uses `unsigned int` for version, cereal uses `std::uint32_t` + // boost uses `unsigned int` for version template void serialize(Archive& ar, const unsigned int version) { 
ar& value; @@ -169,9 +170,6 @@ namespace intrusive_private::symmetric::bc_v { #ifdef TTG_SERIALIZATION_SUPPORTS_BOOST friend class boost::serialization::access; #endif -#ifdef TTG_SERIALIZATION_SUPPORTS_CEREAL - friend class cereal::access; // befriend the cereal version of access -#endif template void serialize(Archive& ar, const unsigned int version) { @@ -190,72 +188,6 @@ namespace intrusive_private::symmetric::bc_v { static_assert(!std::is_trivially_copyable_v); } // namespace intrusive_private::symmetric::bc_v -#ifdef TTG_SERIALIZATION_SUPPORTS_CEREAL -namespace intrusive::symmetric::c { - - class NonPOD { - int value; - - public: - NonPOD() = default; - NonPOD(int value) : value(value) {} - NonPOD(const NonPOD& other) : value(other.value) {} - - int get() const { return value; } - - // versioned - template - std::enable_if_t || - std::is_base_of_v> - serialize(Archive& ar) { - ar(value); - } - }; - static_assert(!std::is_trivially_copyable_v); - - static_assert(ttg::detail::is_cereal_buffer_serializable_v); - static_assert(!ttg::detail::is_boost_buffer_serializable_v); - static_assert(!ttg::detail::is_madness_buffer_serializable_v); - static_assert(ttg::detail::is_cereal_user_buffer_serializable_v); - static_assert(!ttg::detail::is_boost_user_buffer_serializable_v); - static_assert(!ttg::detail::is_madness_user_buffer_serializable_v); - -} // namespace intrusive::symmetric::c - -namespace intrusive::symmetric::c_v { - - class NonPOD { - int value; - - public: - NonPOD() = default; - NonPOD(int value) : value(value) {} - NonPOD(const NonPOD& other) : value(other.value) {} - - int get() const { return value; } - - // versioned - template - std::enable_if_t || - std::is_base_of_v> - serialize(Archive& ar, std::uint32_t const version) { - ar(value); - } - }; - static_assert(!std::is_trivially_copyable_v); - - static_assert(ttg::detail::is_cereal_buffer_serializable_v); - static_assert(!ttg::detail::is_boost_serializable_v); - static_assert(!ttg::detail::is_boost_buffer_serializable_v); - static_assert(!ttg::detail::is_madness_buffer_serializable_v); - static_assert(ttg::detail::is_cereal_user_buffer_serializable_v); - static_assert(!ttg::detail::is_boost_user_buffer_serializable_v); - static_assert(!ttg::detail::is_madness_user_buffer_serializable_v); - -} // namespace intrusive::symmetric::c_v - -#endif // TTG_SERIALIZATION_SUPPORTS_CEREAL - #ifdef TTG_SERIALIZATION_SUPPORTS_BOOST // boost serialization, with version (and object) tracking @@ -361,10 +293,6 @@ static_assert(std::is_same_v +#include static_assert(ttg::detail::is_madness_buffer_serializable_v); static_assert(!ttg::detail::is_madness_user_buffer_serializable_v); static_assert(!ttg::detail::is_boost_user_buffer_serializable_v); -static_assert(!ttg::detail::is_cereal_user_buffer_serializable_v); static_assert(!ttg::detail::is_user_buffer_serializable_v); static_assert(ttg::detail::is_madness_buffer_serializable_v); static_assert(!ttg::detail::is_madness_user_buffer_serializable_v); static_assert(!ttg::detail::is_boost_user_buffer_serializable_v); -static_assert(!ttg::detail::is_cereal_user_buffer_serializable_v); static_assert(!ttg::detail::is_user_buffer_serializable_v); static_assert(ttg::detail::is_madness_buffer_serializable_v); static_assert(!ttg::detail::is_madness_user_buffer_serializable_v); static_assert(!ttg::detail::is_boost_user_buffer_serializable_v); -static_assert(!ttg::detail::is_cereal_user_buffer_serializable_v); static_assert(!ttg::detail::is_user_buffer_serializable_v); 
static_assert(!ttg::detail::is_user_buffer_serializable_v>); +// default_data_descriptor> should be defined but std::is_trivially_copyable_v> is false +// is_memcpyable_v> is true +static_assert(!std::is_trivially_copyable_v>); +static_assert(ttg::detail::is_memcpyable_v>); + #ifdef TTG_SERIALIZATION_SUPPORTS_MADNESS static_assert(ttg::detail::is_madness_serializable_v); @@ -803,20 +733,6 @@ TEST_CASE("Boost Serialization", "[serialization]") { } #endif // TTG_SERIALIZATION_SUPPORTS_BOOST -#ifdef TTG_SERIALIZATION_SUPPORTS_CEREAL -TEST_CASE("Cereal Serialization", "[serialization]") { - auto test = [](const auto& t) { - using T = std::remove_reference_t; - CHECK(ttg::detail::is_cereal_serializable_v); - using Tnc = std::remove_const_t; - CHECK(ttg::detail::is_cereal_serializable_v); - }; - - test(intrusive::symmetric::bc_v::NonPOD{17}); - test(freestanding::symmetric::bc_v::NonPOD{18}); -} -#endif // TTG_SERIALIZATION_SUPPORTS_CEREAL - #if defined(TTG_SERIALIZATION_SUPPORTS_MADNESS) && defined(TTG_SERIALIZATION_SUPPORTS_BOOST) TEST_CASE("TTG Serialization", "[serialization]") { // Test code written as if calling from C @@ -872,10 +788,6 @@ TEST_CASE("TTG Serialization", "[serialization]") { test_struct(intrusive::symmetric::bc_v::NonPOD{20}); // Boost test_struct(freestanding::symmetric::bc_v::NonPOD{21}); // Boost #endif -#ifdef TTG_SERIALIZATION_SUPPORTS_CEREAL - test_struct(intrusive::symmetric::c::NonPOD{22}); // Cereal - test_struct(intrusive::symmetric::c_v::NonPOD{23}); // Cereal -#endif #ifdef TTG_SERIALIZATION_SUPPORTS_BOOST // verify that turning off version and object tracking for Boost produces same archive since the TTG boost archives diff --git a/tests/unit/splitmd_serialization.cc b/tests/unit/splitmd_serialization.cc index 482b338ac..4323b5fda 100644 --- a/tests/unit/splitmd_serialization.cc +++ b/tests/unit/splitmd_serialization.cc @@ -3,7 +3,7 @@ #include "ttg.h" -#include +#include using value_t = int; constexpr const int N = 10, M = 10; diff --git a/tests/unit/streams.cc b/tests/unit/streams.cc new file mode 100644 index 000000000..b31506e5a --- /dev/null +++ b/tests/unit/streams.cc @@ -0,0 +1,210 @@ +#include +#include + +#include "ttg.h" + +#include "ttg/serialization/std/pair.h" +#include "ttg/util/hash/std/pair.h" + + + +TEST_CASE("streams", "[streams][core]") { + + // in distributed memory we must count how many messages the reducer will receive + SECTION("concurrent-stream-size") { + ttg::Edge I2O; + ttg::Edge O2S; + const auto nranks = ttg::default_execution_context().size(); + + constexpr std::size_t N = 12000; + constexpr std::size_t SLICE = 600; + constexpr const timespec ts = { .tv_sec = 0, .tv_nsec = 10000 }; + constexpr int VALUE = 1; + std::atomic reduce_ops = 0; + + auto op = ttg::make_tt( + [&](const int &n, int&& i, + std::tuple> &outs) { + int key = n/SLICE; + nanosleep(&ts, nullptr); + if (n < N-1) { + ttg::send<0>(key, std::forward(i), outs); + //ttg::print("sent to sink ", key); + } else { + // set the size of the last reducer + if (N%SLICE > 0) { + ttg::set_size<0>(key, N%SLICE, outs); + std::cout << "set_size key " << key << " size " << N%SLICE << std::endl; + } + // forward the value + ttg::send<0>(key, std::forward(i), outs); + //ttg::print("finalized last sink ", key); + } + }, + ttg::edges(I2O), ttg::edges(O2S)); + + auto sink_op = ttg::make_tt( + [&](const int key, const int &value) { + std::cout << "sink " << key << std::endl; + if (!(value == SLICE || key == (N/SLICE))) { + std::cout << "SINK ERROR: key " << key << " value " << value << " 
SLICE " << SLICE << " N " << N << std::endl; + } + CHECK((value == SLICE || key == (N/SLICE))); + reduce_ops++; + }, + ttg::edges(O2S), ttg::edges()); + + op->set_keymap([=](const auto &key) { return nranks - 1; }); + op->set_trace_instance(true); + sink_op->set_input_reducer<0>([&](int &a, const int &b) { + a += 1; // we count invocations + CHECK(b == VALUE); + reduce_ops++; + }, SLICE); + + make_graph_executable(op); + ttg::execute(ttg::default_execution_context()); + if (ttg::default_execution_context().rank() == 0) { + for (std::size_t i = 0; i < N; ++i) { + op->invoke(i, VALUE); + } + } + + ttg::ttg_fence(ttg::default_execution_context()); + CHECK(reduce_ops == N/nranks); + } + + SECTION("streams-readonly-input") { + ttg::Edge I2O; + ttg::Edge O2S; + ttg::Edge O2D; + const auto nranks = ttg::default_execution_context().size(); + + constexpr std::size_t N = 12000; + constexpr std::size_t SLICE = 600; + constexpr const timespec ts = { .tv_sec = 0, .tv_nsec = 10000 }; + constexpr int VALUE = 1; + std::atomic reduce_ops = 0; + + auto dummy = ttg::make_tt([&](const int &n, const int &i){ + CHECK(i == VALUE); + }, ttg::edges(O2D)); + + auto op = ttg::make_tt( + [&](const int &n, const int& i, + std::tuple, ttg::Out> &outs) { + int key = n/SLICE; + nanosleep(&ts, nullptr); + ttg::send<1>(n, i, outs); // send to a dummy to check ref-counting + if (n < N-1) { + ttg::send<0>(key, i, outs); + //ttg::print("sent to sink ", key); + } else { + // set the size of the last reducer + if (N%SLICE > 0) { + ttg::set_size<0>(key, N%SLICE, outs); + std::cout << "set_size key " << key << " size " << N%SLICE << std::endl; + } + // forward the value + ttg::send<0>(key, i, outs); + //ttg::print("finalized last sink ", key); + } + }, + ttg::edges(I2O), ttg::edges(O2S, O2D)); + + auto sink_op = ttg::make_tt( + [&](const int key, const int &value) { + std::cout << "sink " << key << std::endl; + if (!(value == SLICE || key == (N/SLICE))) { + std::cout << "SINK ERROR: key " << key << " value " << value << " SLICE " << SLICE << " N " << N << std::endl; + } + CHECK((value == SLICE || key == (N/SLICE))); + reduce_ops++; + }, + ttg::edges(O2S), ttg::edges()); + + op->set_keymap([=](const auto &key) { return nranks - 1; }); + op->set_trace_instance(true); + sink_op->set_input_reducer<0>([&](int &a, const int &b) { + a += 1; // we count invocations + CHECK(b == VALUE); + reduce_ops++; + }, SLICE); + + make_graph_executable(op); + ttg::execute(ttg::default_execution_context()); + if (ttg::default_execution_context().rank() == 0) { + for (std::size_t i = 0; i < N; ++i) { + op->invoke(i, VALUE); + } + } + + ttg::ttg_fence(ttg::default_execution_context()); + CHECK(reduce_ops == N/nranks); + } + + SECTION("streams-temporary-input") { + ttg::Edge I2O; + ttg::Edge O2S; + const auto nranks = ttg::default_execution_context().size(); + + constexpr std::size_t N = 12000; + constexpr std::size_t SLICE = 600; + constexpr const timespec ts = { .tv_sec = 0, .tv_nsec = 10000 }; + constexpr int VALUE = 1; + std::atomic reduce_ops = 0; + + auto op = ttg::make_tt( + [&](const int &n, const int& i, + std::tuple> &outs) { + int key = n/SLICE; + nanosleep(&ts, nullptr); + int tmp = i; // temporary data, not tracked + if (n < N-1) { + std::get<0>(outs).send(key, int{i}); + //ttg::send<0>(key, int{i}, outs); + //ttg::print("sent to sink ", key); + } else { + // set the size of the last reducer + if (N%SLICE > 0) { + ttg::set_size<0>(key, N%SLICE, outs); + std::cout << "set_size key " << key << " size " << N%SLICE << std::endl; + } + // forward 
diff --git a/tests/unit/tt.cc b/tests/unit/tt.cc
index f77a483ef..e4f7524d3 100644
--- a/tests/unit/tt.cc
+++ b/tests/unit/tt.cc
@@ -1,4 +1,4 @@
-#include
+#include
 
 #include "ttg.h"
 
@@ -146,7 +146,7 @@ namespace tt_i_iv {
   template <typename K, typename D1, typename D2>
   void func0(K &key, D1 &datum1, D2 &&datum2) {
-    abort();
+    ttg::abort();
   }
 }  // namespace tt_i_iv
 
@@ -298,6 +298,9 @@ TEST_CASE("TemplateTask", "[core]") {
     // OK: all of {auto&&, auto&, const auto&} bind to const T&
     static_assert(std::is_invocable &>::value);
+    static_assert(std::is_same_v &>,
+                  void>);
     // OK: ditto
     static_assert(std::is_void_v(), std::declval(), std::declval()))>);
@@ -323,14 +326,15 @@ TEST_CASE("TemplateTask", "[core]") {
            ttg::typelist, ttg::typelist, ttg::typelist, ttg::typelist, ttg::typelist,
            ttg::typelist &>>{})),
-        ttg::typelist<>>);
+        ttg::typelist, ttg::typelist<>>>);
     static_assert(
         std::is_same_v<
            decltype(compute_arg_binding_types(
               func0,
               ttg::typelist, ttg::typelist, ttg::typelist, ttg::typelist, ttg::typelist,
               ttg::typelist &>>{})),
-        ttg::typelist &>>);
+        ttg::typelist,
+                      ttg::typelist &>>>);
     // voids are skipped
     static_assert(
         std::is_same_v<
            decltype(compute_arg_binding_types(
               ttg::typelist, ttg::typelist, ttg::typelist, ttg::typelist,
               ttg::typelist &>, ttg::typelist>{})),
-        ttg::typelist &, void>>);
+        ttg::typelist, ttg::typelist &, void>>>);
 
     // test introspection of generic arguments by the runtime (i.e.
contents of TT::input_args_type) and // the deduced types inside the function body @@ -425,10 +430,10 @@ TEST_CASE("TemplateTask", "[core]") { static_assert(ttg::meta::is_generic_callable_v); auto [f_is_generic, f_args_t_v] = ttg::meta::callable_args; CHECK(!f_is_generic); - static_assert(std::is_same_v>); + static_assert(std::is_same_v, ttg::typelist>>); auto [g_is_generic, g_args_t_v] = ttg::meta::callable_args; CHECK(g_is_generic); - static_assert(std::is_same_v>); + static_assert(std::is_same_v, ttg::typelist<>>>); { static_assert(!ttg::meta::is_generic_callable_v); diff --git a/tests/unit/unit_main.cpp b/tests/unit/unit_main.cpp index 7ca861716..9384ebac9 100644 --- a/tests/unit/unit_main.cpp +++ b/tests/unit/unit_main.cpp @@ -1,6 +1,6 @@ #define CATCH_CONFIG_RUNNER -#include +#include #include #include @@ -30,7 +30,8 @@ int main(int argc, char** argv) { ttg::diagnose_off(); // turn off diagnostics const auto nranks = ttg::default_execution_context().size(); - std::cout << "ready to run TTG unit tests with " << nranks << " rank" << (nranks > 1 ? "s" : "") << std::endl; + if (session.config().verbosity() != Catch::Verbosity::Quiet) + std::cout << "ready to run TTG unit tests with " << nranks << " rank" << (nranks > 1 ? "s" : "") << std::endl; ttg::execute(); #endif diff --git a/ttg/CMakeLists.txt b/ttg/CMakeLists.txt index 96429ceca..900df6519 100644 --- a/ttg/CMakeLists.txt +++ b/ttg/CMakeLists.txt @@ -17,6 +17,7 @@ set(ttg-util-headers ${CMAKE_CURRENT_SOURCE_DIR}/ttg/util/future.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/util/hash.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/util/hash/std/pair.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/util/iovec.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/util/macro.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/util/meta.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/util/meta/callable.h @@ -37,22 +38,36 @@ set(ttg-base-headers file(GLOB_RECURSE ttg-external-headers $<$:CONFIGURE_DEPENDS> ${CMAKE_CURRENT_SOURCE_DIR}/ttg/external/boost/* ) +configure_file( + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/config.in.h + ${CMAKE_CURRENT_BINARY_DIR}/ttg/config.h +) set(ttg-impl-headers ${CMAKE_CURRENT_SOURCE_DIR}/ttg/broadcast.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/buffer.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/devicescope.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/devicescratch.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/edge.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/execution.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/func.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/fwd.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/impl_selector.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/tt.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/ptr.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/reduce.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/run.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/runtimes.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/serialization.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/terminal.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/traverse.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/ttvalue.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/world.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/make_tt.h + ${CMAKE_CURRENT_BINARY_DIR}/ttg/config.h + ) +set(ttg_device_headers + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/device/device.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/device/task.h ) set(ttg-headers ${CMAKE_CURRENT_SOURCE_DIR}/ttg.h @@ -62,12 +77,14 @@ set(ttg-sources ${ttg-impl-headers} ${ttg-base-headers} ${ttg-util-headers} + ${ttg_device_headers} ${ttg-external-headers} ${CMAKE_CURRENT_SOURCE_DIR}/ttg/util/backtrace.cpp ${CMAKE_CURRENT_SOURCE_DIR}/ttg/util/bug.cpp ${CMAKE_CURRENT_SOURCE_DIR}/ttg/util/env.cpp ${CMAKE_CURRENT_SOURCE_DIR}/ttg/util/version.cc ) + # extract git metadata include(GetGitMetadata) 
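One consequence of the tt.cc hunks above is easy to miss: ttg::meta::callable_args now yields a typelist of candidate argument typelists rather than a single typelist. The exact template arguments were stripped from this copy of the patch, so the following is a reconstruction by analogy with the surviving CHECKs, not a quote from the source:

    // hedged sketch of the introspection probed in tt.cc
    auto f = [](const int &key, double &&datum) {};  // concrete signature
    auto g = [](auto &&...args) {};                  // generic callable
    auto [f_is_generic, f_args] = ttg::meta::callable_args<decltype(f)>;
    auto [g_is_generic, g_args] = ttg::meta::callable_args<decltype(g)>;
    // f_is_generic == false; f_args ~ ttg::typelist<ttg::typelist<const int&, double&&>>
    // g_is_generic == true;  g_args ~ ttg::typelist<ttg::typelist<>>  (bindings not enumerable)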
vgkit_cmake_git_metadata() @@ -75,17 +92,19 @@ vgkit_cmake_git_metadata() set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/ttg/util/version.cc PROPERTIES COMPILE_DEFINITIONS "TTG_GIT_REVISION=\"${TTG_GIT_REVISION}\";TTG_GIT_DESCRIPTION=\"${TTG_GIT_DESCRIPTION}\"") -set(ttg-public-headers ${ttg-headers};${ttg-impl-headers};${ttg-base-headers};${ttg-util-headers}) -if (NOT TTG_IGNORE_BUNDLED_EXTERNALS) - list(APPEND ttg-sources ${ttg-external-headers}) - list(APPEND ttg-public-headers ${ttg-external-headers}) -endif() + # optional dependencies -if (TARGET Boost::boost) - list(APPEND ttg-deps Boost::boost) -else () # if Boost::boost is missing must use bundled Boost.CallableTraits - list(APPEND ttg-defs "$") - list(APPEND ttg-incs "$") +if (TARGET Boost::headers) + if (TARGET Boost::callable_traits) # using modularized Boost? + list(APPEND ttg-deps Boost::callable_traits) + else() + list(APPEND ttg-deps Boost::headers) + endif() +else () # if Boost::headers is missing must use bundled Boost.CallableTraits + list(APPEND ttg-defs "TTG_USE_BUNDLED_BOOST_CALLABLE_TRAITS=1") + list(APPEND ttg-incs + "$" + "$") endif () if (TARGET TTG_Libunwind) list(APPEND ttg-deps TTG_Libunwind) @@ -93,6 +112,30 @@ endif(TARGET TTG_Libunwind) if (TTG_ENABLE_TRACE) list(APPEND ttg-defs "TTG_ENABLE_TRACE=1") endif (TTG_ENABLE_TRACE) +if (TARGET std::coroutine) + list(APPEND ttg-deps std::coroutine) + list(APPEND ttg-defs "TTG_HAS_COROUTINE=1") + list(APPEND ttg-util-headers + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/coroutine.h + ) +endif () +if (TTG_HAVE_CUDA) + list(APPEND ttg-deps CUDA::cudart) +endif (TTG_HAVE_CUDA) + +if (TTG_HAVE_HIPBLAS) + list(APPEND ttg-deps hip::host) +endif (TTG_HAVE_HIPBLAS) + +if (TTG_HAVE_LEVEL_ZERO) + list(APPEND ttg-deps level_zero::ze_loader) +endif (TTG_HAVE_LEVEL_ZERO) + +set(ttg-public-headers ${ttg-headers};${ttg-impl-headers};${ttg-base-headers};${ttg-util-headers};${ttg_device_headers}) +if (NOT TTG_IGNORE_BUNDLED_EXTERNALS) + list(APPEND ttg-sources ${ttg-external-headers}) + list(APPEND ttg-public-headers ${ttg-external-headers}) +endif() add_ttg_library(ttg "${ttg-sources}" PUBLIC_HEADER "${ttg-public-headers}" LINK_LIBRARIES "${ttg-deps}" INCLUDE_DIRECTORIES "${ttg-incs}" COMPILE_DEFINITIONS "${ttg-defs}") @@ -110,7 +153,6 @@ set(ttg-serialization-headers ${CMAKE_CURRENT_SOURCE_DIR}/ttg/serialization/traits.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/serialization/backends/boost.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/serialization/backends/boost/archive.h - ${CMAKE_CURRENT_SOURCE_DIR}/ttg/serialization/backends/cereal.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/serialization/backends/madness.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/serialization/std/allocator.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/serialization/std/array.h @@ -130,12 +172,13 @@ if (TARGET MADworld) endif(TARGET MADworld) if (TARGET Boost::serialization) list(APPEND ttg-serialization-deps Boost::serialization) + list(APPEND ttg-serialization-boost-deps Boost::serialization) + if (TARGET Boost::iostreams) # using modularized Boost? 
+ list(APPEND ttg-serialization-deps Boost::iostreams) + list(APPEND ttg-serialization-boost-deps Boost::iostreams) + endif() list(APPEND ttg-serialization-compile-definitions TTG_SERIALIZATION_SUPPORTS_BOOST=1) endif (TARGET Boost::serialization) -if (TARGET cereal::cereal) - list(APPEND ttg-serialization-deps cereal::cereal) - list(APPEND ttg-serialization-compile-definitions TTG_SERIALIZATION_SUPPORTS_CEREAL=1) -endif (TARGET cereal::cereal) add_ttg_library(ttg-serialization "${ttg-serialization-sources}" @@ -155,26 +198,20 @@ if (TARGET Boost::serialization) add_ttg_library(ttg-serialization-boost "${ttg-serialization-sources}" PUBLIC_HEADER "${ttg-serialization-headers}" - LINK_LIBRARIES "Boost::serialization" + LINK_LIBRARIES "${ttg-serialization-boost-deps}" COMPILE_DEFINITIONS "TTG_SERIALIZATION_SUPPORTS_BOOST=1") endif(TARGET Boost::serialization) -# make cereal-only serialization target -if (TARGET cereal::cereal) - add_ttg_library(ttg-serialization-cereal - "${ttg-serialization-sources}" - PUBLIC_HEADER "${ttg-serialization-headers}" - LINK_LIBRARIES "cereal::cereal" - COMPILE_DEFINITIONS "TTG_SERIALIZATION_SUPPORTS_CEREAL=1") -endif(TARGET cereal::cereal) ######################### ####### MADNESS-specific ######################### if (TARGET MADworld) set(ttg-mad-headers + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/madness/buffer.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/madness/fwd.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/madness/import.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/madness/ttg.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/madness/ttvalue.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/madness/watch.h) # N.B. ttg-mad can use MADNESS serialization only add_ttg_library(ttg-mad "${ttg-mad-headers}" PUBLIC_HEADER "${ttg-mad-headers}" LINK_LIBRARIES "ttg;MADworld;ttg-serialization-madness" COMPILE_DEFINITIONS "WORLD_INSTANTIATE_STATIC_TEMPLATES=1") @@ -185,10 +222,19 @@ endif(TARGET MADworld) ######################## if (TARGET PaRSEC::parsec) set(ttg-parsec-headers + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/buffer.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/device.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/devicefunc.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/devicescratch.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/fwd.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/import.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/ptr.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/parsec-ext.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/task.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/thread_local.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/ttg.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/ttg_data_copy.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/parsec/ttvalue.h ) find_package(MPI) set(ttg-parsec-deps "ttg;MPI::MPI_CXX;PaRSEC::parsec") diff --git a/ttg/ttg.h b/ttg/ttg.h index e0fa9a702..50a891c1d 100644 --- a/ttg/ttg.h +++ b/ttg/ttg.h @@ -1,6 +1,7 @@ #ifndef TTG_H_INCLUDED #define TTG_H_INCLUDED +#include "ttg/config.h" #include "ttg/fwd.h" #include "ttg/runtimes.h" @@ -27,11 +28,19 @@ #include "ttg/edge.h" +#include "ttg/ptr.h" +#include "ttg/buffer.h" +#include "ttg/devicescratch.h" +#include "ttg/ttvalue.h" +#include "ttg/devicescope.h" +#include "ttg/device/device.h" +#include "ttg/device/task.h" + #if defined(TTG_USE_PARSEC) #include "ttg/parsec/ttg.h" #elif defined(TTG_USE_MADNESS) #include "ttg/madness/ttg.h" -#endif // TTG_USE_PARSEC|MADNESS +#endif // TTG_USE_{PARSEC|MADNESS} // these headers use the default backend #include "ttg/run.h" diff --git a/ttg/ttg/base/tt.h b/ttg/ttg/base/tt.h index 660eaf70a..d2fbd467a 100644 --- a/ttg/ttg/base/tt.h +++ b/ttg/ttg/base/tt.h 
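With the cereal backend removed, Boost.Serialization (augmented by Boost::iostreams under modularized Boost) is the remaining non-MADNESS wire format, and a payload type opts in through the usual intrusive hook. A minimal sketch of such a type (the Tile type and its members are an illustration, not part of this patch):

    #include <vector>
    #include <boost/serialization/vector.hpp>

    namespace user {
      struct Tile {
        int rows = 0, cols = 0;
        std::vector<double> data;

        // found by Boost.Serialization when built against ttg-serialization-boost
        // (i.e. with TTG_SERIALIZATION_SUPPORTS_BOOST=1)
        template <typename Archive>
        void serialize(Archive &ar, const unsigned int /*version*/) {
          ar & rows & cols & data;
        }
      };
    }  // namespace user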
@@ -144,11 +144,11 @@ namespace ttg { virtual ~TTBase() = default; /// Use this to create a task that takes no data "manually" - /// @warning calls std::abort() if the derived class TT did not override this; + /// @warning calls ttg::abort() if the derived class TT did not override this; /// only makes sense to override this if the derived TT uses void for key or data virtual void invoke() { std::cerr << "TTBase::invoke() invoked on a TT that did not override it" << std::endl; - abort(); + ttg::abort(); } /// Sets trace for all operations to value and returns previous setting. @@ -264,7 +264,6 @@ namespace ttg { virtual void release() {} /// Marks this executable - /// @return nothing virtual void make_executable() = 0; /// Queries if this ready to execute diff --git a/ttg/ttg/buffer.h b/ttg/ttg/buffer.h new file mode 100644 index 000000000..4d998c29a --- /dev/null +++ b/ttg/ttg/buffer.h @@ -0,0 +1,13 @@ +#ifndef TTG_BUFFER_H +#define TTG_BUFFER_H + +#include "ttg/fwd.h" + +namespace ttg { + +template> +using Buffer = TTG_IMPL_NS::Buffer; + +} // namespace ttg + +#endif // TTG_buffer_H \ No newline at end of file diff --git a/ttg/ttg/config.in.h b/ttg/ttg/config.in.h new file mode 100644 index 000000000..51e58b4a2 --- /dev/null +++ b/ttg/ttg/config.in.h @@ -0,0 +1,38 @@ +// +// Created by Eduard Valeyev on 10/31/22. +// + +#ifndef TTG_CONFIG_IN_H +#define TTG_CONFIG_IN_H + +/** the C++ header containing the coroutine API */ +#define TTG_CXX_COROUTINE_HEADER <@CXX_COROUTINE_HEADER@> + +/** the C++ namespace containing the coroutine API */ +#define TTG_CXX_COROUTINE_NAMESPACE @CXX_COROUTINE_NAMESPACE@ + +/** whether TTG has CUDA language support */ +#cmakedefine TTG_HAVE_CUDA + +/** whether TTG has CUDA runtime support */ +#cmakedefine TTG_HAVE_CUDART + +/** whether TTG has HIP support */ +#cmakedefine TTG_HAVE_HIP + +/** whether TTG has HIP BLAS library */ +#cmakedefine TTG_HAVE_HIPBLAS + +/** whether TTG has Intel Level Zero support */ +#cmakedefine TTG_HAVE_LEVEL_ZERO + +/** whether TTG has any device programming model (CUDA/HIP/LEVEL_ZERO) support */ +#cmakedefine TTG_HAVE_DEVICE + +/** whether TTG has MPI library */ +#cmakedefine TTG_HAVE_MPI + +/** whether TTG has the mpi-ext.h header */ +#cmakedefine TTG_HAVE_MPIEXT + +#endif // TTG_CONFIG_IN_H diff --git a/ttg/ttg/coroutine.h b/ttg/ttg/coroutine.h new file mode 100644 index 000000000..81d5b1657 --- /dev/null +++ b/ttg/ttg/coroutine.h @@ -0,0 +1,230 @@ +// +// Created by Eduard Valeyev on 10/31/22. +// + +#ifndef TTG_COROUTINE_H +#define TTG_COROUTINE_H + +#include "ttg/config.h" +#include TTG_CXX_COROUTINE_HEADER + +#include +#include + +namespace ttg { + + // import std coroutine API into ttg namespace + + using suspend_always = TTG_CXX_COROUTINE_NAMESPACE::suspend_always; + using suspend_never = TTG_CXX_COROUTINE_NAMESPACE::suspend_never; + template + using coroutine_handle = TTG_CXX_COROUTINE_NAMESPACE::coroutine_handle; + + /// @defgroup resumable_task resumable_task coroutine + + /// resumable_task is the original prototype TTG coroutine that awaits on generic events. 
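+  /// For a flavor of the intended usage, an illustrative sketch (not code from this patch):
+  /// \code
+  /// ttg::event e1, e2;
+  /// auto coro = [](ttg::event &a, ttg::event &b) -> ttg::resumable_task {
+  ///   co_await ttg::resumable_task_events{a, b};  // suspend until both events finish
+  /// };
+  /// auto task = coro(e1, e2);  // initial_suspend is suspend_never: body runs eagerly
+  /// e1.finish(); e2.finish();
+  /// if (task.ready()) task.resume();       // all events finished: resume past co_await
+  /// if (task.completed()) task.destroy();  // final_suspend keeps the frame until destroyed
+  /// \endcode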
+ /// There is no proper support for it by TTG runtimes, but it can be useful for understanding how + /// coroutines work with TTG and potentially in the future as a model for universal resumable tasks + + /// @{ + + // fwd-declares + + struct resumable_task_state; + + template + struct resumable_task_events; + + /// represents a generic one-time event + struct event { + void finish() { finished_ = true; } + + /// @return true if the event has occurred + bool finished() const { return finished_; } + + private: + std::atomic finished_ = false; + }; + + /// task that can be resumed after some events occur + struct resumable_task : public ttg::coroutine_handle { + using base_type = ttg::coroutine_handle; + + /// @name members mandated by the promise_type concept + /// @{ + + using promise_type = struct resumable_task_state; + + /// @} + + resumable_task(base_type base) : base_type(std::move(base)) {} + + base_type handle() { return *this; } + + /// @return true if ready to resume + inline bool ready() const; + + /// @return true if task completed and can be destroyed + inline bool completed() const; + + /// @return ttg::span of events that this task depends on + inline ttg::span events(); + }; + + /// encapsulates the state of the coroutine object visible to the outside world + /// @note this is the `promise_type` for resumable_task coroutine + struct resumable_task_state { + resumable_task_state() noexcept = default; + // these only live on coroutine frames so make noncopyable and nonmovable + resumable_task_state(const resumable_task_state&) = delete; + resumable_task_state& operator=(const resumable_task_state&) = delete; + resumable_task_state(resumable_task_state&&) = delete; + resumable_task_state& operator=(resumable_task_state&&) = delete; + + constexpr static inline std::size_t MaxNumEvents = 20; + using handle_type = coroutine_handle; + + /// @name members mandated by the promise_type concept + /// @{ + + resumable_task get_return_object() { return resumable_task{handle_type::from_promise(*this)}; } + + /// @note start task eagerly + suspend_never initial_suspend() noexcept { return {}; } + + /// @note suspend task before destroying it so the runtime can know that the task is completed + suspend_always final_suspend() noexcept { + completed_ = true; + return {}; + } + void return_void() {} + void unhandled_exception() {} + + /// @} + + /// @name optional members of the promise_type concept + /// @{ + + // these can be used to use optional storage provided by the runtime (e.g. part of the runtime's task data struct) + // N.B. the existing buffer must be passed to operator new via TLS + // void* operator new(std::size_t size) + // { + // return ::operator new(size); + // } + + // N.B. 
whether the external buffer was used by operator new must be passed via TLS + // void operator delete(void* ptr, std::size_t size) + // { + // ::operator delete(ptr, size); + // } + + /// @} + + /// @return true if ready to resume + constexpr bool ready() const { + for (std::size_t e = 0; e != nevents_; ++e) + if (!events_storage_[e]->finished()) return false; + return true; + } + + /// @return true if the task is completed + constexpr bool completed() const { return completed_; } + + ttg::span events() { return ttg::span(events_storage_.data(), nevents_); } + + private: + std::array events_storage_; + std::size_t nevents_; + bool completed_ = false; + + template + friend struct resumable_task_events; + + void reset_events() { + std::fill(events_storage_.begin(), events_storage_.begin() + nevents_, nullptr); + nevents_ = 0; + } + + template + void set_events(const std::array events) { + static_assert(N <= MaxNumEvents); + std::copy(events.begin(), events.end(), events_storage_.begin()); + nevents_ = N; + } + }; + + bool resumable_task::ready() const { return base_type::promise().ready(); } + bool resumable_task::completed() const { return base_type::promise().completed(); } + ttg::span resumable_task::events() { return base_type::promise().events(); } + + /// statically-sized sequence of events on whose completion progress of a given task depends on + /// @note this is the `Awaiter` for resumable_task coroutine + /// (the concept is not defined in the standard, see + /// https://lewissbaker.github.io/2017/11/17/understanding-operator-co-await instead ) + template + struct resumable_task_events { + private: + template + constexpr bool await_ready(std::index_sequence) const { + return (std::get(events_)->finished() && ...); + } + + public: + template + constexpr resumable_task_events(Events&&... events) : events_{(&events)...} {} + + /// @name members mandated by the Awaiter concept + /// @{ + + constexpr bool await_ready() const { return await_ready(std::make_index_sequence{}); } + + void await_suspend(coroutine_handle pending_task) { + pending_task_ = pending_task; + pending_task_.promise().set_events(events_); + } + + void await_resume() { + if (pending_task_) { + pending_task_.promise().reset_events(); + pending_task_ = {}; + } + } + + /// @} + + private: + std::array events_; + coroutine_handle pending_task_; + }; // resumable_task_events + + // deduce the number of events properly + template + resumable_task_events(Events&&...) -> resumable_task_events; + + static_assert(resumable_task_events<0>{}.await_ready() == true); + + /// @} + + ///////////////////////////////////////////////////////////////////////////// + // describe all types of coroutine tasks known to TTG + ///////////////////////////////////////////////////////////////////////////// + + // fwd declare all coro promise types that have not been declared yet + namespace device::detail { + struct device_task_promise_type; + } // namespace device::detail + + /// describes all types of coroutine tasks known to TTG + /// @internal only exists to simplify metaprogramming in the backend code + enum class TaskCoroutineID { + /// not a coroutine, i.e. 
a standard task function, -> void + Invalid, + /// -> ttg::resumable_task + ResumableTask, + /// -> ttg::device::Task + DeviceTask + }; + +} // namespace ttg + +#endif // TTG_COROUTINE_H diff --git a/ttg/ttg/device/device.h b/ttg/ttg/device/device.h new file mode 100644 index 000000000..6690982f6 --- /dev/null +++ b/ttg/ttg/device/device.h @@ -0,0 +1,182 @@ +#pragma once + +#include "ttg/config.h" +#include "ttg/execution.h" + + + +namespace ttg::device { + +#if defined(TTG_HAVE_CUDA) + constexpr ttg::ExecutionSpace available_execution_space = ttg::ExecutionSpace::CUDA; +#elif defined(TTG_HAVE_HIP) + constexpr ttg::ExecutionSpace available_execution_space = ttg::ExecutionSpace::HIP; +#elif defined(TTG_HAVE_LEVEL_ZERO) + constexpr ttg::ExecutionSpace available_execution_space = ttg::ExecutionSpace::L0; +#else + constexpr ttg::ExecutionSpace available_execution_space = ttg::ExecutionSpace::Invalid; +#endif + + /// Represents a device in a specific execution space + class Device { + int m_id = 0; + ttg::ExecutionSpace m_space = ttg::ExecutionSpace::Host; + + public: + Device() = default; + Device(int id, ttg::ExecutionSpace space) + : m_id(id) + , m_space(space) + { } + + int id() const { + if (is_host()) { + throw std::runtime_error("No valid ID for Host execution space!"); + } + if (is_invalid()) { + throw std::runtime_error("Invalid execution space!"); + } + return m_id; + } + + operator int() const { + return id(); + } + + ttg::ExecutionSpace space() const { + return m_space; + } + + bool is_device() const { + return !is_host(); + } + + bool is_host() const { + return !is_invalid() && (m_space == ttg::ExecutionSpace::Host); + } + + bool is_invalid() const { + return (m_space == ttg::ExecutionSpace::Invalid); + } + }; +} // namespace ttg::device + +namespace std { + inline + std::ostream& operator<<(std::ostream& os, ttg::device::Device device) { + os << ttg::detail::execution_space_name(device.space()); + if (device.is_device()) { + os << "(" << device.id() << ")"; + } + return os; + } +} // namespace std + +#if defined(TTG_HAVE_CUDA) +#include + +namespace ttg::device { + namespace detail { + inline thread_local ttg::device::Device current_device_ts = {}; + inline thread_local cudaStream_t current_stream_ts = 0; // default stream + + inline void reset_current() { + current_device_ts = {}; + current_stream_ts = 0; + } + + inline void set_current(int device, cudaStream_t stream) { + current_device_ts = ttg::device::Device(device, ttg::ExecutionSpace::CUDA); + current_stream_ts = stream; + } + } // namespace detail + + inline + Device current_device() { + return detail::current_device_ts; + } + + inline + cudaStream_t current_stream() { + return detail::current_stream_ts; + } +} // namespace ttg + +#elif defined(TTG_HAVE_HIP) + +#include + +namespace ttg::device { + namespace detail { + inline thread_local ttg::device::Device current_device_ts = {}; + inline thread_local hipStream_t current_stream_ts = 0; // default stream + + inline void reset_current() { + current_device_ts = {}; + current_stream_ts = 0; + } + + inline void set_current(int device, hipStream_t stream) { + current_device_ts = ttg::device::Device(device, ttg::ExecutionSpace::HIP); + current_stream_ts = stream; + } + } // namespace detail + + inline + Device current_device() { + return detail::current_device_ts; + } + + inline + hipStream_t current_stream() { + return detail::current_stream_ts; + } +} // namespace ttg + +#elif defined(TTG_HAVE_LEVEL_ZERO) + +#include + +namespace ttg::device { + namespace detail { + inline 
thread_local ttg::device::Device current_device_ts = {};
+    inline thread_local sycl::queue* current_stream_ts = nullptr; // default stream
+
+
+    inline void reset_current() {
+      current_device_ts = {};
+      current_stream_ts = nullptr;
+    }
+
+    inline void set_current(int device, sycl::queue& stream) {
+      current_device_ts = ttg::device::Device(device, ttg::ExecutionSpace::L0);
+      current_stream_ts = &stream;
+    }
+  } // namespace detail
+
+  inline
+  Device current_device() {
+    return detail::current_device_ts;
+  }
+
+  inline
+  sycl::queue& current_stream() {
+    return *detail::current_stream_ts;
+  }
+} // namespace ttg::device
+
+#else
+
+namespace ttg::device {
+  inline Device current_device() {
+    return {};
+  }
+
+  template <ttg::ExecutionSpace Space>
+  inline const void* current_stream() {
+    static_assert(Space != ttg::ExecutionSpace::Invalid,
+                  "TTG was built without any known device support so we cannot provide a current stream!");
+    return nullptr;
+  }
+} // namespace ttg::device
+#endif // TTG_HAVE_{CUDA,HIP,LEVEL_ZERO}
diff --git a/ttg/ttg/device/task.h b/ttg/ttg/device/task.h
new file mode 100644
index 000000000..d95e0d1eb
--- /dev/null
+++ b/ttg/ttg/device/task.h
@@ -0,0 +1,637 @@
+#ifndef TTG_DEVICE_TASK_H
+#define TTG_DEVICE_TASK_H
+
+#include
+#include
+#include
+
+#include "ttg/fwd.h"
+#include "ttg/impl_selector.h"
+#include "ttg/ptr.h"
+
+namespace ttg::device {
+
+  namespace detail {
+    template <typename... Ts>
+    struct to_device_t {
+      std::tuple<std::add_lvalue_reference_t<Ts>...> ties;
+    };
+  }  // namespace detail
+
+  /**
+   * Select a device to execute on based on the provided buffer and scratchspace objects.
+   * Returns an object that should be awaited on using \c co_await.
+   * Upon resume, the device is selected (i.e., \sa ttg::device::current_device and
+   * \sa ttg::device::current_stream are available) and the buffers are available on the
+   * selected device.
+   */
+  template <typename... Args>
+  [[nodiscard]]
+  inline auto select(Args &&...args) {
+    return detail::to_device_t<std::remove_reference_t<Args>...>{std::tie(std::forward<Args>(args)...)};
+  }
+
+  namespace detail {
+
+    enum ttg_device_coro_state {
+      TTG_DEVICE_CORO_STATE_NONE,
+      TTG_DEVICE_CORO_INIT,
+      TTG_DEVICE_CORO_WAIT_TRANSFER,
+      TTG_DEVICE_CORO_WAIT_KERNEL,
+      TTG_DEVICE_CORO_SENDOUT,
+      TTG_DEVICE_CORO_COMPLETE
+    };
+
+    template <typename... Ts>
+    struct wait_kernel_t {
+      std::tuple<Ts&...> ties;
+
+      /* always suspend */
+      constexpr bool await_ready() const noexcept { return false; }
+
+      /* always suspend */
+      template <typename Promise>
+      constexpr void await_suspend(ttg::coroutine_handle<Promise>) const noexcept {}
+
+      void await_resume() noexcept {
+        if constexpr (sizeof...(Ts) > 0) {
+          /* hook to allow the backend to handle the data after pushout */
+          TTG_IMPL_NS::post_device_out(ties);
+        }
+      }
+    };
+  }  // namespace detail
+
+  /**
+   * Wait for previously submitted kernels to complete and provided
+   * ttg::Buffer and ttg::devicescratch to be transferred back to host.
+   * Must only be called after awaiting \sa ttg::device::select has resumed.
+   */
+  template <typename... Buffers>
+  [[nodiscard]]
+  inline auto wait(Buffers &&...args) {
+    static_assert(
+        ((ttg::meta::is_buffer_v<std::decay_t<Buffers>> || ttg::meta::is_devicescratch_v<std::decay_t<Buffers>>) &&
+         ...),
+        "Only ttg::Buffer and ttg::devicescratch can be waited on!");
+    return detail::wait_kernel_t<std::remove_reference_t<Buffers>...>{std::tie(std::forward<Buffers>(args)...)};
+  }
+
+  /******************************
+   * Send/Broadcast handling
+   * We pass the value returned by the backend's copy handler into a coroutine
+   * and execute the first part (prepare), before suspending it.
+   * The second part (send/broadcast) is executed after the task completed.
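+   *
+   * An end-to-end sketch of the protocol (the kernel and all names below are
+   * illustrative assumptions, not part of this patch):
+   *
+   *   // inside a device TT whose body returns ttg::device::Task:
+   *   ttg::device::Task body(const int &key, ttg::Buffer<double> &buf) {
+   *     co_await ttg::device::select(buf);            // device picked, buf staged
+   *     my_kernel<<<1, 1, 0, ttg::device::current_stream()>>>(buf.current_device_ptr());
+   *     co_await ttg::device::wait(buf);              // kernel finished, buf synced back
+   *     co_await ttg::device::sendk<0>(key);          // deferred until the task completes
+   *   }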
+ ******************************/ + + namespace detail { + struct send_coro_promise_type; + + using send_coro_handle_type = ttg::coroutine_handle; + + /// a coroutine for sending data from the device + struct send_coro_state : public send_coro_handle_type { + using base_type = send_coro_handle_type; + + /// these are members mandated by the promise_type concept + ///@{ + + using promise_type = send_coro_promise_type; + + ///@} + + send_coro_state(base_type base) : base_type(std::move(base)) {} + + base_type &handle() { return *this; } + + /// @return true if ready to resume + inline bool ready() { return true; } + + /// @return true if task completed and can be destroyed + inline bool completed(); + }; + + /// the promise type for the send coroutine + struct send_coro_promise_type { + /* do not suspend the coroutine on first invocation, we want to run + * the coroutine immediately and suspend only once. + */ + ttg::suspend_never initial_suspend() { return {}; } + + /* we don't suspend the coroutine at the end. + * it can be destroyed once the send/broadcast is done + */ + ttg::suspend_never final_suspend() noexcept { return {}; } + + send_coro_state get_return_object() { return send_coro_state{send_coro_handle_type::from_promise(*this)}; } + + /* the send coros only have an empty co_await */ + ttg::suspend_always await_transform(ttg::Void) { return {}; } + + void unhandled_exception() { + std::cerr << "Send coroutine caught an unhandled exception!" << std::endl; + throw; // fwd + } + + void return_void() {} + }; + + template + inline send_coro_state send_coro(const Key &key, Value &&value, ttg::Out> &t, + ttg::detail::value_copy_handler &ch) { + ttg::detail::value_copy_handler copy_handler = std::move(ch); // destroyed at the end of the coro + Key k = key; + t.prepare_send(k, std::forward(value)); + co_await ttg::Void{}; // we'll come back once the task is done + t.send(k, std::forward(value)); + }; + + template + inline send_coro_state sendv_coro(Value &&value, ttg::Out> &t, + ttg::detail::value_copy_handler &ch) { + ttg::detail::value_copy_handler copy_handler = std::move(ch); // destroyed at the end of the coro + t.prepare_send(std::forward(value)); + co_await ttg::Void{}; // we'll come back once the task is done + t.sendv(std::forward(value)); + }; + + template + inline send_coro_state sendk_coro(const Key &key, ttg::Out &t) { + // no need to prepare the send but we have to suspend once + Key k = key; + co_await ttg::Void{}; // we'll come back once the task is done + t.sendk(k); + }; + + template + inline send_coro_state send_coro(ttg::Out &t) { + // no need to prepare the send but we have to suspend once + co_await ttg::Void{}; // we'll come back once the task is done + t.send(); + }; + + struct send_t { + send_coro_state coro; + }; + } // namespace detail + + template + inline detail::send_t send(const keyT &key, valueT &&value, std::tuple...> &t) { + ttg::detail::value_copy_handler copy_handler; + return detail::send_t{ + detail::send_coro(key, copy_handler(std::forward(value)), std::get(t), copy_handler)}; + } + + template + inline detail::send_t sendv(valueT &&value, std::tuple...> &t) { + ttg::detail::value_copy_handler copy_handler; + return detail::send_t{detail::sendv_coro(copy_handler(std::forward(value)), std::get(t), copy_handler)}; + } + + template + inline detail::send_t sendk(const Key &key, std::tuple...> &t) { + return detail::send_t{detail::sendk_coro(key, std::get(t))}; + } + + // clang-format off + /// \brief Sends a task id and a value to the template tasks 
attached to the output terminal of this template task + /// \param[in] i Identifies which output terminal of this template task to select for sending + /// \param[in] key: the id of the task(s) receiving the value + /// \param[in] value: the value to send to the receiving task(s) + // clang-format on + template + inline detail::send_t send(size_t i, const keyT &key, valueT &&value) { + ttg::detail::value_copy_handler copy_handler; + auto *terminal_ptr = ttg::detail::get_out_terminal(i, "ttg::device::send(i, key, value)"); + return detail::send_t{detail::send_coro(key, copy_handler(std::forward(value)), *terminal_ptr, copy_handler)}; + } + + // clang-format off + /// \brief Sends a task id and a value to the template tasks attached to the output terminal of this template task + /// \note this is provided to support `send` with and without explicitly-passed terminal tuple + /// \tparam Identifies which output terminal of this template task to select for sending + /// \param[in] key: the id of the task(s) receiving the value + /// \param[in] value: the value to send to the receiving task(s) + // clang-format on + template + inline auto send(const keyT &key, valueT &&value) { + return ttg::device::send(i, key, std::forward(value)); + } + + + template + inline detail::send_t sendv(std::size_t i, valueT &&value) { + auto *terminal_ptr = ttg::detail::get_out_terminal(i, "ttg::device::send(i, key, value)"); + ttg::detail::value_copy_handler copy_handler; + return detail::send_t{detail::sendv_coro(copy_handler(std::forward(value)), *terminal_ptr, copy_handler)}; + } + + template + inline detail::send_t sendk(std::size_t i, const Key& key) { + auto *terminal_ptr = ttg::detail::get_out_terminal(i, "ttg::device::send(i, key, value)"); + return detail::send_t{detail::sendk_coro(key, *terminal_ptr)}; + } + + template + inline detail::send_t send(std::size_t i) { + auto *terminal_ptr = ttg::detail::get_out_terminal(i, "ttg::device::send(i, key, value)"); + return detail::send_t{detail::send_coro(*terminal_ptr)}; + } + + + template + inline detail::send_t sendv(valueT &&value) { + return sendv(i, std::forward(value)); + } + + template + inline detail::send_t sendk(const Key& key) { + return sendk(i, key); + } + + template + inline detail::send_t sendk() { + return send(i); + } + + namespace detail { + + template + struct broadcast_keylist_trait { + using type = T; + }; + + /* overload for iterable types that extracts the type of the first element */ + template + struct broadcast_keylist_trait>> { + using key_type = decltype(*std::begin(std::get<0>(std::declval()))); + }; + + template + inline void prepare_broadcast(const std::tuple &keylists, valueT &&value, + std::tuple...> &t) { + std::get(t).prepare_send(std::get(keylists), std::forward(value)); + if constexpr (sizeof...(Is) > 0) { + prepare_broadcast(keylists, std::forward(value), t); + } + } + + template + inline void prepare_broadcast(const std::tuple &keylists, valueT &&value) { + using key_t = typename broadcast_keylist_trait< + std::tuple_element_t...>> + >::key_type; + auto *terminal_ptr = ttg::detail::get_out_terminal(I, "ttg::device::broadcast(keylists, value)"); + terminal_ptr->prepare_send(std::get(keylists), value); + if constexpr (sizeof...(Is) > 0) { + prepare_broadcast(keylists, std::forward(value)); + } + } + + template + inline void broadcast(const std::tuple &keylists, valueT &&value, + std::tuple...> &t) { + std::get(t).broadcast(std::get(keylists), std::forward(value)); + if constexpr (sizeof...(Is) > 0) { + 
detail::broadcast(keylists, std::forward(value), t); + } + } + + template + inline void broadcast(const std::tuple &keylists, valueT &&value) { + using key_t = typename broadcast_keylist_trait< + std::tuple_element_t...>> + >::key_type; + auto *terminal_ptr = ttg::detail::get_out_terminal(I, "ttg::device::broadcast(keylists, value)"); + terminal_ptr->broadcast(std::get(keylists), value); + if constexpr (sizeof...(Is) > 0) { + ttg::device::detail::broadcast(keylists, std::forward(value)); + } + } + + /* overload with explicit terminals */ + template + inline send_coro_state + broadcast_coro(RangesT &&keylists, valueT &&value, + std::tuple...> &t, + ttg::detail::value_copy_handler&& ch) { + ttg::detail::value_copy_handler copy_handler = std::move(ch); // destroyed at the end of the coro + RangesT kl = std::forward(keylists); // capture the keylist(s) + if constexpr (ttg::meta::is_tuple_v) { + // treat as tuple + prepare_broadcast<0, I, Is...>(kl, std::forward>(value), t); + co_await ttg::Void{}; // we'll come back once the task is done + ttg::device::detail::broadcast<0, I, Is...>(kl, std::forward>(value), t); + } else if constexpr (!ttg::meta::is_tuple_v) { + // create a tie to the captured keylist + prepare_broadcast<0, I, Is...>(std::tie(kl), std::forward>(value), t); + co_await ttg::Void{}; // we'll come back once the task is done + ttg::device::detail::broadcast<0, I, Is...>(std::tie(kl), std::forward>(value), t); + } + } + + /* overload with implicit terminals */ + template + inline send_coro_state + broadcast_coro(RangesT &&keylists, valueT &&value, + ttg::detail::value_copy_handler&& ch) { + ttg::detail::value_copy_handler copy_handler = std::move(ch); // destroyed at the end of the coro + RangesT kl = std::forward(keylists); // capture the keylist(s) + if constexpr (ttg::meta::is_tuple_v) { + // treat as tuple + static_assert(sizeof...(Is)+1 == std::tuple_size_v, + "Size of keylist tuple must match the number of output terminals"); + prepare_broadcast<0, I, Is...>(kl, std::forward>(value)); + co_await ttg::Void{}; // we'll come back once the task is done + ttg::device::detail::broadcast<0, I, Is...>(kl, std::forward>(value)); + } else if constexpr (!ttg::meta::is_tuple_v) { + // create a tie to the captured keylist + prepare_broadcast<0, I, Is...>(std::tie(kl), std::forward>(value)); + co_await ttg::Void{}; // we'll come back once the task is done + ttg::device::detail::broadcast<0, I, Is...>(std::tie(kl), std::forward>(value)); + } + } + } // namespace detail + + /* overload with explicit terminals and keylist passed by const reference */ + template + [[nodiscard]] + inline detail::send_t broadcast(rangeT &&keylist, + valueT &&value, + std::tuple...> &t) { + ttg::detail::value_copy_handler copy_handler; + return detail::send_t{ + detail::broadcast_coro(std::forward(keylist), + copy_handler(std::forward(value)), + t, std::move(copy_handler))}; + } + + /* overload with implicit terminals and keylist passed by const reference */ + template + inline detail::send_t broadcast(rangeT &&keylist, valueT &&value) { + ttg::detail::value_copy_handler copy_handler; + return detail::send_t{broadcast_coro(std::tie(keylist), copy_handler(std::forward(value)), + std::move(copy_handler))}; + } + + template + [[nodiscard]] + std::vector forward(Args&&... args) { + // TODO: check the cost of this! 
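+    // illustrative usage (not from this patch): bundle several deferred sends
+    // into the single awaitable that the device task promise expects, e.g.
+    //   co_await ttg::device::forward(ttg::device::send<0>(key, std::move(a)),
+    //                                 ttg::device::sendk<1>(next_key));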
+ return std::vector{std::forward(args)...}; + } + + /******************************************* + * Device task promise and coroutine handle + *******************************************/ + + namespace detail { + // fwd-decl + struct device_task_promise_type; + // base type for ttg::device::Task + using device_task_handle_type = ttg::coroutine_handle; + } // namespace detail + + /// A device::Task is a coroutine (a callable that can be suspended and resumed). + + /// Since task execution in TTG is not preempable, tasks should not block. + /// The purpose of suspending a task is to yield control back to the runtime until some events occur; + /// in the meantime its executor (e.g., a user-space thread) can perform other work. + /// Once the task function reaches a point where further progress is pending completion of one or more asynchronous + /// actions the function needs to be suspended via a coroutine await (`co_await`). + /// Resumption will be handled by the runtime. + struct Task : public detail::device_task_handle_type { + using base_type = detail::device_task_handle_type; + + /// these are members mandated by the promise_type concept + ///@{ + + using promise_type = detail::device_task_promise_type; + + ///@} + + Task(base_type base) : base_type(std::move(base)) {} + + base_type& handle() { return *this; } + + /// @return true if ready to resume + inline bool ready() { + return true; + } + + /// @return true if task completed and can be destroyed + inline bool completed(); + }; + + namespace detail { + + /* The promise type that stores the views provided by the + * application task coroutine on the first co_yield. It subsequently + * tracks the state of the task when it moves from waiting for transfers + * to waiting for the submitted kernel to complete. */ + struct device_task_promise_type { + + /* do not suspend the coroutine on first invocation, we want to run + * the coroutine immediately and suspend when we get the device transfers. + */ + ttg::suspend_never initial_suspend() { + m_state = ttg::device::detail::TTG_DEVICE_CORO_INIT; + return {}; + } + + /* suspend the coroutine at the end of the execution + * so we can access the promise. + * TODO: necessary? maybe we can save one suspend here + */ + ttg::suspend_always final_suspend() noexcept { + m_state = ttg::device::detail::TTG_DEVICE_CORO_COMPLETE; + return {}; + } + + /* Allow co_await on a tuple */ + template + ttg::suspend_always await_transform(std::tuple &views) { + return yield_value(views); + } + + template + ttg::suspend_always await_transform(detail::to_device_t&& a) { + bool need_transfer = !(TTG_IMPL_NS::register_device_memory(a.ties)); + /* TODO: are we allowed to not suspend here and launch the kernel directly? 
*/ + m_state = ttg::device::detail::TTG_DEVICE_CORO_WAIT_TRANSFER; + return {}; + } + + template + auto await_transform(detail::wait_kernel_t&& a) { + //std::cout << "yield_value: wait_kernel_t" << std::endl; + if constexpr (sizeof...(Ts) > 0) { + TTG_IMPL_NS::mark_device_out(a.ties); + } + m_state = ttg::device::detail::TTG_DEVICE_CORO_WAIT_KERNEL; + return a; + } + + ttg::suspend_always await_transform(std::vector&& v) { + m_sends = std::forward>(v); + m_state = ttg::device::detail::TTG_DEVICE_CORO_SENDOUT; + return {}; + } + + ttg::suspend_always await_transform(device::detail::send_t&& v) { + m_sends.clear(); + m_sends.push_back(std::forward(v)); + m_state = ttg::device::detail::TTG_DEVICE_CORO_SENDOUT; + return {}; + } + + void return_void() { + m_state = ttg::device::detail::TTG_DEVICE_CORO_COMPLETE; + } + + bool complete() const { + return m_state == ttg::device::detail::TTG_DEVICE_CORO_COMPLETE; + } + + ttg::device::Task get_return_object() { return {detail::device_task_handle_type::from_promise(*this)}; } + + void unhandled_exception() { + std::cerr << "Task coroutine caught an unhandled exception!" << std::endl; + throw; // fwd + } + + //using iterator = std::vector::iterator; + + /* execute all pending send and broadcast operations */ + void do_sends() { + for (auto& send : m_sends) { + send.coro(); + } + m_sends.clear(); + } + + auto state() { + return m_state; + } + + private: + std::vector m_sends; + ttg_device_coro_state m_state = ttg::device::detail::TTG_DEVICE_CORO_STATE_NONE; + + }; + + } // namespace detail + + bool Task::completed() { return base_type::promise().state() == ttg::device::detail::TTG_DEVICE_CORO_COMPLETE; } + + struct device_wait_kernel + { }; + + + /* NOTE: below is preliminary for reductions on the device, which is not available yet */ +#if 0 + /************************** + * Device reduction coros * + **************************/ + + struct device_reducer_promise_type; + + using device_reducer_handle_type = ttg::coroutine_handle; + + /// task that can be resumed after some events occur + struct device_reducer : public device_reducer_handle_type { + using base_type = device_reducer_handle_type; + + /// these are members mandated by the promise_type concept + ///@{ + + using promise_type = device_reducer_promise_type; + + ///@} + + device_reducer(base_type base) : base_type(std::move(base)) {} + + base_type& handle() { return *this; } + + /// @return true if ready to resume + inline bool ready() { + return true; + } + + /// @return true if task completed and can be destroyed + inline bool completed(); + }; + + + /* The promise type that stores the views provided by the + * application task coroutine on the first co_yield. It subsequently + * tracks the state of the task when it moves from waiting for transfers + * to waiting for the submitted kernel to complete. */ + struct device_reducer_promise_type { + + /* do not suspend the coroutine on first invocation, we want to run + * the coroutine immediately and suspend when we get the device transfers. + */ + ttg::suspend_never initial_suspend() { + m_state = ttg::device::detail::TTG_DEVICE_CORO_INIT; + return {}; + } + + /* suspend the coroutine at the end of the execution + * so we can access the promise. + * TODO: necessary? 
maybe we can save one suspend here */
+    ttg::suspend_always final_suspend() noexcept {
+      m_state = ttg::device::detail::TTG_DEVICE_CORO_COMPLETE;
+      return {};
+    }
+
+    template <typename... Ts>
+    ttg::suspend_always await_transform(detail::to_device_t<Ts...>&& a) {
+      bool need_transfer = !(TTG_IMPL_NS::register_device_memory(a.ties));
+      /* TODO: are we allowed to not suspend here and launch the kernel directly? */
+      m_state = ttg::device::detail::TTG_DEVICE_CORO_WAIT_TRANSFER;
+      return {};
+    }
+
+    void return_void() {
+      m_state = ttg::device::detail::TTG_DEVICE_CORO_COMPLETE;
+    }
+
+    bool complete() const {
+      return m_state == ttg::device::detail::TTG_DEVICE_CORO_COMPLETE;
+    }
+
+    device_reducer get_return_object() { return device_reducer{device_reducer_handle_type::from_promise(*this)}; }
+
+    void unhandled_exception() { }
+
+    auto state() {
+      return m_state;
+    }
+
+
+  private:
+    ttg::device::detail::ttg_device_coro_state m_state = ttg::device::detail::TTG_DEVICE_CORO_STATE_NONE;
+
+  };
+
+  bool device_reducer::completed() { return base_type::promise().state() == ttg::device::detail::TTG_DEVICE_CORO_COMPLETE; }
+#endif // 0
+
+} // namespace ttg::device
+
+#endif // TTG_DEVICE_TASK_H
diff --git a/ttg/ttg/devicescope.h b/ttg/ttg/devicescope.h
new file mode 100644
index 000000000..594e6db0b
--- /dev/null
+++ b/ttg/ttg/devicescope.h
@@ -0,0 +1,11 @@
+#ifndef TTG_DEVICESCOPE_H
+#define TTG_DEVICESCOPE_H
+
+namespace ttg {
+  enum class scope {
+    Allocate = 0x0, ///< memory allocated as scratch, but not moved in or out
+    SyncIn   = 0x2, ///< memory allocated as scratch and data transferred to device
+  };
+} // namespace ttg
+
+#endif // TTG_DEVICESCOPE_H
\ No newline at end of file
diff --git a/ttg/ttg/devicescratch.h b/ttg/ttg/devicescratch.h
new file mode 100644
index 000000000..9ccb60bce
--- /dev/null
+++ b/ttg/ttg/devicescratch.h
@@ -0,0 +1,19 @@
+#ifndef TTG_DEVICESCRATCH_H
+#define TTG_DEVICESCRATCH_H
+
+#include "ttg/devicescope.h"
+#include "ttg/fwd.h"
+
+namespace ttg {
+
+template <typename T>
+using devicescratch = TTG_IMPL_NS::devicescratch<T>;
+
+template <typename T>
+auto make_scratch(T* val, ttg::scope scope, std::size_t count = 1) {
+  return devicescratch<T>(val, scope, count);
+}
+
+} // namespace ttg
+
+#endif // TTG_DEVICESCRATCH_H
\ No newline at end of file
diff --git a/ttg/ttg/execution.h b/ttg/ttg/execution.h
index 29775a24b..495d0e248 100644
--- a/ttg/ttg/execution.h
+++ b/ttg/ttg/execution.h
@@ -17,9 +17,24 @@ enum class Execution {
 enum class ExecutionSpace {
   Host, // a CPU
   CUDA, // an NVIDIA CUDA device
+  HIP,  // an AMD HIP device
+  L0,   // an Intel L0 device
   Invalid
 };
 
+namespace detail {
+  inline const char *execution_space_name(ExecutionSpace space) noexcept {
+    switch (space) {
+      case ExecutionSpace::Host: return "Host";
+      case ExecutionSpace::CUDA: return "CUDA";
+      case ExecutionSpace::HIP: return "HIP";
+      case ExecutionSpace::L0: return "L0";
+      case ExecutionSpace::Invalid: return "INVALID";
+      default: return "UNKNOWN";
+    }
+  }
+} // namespace detail
+
 };
 
 #endif //TTG_EXECUTION_H
diff --git a/ttg/ttg/func.h b/ttg/ttg/func.h
index e383ebbc8..28b75ace9 100644
--- a/ttg/ttg/func.h
+++ b/ttg/ttg/func.h
@@ -98,11 +98,14 @@ namespace ttg {
   inline void connect(ttg::TerminalBase *out, ttg::TerminalBase *in) { out->connect(in); }
 
   /// \brief Connect producer output terminal outindex to consumer input terminal inindex (via unique or otherwise
-  /// wrapped pointers to TTs) \tparam outindex The index of the output terminal on the producer. \tparam inindex The
-  /// index of the input terminal on the consumer.
\param p The producer TT \param c The consumer TT + /// wrapped pointers to TTs) + /// \tparam outindex The index of the output terminal on the producer. + /// \tparam inindex The index of the input terminal on the consumer. + /// \param p The producer TT + /// \param c The consumer TT template - inline void connect(producer_tt_ptr &p, successor_tt_ptr &s) { - connect(p->template out(), s->template in()); + inline void connect(producer_tt_ptr &p, successor_tt_ptr &c) { + connect(p->template out(), c->template in()); } /// \brief Connect producer output terminal outindex to consumer input terminal inindex (via bare pointers to TTs) @@ -111,13 +114,13 @@ namespace ttg { /// \param p The producer TT /// \param c The consumer TT template - inline void connect(producer_tt_ptr *p, successor_tt_ptr *s) { - connect(p->template out(), s->template in()); + inline void connect(producer_tt_ptr *p, successor_tt_ptr *c) { + connect(p->template out(), c->template in()); } /// \brief Connect producer output terminal outindex to consumer input terminal inindex (via TTBase pointers) - /// \tparam outindex The index of the output terminal on the producer. - /// \tparam inindex The index of the input terminal on the consumer. + /// \param outindex The index of the output terminal on the producer. + /// \param inindex The index of the input terminal on the consumer. /// \param producer The producer TT /// \param consumer The consumer TT inline void connect(size_t outindex, size_t inindex, TTBase *producer, TTBase *consumer) { @@ -149,7 +152,7 @@ namespace ttg { /// \brief Sends a task id and a value to the given output terminal /// \param[in] key: the id of the task(s) receiving the value /// \param[in] value: the value to send to the receiving task(s) - /// \param[in] out: the output terminal + /// \param[in] t: the output terminal // clang-format on template inline void send(const keyT &key, valueT &&value, ttg::Out &t) { @@ -160,7 +163,7 @@ namespace ttg { // clang-format off /// \brief Sends a task id (without an accompanying value) to the given output terminal /// \param[in] key: the id of the task(s) receiving the value - /// \param[in] out: the output terminal + /// \param[in] t: the output terminal // clang-format on template inline void sendk(const keyT &key, ttg::Out &t) { @@ -170,7 +173,7 @@ namespace ttg { // clang-format off /// \brief Sends a value (without an accompanying task id) to the given output terminal /// \param[in] value: the value to send to the receiving task(s) - /// \param[in] out: the output terminal + /// \param[in] t: the output terminal // clang-format on template inline void sendv(valueT &&value, ttg::Out &t) { @@ -180,7 +183,7 @@ namespace ttg { // clang-format off /// \brief Sends a control message (message without an accompanying task id or a value) to the given output terminal - /// \param[in] out: the output terminal + /// \param[in] t: the output terminal // clang-format on inline void send(ttg::Out &t) { t.send(); } @@ -189,7 +192,7 @@ namespace ttg { /// \tparam Identifies which output terminal in \p t to select for sending /// \param[in] key: the id of the task(s) receiving the value /// \param[in] value: the value to send to the receiving task(s) - /// \param[in] out: a tuple of output terminals (typically, this is the output terminal of the template task where this is invoked) + /// \param[in] t: a tuple of output terminals (typically, this is the output terminal of the template task where this is invoked) // clang-format on template @@ -230,7 +233,7 @@ namespace ttg 
{ /// \brief Sends a task id (without an accompanying value) to the template tasks attached to the output terminal selected in the explicitly given terminal tuple \p t /// \tparam Identifies which output terminal in \p t to select for sending /// \param[in] key: the id of the task(s) receiving the value - /// \param[in] out: a tuple of output terminals (typically, this is the output terminal of the template task where this is invoked) + /// \param[in] t: a tuple of output terminals (typically, this is the output terminal of the template task where this is invoked) // clang-format on template inline std::enable_if_t, void> sendk(const keyT &key, @@ -264,7 +267,7 @@ namespace ttg { /// \brief Sends a value (without an accompanying task id) to the template tasks attached to the output terminal selected in the explicitly given terminal tuple \p t /// \tparam Identifies which output terminal in \p t to select for sending /// \param[in] value: the value to send to the receiving task(s) - /// \param[in] out: a tuple of output terminals (typically, this is the output terminal of the template task where this is invoked) + /// \param[in] t: a tuple of output terminals (typically, this is the output terminal of the template task where this is invoked) // clang-format on template @@ -276,7 +279,7 @@ namespace ttg { // clang-format off /// \brief Sends a value (without an accompanying task id) to the template tasks attached to the output terminal of this template task - /// \param[in] i Identifies which output terminal of this template task to select for sending + /// \param[in] Identifies which output terminal of this template task to select for sending /// \param[in] value: the value to send to the receiving task(s) // clang-format on template @@ -300,7 +303,7 @@ namespace ttg { // clang-format off /// \brief Sends a control message (message without an accompanying task id or a value) to the template tasks attached to the output terminal selected in the explicitly given terminal tuple \p t /// \tparam Identifies which output terminal in \p t to select for sending - /// \param[in] out: a tuple of output terminals (typically, this is the output terminal of the template task where this is invoked) + /// \param[in] t: a tuple of output terminals (typically, this is the output terminal of the template task where this is invoked) // clang-format on template inline void send(std::tuple...> &t) { @@ -519,7 +522,7 @@ namespace ttg { template inline void set_size(const std::size_t size) { - set_size(size); + set_size(i, size); } /// \brief Finalize streaming input terminals connecting to the given output terminal for tasks @@ -574,7 +577,7 @@ namespace ttg { template inline void finalize() { - finalize(); + finalize(i); } } // namespace ttg diff --git a/ttg/ttg/fwd.h b/ttg/ttg/fwd.h index df32505d0..f9b8d1c0f 100644 --- a/ttg/ttg/fwd.h +++ b/ttg/ttg/fwd.h @@ -47,6 +47,7 @@ namespace ttg { template void initialize(int argc, char **argv, int num_threads = -1, RestOfArgs &&...); void finalize(); + [[noreturn]] void abort(); World default_execution_context(); void execute(ttg::World world); diff --git a/ttg/ttg/madness/buffer.h b/ttg/ttg/madness/buffer.h new file mode 100644 index 000000000..8f210c1e7 --- /dev/null +++ b/ttg/ttg/madness/buffer.h @@ -0,0 +1,299 @@ +#ifndef TTG_MADNESS_BUFFER_H +#define TTG_MADNESS_BUFFER_H + +#include "ttg/serialization/traits.h" + +namespace ttg_madness { + +/// A runtime-managed buffer mirrored between host and device memory +template +struct Buffer : private Allocator { + + 
using element_type = std::decay_t; + + using allocator_traits = std::allocator_traits; + using allocator_type = typename allocator_traits::allocator_type; + + static_assert(std::is_trivially_copyable_v, + "Only trivially copyable types are supported for devices."); + static_assert(std::is_default_constructible_v, + "Only default constructible types are supported for devices."); + +private: + using delete_fn_t = std::function; + + using host_data_ptr = std::add_pointer_t; + host_data_ptr m_host_data = nullptr; + std::size_t m_count = 0; + bool m_owned= false; + + static void delete_non_owned(element_type *ptr) { + // nothing to be done, we don't own the memory + } + + allocator_type& get_allocator_reference() { return static_cast(*this); } + + element_type* allocate(std::size_t n) { + return allocator_traits::allocate(get_allocator_reference(), n); + } + + void deallocate() { + allocator_traits::deallocate(get_allocator_reference(), m_host_data, m_count); + } + +public: + + Buffer() : Buffer(nullptr, 0) + { } + + Buffer(std::size_t n) + : allocator_type() + , m_host_data(allocate(n)) + , m_count(n) + , m_owned(true) + { } + + /* Constructing a buffer using application-managed memory. + * The memory pointed to by ptr must be accessible during + * the life-time of the buffer. */ + Buffer(element_type* ptr, std::size_t n = 1) + : allocator_type() + , m_host_data(ptr) + , m_count(n) + , m_owned(false) + { } + + virtual ~Buffer() { + if (m_owned) { + deallocate(); + m_owned = false; + } + unpin(); // make sure the copies are not pinned + } + + /* allow moving device buffers */ + Buffer(Buffer&& db) + : allocator_type(std::move(db)) + , m_host_data(db.m_host_data) + , m_count(db.m_count) + , m_owned(db.m_owned) + { + db.m_host_data = nullptr; + db.m_count = 0; + db.m_owned = false; + } + + /* explicitly disable copying of buffers + * TODO: should we allow this? What data to use? + */ + Buffer(const Buffer& db) = delete; + + /* allow moving device buffers */ + Buffer& operator=(Buffer&& db) { + allocator_type::operator=(std::move(db)); + std::swap(m_host_data, db.m_host_data); + std::swap(m_count, db.m_count); + std::swap(m_owned, db.m_owned); + return *this; + } + + /* explicitly disable copying of buffers + * TODO: should we allow this? What data to use? + */ + Buffer& operator=(const Buffer& db) = delete; + + /* set the current device, useful when a device + * buffer was modified outside of a TTG */ + void set_current_device(const ttg::device::Device& device) { + assert(is_valid()); + if (!device.is_host()) throw std::runtime_error("MADNESS backend does not support non-host memory!"); + /* no-op */ + } + + /* Get the owner device ID, i.e., the last updated + * device buffer. */ + ttg::device::Device get_owner_device() const { + assert(is_valid()); + return {}; // host only + } + + /* Get the pointer on the currently active device. */ + element_type* current_device_ptr() { + assert(is_valid()); + return m_host_data; + } + + /* Get the pointer on the currently active device. */ + const element_type* current_device_ptr() const { + assert(is_valid()); + return m_host_data; + } + + /* Get the pointer on the owning device. + * @note: This may not be the device assigned to the currently executing task. + * See \ref ttg::device::current_device for that. 
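+   * (In this MADNESS backend every copy lives in host memory, so host_ptr(),
+   * current_device_ptr() and owner_device_ptr() all return the same pointer;
+   * an illustrative check, not part of this patch:
+   *   ttg::Buffer<double> b(16);
+   *   assert(b.host_ptr() == b.current_device_ptr());
+   * )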
*/ + element_type* owner_device_ptr() { + assert(is_valid()); + return m_host_data; + } + + /* get the current device pointer */ + const element_type* owner_device_ptr() const { + assert(is_valid()); + return m_host_data; + } + + /* get the device pointer at the given device + */ + element_type* device_ptr_on(const ttg::device::Device& device) { + assert(is_valid()); + if (device.is_device()) throw std::runtime_error("MADNESS missing support for non-host memory!"); + return m_host_data; + } + + /* get the device pointer at the given device + */ + const element_type* device_ptr_on(const ttg::device::Device& device) const { + assert(is_valid()); + if (device.is_device()) throw std::runtime_error("MADNESS missing support for non-host memory!"); + return m_host_data; + } + + element_type* host_ptr() { + return m_host_data; + } + + const element_type* host_ptr() const { + return m_host_data; + } + + bool is_valid_on(const ttg::device::Device& device) const { + assert(is_valid()); + if (device.is_device()) throw std::runtime_error("MADNESS missing support for non-host memory!"); + return true; + } + + void allocate_on(const ttg::device::Device& device_id) { + /* TODO: need exposed PaRSEC memory allocator */ + throw std::runtime_error("not implemented yet"); + } + + /* TODO: can we do this automatically? + * Pin the memory on all devices we currently track. + * Pinned memory won't be released by PaRSEC and can be used + * at any time. + */ + void pin() { + // nothing to do + } + + /* Unpin the memory on all devices we currently track. */ + void unpin() { + // nothing to do + } + + /* Pin the memory on a given device */ + void pin_on(int device_id) { + /* TODO: how can we pin memory on a device? */ + } + + /* Pin the memory on a given device */ + void unpin_on(int device_id) { + /* TODO: how can we unpin memory on a device? */ + } + + bool is_valid() const { + return true; + } + + operator bool() const { + return true; + } + + std::size_t size() const { + return m_count; + } + + /* Reallocate the buffer with count elements */ + void reset(std::size_t n) { + + if (m_owned) { + deallocate(); + m_owned = false; + } + + if (n == 0) { + m_host_data = nullptr; + m_owned = false; + } else { + m_host_data = allocate(n); + m_owned = true; + } + m_count = n; + } + + /* Reset the buffer to use the ptr to count elements */ + void reset(T* ptr, std::size_t n = 1) { + /* TODO: can we resize if count is smaller than m_count? */ + if (n == m_count) { + return; + } + + if (m_owned) { + deallocate(); + } + + if (nullptr == ptr) { + m_host_data = nullptr; + m_count = 0; + m_owned = false; + } else { + m_host_data = ptr; + m_count = n; + m_owned = false; + } + } + + /* serialization support */ + +#if defined(TTG_SERIALIZATION_SUPPORTS_BOOST) && 0 + template + void serialize(Archive& ar, const unsigned int version) { + if constexpr (ttg::detail::is_output_archive_v) { + std::size_t s = size(); + ar& s; + /* TODO: how to serialize the array? */ + } else { + std::size_t s; + ar & s; + /* initialize internal pointers and then reset */ + reset(s); + /* TODO: how to deserialize the array? 
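+       * one candidate (untested, hence the "&& 0" guard above) would be
+       *   ar >> boost::serialization::make_array(host_ptr(), s);
+       * with the matching make_array call on the output path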
*/
+    }
+  }
+#endif // TTG_SERIALIZATION_SUPPORTS_BOOST
+
+#if defined(TTG_SERIALIZATION_SUPPORTS_MADNESS)
+  template
+  std::enable_if_t ||
+                   std::is_base_of_v>
+  serialize(Archive& ar) {
+    if constexpr (ttg::detail::is_output_archive_v) {
+      std::size_t s = size();
+      ar& s;
+      ar << wrap(host_ptr(), s);
+    } else {
+      std::size_t s;
+      ar & s;
+      reset(s);
+      ar >> wrap(host_ptr(), s);
+    }
+  }
+#endif // TTG_SERIALIZATION_SUPPORTS_MADNESS
+
+};
+
+} // namespace ttg_madness
+
+#endif // TTG_MADNESS_BUFFER_H
diff --git a/ttg/ttg/madness/fwd.h b/ttg/ttg/madness/fwd.h
index af050f9a1..0d340db0b 100644
--- a/ttg/ttg/madness/fwd.h
+++ b/ttg/ttg/madness/fwd.h
@@ -26,6 +26,7 @@ namespace ttg_madness {
   inline void ttg_finalize();
+  [[noreturn]] inline void ttg_abort();
   inline ttg::World ttg_default_execution_context();
@@ -47,6 +48,35 @@ namespace ttg_madness {
   template
   inline void ttg_broadcast(ttg::World world, T &data, int source_rank);
+
+  /* device definitions, not currently provided by this impl */
+  template>
+  struct Buffer;
+
+  template
+  struct Ptr;
+
+  template
+  struct devicescratch;
+
+  template
+  struct TTValue;
+
+  template
+  Ptr make_ptr(Args&&... args);
+
+  template
+  auto get_ptr(T&& obj);
+
+  template
+  inline bool register_device_memory(std::tuple &views);
+
+  template
+  inline void post_device_out(std::tuple &b);
+
+  template
+  inline void mark_device_out(std::tuple &b);
+
 } // namespace ttg_madness
 #endif // TTG_MADNESS_FWD_H
diff --git a/ttg/ttg/madness/ttg.h b/ttg/ttg/madness/ttg.h
index f483a3bc9..5fab6f6dd 100644
--- a/ttg/ttg/madness/ttg.h
+++ b/ttg/ttg/madness/ttg.h
@@ -23,6 +23,9 @@
 #include "ttg/util/meta/callable.h"
 #include "ttg/util/void.h"
 #include "ttg/world.h"
+#ifdef TTG_HAS_COROUTINE
+#include "ttg/coroutine.h"
+#endif
 #include
 #include
@@ -132,7 +135,10 @@ namespace ttg_madness {
     ::madness::finalize();
   }
   inline ttg::World ttg_default_execution_context() { return ttg::get_default_world(); }
-  inline void ttg_abort() { MPI_Abort(ttg_default_execution_context().impl().impl().mpi.Get_mpi_comm(), 1); }
+  inline void ttg_abort() {
+    MPI_Abort(ttg_default_execution_context().impl().impl().mpi.Get_mpi_comm(), 1);
+    assert(0); // make sure we abort
+  }
   inline void ttg_execute(ttg::World world) {
     // World executes tasks eagerly
   }
@@ -207,6 +213,26 @@ namespace ttg_madness {
    public:
     ttg::World get_world() const override final { return world; }
+    /// @return true if derivedT::have_cuda_op exists and is defined to true
+    static constexpr bool derived_has_cuda_op() {
+      return false;
+    }
+
+    /// @return true if derivedT::have_hip_op exists and is defined to true
+    static constexpr bool derived_has_hip_op() {
+      return false;
+    }
+
+    /// @return true if derivedT::have_level_zero_op exists and is defined to true
+    static constexpr bool derived_has_level_zero_op() {
+      return false;
+    }
+
+    /// @return true if the TT supports device execution
+    static constexpr bool derived_has_device_op() {
+      return false;
+    }
+
    protected:
     using worldobjT = ::madness::WorldObject;
@@ -276,6 +302,10 @@ namespace ttg_madness {
       derivedT *derived;  // Pointer to derived class instance
       bool pull_terminals_invoked = false;
       std::conditional_t, ttg::Void, keyT> key;  // Task key
+#ifdef TTG_HAS_COROUTINE
+      void *suspended_task_address = nullptr;  // if not null the function is suspended
+      ttg::TaskCoroutineID coroutine_id = ttg::TaskCoroutineID::Invalid;
+#endif
       /// makes a tuple of references out of tuple of
       template
@@ -300,28 +330,88 @@ namespace ttg_madness {
       }
       virtual void
run(::madness::World &world) override { - // ttg::print("starting task"); - using ttg::hash; ttT::threaddata.key_hash = hash{}(key); ttT::threaddata.call_depth++; - if constexpr (!ttg::meta::is_void_v && !ttg::meta::is_empty_tuple_v) { - derived->op(key, this->make_input_refs(), - derived->output_terminals); // !!! NOTE converting input values to refs - } else if constexpr (!ttg::meta::is_void_v && ttg::meta::is_empty_tuple_v) { - derived->op(key, derived->output_terminals); - } else if constexpr (ttg::meta::is_void_v && !ttg::meta::is_empty_tuple_v) { - derived->op(this->make_input_refs(), - derived->output_terminals); // !!! NOTE converting input values to refs - } else if constexpr (ttg::meta::is_void_v && ttg::meta::is_empty_tuple_v) { - derived->op(derived->output_terminals); - } else - abort(); + void *suspended_task_address = +#ifdef TTG_HAS_COROUTINE + this->suspended_task_address; // non-null = need to resume the task +#else + nullptr; +#endif + if (suspended_task_address == nullptr) { // task is a coroutine that has not started or an ordinary function + // ttg::print("starting task"); + if constexpr (!ttg::meta::is_void_v && !ttg::meta::is_empty_tuple_v) { + TTG_PROCESS_TT_OP_RETURN( + suspended_task_address, + coroutine_id, + derived->op(key, this->make_input_refs(), + derived->output_terminals)); // !!! NOTE converting input values to refs + } else if constexpr (!ttg::meta::is_void_v && ttg::meta::is_empty_tuple_v) { + TTG_PROCESS_TT_OP_RETURN(suspended_task_address, coroutine_id, derived->op(key, derived->output_terminals)); + } else if constexpr (ttg::meta::is_void_v && !ttg::meta::is_empty_tuple_v) { + TTG_PROCESS_TT_OP_RETURN( + suspended_task_address, + coroutine_id, + derived->op(this->make_input_refs(), + derived->output_terminals)); // !!! 
NOTE converting input values to refs
+          } else if constexpr (ttg::meta::is_void_v && ttg::meta::is_empty_tuple_v) {
+            TTG_PROCESS_TT_OP_RETURN(suspended_task_address, coroutine_id, derived->op(derived->output_terminals));
+          } else  // unreachable
+            ttg::abort();
+        } else {  // resume suspended coroutine
+#ifdef TTG_HAS_COROUTINE
+          auto ret = static_cast(ttg::coroutine_handle::from_address(suspended_task_address));
+          assert(ret.ready());
+          ret.resume();
+          if (ret.completed()) {
+            ret.destroy();
+            suspended_task_address = nullptr;
+          } else {  // not yet completed
+            // leave suspended_task_address as is
+          }
+          this->suspended_task_address = suspended_task_address;
+#else
+          ttg::abort();  // should not happen
+#endif
+        }
         ttT::threaddata.call_depth--;
-        // ttg::print("finishing task",ttT::threaddata.call_depth);
+        // if (suspended_task_address == nullptr) {
+        //   ttg::print("finishing task",ttT::threaddata.call_depth);
+        // }
+
+#ifdef TTG_HAS_COROUTINE
+        if (suspended_task_address) {
+          // TODO implement handling of suspended coroutines properly
+
+          // only resumable_task is recognized at the moment
+          assert(coroutine_id == ttg::TaskCoroutineID::ResumableTask);
+
+          // events are not yet properly implemented; we are only testing the workflow with dummy events,
+          // so mark the events finished manually here
+          // (the proper thing to do is to process the event queue and resubmit this task)
+          auto events =
+              static_cast(ttg::coroutine_handle::from_address(suspended_task_address)).events();
+          for (auto &event_ptr : events) {
+            event_ptr->finish();
+          }
+          assert(ttg::coroutine_handle::from_address(suspended_task_address).promise().ready());
+
+          // resume the coroutine
+          auto ret = static_cast(ttg::coroutine_handle::from_address(suspended_task_address));
+          assert(ret.ready());
+          ret.resume();
+          if (ret.completed()) {
+            ret.destroy();
+            suspended_task_address = nullptr;
+          } else {  // not yet completed
+            ttg::abort();
+          }
+        }
+#endif  // TTG_HAS_COROUTINE
      }
      virtual ~TTArgs() {}  // Will be deleted via TaskInterface*
@@ -567,7 +657,7 @@ namespace ttg_madness {
        } else if constexpr (ttg::meta::is_void_v && ttg::meta::is_empty_tuple_v) {
          static_cast(this)->op(output_terminals);  // Runs immediately
        } else
-          abort();
+          ttg::abort();
        ttT::threaddata.call_depth--;
      } else {
@@ -947,7 +1037,7 @@ namespace ttg_madness {
      auto finalize_callback = [this]() { finalize_argstream(); };
      input.set_callback(send_callback, send_callback, {}, setsize_callback, finalize_callback);
    } else
-      abort();
+      ttg::abort();
  }
  template
@@ -1073,14 +1163,14 @@ namespace ttg_madness {
        for (std::size_t i = 0; i < numins; i++) std::cerr << (item.second->nargs[i] == 0 ?
"T" : "F") << " "; std::cerr << ")" << std::endl; } - abort(); + ttg::abort(); } } /// define the reducer function to be called when additional inputs are /// received on a streaming terminal /// @tparam the index of the input terminal that is used as a streaming terminal - /// @param[in] reducer: a function of prototype (input_type &a, const input_type &b) + /// @param[in] reducer: a function of prototype `void(input_type &a, const input_type &b)` /// that function should aggregate b into a template void set_input_reducer(Reducer &&reducer) { @@ -1091,7 +1181,7 @@ namespace ttg_madness { /// define the reducer function to be called when additional inputs are /// received on a streaming terminal /// @tparam the index of the input terminal that is used as a streaming terminal - /// @param[in] reducer: a function of prototype (input_type &a, const input_type &b) + /// @param[in] reducer: a function of prototype `void(input_type &a, const input_type &b)` /// that function should aggregate b into a /// @param[in] size: the default number of inputs that are received in this streaming terminal, /// for each task @@ -1231,5 +1321,7 @@ namespace ttg_madness { } // namespace ttg_madness #include "ttg/madness/watch.h" +#include "ttg/madness/buffer.h" +#include "ttg/madness/ttvalue.h" #endif // MADNESS_TTG_H_INCLUDED diff --git a/ttg/ttg/madness/ttvalue.h b/ttg/ttg/madness/ttvalue.h new file mode 100644 index 000000000..ad53ee5f8 --- /dev/null +++ b/ttg/ttg/madness/ttvalue.h @@ -0,0 +1,14 @@ +#ifndef TTG_MADNESS_TTVALUE_H +#define TTG_MADNESS_TTVALUE_H + +namespace ttg_madness { + + template + struct TTValue + { + /* empty */ + }; + +} // namespace ttg_madness + + #endif // TTG_MADNESS_TTVALUE_H diff --git a/ttg/ttg/make_tt.h b/ttg/ttg/make_tt.h index eabd3c34e..1711d8556 100644 --- a/ttg/ttg/make_tt.h +++ b/ttg/ttg/make_tt.h @@ -8,7 +8,8 @@ // case 1 (keyT != void): void op(auto&& key, std::tuple&&, std::tuple&) // case 2 (keyT == void): void op(std::tuple&&, std::tuple&) // -template class CallableWrapTT : public TT struct CallableWrapTTUnwrapTypelist; -template -struct CallableWrapTTUnwrapTypelist> { - using type = CallableWrapTT...>; }; @@ -122,15 +127,17 @@ struct CallableWrapTTUnwrapTypelist&) -// case 2 (keyT == void): void op(input_valuesT&&..., std::tuple&) +// case 1 (keyT != void): returnT op(auto&& key, input_valuesT&&..., std::tuple&) +// case 2 (keyT == void): returnT op(input_valuesT&&..., std::tuple&) // -template +// returnT is void for funcT = synchronous (ordinary) function and the appropriate return type for funcT=coroutine +template class CallableWrapTTArgs - : public TT, - ttg::typelist> { + : public TT< + keyT, output_terminalsT, + CallableWrapTTArgs, + ttg::typelist> { using baseT = typename CallableWrapTTArgs::ttT; using input_values_tuple_type = typename baseT::input_values_tuple_type; @@ -141,55 +148,180 @@ class CallableWrapTTArgs using noref_funcT = std::remove_reference_t; std::conditional_t, std::add_pointer_t, noref_funcT> func; + using op_return_type = +#ifdef TTG_HAS_COROUTINE + std::conditional_t, + ttg::coroutine_handle, +#ifdef TTG_HAVE_DEVICE + std::conditional_t, + ttg::device::Task::base_type, + void> +#else // TTG_HAVE_DEVICE + void +#endif // TTG_HAVE_DEVICE + >; +#else // TTG_HAS_COROUTINE + void; +#endif // TTG_HAS_COROUTINE + +public: + static constexpr bool have_cuda_op = (space == ttg::ExecutionSpace::CUDA); + static constexpr bool have_hip_op = (space == ttg::ExecutionSpace::HIP); + static constexpr bool have_level_zero_op = (space == 
ttg::ExecutionSpace::L0); + +protected: + + template + auto process_return(ReturnT&& ret, output_terminalsT &out) { + static_assert(std::is_same_v, returnT>, + "CallableWrapTTArgs: returnT does not match the actual return type of funcT"); + if constexpr (!std::is_void_v) { // protect from compiling for void returnT +#ifdef TTG_HAS_COROUTINE + if constexpr (std::is_same_v) { + ttg::coroutine_handle coro_handle; + // if task completed destroy it + if (ret.completed()) { + ret.destroy(); + } else { // if task is suspended return the coroutine promise ptr + coro_handle = ret; + } + return coro_handle; + } else +#ifdef TTG_HAVE_DEVICE + if constexpr (std::is_same_v) { + ttg::device::Task::base_type coro_handle = ret; + return coro_handle; + } +#else // TTG_HAVE_DEVICE + ttg::abort(); // should not happen +#endif // TTG_HAVE_DEVICE + if constexpr (!(std::is_same_v +#ifdef TTG_HAVE_DEVICE + || std::is_same_v +#endif // TTG_HAVE_DEVICE + )) +#endif + { + static_assert(std::tuple_size_v> == 1, + "CallableWrapTTArgs <= 2, + "CallableWrapTTArgs == 0) + std::get<0>(out).sendv(std::move(ret)); + else if constexpr (std::tuple_size_v == 1) + std::get<0>(out).sendk(std::move(std::get<0>(ret))); + else if constexpr (std::tuple_size_v == 2) + std::get<0>(out).send(std::move(std::get<0>(ret)), std::move(std::get<1>(ret))); + return; + } + } + } + + /// @return coroutine handle<> (if funcT is a coroutine), else void template - void call_func(Key &&key, Tuple &&args_tuple, output_terminalsT &out, std::index_sequence) { + auto call_func(Key &&key, Tuple &&args_tuple, output_terminalsT &out, std::index_sequence) { using func_args_t = ttg::meta::tuple_concat_t, input_refs_tuple_type, output_edges_type>; - if constexpr (funcT_receives_outterm_tuple) - func(std::forward(key), - baseT::template get>(std::forward(args_tuple))..., out); - else { + + if constexpr (funcT_receives_outterm_tuple) { + if constexpr (std::is_void_v) { + func(std::forward(key), + baseT::template get>(std::forward(args_tuple))..., out); + return; + } else { + auto ret = func( + std::forward(key), + baseT::template get>(std::forward(args_tuple))..., out); + + return process_return(std::move(ret), out); + } + } else { auto old_output_tls_ptr = this->outputs_tls_ptr_accessor(); this->set_outputs_tls_ptr(); - func(std::forward(key), - baseT::template get>(std::forward(args_tuple))...); - this->set_outputs_tls_ptr(old_output_tls_ptr); + if constexpr (std::is_void_v) { + func(std::forward(key), + baseT::template get>(std::forward(args_tuple))...); + this->set_outputs_tls_ptr(old_output_tls_ptr); + return; + } else { + auto ret = + func(std::forward(key), + baseT::template get>(std::forward(args_tuple))...); + this->set_outputs_tls_ptr(old_output_tls_ptr); + return process_return(std::move(ret), out); + } } } template - void call_func(Tuple &&args_tuple, output_terminalsT &out, std::index_sequence) { + auto call_func(Tuple &&args_tuple, output_terminalsT &out, std::index_sequence) { using func_args_t = ttg::meta::tuple_concat_t; - if constexpr (funcT_receives_outterm_tuple) - func(baseT::template get>(std::forward(args_tuple))..., out); - else { + if constexpr (funcT_receives_outterm_tuple) { + if constexpr (std::is_void_v) { + func(baseT::template get>(std::forward(args_tuple))..., out); + } else { + auto ret = func(baseT::template get>(std::forward(args_tuple))..., out); + return process_return(std::move(ret), out); + } + } else { auto old_output_tls_ptr = this->outputs_tls_ptr_accessor(); this->set_outputs_tls_ptr(); - func(baseT::template 
get>(std::forward(args_tuple))...);
-      this->set_outputs_tls_ptr(old_output_tls_ptr);
+      if constexpr (std::is_void_v) {
+        func(baseT::template get>(std::forward(args_tuple))...);
+        this->set_outputs_tls_ptr(old_output_tls_ptr);
+      } else {
+        auto ret = func(baseT::template get>(std::forward(args_tuple))...);
+        this->set_outputs_tls_ptr(old_output_tls_ptr);
+        return process_return(std::move(ret), out);
+      }
    }
  }

  template
-  void call_func(Key &&key, output_terminalsT &out) {
-    if constexpr (funcT_receives_outterm_tuple)
-      func(std::forward(key), out);
-    else {
+  auto call_func(Key &&key, output_terminalsT &out) {
+    if constexpr (funcT_receives_outterm_tuple) {
+      if constexpr (std::is_void_v) {
+        func(std::forward(key), out);
+      } else {
+        auto ret = func(std::forward(key), out);
+        return process_return(std::move(ret), out);
+      }
+    } else {
      auto old_output_tls_ptr = this->outputs_tls_ptr_accessor();
      this->set_outputs_tls_ptr();
-      func(std::forward(key));
-      this->set_outputs_tls_ptr(old_output_tls_ptr);
+      if constexpr (std::is_void_v) {
+        func(std::forward(key));
+        this->set_outputs_tls_ptr(old_output_tls_ptr);
+      } else {
+        auto ret = func(std::forward(key));
+        this->set_outputs_tls_ptr(old_output_tls_ptr);
+        return process_return(std::move(ret), out);
+      }
    }
  }

  template
-  void call_func(OutputTerminals &out) {
-    if constexpr (funcT_receives_outterm_tuple)
-      func(out);
-    else {
+  auto call_func(OutputTerminals &out) {
+    if constexpr (funcT_receives_outterm_tuple) {
+      if constexpr (std::is_void_v) {
+        func(out);
+      } else {
+        auto ret = func(out);
+        return process_return(std::move(ret), out);
+      }
+    } else {
      auto old_output_tls_ptr = this->outputs_tls_ptr_accessor();
      this->set_outputs_tls_ptr();
-      func();
-      this->set_outputs_tls_ptr(old_output_tls_ptr);
+      if constexpr (std::is_void_v) {
+        func();
+        this->set_outputs_tls_ptr(old_output_tls_ptr);
+      } else {
+        auto ret = func();  // funcT does not receive the terminals tuple in this branch
+        this->set_outputs_tls_ptr(old_output_tls_ptr);
+        return process_return(std::move(ret), out);
+      }
    }
  }

@@ -214,54 +346,55 @@ class CallableWrapTTArgs
  template
  std::enable_if_t && !ttg::meta::is_empty_tuple_v && !ttg::meta::is_void_v,
-                   void>
+                   op_return_type>
  op(Key &&key, ArgsTuple &&args_tuple, output_terminalsT &out) {
    assert(&out == &baseT::get_output_terminals());
-    call_func(std::forward(key), std::forward(args_tuple), out,
-              std::make_index_sequence>{});
+    return call_func(std::forward(key), std::forward(args_tuple), out,
+                     std::make_index_sequence>{});
  };

  template
  std::enable_if_t && !ttg::meta::is_empty_tuple_v && ttg::meta::is_void_v,
-                   void>
+                   op_return_type>
  op(ArgsTuple &&args_tuple, output_terminalsT &out) {
    assert(&out == &baseT::get_output_terminals());
-    call_func(std::forward(args_tuple), out, std::make_index_sequence>{});
+    return call_func(std::forward(args_tuple), out,
+                     std::make_index_sequence>{});
  };

  template
-  std::enable_if_t && !ttg::meta::is_void_v, void> op(
+  std::enable_if_t && !ttg::meta::is_void_v, op_return_type> op(
      Key &&key, output_terminalsT &out) {
    assert(&out == &baseT::get_output_terminals());
-    call_func(std::forward(key), out);
+    return call_func(std::forward(key), out);
  };

  template
-  std::enable_if_t && ttg::meta::is_void_v, void> op(
+  std::enable_if_t && ttg::meta::is_void_v, op_return_type> op(
      output_terminalsT &out) {
    assert(&out == &baseT::get_output_terminals());
-    call_func(out);
+    return call_func(out);
  };
};

-template
+template
struct CallableWrapTTArgsAsTypelist;

-template
-struct CallableWrapTTArgsAsTypelist
+struct CallableWrapTTArgsAsTypelist> {
-  using type =
CallableWrapTTArgs...>; }; -template -struct CallableWrapTTArgsAsTypelist +struct CallableWrapTTArgsAsTypelist> { - using type = CallableWrapTTArgs...>; }; @@ -421,7 +554,9 @@ auto make_tt_tpl(funcT &&func, const std::tuple +template auto make_tt(funcT &&func, const std::tuple...> &inedges = std::tuple<>{}, const std::tuple &outedges = std::tuple<>{}, const std::string &name = "wrapper", const std::vector &innames = std::vector(sizeof...(input_edge_valuesT), "input"), @@ -447,7 +582,10 @@ auto make_tt(funcT &&func, const std::tuple. // gross argument typelist for invoking func, can include void for optional args constexpr static auto func_is_generic = ttg::meta::is_generic_callable_v; - using gross_func_args_t = decltype(ttg::meta::compute_arg_binding_types_r(func, candidate_func_args_t{})); + using return_type_typelist_and_gross_func_args_t = + decltype(ttg::meta::compute_arg_binding_types(func, candidate_func_args_t{})); + using func_return_t = std::tuple_element_t<0, std::tuple_element_t<0, return_type_typelist_and_gross_func_args_t>>; + using gross_func_args_t = std::tuple_element_t<1, return_type_typelist_and_gross_func_args_t>; constexpr auto DETECTED_HOW_TO_INVOKE_GENERIC_FUNC = func_is_generic ? !std::is_same_v> : true; static_assert(DETECTED_HOW_TO_INVOKE_GENERIC_FUNC, @@ -493,12 +631,21 @@ auto make_tt(funcT &&func, const std::tuple. using decayed_input_args_t = ttg::meta::decayed_typelist_t; // 3. full_input_args_t = edge-types with non-void types replaced by input_args_t using full_input_args_t = ttg::meta::replace_nonvoid_t; - using wrapT = typename CallableWrapTTArgsAsTypelist::type; + using wrapT = typename CallableWrapTTArgsAsTypelist::type; return std::make_unique(std::forward(func), inedges, outedges, name, innames, outnames); } +template +auto make_tt(funcT &&func, const std::tuple...> &inedges = std::tuple<>{}, + const std::tuple &outedges = std::tuple<>{}, const std::string &name = "wrapper", + const std::vector &innames = std::vector(sizeof...(input_edge_valuesT), "input"), + const std::vector &outnames = std::vector(sizeof...(output_edgesT), "output")) { + return make_tt(std::forward(func), inedges, outedges, name, innames, outnames); +} + template [[deprecated("use make_tt_tpl instead")]] inline auto wrapt( funcT &&func, const std::tuple...> &inedges, diff --git a/ttg/ttg/parsec/buffer.h b/ttg/ttg/parsec/buffer.h new file mode 100644 index 000000000..98b14eb12 --- /dev/null +++ b/ttg/ttg/parsec/buffer.h @@ -0,0 +1,392 @@ +#ifndef TTG_PARSEC_BUFFER_H +#define TTG_PARSEC_BUFFER_H + +#include +#include +#include +#include +#include +#include "ttg/parsec/ttg_data_copy.h" +#include "ttg/parsec/parsec-ext.h" +#include "ttg/util/iovec.h" +#include "ttg/device/device.h" +#include "ttg/parsec/device.h" + +#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) +#include +#endif // PARSEC_HAVE_DEV_CUDA_SUPPORT + +namespace ttg_parsec { + + +namespace detail { + // fwd decl + template + parsec_data_t* get_parsec_data(const ttg_parsec::Buffer& db); +} // namespace detail + +/** + * A buffer that is mirrored between host memory + * and different devices. The runtime is free to + * move data between device and host memory based + * on where the tasks are executing. + * + * Note that a buffer is movable and should not + * be shared between two objects (e.g., through a pointer) + * in order for TTG to properly facilitate ownership + * tracking of the containing object. 
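+ *
+ * A minimal usage sketch (illustrative only, assuming the usual ttg::Buffer
+ * alias maps to this backend's Buffer):
+ *
+ *   ttg::Buffer<double> b(n);              // host allocation, tracked by PaRSEC
+ *   double* dev = b.current_device_ptr();  // pointer on the currently active device
+ *   ttg::Buffer<double> c = std::move(b);  // ownership and tracking move to c; b is empty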
+ */ +template +struct Buffer : public detail::ttg_parsec_data_wrapper_t + , private Allocator { + + using element_type = std::decay_t; + + using allocator_traits = std::allocator_traits; + using allocator_type = typename allocator_traits::allocator_type; + + static_assert(std::is_trivially_copyable_v, + "Only trivially copyable types are supported for devices."); + static_assert(std::is_default_constructible_v, + "Only default constructible types are supported for devices."); + +private: + using delete_fn_t = std::function; + + using host_data_ptr = std::add_pointer_t; + host_data_ptr m_host_data = nullptr; + std::size_t m_count = 0; + bool m_owned= false; + + static void delete_non_owned(element_type *ptr) { + // nothing to be done, we don't own the memory + } + + friend parsec_data_t* detail::get_parsec_data(const ttg_parsec::Buffer&); + + allocator_type& get_allocator_reference() { return static_cast(*this); } + + element_type* allocate(std::size_t n) { + return allocator_traits::allocate(get_allocator_reference(), n); + } + + void deallocate() { + allocator_traits::deallocate(get_allocator_reference(), m_host_data, m_count); + } + +public: + + Buffer() : Buffer(nullptr, 0) + { } + + Buffer(std::size_t n) + : ttg_parsec_data_wrapper_t() + , allocator_type() + , m_host_data(allocate(n)) + , m_count(n) + , m_owned(true) + { + //std::cout << "buffer " << this << " ctor count " + // << m_count << "(" << m_host_data << ") ttg_copy " + // << m_ttg_copy + // << " parsec_data " << m_data.get() << std::endl; + this->reset_parsec_data(m_host_data, n*sizeof(element_type)); + } + + /* Constructing a buffer using application-managed memory. + * The memory pointed to by ptr must be accessible during + * the life-time of the buffer. */ + Buffer(element_type* ptr, std::size_t n = 1) + : ttg_parsec_data_wrapper_t() + , allocator_type() + , m_host_data(ptr) + , m_count(n) + , m_owned(false) + { + //std::cout << "buffer " << this << " ctor ptr " << ptr << "count " + // << m_count << "(" << m_host_data << ") ttg_copy " + // << m_ttg_copy + // << " parsec_data " << m_data.get() << std::endl; + this->reset_parsec_data(m_host_data, n*sizeof(element_type)); + } + + virtual ~Buffer() { + if (m_owned) { + deallocate(); + m_owned = false; + } + unpin(); // make sure the copies are not pinned + } + + /* allow moving device buffers */ + Buffer(Buffer&& db) + : ttg_parsec_data_wrapper_t(std::move(db)) + , allocator_type(std::move(db)) + , m_host_data(db.m_host_data) + , m_count(db.m_count) + , m_owned(db.m_owned) + { + db.m_host_data = nullptr; + db.m_count = 0; + db.m_owned = false; + } + + /* explicitly disable copying of buffers + * TODO: should we allow this? What data to use? + */ + Buffer(const Buffer& db) = delete; + + /* allow moving device buffers */ + Buffer& operator=(Buffer&& db) { + ttg_parsec_data_wrapper_t::operator=(std::move(db)); + allocator_type::operator=(std::move(db)); + std::swap(m_host_data, db.m_host_data); + std::swap(m_count, db.m_count); + std::swap(m_owned, db.m_owned); + //std::cout << "buffer " << this << " other " << &db << " mv op ttg_copy " << m_ttg_copy << std::endl; + //std::cout << "buffer::move-assign from " << &db << " ttg-copy " << db.m_ttg_copy + // << " to " << this << " ttg-copy " << m_ttg_copy + // << " parsec-data " << m_data.get() + // << std::endl; + /* don't update the ttg_copy, we keep the connection */ + return *this; + } + + /* explicitly disable copying of buffers + * TODO: should we allow this? What data to use? 
+ */ + Buffer& operator=(const Buffer& db) = delete; + + /* set the current device, useful when a device + * buffer was modified outside of a TTG */ + void set_current_device(const ttg::device::Device& device) { + assert(is_valid()); + int parsec_id = detail::ttg_device_to_parsec_device(device); + /* make sure it's a valid device */ + assert(parsec_nb_devices > parsec_id); + /* make sure it's a valid copy */ + assert(m_data->device_copies[parsec_id] != nullptr); + m_data->owner_device = parsec_id; + } + + /* Get the owner device ID, i.e., the last updated + * device buffer. */ + ttg::device::Device get_owner_device() const { + assert(is_valid()); + return detail::parsec_device_to_ttg_device(m_data->owner_device); + } + + /* Get the pointer on the currently active device. */ + element_type* current_device_ptr() { + assert(is_valid()); + int device_id = detail::ttg_device_to_parsec_device(ttg::device::current_device()); + return static_cast(m_data->device_copies[device_id]->device_private); + } + + /* Get the pointer on the currently active device. */ + const element_type* current_device_ptr() const { + assert(is_valid()); + int device_id = detail::ttg_device_to_parsec_device(ttg::device::current_device()); + return static_cast(m_data->device_copies[device_id]->device_private); + } + + /* Get the pointer on the owning device. + * @note: This may not be the device assigned to the currently executing task. + * See \ref ttg::device::current_device for that. */ + element_type* owner_device_ptr() { + assert(is_valid()); + return static_cast(m_data->device_copies[m_data->owner_device]->device_private); + } + + /* get the current device pointer */ + const element_type* owner_device_ptr() const { + assert(is_valid()); + return static_cast(m_data->device_copies[m_data->owner_device]->device_private); + } + + /* get the device pointer at the given device + */ + element_type* device_ptr_on(const ttg::device::Device& device) { + assert(is_valid()); + int device_id = detail::ttg_device_to_parsec_device(device); + return static_cast(parsec_data_get_ptr(m_data.get(), device_id)); + } + + /* get the device pointer at the given device + */ + const element_type* device_ptr_on(const ttg::device::Device& device) const { + assert(is_valid()); + int device_id = detail::ttg_device_to_parsec_device(device); + return static_cast(parsec_data_get_ptr(m_data.get(), device_id)); + } + + element_type* host_ptr() { + return static_cast(parsec_data_get_ptr(m_data.get(), 0)); + } + + const element_type* host_ptr() const { + return static_cast(parsec_data_get_ptr(m_data.get(), 0)); + } + + bool is_valid_on(const ttg::device::Device& device) const { + assert(is_valid()); + int device_id = detail::ttg_device_to_parsec_device(device); + return (parsec_data_get_ptr(m_data.get(), device_id) != nullptr); + } + + void allocate_on(const ttg::device::Device& device_id) { + /* TODO: need exposed PaRSEC memory allocator */ + throw std::runtime_error("not implemented yet"); + } + + /* TODO: can we do this automatically? + * Pin the memory on all devices we currently track. + * Pinned memory won't be released by PaRSEC and can be used + * at any time. + */ + void pin() { + for (int i = 1; i < parsec_nb_devices; ++i) { + pin_on(i); + } + } + + /* Unpin the memory on all devices we currently track. 
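+   * Note: unpin_on() is still a stub, so this loop is currently a no-op.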
*/ + void unpin() { + if (!is_valid()) return; + for (int i = 0; i < parsec_nb_devices-detail::first_device_id; ++i) { + unpin_on(i); + } + } + + /* Pin the memory on a given device */ + void pin_on(int device_id) { + /* TODO: how can we pin memory on a device? */ + } + + /* Pin the memory on a given device */ + void unpin_on(int device_id) { + /* TODO: how can we unpin memory on a device? */ + } + + bool is_valid() const { + return !!m_data; + } + + operator bool() const { + return is_valid(); + } + + std::size_t size() const { + return m_count; + } + + /* Reallocate the buffer with count elements */ + void reset(std::size_t n) { + /* TODO: can we resize if count is smaller than m_count? */ + + if (m_owned) { + deallocate(); + m_owned = false; + } + + if (n == 0) { + m_host_data = nullptr; + m_owned = false; + } else { + m_host_data = allocate(n); + m_owned = true; + } + reset_parsec_data(m_host_data, n*sizeof(element_type)); + //std::cout << "buffer::reset(" << count << ") ptr " << m_host_data.get() + // << " ttg_copy " << m_ttg_copy + // << " parsec_data " << m_data.get() << std::endl; + m_count = n; + } + + /* Reset the buffer to use the ptr to count elements */ + void reset(T* ptr, std::size_t n = 1) { + /* TODO: can we resize if count is smaller than m_count? */ + if (n == m_count) { + return; + } + + if (m_owned) { + deallocate(); + } + + if (nullptr == ptr) { + m_host_data = nullptr; + m_count = 0; + m_owned = false; + } else { + m_host_data = ptr; + m_count = n; + m_owned = false; + } + reset_parsec_data(m_host_data, n*sizeof(element_type)); + //std::cout << "buffer::reset(" << ptr << ", " << count << ") ptr " << m_host_data.get() + // << " ttg_copy " << m_ttg_copy + // << " parsec_data " << m_data.get() << std::endl; + } + + /* serialization support */ + +#ifdef TTG_SERIALIZATION_SUPPORTS_BOOST + template + void serialize(Archive& ar, const unsigned int version) { + if constexpr (ttg::detail::is_output_archive_v) { + std::size_t s = size(); + ar& s; + assert(m_ttg_copy != nullptr); // only tracked objects allowed + m_ttg_copy->iovec_add(ttg::iovec{s*sizeof(T), current_device_ptr()}); + } else { + std::size_t s; + ar & s; + /* initialize internal pointers and then reset */ + reset(s); + assert(m_ttg_copy != nullptr); // only tracked objects allowed + m_ttg_copy->iovec_add(ttg::iovec{s*sizeof(T), current_device_ptr()}); + } + } +#endif // TTG_SERIALIZATION_SUPPORTS_BOOST + +#ifdef TTG_SERIALIZATION_SUPPORTS_MADNESS + template + std::enable_if_t || + std::is_base_of_v> + serialize(Archive& ar) { + if constexpr (ttg::detail::is_output_archive_v) { + std::size_t s = size(); + ar& s; + assert(m_ttg_copy != nullptr); // only tracked objects allowed + /* transfer from the current device + * note: if the transport layer (MPI) does not support device transfers + * the data will have been pushed out */ + m_ttg_copy->iovec_add(ttg::iovec{s*sizeof(T), current_device_ptr()}); + } else { + std::size_t s; + ar & s; + //std::cout << "serialize(IN) buffer " << this << " size " << s << std::endl; + /* initialize internal pointers and then reset */ + reset(s); + assert(m_ttg_copy != nullptr); // only tracked objects allowed + /* transfer to the current device + * TODO: how can we make sure the device copy is not evicted? 
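+       * (pinning the copy via pin()/pin_on() would be the natural answer once
+       *  device pinning is implemented)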
*/ + m_ttg_copy->iovec_add(ttg::iovec{s*sizeof(T), current_device_ptr()}); + } + } +#endif // TTG_SERIALIZATION_SUPPORTS_MADNESS + + +}; + +namespace detail { + template + parsec_data_t* get_parsec_data(const ttg_parsec::Buffer& db) { + return const_cast(db.m_data.get()); + } +} // namespace detail + +} // namespace ttg_parsec + +#endif // TTG_PARSEC_BUFFER_H \ No newline at end of file diff --git a/ttg/ttg/parsec/device.h b/ttg/ttg/parsec/device.h new file mode 100644 index 000000000..77722b1c1 --- /dev/null +++ b/ttg/ttg/parsec/device.h @@ -0,0 +1,40 @@ +#ifndef TTG_PARSEC_DEVICE_H +#define TTG_PARSEC_DEVICE_H + +#include "ttg/device/device.h" + +namespace ttg_parsec { + + namespace detail { + + // the first ID of an accelerator in the parsec ID-space + inline int first_device_id = -1; + + /** + * map from TTG ID-space to parsec ID-space + */ + inline + int ttg_device_to_parsec_device(const ttg::device::Device& device) { + if (device.is_host()) { + return 0; + } else { + return device.id() + first_device_id; + } + } + + /** + * map from parsec ID-space to TTG ID-space + */ + inline + ttg::device::Device parsec_device_to_ttg_device(int parsec_id) { + if (parsec_id < first_device_id) { + return ttg::device::Device(parsec_id, ttg::ExecutionSpace::Host); + } + return ttg::device::Device(parsec_id - first_device_id, + ttg::device::available_execution_space); + } + } // namespace detail + +} // namespace ttg_parsec + +#endif // TTG_PARSEC_DEVICE_H \ No newline at end of file diff --git a/ttg/ttg/parsec/devicefunc.h b/ttg/ttg/parsec/devicefunc.h new file mode 100644 index 000000000..d4a488673 --- /dev/null +++ b/ttg/ttg/parsec/devicefunc.h @@ -0,0 +1,177 @@ +#ifndef TTG_PARSEC_DEVICEFUNC_H +#define TTG_PARSEC_DEVICEFUNC_H + +#if defined(TTG_HAVE_CUDART) +#include +#endif + +#include "ttg/parsec/task.h" +#include +#include + +#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) +#include +#elif defined(PARSEC_HAVE_DEV_HIP_SUPPORT) +#include +#endif // PARSEC_HAVE_DEV_CUDA_SUPPORT + +namespace ttg_parsec { + namespace detail { + template + inline bool register_device_memory(std::tuple &views, std::index_sequence) { + static_assert(I < MAX_PARAM_COUNT, + "PaRSEC only supports MAX_PARAM_COUNT device input/outputs. " + "Increase MAX_PARAM_COUNT and recompile PaRSEC/TTG."); + using view_type = std::remove_reference_t>>; + parsec_ttg_task_base_t *caller = detail::parsec_ttg_caller; + assert(nullptr != caller->dev_ptr); + parsec_gpu_task_t *gpu_task = caller->dev_ptr->gpu_task; + parsec_flow_t *flows = caller->dev_ptr->flows; + + auto& view = std::get(views); + bool is_current = false; + static_assert(ttg::meta::is_buffer_v || ttg::meta::is_devicescratch_v); + /* get_parsec_data is overloaded for buffer and devicescratch */ + parsec_data_t* data = detail::get_parsec_data(view); + /* TODO: check whether the device is current */ + + auto access = PARSEC_FLOW_ACCESS_RW; + if constexpr (std::is_const_v) { + // keep the flow at RW if it was RW to make sure we pull the data back out eventually + //if (flows[I].flow_flags != PARSEC_FLOW_ACCESS_RW) { + access = PARSEC_FLOW_ACCESS_READ; + //} + } else if constexpr (ttg::meta::is_devicescratch_v) { + if (view.scope() == ttg::scope::Allocate) { + access = PARSEC_FLOW_ACCESS_WRITE; + } + } + + //std::cout << "register_device_memory task " << detail::parsec_ttg_caller << " data " << I << " " + // << data << " size " << data->nb_elts << std::endl; + + /* build the flow */ + /* TODO: reuse the flows of the task class? How can we control the sync direction then? 
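+       * (per-task flows are rebuilt here so that the access mode computed
+       *  above can differ from invocation to invocation)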
*/ + flows[I] = parsec_flow_t{.name = nullptr, + .sym_type = PARSEC_SYM_INOUT, + .flow_flags = static_cast(access), + .flow_index = I, + .flow_datatype_mask = ~0 }; + + gpu_task->flow_nb_elts[I] = data->nb_elts; // size in bytes + gpu_task->flow[I] = &flows[I]; + + /* set the input data copy, parsec will take care of the transfer + * and the buffer will look at the parsec_data_t for the current pointer */ + //detail::parsec_ttg_caller->parsec_task.data[I].data_in = data->device_copies[data->owner_device]; + assert(nullptr != data->device_copies[0]->original); + caller->parsec_task.data[I].data_in = data->device_copies[0]; + caller->parsec_task.data[I].source_repo_entry = NULL; + + if constexpr (sizeof...(Is) > 0) { + is_current |= register_device_memory(views, std::index_sequence{}); + } + return is_current; + } + } // namespace detail + + /* Takes a tuple of ttg::Views or ttg::buffers and register them + * with the currently executing task. Returns true if all memory + * is current on the target device, false if transfers are required. */ + template + inline bool register_device_memory(std::tuple &views) { + bool is_current = true; + if (nullptr == detail::parsec_ttg_caller) { + throw std::runtime_error("register_device_memory may only be invoked from inside a task!"); + } + + if (nullptr == detail::parsec_ttg_caller->dev_ptr) { + throw std::runtime_error("register_device_memory called inside a non-gpu task!"); + } + + if constexpr (sizeof...(Views) > 0) { + is_current = detail::register_device_memory(views, std::index_sequence_for{}); + } + + /* reset all entries in the current task */ + for (int i = sizeof...(Views); i < MAX_PARAM_COUNT; ++i) { + detail::parsec_ttg_caller->parsec_task.data[i].data_in = nullptr; + detail::parsec_ttg_caller->dev_ptr->flows[i].flow_flags = PARSEC_FLOW_ACCESS_NONE; + detail::parsec_ttg_caller->dev_ptr->flows[i].flow_index = i; + detail::parsec_ttg_caller->dev_ptr->gpu_task->flow[i] = &detail::parsec_ttg_caller->dev_ptr->flows[i]; + detail::parsec_ttg_caller->dev_ptr->gpu_task->flow_nb_elts[i] = 0; + } + + return is_current; + } + + namespace detail { + template + inline void mark_device_out(std::tuple &views, std::index_sequence) { + + using view_type = std::remove_reference_t>>; + auto& view = std::get(views); + + /* get_parsec_data is overloaded for buffer and devicescratch */ + parsec_data_t* data = detail::get_parsec_data(view); + parsec_gpu_task_t *gpu_task = detail::parsec_ttg_caller->dev_ptr->gpu_task; + parsec_gpu_exec_stream_t *stream = detail::parsec_ttg_caller->dev_ptr->stream; + + /* enqueue the transfer into the compute stream to come back once the compute and transfer are complete */ + parsec_device_gpu_module_t *device_module = detail::parsec_ttg_caller->dev_ptr->device; + device_module->memcpy_async(device_module, stream, + data->device_copies[0]->device_private, + data->device_copies[data->owner_device]->device_private, + data->nb_elts, parsec_device_gpu_transfer_direction_d2h); + + if constexpr (sizeof...(Is) > 0) { + // recursion + mark_device_out(views, std::index_sequence{}); + } + } + } // namespace detail + + template + inline void mark_device_out(std::tuple &b) { + + if (nullptr == detail::parsec_ttg_caller) { + throw std::runtime_error("mark_device_out may only be invoked from inside a task!"); + } + + if (nullptr == detail::parsec_ttg_caller->dev_ptr) { + throw std::runtime_error("mark_device_out called inside a non-gpu task!"); + } + + detail::mark_device_out(b, std::index_sequence_for{}); + } + + namespace detail { + + template + 
inline void post_device_out(std::tuple &views, std::index_sequence) { + + using view_type = std::remove_reference_t>>; + + if constexpr (!std::is_const_v) { + auto& view = std::get(views); + + /* get_parsec_data is overloaded for buffer and devicescratch */ + parsec_data_t* data = detail::get_parsec_data(view); + data->device_copies[0]->version = data->device_copies[data->owner_device]->version; + parsec_data_transfer_ownership_to_copy(data, 0, PARSEC_FLOW_ACCESS_READ); + } + + if constexpr (sizeof...(Is) > 0) { + // recursion + post_device_out(views, std::index_sequence{}); + } + } + } // namespace detail + template + inline void post_device_out(std::tuple &b) { + detail::post_device_out(b, std::index_sequence_for{}); + } + +} // namespace ttg_parsec + +#endif // TTG_PARSEC_DEVICEFUNC_H diff --git a/ttg/ttg/parsec/devicescratch.h b/ttg/ttg/parsec/devicescratch.h new file mode 100644 index 000000000..e2c3743aa --- /dev/null +++ b/ttg/ttg/parsec/devicescratch.h @@ -0,0 +1,145 @@ +#ifndef TTG_PARSEC_DEVICESCRATCH_H +#define TTG_PARSEC_DEVICESCRATCH_H + +// TODO: replace with short vector +#define TTG_PARSEC_MAX_NUM_DEVICES 4 + +#include +#include +#include +#include +#include +#include +#include "ttg/parsec/task.h" + +namespace ttg_parsec { + +namespace detail { + // fwd decl + template + parsec_data_t* get_parsec_data(const ttg_parsec::devicescratch&); +} // namespace detail + +/** + * Scratch-space for task-local variables. + * TTG will allocate memory on the device + * and transfer data in and out based on the scope. + */ +template +struct devicescratch { + + using element_type = std::decay_t; + + static_assert(std::is_trivially_copyable_v, + "Only trivially copyable types are supported for devices."); + static_assert(std::is_default_constructible_v, + "Only default constructible types are supported for devices."); + +private: + + parsec_data_t* m_data = nullptr; + ttg::scope m_scope; + + static parsec_data_t* create_parsec_data(void *ptr, size_t count) { + + parsec_data_t *data = parsec_data_create_with_type(nullptr, 0, ptr, + sizeof(element_type)*count, + parsec_datatype_int8_t); + data->device_copies[0]->flags |= PARSEC_DATA_FLAG_PARSEC_MANAGED; + data->device_copies[0]->coherency_state = PARSEC_DATA_COHERENCY_SHARED; + return data; + } + + void remove_from_flow() { + /* remove the scratch from the gpu-task flow */ + assert(nullptr != detail::parsec_ttg_caller); + parsec_task_t *parsec_task = &detail::parsec_ttg_caller->parsec_task; + parsec_flow_t *flows = detail::parsec_ttg_caller->dev_ptr->flows; + for (int i = 0; i < MAX_PARAM_COUNT; ++i) { + if (nullptr != parsec_task->data[i].data_in && parsec_task->data[i].data_in->original == m_data) { + flows[i].flow_flags = PARSEC_FLOW_ACCESS_NONE; // disable this flow + break; + } + } + } + + friend parsec_data_t* detail::get_parsec_data(const ttg_parsec::devicescratch&); + +public: + + /* Constructing a devicescratch using application-managed memory. + * The memory pointed to by ptr must be accessible during + * the life-time of the devicescratch. 
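+   *
+   * A minimal usage sketch (illustrative only, assuming the usual
+   * ttg::devicescratch alias maps to this backend's devicescratch):
+   *
+   *   double norm = 0.0;                          // task-local host variable
+   *   ttg::devicescratch<double> scratch(&norm);  // scope::SyncIn stages the host value in
+   *   // a device kernel may write through scratch.device_ptr();
+   *   // pass ttg::scope::Allocate to skip the initial host-to-device transfer.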
*/ + devicescratch(element_type* ptr, ttg::scope scope = ttg::scope::SyncIn, std::size_t count = 1) + : m_data(create_parsec_data(ptr, count)) + , m_scope(scope) + { + if (ttg::scope::SyncIn == scope) { + /* increment the version to force the first initial transfer */ + m_data->device_copies[0]->version = 1; + } + } + + /* don't allow moving */ + devicescratch(devicescratch&&) = delete; + + /* don't allow copying */ + devicescratch(const devicescratch& db) = delete; + + /* don't allow moving */ + devicescratch& operator=(devicescratch&&) = delete; + + /* don't allow copying */ + devicescratch& operator=(const devicescratch& db) = delete; + + ~devicescratch() { + /* remove data from flow */ + //remove_from_flow(); + if (nullptr != m_data) { + //parsec_data_destroy(m_data); + //parsec_data_copy_detach(m_data, parsec_data_get_copy(m_data, 0), 0); + //auto *copy = parsec_data_get_copy(m_data, 0); + //PARSEC_OBJ_RELEASE(copy); + } + //parsec_data_destroy(m_data); + m_data = nullptr; + } + + /* get the current device pointer */ + element_type* device_ptr() { + assert(is_valid()); + return static_cast(m_data->device_copies[m_data->owner_device]->device_private); + } + + /* get the current device pointer */ + const element_type* device_ptr() const { + assert(is_valid()); + return static_cast(m_data->device_copies[m_data->owner_device]->device_private); + } + + bool is_valid() const { + // TODO: how to get the current device + // return (m_data->owner_device == parsec_current_device); + return true; + } + + ttg::scope scope() const { + return m_scope; + } + + std::size_t size() const { + return (m_data->nb_elts / sizeof(element_type)); + } + +}; + +namespace detail { + template + parsec_data_t* get_parsec_data(const ttg_parsec::devicescratch& scratch) { + return const_cast(scratch.m_data); + } +} // namespace detail + +} // namespace ttg_parsec + +#endif // TTG_PARSEC_DEVICESCRATCH_H \ No newline at end of file diff --git a/ttg/ttg/parsec/fwd.h b/ttg/ttg/parsec/fwd.h index 4511a6231..d5bc8931e 100644 --- a/ttg/ttg/parsec/fwd.h +++ b/ttg/ttg/parsec/fwd.h @@ -28,6 +28,7 @@ namespace ttg_parsec { inline void ttg_finalize(); + [[noreturn]] static inline void ttg_abort(); inline ttg::World ttg_default_execution_context(); @@ -53,6 +54,39 @@ namespace ttg_parsec { template static void ttg_broadcast(ttg::World world, T &data, int source_rank); + /* device definitions */ + template> + struct Buffer; + + template + struct Ptr; + + template + struct devicescratch; + + template + struct TTValue; + + template + inline Ptr make_ptr(Args&&... args); + + template + inline Ptr> get_ptr(T&& obj); + + template + inline bool register_device_memory(std::tuple &views); + + template + inline void post_device_out(std::tuple &b); + + template + inline void mark_device_out(std::tuple &b); + +#if 0 + template + inline std::pair>...>> get_ptr(Args&&... 
args); +#endif + } // namespace ttg_parsec #endif // TTG_PARSEC_FWD_H diff --git a/ttg/ttg/parsec/import.h b/ttg/ttg/parsec/import.h index 7a5a05108..18ad7e89d 100644 --- a/ttg/ttg/parsec/import.h +++ b/ttg/ttg/parsec/import.h @@ -10,6 +10,7 @@ #define TTG_SELECTED_DEFAULT_IMPL parsec #define TTG_PARSEC_IMPORTED 1 #define TTG_IMPL_NS ttg_parsec +#define TTG_IMPL_DEVICE_SUPPORT 1 namespace ttg_parsec {} @@ -20,7 +21,6 @@ namespace ttg { constexpr const ttg::Runtime ttg_runtime = ttg::Runtime::PaRSEC; - } // namespace ttg #endif // TTG_PARSEC_IMPORT_H diff --git a/ttg/ttg/parsec/parsec-ext.h b/ttg/ttg/parsec/parsec-ext.h new file mode 100644 index 000000000..a7e5e5222 --- /dev/null +++ b/ttg/ttg/parsec/parsec-ext.h @@ -0,0 +1,7 @@ +#ifndef TTG_PARSEC_EXT_H +#define TTG_PARSEC_EXT_H + +/* HACK: we need this flag on a data copy to indicate whether it has been registered */ +#define TTG_PARSEC_DATA_FLAG_REGISTERED ((parsec_data_flag_t)1<<2) + +#endif // TTG_PARSEC_EXT_H \ No newline at end of file diff --git a/ttg/ttg/parsec/ptr.h b/ttg/ttg/parsec/ptr.h new file mode 100644 index 000000000..8184d2657 --- /dev/null +++ b/ttg/ttg/parsec/ptr.h @@ -0,0 +1,284 @@ +#ifndef TTG_PARSEC_PTR_H +#define TTG_PARSEC_PTR_H + +#include +#include + +#include "ttg/parsec/ttg_data_copy.h" +#include "ttg/parsec/thread_local.h" +#include "ttg/parsec/task.h" + +namespace ttg_parsec { + + // fwd decl + template + struct ptr; + + namespace detail { + /* fwd decl */ + template + inline ttg_data_copy_t *create_new_datacopy(Value &&value); + + struct ptr_impl { + using copy_type = detail::ttg_data_copy_t; + + private: + static inline std::unordered_map m_ptr_map; + static inline std::mutex m_ptr_map_mtx; + + copy_type *m_copy = nullptr; + + void drop_copy() { + std::cout << "ptr drop_copy " << m_copy << " ref " << m_copy->num_ref() << std::endl; + if (nullptr != m_copy && 1 == m_copy->drop_ref()) { + delete m_copy; + } + m_copy = nullptr; + } + + void register_self() { + /* insert ourselves from the list of ptr */ + std::lock_guard _{m_ptr_map_mtx}; + m_ptr_map.insert(std::pair{this, true}); + } + + void deregister_self() { + /* remove ourselves from the list of ptr */ + std::lock_guard _{m_ptr_map_mtx}; + if (m_ptr_map.contains(this)) { + m_ptr_map.erase(this); + } + } + + public: + ptr_impl(copy_type *copy) + : m_copy(copy) + { + register_self(); + m_copy->add_ref(); + std::cout << "ptr copy_obj ref " << m_copy->num_ref() << std::endl; + } + + copy_type* get_copy() const { + return m_copy; + } + + ptr_impl(const ptr_impl& p) + : m_copy(p.m_copy) + { + register_self(); + m_copy->add_ref(); + std::cout << "ptr cpy " << m_copy << " ref " << m_copy->num_ref() << std::endl; + } + + ptr_impl(ptr_impl&& p) + : m_copy(p.m_copy) + { + register_self(); + p.m_copy = nullptr; + std::cout << "ptr mov " << m_copy << " ref " << m_copy->num_ref() << std::endl; + } + + ~ptr_impl() { + deregister_self(); + drop_copy(); + } + + ptr_impl& operator=(const ptr_impl& p) + { + drop_copy(); + m_copy = p.m_copy; + m_copy->add_ref(); + std::cout << "ptr cpy " << m_copy << " ref " << m_copy->num_ref() << std::endl; + return *this; + } + + ptr_impl& operator=(ptr_impl&& p) { + drop_copy(); + m_copy = p.m_copy; + p.m_copy = nullptr; + std::cout << "ptr mov " << m_copy << " ref " << m_copy->num_ref() << std::endl; + return *this; + } + + bool is_valid() const { + return (nullptr != m_copy); + } + + void reset() { + drop_copy(); + } + + /* drop all currently registered ptr + * \note this function is not thread-safe + * and should only be called at 
the + * end of the execution, e.g., during finalize. + */ + static void drop_all_ptr() { + for(auto it : m_ptr_map) { + it.first->drop_copy(); + } + } + }; + + + template + ttg_parsec::detail::ttg_data_copy_t* get_copy(ttg_parsec::Ptr& p); + } // namespace detail + + // fwd decl + template + Ptr make_ptr(Args&&... args); + + // fwd decl + template + inline Ptr> get_ptr(T&& obj); + + template + struct Ptr { + + using value_type = std::decay_t; + + private: + using copy_type = detail::ttg_data_value_copy_t; + + std::unique_ptr m_ptr; + + /* only PaRSEC backend functions are allowed to touch our private parts */ + template + friend Ptr make_ptr(Args&&... args); + template + friend Ptr> get_ptr(S&& obj); + template + friend detail::ttg_data_copy_t* detail::get_copy(Ptr& p); + friend ttg::detail::value_copy_handler; + + /* only accessible by get_ptr and make_ptr */ + Ptr(detail::ptr_impl::copy_type *copy) + : m_ptr(new detail::ptr_impl(copy)) + { } + + copy_type* get_copy() const { + return static_cast(m_ptr->get_copy()); + } + + public: + + Ptr() = default; + + Ptr(const Ptr& p) + : Ptr(p.get_copy()) + { } + + Ptr(Ptr&& p) = default; + + ~Ptr() = default; + + Ptr& operator=(const Ptr& p) { + m_ptr.reset(new detail::ptr_impl(p.get_copy())); + return *this; + } + + Ptr& operator=(Ptr&& p) = default; + + value_type& operator*() const { + return **static_cast(m_ptr->get_copy()); + } + + value_type& operator->() const { + return **static_cast(m_ptr->get_copy()); + } + + bool is_valid() const { + return m_ptr && m_ptr->is_valid(); + } + + void reset() { + m_ptr.reset(); + } + }; + +#if 0 + namespace detail { + template + inline auto get_ptr(Arg&& obj) { + + for (int i = 0; i < detail::parsec_ttg_caller->data_count; ++i) { + detail::ttg_data_copy_t *copy = detail::parsec_ttg_caller->copies[i]; + if (nullptr != copy) { + if (copy->get_ptr() == &obj) { + bool is_ready = true; + /* TODO: how can we force-sync host and device? Current data could be on either. */ +#if 0 + /* check all tracked device data for validity */ + for (auto it : copy) { + parsec_data_t *data = *it; + for (int i = 0; i < parsec_nb_devices; ++i) { + if (nullptr != data->device_copies[i]) { + + } else { + is_ready = false; + } + } + } +#endif // 0 + return std::make_pair(is_ready, std::tuple{ttg_parsec::ptr>(copy)}); + } + } + } + + throw std::runtime_error("ttg::get_ptr called on an unknown object!"); + } + } + + template + inline std::pair>...>> get_ptr(Args&&... args) { + if (nullptr == detail::parsec_ttg_caller) { + throw std::runtime_error("ttg::get_ptr called outside of a task!"); + } + + bool ready = true; + auto fn = [&](auto&& arg){ + auto pair = get_ptr(std::forward(arg)); + ready &= pair.first; + return std::move(pair.second); + }; + std::tuple>...> tpl = {(fn(std::forward(args)))...}; + return {ready, std::move(tpl)}; + } +#endif // 0 + + template + inline Ptr> get_ptr(T&& obj) { + using ptr_type = Ptr>; + if (nullptr != detail::parsec_ttg_caller) { + for (int i = 0; i < detail::parsec_ttg_caller->data_count; ++i) { + detail::ttg_data_copy_t *copy = detail::parsec_ttg_caller->copies[i]; + if (nullptr != copy) { + if (copy->get_ptr() == &obj) { + return ptr_type(copy); + } + } + } + } + /* object not tracked, make a new ptr that is now tracked */ + detail::ttg_data_copy_t *copy = detail::create_new_datacopy(obj); + return ptr_type(copy); + } + + template + inline Ptr make_ptr(Args&&... 
args) { + detail::ttg_data_copy_t *copy = detail::create_new_datacopy(T(std::forward(args)...)); + return Ptr(copy); + } + + namespace detail { + template + inline detail::ttg_data_copy_t* get_copy(ttg_parsec::Ptr& p) { + return p.get_copy(); + } + } // namespace detail + +} // namespace ttg_parsec + +#endif // TTG_PARSEC_PTR_H \ No newline at end of file diff --git a/ttg/ttg/parsec/task.h b/ttg/ttg/parsec/task.h new file mode 100644 index 000000000..5b23d53af --- /dev/null +++ b/ttg/ttg/parsec/task.h @@ -0,0 +1,328 @@ +#ifndef TTG_PARSEC_TASK_H +#define TTG_PARSEC_TASK_H + +#include "ttg/parsec/ttg_data_copy.h" + +#include +#include + +namespace ttg_parsec { + + namespace detail { + + struct device_ptr_t { + parsec_gpu_task_t* gpu_task = nullptr; + parsec_flow_t* flows = nullptr; + parsec_gpu_exec_stream_t* stream = nullptr; + parsec_device_gpu_module_t* device = nullptr; + }; + + template + struct device_state_t + { + static constexpr bool support_device = false; + static constexpr size_t num_flows = 0; + device_state_t() + { } + static constexpr device_ptr_t* dev_ptr() { + return nullptr; + } + }; + + template<> + struct device_state_t { + static constexpr bool support_device = false; + static constexpr size_t num_flows = MAX_PARAM_COUNT; + parsec_flow_t m_flows[num_flows]; + device_ptr_t m_dev_ptr = {nullptr, &m_flows[0], nullptr, nullptr}; // gpu_task will be allocated in each task + device_ptr_t* dev_ptr() { + return &m_dev_ptr; + } + }; + + enum class ttg_parsec_data_flags : uint8_t { + NONE = 0, + SINGLE_READER = 1 << 0, + MULTIPLE_READER = 1 << 1, + SINGLE_WRITER = 1 << 2, + MULTIPLE_WRITER = 1 << 3, + IS_MODIFIED = 1 << 4, + MARKED_PUSHOUT = 1 << 5 + }; + + inline + ttg_parsec_data_flags operator|(ttg_parsec_data_flags lhs, ttg_parsec_data_flags rhs) { + using flags_type = std::underlying_type::type; + return ttg_parsec_data_flags(static_cast(lhs) | static_cast(rhs)); + } + + inline + ttg_parsec_data_flags operator|=(ttg_parsec_data_flags lhs, ttg_parsec_data_flags rhs) { + using flags_type = std::underlying_type::type; + return ttg_parsec_data_flags(static_cast(lhs) | static_cast(rhs)); + } + + inline + uint8_t operator&(ttg_parsec_data_flags lhs, ttg_parsec_data_flags rhs) { + using flags_type = std::underlying_type::type; + return static_cast(lhs) & static_cast(rhs); + } + + inline + bool operator!(ttg_parsec_data_flags lhs) { + using flags_type = std::underlying_type::type; + return lhs == ttg_parsec_data_flags::NONE; + } + + + typedef parsec_hook_return_t (*parsec_static_op_t)(void *); // static_op will be cast to this type + + struct parsec_ttg_task_base_t { + parsec_task_t parsec_task; + int32_t in_data_count = 0; //< number of satisfied inputs + int32_t data_count = 0; //< number of data elements in the copies array + ttg_data_copy_t **copies; //< pointer to the fixed copies array of the derived task + parsec_hash_table_item_t tt_ht_item = {}; + + struct stream_info_t { + std::size_t goal; + std::size_t size; + parsec_lifo_t reduce_copies; + std::atomic reduce_count; + }; + + protected: + template + void init_stream_info_impl(TT *tt, std::array& streams) { + if constexpr (TT::numins > i) { + if (std::get(tt->input_reducers)) { + streams[i].goal = tt->static_stream_goal[i]; + streams[i].size = 0; + PARSEC_OBJ_CONSTRUCT(&streams[i].reduce_copies, parsec_lifo_t); + streams[i].reduce_count.store(0, std::memory_order_relaxed); + } + /* recursion */ + if constexpr((i + 1) < TT::numins) { + init_stream_info_impl(tt, streams); + } + } + } + + template + void init_stream_info(TT 
*tt, std::array& streams) { + init_stream_info_impl<0>(tt, streams); + } + + public: + typedef void (release_task_fn)(parsec_ttg_task_base_t*); + /* Poor-man's virtual function + * We cannot use virtual inheritance or private visibility because we + * need offsetof for the mempool and scheduling. + */ + release_task_fn* release_task_cb = nullptr; + device_ptr_t* dev_ptr = nullptr; + bool remove_from_hash = true; + bool dummy = false; + bool defer_writer = TTG_PARSEC_DEFER_WRITER; // whether to defer writer instead of creating a new copy + ttg_parsec_data_flags data_flags; // HACKY: flags set by prepare_send and reset by the copy_handler + + /* + virtual void release_task() = 0; + */ + //public: + void release_task() { + release_task_cb(this); + } + + protected: + /** + * Protected constructors: this class should not be instantiated directly + * but always be used through parsec_ttg_task_t. + */ + + parsec_ttg_task_base_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class, + int data_count, ttg_data_copy_t **copies, + bool defer_writer = TTG_PARSEC_DEFER_WRITER) + : data_count(data_count) + , copies(copies) + , defer_writer(defer_writer) { + PARSEC_LIST_ITEM_SINGLETON(&parsec_task.super); + parsec_task.mempool_owner = mempool; + parsec_task.task_class = task_class; + parsec_task.priority = 0; + } + + parsec_ttg_task_base_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class, + parsec_taskpool_t *taskpool, int32_t priority, + int data_count, ttg_data_copy_t **copies, + release_task_fn *release_fn, + bool defer_writer = TTG_PARSEC_DEFER_WRITER) + : data_count(data_count) + , copies(copies) + , release_task_cb(release_fn) + , defer_writer(defer_writer) { + PARSEC_LIST_ITEM_SINGLETON(&parsec_task.super); + parsec_task.mempool_owner = mempool; + parsec_task.task_class = task_class; + parsec_task.status = PARSEC_TASK_STATUS_HOOK; + parsec_task.taskpool = taskpool; + parsec_task.priority = priority; + parsec_task.chore_mask = 1<<0; + } + + public: + void set_dummy(bool d) { dummy = d; } + bool is_dummy() { return dummy; } + }; + + template > + struct parsec_ttg_task_t : public parsec_ttg_task_base_t { + using key_type = typename TT::key_type; + static constexpr size_t num_streams = TT::numins; + /* device tasks may have to store more copies than they have inputs as their sends are aggregated */ + static constexpr size_t num_copies = TT::derived_has_device_op() ?
static_cast(MAX_PARAM_COUNT) + : (num_streams+1); + TT* tt = nullptr; + key_type key; + std::array streams; +#ifdef TTG_HAS_COROUTINE + void* suspended_task_address = nullptr; // if not null the function is suspended + ttg::TaskCoroutineID coroutine_id = ttg::TaskCoroutineID::Invalid; +#endif + device_state_t dev_state; + ttg_data_copy_t *copies[num_copies] = { nullptr }; // the data copies tracked by this task + + parsec_ttg_task_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class) + : parsec_ttg_task_base_t(mempool, task_class, num_streams, copies) { + tt_ht_item.key = pkey(); + this->dev_ptr = this->dev_state.dev_ptr(); + // We store the hash of the key and the address where it can be found in locals considered as a scratchpad + *(uintptr_t*)&(parsec_task.locals[0]) = 0; //there is no key + *(uintptr_t*)&(parsec_task.locals[2]) = 0; //there is no key + } + + parsec_ttg_task_t(const key_type& key, parsec_thread_mempool_t *mempool, + parsec_task_class_t *task_class, parsec_taskpool_t *taskpool, + TT *tt_ptr, int32_t priority) + : parsec_ttg_task_base_t(mempool, task_class, taskpool, priority, + num_streams, copies, + &release_task, tt_ptr->m_defer_writer) + , tt(tt_ptr), key(key) { + tt_ht_item.key = pkey(); + this->dev_ptr = this->dev_state.dev_ptr(); + + // We store the hash of the key and the address where it can be found in locals considered as a scratchpad + uint64_t hv = ttg::hash>{}(key); + *(uintptr_t*)&(parsec_task.locals[0]) = hv; + *(uintptr_t*)&(parsec_task.locals[2]) = reinterpret_cast(&this->key); + + init_stream_info(tt, streams); + } + + static void release_task(parsec_ttg_task_base_t* task_base) { + parsec_ttg_task_t *task = static_cast(task_base); + TT *tt = task->tt; + tt->release_task(task); + } + + template + parsec_hook_return_t invoke_op() { + if constexpr (Space == ttg::ExecutionSpace::Host) { + return TT::template static_op(&this->parsec_task); + } else { + return TT::template device_static_op(&this->parsec_task); + } + } + + parsec_key_t pkey() { return reinterpret_cast(&key); } + }; + + template + struct parsec_ttg_task_t : public parsec_ttg_task_base_t { + static constexpr size_t num_streams = TT::numins; + TT* tt = nullptr; + std::array streams; +#ifdef TTG_HAS_COROUTINE + void* suspended_task_address = nullptr; // if not null the function is suspended + ttg::TaskCoroutineID coroutine_id = ttg::TaskCoroutineID::Invalid; +#endif + device_state_t dev_state; + ttg_data_copy_t *copies[num_streams+1] = { nullptr }; // the data copies tracked by this task + // +1 for the copy needed during send/bcast + + parsec_ttg_task_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class) + : parsec_ttg_task_base_t(mempool, task_class, num_streams, copies) { + tt_ht_item.key = pkey(); + this->dev_ptr = this->dev_state.dev_ptr(); + } + + parsec_ttg_task_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class, + parsec_taskpool_t *taskpool, TT *tt_ptr, int32_t priority) + : parsec_ttg_task_base_t(mempool, task_class, taskpool, priority, + num_streams, copies, + &release_task, tt_ptr->m_defer_writer) + , tt(tt_ptr) { + tt_ht_item.key = pkey(); + this->dev_ptr = this->dev_state.dev_ptr(); + init_stream_info(tt, streams); + } + + static void release_task(parsec_ttg_task_base_t* task_base) { + parsec_ttg_task_t *task = static_cast(task_base); + TT *tt = task->tt; + tt->release_task(task); + } + + template + parsec_hook_return_t invoke_op() { + if constexpr (Space == ttg::ExecutionSpace::Host) { + return TT::template 
static_op(&this->parsec_task); + } else { + return TT::template device_static_op(&this->parsec_task); + } + } + + parsec_key_t pkey() { return 0; } + }; + + + /** + * Reducer task representing one or more stream reductions. + * A reducer task may be deferred on its first input (the object into which + * all other inputs are folded). Once that input becomes available the task + * is submitted and reduces all available inputs. Additional reducer tasks may + * be submitted until all required inputs have been processed. + */ + struct reducer_task_t : public parsec_ttg_task_base_t { + parsec_ttg_task_base_t *parent_task; + bool is_first; + + reducer_task_t(parsec_ttg_task_base_t* task, parsec_thread_mempool_t *mempool, + parsec_task_class_t *task_class, parsec_taskpool_t *taskpool, + int32_t priority, bool is_first) + : parsec_ttg_task_base_t(mempool, task_class, taskpool, priority, + 0, nullptr, + &release_task, + true /* deferred until other readers have completed */) + , parent_task(task) + , is_first(is_first) + { + /* store the first 4 integers from the parent task (needed for profiling) */ + for (int i = 0; i < 4; ++i) { + parsec_task.locals[i] = task->parsec_task.locals[i]; + } + } + + static void release_task(parsec_ttg_task_base_t* task_base) { + /* reducer tasks have one mutable input so the task can be submitted on the first release */ + parsec_task_t *vp_task_rings[1] = { &task_base->parsec_task }; + parsec_execution_stream_t *es = parsec_my_execution_stream(); + __parsec_schedule_vp(es, vp_task_rings, 0); + } + }; + + } // namespace detail + +} // namespace ttg_parsec + +#endif // TTG_PARSEC_TASK_H diff --git a/ttg/ttg/parsec/thread_local.h b/ttg/ttg/parsec/thread_local.h new file mode 100644 index 000000000..54b98885e --- /dev/null +++ b/ttg/ttg/parsec/thread_local.h @@ -0,0 +1,22 @@ +#ifndef TTG_PARSEC_THREAD_LOCAL_H +#define TTG_PARSEC_THREAD_LOCAL_H + +namespace ttg_parsec { + +namespace detail { + + // fwd decls + struct parsec_ttg_task_base_t; + struct ttg_data_copy_t; + + inline thread_local parsec_ttg_task_base_t *parsec_ttg_caller = nullptr; + + inline ttg_data_copy_t*& ttg_data_copy_container() { + static thread_local ttg_data_copy_t *ptr = nullptr; + return ptr; + } + +} // namespace detail +} // namespace ttg_parsec + +#endif // TTG_PARSEC_THREAD_LOCAL_H \ No newline at end of file diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index aa624e106..65e39991a 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -7,6 +7,11 @@ #define TTG_USE_PARSEC 1 #endif // !defined(TTG_IMPL_NAME) +/* Whether to defer a potential writer if there are readers. + * This may avoid extra copies in exchange for concurrency. + * This may cause deadlocks, so use with caution. 
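 + * (e.g., a writer deferred behind a reader deadlocks if that reader in turn
 + * depends, directly or transitively, on the deferred writer's output).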
*/ +#define TTG_PARSEC_DEFER_WRITER false + #include "ttg/impl_selector.h" /* include ttg header to make symbols available in case this header is included directly */ @@ -28,11 +33,20 @@ #include "ttg/util/print.h" #include "ttg/util/trace.h" #include "ttg/util/typelist.h" +#ifdef TTG_HAVE_DEVICE +#include "ttg/device/task.h" +#endif // TTG_HAVE_DEVICE #include "ttg/serialization/data_descriptor.h" #include "ttg/parsec/fwd.h" +#include "ttg/parsec/buffer.h" +#include "ttg/parsec/devicescratch.h" +#include "ttg/parsec/thread_local.h" +#include "ttg/parsec/devicefunc.h" +#include "ttg/parsec/ttvalue.h" + #include #include #include @@ -45,11 +59,21 @@ #include #include #include +#include #include #include #include #include +// needed for MPIX_CUDA_AWARE_SUPPORT +#if defined(TTG_HAVE_MPI) +#include +#if defined(TTG_HAVE_MPIEXT) +#include +#endif // TTG_HAVE_MPIEXT +#endif // TTG_HAVE_MPI + + #include #include #include @@ -59,6 +83,19 @@ #include #include #include +#include + +#ifdef PARSEC_HAVE_DEV_CUDA_SUPPORT +#include +#endif // PARSEC_HAVE_DEV_CUDA_SUPPORT +#ifdef PARSEC_HAVE_DEV_HIP_SUPPORT +#include +#endif // PARSEC_HAVE_DEV_HIP_SUPPORT +#ifdef PARSEC_HAVE_DEV_LEVEL_ZERO_SUPPORT +#include +#endif //PARSEC_HAVE_DEV_LEVEL_ZERO_SUPPORT + +#include #if defined(PARSEC_PROF_TRACE) #include #undef PARSEC_TTG_PROFILE_BACKEND @@ -69,22 +106,19 @@ #include #include -#include "ttg/parsec/ttg_data_copy.h" - -/* This is missing in the parsec_comm_engine.h interface... But we are discussing - if this execution stream should be exposed. Workaround this for now. */ -extern "C" parsec_execution_stream_t parsec_comm_es; - -#undef TTG_PARSEC_DEBUG_TRACK_DATA_COPIES - #if defined(TTG_PARSEC_DEBUG_TRACK_DATA_COPIES) #include #endif -/* Whether to defer a potential writer if there are readers. - * This may avoid extra copies in exchange for concurrency. - * This may cause deadlocks, so use with caution. 
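A minimal usage sketch of the Ptr API defined in ptr.h above; `Tile` and the task body are hypothetical stand-ins, not part of this patch:

    // keep a datum alive across task invocations without deep copies (sketch)
    ttg_parsec::Ptr<Tile> cached;            // default-constructed: is_valid() == false

    void op(const int &key, Tile &&tile, auto &out) {
      // if the PaRSEC copy backing `tile` is already tracked by the current
      // task it is reused; otherwise get_ptr creates (and tracks) a new copy
      cached = ttg_parsec::get_ptr(tile);
      Tile &t = *cached;                     // dereference like a pointer
    }

    // construct a fresh tracked value in place, forwarding ctor arguments:
    auto p = ttg_parsec::make_ptr<Tile>(/*rows=*/128, /*cols=*/128);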
*/ -#define TTG_PARSEC_DEFER_WRITER false +#include "ttg/parsec/ttg_data_copy.h" +#include "ttg/parsec/thread_local.h" +#include "ttg/parsec/ptr.h" +#include "ttg/parsec/task.h" +#include "ttg/parsec/parsec-ext.h" + +#include "ttg/device/device.h" + +#undef TTG_PARSEC_DEBUG_TRACK_DATA_COPIES /* PaRSEC function declarations */ extern "C" { @@ -101,22 +135,58 @@ namespace ttg_parsec { inline std::multimap delayed_unpack_actions; struct msg_header_t { - typedef enum { + typedef enum fn_id : std::int8_t { + MSG_INVALID = -1, MSG_SET_ARG = 0, MSG_SET_ARGSTREAM_SIZE = 1, MSG_FINALIZE_ARGSTREAM_SIZE = 2, - MSG_GET_FROM_PULL =3 } fn_id_t; - uint32_t taskpool_id; - uint64_t op_id; - fn_id_t fn_id; - int32_t param_id; - int num_keys; + MSG_GET_FROM_PULL = 3 } fn_id_t; + uint32_t taskpool_id = -1; + uint64_t op_id = -1; + std::size_t key_offset = 0; + fn_id_t fn_id = MSG_INVALID; + std::int8_t num_iovecs = 0; + bool inline_data = false; + int32_t param_id = -1; + int num_keys = 0; + int sender = -1; + + msg_header_t() = default; + + msg_header_t(fn_id_t fid, uint32_t tid, uint64_t oid, int32_t pid, int sender, int nk) + : fn_id(fid) + , taskpool_id(tid) + , op_id(oid) + , param_id(pid) + , num_keys(nk) + , sender(sender) + { } }; static void unregister_parsec_tags(void *_); namespace detail { + constexpr const int PARSEC_TTG_MAX_AM_SIZE = 1 * 1024*1024; + + struct msg_t { + msg_header_t tt_id; + static constexpr std::size_t max_payload_size = PARSEC_TTG_MAX_AM_SIZE - sizeof(msg_header_t); + unsigned char bytes[max_payload_size]; + + msg_t() = default; + msg_t(uint64_t tt_id, + uint32_t taskpool_id, + msg_header_t::fn_id_t fn_id, + int32_t param_id, + int sender, + int num_keys = 1) + : tt_id(fn_id, taskpool_id, tt_id, param_id, sender, num_keys) + {} + }; + + inline std::size_t max_inline_size = msg_t::max_payload_size; + static int static_unpack_msg(parsec_comm_engine_t *ce, uint64_t tag, void *data, long unsigned int size, int src_rank, void *obj) { static_set_arg_fct_type static_set_arg_fct; @@ -160,6 +230,8 @@ namespace ttg_parsec { ttg::Edge<> m_ctl_edge; bool _dag_profiling; bool _task_profiling; + std::array(ttg::ExecutionSpace::Invalid)> + mpi_space_support = {true, false, false}; int query_comm_size() { int comm_size; @@ -173,6 +245,18 @@ namespace ttg_parsec { return comm_rank; } + static void ttg_parsec_ce_up(parsec_comm_engine_t *comm_engine, void *user_data) + { + parsec_ce.tag_register(WorldImpl::parsec_ttg_tag(), &detail::static_unpack_msg, user_data, detail::PARSEC_TTG_MAX_AM_SIZE); + parsec_ce.tag_register(WorldImpl::parsec_ttg_rma_tag(), &detail::get_remote_complete_cb, user_data, 128); + } + + static void ttg_parsec_ce_down(parsec_comm_engine_t *comm_engine, void *user_data) + { + parsec_ce.tag_unregister(WorldImpl::parsec_ttg_tag()); + parsec_ce.tag_unregister(WorldImpl::parsec_ttg_rma_tag()); + } + public: #if defined(PARSEC_PROF_TRACE) && defined(PARSEC_TTG_PROFILE_BACKEND) int parsec_ttg_profile_backend_set_arg_start, parsec_ttg_profile_backend_set_arg_end; @@ -180,7 +264,6 @@ namespace ttg_parsec { int parsec_ttg_profile_backend_allocate_datacopy, parsec_ttg_profile_backend_free_datacopy; #endif - static constexpr const int PARSEC_TTG_MAX_AM_SIZE = 1024 * 1024; WorldImpl(int *argc, char **argv[], int ncores, parsec_context_t *c = nullptr) : WorldImplBase(query_comm_size(), query_comm_rank()) , ctx(c) @@ -195,6 +278,23 @@ namespace ttg_parsec { ttg::detail::register_world(*this); if (own_ctx) ctx = parsec_init(ncores, argc, argv); + /* query MPI device support */ + if 
(ttg::detail::force_device_comm() +#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT + || MPIX_Query_cuda_support() +#endif // MPIX_CUDA_AWARE_SUPPORT + ) { + mpi_space_support[static_cast(ttg::ExecutionSpace::CUDA)] = true; + } + + if (ttg::detail::force_device_comm() +#if defined(MPIX_HIP_AWARE_SUPPORT) && MPIX_HIP_AWARE_SUPPORT + || MPIX_Query_hip_support() +#endif // MPIX_HIP_AWARE_SUPPORT + ) { + mpi_space_support[static_cast(ttg::ExecutionSpace::HIP)] = true; + } + #if defined(PARSEC_PROF_TRACE) if(parsec_profile_enabled) { profile_on(); @@ -214,7 +314,7 @@ namespace ttg_parsec { #endif if( NULL != parsec_ce.tag_register) { - parsec_ce.tag_register(WorldImpl::parsec_ttg_tag(), &detail::static_unpack_msg, this, PARSEC_TTG_MAX_AM_SIZE); + parsec_ce.tag_register(WorldImpl::parsec_ttg_tag(), &detail::static_unpack_msg, this, detail::PARSEC_TTG_MAX_AM_SIZE); parsec_ce.tag_register(WorldImpl::parsec_ttg_rma_tag(), &detail::get_remote_complete_cb, this, 128); } @@ -235,6 +335,13 @@ namespace ttg_parsec { tpool->taskpool_name = strdup("TTG Taskpool"); parsec_taskpool_reserve_id(tpool); + tpool->devices_index_mask = 0; + for(int i = 0; i < (int)parsec_nb_devices; i++) { + parsec_device_module_t *device = parsec_mca_device_get(i); + if( NULL == device ) continue; + tpool->devices_index_mask |= (1 << device->device_index); + } + #ifdef TTG_USE_USER_TERMDET parsec_termdet_open_module(tpool, "user_trigger"); #else // TTG_USE_USER_TERMDET @@ -386,6 +493,10 @@ namespace ttg_parsec { virtual bool profiling() override { return _task_profiling; } + bool mpi_support(ttg::ExecutionSpace space) { + return mpi_space_support[static_cast(space)]; + } + virtual void final_task() override { #ifdef TTG_USE_USER_TERMDET if(parsec_taskpool_started) { @@ -469,7 +580,7 @@ namespace ttg_parsec { #endif }; - inline void unregister_parsec_tags(void *_) + static void unregister_parsec_tags(void *_pidx) { if(NULL != parsec_ce.tag_unregister) { parsec_ce.tag_unregister(WorldImpl::parsec_ttg_tag()); @@ -479,8 +590,6 @@ namespace ttg_parsec { namespace detail { - typedef void (*parsec_static_op_t)(void *); // static_op will be cast to this type - const parsec_symbol_t parsec_taskclass_param0 = { .flags = PARSEC_SYMBOL_IS_STANDALONE|PARSEC_SYMBOL_IS_GLOBAL, .name = "HASH0", @@ -514,167 +623,14 @@ namespace ttg_parsec { .expr_inc = nullptr, .cst_inc = 0 }; - struct parsec_ttg_task_base_t { - parsec_task_t parsec_task; - int32_t in_data_count = 0; //< number of satisfied inputs - int32_t data_count = 0; //< number of data elements in parsec_task.data - parsec_hash_table_item_t tt_ht_item = {}; - parsec_static_op_t function_template_class_ptr[ttg::runtime_traits::num_execution_spaces] = - {nullptr}; - bool is_dummy = false; - bool defer_writer = TTG_PARSEC_DEFER_WRITER; // whether to defer writer instead of creating a new copy - - typedef void (release_task_fn)(parsec_ttg_task_base_t*); - - typedef struct { - std::size_t goal; - std::size_t size; - } size_goal_t; - - /* Poor-mans virtual function - * We cannot use virtual inheritance or private visibility because we - * need offsetof for the mempool and scheduling. - */ - release_task_fn* release_task_cb = nullptr; - bool remove_from_hash = true; - - /* - virtual void release_task() = 0; - */ - //public: - void release_task() { - release_task_cb(this); - } - - protected: - /** - * Protected constructors: this class should not be instantiated directly - * but always be use through parsec_ttg_task_t. 
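A simplified, self-contained sketch of the active-message layout used by msg_t above: a fixed-size buffer whose header decides whether payloads ride inline in the byte array or are fetched separately via RMA (field names abbreviated; the real header carries more routing state):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>
    #include <type_traits>

    constexpr std::size_t MAX_AM_SIZE = 1 * 1024 * 1024;  // mirrors PARSEC_TTG_MAX_AM_SIZE

    struct header_t {            // fixed-size routing info carried by every message
      std::uint64_t op_id;       // which TT instance this message addresses
      std::int32_t  param_id;    // which input terminal
      std::int8_t   num_iovecs;  // number of attached data segments
      bool          inline_data; // true: data lives in bytes[]; false: use RMA
    };

    struct am_msg_t {
      header_t      hdr;
      unsigned char bytes[MAX_AM_SIZE - sizeof(header_t)]; // keys + inline payload
    };

    // pack a trivially-copyable key into the payload, returning the next offset
    template <typename Key>
    std::size_t pack_key(am_msg_t &msg, std::size_t pos, const Key &key) {
      static_assert(std::is_trivially_copyable_v<Key>);
      std::memcpy(msg.bytes + pos, &key, sizeof(Key));
      return pos + sizeof(Key);
    }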
- */ - - parsec_ttg_task_base_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class, int data_count, - bool defer_writer = TTG_PARSEC_DEFER_WRITER) - : data_count(data_count), defer_writer(defer_writer) { - PARSEC_LIST_ITEM_SINGLETON(&parsec_task.super); - parsec_task.mempool_owner = mempool; - parsec_task.task_class = task_class; - parsec_task.priority = 0; - } - - parsec_ttg_task_base_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class, - parsec_taskpool_t *taskpool, int32_t priority, int data_count, - release_task_fn *release_fn, - bool defer_writer = TTG_PARSEC_DEFER_WRITER) - : data_count(data_count) - , defer_writer(defer_writer) - , release_task_cb(release_fn) { - int32_t p = priority; - PARSEC_LIST_ITEM_SINGLETON(&parsec_task.super); - parsec_task.mempool_owner = mempool; - parsec_task.task_class = task_class; - parsec_task.status = PARSEC_TASK_STATUS_HOOK; - parsec_task.taskpool = taskpool; - parsec_task.priority = priority; - parsec_task.chore_mask = 1<<0; - } - - public: - void set_dummy(bool d) { is_dummy = d; } - bool dummy() { return is_dummy; } - }; - - template > - struct parsec_ttg_task_t : public parsec_ttg_task_base_t { - using key_type = typename TT::key_type; - static constexpr size_t num_streams = TT::numins; - TT* tt; - key_type key; - size_goal_t stream[num_streams] = {}; - - parsec_ttg_task_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class) - : parsec_ttg_task_base_t(mempool, task_class, num_streams) { - tt_ht_item.key = pkey(); - - for (int i = 0; i < num_streams; ++i) { - parsec_task.data[i].data_in = nullptr; - } - - // We store the hash of the key and the address where it can be found in locals considered as a scratchpad - *(uintptr_t*)&(parsec_task.locals[0]) = 0; //there is no key - *(uintptr_t*)&(parsec_task.locals[2]) = 0; //there is no key - } - - parsec_ttg_task_t(const key_type& key, parsec_thread_mempool_t *mempool, - parsec_task_class_t *task_class, parsec_taskpool_t *taskpool, - TT *tt_ptr, int32_t priority) - : parsec_ttg_task_base_t(mempool, task_class, taskpool, priority, - num_streams, &release_task, tt_ptr->m_defer_writer) - , tt(tt_ptr), key(key) { - tt_ht_item.key = pkey(); - - for (int i = 0; i < num_streams; ++i) { - parsec_task.data[i].data_in = nullptr; - } - - // We store the hash of the key and the address where it can be found in locals considered as a scratchpad - uint64_t hv = ttg::hash>{}(key); - *(uintptr_t*)&(parsec_task.locals[0]) = hv; - *(uintptr_t*)&(parsec_task.locals[2]) = reinterpret_cast(&this->key); - } - - static void release_task(parsec_ttg_task_base_t* task_base) { - parsec_ttg_task_t *task = static_cast(task_base); - TT *tt = task->tt; - tt->release_task(task); - } - - parsec_key_t pkey() { return reinterpret_cast(&key); } - }; - - template - struct parsec_ttg_task_t : public parsec_ttg_task_base_t { - static constexpr size_t num_streams = TT::numins; - TT* tt; - size_goal_t stream[num_streams] = {}; - - parsec_ttg_task_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class) - : parsec_ttg_task_base_t(mempool, task_class, num_streams) { - tt_ht_item.key = pkey(); - - for (int i = 0; i < num_streams; ++i) { - parsec_task.data[i].data_in = nullptr; - } - } - - parsec_ttg_task_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class, - parsec_taskpool_t *taskpool, TT *tt_ptr, int32_t priority) - : parsec_ttg_task_base_t(mempool, task_class, taskpool, priority, - num_streams, &release_task, tt_ptr->m_defer_writer) - , tt(tt_ptr) { - tt_ht_item.key = 
pkey(); - - for (int i = 0; i < num_streams; ++i) { - parsec_task.data[i].data_in = nullptr; - } - } - - static void release_task(parsec_ttg_task_base_t* task_base) { - parsec_ttg_task_t *task = static_cast(task_base); - TT *tt = task->tt; - tt->release_task(task); - } - - parsec_key_t pkey() { return 0; } - }; - inline ttg_data_copy_t *find_copy_in_task(parsec_ttg_task_base_t *task, const void *ptr) { ttg_data_copy_t *res = nullptr; if (task == nullptr || ptr == nullptr) { return res; } for (int i = 0; i < task->data_count; ++i) { - auto copy = static_cast(task->parsec_task.data[i].data_in); - if (NULL != copy && copy->device_private == ptr) { + auto copy = static_cast(task->copies[i]); + if (NULL != copy && copy->get_ptr() == ptr) { res = copy; break; } @@ -688,8 +644,8 @@ namespace ttg_parsec { return i; } for (i = 0; i < task->data_count; ++i) { - auto copy = static_cast(task->parsec_task.data[i].data_in); - if (NULL != copy && copy->device_private == ptr) { + auto copy = static_cast(task->copies[i]); + if (NULL != copy && copy->get_ptr() == ptr) { return i; } } @@ -705,7 +661,7 @@ namespace ttg_parsec { throw std::logic_error("Too many data copies, check MAX_PARAM_COUNT!"); } - task->parsec_task.data[task->data_count].data_in = copy; + task->copies[task->data_count] = copy; task->data_count++; return true; } @@ -714,17 +670,17 @@ namespace ttg_parsec { int i; /* find and remove entry; copies are usually appended and removed, so start from back */ for (i = task->data_count-1; i >= 0; --i) { - if (copy == task->parsec_task.data[i].data_in) { + if (copy == task->copies[i]) { break; } } if (i < 0) return; /* move all following elements one up */ for (; i < task->data_count - 1; ++i) { - task->parsec_task.data[i].data_in = task->parsec_task.data[i + 1].data_in; + task->copies[i] = task->copies[i + 1]; } /* null last element */ - task->parsec_task.data[i].data_in = nullptr; + task->copies[i] = nullptr; task->data_count--; } @@ -740,7 +696,15 @@ namespace ttg_parsec { template inline ttg_data_copy_t *create_new_datacopy(Value &&value) { using value_type = std::decay_t; - ttg_data_copy_t *copy = new ttg_data_value_copy_t(std::forward(value)); + ttg_data_copy_t *copy; + if constexpr (std::is_base_of_v, value_type>) { + copy = new value_type(std::forward(value)); + } else if constexpr (std::is_rvalue_reference_v || + std::is_copy_constructible_v>) { + copy = new ttg_data_value_copy_t(std::forward(value)); + } else { + throw std::logic_error("Trying to copy-construct data that is not copy-constructible!"); + } #if defined(PARSEC_PROF_TRACE) && defined(PARSEC_TTG_PROFILE_BACKEND) // Keep track of additional memory usage if(ttg::default_execution_context().impl().profiling()) { @@ -762,21 +726,68 @@ namespace ttg_parsec { return copy; } +#if 0 + template + void invoke_pull_terminals(std::index_sequence, const Key &key, detail::parsec_ttg_task_base_t *task) { + int junk[] = {0, (invoke_pull_terminal( + std::get(input_terminals), key, task), + 0)...}; + junk[0]++; + } +#endif // 0 + + template + inline void transfer_ownership_impl(ttg_data_copy_t *copy, int device) { + if constexpr(!std::is_const_v>) { + copy->transfer_ownership(PARSEC_FLOW_ACCESS_RW, device); + } + } + + template + inline void transfer_ownership(parsec_ttg_task_t *me, int device, std::index_sequence) { + /* transfer ownership of each data */ + int junk[] = {0, (transfer_ownership_impl(me->copies[Is], device), 0)...}; + junk[0]++; + } + + template inline parsec_hook_return_t hook(struct parsec_execution_stream_s *es, parsec_task_t 
*parsec_task) { - parsec_ttg_task_base_t *me = (parsec_ttg_task_base_t *)parsec_task; - me->function_template_class_ptr[static_cast(ttg::ExecutionSpace::Host)](parsec_task); - return PARSEC_HOOK_RETURN_DONE; + parsec_ttg_task_t *me = (parsec_ttg_task_t *)parsec_task; + if constexpr(std::tuple_size_v > 0) { + transfer_ownership(me, 0, std::make_index_sequence>{}); + } + return me->template invoke_op(); } + template inline parsec_hook_return_t hook_cuda(struct parsec_execution_stream_s *es, parsec_task_t *parsec_task) { - parsec_ttg_task_base_t *me = (parsec_ttg_task_base_t *)parsec_task; - me->function_template_class_ptr[static_cast(ttg::ExecutionSpace::CUDA)](parsec_task); - return PARSEC_HOOK_RETURN_DONE; + if constexpr(TT::derived_has_cuda_op()) { + parsec_ttg_task_t *me = (parsec_ttg_task_t *)parsec_task; + return me->template invoke_op(); + } else { + throw std::runtime_error("PaRSEC CUDA hook invoked on a TT that does not support CUDA operations!"); + } + } + + template + inline parsec_hook_return_t hook_hip(struct parsec_execution_stream_s *es, parsec_task_t *parsec_task) { + if constexpr(TT::derived_has_hip_op()) { + parsec_ttg_task_t *me = (parsec_ttg_task_t *)parsec_task; + return me->template invoke_op(); + } else { + throw std::runtime_error("PaRSEC HIP hook invoked on a TT that does not support HIP operations!"); + } } - static parsec_key_fn_t parsec_tasks_hash_fcts = {.key_equal = parsec_hash_table_generic_64bits_key_equal, - .key_print = parsec_hash_table_generic_64bits_key_print, - .key_hash = parsec_hash_table_generic_64bits_key_hash}; + template + inline parsec_hook_return_t hook_level_zero(struct parsec_execution_stream_s *es, parsec_task_t *parsec_task) { + if constexpr(TT::derived_has_level_zero_op()) { + parsec_ttg_task_t *me = (parsec_ttg_task_t *)parsec_task; + return me->template invoke_op(); + } else { + throw std::runtime_error("PaRSEC Level Zero hook invoked on a TT that does not support Level Zero operations!"); + } + } template class rma_delayed_activate { @@ -831,7 +842,7 @@ } inline void release_data_copy(ttg_data_copy_t *copy) { - if (copy->is_mutable()) { + if (copy->is_mutable() && nullptr == copy->get_next_task()) { /* current task mutated the data but there are no consumers so prepare * the copy to be freed below */ copy->reset_readers(); @@ -841,19 +852,26 @@ if (readers > 1) { /* potentially more than one reader, decrement atomically */ readers = copy->decrement_readers(); + } else if (readers == 1) { + /* make sure readers drop to zero */ + readers = copy->decrement_readers(); } - /* if there was only one reader (the current task) we release the copy */ - if (1 == readers) { - if (nullptr != copy->push_task) { + /* if there was only one reader (the current task) or + * a mutable copy and a successor, we release the copy */ + if (1 == readers || readers == copy->mutable_tag) { + std::atomic_thread_fence(std::memory_order_acquire); + if (nullptr != copy->get_next_task()) { /* Release the deferred task. - * The copy was mutable and will be mutated by the released task, - * so simply transfer ownership. - */ - parsec_task_t *push_task = copy->push_task; - copy->push_task = nullptr; - parsec_ttg_task_base_t *deferred_op = (parsec_ttg_task_base_t *)push_task; + * The copy was mutable and will be mutated by the released task, + * so simply transfer ownership.
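The release path above is driven by one atomic reader counter per data copy, with a sentinel value standing for exclusive (mutable) ownership. A reduced sketch of that counting scheme with hypothetical names; the actual ttg_data_copy_t layers deferred-task hand-off on top:

    #include <atomic>
    #include <climits>

    struct copy_refcount_t {
      static constexpr int mutable_tag = INT_MIN; // sentinel: one writer owns the copy
      std::atomic<int> readers{1};                // creator starts as the only reader

      bool try_acquire_read() {                   // add a reader unless copy is mutable
        int r = readers.load(std::memory_order_relaxed);
        while (r > 0)
          if (readers.compare_exchange_weak(r, r + 1, std::memory_order_acq_rel))
            return true;
        return false;                             // held mutably: caller must clone
      }
      int  release_read() { return readers.fetch_sub(1, std::memory_order_acq_rel) - 1; }
      void mark_mutable() { readers.store(mutable_tag, std::memory_order_release); }
    };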
+ */ + parsec_task_t *next_task = copy->get_next_task(); + copy->set_next_task(nullptr); + parsec_ttg_task_base_t *deferred_op = (parsec_ttg_task_base_t *)next_task; + copy->mark_mutable(); deferred_op->release_task(); - } else { + } else if ((1 == copy->num_ref()) || (1 == copy->drop_ref())) { + /* we are the last reference, delete the copy */ #if defined(TTG_PARSEC_DEBUG_TRACK_DATA_COPIES) { const std::lock_guard lock(pending_copies_mutex); @@ -889,10 +907,10 @@ namespace ttg_parsec { } if (readers == copy_in->mutable_tag) { - if (copy_res->push_task != nullptr) { + if (copy_res->get_next_task() != nullptr) { if (readonly) { - parsec_ttg_task_base_t *push_task = reinterpret_cast(copy_res->push_task); - if (push_task->defer_writer) { + parsec_ttg_task_base_t *next_task = reinterpret_cast(copy_res->get_next_task()); + if (next_task->defer_writer) { /* there is a writer but it signalled that it wants to wait for readers to complete */ return copy_res; } @@ -926,14 +944,14 @@ namespace ttg_parsec { * no other readers, mark copy as mutable and defer the release * of the task */ + assert(nullptr == copy_in->get_next_task()); + copy_in->set_next_task(&task->parsec_task); + std::atomic_thread_fence(std::memory_order_release); copy_in->mark_mutable(); - assert(nullptr == copy_in->push_task); - assert(nullptr != task); - copy_in->push_task = &task->parsec_task; } else { - if (task->defer_writer && nullptr == copy_in->push_task) { + if (task->defer_writer && nullptr == copy_in->get_next_task()) { /* we're the first writer and want to wait for all readers to complete */ - copy_res->push_task = &task->parsec_task; + copy_res->set_next_task(&task->parsec_task); } else { /* there are writers and/or waiting already of this copy already, make a copy that we can mutate */ copy_res = NULL; @@ -942,19 +960,19 @@ namespace ttg_parsec { } if (NULL == copy_res) { - ttg_data_copy_t *new_copy = detail::create_new_datacopy(*static_cast(copy_in->device_private)); - if (replace && nullptr != copy_in->push_task) { + ttg_data_copy_t *new_copy = detail::create_new_datacopy(*static_cast(copy_in->get_ptr())); + if (replace && nullptr != copy_in->get_next_task()) { /* replace the task that was deferred */ - parsec_ttg_task_base_t *deferred_op = (parsec_ttg_task_base_t *)copy_in->push_task; + parsec_ttg_task_base_t *deferred_op = (parsec_ttg_task_base_t *)copy_in->get_next_task(); new_copy->mark_mutable(); /* replace the copy in the deferred task */ for (int i = 0; i < deferred_op->data_count; ++i) { - if (deferred_op->parsec_task.data[i].data_in == copy_in) { - deferred_op->parsec_task.data[i].data_in = new_copy; + if (deferred_op->copies[i] == copy_in) { + deferred_op->copies[i] = new_copy; break; } } - copy_in->push_task = nullptr; + copy_in->set_next_task(nullptr); deferred_op->release_task(); copy_in->reset_readers(); // set the copy back to being read-only copy_in->increment_readers(); // register as reader @@ -971,8 +989,6 @@ namespace ttg_parsec { } // namespace detail - inline thread_local detail::parsec_ttg_task_base_t *parsec_ttg_caller; - inline void ttg_initialize(int argc, char **argv, int num_threads, parsec_context_t *ctx) { if (detail::initialized_mpi()) throw std::runtime_error("ttg_parsec::ttg_initialize: can only be called once"); @@ -993,6 +1009,26 @@ namespace ttg_parsec { std::shared_ptr world_sptr{static_cast(world_ptr)}; ttg::World world{std::move(world_sptr)}; ttg::detail::set_default_world(std::move(world)); + + // query the first device ID + detail::first_device_id = -1; + for (int i = 0; i < 
parsec_nb_devices; ++i) { + bool is_gpu = parsec_mca_device_is_gpu(i); + if (detail::first_device_id == -1 && is_gpu) { + detail::first_device_id = i; + } else if (detail::first_device_id > -1 && !is_gpu) { + throw std::runtime_error("PaRSEC: Found non-GPU device in GPU ID range!"); + } + } + + /* parse the maximum inline size */ + const char* ttg_max_inline_cstr = std::getenv("TTG_MAX_INLINE"); + if (nullptr != ttg_max_inline_cstr) { + std::size_t inline_size = std::atol(ttg_max_inline_cstr); + if (inline_size < detail::max_inline_size) { + detail::max_inline_size = inline_size; + } + } } inline void ttg_finalize() { // We need to notify the current taskpool of termination if we are in user termination detection mode @@ -1000,11 +1036,13 @@ namespace ttg_parsec { if(0 == ttg::default_execution_context().rank()) ttg::default_execution_context().impl().final_task(); ttg::detail::set_default_world(ttg::World{}); // reset the default world + detail::ptr_impl::drop_all_ptr(); ttg::detail::destroy_worlds(); if (detail::initialized_mpi()) MPI_Finalize(); } inline ttg::World ttg_default_execution_context() { return ttg::get_default_world(); } - inline void ttg_abort() { MPI_Abort(ttg_default_execution_context().impl().comm(), 1); } + [[noreturn]] + inline void ttg_abort() { MPI_Abort(ttg_default_execution_context().impl().comm(), 1); std::abort(); } inline void ttg_execute(ttg::World world) { world.impl().execute(); } inline void ttg_fence(ttg::World world) { world.impl().fence(); } @@ -1069,14 +1107,6 @@ namespace ttg_parsec { parsec_task_class_t self; }; - struct msg_t { - msg_header_t tt_id; - unsigned char bytes[WorldImpl::PARSEC_TTG_MAX_AM_SIZE - sizeof(msg_header_t)]; - - msg_t() = default; - msg_t(uint64_t tt_id, uint32_t taskpool_id, msg_header_t::fn_id_t fn_id, int32_t param_id, int num_keys = 1) - : tt_id{taskpool_id, tt_id, fn_id, param_id, num_keys} {} - }; } // namespace detail template @@ -1098,7 +1128,13 @@ namespace ttg_parsec { // check for a non-type member named have_cuda_op template - using have_cuda_op_non_type_t = decltype(&T::have_cuda_op); + using have_cuda_op_non_type_t = decltype(T::have_cuda_op); + + template + using have_hip_op_non_type_t = decltype(T::have_hip_op); + + template + using have_level_zero_op_non_type_t = decltype(T::have_level_zero_op); bool alive = true; @@ -1107,6 +1143,7 @@ namespace ttg_parsec { static constexpr int numouts = std::tuple_size_v; // number of outputs static constexpr int numflows = std::max(numins, numouts); // max number of flows + public: /// @return true if derivedT::have_cuda_op exists and is defined to true static constexpr bool derived_has_cuda_op() { if constexpr (ttg::meta::is_detected_v) { @@ -1116,7 +1153,29 @@ namespace ttg_parsec { } } - public: + /// @return true if derivedT::have_hip_op exists and is defined to true + static constexpr bool derived_has_hip_op() { + if constexpr (ttg::meta::is_detected_v) { + return derivedT::have_hip_op; + } else { + return false; + } + } + + /// @return true if derivedT::have_hip_op exists and is defined to true + static constexpr bool derived_has_level_zero_op() { + if constexpr (ttg::meta::is_detected_v) { + return derivedT::have_level_zero_op; + } else { + return false; + } + } + + /// @return true if the TT supports device execution + static constexpr bool derived_has_device_op() { + return (derived_has_cuda_op() || derived_has_hip_op() || derived_has_level_zero_op()); + } + using ttT = TT; using key_type = keyT; using input_terminals_type = ttg::detail::input_terminals_tuple_t; @@ 
-1149,6 +1208,7 @@ namespace ttg_parsec { private: using task_t = detail::parsec_ttg_task_t; + friend detail::parsec_ttg_task_base_t; friend task_t; /* the offset of the key placed after the task structure in the memory from mempool */ @@ -1193,13 +1253,21 @@ namespace ttg_parsec { constexpr static std::array get_from_pull_msg_fcts = make_get_from_pull_fcts(std::make_index_sequence{}); + template + constexpr static auto make_input_is_const(std::index_sequence) { + using resultT = decltype(input_is_const); + return resultT{{std::is_const_v>...}}; + } + constexpr static std::array input_is_const = make_input_is_const(std::make_index_sequence{}); + ttg::World world; ttg::meta::detail::keymap_t keymap; ttg::meta::detail::keymap_t priomap; // For now use same type for unary/streaming input terminals, and stream reducers assigned at runtime ttg::meta::detail::input_reducers_t input_reducers; //!< Reducers for the input terminals (empty = expect single value) - std::array static_stream_goal; + std::array inpute_reducers_taskclass = { nullptr }; + std::array static_stream_goal = { std::numeric_limits::max() }; int num_pullins = 0; bool m_defer_writer = TTG_PARSEC_DEFER_WRITER; @@ -1208,16 +1276,21 @@ namespace ttg_parsec { ttg::World get_world() const override final { return world; } private: - /// dispatches a call to derivedT::op if Space == Host, otherwise to derivedT::op_cuda if Space == CUDA + /// dispatches a call to derivedT::op + /// @return void if called a synchronous function, or ttg::coroutine_handle<> if called a coroutine (if non-null, + /// points to the suspended coroutine) template - void op(Args &&...args) { + auto op(Args &&...args) { derivedT *derived = static_cast(this); - if constexpr (Space == ttg::ExecutionSpace::Host) - derived->op(std::forward(args)...); - else if constexpr (Space == ttg::ExecutionSpace::CUDA) - derived->op_cuda(std::forward(args)...); - else - abort(); + //if constexpr (Space == ttg::ExecutionSpace::Host) { + using return_type = decltype(derived->op(std::forward(args)...)); + if constexpr (std::is_same_v) { + derived->op(std::forward(args)...); + return; + } + else { + return derived->op(std::forward(args)...); + } } template @@ -1240,7 +1313,8 @@ namespace ttg_parsec { auto &world_impl = world.impl(); parsec_taskpool_t *tp = world_impl.taskpool(); std::unique_ptr msg = std::make_unique(get_instance_id(), tp->taskpool_id, - msg_header_t::MSG_GET_FROM_PULL, i, 1); + msg_header_t::MSG_GET_FROM_PULL, i, + world.rank(), 1); /* pack the key */ size_t pos = 0; pos = pack(key, msg->bytes, pos); @@ -1262,167 +1336,608 @@ namespace ttg_parsec { static input_refs_tuple_type make_tuple_of_ref_from_array(task_t *task, std::index_sequence) { return input_refs_tuple_type{static_cast>( *reinterpret_cast> *>( - task->parsec_task.data[IS].data_in->device_private))...}; + task->copies[IS]->get_ptr()))...}; } +#ifdef TTG_HAVE_DEVICE + /** + * Submit callback called by PaRSEC once all input transfers have completed. 
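 + * By this point the input flows have been filled in and PaRSEC has staged
 + * the corresponding copies onto the selected device; all that remains is to
 + * bind the execution stream and resume the suspended device coroutine.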
+ */ template - static void static_op(parsec_task_t *parsec_task) { - task_t *task = (task_t*)parsec_task; - ttT *baseobj = task->tt; - derivedT *obj = static_cast(baseobj); - assert(parsec_ttg_caller == NULL); - parsec_ttg_caller = static_cast(task); - if (obj->tracing()) { - if constexpr (!ttg::meta::is_void_v) - ttg::trace(obj->get_world().rank(), ":", obj->get_name(), " : ", task->key, ": executing"); - else - ttg::trace(obj->get_world().rank(), ":", obj->get_name(), " : executing"); - } - - if constexpr (!ttg::meta::is_void_v && !ttg::meta::is_empty_tuple_v) { - auto input = make_tuple_of_ref_from_array(task, std::make_index_sequence{}); - baseobj->template op(task->key, std::move(input), obj->output_terminals); - } else if constexpr (!ttg::meta::is_void_v && ttg::meta::is_empty_tuple_v) { - baseobj->template op(task->key, obj->output_terminals); - } else if constexpr (ttg::meta::is_void_v && !ttg::meta::is_empty_tuple_v) { - auto input = make_tuple_of_ref_from_array(task, std::make_index_sequence{}); - baseobj->template op(std::move(input), obj->output_terminals); - } else if constexpr (ttg::meta::is_void_v && ttg::meta::is_empty_tuple_v) { - baseobj->template op(obj->output_terminals); - } else { - abort(); - } - parsec_ttg_caller = NULL; + static int device_static_submit(parsec_device_gpu_module_t *gpu_device, + parsec_gpu_task_t *gpu_task, + parsec_gpu_exec_stream_t *gpu_stream) { - if (obj->tracing()) { - if constexpr (!ttg::meta::is_void_v) - ttg::trace(obj->get_world().rank(), ":", obj->get_name(), " : ", task->key, ": done executing"); - else - ttg::trace(obj->get_world().rank(), ":", obj->get_name(), " : done executing"); + task_t *task = (task_t*)gpu_task->ec; + // get the device task from the coroutine handle + ttg::device::Task dev_task = ttg::device::detail::device_task_handle_type::from_address(task->suspended_task_address); + + task->dev_ptr->stream = gpu_stream; + + //std::cout << "device_static_submit task " << task << std::endl; + + // get the promise which contains the views + auto dev_data = dev_task.promise(); + + /* we should still be waiting for the transfer to complete */ + assert(dev_data.state() == ttg::device::detail::TTG_DEVICE_CORO_WAIT_TRANSFER || + dev_data.state() == ttg::device::detail::TTG_DEVICE_CORO_WAIT_KERNEL); + +#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) && defined(TTG_HAVE_CUDA) + { + parsec_cuda_exec_stream_t *cuda_stream = (parsec_cuda_exec_stream_t *)gpu_stream; + int device = detail::parsec_device_to_ttg_device(gpu_device->super.device_index); + ttg::device::detail::set_current(device, cuda_stream->cuda_stream); } - } +#endif // defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) && defined(TTG_HAVE_CUDA) - template - static void static_op_noarg(parsec_task_t *parsec_task) { - task_t *task = static_cast(parsec_task); - ttT *baseobj = (ttT *)task->object_ptr; - derivedT *obj = (derivedT *)task->object_ptr; - assert(parsec_ttg_caller == NULL); - parsec_ttg_caller = task; - if constexpr (!ttg::meta::is_void_v) { - baseobj->template op(task->key, obj->output_terminals); - } else if constexpr (ttg::meta::is_void_v) { - baseobj->template op(obj->output_terminals); - } else - abort(); - parsec_ttg_caller = NULL; - } +#if defined(PARSEC_HAVE_DEV_HIP_SUPPORT) && defined(TTG_HAVE_HIP) + { + parsec_hip_exec_stream_t *hip_stream = (parsec_hip_exec_stream_t *)gpu_stream; + int device = detail::parsec_device_to_ttg_device(gpu_device->super.device_index); + ttg::device::detail::set_current(device, hip_stream->hip_stream); + } +#endif // 
defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) && defined(TTG_HAVE_CUDA) - protected: - template - uint64_t unpack(T &obj, void *_bytes, uint64_t pos) { - const ttg_data_descriptor *dObj = ttg::get_data_descriptor>(); - uint64_t payload_size; - if constexpr (!ttg::default_data_descriptor>::serialize_size_is_const) { - const ttg_data_descriptor *dSiz = ttg::get_data_descriptor(); - dSiz->unpack_payload(&payload_size, sizeof(uint64_t), pos, _bytes); - pos += sizeof(uint64_t); - } else { - payload_size = dObj->payload_size(&obj); +#if defined(PARSEC_HAVE_DEV_LEVEL_ZERO_SUPPORT) && defined(TTG_HAVE_LEVEL_ZERO) + { + parsec_level_zero_exec_stream_t *stream; + stream = (parsec_level_zero_exec_stream_t *)gpu_stream; + int device = detail::parsec_device_to_ttg_device(gpu_device->super.device_index); + ttg::device::detail::set_current(device, stream->swq->queue); } - dObj->unpack_payload(&obj, payload_size, pos, _bytes); - return pos + payload_size; - } +#endif // defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) && defined(TTG_HAVE_CUDA) - template - uint64_t pack(T &obj, void *bytes, uint64_t pos) { - const ttg_data_descriptor *dObj = ttg::get_data_descriptor>(); - uint64_t payload_size = dObj->payload_size(&obj); - if constexpr (!ttg::default_data_descriptor>::serialize_size_is_const) { - const ttg_data_descriptor *dSiz = ttg::get_data_descriptor(); - dSiz->pack_payload(&payload_size, sizeof(uint64_t), pos, bytes); - pos += sizeof(uint64_t); + /* Here we call back into the coroutine again after the transfers have completed */ + static_op(&task->parsec_task); + + ttg::device::detail::reset_current(); + + /* we will come back into this function once the kernel and transfers are done */ + int rc = PARSEC_HOOK_RETURN_DONE; + if (nullptr != task->suspended_task_address) { + /* Get a new handle for the promise*/ + dev_task = ttg::device::detail::device_task_handle_type::from_address(task->suspended_task_address); + dev_data = dev_task.promise(); + + assert(dev_data.state() == ttg::device::detail::TTG_DEVICE_CORO_WAIT_KERNEL || + dev_data.state() == ttg::device::detail::TTG_DEVICE_CORO_SENDOUT || + dev_data.state() == ttg::device::detail::TTG_DEVICE_CORO_COMPLETE); + + if (ttg::device::detail::TTG_DEVICE_CORO_SENDOUT == dev_data.state() || + ttg::device::detail::TTG_DEVICE_CORO_COMPLETE == dev_data.state()) { + /* the task started sending so we won't come back here */ + //std::cout << "device_static_submit task " << task << " complete" << std::endl; + } else { + //std::cout << "device_static_submit task " << task << " return-again" << std::endl; + rc = PARSEC_HOOK_RETURN_AGAIN; + } + } else { + /* the task is done so we won't come back here */ + //std::cout << "device_static_submit task " << task << " complete" << std::endl; } - dObj->pack_payload(&obj, payload_size, pos, bytes); - return pos + payload_size; + return rc; } - static void static_set_arg(void *data, std::size_t size, ttg::TTBase *bop) { - assert(size >= sizeof(msg_header_t) && - "Trying to unpack as message that does not hold enough bytes to represent a single header"); - msg_header_t *hd = static_cast(data); - derivedT *obj = reinterpret_cast(bop); - switch (hd->fn_id) { - case msg_header_t::MSG_SET_ARG: { - if (0 <= hd->param_id) { - assert(hd->param_id >= 0); - assert(hd->param_id < obj->set_arg_from_msg_fcts.size()); - auto member = obj->set_arg_from_msg_fcts[hd->param_id]; - (obj->*member)(data, size); - } else { - // there is no good reason to have negative param ids - abort(); + static void + static_device_stage_in(parsec_gpu_task_t *gtask, + uint32_t 
flow_mask, + parsec_gpu_exec_stream_t *gpu_stream) { + /* register any memory that hasn't been registered yet */ + for (int i = 0; i < MAX_PARAM_COUNT; ++i) { + if (flow_mask & (1<ec; + parsec_data_copy_t *copy = task->parsec_task.data[i].data_in; + if (0 == (copy->flags & TTG_PARSEC_DATA_FLAG_REGISTERED)) { +#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) + // register host memory for faster device access + cudaError_t status; + //status = cudaHostRegister(copy->device_private, gtask->flow_nb_elts[i], cudaHostRegisterPortable); + //assert(cudaSuccess == status); +#endif // PARSEC_HAVE_DEV_CUDA_SUPPORT + //copy->flags |= TTG_PARSEC_DATA_FLAG_REGISTERED; } - break; - } - case msg_header_t::MSG_SET_ARGSTREAM_SIZE: { - assert(hd->param_id >= 0); - assert(hd->param_id < obj->set_argstream_size_from_msg_fcts.size()); - auto member = obj->set_argstream_size_from_msg_fcts[hd->param_id]; - (obj->*member)(data, size); - break; - } - case msg_header_t::MSG_FINALIZE_ARGSTREAM_SIZE: { - assert(hd->param_id >= 0); - assert(hd->param_id < obj->finalize_argstream_from_msg_fcts.size()); - auto member = obj->finalize_argstream_from_msg_fcts[hd->param_id]; - (obj->*member)(data, size); - break; - } - case msg_header_t::MSG_GET_FROM_PULL: { - assert(hd->param_id >= 0); - assert(hd->param_id < obj->get_from_pull_msg_fcts.size()); - auto member = obj->get_from_pull_msg_fcts[hd->param_id]; - (obj->*member)(data, size); - break; } - default: - abort(); } } - /** Returns the task memory pool owned by the calling thread */ - inline parsec_thread_mempool_t *get_task_mempool(void) { - auto &world_impl = world.impl(); - parsec_execution_stream_s *es = world_impl.execution_stream(); - int index = (es->virtual_process->vp_id * es->virtual_process->nb_cores + es->th_id); - return &mempools.thread_mempools[index]; + static int + static_device_stage_in_hook(parsec_gpu_task_t *gtask, + uint32_t flow_mask, + parsec_gpu_exec_stream_t *gpu_stream) { + static_device_stage_in(gtask, flow_mask, gpu_stream); + return parsec_default_gpu_stage_in(gtask, flow_mask, gpu_stream); } - template - void set_arg_from_msg_keylist(ttg::span &&keylist, detail::ttg_data_copy_t *copy) { - /* create a dummy task that holds the copy, which can be reused by others */ - task_t *dummy; - parsec_execution_stream_s *es = world.impl().execution_stream(); - parsec_thread_mempool_t *mempool = get_task_mempool(); - dummy = new (parsec_thread_mempool_allocate(mempool)) task_t(mempool, &this->self); + template + static parsec_hook_return_t device_static_op(parsec_task_t* parsec_task) { + static_assert(derived_has_device_op()); + + int dev_index; + double ratio = 1.0; + + task_t *task = (task_t*)parsec_task; + parsec_execution_stream_s *es = task->tt->world.impl().execution_stream(); + + //std::cout << "device_static_op: task " << parsec_task << std::endl; + + /* set up a device task */ + parsec_gpu_task_t *gpu_task; + /* PaRSEC wants to free the gpu_task, because F***K ownerships */ + gpu_task = static_cast(std::calloc(1, sizeof(*gpu_task))); + PARSEC_OBJ_CONSTRUCT(gpu_task, parsec_list_item_t); + gpu_task->ec = parsec_task; + gpu_task->task_type = 0; // user task + gpu_task->load = 1; // TODO: can we do better? 
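 + // The fields below finish configuring the PaRSEC GPU task: the submit
 + // callback fires once stage-in completes, and the input flows are filled
 + // in lazily by register_device_memory during the first resumption of the
 + // device coroutine triggered by static_op() below.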
+ gpu_task->last_data_check_epoch = -1; // used internally + gpu_task->pushout = 0; + gpu_task->submit = &TT::device_static_submit; + + /* set the gpu_task so it's available in register_device_memory */ + task->dev_ptr->gpu_task = gpu_task; + + // first invocation of the coroutine to get the coroutine handle + static_op(parsec_task); + + /* when we come back here, the flows in gpu_task are set (see register_device_memory) */ + + if (nullptr == task->suspended_task_address) { + /* short-cut in case the task returned immediately */ + return PARSEC_HOOK_RETURN_DONE; + } + + // get the device task from the coroutine handle + auto dev_task = ttg::device::detail::device_task_handle_type::from_address(task->suspended_task_address); + + // get the promise which contains the views + ttg::device::detail::device_task_promise_type& dev_data = dev_task.promise(); + + /* for now make sure we're waiting for transfers and the coro hasn't skipped this step */ + assert(dev_data.state() == ttg::device::detail::TTG_DEVICE_CORO_WAIT_TRANSFER); + + /* set up a temporary task-class to correctly specify the flows */ + parsec_task_class_t tc = *task->parsec_task.task_class; + + tc.name = task->parsec_task.task_class->name; + // input flows are set up during register_device_memory as part of the first invocation above + for (int i = 0; i < MAX_PARAM_COUNT; ++i) { + tc.in[i] = gpu_task->flow[i]; + tc.out[i] = gpu_task->flow[i]; + } + tc.nb_flows = MAX_PARAM_COUNT; + + /* swap in the new task class */ + const parsec_task_class_t* tmp = task->parsec_task.task_class; + *const_cast(&task->parsec_task.task_class) = &tc; + + /* TODO: is this the right place to set the mask? */ + task->parsec_task.chore_mask = PARSEC_DEV_ALL; + /* get a device and come back if we need another one */ + int64_t task_load = 1; + dev_index = parsec_get_best_device(parsec_task, &task_load); + + /* swap back the original task class */ + task->parsec_task.task_class = tmp; + + gpu_task->load = task_load; + assert(dev_index >= 0); + if (!parsec_mca_device_is_gpu(dev_index)) { + return PARSEC_HOOK_RETURN_NEXT; /* Fall back */ + } + + parsec_device_gpu_module_t *device = (parsec_device_gpu_module_t*)parsec_mca_device_get(dev_index); + assert(NULL != device); + + task->dev_ptr->device = device; + + switch(device->super.type) { + +#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) + case PARSEC_DEV_CUDA: + if constexpr (Space == ttg::ExecutionSpace::CUDA) { + /* TODO: we need custom staging functions because PaRSEC looks at the + * task-class to determine the number of flows. 
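 + * As a workaround, a temporary task class whose nb_flows equals
 + * MAX_PARAM_COUNT is swapped in just for device selection below and
 + * swapped back immediately afterwards.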
*/ + gpu_task->stage_in = static_device_stage_in_hook; + gpu_task->stage_out = parsec_default_gpu_stage_out; + return parsec_device_kernel_scheduler(&device->super, es, gpu_task); + } + break; +#endif +#if defined(PARSEC_HAVE_DEV_HIP_SUPPORT) + case PARSEC_DEV_HIP: + if constexpr (Space == ttg::ExecutionSpace::HIP) { + gpu_task->stage_in = static_device_stage_in_hook; + gpu_task->stage_out = parsec_default_gpu_stage_out; + return parsec_device_kernel_scheduler(&device->super, es, gpu_task); + } + break; +#endif // PARSEC_HAVE_DEV_HIP_SUPPORT +#if defined(PARSEC_HAVE_DEV_LEVEL_ZERO_SUPPORT) + case PARSEC_DEV_LEVEL_ZERO: + if constexpr (Space == ttg::ExecutionSpace::L0) { + gpu_task->stage_in = static_device_stage_in_hook; + gpu_task->stage_out = parsec_default_gpu_stage_out; + return parsec_device_kernel_scheduler(&device->super, es, gpu_task); + } + break; +#endif // PARSEC_HAVE_DEV_LEVEL_ZERO_SUPPORT + default: + break; + } + ttg::print_error(task->tt->get_name(), " : received mismatching device type ", (int)device->super.type, " from PaRSEC"); + ttg::abort(); + return PARSEC_HOOK_RETURN_DONE; // will not be reacehed + } +#endif // TTG_HAVE_DEVICE + + template + static parsec_hook_return_t static_op(parsec_task_t *parsec_task) { + + task_t *task = (task_t*)parsec_task; + void* suspended_task_address = +#ifdef TTG_HAS_COROUTINE + task->suspended_task_address; // non-null = need to resume the task +#else + nullptr; +#endif + //std::cout << "static_op: suspended_task_address " << suspended_task_address << std::endl; + if (suspended_task_address == nullptr) { // task is a coroutine that has not started or an ordinary function + + ttT *baseobj = task->tt; + derivedT *obj = static_cast(baseobj); + assert(detail::parsec_ttg_caller == nullptr); + detail::parsec_ttg_caller = static_cast(task); + if (obj->tracing()) { + if constexpr (!ttg::meta::is_void_v) + ttg::trace(obj->get_world().rank(), ":", obj->get_name(), " : ", task->key, ": executing"); + else + ttg::trace(obj->get_world().rank(), ":", obj->get_name(), " : executing"); + } + + if constexpr (!ttg::meta::is_void_v && !ttg::meta::is_empty_tuple_v) { + auto input = make_tuple_of_ref_from_array(task, std::make_index_sequence{}); + TTG_PROCESS_TT_OP_RETURN(suspended_task_address, task->coroutine_id, baseobj->template op(task->key, std::move(input), obj->output_terminals)); + } else if constexpr (!ttg::meta::is_void_v && ttg::meta::is_empty_tuple_v) { + TTG_PROCESS_TT_OP_RETURN(suspended_task_address, task->coroutine_id, baseobj->template op(task->key, obj->output_terminals)); + } else if constexpr (ttg::meta::is_void_v && !ttg::meta::is_empty_tuple_v) { + auto input = make_tuple_of_ref_from_array(task, std::make_index_sequence{}); + TTG_PROCESS_TT_OP_RETURN(suspended_task_address, task->coroutine_id, baseobj->template op(std::move(input), obj->output_terminals)); + } else if constexpr (ttg::meta::is_void_v && ttg::meta::is_empty_tuple_v) { + TTG_PROCESS_TT_OP_RETURN(suspended_task_address, task->coroutine_id, baseobj->template op(obj->output_terminals)); + } else { + ttg::abort(); + } + detail::parsec_ttg_caller = nullptr; + } + else { // resume the suspended coroutine + + assert(task->coroutine_id != ttg::TaskCoroutineID::Invalid); + +#ifdef TTG_HAS_COROUTINE +#ifdef TTG_HAVE_DEVICE + if (task->coroutine_id == ttg::TaskCoroutineID::DeviceTask) { + ttg::device::Task coro = ttg::device::detail::device_task_handle_type::from_address(suspended_task_address); + assert(detail::parsec_ttg_caller == nullptr); + detail::parsec_ttg_caller = 
static_cast(task); + // TODO: unify the outputs tls handling + auto old_output_tls_ptr = task->tt->outputs_tls_ptr_accessor(); + task->tt->set_outputs_tls_ptr(); + coro.resume(); + if (coro.completed()) { + coro.destroy(); + suspended_task_address = nullptr; + } + task->tt->set_outputs_tls_ptr(old_output_tls_ptr); + detail::parsec_ttg_caller = nullptr; + } else +#endif // TTG_HAVE_DEVICE + if (task->coroutine_id == ttg::TaskCoroutineID::ResumableTask) { + auto ret = static_cast(ttg::coroutine_handle::from_address(suspended_task_address)); + assert(ret.ready()); + auto old_output_tls_ptr = task->tt->outputs_tls_ptr_accessor(); + task->tt->set_outputs_tls_ptr(); + ret.resume(); + if (ret.completed()) { + ret.destroy(); + suspended_task_address = nullptr; + } + else { // not yet completed + // leave suspended_task_address as is + + // right now can events are not properly implemented, we are only testing the workflow with dummy events + // so mark the events finished manually, parsec will rerun this task again and it should complete the second time + auto events = static_cast(ttg::coroutine_handle::from_address(suspended_task_address)).events(); + for (auto &event_ptr : events) { + event_ptr->finish(); + } + assert(ttg::coroutine_handle::from_address(suspended_task_address).promise().ready()); + } + task->tt->set_outputs_tls_ptr(old_output_tls_ptr); + detail::parsec_ttg_caller = nullptr; + } + else + ttg::abort(); // unrecognized task id +#else // TTG_HAS_COROUTINE +ttg::abort(); // should not happen +#endif // TTG_HAS_COROUTINE + } + task->suspended_task_address = suspended_task_address; + + if (suspended_task_address == nullptr) { + ttT *baseobj = task->tt; + derivedT *obj = static_cast(baseobj); + if (obj->tracing()) { + if constexpr (!ttg::meta::is_void_v) + ttg::trace(obj->get_world().rank(), ":", obj->get_name(), " : ", task->key, ": done executing"); + else + ttg::trace(obj->get_world().rank(), ":", obj->get_name(), " : done executing"); + } + } + + return PARSEC_HOOK_RETURN_DONE; + } + + template + static parsec_hook_return_t static_op_noarg(parsec_task_t *parsec_task) { + task_t *task = static_cast(parsec_task); + + void* suspended_task_address = +#ifdef TTG_HAS_COROUTINE + task->suspended_task_address; // non-null = need to resume the task +#else + nullptr; +#endif + if (suspended_task_address == nullptr) { // task is a coroutine that has not started or an ordinary function + ttT *baseobj = (ttT *)task->object_ptr; + derivedT *obj = (derivedT *)task->object_ptr; + assert(detail::parsec_ttg_caller == NULL); + detail::parsec_ttg_caller = task; + if constexpr (!ttg::meta::is_void_v) { + TTG_PROCESS_TT_OP_RETURN(suspended_task_address, task->coroutine_id, baseobj->template op(task->key, obj->output_terminals)); + } else if constexpr (ttg::meta::is_void_v) { + TTG_PROCESS_TT_OP_RETURN(suspended_task_address, task->coroutine_id, baseobj->template op(obj->output_terminals)); + } else // unreachable + ttg:: abort(); + detail::parsec_ttg_caller = NULL; + } + else { +#ifdef TTG_HAS_COROUTINE + auto ret = static_cast(ttg::coroutine_handle::from_address(suspended_task_address)); + assert(ret.ready()); + ret.resume(); + if (ret.completed()) { + ret.destroy(); + suspended_task_address = nullptr; + } + else { // not yet completed + // leave suspended_task_address as is + } +#else + ttg::abort(); // should not happen +#endif + } + task->suspended_task_address = suspended_task_address; + + if (suspended_task_address) { + ttg::abort(); // not yet implemented + // see comments in static_op() + return 
PARSEC_HOOK_RETURN_AGAIN; + } + else + return PARSEC_HOOK_RETURN_DONE; + } + + template + static parsec_hook_return_t static_reducer_op(parsec_execution_stream_s *es, parsec_task_t *parsec_task) { + using rtask_t = detail::reducer_task_t; + using value_t = std::tuple_element_t; + constexpr const bool val_is_void = ttg::meta::is_void_v; + constexpr const bool input_is_const = std::is_const_v; + rtask_t *rtask = (rtask_t*)parsec_task; + task_t *parent_task = static_cast(rtask->parent_task); + ttT *baseobj = parent_task->tt; + derivedT *obj = static_cast(baseobj); + + auto& reducer = std::get(baseobj->input_reducers); + + //std::cout << "static_reducer_op " << parent_task->key << std::endl; + + if (obj->tracing()) { + if constexpr (!ttg::meta::is_void_v) + ttg::trace(obj->get_world().rank(), ":", obj->get_name(), " : ", parent_task->key, ": reducer executing"); + else + ttg::trace(obj->get_world().rank(), ":", obj->get_name(), " : reducer executing"); + } + + /* the copy to reduce into */ + detail::ttg_data_copy_t *target_copy; + target_copy = parent_task->copies[i]; + assert(val_is_void || nullptr != target_copy); + /* once we hit 0 we have to stop since another thread might enqueue a new reduction task */ + std::size_t c = 0; + std::size_t size = 0; + assert(parent_task->streams[i].reduce_count > 0); + if (rtask->is_first) { + if (0 == (parent_task->streams[i].reduce_count.fetch_sub(1, std::memory_order_acq_rel)-1)) { + /* we were the first and there is nothing to be done */ + if (obj->tracing()) { + if constexpr (!ttg::meta::is_void_v) + ttg::trace(obj->get_world().rank(), ":", obj->get_name(), " : ", parent_task->key, ": first reducer empty"); + else + ttg::trace(obj->get_world().rank(), ":", obj->get_name(), " : first reducer empty"); + } + + return PARSEC_HOOK_RETURN_DONE; + } + } + + assert(detail::parsec_ttg_caller == NULL); + detail::parsec_ttg_caller = rtask->parent_task; + + do { + if constexpr(!val_is_void) { + /* the copies to reduce out of */ + detail::ttg_data_copy_t *source_copy; + parsec_list_item_t *item; + item = parsec_lifo_pop(&parent_task->streams[i].reduce_copies); + if (nullptr == item) { + // maybe someone is changing the goal right now + break; + } + source_copy = ((detail::ttg_data_copy_self_t *)(item))->self; + assert(target_copy->num_readers() == target_copy->mutable_tag); + assert(source_copy->num_readers() > 0); + reducer(*reinterpret_cast *>(target_copy->get_ptr()), + *reinterpret_cast *>(source_copy->get_ptr())); + detail::release_data_copy(source_copy); + } else if constexpr(val_is_void) { + reducer(); // invoke control reducer + } + // there is only one task working on this stream, so no need to be atomic here + size = ++parent_task->streams[i].size; + //std::cout << "static_reducer_op size " << size << " of " << parent_task->streams[i].goal << std::endl; + } while ((c = (parent_task->streams[i].reduce_count.fetch_sub(1, std::memory_order_acq_rel)-1)) > 0); + //} while ((c = (--task->streams[i].reduce_count)) > 0); + + /* finalize_argstream sets goal to 1, so size may be larger than goal */ + bool complete = (size >= parent_task->streams[i].goal); + + //std::cout << "static_reducer_op size " << size + // << " of " << parent_task->streams[i].goal << " complete " << complete + // << " c " << c << std::endl; + if (complete && c == 0) { + if constexpr(input_is_const) { + /* make the consumer task a reader if its input is const */ + target_copy->reset_readers(); + } + /* task may not be runnable yet because other inputs are missing, have release_task decide */ + 
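static_reducer_op's control flow is easy to lose in the diff: the atomic reduce_count decides which thread owns the single running reducer, and the do/while drains queued copies until the decrement observes zero. Here is a distilled sketch of just that protocol, with std::mutex + std::deque standing in for PaRSEC's lock-free LIFO and a plain int standing in for the reduction target; it is an illustration, not TTG's code.

#include <atomic>
#include <cstddef>
#include <deque>
#include <mutex>

// One stream of a task: pending inputs plus the atomic count that decides
// who runs the (single) reducer.
struct stream_state {
  std::atomic<std::size_t> reduce_count{0};
  std::size_t size = 0;   // inputs reduced so far; single reducer, so non-atomic
  std::mutex mtx;         // stand-in for the lock-free parsec LIFO
  std::deque<int> pending;
  int accum = 0;          // stand-in for the data copy being reduced into
};

void drain(stream_state &s);

// Producer: enqueue a value; whoever raises the count from 0 owns the drain.
void push_value(stream_state &s, int v) {
  { std::lock_guard<std::mutex> lk(s.mtx); s.pending.push_back(v); }
  if (s.reduce_count.fetch_add(1, std::memory_order_acquire) == 0)
    drain(s);  // TTG schedules a reducer task here instead of inlining it
}

// Reducer: drain until the decremented count hits 0, so a concurrent producer
// either observes count==0 (and spawns the next reducer) or we see its item.
void drain(stream_state &s) {
  std::size_t c;
  do {
    int v;
    {
      std::lock_guard<std::mutex> lk(s.mtx);
      if (s.pending.empty()) break;  // defensive: the count is also bumped
                                     // transiently when the goal is updated
      v = s.pending.front();
      s.pending.pop_front();
    }
    s.accum += v;                    // stand-in for the user-provided reducer
    ++s.size;
  } while ((c = s.reduce_count.fetch_sub(1, std::memory_order_acq_rel) - 1) > 0);
}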
parent_task->remove_from_hash = true; + parent_task->release_task(parent_task); + } + + detail::parsec_ttg_caller = NULL; + + if (obj->tracing()) { + if constexpr (!ttg::meta::is_void_v) + ttg::trace(obj->get_world().rank(), ":", obj->get_name(), " : ", parent_task->key, ": done executing"); + else + ttg::trace(obj->get_world().rank(), ":", obj->get_name(), " : done executing"); + } + + return PARSEC_HOOK_RETURN_DONE; + } + + + protected: + template + uint64_t unpack(T &obj, void *_bytes, uint64_t pos) { + const ttg_data_descriptor *dObj = ttg::get_data_descriptor>(); + uint64_t payload_size; + if constexpr (!ttg::default_data_descriptor>::serialize_size_is_const) { + const ttg_data_descriptor *dSiz = ttg::get_data_descriptor(); + dSiz->unpack_payload(&payload_size, sizeof(uint64_t), pos, _bytes); + pos += sizeof(uint64_t); + } else { + payload_size = dObj->payload_size(&obj); + } + dObj->unpack_payload(&obj, payload_size, pos, _bytes); + return pos + payload_size; + } + + template + uint64_t pack(T &obj, void *bytes, uint64_t pos, detail::ttg_data_copy_t *copy = nullptr) { + const ttg_data_descriptor *dObj = ttg::get_data_descriptor>(); + uint64_t payload_size = dObj->payload_size(&obj); + if (copy) { + /* reset any tracked data, we don't care about the packing from the payload size */ + copy->iovec_reset(); + } + + if constexpr (!ttg::default_data_descriptor>::serialize_size_is_const) { + const ttg_data_descriptor *dSiz = ttg::get_data_descriptor(); + dSiz->pack_payload(&payload_size, sizeof(uint64_t), pos, bytes); + pos += sizeof(uint64_t); + } + dObj->pack_payload(&obj, payload_size, pos, bytes); + return pos + payload_size; + } + + static void static_set_arg(void *data, std::size_t size, ttg::TTBase *bop) { + assert(size >= sizeof(msg_header_t) && + "Trying to unpack as message that does not hold enough bytes to represent a single header"); + msg_header_t *hd = static_cast(data); + derivedT *obj = reinterpret_cast(bop); + switch (hd->fn_id) { + case msg_header_t::MSG_SET_ARG: { + if (0 <= hd->param_id) { + assert(hd->param_id >= 0); + assert(hd->param_id < obj->set_arg_from_msg_fcts.size()); + auto member = obj->set_arg_from_msg_fcts[hd->param_id]; + (obj->*member)(data, size); + } else { + // there is no good reason to have negative param ids + ttg::abort(); + } + break; + } + case msg_header_t::MSG_SET_ARGSTREAM_SIZE: { + assert(hd->param_id >= 0); + assert(hd->param_id < obj->set_argstream_size_from_msg_fcts.size()); + auto member = obj->set_argstream_size_from_msg_fcts[hd->param_id]; + (obj->*member)(data, size); + break; + } + case msg_header_t::MSG_FINALIZE_ARGSTREAM_SIZE: { + assert(hd->param_id >= 0); + assert(hd->param_id < obj->finalize_argstream_from_msg_fcts.size()); + auto member = obj->finalize_argstream_from_msg_fcts[hd->param_id]; + (obj->*member)(data, size); + break; + } + case msg_header_t::MSG_GET_FROM_PULL: { + assert(hd->param_id >= 0); + assert(hd->param_id < obj->get_from_pull_msg_fcts.size()); + auto member = obj->get_from_pull_msg_fcts[hd->param_id]; + (obj->*member)(data, size); + break; + } + default: + ttg::abort(); + } + } + + /** Returns the task memory pool owned by the calling thread */ + inline parsec_thread_mempool_t *get_task_mempool(void) { + auto &world_impl = world.impl(); + parsec_execution_stream_s *es = world_impl.execution_stream(); + int index = (es->virtual_process->vp_id * es->virtual_process->nb_cores + es->th_id); + return &mempools.thread_mempools[index]; + } + + template + void set_arg_from_msg_keylist(ttg::span &&keylist, 
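The pack/unpack helpers above frame each payload with a uint64_t size prefix whenever the type's serialized size is not a compile-time constant, so the receiver can advance pos correctly. A hypothetical flattened version of that framing, restricted to trivially copyable element types (the real code routes through ttg_data_descriptor):

#include <cstdint>
#include <cstring>
#include <vector>

// Write [size prefix][payload] at pos; returns the position past the payload.
template <typename T>  // T must be trivially copyable for memcpy to be valid
uint64_t pack_trivial(const std::vector<T> &obj, char *bytes, uint64_t pos) {
  uint64_t payload_size = obj.size() * sizeof(T);
  std::memcpy(bytes + pos, &payload_size, sizeof(payload_size));  // size prefix
  pos += sizeof(payload_size);
  std::memcpy(bytes + pos, obj.data(), payload_size);             // payload
  return pos + payload_size;
}

// Mirror image: read the prefix, then consume exactly that many bytes.
template <typename T>
uint64_t unpack_trivial(std::vector<T> &obj, const char *bytes, uint64_t pos) {
  uint64_t payload_size;
  std::memcpy(&payload_size, bytes + pos, sizeof(payload_size));
  pos += sizeof(payload_size);
  obj.resize(payload_size / sizeof(T));
  std::memcpy(obj.data(), bytes + pos, payload_size);
  return pos + payload_size;
}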
detail::ttg_data_copy_t *copy) { + /* create a dummy task that holds the copy, which can be reused by others */ + task_t *dummy; + parsec_execution_stream_s *es = world.impl().execution_stream(); + parsec_thread_mempool_t *mempool = get_task_mempool(); + dummy = new (parsec_thread_mempool_allocate(mempool)) task_t(mempool, &this->self); dummy->set_dummy(true); // TODO: do we need to copy static_stream_goal in dummy? /* set the received value as the dummy's only data */ - dummy->parsec_task.data[0].data_in = copy; + dummy->copies[0] = copy; /* We received the task on this world, so it's using the same taskpool */ dummy->parsec_task.taskpool = world.impl().taskpool(); /* save the current task and set the dummy task */ - auto parsec_ttg_caller_save = parsec_ttg_caller; - parsec_ttg_caller = dummy; + auto parsec_ttg_caller_save = detail::parsec_ttg_caller; + detail::parsec_ttg_caller = dummy; /* iterate over the keys and have them use the copy we made */ parsec_task_t *task_ring = nullptr; for (auto &&key : keylist) { - set_arg_local_impl(key, *reinterpret_cast(copy->device_private), copy, &task_ring); + set_arg_local_impl(key, *reinterpret_cast(copy->get_ptr()), copy, &task_ring); } if (nullptr != task_ring) { @@ -1432,7 +1947,7 @@ namespace ttg_parsec { } /* restore the previous task */ - parsec_ttg_caller = parsec_ttg_caller_save; + detail::parsec_ttg_caller = parsec_ttg_caller_save; /* release the dummy task */ complete_task_and_release(es, &dummy->parsec_task); @@ -1455,7 +1970,9 @@ namespace ttg_parsec { msg_t *msg = static_cast(data); if constexpr (!ttg::meta::is_void_v) { /* unpack the keys */ - uint64_t pos = 0; + /* TODO: can we avoid copying all the keys?! */ + uint64_t pos = msg->tt_id.key_offset; + uint64_t key_end_pos; std::vector keylist; int num_keys = msg->tt_id.num_keys; keylist.reserve(num_keys); @@ -1466,90 +1983,140 @@ namespace ttg_parsec { assert(keymap(key) == rank); keylist.push_back(std::move(key)); } + key_end_pos = pos; + /* jump back to the beginning of the message to get the value */ + pos = 0; // case 1 if constexpr (!ttg::meta::is_void_v) { using decvalueT = std::decay_t; - if constexpr (!ttg::has_split_metadata::value) { - detail::ttg_data_copy_t *copy = detail::create_new_datacopy(decvalueT{}); - unpack(*static_cast(copy->device_private), msg->bytes, pos); - - set_arg_from_msg_keylist(ttg::span(&keylist[0], num_keys), copy); - } else { - /* unpack the header and start the RMA transfers */ + int32_t num_iovecs = msg->tt_id.num_iovecs; + //bool inline_data = msg->inline_data; + detail::ttg_data_copy_t *copy; + if constexpr (ttg::has_split_metadata::value) { ttg::SplitMetadataDescriptor descr; using metadata_t = decltype(descr.get_metadata(std::declval())); - size_t metadata_size = sizeof(metadata_t); /* unpack the metadata */ metadata_t metadata; - std::memcpy(&metadata, msg->bytes + pos, metadata_size); - pos += metadata_size; + pos = unpack(metadata, msg->bytes, pos); + + //std::cout << "set_arg_from_msg splitmd num_iovecs " << num_iovecs << std::endl; + + copy = detail::create_new_datacopy(descr.create_from_metadata(metadata)); + } else if constexpr (!ttg::has_split_metadata::value) { + copy = detail::create_new_datacopy(decvalueT{}); +#if 0 + // TODO: first attempt at sending directly to the device + parsec_gpu_data_copy_t* gpu_elem; + gpu_elem = PARSEC_DATA_GET_COPY(master, gpu_device->super.device_index); + int i = detail::first_device_id; + int devid = detail::first_device_id; + while (i < parsec_nb_devices) { + if (nullptr == gpu_elem) { + gpu_elem = 
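Several header fields used here (key_offset, num_iovecs, inline_data, sender) describe a message layout in which the value payload is packed first and the keys last: the receiver reads the keys starting at key_offset, then rewinds pos to 0 to unpack the value. A simplified stand-in for msg_header_t showing the assumed layout; field names and types are illustrative only.

#include <cstdint>

struct msg_header {     // simplified stand-in for msg_header_t
  uint64_t key_offset;  // where the packed keys begin inside bytes[]
  int32_t  num_keys;    // how many keys follow at key_offset
  int32_t  num_iovecs;  // iovecs carried (0 = plain serialized value)
  bool     inline_data; // true: iovec payloads are in the message itself
  int      sender;      // origin rank, used for RMA gets
};
// wire format: [msg_header][value / iovec data ...][keys ...]
//                           ^ pos = 0               ^ pos = key_offset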
PARSEC_OBJ_NEW(parsec_data_copy_t); + gpu_elem->flags = PARSEC_DATA_FLAG_PARSEC_OWNED | PARSEC_DATA_FLAG_PARSEC_MANAGED; + gpu_elem->coherency_state = PARSEC_DATA_COHERENCY_INVALID; + gpu_elem->version = 0; + gpu_elem->coherency_state = PARSEC_DATA_COHERENCY_OWNED; + } + if (nullptr == gpu_elem->device_private) { + gpu_elem->device_private = zone_malloc(gpu_device->memory, gpu_task->flow_nb_elts[i]); + if (nullptr == gpu_elem->device_private) { + devid++; + continue; + } + break; + } + } +#endif // 0 + /* unpack the object, potentially discovering iovecs */ + pos = unpack(*static_cast(copy->get_ptr()), msg->bytes, pos); + //std::cout << "set_arg_from_msg iovec_begin num_iovecs " << num_iovecs << " distance " << std::distance(copy->iovec_begin(), copy->iovec_end()) << std::endl; + assert(std::distance(copy->iovec_begin(), copy->iovec_end()) == num_iovecs); + } - /* unpack the remote rank */ - int remote; - std::memcpy(&remote, msg->bytes + pos, sizeof(remote)); - pos += sizeof(remote); + if (num_iovecs == 0) { + set_arg_from_msg_keylist(ttg::span(&keylist[0], num_keys), copy); + } else { + /* unpack the header and start the RMA transfers */ + /* get the remote rank */ + int remote = msg->tt_id.sender; assert(remote < world.size()); - /* extract the number of chunks */ - int32_t num_iovecs; - std::memcpy(&num_iovecs, msg->bytes + pos, sizeof(num_iovecs)); - pos += sizeof(num_iovecs); - - detail::ttg_data_copy_t *copy = detail::create_new_datacopy(descr.create_from_metadata(metadata)); - /* nothing else to do if the object is empty */ - if (0 == num_iovecs) { - set_arg_from_msg_keylist(keylist, copy); - } else { - /* extract the callback tag */ - parsec_ce_tag_t cbtag; - std::memcpy(&cbtag, msg->bytes + pos, sizeof(cbtag)); - pos += sizeof(cbtag); - - /* create the value from the metadata */ - auto activation = new detail::rma_delayed_activate( - std::move(keylist), copy, num_iovecs, [this](std::vector &&keylist, detail::ttg_data_copy_t *copy) { - set_arg_from_msg_keylist(keylist, copy); - this->world.impl().decrement_inflight_msg(); - }); - auto &val = *static_cast(copy->device_private); - - using ActivationT = std::decay_t; - - int nv = 0; - /* process payload iovecs */ - auto iovecs = descr.get_data(val); - /* start the RMA transfers */ - for (auto &&iov : iovecs) { - ++nv; - parsec_ce_mem_reg_handle_t rreg; - int32_t rreg_size_i; - std::memcpy(&rreg_size_i, msg->bytes + pos, sizeof(rreg_size_i)); - pos += sizeof(rreg_size_i); - rreg = static_cast(msg->bytes + pos); - pos += rreg_size_i; - // std::intptr_t *fn_ptr = reinterpret_cast(msg->bytes + pos); - // pos += sizeof(*fn_ptr); - std::intptr_t fn_ptr; - std::memcpy(&fn_ptr, msg->bytes + pos, sizeof(fn_ptr)); - pos += sizeof(fn_ptr); - - /* register the local memory */ - parsec_ce_mem_reg_handle_t lreg; - size_t lreg_size; - parsec_ce.mem_register(iov.data, PARSEC_MEM_TYPE_NONCONTIGUOUS, iov.num_bytes, parsec_datatype_int8_t, - iov.num_bytes, &lreg, &lreg_size); - world.impl().increment_inflight_msg(); - /* TODO: PaRSEC should treat the remote callback as a tag, not a function pointer! 
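The rma_delayed_activate object created in this path fires its callback only after every iovec's RMA get has completed. A minimal sketch of that count-down activation pattern; this is a hypothetical stand-in, not the real detail::rma_delayed_activate.

#include <atomic>
#include <functional>
#include <utility>

class delayed_activate {
  std::atomic<int> remaining_;
  std::function<void()> on_complete_;
 public:
  delayed_activate(int n, std::function<void()> cb)
      : remaining_(n), on_complete_(std::move(cb)) {}
  // Invoked from each transfer's completion callback.
  void transfer_done() {
    if (remaining_.fetch_sub(1, std::memory_order_acq_rel) == 1) {
      on_complete_();  // last transfer: hand the keys/copy to the task
      delete this;     // allocated with new by the receiver, self-destructs
    }
  }
};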
*/ - parsec_ce.get(&parsec_ce, lreg, 0, rreg, 0, iov.num_bytes, remote, - &detail::get_complete_cb, activation, - /*world.impl().parsec_ttg_rma_tag()*/ - cbtag, &fn_ptr, sizeof(std::intptr_t)); - } + auto &val = *static_cast(copy->get_ptr()); + + bool inline_data = msg->tt_id.inline_data; + + int nv = 0; + /* start the RMA transfers */ + auto handle_iovecs_fn = + [&](auto&& iovecs) { + + if (inline_data) { + /* unpack the data from the message */ + for (auto &&iov : iovecs) { + ++nv; + std::memcpy(iov.data, msg->bytes + pos, iov.num_bytes); + pos += iov.num_bytes; + } + } else { + /* extract the callback tag */ + parsec_ce_tag_t cbtag; + std::memcpy(&cbtag, msg->bytes + pos, sizeof(cbtag)); + pos += sizeof(cbtag); + + /* create the value from the metadata */ + auto activation = new detail::rma_delayed_activate( + std::move(keylist), copy, num_iovecs, [this](std::vector &&keylist, detail::ttg_data_copy_t *copy) { + set_arg_from_msg_keylist(keylist, copy); + this->world.impl().decrement_inflight_msg(); + }); + + using ActivationT = std::decay_t; + + for (auto &&iov : iovecs) { + ++nv; + parsec_ce_mem_reg_handle_t rreg; + int32_t rreg_size_i; + std::memcpy(&rreg_size_i, msg->bytes + pos, sizeof(rreg_size_i)); + pos += sizeof(rreg_size_i); + rreg = static_cast(msg->bytes + pos); + pos += rreg_size_i; + // std::intptr_t *fn_ptr = reinterpret_cast(msg->bytes + pos); + // pos += sizeof(*fn_ptr); + std::intptr_t fn_ptr; + std::memcpy(&fn_ptr, msg->bytes + pos, sizeof(fn_ptr)); + pos += sizeof(fn_ptr); + + /* register the local memory */ + parsec_ce_mem_reg_handle_t lreg; + size_t lreg_size; + parsec_ce.mem_register(iov.data, PARSEC_MEM_TYPE_NONCONTIGUOUS, iov.num_bytes, parsec_datatype_int8_t, + iov.num_bytes, &lreg, &lreg_size); + world.impl().increment_inflight_msg(); + /* TODO: PaRSEC should treat the remote callback as a tag, not a function pointer! 
*/ + //std::cout << "set_arg_from_msg: get rreg " << rreg << " remote " << remote << std::endl; + parsec_ce.get(&parsec_ce, lreg, 0, rreg, 0, iov.num_bytes, remote, + &detail::get_complete_cb, activation, + /*world.impl().parsec_ttg_rma_tag()*/ + cbtag, &fn_ptr, sizeof(std::intptr_t)); + } + } + }; + if constexpr (ttg::has_split_metadata::value) { + ttg::SplitMetadataDescriptor descr; + handle_iovecs_fn(descr.get_data(val)); + } else if constexpr (!ttg::has_split_metadata::value) { + handle_iovecs_fn(copy->iovec_span()); + copy->iovec_reset(); + } + + assert(num_iovecs == nv); + assert(size == (key_end_pos + sizeof(msg_header_t))); - assert(num_iovecs == nv); - assert(size == (pos + sizeof(msg_header_t))); + if (inline_data) { + set_arg_from_msg_keylist(ttg::span(&keylist[0], num_keys), copy); } } // case 2 and 3 @@ -1568,8 +2135,8 @@ namespace ttg_parsec { // case 5 and 6 } else if constexpr (ttg::meta::is_void_v && std::is_void_v) { set_arg(ttg::Void{}); - } else { - abort(); + } else { // unreachable + ttg::abort(); } } @@ -1668,29 +2235,50 @@ namespace ttg_parsec { char *taskobj = (char *)parsec_thread_mempool_allocate(mempool); int32_t priority = 0; if constexpr (!keyT_is_Void) { - //priority = priomap(key); + priority = priomap(key); /* placement-new the task */ newtask = new (taskobj) task_t(key, mempool, &this->self, world_impl.taskpool(), this, priority); } else { - //priority = priomap(); + priority = priomap(); /* placement-new the task */ newtask = new (taskobj) task_t(mempool, &this->self, world_impl.taskpool(), this, priority); } - newtask->function_template_class_ptr[static_cast(ttg::ExecutionSpace::Host)] = - reinterpret_cast(&TT::static_op); - if constexpr (derived_has_cuda_op()) - newtask->function_template_class_ptr[static_cast(ttg::ExecutionSpace::CUDA)] = - reinterpret_cast(&TT::static_op); - for (int i = 0; i < static_stream_goal.size(); ++i) { - newtask->stream[i].goal = static_stream_goal[i]; + newtask->streams[i].goal = static_stream_goal[i]; } ttg::trace(world.rank(), ":", get_name(), " : ", key, ": creating task"); return newtask; } + + template + detail::reducer_task_t *create_new_reducer_task(task_t *task, bool is_first) { + /* make sure we can reuse the existing memory pool and don't have to create a new one */ + static_assert(sizeof(task_t) >= sizeof(detail::reducer_task_t)); + constexpr const bool keyT_is_Void = ttg::meta::is_void_v; + auto &world_impl = world.impl(); + detail::reducer_task_t *newtask; + parsec_thread_mempool_t *mempool = get_task_mempool(); + char *taskobj = (char *)parsec_thread_mempool_allocate(mempool); + // use the priority of the task we stream into + int32_t priority = 0; + if constexpr (!keyT_is_Void) { + priority = priomap(task->key); + ttg::trace(world.rank(), ":", get_name(), " : ", task->key, ": creating reducer task"); + } else { + priority = priomap(); + ttg::trace(world.rank(), ":", get_name(), ": creating reducer task"); + } + /* placement-new the task */ + newtask = new (taskobj) detail::reducer_task_t(task, mempool, inpute_reducers_taskclass[i], + world_impl.taskpool(), priority, is_first); + + return newtask; + } + + // Used to set the i'th argument template void set_arg_local_impl(const Key &key, Value &&value, detail::ttg_data_copy_t *copy_in = nullptr, @@ -1716,12 +2304,14 @@ namespace ttg_parsec { task_t *task; auto &world_impl = world.impl(); auto &reducer = std::get(input_reducers); - bool release = true; + bool release = false; bool remove_from_hash = true; bool discover_task = true; bool get_pull_data = false; + 
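create_new_reducer_task placement-news the reducer task into a chunk drawn from the same mempool as regular tasks, which is why it static_asserts that sizeof(task_t) >= sizeof(detail::reducer_task_t). A sketch of that pool + placement-new pattern, with made-up sizes and malloc standing in for the thread mempool:

#include <cstdlib>
#include <new>

struct task_t         { char payload[256]; };  // sizes invented for the sketch
struct reducer_task_t { char payload[64]; };
static_assert(sizeof(task_t) >= sizeof(reducer_task_t),
              "reducer tasks must fit in the task mempool chunks");

void *pool_allocate() { return std::malloc(sizeof(task_t)); }  // mempool stand-in

reducer_task_t *make_reducer_task() {
  void *chunk = pool_allocate();
  // Construct in place; the chunk goes back to the pool when the task retires.
  return new (chunk) reducer_task_t{};
}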
bool has_lock = false; /* If we have only one input and no reducer on that input we can skip the hash table */ if (numins > 1 || reducer) { + has_lock = true; parsec_hash_table_lock_bucket(&tasks_table, hk); if (nullptr == (task = (task_t *)parsec_hash_table_nolock_find(&tasks_table, hk))) { task = create_new_task(key); @@ -1739,7 +2329,11 @@ namespace ttg_parsec { parsec_hash_table_nolock_remove(&tasks_table, hk); remove_from_hash = false; } - parsec_hash_table_unlock_bucket(&tasks_table, hk); + /* if we have a reducer, we need to hold on to the lock for just a little longer */ + if (!reducer) { + parsec_hash_table_unlock_bucket(&tasks_table, hk); + has_lock = false; + } } else { task = create_new_task(key); world_impl.increment_created(); @@ -1754,8 +2348,8 @@ namespace ttg_parsec { if( world_impl.dag_profiling() ) { #if defined(PARSEC_PROF_GRAPHER) - if(NULL != parsec_ttg_caller && !parsec_ttg_caller->dummy()) { - int orig_index = detail::find_index_of_copy_in_task(parsec_ttg_caller, &value); + if(NULL != detail::parsec_ttg_caller && !detail::parsec_ttg_caller->is_dummy()) { + int orig_index = detail::find_index_of_copy_in_task(detail::parsec_ttg_caller, &value); char orig_str[32]; char dest_str[32]; if(orig_index >= 0) { @@ -1768,62 +2362,116 @@ namespace ttg_parsec { .flow_index = 0, .flow_datatype_mask = ~0 }; parsec_flow_t dest{ .name = dest_str, .sym_type = PARSEC_SYM_INOUT, .flow_flags = PARSEC_FLOW_ACCESS_RW, .flow_index = 0, .flow_datatype_mask = ~0 }; - parsec_prof_grapher_dep(&parsec_ttg_caller->parsec_task, &task->parsec_task, discover_task ? 1 : 0, &orig, &dest); + parsec_prof_grapher_dep(&detail::parsec_ttg_caller->parsec_task, &task->parsec_task, discover_task ? 1 : 0, &orig, &dest); } #endif } - if (reducer) { // is this a streaming input? reduce the received value - // N.B. Right now reductions are done eagerly, without spawning tasks - // this means we must lock - parsec_hash_table_lock_bucket(&tasks_table, hk); + auto get_copy_fn = [&](detail::parsec_ttg_task_base_t *task, auto&& value, bool is_const){ + detail::ttg_data_copy_t *copy = copy_in; + if (nullptr == copy && nullptr != detail::parsec_ttg_caller) { + copy = detail::find_copy_in_task(detail::parsec_ttg_caller, &value); + } + if (nullptr != copy) { + /* retain the data copy */ + copy = detail::register_data_copy(copy, task, is_const); + } else { + /* create a new copy */ + copy = detail::create_new_datacopy(std::forward(value)); + if (!is_const) { + copy->mark_mutable(); + } + } + return copy; + }; + + if (reducer && 1 != task->streams[i].goal) { // is this a streaming input? reduce the received value + auto submit_reducer_task = [&](auto *parent_task){ + /* check if we need to create a task */ + std::size_t c = parent_task->streams[i].reduce_count.fetch_add(1, std::memory_order_acquire); + //std::cout << "submit_reducer_task " << key << " c " << c << std::endl; + if (0 == c) { + /* we are responsible for creating the reduction task */ + detail::reducer_task_t *reduce_task; + reduce_task = create_new_reducer_task(parent_task, false); + reduce_task->release_task(reduce_task); // release immediately + } + }; if constexpr (!ttg::meta::is_void_v) { // for data values // have a value already? 
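The new has_lock bookkeeping implements find-or-create under a per-bucket lock, with reducers holding the lock slightly longer so the first stream value can be installed atomically. A sketch with std::unordered_map and a single mutex standing in for parsec_hash_table_t's bucket locks:

#include <cstdint>
#include <mutex>
#include <unordered_map>

struct task { /* inputs, streams, ... */ };

std::unordered_map<uint64_t, task *> tasks_table;
std::mutex bucket_mtx;  // PaRSEC locks individual buckets; one mutex suffices here

task *find_or_create(uint64_t hk, bool have_reducer) {
  std::unique_lock<std::mutex> lk(bucket_mtx);  // parsec_hash_table_lock_bucket
  task *&slot = tasks_table[hk];                // nolock_find + insert in one step
  if (slot == nullptr) slot = new task{};
  if (!have_reducer) {
    lk.unlock();  // non-streaming inputs don't need the lock any longer
  } else {
    // streaming input: install the first value / create the reducer task
    // while still holding the bucket lock
  }
  return slot;    // if still held, the lock is released here by ~unique_lock
}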
if not, set, otherwise reduce detail::ttg_data_copy_t *copy = nullptr; - if (nullptr == (copy = static_cast(task->parsec_task.data[i].data_in))) { + if (nullptr == (copy = task->copies[i])) { using decay_valueT = std::decay_t; - /* For now, we always create a copy because we cannot rely on the task_release - * mechanism (it would release the task, not the reduction value). */ - copy = detail::create_new_datacopy(std::forward(value)); - task->parsec_task.data[i].data_in = copy; + + /* first input value, create a task and bind it to the copy */ + //std::cout << "Creating new reducer task for " << key << std::endl; + detail::reducer_task_t *reduce_task; + reduce_task = create_new_reducer_task(task, true); + + /* protected by the bucket lock */ + task->streams[i].size = 1; + task->streams[i].reduce_count.store(1, std::memory_order_relaxed); + + /* get the copy to use as input for this task */ + detail::ttg_data_copy_t *copy = get_copy_fn(reduce_task, std::forward(value), false); + + /* put the copy into the task */ + task->copies[i] = copy; + + /* release the task if we're not deferred + * TODO: can we delay that until we get the second value? + */ + if (copy->get_next_task() != &reduce_task->parsec_task) { + reduce_task->release_task(reduce_task); + } + + /* now we can unlock the bucket */ + parsec_hash_table_unlock_bucket(&tasks_table, hk); } else { - reducer(*reinterpret_cast *>(copy->device_private), value); + /* unlock the bucket, the lock is not needed anymore */ + parsec_hash_table_unlock_bucket(&tasks_table, hk); + + /* get the copy to use as input for this task */ + detail::ttg_data_copy_t *copy = get_copy_fn(task, std::forward(value), true); + + /* enqueue the data copy to be reduced */ + parsec_lifo_push(&task->streams[i].reduce_copies, ©->super); + submit_reducer_task(task); } } else { - reducer(); // even if this was a control input, must execute the reducer for possible side effects - } - task->stream[i].size++; - release = (task->stream[i].size == task->stream[i].goal); - if (release) { - parsec_hash_table_nolock_remove(&tasks_table, hk); - remove_from_hash = false; + /* unlock the bucket, the lock is not needed anymore */ + parsec_hash_table_unlock_bucket(&tasks_table, hk); + /* submit reducer for void values to handle side effects */ + submit_reducer_task(task); } - parsec_hash_table_unlock_bucket(&tasks_table, hk); + //if (release) { + // parsec_hash_table_nolock_remove(&tasks_table, hk); + // remove_from_hash = false; + //} + //parsec_hash_table_unlock_bucket(&tasks_table, hk); } else { + /* unlock the bucket, the lock is not needed anymore */ + if (has_lock) { + parsec_hash_table_unlock_bucket(&tasks_table, hk); + } /* whether the task needs to be deferred or not */ if constexpr (!valueT_is_Void) { - if (nullptr != task->parsec_task.data[i].data_in) { + if (nullptr != task->copies[i]) { ttg::print_error(get_name(), " : ", key, ": error argument is already set : ", i); throw std::logic_error("bad set arg"); } - detail::ttg_data_copy_t *copy = copy_in; - if (nullptr == copy_in && nullptr != parsec_ttg_caller) { - copy = detail::find_copy_in_task(parsec_ttg_caller, &value); - } + /* get the copy to use as input for this task */ + detail::ttg_data_copy_t *copy = get_copy_fn(task, std::forward(value), input_is_const); - if (nullptr != copy) { - /* register_data_copy might provide us with a different copy if !input_is_const */ - copy = detail::register_data_copy(copy, task, input_is_const); - } else { - copy = detail::create_new_datacopy(std::forward(value)); - } /* if we 
registered as a writer and were the first to register with this copy * we need to defer the release of this task to give other tasks a chance to * make a copy of the original data */ - release = (copy->push_task != &task->parsec_task); - task->parsec_task.data[i].data_in = copy; + release = (copy->get_next_task() != &task->parsec_task); + task->copies[i] = copy; + } else { + release = true; } } task->remove_from_hash = remove_from_hash; @@ -1910,10 +2558,42 @@ namespace ttg_parsec { set_arg_impl(key, ttg::Void{}); } + template + bool can_inline_data(Value* value_ptr, detail::ttg_data_copy_t *copy, const Key& key, std::size_t num_keys) { + using decvalueT = std::decay_t; + bool inline_data = false; + /* check whether to send data in inline */ + std::size_t iov_size = 0; + std::size_t metadata_size = 0; + if constexpr (ttg::has_split_metadata>::value) { + ttg::SplitMetadataDescriptor descr; + auto iovs = descr.get_data(*const_cast(value_ptr)); + iov_size = std::accumulate(iovs.begin(), iovs.end(), 0, + [](std::size_t s, auto& iov){ return s + iov.num_bytes; }); + auto metadata = descr.get_metadata(*const_cast(value_ptr)); + metadata_size = ttg::default_data_descriptor::payload_size(&metadata); + } else { + /* TODO: how can we query the iovecs of the buffers here without actually packing the data? */ + metadata_size = ttg::default_data_descriptor>::payload_size(value_ptr); + iov_size = std::accumulate(copy->iovec_begin(), copy->iovec_end(), 0, + [](std::size_t s, auto& iov){ return s + iov.num_bytes; }); + } + /* key is packed at the end */ + std::size_t key_pack_size = ttg::default_data_descriptor::payload_size(&key); + std::size_t pack_size = key_pack_size + metadata_size + iov_size; + if (pack_size < detail::max_inline_size) { + inline_data = true; + } + return inline_data; + } + // Used to set the i'th argument template - void set_arg_impl(const Key &key, Value &&value) { + void set_arg_impl(const Key &key, Value &&value, detail::ttg_data_copy_t *copy_in = nullptr) { int owner; + using decvalueT = std::decay_t; + using norefvalueT = std::remove_reference_t; + norefvalueT *value_ptr = &value; #if defined(PARSEC_PROF_TRACE) && defined(PARSEC_TTG_PROFILE_BACKEND) if(world.impl().profiling()) { @@ -1927,9 +2607,9 @@ namespace ttg_parsec { owner = keymap(); if (owner == world.rank()) { if constexpr (!ttg::meta::is_void_v) - set_arg_local(key, std::forward(value)); + set_arg_local_impl(key, std::forward(value), copy_in); else - set_arg_local(std::forward(value)); + set_arg_local_impl(ttg::Void{}, std::forward(value), copy_in); #if defined(PARSEC_PROF_TRACE) && defined(PARSEC_TTG_PROFILE_BACKEND) if(world.impl().profiling()) { parsec_profiling_ts_trace(world.impl().parsec_ttg_profile_backend_set_arg_end, 0, 0, NULL); @@ -1943,88 +2623,116 @@ namespace ttg_parsec { using msg_t = detail::msg_t; auto &world_impl = world.impl(); uint64_t pos = 0; + int num_iovecs = 0; std::unique_ptr msg = std::make_unique(get_instance_id(), world_impl.taskpool()->taskpool_id, - msg_header_t::MSG_SET_ARG, i, 1); - using decvalueT = std::decay_t; - /* pack the key */ - msg->tt_id.num_keys = 0; - if constexpr (!ttg::meta::is_void_v) { - pos = pack(key, msg->bytes, pos); - msg->tt_id.num_keys = 1; - } + msg_header_t::MSG_SET_ARG, i, world_impl.rank(), 1); if constexpr (!ttg::meta::is_void_v) { - if constexpr (!ttg::has_split_metadata::value) { - pos = pack(value, msg->bytes, pos); - } else { - detail::ttg_data_copy_t *copy; - copy = detail::find_copy_in_task(parsec_ttg_caller, &value); + + detail::ttg_data_copy_t *copy 
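can_inline_data sums the iovec payloads plus the metadata and key sizes, and inlines the data into the active message only if the total stays under the eager threshold. A sketch of that decision; max_inline_size here is an assumed constant standing in for detail::max_inline_size, and the std::size_t{0} seed avoids accumulating byte counts into an int.

#include <cstddef>
#include <numeric>
#include <vector>

struct iovec_t { void *data; std::size_t num_bytes; };
constexpr std::size_t max_inline_size = 8192;  // assumed threshold

bool can_inline(const std::vector<iovec_t> &iovs,
                std::size_t metadata_size, std::size_t key_size) {
  std::size_t iov_size = std::accumulate(
      iovs.begin(), iovs.end(), std::size_t{0},
      [](std::size_t s, const iovec_t &iov) { return s + iov.num_bytes; });
  // key is packed at the end; everything must fit in one eager message
  return key_size + metadata_size + iov_size < max_inline_size;
}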
= copy_in; + /* make sure we have a data copy to register with */ + if (nullptr == copy) { + copy = detail::find_copy_in_task(detail::parsec_ttg_caller, value_ptr); if (nullptr == copy) { // We need to create a copy for this data, as it does not exist yet. copy = detail::create_new_datacopy(std::forward(value)); + // use the new value from here on out + value_ptr = static_cast(copy->get_ptr()); } - copy = detail::register_data_copy(copy, nullptr, true); - - ttg::SplitMetadataDescriptor descr; - auto metadata = descr.get_metadata(value); - size_t metadata_size = sizeof(metadata); - /* pack the metadata */ - std::memcpy(msg->bytes + pos, &metadata, metadata_size); - pos += metadata_size; - /* pack the local rank */ - int rank = world.rank(); - std::memcpy(msg->bytes + pos, &rank, sizeof(rank)); - pos += sizeof(rank); + } - auto iovecs = descr.get_data(*static_cast(copy->device_private)); + bool inline_data = can_inline_data(value_ptr, copy, key, 1); + msg->tt_id.inline_data = inline_data; - int32_t num_iovs = std::distance(std::begin(iovecs), std::end(iovecs)); - std::memcpy(msg->bytes + pos, &num_iovs, sizeof(num_iovs)); - pos += sizeof(num_iovs); + auto handle_iovec_fn = [&](auto&& iovecs){ - /* TODO: at the moment, the tag argument to parsec_ce.get() is treated as a - * raw function pointer instead of a preregistered AM tag, so play that game. - * Once this is fixed in PaRSEC we need to use parsec_ttg_rma_tag instead! */ - parsec_ce_tag_t cbtag = reinterpret_cast(&detail::get_remote_complete_cb); - std::memcpy(msg->bytes + pos, &cbtag, sizeof(cbtag)); - pos += sizeof(cbtag); + if (inline_data) { + /* inline data is packed right after the tt_id in the message */ + for (auto &&iov : iovecs) { + std::memcpy(msg->bytes + pos, iov.data, iov.num_bytes); + pos += iov.num_bytes; + } + } else { - /** - * register the generic iovecs and pack the registration handles - * memory layout: [, ...] - */ - for (auto &&iov : iovecs) { - parsec_ce_mem_reg_handle_t lreg; - size_t lreg_size; - /* TODO: only register once when we can broadcast the data! */ - parsec_ce.mem_register(iov.data, PARSEC_MEM_TYPE_NONCONTIGUOUS, iov.num_bytes, parsec_datatype_int8_t, - iov.num_bytes, &lreg, &lreg_size); - auto lreg_ptr = std::shared_ptr{lreg, [](void *ptr) { - parsec_ce_mem_reg_handle_t memreg = (parsec_ce_mem_reg_handle_t)ptr; - parsec_ce.mem_unregister(&memreg); - }}; - int32_t lreg_size_i = lreg_size; - std::memcpy(msg->bytes + pos, &lreg_size_i, sizeof(lreg_size_i)); - pos += sizeof(lreg_size_i); - std::memcpy(msg->bytes + pos, lreg, lreg_size_i); - pos += lreg_size_i; - /* TODO: can we avoid the extra indirection of going through std::function? */ - std::function *fn = new std::function([=]() mutable { - /* shared_ptr of value and registration captured by value so resetting - * them here will eventually release the memory/registration */ - detail::release_data_copy(copy); - lreg_ptr.reset(); - }); - std::intptr_t fn_ptr{reinterpret_cast(fn)}; - std::memcpy(msg->bytes + pos, &fn_ptr, sizeof(fn_ptr)); - pos += sizeof(fn_ptr); + /* TODO: at the moment, the tag argument to parsec_ce.get() is treated as a + * raw function pointer instead of a preregistered AM tag, so play that game. + * Once this is fixed in PaRSEC we need to use parsec_ttg_rma_tag instead! */ + parsec_ce_tag_t cbtag = reinterpret_cast(&detail::get_remote_complete_cb); + std::memcpy(msg->bytes + pos, &cbtag, sizeof(cbtag)); + pos += sizeof(cbtag); + + /** + * register the generic iovecs and pack the registration handles + * memory layout: [, ...] 
+ */ + for (auto &&iov : iovecs) { + copy = detail::register_data_copy(copy, nullptr, true); + parsec_ce_mem_reg_handle_t lreg; + size_t lreg_size; + /* TODO: only register once when we can broadcast the data! */ + parsec_ce.mem_register(iov.data, PARSEC_MEM_TYPE_NONCONTIGUOUS, iov.num_bytes, parsec_datatype_int8_t, + iov.num_bytes, &lreg, &lreg_size); + auto lreg_ptr = std::shared_ptr{lreg, [](void *ptr) { + parsec_ce_mem_reg_handle_t memreg = (parsec_ce_mem_reg_handle_t)ptr; + parsec_ce.mem_unregister(&memreg); + }}; + int32_t lreg_size_i = lreg_size; + std::memcpy(msg->bytes + pos, &lreg_size_i, sizeof(lreg_size_i)); + pos += sizeof(lreg_size_i); + std::memcpy(msg->bytes + pos, lreg, lreg_size); + pos += lreg_size; + //std::cout << "set_arg_impl lreg " << lreg << std::endl; + /* TODO: can we avoid the extra indirection of going through std::function? */ + std::function *fn = new std::function([=]() mutable { + /* shared_ptr of value and registration captured by value so resetting + * them here will eventually release the memory/registration */ + detail::release_data_copy(copy); + lreg_ptr.reset(); + }); + std::intptr_t fn_ptr{reinterpret_cast(fn)}; + std::memcpy(msg->bytes + pos, &fn_ptr, sizeof(fn_ptr)); + pos += sizeof(fn_ptr); + } } + }; + + if constexpr (ttg::has_split_metadata>::value) { + ttg::SplitMetadataDescriptor descr; + auto iovs = descr.get_data(*const_cast(value_ptr)); + num_iovecs = std::distance(std::begin(iovs), std::end(iovs)); + /* pack the metadata */ + auto metadata = descr.get_metadata(*const_cast(value_ptr)); + size_t metadata_size = sizeof(metadata); + pos = pack(metadata, msg->bytes, pos); + //std::cout << "set_arg_impl splitmd num_iovecs " << num_iovecs << std::endl; + handle_iovec_fn(iovs); + } else if constexpr (!ttg::has_split_metadata>::value) { + /* serialize the object */ + //std::cout << "PRE pack num_iovecs " << std::distance(copy->iovec_begin(), copy->iovec_end()) << std::endl; + pos = pack(*value_ptr, msg->bytes, pos, copy); + num_iovecs = std::distance(copy->iovec_begin(), copy->iovec_end()); + //std::cout << "POST pack num_iovecs " << num_iovecs << std::endl; + /* handle any iovecs contained in it */ + handle_iovec_fn(copy->iovec_span()); + copy->iovec_reset(); } + + msg->tt_id.num_iovecs = num_iovecs; + } + + /* pack the key */ + msg->tt_id.num_keys = 0; + msg->tt_id.key_offset = pos; + if constexpr (!ttg::meta::is_void_v) { + size_t tmppos = pack(key, msg->bytes, pos); + pos = tmppos; + msg->tt_id.num_keys = 1; } + parsec_taskpool_t *tp = world_impl.taskpool(); tp->tdm.module->outgoing_message_start(tp, owner, NULL); tp->tdm.module->outgoing_message_pack(tp, owner, NULL, NULL, 0); - // std::cout << "Sending AM with " << msg->op_id.num_keys << " keys " << std::endl; + //std::cout << "set_arg_impl send_am owner " << owner << " sender " << msg->tt_id.sender << std::endl; parsec_ce.send_am(&parsec_ce, world_impl.parsec_ttg_tag(), owner, static_cast(msg.get()), sizeof(msg_header_t) + pos); #if defined(PARSEC_PROF_TRACE) && defined(PARSEC_TTG_PROFILE_BACKEND) @@ -2033,8 +2741,8 @@ namespace ttg_parsec { } #endif #if defined(PARSEC_PROF_GRAPHER) - if(NULL != parsec_ttg_caller && !parsec_ttg_caller->dummy()) { - int orig_index = detail::find_index_of_copy_in_task(parsec_ttg_caller, &value); + if(NULL != detail::parsec_ttg_caller && !detail::parsec_ttg_caller->is_dummy()) { + int orig_index = detail::find_index_of_copy_in_task(detail::parsec_ttg_caller, value_ptr); char orig_str[32]; char dest_str[32]; if(orig_index >= 0) { @@ -2043,124 +2751,54 @@ 
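The fn_ptr shipped next to each registration handle is the address of a heap-allocated std::function that the completion path casts back, invokes, and frees, thereby releasing the data copy and the memory registration. A sketch of that type-erased completion-callback round trip (the diff notes this doubles as the AM "tag" until PaRSEC grows real tags):

#include <cstdint>
#include <functional>
#include <utility>

using completion_fn = std::function<void()>;

// Sender side: box the callback and ship its address as an integer.
std::intptr_t make_completion(completion_fn cb) {
  auto *fn = new completion_fn(std::move(cb));  // freed on completion
  return reinterpret_cast<std::intptr_t>(fn);
}

// Completion side: cast back, run once, delete.
void run_completion(std::intptr_t fn_ptr) {
  auto *fn = reinterpret_cast<completion_fn *>(fn_ptr);
  (*fn)();  // e.g. release_data_copy(copy); lreg_ptr.reset();
  delete fn;
}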
namespace ttg_parsec { strncpy(orig_str, "_", 32); } snprintf(dest_str, 32, "%lu", i); - parsec_flow_t orig{ .name = orig_str, .sym_type = PARSEC_SYM_INOUT, .flow_flags = PARSEC_FLOW_ACCESS_RW, - .flow_index = 0, .flow_datatype_mask = ~0 }; - parsec_flow_t dest{ .name = dest_str, .sym_type = PARSEC_SYM_INOUT, .flow_flags = PARSEC_FLOW_ACCESS_RW, - .flow_index = 0, .flow_datatype_mask = ~0 }; - task_t *task = create_new_task(key); - parsec_prof_grapher_dep(&parsec_ttg_caller->parsec_task, &task->parsec_task, 0, &orig, &dest); - delete task; - } -#endif - } - - template - void broadcast_arg_local(Iterator &&begin, Iterator &&end, const Value &value) { -#if defined(PARSEC_PROF_TRACE) && defined(PARSEC_TTG_PROFILE_BACKEND) - if(world.impl().profiling()) { - parsec_profiling_ts_trace(world.impl().parsec_ttg_profile_backend_bcast_arg_start, 0, 0, NULL); - } -#endif - parsec_task_t *task_ring = nullptr; - detail::ttg_data_copy_t *copy = nullptr; - if (nullptr != parsec_ttg_caller) { - copy = detail::find_copy_in_task(parsec_ttg_caller, &value); - } - - for (auto it = begin; it != end; ++it) { - set_arg_local_impl(*it, value, copy, &task_ring); - } - /* submit all ready tasks at once */ - if (nullptr != task_ring) { - parsec_task_t *vp_task_ring[1] = { task_ring }; - __parsec_schedule_vp(world.impl().execution_stream(), vp_task_ring, 0); - } -#if defined(PARSEC_PROF_TRACE) && defined(PARSEC_TTG_PROFILE_BACKEND) - if(world.impl().profiling()) { - parsec_profiling_ts_trace(world.impl().parsec_ttg_profile_backend_set_arg_end, 0, 0, NULL); - } -#endif - } - - template - std::enable_if_t && !std::is_void_v> && - !ttg::has_split_metadata>::value, - void> - broadcast_arg(const ttg::span &keylist, const Value &value) { - auto world = ttg_default_execution_context(); - int rank = world.rank(); - - bool have_remote = keylist.end() != std::find_if(keylist.begin(), keylist.end(), - [&](const Key &key) { return keymap(key) != rank; }); - - if (have_remote) { - std::vector keylist_sorted(keylist.begin(), keylist.end()); - - /* Assuming there are no local keys, will be updated while processing remote keys */ - auto local_begin = keylist_sorted.end(); - auto local_end = keylist_sorted.end(); - - /* sort the input key list by owner and check whether there are remote keys */ - std::sort(keylist_sorted.begin(), keylist_sorted.end(), [&](const Key &a, const Key &b) mutable { - int rank_a = keymap(a); - int rank_b = keymap(b); - return rank_a < rank_b; - }); - - using msg_t = detail::msg_t; - local_begin = keylist_sorted.end(); - auto &world_impl = world.impl(); - std::unique_ptr msg = std::make_unique(get_instance_id(), world_impl.taskpool()->taskpool_id, - msg_header_t::MSG_SET_ARG, i); - - parsec_taskpool_t *tp = world_impl.taskpool(); - - for (auto it = keylist_sorted.begin(); it < keylist_sorted.end(); /* increment inline */) { - auto owner = keymap(*it); - if (owner == rank) { - /* make sure we don't lose local keys */ - local_begin = it; - local_end = - std::find_if_not(++it, keylist_sorted.end(), [&](const Key &key) { return keymap(key) == rank; }); - it = local_end; - continue; - } - - /* pack all keys for this owner */ - int num_keys = 0; - uint64_t pos = 0; - do { - ++num_keys; - pos = pack(*it, msg->bytes, pos); - ++it; - } while (it < keylist_sorted.end() && keymap(*it) == owner); - msg->tt_id.num_keys = num_keys; + parsec_flow_t orig{ .name = orig_str, .sym_type = PARSEC_SYM_INOUT, .flow_flags = PARSEC_FLOW_ACCESS_RW, + .flow_index = 0, .flow_datatype_mask = ~0 }; + parsec_flow_t dest{ .name = dest_str, 
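The new comparator sorts keys by owner relative to the calling rank, wrapping around at the communicator size, so local keys sort first and each outgoing message covers one contiguous run of keys. A sketch of just that ordering, with int keys and a keymap function pointer as assumed stand-ins:

#include <algorithm>
#include <vector>

std::vector<int> sort_by_owner(std::vector<int> keys, int rank, int np,
                               int (*keymap)(int)) {
  std::sort(keys.begin(), keys.end(), [&](int a, int b) {
    // distance of each key's owner from my rank, modulo the world size
    int pos_a = (keymap(a) + np - rank) % np;
    int pos_b = (keymap(b) + np - rank) % np;
    return pos_a < pos_b;
  });
  return keys;  // my keys first, then rank+1, ..., wrapping around to 0
}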
.sym_type = PARSEC_SYM_INOUT, .flow_flags = PARSEC_FLOW_ACCESS_RW, + .flow_index = 0, .flow_datatype_mask = ~0 }; + task_t *task = create_new_task(key); + parsec_prof_grapher_dep(&detail::parsec_ttg_caller->parsec_task, &task->parsec_task, 0, &orig, &dest); + delete task; + } +#endif + } - /* TODO: use RMA to transfer the value */ - pos = pack(value, msg->bytes, pos); + template + void broadcast_arg_local(Iterator &&begin, Iterator &&end, const Value &value) { +#if defined(PARSEC_PROF_TRACE) && defined(PARSEC_TTG_PROFILE_BACKEND) + if(world.impl().profiling()) { + parsec_profiling_ts_trace(world.impl().parsec_ttg_profile_backend_bcast_arg_start, 0, 0, NULL); + } +#endif + parsec_task_t *task_ring = nullptr; + detail::ttg_data_copy_t *copy = nullptr; + if (nullptr != detail::parsec_ttg_caller) { + copy = detail::find_copy_in_task(detail::parsec_ttg_caller, &value); + } - /* Send the message */ - tp->tdm.module->outgoing_message_start(tp, owner, NULL); - tp->tdm.module->outgoing_message_pack(tp, owner, NULL, NULL, 0); - parsec_ce.send_am(&parsec_ce, world_impl.parsec_ttg_tag(), owner, static_cast(msg.get()), - sizeof(msg_header_t) + pos); - } - /* handle local keys */ - broadcast_arg_local(local_begin, local_end, value); - } else { - /* only local keys */ - broadcast_arg_local(keylist.begin(), keylist.end(), value); + for (auto it = begin; it != end; ++it) { + set_arg_local_impl(*it, value, copy, &task_ring); + } + /* submit all ready tasks at once */ + if (nullptr != task_ring) { + parsec_task_t *vp_task_ring[1] = { task_ring }; + __parsec_schedule_vp(world.impl().execution_stream(), vp_task_ring, 0); + } +#if defined(PARSEC_PROF_TRACE) && defined(PARSEC_TTG_PROFILE_BACKEND) + if(world.impl().profiling()) { + parsec_profiling_ts_trace(world.impl().parsec_ttg_profile_backend_set_arg_end, 0, 0, NULL); } +#endif } template - std::enable_if_t && !std::is_void_v> && - ttg::has_split_metadata>::value, + std::enable_if_t && !std::is_void_v>, void> - splitmd_broadcast_arg(const ttg::span &keylist, const Value &value) { + broadcast_arg(const ttg::span &keylist, const Value &value) { using valueT = std::tuple_element_t; auto world = ttg_default_execution_context(); + auto np = world.size(); int rank = world.rank(); + uint64_t pos = 0; bool have_remote = keylist.end() != std::find_if(keylist.begin(), keylist.end(), [&](const Key &key) { return keymap(key) != rank; }); @@ -2172,48 +2810,95 @@ namespace ttg_parsec { std::sort(keylist_sorted.begin(), keylist_sorted.end(), [&](const Key &a, const Key &b) mutable { int rank_a = keymap(a); int rank_b = keymap(b); - return rank_a < rank_b; + // sort so that the keys for my rank are first, rank+1 next, ..., wrapping around to 0 + int pos_a = (rank_a + np - rank) % np; + int pos_b = (rank_b + np - rank) % np; + return pos_a < pos_b; }); /* Assuming there are no local keys, will be updated while iterating over the keys */ auto local_begin = keylist_sorted.end(); auto local_end = keylist_sorted.end(); - ttg::SplitMetadataDescriptor descr; - auto iovs = descr.get_data(*const_cast(&value)); - int32_t num_iovs = std::distance(std::begin(iovs), std::end(iovs)); - std::vector>> memregs; - memregs.reserve(num_iovs); - - /* register all iovs so the registration can be reused */ - for (auto &&iov : iovs) { - parsec_ce_mem_reg_handle_t lreg; - size_t lreg_size; - parsec_ce.mem_register(iov.data, PARSEC_MEM_TYPE_NONCONTIGUOUS, iov.num_bytes, parsec_datatype_int8_t, - iov.num_bytes, &lreg, &lreg_size); - /* TODO: use a static function for deregistration here? 
*/ - memregs.push_back(std::make_pair(static_cast(lreg_size), - /* TODO: this assumes that parsec_ce_mem_reg_handle_t is void* */ - std::shared_ptr{lreg, [](void *ptr) { - parsec_ce_mem_reg_handle_t memreg = - (parsec_ce_mem_reg_handle_t)ptr; - parsec_ce.mem_unregister(&memreg); - }})); - } + int32_t num_iovs = 0; + + detail::ttg_data_copy_t *copy; + copy = detail::find_copy_in_task(detail::parsec_ttg_caller, &value); + assert(nullptr != copy); using msg_t = detail::msg_t; auto &world_impl = world.impl(); std::unique_ptr msg = std::make_unique(get_instance_id(), world_impl.taskpool()->taskpool_id, - msg_header_t::MSG_SET_ARG, i); - auto metadata = descr.get_metadata(value); - size_t metadata_size = sizeof(metadata); + msg_header_t::MSG_SET_ARG, i, world_impl.rank()); - detail::ttg_data_copy_t *copy; - copy = detail::find_copy_in_task(parsec_ttg_caller, &value); - assert(nullptr != copy); + /* check if we inline the data */ + /* TODO: this assumes the worst case: that all keys are packed at once (i.e., go to the same remote). Can we do better?*/ + bool inline_data = can_inline_data(&value, copy, keylist_sorted[0], keylist_sorted.size()); + msg->tt_id.inline_data = inline_data; + + std::vector>> memregs; + auto handle_iovs_fn = [&](auto&& iovs){ + + if (inline_data) { + /* inline data is packed right after the tt_id in the message */ + for (auto &&iov : iovs) { + std::memcpy(msg->bytes + pos, iov.data, iov.num_bytes); + pos += iov.num_bytes; + } + } else { + + /* TODO: at the moment, the tag argument to parsec_ce.get() is treated as a + * raw function pointer instead of a preregistered AM tag, so play that game. + * Once this is fixed in PaRSEC we need to use parsec_ttg_rma_tag instead! */ + parsec_ce_tag_t cbtag = reinterpret_cast(&detail::get_remote_complete_cb); + std::memcpy(msg->bytes + pos, &cbtag, sizeof(cbtag)); + pos += sizeof(cbtag); + + for (auto &&iov : iovs) { + parsec_ce_mem_reg_handle_t lreg; + size_t lreg_size; + parsec_ce.mem_register(iov.data, PARSEC_MEM_TYPE_NONCONTIGUOUS, iov.num_bytes, parsec_datatype_int8_t, + iov.num_bytes, &lreg, &lreg_size); + /* TODO: use a static function for deregistration here? 
*/ + memregs.push_back(std::make_pair(static_cast(lreg_size), + /* TODO: this assumes that parsec_ce_mem_reg_handle_t is void* */ + std::shared_ptr{lreg, [](void *ptr) { + parsec_ce_mem_reg_handle_t memreg = + (parsec_ce_mem_reg_handle_t)ptr; + //std::cout << "broadcast_arg memunreg lreg " << memreg << std::endl; + parsec_ce.mem_unregister(&memreg); + }})); + //std::cout << "broadcast_arg memreg lreg " << lreg << std::endl; + } + } + }; + + if constexpr (ttg::has_split_metadata>::value) { + ttg::SplitMetadataDescriptor descr; + /* pack the metadata */ + auto metadata = descr.get_metadata(value); + size_t metadata_size = sizeof(metadata); + pos = pack(metadata, msg->bytes, pos); + auto iovs = descr.get_data(*const_cast(&value)); + num_iovs = std::distance(std::begin(iovs), std::end(iovs)); + memregs.reserve(num_iovs); + handle_iovs_fn(iovs); + //std::cout << "broadcast_arg splitmd num_iovecs " << num_iovs << std::endl; + } else if constexpr (!ttg::has_split_metadata>::value) { + /* serialize the object once */ + pos = pack(value, msg->bytes, pos, copy); + num_iovs = std::distance(copy->iovec_begin(), copy->iovec_end()); + handle_iovs_fn(copy->iovec_span()); + copy->iovec_reset(); + } + + msg->tt_id.num_iovecs = num_iovs; + + std::size_t save_pos = pos; parsec_taskpool_t *tp = world_impl.taskpool(); for (auto it = keylist_sorted.begin(); it < keylist_sorted.end(); /* increment done inline */) { + auto owner = keymap(*it); if (owner == rank) { local_begin = it; @@ -2224,8 +2909,42 @@ namespace ttg_parsec { continue; } - /* count keys and set it afterwards */ - uint64_t pos = 0; + /* rewind the buffer and start packing a new set of memregs and keys */ + pos = save_pos; + /** + * pack the registration handles + * memory layout: [, ...] + * NOTE: we need to pack these for every receiver to ensure correct ref-counting of the registration + */ + if (!inline_data) { + for (int idx = 0; idx < num_iovs; ++idx) { + // auto [lreg_size, lreg_ptr] = memregs[idx]; + int32_t lreg_size; + std::shared_ptr lreg_ptr; + std::tie(lreg_size, lreg_ptr) = memregs[idx]; + std::memcpy(msg->bytes + pos, &lreg_size, sizeof(lreg_size)); + pos += sizeof(lreg_size); + std::memcpy(msg->bytes + pos, lreg_ptr.get(), lreg_size); + pos += lreg_size; + //std::cout << "broadcast_arg lreg_ptr " << lreg_ptr.get() << std::endl; + /* mark another reader on the copy */ + copy = detail::register_data_copy(copy, nullptr, true); + /* create a function that will be invoked upon RMA completion at the target */ + std::function *fn = new std::function([=]() mutable { + /* shared_ptr of value and registration captured by value so resetting + * them here will eventually release the memory/registration */ + detail::release_data_copy(copy); + lreg_ptr.reset(); + }); + std::intptr_t fn_ptr{reinterpret_cast(fn)}; + std::memcpy(msg->bytes + pos, &fn_ptr, sizeof(fn_ptr)); + pos += sizeof(fn_ptr); + } + } + + /* mark the beginning of the keys */ + msg->tt_id.key_offset = pos; + /* pack all keys for this owner */ int num_keys = 0; do { @@ -2235,55 +2954,9 @@ namespace ttg_parsec { } while (it < keylist_sorted.end() && keymap(*it) == owner); msg->tt_id.num_keys = num_keys; - /* pack the metadata */ - std::memcpy(msg->bytes + pos, &metadata, metadata_size); - pos += metadata_size; - /* pack the local rank */ - int rank = world.rank(); - std::memcpy(msg->bytes + pos, &rank, sizeof(rank)); - pos += sizeof(rank); - /* pack the number of iovecs */ - std::memcpy(msg->bytes + pos, &num_iovs, sizeof(num_iovs)); - pos += sizeof(num_iovs); - - /* TODO: at the 
moment, the tag argument to parsec_ce.get() is treated as a - * raw function pointer instead of a preregistered AM tag, so play that game. - * Once this is fixed in PaRSEC we need to use parsec_ttg_rma_tag instead! */ - parsec_ce_tag_t cbtag = reinterpret_cast(&detail::get_remote_complete_cb); - std::memcpy(msg->bytes + pos, &cbtag, sizeof(cbtag)); - pos += sizeof(cbtag); - - /** - * pack the registration handles - * memory layout: [, ...] - */ - int idx = 0; - for (auto &&iov : iovs) { - // auto [lreg_size, lreg_ptr] = memregs[idx]; - int32_t lreg_size; - std::shared_ptr lreg_ptr; - std::tie(lreg_size, lreg_ptr) = memregs[idx]; - std::memcpy(msg->bytes + pos, &lreg_size, sizeof(lreg_size)); - pos += sizeof(lreg_size); - std::memcpy(msg->bytes + pos, lreg_ptr.get(), lreg_size); - pos += lreg_size; - /* create a function that will be invoked upon RMA completion at the target */ - std::shared_ptr lreg_ptr_v = lreg_ptr; - /* mark another reader on the copy */ - copy = detail::register_data_copy(copy, nullptr, true); - std::function *fn = new std::function([=]() mutable { - /* shared_ptr of value and registration captured by value so resetting - * them here will eventually release the memory/registration */ - detail::release_data_copy(copy); - lreg_ptr_v.reset(); - }); - std::intptr_t fn_ptr{reinterpret_cast(fn)}; - std::memcpy(msg->bytes + pos, &fn_ptr, sizeof(fn_ptr)); - pos += sizeof(fn_ptr); - ++idx; - } tp->tdm.module->outgoing_message_start(tp, owner, NULL); tp->tdm.module->outgoing_message_pack(tp, owner, NULL, NULL, 0); + //std::cout << "broadcast_arg send_am owner " << owner << std::endl; parsec_ce.send_am(&parsec_ce, world_impl.parsec_ttg_tag(), owner, static_cast(msg.get()), sizeof(msg_header_t) + pos); } @@ -2341,13 +3014,13 @@ namespace ttg_parsec { /// \param size positive integer that specifies the default stream size template void set_static_argstream_size(std::size_t size) { - assert(std::get(input_reducers) && "TT::set_argstream_size called on nonstreaming input terminal"); + assert(std::get(input_reducers) && "TT::set_static_argstream_size called on nonstreaming input terminal"); assert(size > 0 && "TT::set_static_argstream_size(key,size) called with size=0"); this->trace(world.rank(), ":", get_name(), ": setting global stream size for terminal ", i); // Check if stream is already bounded - if (static_stream_goal[i] > 0) { + if (static_stream_goal[i] < std::numeric_limits::max()) { ttg::print_error(world.rank(), ":", get_name(), " : error stream is already bounded : ", i); throw std::runtime_error("TT::set_static_argstream_size called for a bounded stream"); } @@ -2372,10 +3045,10 @@ namespace ttg_parsec { auto &world_impl = world.impl(); uint64_t pos = 0; std::unique_ptr msg = std::make_unique(get_instance_id(), world_impl.taskpool()->taskpool_id, - msg_header_t::MSG_SET_ARGSTREAM_SIZE, i, 1); + msg_header_t::MSG_SET_ARGSTREAM_SIZE, i, + world_impl.rank(), 1); /* pack the key */ pos = pack(key, msg->bytes, pos); - msg->tt_id.num_keys = 1; pos = pack(size, msg->bytes, pos); parsec_taskpool_t *tp = world_impl.taskpool(); tp->tdm.module->outgoing_message_start(tp, owner, NULL); @@ -2398,16 +3071,22 @@ namespace ttg_parsec { #endif } } + parsec_hash_table_unlock_bucket(&tasks_table, hk); // TODO: Unfriendly implementation, cannot check if stream is already bounded // TODO: Unfriendly implementation, cannot check if stream has been finalized already // commit changes - task->stream[i].goal = size; - bool release = (task->stream[i].size == task->stream[i].goal); - 
parsec_hash_table_unlock_bucket(&tasks_table, hk); - - if (release) release_task(task); + // 1) "lock" the stream by incrementing the reduce_count + // 2) set the goal + // 3) "unlock" the stream + // only one thread will see the reduce_count be zero and the goal match the size + task->streams[i].reduce_count.fetch_add(1, std::memory_order_acquire); + task->streams[i].goal = size; + auto c = task->streams[i].reduce_count.fetch_sub(1, std::memory_order_release); + if (1 == c && (task->streams[i].size >= size)) { + release_task(task); + } } } @@ -2427,9 +3106,8 @@ namespace ttg_parsec { auto &world_impl = world.impl(); uint64_t pos = 0; std::unique_ptr msg = std::make_unique(get_instance_id(), world_impl.taskpool()->taskpool_id, - msg_header_t::MSG_SET_ARGSTREAM_SIZE, i, 1); - /* pack the key */ - msg->tt_id.num_keys = 0; + msg_header_t::MSG_SET_ARGSTREAM_SIZE, i, + world_impl.rank(), 0); pos = pack(size, msg->bytes, pos); parsec_taskpool_t *tp = world_impl.taskpool(); tp->tdm.module->outgoing_message_start(tp, owner, NULL); @@ -2452,16 +3130,22 @@ namespace ttg_parsec { #endif } } + parsec_hash_table_unlock_bucket(&tasks_table, hk); // TODO: Unfriendly implementation, cannot check if stream is already bounded // TODO: Unfriendly implementation, cannot check if stream has been finalized already // commit changes - task->stream[i].goal = size; - bool release = (task->stream[i].size == task->stream[i].goal); - parsec_hash_table_unlock_bucket(&tasks_table, hk); - - if (release) release_task(task); + // 1) "lock" the stream by incrementing the reduce_count + // 2) set the goal + // 3) "unlock" the stream + // only one thread will see the reduce_count be zero and the goal match the size + task->streams[i].reduce_count.fetch_add(1, std::memory_order_acquire); + task->streams[i].goal = size; + auto c = task->streams[i].reduce_count.fetch_sub(1, std::memory_order_release); + if (1 == c && (task->streams[i].size >= size)) { + release_task(task); + } } } @@ -2479,10 +3163,10 @@ namespace ttg_parsec { auto &world_impl = world.impl(); uint64_t pos = 0; std::unique_ptr msg = std::make_unique(get_instance_id(), world_impl.taskpool()->taskpool_id, - msg_header_t::MSG_FINALIZE_ARGSTREAM_SIZE, i, 1); + msg_header_t::MSG_FINALIZE_ARGSTREAM_SIZE, i, + world_impl.rank(), 1); /* pack the key */ pos = pack(key, msg->bytes, pos); - msg->tt_id.num_keys = 1; parsec_taskpool_t *tp = world_impl.taskpool(); tp->tdm.module->outgoing_message_start(tp, owner, NULL); tp->tdm.module->outgoing_message_pack(tp, owner, NULL, NULL, 0); @@ -2493,8 +3177,8 @@ namespace ttg_parsec { auto hk = reinterpret_cast(&key); task_t *task = nullptr; - parsec_hash_table_lock_bucket(&tasks_table, hk); - if (nullptr == (task = (task_t *)parsec_hash_table_nolock_find(&tasks_table, hk))) { + //parsec_hash_table_lock_bucket(&tasks_table, hk); + if (nullptr == (task = (task_t *)parsec_hash_table_find(&tasks_table, hk))) { ttg::print_error(world.rank(), ":", get_name(), ":", key, " : error finalize called on stream that never received an input data: ", i); throw std::runtime_error("TT::finalize called on stream that never received an input data"); @@ -2504,10 +3188,16 @@ namespace ttg_parsec { // TODO: Unfriendly implementation, cannot check if stream has been finalized already // commit changes - task->stream[i].size = 1; - parsec_hash_table_unlock_bucket(&tasks_table, hk); - - release_task(task); + // 1) "lock" the stream by incrementing the reduce_count + // 2) set the goal + // 3) "unlock" the stream + // only one thread will see the 
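The repeated "lock the stream by incrementing reduce_count" sequence is a small publication protocol: bumping the counter keeps a concurrent reducer from observing the goal mid-update, and exactly one thread can see the decrement return 1 with the goal already met. A sketch of that protocol in isolation:

#include <atomic>
#include <cstddef>

struct stream {
  std::atomic<std::size_t> reduce_count{0};
  std::size_t size = 0;  // inputs seen so far
  std::size_t goal = 0;
};

// Returns true iff the caller is the one thread that must run release_task().
bool set_goal_and_check_release(stream &s, std::size_t goal) {
  s.reduce_count.fetch_add(1, std::memory_order_acquire);  // "lock" the stream
  s.goal = goal;                                           // publish the goal
  auto c = s.reduce_count.fetch_sub(1, std::memory_order_release);  // "unlock"
  return (c == 1 && s.size >= goal);
}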
reduce_count be zero and the goal match the size + task->streams[i].reduce_count.fetch_add(1, std::memory_order_acquire); + task->streams[i].goal = 1; + auto c = task->streams[i].reduce_count.fetch_sub(1, std::memory_order_release); + if (1 == c && (task->streams[i].size >= 1)) { + release_task(task); + } } } @@ -2525,8 +3215,8 @@ namespace ttg_parsec { auto &world_impl = world.impl(); uint64_t pos = 0; std::unique_ptr msg = std::make_unique(get_instance_id(), world_impl.taskpool()->taskpool_id, - msg_header_t::MSG_FINALIZE_ARGSTREAM_SIZE, i, 1); - msg->tt_id.num_keys = 0; + msg_header_t::MSG_FINALIZE_ARGSTREAM_SIZE, i, + world_impl.rank(), 0); parsec_taskpool_t *tp = world_impl.taskpool(); tp->tdm.module->outgoing_message_start(tp, owner, NULL); tp->tdm.module->outgoing_message_pack(tp, owner, NULL, NULL, 0); @@ -2537,8 +3227,7 @@ namespace ttg_parsec { auto hk = static_cast(0); task_t *task = nullptr; - parsec_hash_table_lock_bucket(&tasks_table, hk); - if (nullptr == (task = (task_t *)parsec_hash_table_nolock_find(&tasks_table, hk))) { + if (nullptr == (task = (task_t *)parsec_hash_table_find(&tasks_table, hk))) { ttg::print_error(world.rank(), ":", get_name(), " : error finalize called on stream that never received an input data: ", i); throw std::runtime_error("TT::finalize called on stream that never received an input data"); @@ -2548,11 +3237,170 @@ namespace ttg_parsec { // TODO: Unfriendly implementation, cannot check if stream has been finalized already // commit changes - task->stream[i].size = 1; - parsec_hash_table_unlock_bucket(&tasks_table, hk); + // 1) "lock" the stream by incrementing the reduce_count + // 2) set the goal + // 3) "unlock" the stream + // only one thread will see the reduce_count be zero and the goal match the size + task->streams[i].reduce_count.fetch_add(1, std::memory_order_acquire); + task->streams[i].goal = 1; + auto c = task->streams[i].reduce_count.fetch_sub(1, std::memory_order_release); + if (1 == c && (task->streams[i].size >= 1)) { + release_task(task); + } + } + } + + void copy_mark_pushout(detail::ttg_data_copy_t *copy) { + + assert(detail::parsec_ttg_caller->dev_ptr && detail::parsec_ttg_caller->dev_ptr->gpu_task); + parsec_gpu_task_t *gpu_task = detail::parsec_ttg_caller->dev_ptr->gpu_task; + auto check_parsec_data = [&](parsec_data_t* data) { + if (data->owner_device != 0) { + /* find the flow */ + int flowidx = 0; + while (flowidx < MAX_PARAM_COUNT && + gpu_task->flow[flowidx]->flow_flags != PARSEC_FLOW_ACCESS_NONE) { + if (detail::parsec_ttg_caller->parsec_task.data[flowidx].data_in->original == data) { + /* found the right data, set the corresponding flow as pushout */ + break; + } + ++flowidx; + } + if (flowidx == MAX_PARAM_COUNT) { + throw std::runtime_error("Cannot add more than MAX_PARAM_COUNT flows to a task!"); + } + if (gpu_task->flow[flowidx]->flow_flags == PARSEC_FLOW_ACCESS_NONE) { + /* no flow found, add one and mark it pushout */ + detail::parsec_ttg_caller->parsec_task.data[flowidx].data_in = data->device_copies[0]; + gpu_task->flow_nb_elts[flowidx] = data->nb_elts; + } + /* need to mark the flow RW to make PaRSEC happy */ + ((parsec_flow_t *)gpu_task->flow[flowidx])->flow_flags |= PARSEC_FLOW_ACCESS_RW; + gpu_task->pushout |= 1<foreach_parsec_data(check_parsec_data); + } + + + /* check whether a data needs to be pushed out */ + template + std::enable_if_t>, + void> + do_prepare_send(const Value &value, RemoteCheckFn&& remote_check) { + using valueT = std::tuple_element_t; + static constexpr const bool value_is_const = 
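copy_mark_pushout walks the GPU task's flows to find (or append) the flow carrying the datum, then sets the corresponding bit in the pushout mask so the runtime writes the device copy back to the host. A structural sketch with invented constants standing in for PaRSEC's flow flags:

#include <cstdint>
#include <stdexcept>

constexpr int MAX_PARAM_COUNT = 16;  // assumed flow limit per task
constexpr int ACCESS_NONE = 0;       // stand-in for PARSEC_FLOW_ACCESS_NONE

struct flow { int flags = ACCESS_NONE; const void *data = nullptr; };
struct gpu_task { flow flows[MAX_PARAM_COUNT]; uint32_t pushout = 0; };

void mark_pushout(gpu_task &t, const void *data) {
  int idx = 0;
  while (idx < MAX_PARAM_COUNT && t.flows[idx].flags != ACCESS_NONE) {
    if (t.flows[idx].data == data) break;  // found the flow for this datum
    ++idx;
  }
  if (idx == MAX_PARAM_COUNT)
    throw std::runtime_error("Cannot add more than MAX_PARAM_COUNT flows to a task!");
  if (t.flows[idx].flags == ACCESS_NONE) {
    t.flows[idx].data = data;  // no flow yet: append one for this datum
    t.flows[idx].flags = 1;    // stand-in for marking the flow RW
  }
  t.pushout |= uint32_t{1} << idx;  // request device->host writeback
}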
std::is_const_v; + + /* get the copy */ + detail::ttg_data_copy_t *copy; + copy = detail::find_copy_in_task(detail::parsec_ttg_caller, &value); + + /* if there is no copy we don't need to prepare anything */ + if (nullptr == copy) { + return; + } + + detail::parsec_ttg_task_base_t *caller = detail::parsec_ttg_caller; + bool need_pushout = false; + + if (caller->data_flags & detail::ttg_parsec_data_flags::MARKED_PUSHOUT) { + /* already marked pushout, skip the rest */ + return; + } - release_task(task); + /* TODO: remove this once we support reductions on the GPU */ + auto &reducer = std::get(input_reducers); + if (reducer) { + /* reductions are currently done only on the host so push out */ + copy_mark_pushout(copy); + caller->data_flags |= detail::ttg_parsec_data_flags::MARKED_PUSHOUT; + return; + } + + if constexpr (value_is_const) { + if (caller->data_flags & detail::ttg_parsec_data_flags::IS_MODIFIED) { + /* The data has been modified previously. PaRSEC requires us to pushout + * data if we transition from a writer to one or more readers. */ + need_pushout = true; + } + + /* check for multiple readers */ + if (caller->data_flags & detail::ttg_parsec_data_flags::SINGLE_READER) { + caller->data_flags |= detail::ttg_parsec_data_flags::MULTIPLE_READER; + } + + if (caller->data_flags & detail::ttg_parsec_data_flags::SINGLE_WRITER) { + /* there is a writer already, we will need to create a copy */ + need_pushout = true; + } + + caller->data_flags |= detail::ttg_parsec_data_flags::SINGLE_READER; + } else { + if (caller->data_flags & detail::ttg_parsec_data_flags::SINGLE_WRITER) { + caller->data_flags |= detail::ttg_parsec_data_flags::MULTIPLE_WRITER; + need_pushout = true; + } else { + if (caller->data_flags & detail::ttg_parsec_data_flags::SINGLE_READER) { + /* there are readers, we will need to create a copy */ + need_pushout = true; + } + caller->data_flags |= detail::ttg_parsec_data_flags::SINGLE_WRITER; + } + } + + if constexpr (!derived_has_device_op()) { + need_pushout = true; } + + /* check if there are non-local successors if it's a device task */ + if (!need_pushout) { + bool device_supported = false; + if constexpr (derived_has_cuda_op()) { + device_supported = !world.impl().mpi_support(ttg::ExecutionSpace::CUDA); + } else if constexpr (derived_has_hip_op()) { + device_supported = !world.impl().mpi_support(ttg::ExecutionSpace::HIP); + } else if constexpr (derived_has_level_zero_op()) { + device_supported = !world.impl().mpi_support(ttg::ExecutionSpace::L0); + } + /* if MPI supports the device we don't care whether we have remote peers + * because we can send from the device directly */ + if (!device_supported) { + need_pushout = remote_check(); + } + } + + if (need_pushout) { + copy_mark_pushout(copy); + caller->data_flags |= detail::ttg_parsec_data_flags::MARKED_PUSHOUT; + } + } + + /* check whether a data needs to be pushed out */ + template + std::enable_if_t && !std::is_void_v>, + void> + prepare_send(const ttg::span &keylist, const Value &value) { + auto remote_check = [&](){ + auto world = ttg_default_execution_context(); + int rank = world.rank(); + uint64_t pos = 0; + bool remote = keylist.end() != std::find_if(keylist.begin(), keylist.end(), + [&](const Key &key) { return keymap(key) != rank; }); + return remote; + }; + do_prepare_send(value, remote_check); + } + + template + std::enable_if_t && !std::is_void_v>, + void> + prepare_send(const Value &value) { + auto remote_check = [&](){ + auto world = ttg_default_execution_context(); + int rank = world.rank(); + return 
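// The branchy flag logic in do_prepare_send above reduces to a small state
// machine over reader/writer transitions. Hedged, standalone sketch (the flag
// names mirror the diff, but this free function is illustrative, not TTG API):
#include <cstdint>
enum data_flags : std::uint8_t {
  NONE = 0, SINGLE_READER = 1, MULTIPLE_READER = 2,
  SINGLE_WRITER = 4, MULTIPLE_WRITER = 8, IS_MODIFIED = 16
};
inline bool needs_pushout(std::uint8_t &f, bool const_access) {
  bool push = false;
  if (const_access) {
    if (f & IS_MODIFIED) push = true;     // writer -> reader(s): PaRSEC needs a pushout
    if (f & SINGLE_READER) f |= MULTIPLE_READER;
    if (f & SINGLE_WRITER) push = true;   // reader joins an existing writer
    f |= SINGLE_READER;
  } else {
    if (f & SINGLE_WRITER) { f |= MULTIPLE_WRITER; push = true; }  // second writer
    else {
      if (f & SINGLE_READER) push = true;  // writer joins existing readers
      f |= SINGLE_WRITER;
    }
  }
  return push;
}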
(keymap() != rank); + }; + do_prepare_send(value, remote_check); } private: @@ -2588,15 +3436,15 @@ namespace ttg_parsec { set_arg(key, value); }; auto broadcast_callback = [this](const ttg::span &keylist, const valueT &value) { - if constexpr (ttg::has_split_metadata>::value) { - splitmd_broadcast_arg(keylist, value); - } else { broadcast_arg(keylist, value); - } + }; + auto prepare_send_callback = [this](const ttg::span &keylist, const valueT &value) { + prepare_send(keylist, value); }; auto setsize_callback = [this](const keyT &key, std::size_t size) { set_argstream_size(key, size); }; auto finalize_callback = [this](const keyT &key) { finalize_argstream(key); }; - input.set_callback(send_callback, move_callback, broadcast_callback, setsize_callback, finalize_callback); + input.set_callback(send_callback, move_callback, broadcast_callback, + setsize_callback, finalize_callback, prepare_send_callback); } ////////////////////////////////////////////////////////////////// // case 2: nonvoid key, void value, mixed inputs @@ -2619,7 +3467,10 @@ namespace ttg_parsec { auto send_callback = [this](const valueT &value) { set_arg(value); }; auto setsize_callback = [this](std::size_t size) { set_argstream_size(size); }; auto finalize_callback = [this]() { finalize_argstream(); }; - input.set_callback(send_callback, move_callback, {}, setsize_callback, finalize_callback); + auto prepare_send_callback = [this](const valueT &value) { + prepare_send(value); + }; + input.set_callback(send_callback, move_callback, {}, setsize_callback, finalize_callback, prepare_send_callback); } ////////////////////////////////////////////////////////////////// // case 5: void key, void value, mixed inputs @@ -2635,7 +3486,7 @@ namespace ttg_parsec { // NOTE: subsumed in case 5 above, kept for historical reasons ////////////////////////////////////////////////////////////////// else - abort(); + ttg::abort(); } template @@ -2659,6 +3510,7 @@ namespace ttg_parsec { junk[0]++; } +#if 0 template void _initialize_flows(std::index_sequence, flowsT &&flows) { int junk[] = {0, @@ -2674,6 +3526,7 @@ namespace ttg_parsec { _initialize_flows( std::make_index_sequence::value>{}, flows); } +#endif // 0 void fence() override { ttg::default_execution_context().impl().fence(); } @@ -2719,24 +3572,22 @@ namespace ttg_parsec { return reinterpret_cast(key); } - static char *parsec_ttg_task_snprintf(char *buffer, size_t buffer_size, const parsec_task_t *t) { + static char *parsec_ttg_task_snprintf(char *buffer, size_t buffer_size, const parsec_task_t *parsec_task) { if(buffer_size == 0) return buffer; if constexpr (ttg::meta::is_void_v) { - snprintf(buffer, buffer_size, "%s()[]<%d>", t->task_class->name, t->priority); + snprintf(buffer, buffer_size, "%s()[]<%d>", parsec_task->task_class->name, parsec_task->priority); } else { - // we use the locals array as a scratchpad to store the hash of the key and its actual address - // locals[0] amd locals[1] hold the hash, while locals[2] and locals[3] hold the key pointer - keyT *key = *(keyT**)&(t->locals[2]); + const task_t *task = reinterpret_cast(parsec_task); std::stringstream ss; - ss << *key; + ss << task->key; std::string keystr = ss.str(); std::replace(keystr.begin(), keystr.end(), '(', ':'); std::replace(keystr.begin(), keystr.end(), ')', ':'); - snprintf(buffer, buffer_size, "%s(%s)[]<%d>", t->task_class->name, keystr.c_str(), t->priority); + snprintf(buffer, buffer_size, "%s(%s)[]<%d>", parsec_task->task_class->name, keystr.c_str(), parsec_task->priority); } return buffer; } @@ 
-2744,19 +3595,14 @@ namespace ttg_parsec { #if defined(PARSEC_PROF_TRACE) static void *parsec_ttg_task_info(void *dst, const void *data, size_t size) { - const parsec_task_t *t = reinterpret_cast(data); + const task_t *task = reinterpret_cast(data); if constexpr (ttg::meta::is_void_v) { snprintf(reinterpret_cast(dst), size, "()"); } else { - // we use the locals array as a scratchpad to store the hash of the key and its actual address - // locals[0] amd locals[1] hold the hash, while locals[2] and locals[3] hold the key pointer - keyT *key = *(keyT**)&(t->locals[2]); std::stringstream ss; - ss << *key; - - std::string keystr = ss.str(); - snprintf(reinterpret_cast(dst), size, "%s", keystr.c_str()); + ss << task->key; + snprintf(reinterpret_cast(dst), size, "%s", ss.str().c_str()); } return dst; } @@ -2764,13 +3610,65 @@ namespace ttg_parsec { parsec_key_fn_t tasks_hash_fcts = {key_equal, key_print, key_hash}; - static parsec_hook_return_t complete_task_and_release(parsec_execution_stream_t *es, parsec_task_t *t) { - auto *task = (detail::parsec_ttg_task_base_t *)t; + template + inline static void increment_data_version_impl(task_t *task) { + if constexpr (!std::is_const_v>) { + if (task->copies[I] != nullptr){ + task->copies[I]->inc_current_version(); + } + } + } + + template + inline static void increment_data_versions(task_t *task, std::index_sequence) { + /* increment version of each mutable data */ + int junk[] = {0, (increment_data_version_impl(task), 0)...}; + junk[0]++; + } + + static parsec_hook_return_t complete_task_and_release(parsec_execution_stream_t *es, parsec_task_t *parsec_task) { + + //std::cout << "complete_task_and_release: task " << parsec_task << std::endl; + + task_t *task = (task_t*)parsec_task; + + /* if we still have a coroutine handle we invoke it one more time to get the sends/broadcasts */ + if (task->suspended_task_address) { + assert(task->coroutine_id != ttg::TaskCoroutineID::Invalid); +#ifdef TTG_HAVE_DEVICE + if (task->coroutine_id == ttg::TaskCoroutineID::DeviceTask) { + /* increment versions of all data we might have modified + * this must happen before we issue the sends */ + //increment_data_versions(task, std::make_index_sequence>{}); + + // get the device task from the coroutine handle + auto dev_task = ttg::device::detail::device_task_handle_type::from_address(task->suspended_task_address); + + // get the promise which contains the views + auto dev_data = dev_task.promise(); + + /* for now make sure we're waiting for the kernel to complete and the coro hasn't skipped this step */ + assert(dev_data.state() == ttg::device::detail::TTG_DEVICE_CORO_SENDOUT); + + /* execute the sends we stored */ + if (dev_data.state() == ttg::device::detail::TTG_DEVICE_CORO_SENDOUT) { + /* set the current task, needed inside the sends */ + detail::parsec_ttg_caller = task; + dev_data.do_sends(); + detail::parsec_ttg_caller = nullptr; + } + } +#endif // TTG_HAVE_DEVICE + /* the coroutine should have completed and we cannot access the promise anymore */ + task->suspended_task_address = nullptr; + } + + /* release our data copies */ for (int i = 0; i < task->data_count; i++) { - detail::ttg_data_copy_t *copy = static_cast(task->parsec_task.data[i].data_in); + detail::ttg_data_copy_t *copy = task->copies[i]; if (nullptr == copy) continue; detail::release_data_copy(copy); - task->parsec_task.data[i].data_in = nullptr; + task->copies[i] = nullptr; } return PARSEC_HOOK_RETURN_DONE; } @@ -2786,8 +3684,7 @@ namespace ttg_parsec { , keymap(std::is_same>::value ? 
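// complete_task_and_release above finishes a suspended device coroutine: it
// rebuilds the typed handle from the stored address, checks the promise, and
// performs the deferred sends before dropping the handle. The bare shape of
// that pattern (hypothetical promise type; TTG's real promise carries more
// state and is accessed through its device_task_handle_type):
#include <coroutine>
struct send_promise;
using send_handle_t = std::coroutine_handle<send_promise>;
struct send_promise {
  bool ready_to_send = false;
  void do_sends() { /* flush the sends recorded while the task ran */ }
};
inline void finish_suspended(void *&suspended_address) {
  if (!suspended_address) return;
  auto h = send_handle_t::from_address(suspended_address);  // rebuild typed handle
  if (h.promise().ready_to_send) h.promise().do_sends();
  suspended_address = nullptr;  // the promise must not be touched afterwards
}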
decltype(keymap)(ttg::detail::default_keymap(world)) : decltype(keymap)(std::forward(keymap_))) - , priomap(decltype(keymap)(std::forward(priomap_))) - , static_stream_goal() { + , priomap(decltype(keymap)(std::forward(priomap_))) { // Cannot call these in base constructor since terminals not yet constructed if (innames.size() != numinedges) throw std::logic_error("ttg_parsec::TT: #input names != #input terminals"); if (outnames.size() != numouts) throw std::logic_error("ttg_parsec::TT: #output names != #output terminals"); @@ -2812,7 +3709,9 @@ namespace ttg_parsec { self.task_class_id = get_instance_id(); self.nb_parameters = 0; self.nb_locals = 0; - self.nb_flows = numflows; + //self.nb_flows = numflows; + self.nb_flows = MAX_PARAM_COUNT; // we're not using all flows but have to + // trick the device handler into looking at all of them if( world_impl.profiling() ) { // first two ints are used to store the hash of the key. @@ -2839,32 +3738,49 @@ namespace ttg_parsec { world_impl.taskpool()->nb_task_classes = std::max(world_impl.taskpool()->nb_task_classes, static_castnb_task_classes)>(self.task_class_id+1)); // function_id_to_instance[self.task_class_id] = this; - + //self.incarnations = incarnations_array.data(); +//#if 0 if constexpr (derived_has_cuda_op()) { self.incarnations = (__parsec_chore_t *)malloc(3 * sizeof(__parsec_chore_t)); ((__parsec_chore_t *)self.incarnations)[0].type = PARSEC_DEV_CUDA; ((__parsec_chore_t *)self.incarnations)[0].evaluate = NULL; - ((__parsec_chore_t *)self.incarnations)[0].hook = detail::hook_cuda; - ((__parsec_chore_t *)self.incarnations)[1].type = PARSEC_DEV_CPU; + ((__parsec_chore_t *)self.incarnations)[0].hook = &detail::hook_cuda; + ((__parsec_chore_t *)self.incarnations)[1].type = PARSEC_DEV_NONE; + ((__parsec_chore_t *)self.incarnations)[1].evaluate = NULL; + ((__parsec_chore_t *)self.incarnations)[1].hook = NULL; + } else if (derived_has_hip_op()) { + self.incarnations = (__parsec_chore_t *)malloc(3 * sizeof(__parsec_chore_t)); + ((__parsec_chore_t *)self.incarnations)[0].type = PARSEC_DEV_HIP; + ((__parsec_chore_t *)self.incarnations)[0].evaluate = NULL; + ((__parsec_chore_t *)self.incarnations)[0].hook = &detail::hook_hip; + + ((__parsec_chore_t *)self.incarnations)[1].type = PARSEC_DEV_NONE; + ((__parsec_chore_t *)self.incarnations)[1].evaluate = NULL; + ((__parsec_chore_t *)self.incarnations)[1].hook = NULL; + } else if (derived_has_level_zero_op()) { + self.incarnations = (__parsec_chore_t *)malloc(3 * sizeof(__parsec_chore_t)); + ((__parsec_chore_t *)self.incarnations)[0].type = PARSEC_DEV_LEVEL_ZERO; + ((__parsec_chore_t *)self.incarnations)[0].evaluate = NULL; + ((__parsec_chore_t *)self.incarnations)[0].hook = &detail::hook_level_zero; + + ((__parsec_chore_t *)self.incarnations)[1].type = PARSEC_DEV_NONE; ((__parsec_chore_t *)self.incarnations)[1].evaluate = NULL; - ((__parsec_chore_t *)self.incarnations)[1].hook = detail::hook; - ((__parsec_chore_t *)self.incarnations)[2].type = PARSEC_DEV_NONE; - ((__parsec_chore_t *)self.incarnations)[2].evaluate = NULL; - ((__parsec_chore_t *)self.incarnations)[2].hook = NULL; + ((__parsec_chore_t *)self.incarnations)[1].hook = NULL; } else { self.incarnations = (__parsec_chore_t *)malloc(2 * sizeof(__parsec_chore_t)); ((__parsec_chore_t *)self.incarnations)[0].type = PARSEC_DEV_CPU; ((__parsec_chore_t *)self.incarnations)[0].evaluate = NULL; - ((__parsec_chore_t *)self.incarnations)[0].hook = detail::hook; + ((__parsec_chore_t *)self.incarnations)[0].hook = &detail::hook; ((__parsec_chore_t 
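// Every branch above builds the same chore-table shape: one entry for the
// supported device followed by a PARSEC_DEV_NONE sentinel with a NULL hook.
// Hedged helper capturing the pattern (only the .type/.evaluate/.hook fields
// shown in this diff are used; the parameter types are left generic because
// the PaRSEC typedefs are not spelled out here):
#include <cstdlib>
template <typename DevT, typename HookT>
__parsec_chore_t *make_single_device_chores(DevT dev_type, HookT hook) {
  auto *chores = (__parsec_chore_t *)std::malloc(2 * sizeof(__parsec_chore_t));
  chores[0].type = dev_type;          // e.g. PARSEC_DEV_CPU or PARSEC_DEV_CUDA
  chores[0].evaluate = NULL;
  chores[0].hook = hook;              // e.g. &detail::hook or &detail::hook_cuda
  chores[1].type = PARSEC_DEV_NONE;   // sentinel terminating the incarnation list
  chores[1].evaluate = NULL;
  chores[1].hook = NULL;
  return chores;
}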
*)self.incarnations)[1].type = PARSEC_DEV_NONE; ((__parsec_chore_t *)self.incarnations)[1].evaluate = NULL; ((__parsec_chore_t *)self.incarnations)[1].hook = NULL; } +//#endif // 0 self.release_task = &parsec_release_task_to_mempool_update_nbtasks; self.complete_execution = complete_task_and_release; - for (i = 0; i < numins; i++) { + for (i = 0; i < MAX_PARAM_COUNT; i++) { parsec_flow_t *flow = new parsec_flow_t; flow->name = strdup((std::string("flow in") + std::to_string(i)).c_str()); flow->sym_type = PARSEC_SYM_INOUT; @@ -2873,13 +3789,13 @@ namespace ttg_parsec { flow->dep_in[0] = NULL; flow->dep_out[0] = NULL; flow->flow_index = i; - flow->flow_datatype_mask = (1 << i); + flow->flow_datatype_mask = ~0; *((parsec_flow_t **)&(self.in[i])) = flow; } - *((parsec_flow_t **)&(self.in[i])) = NULL; - initialize_flows(self.in); + //*((parsec_flow_t **)&(self.in[i])) = NULL; + //initialize_flows(self.in); - for (i = 0; i < numouts; i++) { + for (i = 0; i < MAX_PARAM_COUNT; i++) { parsec_flow_t *flow = new parsec_flow_t; flow->name = strdup((std::string("flow out") + std::to_string(i)).c_str()); flow->sym_type = PARSEC_SYM_INOUT; @@ -2890,7 +3806,7 @@ namespace ttg_parsec { flow->flow_datatype_mask = (1 << i); *((parsec_flow_t **)&(self.out[i])) = flow; } - *((parsec_flow_t **)&(self.out[i])) = NULL; + //*((parsec_flow_t **)&(self.out[i])) = NULL; self.flags = 0; self.dependencies_goal = numins; /* (~(uint32_t)0) >> (32 - numins); */ @@ -2942,6 +3858,13 @@ namespace ttg_parsec { free((void*)self.name); self.name = nullptr; } + + for (std::size_t i = 0; i < numins; ++i) { + if (inpute_reducers_taskclass[i] != nullptr) { + std::free(inpute_reducers_taskclass[i]); + inpute_reducers_taskclass[i] = nullptr; + } + } release(); } @@ -2955,6 +3878,10 @@ namespace ttg_parsec { } } + void print_incomplete_tasks() { + parsec_hash_table_for_all(&tasks_table, ht_iter_cb, this); + } + virtual void release() override { do_release(); } void do_release() { @@ -2963,20 +3890,22 @@ namespace ttg_parsec { } alive = false; /* print all outstanding tasks */ - parsec_hash_table_for_all(&tasks_table, ht_iter_cb, this); + print_incomplete_tasks(); parsec_hash_table_fini(&tasks_table); parsec_mempool_destruct(&mempools); // uintptr_t addr = (uintptr_t)self.incarnations; // free((void *)addr); free((__parsec_chore_t *)self.incarnations); - for (int i = 0; i < numflows; i++) { + for (int i = 0; i < MAX_PARAM_COUNT; i++) { if (NULL != self.in[i]) { free(self.in[i]->name); delete self.in[i]; + self.in[i] = nullptr; } if (NULL != self.out[i]) { free(self.out[i]->name); delete self.out[i]; + self.out[i] = nullptr; } } world.impl().deregister_op(this); @@ -2987,18 +3916,86 @@ namespace ttg_parsec { /// define the reducer function to be called when additional inputs are /// received on a streaming terminal /// @tparam the index of the input terminal that is used as a streaming terminal - /// @param[in] reducer: a function of prototype (input_type &a, const input_type &b) + /// @param[in] reducer: a function of prototype `void(input_type &a, const input_type &b)` /// that function should aggregate b into a template void set_input_reducer(Reducer &&reducer) { ttg::trace(world.rank(), ":", get_name(), " : setting reducer for terminal ", i); std::get(input_reducers) = reducer; + + parsec_task_class_t *tc = inpute_reducers_taskclass[i]; + if (nullptr == tc) { + tc = (parsec_task_class_t *)std::calloc(1, sizeof(*tc)); + inpute_reducers_taskclass[i] = tc; + + tc->name = strdup((get_name() + std::string(" reducer ") + 
std::to_string(i)).c_str()); + tc->task_class_id = get_instance_id(); + tc->nb_parameters = 0; + tc->nb_locals = 0; + tc->nb_flows = numflows; + + auto &world_impl = world.impl(); + + if( world_impl.profiling() ) { + // first two ints are used to store the hash of the key. + tc->nb_parameters = (sizeof(void*)+sizeof(int)-1)/sizeof(int); + // second two ints are used to store a pointer to the key of the task. + tc->nb_locals = self.nb_parameters + (sizeof(void*)+sizeof(int)-1)/sizeof(int); + + // If we have parameters and locals, we need to define the corresponding dereference arrays + tc->params[0] = &detail::parsec_taskclass_param0; + tc->params[1] = &detail::parsec_taskclass_param1; + + tc->locals[0] = &detail::parsec_taskclass_param0; + tc->locals[1] = &detail::parsec_taskclass_param1; + tc->locals[2] = &detail::parsec_taskclass_param2; + tc->locals[3] = &detail::parsec_taskclass_param3; + } + tc->make_key = make_key; + tc->key_functions = &tasks_hash_fcts; + tc->task_snprintf = parsec_ttg_task_snprintf; + +#if defined(PARSEC_PROF_TRACE) + tc->profile_info = &parsec_ttg_task_info; +#endif + + world_impl.taskpool()->nb_task_classes = std::max(world_impl.taskpool()->nb_task_classes, static_castnb_task_classes)>(self.task_class_id+1)); + +#if 0 + // FIXME: currently only support reduction on the host + if constexpr (derived_has_cuda_op()) { + self.incarnations = (__parsec_chore_t *)malloc(3 * sizeof(__parsec_chore_t)); + ((__parsec_chore_t *)self.incarnations)[0].type = PARSEC_DEV_CUDA; + ((__parsec_chore_t *)self.incarnations)[0].evaluate = NULL; + ((__parsec_chore_t *)self.incarnations)[0].hook = detail::hook_cuda; + ((__parsec_chore_t *)self.incarnations)[1].type = PARSEC_DEV_CPU; + ((__parsec_chore_t *)self.incarnations)[1].evaluate = NULL; + ((__parsec_chore_t *)self.incarnations)[1].hook = detail::hook; + ((__parsec_chore_t *)self.incarnations)[2].type = PARSEC_DEV_NONE; + ((__parsec_chore_t *)self.incarnations)[2].evaluate = NULL; + ((__parsec_chore_t *)self.incarnations)[2].hook = NULL; + } else +#endif // 0 + { + tc->incarnations = (__parsec_chore_t *)malloc(2 * sizeof(__parsec_chore_t)); + ((__parsec_chore_t *)tc->incarnations)[0].type = PARSEC_DEV_CPU; + ((__parsec_chore_t *)tc->incarnations)[0].evaluate = NULL; + ((__parsec_chore_t *)tc->incarnations)[0].hook = &static_reducer_op; + ((__parsec_chore_t *)tc->incarnations)[1].type = PARSEC_DEV_NONE; + ((__parsec_chore_t *)tc->incarnations)[1].evaluate = NULL; + ((__parsec_chore_t *)tc->incarnations)[1].hook = NULL; + } + + /* the reduction task does not alter the termination detection because the target task will execute */ + tc->release_task = &parsec_release_task_to_mempool; + tc->complete_execution = NULL; + } } /// define the reducer function to be called when additional inputs are /// received on a streaming terminal /// @tparam the index of the input terminal that is used as a streaming terminal - /// @param[in] reducer: a function of prototype (input_type &a, const input_type &b) + /// @param[in] reducer: a function of prototype `void(input_type &a, const input_type &b)` /// that function should aggregate b into a /// @param[in] size: the default number of inputs that are received in this streaming terminal, /// for each task @@ -3084,6 +4081,49 @@ namespace ttg_parsec { TTBase::invoke(); } + private: + template + void invoke_arglist(std::index_sequence, const Key& key, Arg&& arg, Args&&... 
args) { + using arg_type = std::decay_t; + if constexpr (ttg::meta::is_ptr_v) { + /* add a reference to the object */ + auto copy = ttg_parsec::detail::get_copy(arg); + copy->add_ref(); + /* reset readers so that the value can flow without copying */ + copy->reset_readers(); + auto& val = *arg; + set_arg_impl(key, val, copy); + ttg_parsec::detail::release_data_copy(copy); + if constexpr (std::is_rvalue_reference_v) { + /* if the ptr was moved in we reset it */ + arg.reset(); + } + } else if constexpr (!ttg::meta::is_ptr_v) { + set_arg(key, std::forward(arg)); + } + if constexpr (sizeof...(Is) > 0) { + /* recursive next argument */ + invoke_arglist(std::index_sequence{}, key, std::forward(args)...); + } + } + + public: + // Manual injection of a task with all input arguments specified as variadic arguments + template + std::enable_if_t && !ttg::meta::is_empty_tuple_v, void> invoke( + const Key &key, Arg&& arg, Args&&... args) { + static_assert(sizeof...(Args)+1 == std::tuple_size_v, + "Number of arguments to invoke must match the number of task inputs."); + TTG_OP_ASSERT_EXECUTABLE(); + /* trigger non-void inputs */ + invoke_arglist(ttg::meta::nonvoid_index_seq{}, key, + std::forward(arg), std::forward(args)...); + //set_args(ttg::meta::nonvoid_index_seq{}, key, args); + /* trigger void inputs */ + using void_index_seq = ttg::meta::void_index_seq; + set_args(void_index_seq{}, key, ttg::detail::make_void_tuple()); + } + void set_defer_writer(bool value) { m_defer_writer = value; } @@ -3174,47 +4214,101 @@ template <> struct ttg::detail::value_copy_handler { private: ttg_parsec::detail::ttg_data_copy_t *copy_to_remove = nullptr; + bool do_release = true; public: + value_copy_handler() = default; + value_copy_handler(const value_copy_handler& h) = delete; + value_copy_handler(value_copy_handler&& h) + : copy_to_remove(h.copy_to_remove) + { + h.copy_to_remove = nullptr; + } + + value_copy_handler& operator=(const value_copy_handler& h) = delete; + value_copy_handler& operator=(value_copy_handler&& h) + { + std::swap(copy_to_remove, h.copy_to_remove); + return *this; + } + ~value_copy_handler() { if (nullptr != copy_to_remove) { - ttg_parsec::detail::remove_data_copy(copy_to_remove, parsec_ttg_caller); - ttg_parsec::detail::release_data_copy(copy_to_remove); + ttg_parsec::detail::remove_data_copy(copy_to_remove, ttg_parsec::detail::parsec_ttg_caller); + if (do_release) { + ttg_parsec::detail::release_data_copy(copy_to_remove); + } } } template - inline Value &&operator()(Value &&value) { - if (nullptr == parsec_ttg_caller) { - ttg::print("ERROR: ttg_send or ttg_broadcast called outside of a task!\n"); + inline std::add_lvalue_reference_t operator()(Value &&value) { + static_assert(std::is_rvalue_reference_v || + std::is_copy_constructible_v>, + "Data sent without being moved must be copy-constructible!"); + + auto caller = ttg_parsec::detail::parsec_ttg_caller; + if (nullptr == caller) { + ttg::print("ERROR: ttg::send or ttg::broadcast called outside of a task!\n"); } + using value_type = std::remove_reference_t; ttg_parsec::detail::ttg_data_copy_t *copy; - copy = ttg_parsec::detail::find_copy_in_task(parsec_ttg_caller, &value); - Value *value_ptr = &value; + copy = ttg_parsec::detail::find_copy_in_task(caller, &value); + value_type *value_ptr = &value; if (nullptr == copy) { /** * the value is not known, create a copy that we can track * depending on Value, this uses either the copy or move constructor */ copy = ttg_parsec::detail::create_new_datacopy(std::forward(value)); - bool inserted = 
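// invoke_arglist above walks an index_sequence and the argument pack in
// lock-step, peeling one input per recursion step. The bare pattern, stripped
// of TTG specifics (handle_one is a hypothetical stand-in for set_arg):
#include <cstdio>
#include <utility>
template <std::size_t I, typename Arg>
void handle_one(Arg &&arg) { std::printf("input %zu gets %d\n", I, static_cast<int>(arg)); }
template <std::size_t I, std::size_t... Is, typename Arg, typename... Args>
void apply_indexed(std::index_sequence<I, Is...>, Arg &&arg, Args &&...args) {
  handle_one<I>(std::forward<Arg>(arg));   // dispatch the current input
  if constexpr (sizeof...(Is) > 0)         // then recurse on the remainder
    apply_indexed(std::index_sequence<Is...>{}, std::forward<Args>(args)...);
}
// e.g. apply_indexed(std::index_sequence<0, 2, 3>{}, 10, 20, 30) feeds inputs
// 0, 2 and 3, matching how the non-void terminal indices are enumerated.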
ttg_parsec::detail::add_copy_to_task(copy, parsec_ttg_caller); + bool inserted = ttg_parsec::detail::add_copy_to_task(copy, caller); assert(inserted); - value_ptr = reinterpret_cast(copy->device_private); + value_ptr = reinterpret_cast(copy->get_ptr()); copy_to_remove = copy; } else { - /* this copy won't be modified anymore so mark it as read-only */ - copy->reset_readers(); + if constexpr (std::is_rvalue_reference_v) { + /* this copy won't be modified anymore so mark it as read-only */ + copy->reset_readers(); + } + /* the value was potentially changed, so increment version */ + copy->inc_current_version(); + } + /* We're coming from a writer so mark the data as modified. + * That way we can force a pushout in prepare_send if we move to read-only tasks (needed by PaRSEC). */ + caller->data_flags = ttg_parsec::detail::ttg_parsec_data_flags::IS_MODIFIED; + return *value_ptr; + } + + template + inline std::add_lvalue_reference_t operator()(ttg_parsec::detail::persistent_value_ref vref) { + auto caller = ttg_parsec::detail::parsec_ttg_caller; + if (nullptr == caller) { + ttg::print("ERROR: ttg::send or ttg::broadcast called outside of a task!\n"); } - return std::move(*value_ptr); + ttg_parsec::detail::ttg_data_copy_t *copy; + copy = ttg_parsec::detail::find_copy_in_task(caller, &vref.value_ref); + if (nullptr == copy) { + // no need to create a new copy since it's derived from the copy already + copy = const_cast(static_cast(&vref.value_ref)); + bool inserted = ttg_parsec::detail::add_copy_to_task(copy, caller); + assert(inserted); + copy_to_remove = copy; // we want to remove the copy from the task once done sending + do_release = false; // we don't release the copy since we didn't allocate it + copy->add_ref(); // add a reference so that TTG does not attempt to delete this object + } + return vref.value_ref; } template inline const Value &operator()(const Value &value) { - if (nullptr == parsec_ttg_caller) { - ttg::print("ERROR: ttg_send or ttg_broadcast called outside of a task!\n"); + static_assert(std::is_copy_constructible_v>, + "Data sent without being moved must be copy-constructible!"); + auto caller = ttg_parsec::detail::parsec_ttg_caller; + if (nullptr == caller) { + ttg::print("ERROR: ttg::send or ttg::broadcast called outside of a task!\n"); } ttg_parsec::detail::ttg_data_copy_t *copy; - copy = ttg_parsec::detail::find_copy_in_task(parsec_ttg_caller, &value); + copy = ttg_parsec::detail::find_copy_in_task(caller, &value); const Value *value_ptr = &value; if (nullptr == copy) { /** @@ -3222,30 +4316,15 @@ struct ttg::detail::value_copy_handler { * depending on Value, this uses either the copy or move constructor */ copy = ttg_parsec::detail::create_new_datacopy(value); - bool inserted = ttg_parsec::detail::add_copy_to_task(copy, parsec_ttg_caller); + bool inserted = ttg_parsec::detail::add_copy_to_task(copy, caller); assert(inserted); - value_ptr = reinterpret_cast(copy->device_private); + value_ptr = reinterpret_cast(copy->get_ptr()); copy_to_remove = copy; } + caller->data_flags = ttg_parsec::detail::ttg_parsec_data_flags::NONE; return *value_ptr; } - /* we have to make a copy of non-const data as the user may modify it after - * send/broadcast */ - template >> - inline Value &operator()(Value &value) { - if (nullptr == parsec_ttg_caller) { - ttg::print("ERROR: ttg_send or ttg_broadcast called outside of a task!\n"); - } - /* the value is not known, create a copy that we can track */ - ttg_parsec::detail::ttg_data_copy_t *copy; - copy = 
ttg_parsec::detail::create_new_datacopy(value); - bool inserted = ttg_parsec::detail::add_copy_to_task(copy, parsec_ttg_caller); - assert(inserted); - Value *value_ptr = reinterpret_cast(copy->device_private); - copy_to_remove = copy; - return *value_ptr; - } }; #endif // PARSEC_TTG_H_INCLUDED diff --git a/ttg/ttg/parsec/ttg_data_copy.h b/ttg/ttg/parsec/ttg_data_copy.h index 461984e3d..a4b4575fa 100644 --- a/ttg/ttg/parsec/ttg_data_copy.h +++ b/ttg/ttg/parsec/ttg_data_copy.h @@ -3,36 +3,212 @@ #include #include +#include +#include +#include +#include + +#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) +#include +#endif // PARSEC_HAVE_DEV_CUDA_SUPPORT #include +#include "ttg/parsec/thread_local.h" +#include "ttg/parsec/parsec-ext.h" +#include "ttg/util/span.h" + namespace ttg_parsec { namespace detail { - /* Extension of PaRSEC's data copy. Note that we use the readers field - * to facilitate the ref-counting of the data copy. - * TODO: create abstractions for all fields in parsec_data_copy_t that we access. - */ - struct ttg_data_copy_t : public parsec_data_copy_t { -#if defined(PARSEC_PROF_TRACE) && defined(PARSEC_TTG_PROFILE_BACKEND) - int64_t size; - int64_t uid; -#endif + // fwd-decl + struct ttg_data_copy_t; + + /* Wrapper managing the relationship between a ttg data copy and the parsec_data_t object */ + struct ttg_parsec_data_wrapper_t { + + protected: + using parsec_data_ptr = std::unique_ptr; + + ttg_data_copy_t *m_ttg_copy = nullptr; + parsec_data_ptr m_data; + + friend ttg_data_copy_t; + + static parsec_data_t* create_parsec_data(void *ptr, size_t size) { + parsec_data_t *data = parsec_data_create_with_type(nullptr, 0, ptr, size, + parsec_datatype_int8_t); + data->device_copies[0]->flags |= PARSEC_DATA_FLAG_PARSEC_MANAGED; + data->device_copies[0]->coherency_state = PARSEC_DATA_COHERENCY_SHARED; + data->device_copies[0]->version = 1; + return data; + } + + parsec_data_t* parsec_data() { + return m_data.get(); + } + + const parsec_data_t* parsec_data() const { + return m_data.get(); + } + + static void delete_parsec_data(parsec_data_t *data) { +#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) + if (data->device_copies[0]->flags & TTG_PARSEC_DATA_FLAG_REGISTERED) { + // unregister memory that was registered for faster access + cudaError_t status; + status = cudaHostUnregister(data->device_copies[0]->device_private); + assert(cudaSuccess == status); + data->device_copies[0]->flags ^= TTG_PARSEC_DATA_FLAG_REGISTERED; + } +#endif // PARSEC_HAVE_DEV_CUDA_SUPPORT + //std::fprintf(stderr, "parsec_data_destroy %p device_copy[0] %p\n", data, data->device_copies[0]); + //parsec_data_destroy(data); + assert(data->device_copies[0] != nullptr); + auto copy = data->device_copies[0]; + parsec_data_copy_detach(data, data->device_copies[0], 0); + PARSEC_OBJ_RELEASE(copy); + PARSEC_OBJ_RELEASE(data); + + } + + static void delete_null_parsec_data(parsec_data_t *) { + // nothing to be done, only used for nullptr + } + + protected: + + /* remove the data from the owning data copy */ + void remove_from_owner(); + + /* add the data to the owning data copy */ + void reset_parsec_data(void *ptr, size_t size); + + ttg_parsec_data_wrapper_t(); + + ttg_parsec_data_wrapper_t(const ttg_parsec_data_wrapper_t& other) = delete; + + ttg_parsec_data_wrapper_t(ttg_parsec_data_wrapper_t&& other); + + ttg_parsec_data_wrapper_t& operator=(const ttg_parsec_data_wrapper_t& other) = delete; + + ttg_parsec_data_wrapper_t& operator=(ttg_parsec_data_wrapper_t&& other); + + virtual ~ttg_parsec_data_wrapper_t(); + + /* set a new owning data copy object 
*/ + void set_owner(ttg_data_copy_t& new_copy) { + m_ttg_copy = &new_copy; + } + }; + + + /* templated to break cyclic dependency with ttg_data_copy_container */ + template + struct ttg_data_copy_container_setter { + ttg_data_copy_container_setter(T* ptr) { + /* set the container ptr here, will be reset in the the ttg_data_value_copy_t ctor */ + ttg_data_copy_container() = ptr; + } + }; + + /* special type: stores a pointer to the ttg_data_copy_t. This is necessary + * because ttg_data_copy_t has virtual functions so we cannot cast from parsec_data_copy_t + * to ttg_data_copy_t (offsetof is not supported for virtual classes). + * The self pointer is a back-pointer to the ttg_data_copy_t. */ + struct ttg_data_copy_self_t { + parsec_list_item_t super; + ttg_data_copy_t *self; + ttg_data_copy_self_t(ttg_data_copy_t* dc) + : self(dc) + { + PARSEC_OBJ_CONSTRUCT(&super, parsec_list_item_t); + } + }; + + /* Non-owning copy-tracking wrapper, accounting for N readers or 1 writer. + * Also counts external references, which are not treated as + * readers or writers but merely prevent the object from being + * destroyed once no readers/writers exist. + */ + struct ttg_data_copy_t : public ttg_data_copy_self_t { /* special value assigned to parsec_data_copy_t::readers to mark the copy as * mutable, i.e., a task will modify it */ static constexpr int mutable_tag = std::numeric_limits::min(); + ttg_data_copy_t() + : ttg_data_copy_self_t(this) + { } + + ttg_data_copy_t(const ttg_data_copy_t& c) + : ttg_data_copy_self_t(this) + { + /* we allow copying but do not copy any data over from the original + * device copies will have to be allocated again + * and it's a new object to reference */ + } + + ttg_data_copy_t(ttg_data_copy_t&& c) + : ttg_data_copy_self_t(this) + , m_next_task(c.m_next_task) + , m_readers(c.m_readers) + , m_refs(c.m_refs.load(std::memory_order_relaxed)) + , m_dev_data(std::move(c.m_dev_data)) + , m_single_dev_data(c.m_single_dev_data) + , m_num_dev_data(c.m_num_dev_data) + { + c.m_num_dev_data = 0; + c.m_readers = 0; + c.m_single_dev_data = nullptr; + + foreach_wrapper([&](ttg_parsec_data_wrapper_t* data){ + data->set_owner(*this); + }); + } + + ttg_data_copy_t& operator=(ttg_data_copy_t&& c) + { + m_next_task = c.m_next_task; + c.m_next_task = nullptr; + m_readers = c.m_readers; + c.m_readers = 0; + m_refs.store(c.m_refs.load(std::memory_order_relaxed), std::memory_order_relaxed); + c.m_refs.store(0, std::memory_order_relaxed); + m_dev_data = std::move(c.m_dev_data); + m_single_dev_data = c.m_single_dev_data; + c.m_single_dev_data = nullptr; + m_num_dev_data = c.m_num_dev_data; + c.m_num_dev_data = 0; + + /* move all data to the new owner */ + foreach_wrapper([&](ttg_parsec_data_wrapper_t* data){ + data->set_owner(*this); + }); + return *this; + } + + ttg_data_copy_t& operator=(const ttg_data_copy_t& c) { + /* we allow copying but do not copy any data over from the original + * device copies will have to be allocated again + * and it's a new object to reference */ + + return *this; + } + + /* mark destructor as virtual */ + virtual ~ttg_data_copy_t() = default; + /* Returns true if the copy is mutable */ bool is_mutable() const { - return this->readers == mutable_tag; + return m_readers == mutable_tag; } /* Mark the copy as mutable */ void mark_mutable() { - this->readers = mutable_tag; + m_readers = mutable_tag; } /* Increment the reader counter and return previous value @@ -41,9 +217,11 @@ namespace ttg_parsec { template int increment_readers() { if constexpr(Atomic) { - return 
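// Standalone restatement of ttg_data_copy_t's copy-tracking protocol (the
// mutable_tag above and the reset_readers just below): the reader count
// doubles as a mode flag, with INT_MIN marking exclusive write access, and
// reset_readers() demotes a writer back to a single reader.
#include <limits>
struct copy_tracker {
  static constexpr int mutable_tag = std::numeric_limits<int>::min();
  int readers = 1;
  bool is_mutable() const { return readers == mutable_tag; }
  void mark_mutable() { readers = mutable_tag; }
  void reset_readers() { if (readers == mutable_tag) readers = 1; }  // writer -> single reader
};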
parsec_atomic_fetch_inc_int32(&this->readers); + return parsec_atomic_fetch_inc_int32(&m_readers); +// std::atomic_ref a{m_readers}; +// return a.fetch_add(1, std::memory_order_relaxed); } else { - return this->readers++; + return m_readers++; } } @@ -51,7 +229,9 @@ namespace ttg_parsec { * Reset the number of readers to read-only with a single reader. */ void reset_readers() { - this->readers = 1; + if (mutable_tag == m_readers) { + m_readers = 1; + } } /* Decrement the reader counter and return previous value. @@ -60,28 +240,203 @@ namespace ttg_parsec { template int decrement_readers() { if constexpr(Atomic) { - return parsec_atomic_fetch_dec_int32(&this->readers); + return parsec_atomic_fetch_dec_int32(&m_readers); +// std::atomic_ref a{m_readers}; +// return a.fetch_sub(1, std::memory_order_relaxed); } else { - return this->readers--; + return m_readers--; } } /* Returns the number of readers if the copy is immutable, or \c mutable_tag * if the copy is mutable */ int num_readers() const { - return this->readers; + return m_readers; } - ttg_data_copy_t() - { - /* TODO: do we need this construction? */ - PARSEC_OBJ_CONSTRUCT(this, parsec_data_copy_t); - this->readers = 1; - this->push_task = nullptr; + /* Returns the pointer to the user data wrapped by the the copy object */ + virtual void* get_ptr() = 0; + + parsec_task_t* get_next_task() const { + return m_next_task; } - /* mark destructor as virtual */ - virtual ~ttg_data_copy_t() = default; + void set_next_task(parsec_task_t* task) { + m_next_task = task; + } + + int32_t add_ref() { + return m_refs.fetch_add(1, std::memory_order_relaxed); + } + + int32_t drop_ref() { + return m_refs.fetch_sub(1, std::memory_order_relaxed); + } + + bool has_ref() { + return (m_refs.load(std::memory_order_relaxed) != 0); + } + + int32_t num_ref() const { + return m_refs.load(std::memory_order_relaxed); + } + + /* increment the version of the current copy */ + void inc_current_version() { + //std::cout << "data-copy " << this << " inc_current_version " << " count " << m_num_dev_data << std::endl; + foreach_parsec_data([](parsec_data_t* data){ + assert(data->device_copies[0] != nullptr); + data->device_copies[0]->version++; + }); + } + + void transfer_ownership(int access, int device = 0) { + foreach_parsec_data([&](parsec_data_t* data){ + parsec_data_transfer_ownership_to_copy(data, device, access); + }); + } + + /* manage device copies owned by this object + * we only touch the vector if we have more than one copies to track + * and otherwise use the single-element member. 
+ */ + using iterator = ttg_parsec_data_wrapper_t**; + + void add_device_data(ttg_parsec_data_wrapper_t* data) { + switch (m_num_dev_data) { + case 0: + m_single_dev_data = data; + break; + case 1: + /* move single copy into vector and add new copy below */ + m_dev_data.push_back(m_single_dev_data); + m_single_dev_data = nullptr; + /* fall-through */ + default: + /* store in multi-copy vector */ + m_dev_data.push_back(data); + break; + } + //std::cout << "data-copy " << this << " add data " << data << " count " << m_num_dev_data << std::endl; + m_num_dev_data++; + } + + void remove_device_data(ttg_parsec_data_wrapper_t* data) { + //std::cout << "data-copy " << this << " remove data " << data << " count " << m_num_dev_data << std::endl; + if (m_num_dev_data == 0) { + /* this may happen if we're integrated into the object and have been moved */ + return; + } + if (m_num_dev_data == 1) { + assert(m_single_dev_data == data); + m_single_dev_data = nullptr; + } else if (m_num_dev_data > 1) { + auto it = std::find(m_dev_data.begin(), m_dev_data.end(), data); + if (it != m_dev_data.end()) { + m_dev_data.erase(it); + } + } + --m_num_dev_data; + /* make single-entry if needed */ + if (m_num_dev_data == 1) { + m_single_dev_data = m_dev_data[0]; + m_dev_data.clear(); + } + } + + int num_dev_data() const { + return m_num_dev_data; + } + + template + void foreach_wrapper(Fn&& fn) { + if (m_num_dev_data == 1) { + fn(m_single_dev_data); + } else if (m_num_dev_data > 1) { + std::for_each(m_dev_data.begin(), m_dev_data.end(), fn); + } + } + + template + void foreach_parsec_data(Fn&& fn) { + if (m_num_dev_data == 1) { + if (m_single_dev_data->parsec_data()) { + fn(m_single_dev_data->parsec_data()); + } + } else if (m_num_dev_data > 1) { + std::for_each(m_dev_data.begin(), m_dev_data.end(), + [&](ttg_parsec_data_wrapper_t* data){ + if (data->parsec_data()) { + fn(data->parsec_data()); + } + } + ); + } + } + + +#if 0 + iterator begin() { + switch(m_num_dev_data) { + // no device copies + case 0: return end(); + case 1: return &m_single_dev_data; + default: return m_dev_data.data(); + } + } + + iterator end() { + switch(m_num_dev_data) { + case 0: + case 1: + return &(m_single_dev_data) + 1; + default: + return m_dev_data.data() + m_dev_data.size(); + } + } +#endif // 0 + + using iovec_iterator = typename std::vector::iterator; + + iovec_iterator iovec_begin() { + return m_iovecs.begin(); + } + + iovec_iterator iovec_end() { + return m_iovecs.end(); + } + + void iovec_reset() { + m_iovecs.clear(); + } + + void iovec_add(const ttg::iovec& iov) { + m_iovecs.push_back(iov); + } + + ttg::span iovec_span() { + return ttg::span(m_iovecs.data(), m_iovecs.size()); + } + + std::size_t iovec_count() const { + return m_iovecs.size(); + } + +#if defined(PARSEC_PROF_TRACE) && defined(PARSEC_TTG_PROFILE_BACKEND) + int64_t size; + int64_t uid; +#endif + protected: + parsec_task_t *m_next_task = nullptr; + int32_t m_readers = 1; + std::atomic m_refs = 1; //< number of entities referencing this copy (TTGs, external) + + std::vector m_iovecs; + + std::vector m_dev_data; //< used if there are multiple device copies + // that belong to this object + ttg_parsec_data_wrapper_t *m_single_dev_data; //< used if there is a single device copy + int m_num_dev_data = 0; //< number of device copies }; @@ -91,21 +446,153 @@ namespace ttg_parsec { * the destructor of ttg_data_copy_t base class is called. 
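// The add_device_data/remove_device_data pair above implements a classic
// "one inline slot, spill to a vector" small-size optimization: the common
// case of a single tracked wrapper never allocates. Self-contained sketch:
#include <algorithm>
#include <vector>
template <typename T>
struct inline_or_vector {
  T *single = nullptr;
  std::vector<T *> many;  // only populated once a second element shows up
  int count = 0;
  void add(T *p) {
    if (count == 0) { single = p; }
    else {
      if (count == 1) { many.push_back(single); single = nullptr; }  // spill
      many.push_back(p);
    }
    ++count;
  }
  void remove(T *p) {
    if (count == 1) { single = nullptr; }
    else { many.erase(std::find(many.begin(), many.end(), p)); }
    if (--count == 1 && !many.empty()) { single = many[0]; many.clear(); }  // un-spill
  }
};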
*/ template - struct ttg_data_value_copy_t final : public ttg_data_copy_t { - using value_type = std::decay_t; + struct ttg_data_value_copy_t final : private ttg_data_copy_container_setter + , public ttg_data_copy_t { + using value_type = ValueT; value_type m_value; template ttg_data_value_copy_t(T&& value) - : ttg_data_copy_t(), m_value(std::forward(value)) + : ttg_data_copy_container_setter(this) + , ttg_data_copy_t() + , m_value(std::forward(value)) + { + /* reset the container tracker */ + ttg_data_copy_container() = nullptr; + } + + ttg_data_value_copy_t(ttg_data_value_copy_t&& c) + noexcept(std::is_nothrow_move_constructible_v) + : ttg_data_copy_container_setter(this) + , ttg_data_copy_t(std::move(c)) + , m_value(std::move(c.m_value)) + { + /* reset the container tracker */ + ttg_data_copy_container() = nullptr; + } + + ttg_data_value_copy_t(const ttg_data_value_copy_t& c) + noexcept(std::is_nothrow_copy_constructible_v) + : ttg_data_copy_container_setter(this) + , ttg_data_copy_t(c) + , m_value(c.m_value) { - this->device_private = const_cast(&m_value); + /* reset the container tracker */ + ttg_data_copy_container() = nullptr; + } + + ttg_data_value_copy_t& operator=(ttg_data_value_copy_t&& c) + noexcept(std::is_nothrow_move_assignable_v) + { + /* set the container ptr here, will be reset in the ttg_data_value_copy_t ctor */ + ttg_data_copy_container() = this; + ttg_data_copy_t::operator=(std::move(c)); + m_value = std::move(c.m_value); + /* reset the container tracker */ + ttg_data_copy_container() = nullptr; + return *this; + } + + ttg_data_value_copy_t& operator=(const ttg_data_value_copy_t& c) + noexcept(std::is_nothrow_copy_assignable_v) + { + /* set the container ptr here, will be reset in the ttg_data_value_copy_t ctor */ + ttg_data_copy_container() = this; + ttg_data_copy_t::operator=(c); + m_value = c.m_value; + /* reset the container tracker */ + ttg_data_copy_container() = nullptr; + return *this; + } + + value_type& operator*() { + return m_value; } /* will destruct the value */ virtual ~ttg_data_value_copy_t() = default; + + virtual void* get_ptr() override final { + return &m_value; + } }; + /** + * definition of ttg_parsec_data_wrapper_t members that depend on ttg_data_copy_t + */ + + inline + void ttg_parsec_data_wrapper_t::remove_from_owner() { + if (nullptr != m_ttg_copy) { + m_ttg_copy->remove_device_data(this); + m_ttg_copy = nullptr; + } + } + + inline + void ttg_parsec_data_wrapper_t::reset_parsec_data(void *ptr, size_t size) { + if (ptr == m_data.get()) return; + + if (nullptr == ptr) { + m_data = parsec_data_ptr(nullptr, &delete_null_parsec_data); + } else { + m_data = parsec_data_ptr(create_parsec_data(ptr, size), &delete_parsec_data); + } + } + + inline + ttg_parsec_data_wrapper_t::ttg_parsec_data_wrapper_t() + : m_data(nullptr, delete_null_parsec_data) + , m_ttg_copy(detail::ttg_data_copy_container()) + { + if (m_ttg_copy) { + m_ttg_copy->add_device_data(this); + } + } + + inline + ttg_parsec_data_wrapper_t::ttg_parsec_data_wrapper_t(ttg_parsec_data_wrapper_t&& other) + : m_data(std::move(other.m_data)) + , m_ttg_copy(detail::ttg_data_copy_container()) + { + /* the ttg_data_copy may have moved us already */ + //if (other.m_ttg_copy != m_ttg_copy) { + // try to remove the old buffer from the *old* ttg_copy + other.remove_from_owner(); + + // register with the new ttg_copy + if (nullptr != m_ttg_copy) { + m_ttg_copy->add_device_data(this); + } + //} else { + // other.m_ttg_copy = nullptr; + //} + } + + inline + ttg_parsec_data_wrapper_t& 
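// The private ttg_data_copy_container_setter base above exploits C++
// initialization order: base classes initialize before members, so the
// thread-local "current container" pointer is published before m_value is
// constructed, letting any wrappers created inside m_value's constructor
// register with their owning copy. Distilled, self-contained version of the
// idiom (hypothetical names):
struct owner;
inline thread_local owner *current_owner = nullptr;
struct owner_setter {
  owner_setter(owner *o) { current_owner = o; }   // runs first: publish owner
};
struct payload {
  owner *registered_with;
  payload() : registered_with(current_owner) {}   // sees the owner being built
};
struct owner : private owner_setter {
  payload value;
  owner() : owner_setter(this), value() { current_owner = nullptr; }  // reset tracker
};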
ttg_parsec_data_wrapper_t::operator=(ttg_parsec_data_wrapper_t&& other) { + m_data = std::move(other.m_data); + /* check whether the owning ttg_data_copy has already moved us */ + if (other.m_ttg_copy != m_ttg_copy) { + /* remove from old ttg copy */ + other.remove_from_owner(); + + if (nullptr != m_ttg_copy) { + /* register with the new ttg_copy */ + m_ttg_copy->add_device_data(this); + } + } + return *this; + } + + + inline + ttg_parsec_data_wrapper_t::~ttg_parsec_data_wrapper_t() { + if (nullptr != m_ttg_copy) { + m_ttg_copy->remove_device_data(this); + m_ttg_copy = nullptr; + } + } + + } // namespace detail } // namespace ttg_parsec diff --git a/ttg/ttg/parsec/ttvalue.h b/ttg/ttg/parsec/ttvalue.h new file mode 100644 index 000000000..b93f1687f --- /dev/null +++ b/ttg/ttg/parsec/ttvalue.h @@ -0,0 +1,101 @@ +#ifndef TTG_PARSEC_TTVALUE_H +#define TTG_PARSEC_TTVALUE_H + +#include + +#include "ttg/parsec/ttg_data_copy.h" + +namespace ttg_parsec { + + /** + * Base class for data to moved into, through, and out of + * a task graph. By inheriting from this base class, + * TTG is able to easily track the data and avoid some + * of the copies otherwise necessary. + */ + template + struct TTValue : private ttg_parsec::detail::ttg_data_copy_container_setter + , public ttg_parsec::detail::ttg_data_copy_t { + + using derived_type = std::decay_t; + + /* Constructor called with a pointer to the derived class object */ + TTValue() + : ttg_data_copy_container_setter(this) + , ttg_data_copy_t() + { } + + /* default copy ctor */ + TTValue(const TTValue& v) + : ttg_data_copy_container_setter(this) + , ttg_data_copy_t(v) + { } + + /* default move ctor */ + TTValue(TTValue&& v) + : ttg_data_copy_container_setter(this) + , ttg_data_copy_t(std::move(v)) + { } + + /* virtual mark destructor */ + virtual ~TTValue() = default; + + /* default copy operator */ + TTValue& operator=(const TTValue& v) { + ttg_parsec::detail::ttg_data_copy_container() = this; + ttg_data_copy_t::operator=(v); + return *this; + } + + /* default move operator */ + TTValue& operator=(TTValue&& v) { + ttg_parsec::detail::ttg_data_copy_container() = this; + ttg_data_copy_t::operator=(std::move(v)); + return *this; + } + + virtual void* get_ptr() override final { + return static_cast(this); + } + + derived_type& get_derived() { + return *static_cast(this); + } + + const derived_type& get_derived() const { + return *static_cast(this); + } + }; + + namespace detail { + + template + struct is_ttvalue_base : std::false_type {}; + + template + struct is_ttvalue_base>, std::decay_t>> + : std::true_type + { }; + + template + static constexpr const bool is_ttvalue_base_v = is_ttvalue_base::value; + + template + struct persistent_value_ref { + using reference_type = ValueT; + using value_type = std::decay_t; + using lvalue_reference_type = std::add_lvalue_reference_t>; + lvalue_reference_type value_ref; + }; + } // namespace detail + + template + inline auto persistent(ValueT&& value) { + static_assert(std::is_base_of_v>, std::decay_t>, + "ttg::persistent can only be used on types derived from ttg::TTValue"); + return detail::persistent_value_ref{value}; + } + +} // namespace ttg_parsec + +#endif // TTG_PARSEC_TTVALUE_H \ No newline at end of file diff --git a/ttg/ttg/ptr.h b/ttg/ttg/ptr.h new file mode 100644 index 000000000..c6c92006f --- /dev/null +++ b/ttg/ttg/ptr.h @@ -0,0 +1,104 @@ +#ifndef TTG_PTR_H +#define TTG_PTR_H + +#include "ttg/fwd.h" + +namespace ttg { + +template +using Ptr = TTG_IMPL_NS::Ptr; + +template +inline Ptr make_ptr(Args&&... 
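// Usage sketch for TTValue/persistent() defined above (the Matrix type and
// the send call are illustrative; only the inheritance pattern and the
// persistent() wrapper come from this diff): deriving from TTValue lets TTG
// locate the owning copy object directly, and persistent() marks a value
// that must survive beyond the sending task without a deep copy.
#include <vector>
struct Matrix : public ttg_parsec::TTValue<Matrix> {
  std::vector<double> data;  // payload managed by the user type
};
// Inside a task body one might then write (hypothetical terminal/key):
//   ttg::send<0>(next_key, ttg_parsec::persistent(matrix));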
args) { + return TTG_IMPL_NS::make_ptr(std::forward(args)...); +} + +template +inline Ptr> get_ptr(T&& obj) { + return TTG_IMPL_NS::get_ptr(std::forward(obj)); +} + +#if 0 +namespace detail { + + /* awaiter for ttg::get_ptr with multiple arguments + * operator co_wait will return the tuple of ttg::Ptr + */ + template + struct get_ptr_tpl_t { + private: + std::tuple...> m_ptr_tuple; + bool m_is_ready = false; + public: + get_ptr_tpl_t(bool is_ready, std::tuple...>&& ptrs) + : m_ptr_tuple(std::forward...>>(ptrs)) + , m_is_ready(is_ready) + { } + + bool await_ready() const noexcept { + return m_is_ready; + } + + constexpr void await_suspend( std::coroutine_handle<> ) const noexcept { + /* TODO: anything to be done here? */ + } + + auto await_resume() const noexcept { + return std::move(m_ptr_tuple); + } + }; + + + /* awaiter for ttg::get_ptr for a single argument */ + template + struct get_ptr_t { + private: + ttg::Ptr m_ptr; + bool m_is_ready = false; + public: + get_ptr_t(bool is_ready, ttg::Ptr&& ptr) + : m_ptr(std::forward>(ptr)) + , m_is_ready(is_ready) + { } + + bool await_ready() const noexcept { + return m_is_ready; + } + + constexpr void await_suspend( std::coroutine_handle<> ) const noexcept { + /* TODO: anything to be done here? */ + } + + auto await_resume() const noexcept { + return std::move(m_ptr); + } + }; + } // namespace detail + + /** + * Get an awaiter that results in a ttg::Ptr to a task argument. + * Must only be called inside a task on a value that was passed + * to the task and has not yet been moved on. + * Should be used in conjunction with co_await, e.g., + * ttg::Ptr ptr = co_await ttg::get_ptr(val); + * + * Multiple value can be passed, which results in a tuple of ptr: + * ttg::Ptr ptr1, ptr2; + * std::tie(ptr1, ptr2) = co_await ttg::get_ptr(val1, val2); + */ + template + auto get_ptr(Arg&& arg, Args&&... 
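// The (currently #if 0'd) get_ptr awaiters above all share one minimal
// awaiter shape: when the result is already available, await_ready()
// short-circuits suspension and await_resume() hands the value to co_await.
// Generic restatement:
#include <coroutine>
#include <utility>
template <typename T>
struct ready_value_awaiter {
  T value;
  bool is_ready;
  bool await_ready() const noexcept { return is_ready; }
  void await_suspend(std::coroutine_handle<>) const noexcept { /* nothing to defer */ }
  T await_resume() noexcept { return std::move(value); }
};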
args) { + bool is_ready; + using tpl_type = std::tuple, std::decay...>>; + using result_type = std::pair; + result_type p = TTG_IMPL_NS::get_ptr(std::forward(arg), std::forward(args)...); + if constexpr (sizeof...(Args) > 0) { + return detail::get_ptr_tpl_t, std::decay_t...>(p.first, std::move(p.second)); + } else if constexpr (sizeof...(Args) == 0) { + return detail::get_ptr_t>(p.first, std::move(std::get<0>(p.second))); + } + } +#endif // 0 +} // namespace ttg + +#endif // TTG_PTR_H \ No newline at end of file diff --git a/ttg/ttg/run.h b/ttg/ttg/run.h index 21ec337e8..06111e05e 100644 --- a/ttg/ttg/run.h +++ b/ttg/ttg/run.h @@ -9,6 +9,7 @@ #include "ttg/util/bug.h" #include "ttg/util/env.h" +#include "ttg/world.h" namespace ttg { @@ -57,6 +58,7 @@ namespace ttg { inline void finalize() { TTG_IMPL_NS::ttg_finalize(); } /// Aborts the TTG program using the default backend's `ttg_abort` method + [[noreturn]] inline void abort() { TTG_IMPL_NS::ttg_abort(); } /// Accesses the default backend's default execution context diff --git a/ttg/ttg/serialization/backends.h b/ttg/ttg/serialization/backends.h index 3cbc9cb42..97f88f5aa 100644 --- a/ttg/ttg/serialization/backends.h +++ b/ttg/ttg/serialization/backends.h @@ -6,7 +6,6 @@ #define TTG_SERIALIZATION_BACKENDS_H #include "ttg/serialization/backends/boost.h" -#include "ttg/serialization/backends/cereal.h" #include "ttg/serialization/backends/madness.h" #endif // TTG_SERIALIZATION_ALL_H diff --git a/ttg/ttg/serialization/backends/cereal.h b/ttg/ttg/serialization/backends/cereal.h deleted file mode 100644 index f9e6731b5..000000000 --- a/ttg/ttg/serialization/backends/cereal.h +++ /dev/null @@ -1,98 +0,0 @@ -// -// Created by Eduard Valeyev on 5/3/21. -// - -#ifndef TTG_SERIALIZATION_CEREAL_H -#define TTG_SERIALIZATION_CEREAL_H - -#include - -#ifdef TTG_SERIALIZATION_SUPPORTS_CEREAL -#include -#include -#include -#include -#endif - -namespace ttg::detail { - - //////// is_cereal_serializable - - template - struct is_cereal_serializable : std::false_type {}; - -#ifdef TTG_SERIALIZATION_SUPPORTS_CEREAL - template - struct is_cereal_serializable::value || - cereal::traits::is_input_serializable::value>> - : std::true_type {}; -#endif // TTG_SERIALIZATION_SUPPORTS_CEREAL - - template - inline static constexpr bool is_cereal_serializable_v = is_cereal_serializable::value; - - template - struct is_cereal_buffer_serializable : std::false_type {}; - -#ifdef TTG_SERIALIZATION_SUPPORTS_CEREAL - template - struct is_cereal_buffer_serializable && - is_cereal_serializable_v>> - : std::true_type {}; -#endif // TTG_SERIALIZATION_SUPPORTS_CEREAL - - /// evaluates to true if can serialize @p T to/from buffer using Cereal serialization - template - inline constexpr bool is_cereal_buffer_serializable_v = is_cereal_buffer_serializable::value; - - template - struct is_cereal_array_serializable; - - template - struct is_cereal_array_serializable>> : std::false_type {}; - - template - struct is_cereal_array_serializable>> - : std::bool_constant>> {}; - - template - inline static constexpr bool is_cereal_array_serializable_v = is_cereal_array_serializable::value; - - template - inline static constexpr bool is_stlcontainer_cereal_serializable_v = false; - - template - struct is_cereal_user_serializable : std::false_type {}; - -#ifdef TTG_SERIALIZATION_SUPPORTS_CEREAL - template - struct is_cereal_user_serializable< - Archive, T, - std::enable_if_t<(cereal::traits::detail::count_specializations::value != 0) || - ((cereal::traits::is_input_serializable::value || - 
cereal::traits::is_output_serializable::value) && - (!std::is_arithmetic_v && !ttg::detail::is_cereal_array_serializable_v && - !is_stlcontainer_cereal_serializable_v))>> : std::true_type {}; -#endif - - template - inline constexpr bool is_cereal_user_serializable_v = is_cereal_user_serializable::value; - - template - struct is_cereal_user_buffer_serializable : std::false_type {}; - -#ifdef TTG_SERIALIZATION_SUPPORTS_CEREAL - template - struct is_cereal_user_buffer_serializable< - T, std::enable_if_t || - is_cereal_user_serializable_v>> : std::true_type {}; -#endif // TTG_SERIALIZATION_SUPPORTS_CEREAL - - /// evaluates to true if can serialize @p T to/from buffer using user-provided Cereal serialization - template - inline constexpr bool is_cereal_user_buffer_serializable_v = is_cereal_user_buffer_serializable::value; - -} // namespace ttg::detail - -#endif // TTG_SERIALIZATION_CEREAL_H diff --git a/ttg/ttg/serialization/backends/madness.h b/ttg/ttg/serialization/backends/madness.h index e3f64cc86..fff23b5c9 100644 --- a/ttg/ttg/serialization/backends/madness.h +++ b/ttg/ttg/serialization/backends/madness.h @@ -13,6 +13,8 @@ #include #endif +#include "ttg/serialization/traits.h" + namespace ttg::detail { /*----- if_madness_{input,output,}_archive_v -----*/ diff --git a/ttg/ttg/serialization/data_descriptor.h b/ttg/ttg/serialization/data_descriptor.h index 265897e94..f14d00e72 100644 --- a/ttg/ttg/serialization/data_descriptor.h +++ b/ttg/ttg/serialization/data_descriptor.h @@ -44,7 +44,7 @@ namespace ttg { /// @tparam T a trivially-copyable type template struct default_data_descriptor< - T, std::enable_if_t && !detail::is_user_buffer_serializable_v && + T, std::enable_if_t && !detail::is_user_buffer_serializable_v && !ttg::has_split_metadata::value>> { static constexpr const bool serialize_size_is_const = true; @@ -154,7 +154,7 @@ namespace ttg { /// and support MADNESS serialization template struct default_data_descriptor< - T, std::enable_if_t<((!std::is_trivially_copyable_v && detail::is_madness_buffer_serializable_v) || + T, std::enable_if_t<((!detail::is_memcpyable_v && detail::is_madness_buffer_serializable_v) || detail::is_madness_user_buffer_serializable_v)&&!ttg::has_split_metadata::value>> { static constexpr const bool serialize_size_is_const = false; @@ -200,7 +200,7 @@ namespace ttg { /// do not support MADNESS serialization, and support Boost serialization template struct default_data_descriptor< - T, std::enable_if_t<(!std::is_trivially_copyable_v && !detail::is_madness_buffer_serializable_v && + T, std::enable_if_t<(!detail::is_memcpyable_v && !detail::is_madness_buffer_serializable_v && detail::is_boost_buffer_serializable_v) || (!detail::is_madness_user_buffer_serializable_v && detail::is_boost_user_buffer_serializable_v)>> { @@ -236,46 +236,6 @@ namespace ttg { #endif // has Boost serialization -#if defined(TTG_SERIALIZATION_SUPPORTS_CEREAL) - -namespace ttg { - - /// The default implementation for non-POD data types that are not directly copyable - /// do not support MADNESS or Boost serialization, and support Cereal serialization - template - struct default_data_descriptor< - T, std::enable_if_t<(!std::is_trivially_copyable_v && !detail::is_madness_buffer_serializable_v && - !detail::is_boost_buffer_serializable_v && detail::is_cereal_buffer_serializable_v) || - (!detail::is_madness_user_buffer_serializable_v && - !detail::is_boost_user_buffer_serializable_v && - detail::is_cereal_user_buffer_serializable_v)>> { - static constexpr const bool serialize_size_is_const = 
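// The default_data_descriptor specializations above form a priority ladder
// selected via enable_if: memcpy-able types first, then MADNESS, then Boost
// (the Cereal rung is deleted by this diff). Skeleton of the dispatch with
// stand-in predicates (the real traits live in ttg::detail):
#include <type_traits>
template <typename T> inline constexpr bool is_memcpyable_v = std::is_trivially_copyable_v<T>;  // stand-in
template <typename T> inline constexpr bool is_madness_serializable_v = false;                  // stand-in
template <typename T> inline constexpr bool is_boost_serializable_v = false;                    // stand-in
template <typename T, typename Enabler = void> struct data_descriptor;  // primary: no match
template <typename T>
struct data_descriptor<T, std::enable_if_t<is_memcpyable_v<T>>> { /* raw memcpy path */ };
template <typename T>
struct data_descriptor<T, std::enable_if_t<!is_memcpyable_v<T> && is_madness_serializable_v<T>>> { /* MADNESS path */ };
template <typename T>
struct data_descriptor<T, std::enable_if_t<!is_memcpyable_v<T> && !is_madness_serializable_v<T> &&
                                           is_boost_serializable_v<T>>> { /* Boost path */ };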
false; - - static uint64_t payload_size(const void *object) { - ttg::detail::counting_streambuf sbuf; - std::ostream os(&sbuf); - cereal::BinaryOutputArchive oa(os); - oa << (*(T *)object); - return sbuf.size(); - } - - /// object --- obj to be serialized - /// chunk_size --- inputs max amount of data to output, and on output returns amount actually output - /// pos --- position in the input buffer to resume serialization - /// buf[pos] --- place for output - static uint64_t pack_payload(const void *object, uint64_t chunk_size, uint64_t pos, void *_buf) { abort(); } - - /// object --- obj to be deserialized - /// chunk_size --- amount of data for input - /// pos --- position in the input buffer to resume deserialization - /// object -- pointer to the object to fill up - static void unpack_payload(void *object, uint64_t chunk_size, uint64_t pos, const void *_buf) { abort(); } - }; - -} // namespace ttg - -#endif // has Cereal serialization - namespace ttg { // Returns a pointer to a constant static instance initialized diff --git a/ttg/ttg/serialization/splitmd_data_descriptor.h b/ttg/ttg/serialization/splitmd_data_descriptor.h index 8edfe20d6..46bdb7b76 100644 --- a/ttg/ttg/serialization/splitmd_data_descriptor.h +++ b/ttg/ttg/serialization/splitmd_data_descriptor.h @@ -3,20 +3,10 @@ #include #include "ttg/util/meta.h" +#include "ttg/util/iovec.h" namespace ttg { - /** - * Used to describe transfer payload in types using the \sa SplitMetadataDescriptor. - * @c data Pointer to the data to be read from / written to. - * @c num_bytes The number of bytes to read from / write to the memory location - * \sa data. - */ - struct iovec { - size_t num_bytes; - void* data; - }; - /** * SplitMetadataDescriptor is a serialization descriptor provided by the user * for a user-specified type. 
It should contain the following public member diff --git a/ttg/ttg/serialization/std/array.h b/ttg/ttg/serialization/std/array.h index 60aa9fa93..05d008ac8 100644 --- a/ttg/ttg/serialization/std/array.h +++ b/ttg/ttg/serialization/std/array.h @@ -25,18 +25,4 @@ namespace ttg::detail { #endif // TTG_SERIALIZATION_SUPPORTS_BOOST -#ifdef TTG_SERIALIZATION_SUPPORTS_CEREAL -#include - -namespace ttg::detail { - template - inline static constexpr bool is_stlcontainer_cereal_serializable_v> = - is_cereal_serializable_v; - template - inline static constexpr bool is_stlcontainer_cereal_serializable_v> = - is_cereal_serializable_v; -} // namespace ttg::detail - -#endif - #endif // TTG_SERIALIZATION_STD_ARRAY_H diff --git a/ttg/ttg/serialization/std/list.h b/ttg/ttg/serialization/std/list.h index ebf24ab04..3c525b0f8 100644 --- a/ttg/ttg/serialization/std/list.h +++ b/ttg/ttg/serialization/std/list.h @@ -26,18 +26,4 @@ namespace ttg::detail { #endif // TTG_SERIALIZATION_SUPPORTS_BOOST -#ifdef TTG_SERIALIZATION_SUPPORTS_CEREAL -#include - -namespace ttg::detail { - template - inline static constexpr bool is_stlcontainer_cereal_serializable_v> = - is_cereal_serializable_v; - template - inline static constexpr bool is_stlcontainer_cereal_serializable_v> = - is_cereal_serializable_v; -} // namespace ttg::detail - -#endif - #endif // TTG_SERIALIZATION_STD_LIST_H diff --git a/ttg/ttg/serialization/std/pair.h b/ttg/ttg/serialization/std/pair.h index cca0076ed..ccfce52a9 100644 --- a/ttg/ttg/serialization/std/pair.h +++ b/ttg/ttg/serialization/std/pair.h @@ -25,18 +25,4 @@ namespace ttg::detail { #endif // TTG_SERIALIZATION_SUPPORTS_BOOST -#ifdef TTG_SERIALIZATION_SUPPORTS_CEREAL -#include - -namespace ttg::detail { - template - inline static constexpr bool is_stlcontainer_cereal_serializable_v> = - is_cereal_serializable_v&& is_cereal_serializable_v; - template - inline static constexpr bool is_stlcontainer_cereal_serializable_v> = - is_cereal_serializable_v&& is_cereal_serializable_v; -} // namespace ttg::detail - -#endif - #endif // TTG_SERIALIZATION_STD_PAIR_H diff --git a/ttg/ttg/serialization/std/tuple.h b/ttg/ttg/serialization/std/tuple.h index dd8e6aadb..048f64ec5 100644 --- a/ttg/ttg/serialization/std/tuple.h +++ b/ttg/ttg/serialization/std/tuple.h @@ -46,18 +46,4 @@ namespace ttg::detail { #endif // TTG_SERIALIZATION_SUPPORTS_BOOST -#ifdef TTG_SERIALIZATION_SUPPORTS_CEREAL -#include - -namespace ttg::detail { - template - inline static constexpr bool is_stlcontainer_cereal_serializable_v> = - (is_cereal_serializable_v && ...); - template - inline static constexpr bool is_stlcontainer_cereal_serializable_v> = - (is_cereal_serializable_v && ...); -} // namespace ttg::detail - -#endif - #endif // TTG_SERIALIZATION_STD_TUPLE_H diff --git a/ttg/ttg/serialization/std/vector.h b/ttg/ttg/serialization/std/vector.h index f939d24b4..90a05f072 100644 --- a/ttg/ttg/serialization/std/vector.h +++ b/ttg/ttg/serialization/std/vector.h @@ -26,18 +26,4 @@ namespace ttg::detail { #endif // TTG_SERIALIZATION_SUPPORTS_BOOST -#ifdef TTG_SERIALIZATION_SUPPORTS_CEREAL -#include - -namespace ttg::detail { - template - inline static constexpr bool is_stlcontainer_cereal_serializable_v> = - is_cereal_serializable_v; - template - inline static constexpr bool is_stlcontainer_cereal_serializable_v> = - is_cereal_serializable_v; -} // namespace ttg::detail - -#endif - #endif // TTG_SERIALIZATION_STD_VECTOR_H diff --git a/ttg/ttg/serialization/traits.h b/ttg/ttg/serialization/traits.h index 431b86098..d1b4905bd 100644 --- 
a/ttg/ttg/serialization/traits.h +++ b/ttg/ttg/serialization/traits.h @@ -23,7 +23,7 @@ namespace ttg::detail { /// helps to detect that `T` has a member serialization method that /// accepts a single argument of type `Archive` - /// @note use in combination with `ttg::meta`::is_detected_v + /// @note use in combination with ttg::meta::is_detected_v template <typename T, typename Archive> using has_member_serialize_t = decltype(std::declval<T&>().serialize(std::declval<Archive&>())); @@ -179,12 +179,33 @@ namespace ttg::detail { // is_boost_user_buffer_serializable_v>> : std::true_type {}; template <typename T> struct is_user_buffer_serializable< - T, std::enable_if_t<is_madness_user_buffer_serializable_v<T> || is_boost_user_buffer_serializable_v<T> || - is_cereal_user_buffer_serializable_v<T>>> : std::true_type {}; + T, std::enable_if_t<is_madness_user_buffer_serializable_v<T> || is_boost_user_buffer_serializable_v<T>>> : std::true_type {}; template <typename T> inline constexpr bool is_user_buffer_serializable_v = is_user_buffer_serializable<T>::value; + /// \brief can be used to override the default value of is_memcpyable<T>::value + + /// std::is_trivially_copyable_v<T> is sufficient to guarantee that std::memcpy is safe to use on objects of type T; + /// however, std::is_trivially_copyable_v sometimes reports false for types that are nevertheless safe to memcpy, + /// e.g., std::pair (see https://danlark.org/2020/04/13/why-is-stdpair-broken/). + /// In such cases, specialize this trait. + template <typename T> + inline constexpr bool is_memcpyable_override_v = std::is_trivially_copyable_v<T>; + + // std::pair of trivially-copyable types is safe to memcpy, even though it is not itself trivially copyable + template <typename T1, typename T2> + inline constexpr bool is_memcpyable_override_v<std::pair<T1, T2>> = std::is_trivially_copyable_v<T1> && std::is_trivially_copyable_v<T2>; + + /// \brief reports whether objects of type T are safe to std::memcpy + + /// True if either std::is_trivially_copyable_v<T> or is_memcpyable_override_v<T> is true + template <typename T> + struct is_memcpyable : std::bool_constant<std::is_trivially_copyable_v<T> || is_memcpyable_override_v<T>> {}; + + template <typename T> + inline constexpr bool is_memcpyable_v = is_memcpyable<T>::value; + } // namespace ttg::detail #endif // TTG_SERIALIZATION_TRAITS_H diff --git a/ttg/ttg/terminal.h b/ttg/ttg/terminal.h index f4ed76087..dc9f4b08b 100644 --- a/ttg/ttg/terminal.h +++ b/ttg/ttg/terminal.h @@ -155,6 +155,7 @@ namespace ttg { using broadcast_callback_type = meta::detail::broadcast_callback_t<keyT, std::decay_t<valueT>>; using setsize_callback_type = typename base_type::setsize_callback_type; using finalize_callback_type = typename base_type::finalize_callback_type; + using prepare_send_callback_type = meta::detail::prepare_send_callback_t<keyT, std::decay_t<valueT>>; static constexpr bool is_an_input_terminal = true; ttg::detail::ContainerWrapper container; @@ -162,6 +163,7 @@ namespace ttg { send_callback_type send_callback; move_callback_type move_callback; broadcast_callback_type broadcast_callback; + prepare_send_callback_type prepare_send_callback; // No moving, copying, assigning permitted In(In &&other) = delete; @@ -186,13 +188,16 @@ namespace ttg { /// will continue adding data onto this terminal /// \param[in] setsize_callback: if the terminal is a reduce terminal, announces how many items will be set /// unto this terminal for reduction + /// \param[in] prepare_send_callback: for resumable/device tasks this is called before the actual send void set_callback(const send_callback_type &send_callback, const move_callback_type &move_callback, const broadcast_callback_type &bcast_callback = broadcast_callback_type{}, const setsize_callback_type &setsize_callback = setsize_callback_type{}, - const finalize_callback_type &finalize_callback =
finalize_callback_type{}, + const prepare_send_callback_type &prepare_send_callback = prepare_send_callback_type{}) { this->send_callback = send_callback; this->move_callback = move_callback; this->broadcast_callback = bcast_callback; + this->prepare_send_callback = prepare_send_callback; base_type::set_callback(setsize_callback, finalize_callback); } @@ -271,7 +276,7 @@ namespace ttg { for (auto &&key : keylist) send(key, v); } else { /* got something we cannot iterate over (single element?) so put one element in the span */ - broadcast_callback(ttg::span(&keylist, 1), v); + send(ttg::span(&keylist, 1), v); } } } @@ -291,7 +296,36 @@ namespace ttg { for (auto &&key : keylist) sendk(key); } else { /* got something we cannot iterate over (single element?) so put one element in the span */ - broadcast_callback(ttg::span(&keylist, 1)); + sendk(ttg::span(&keylist, 1)); + } + } + } + + + template <typename rangeT, typename Value> + void prepare_send(const rangeT &keylist, Value &&value) { + const Value &v = value; + if (prepare_send_callback) { + if constexpr (ttg::meta::is_iterable_v<rangeT>) { + prepare_send_callback(ttg::span(&(*std::begin(keylist)), + std::distance(std::begin(keylist), std::end(keylist))), + v); + } else { + /* got something we cannot iterate over (single element?) so put one element in the span */ + prepare_send_callback(ttg::span(&keylist, 1), v); + } + } + } + + template <typename Value> + void prepare_send(Value &&value) { + const Value &v = value; + if (prepare_send_callback) { + /* keyless terminal: there is no key span to build, pass the value straight to the callback */ + prepare_send_callback(v); + } + } @@ -531,6 +565,32 @@ namespace ttg { } } } + + template <typename rangeT, typename Value> + std::enable_if_t<!meta::is_void_v<keyT> && !meta::is_void_v<valueT>, void> + prepare_send(const rangeT &keylist, const Value &value) { + for (auto &&successor : this->successors()) { + assert(successor->get_type() != TerminalBase::Type::Write); + if (successor->get_type() == TerminalBase::Type::Read) { + return static_cast<In<keyT, std::add_const_t<valueT>> *>(successor)->prepare_send(keylist, value); + } else if (successor->get_type() == TerminalBase::Type::Consume) { + return static_cast<In<keyT, valueT> *>(successor)->prepare_send(keylist, value); + } + } + } + + template <typename Value> + std::enable_if_t<meta::is_void_v<keyT> && !meta::is_void_v<valueT>, void> + prepare_send(const Value &value) { + for (auto &&successor : this->successors()) { + assert(successor->get_type() != TerminalBase::Type::Write); + if (successor->get_type() == TerminalBase::Type::Read) { + return static_cast<In<keyT, std::add_const_t<valueT>> *>(successor)->prepare_send(value); + } else if (successor->get_type() == TerminalBase::Type::Consume) { + return static_cast<In<keyT, valueT> *>(successor)->prepare_send(value); + } + } + } }; namespace meta { diff --git a/ttg/ttg/tt.h b/ttg/ttg/tt.h index 435fba11e..7024776aa 100644 --- a/ttg/ttg/tt.h +++ b/ttg/ttg/tt.h @@ -1,16 +1,25 @@ #ifndef TTG_TT_H #define TTG_TT_H -#include -#include - #include "ttg/fwd.h" #include "ttg/base/tt.h" #include "ttg/edge.h" +#ifdef TTG_HAS_COROUTINE +#include "ttg/coroutine.h" +#endif + +#include +#include +#include + namespace ttg { + // TODO describe TT concept (preferably as a C++20 concept) + // N.B. TT::op returns void or ttg::coroutine_handle<> + // see TTG_PROCESS_TT_OP_RETURN below + /// @brief a template task graph implementation /// It contains (owns) one or more TT objects.
Since it can also be viewed as a TT object itself, @@ -35,12 +44,16 @@ namespace ttg { TTG(const TTG &) = delete; TTG &operator=(const TTG &) = delete; // movable - TTG(TTG && other) : TTBase(static_cast<TTBase &&>(other)), tts(other.tts), ins(std::move(other.ins)), outs(std::move(other.outs)) { - is_ttg_ = true; - own_my_tts(); + TTG(TTG &&other) + : TTBase(static_cast<TTBase &&>(other)) + , tts(other.tts) + , ins(std::move(other.ins)) + , outs(std::move(other.outs)) { + is_ttg_ = true; + own_my_tts(); } - TTG& operator=(TTG &&other) { - static_cast<TTBase &>(*this) = static_cast<TTBase &&>(other); + TTG &operator=(TTG &&other) { + static_cast<TTBase &>(*this) = static_cast<TTBase &&>(other); is_ttg_ = true; tts = std::move(other.tts); ins = std::move(other.ins); @@ -93,17 +106,14 @@ namespace ttg { void own_my_tts() const { for (auto &op : tts) op->owning_ttg = this; } - }; template - auto make_ttg(ttseqT &&tts, const input_terminalsT &ins, - const output_terminalsT &outs, + auto make_ttg(ttseqT &&tts, const input_terminalsT &ins, const output_terminalsT &outs, const std::string &name = "ttg") { return std::make_unique>(std::forward(tts), ins, outs, name); } - /// A data sink for one input template class SinkTT : public TTBase { @@ -128,20 +138,20 @@ namespace ttg { using valueT = std::decay_t; auto move_callback = [](const keyT &key, valueT &&value) {}; auto send_callback = [](const keyT &key, const valueT &value) {}; - auto broadcast_callback = [](const ttg::span& key, const valueT &value) {}; + auto broadcast_callback = [](const ttg::span &key, const valueT &value) {}; auto setsize_callback = [](const keyT &key, std::size_t size) {}; auto finalize_callback = [](const keyT &key) {}; input.set_callback(send_callback, move_callback, broadcast_callback, setsize_callback, finalize_callback); } - public: - SinkTT(const std::string& inname="junk") : TTBase("sink", numins, numouts) { + public: + SinkTT(const std::string &inname = "junk") : TTBase("sink", numins, numouts) { register_input_terminals(input_terminals, std::vector{inname}); register_input_callback(std::get<0>(input_terminals)); } - SinkTT(const input_edges_type &inedges, const std::string& inname="junk") : TTBase("sink", numins, numouts) { + SinkTT(const input_edges_type &inedges, const std::string &inname = "junk") : TTBase("sink", numins, numouts) { register_input_terminals(input_terminals, std::vector{inname}); register_input_callback(std::get<0>(input_terminals)); std::get<0>(inedges).set_out(&std::get<0>(input_terminals)); @@ -158,12 +168,38 @@ namespace ttg { /// Returns pointer to input terminal i to facilitate connection --- terminal cannot be copied, moved or assigned template std::tuple_element_t *in() { - static_assert(i==0); + static_assert(i == 0); return &std::get(input_terminals); } }; - -} // namespace ttg - -#endif // TTG_TT_H +} // namespace ttg + +#ifndef TTG_PROCESS_TT_OP_RETURN +#ifdef TTG_HAS_COROUTINE +#define TTG_PROCESS_TT_OP_RETURN(result, id, invoke) \ + { \ + using return_type = decltype(invoke); \ + if constexpr (std::is_same_v<return_type, void>) { \ + invoke; \ + id = ttg::TaskCoroutineID::Invalid; \ + } else { \ + auto coro_return = invoke; \ + if constexpr (std::is_same_v>) \ + id = ttg::TaskCoroutineID::ResumableTask; \ + else if constexpr (std::is_same_v>) \ + id = ttg::TaskCoroutineID::DeviceTask; \ + else \ + std::abort(); \ + result = coro_return.address(); \ + } \ + } +#else +#define TTG_PROCESS_TT_OP_RETURN(result, id, invoke) invoke +#endif +#else +#error "TTG_PROCESS_TT_OP_RETURN already defined in ttg/tt.h, check your header guards" +#endif //
!defined(TTG_PROCESS_TT_OP_RETURN) + +#endif // TTG_TT_H diff --git a/ttg/ttg/ttvalue.h b/ttg/ttg/ttvalue.h new file mode 100644 index 000000000..4feb1376f --- /dev/null +++ b/ttg/ttg/ttvalue.h @@ -0,0 +1,13 @@ +#ifndef TTG_TTVALUE_H +#define TTG_TTVALUE_H + +#include "ttg/fwd.h" + +namespace ttg { + + template + using TTValue = TTG_IMPL_NS::TTValue; + +} // namespace ttg + +#endif // TTG_TTVALUE_H \ No newline at end of file diff --git a/ttg/ttg/util/bug.cpp b/ttg/ttg/util/bug.cpp index 1e91e8fd2..27b743096 100644 --- a/ttg/ttg/util/bug.cpp +++ b/ttg/ttg/util/bug.cpp @@ -28,6 +28,7 @@ #include "bug.h" #include +#include #include #include #include @@ -46,6 +47,103 @@ using namespace std; using namespace ttg; +namespace ttg { + void initialize_fpe() { +#if defined(__APPLE__) && defined(__MACH__) + + // Public domain polyfill for feenableexcept on OS X + // http://www-personal.umich.edu/~williams/archive/computation/fe-handling-example.c + +#ifndef HAVE_FEENABLEEXCEPT + auto feenableexcept = [](int excepts) -> int { + static fenv_t fenv; + const auto new_excepts = excepts & FE_ALL_EXCEPT; + + if (fegetenv(&fenv)) { + return -1; + } +#if defined(__x86_64__) + // previous masks + const unsigned int old_excepts = fenv.__control & FE_ALL_EXCEPT; + + // unmask + fenv.__control &= ~new_excepts; + fenv.__mxcsr &= ~(new_excepts << 7); +#elif defined(__arm64__) + if (new_excepts & FE_INVALID) fenv.__fpcr |= __fpcr_trap_invalid; + if (new_excepts & FE_DIVBYZERO) fenv.__fpcr |= __fpcr_trap_divbyzero; + if (new_excepts & FE_OVERFLOW) fenv.__fpcr |= __fpcr_trap_overflow; + if (new_excepts & FE_UNDERFLOW) fenv.__fpcr |= __fpcr_trap_underflow; + if (new_excepts & FE_INEXACT) fenv.__fpcr |= __fpcr_trap_inexact; +#else +#error "MacOS on unknown architecture" +#endif + return fesetenv(&fenv); + }; +#define HAVE_FEENABLEEXCEPT 1 +#endif // not defined HAVE_FEENABLEEXCEPT + +#ifndef HAVE_FEDISABLEEXCEPT + auto fedisableexcept = [](int excepts) -> int { + static fenv_t fenv; + const auto new_excepts = excepts & FE_ALL_EXCEPT; + // all previous masks + + if (fegetenv(&fenv)) { + return -1; + } +#if defined(__x86_64__) + const unsigned int old_excepts = fenv.__control & FE_ALL_EXCEPT; + + // mask + fenv.__control |= new_excepts; + fenv.__mxcsr |= new_excepts << 7; +#elif defined(__arm64__) + if (new_excepts & FE_INVALID) fenv.__fpcr &= ~__fpcr_trap_invalid; + if (new_excepts & FE_DIVBYZERO) fenv.__fpcr &= ~__fpcr_trap_divbyzero; + if (new_excepts & FE_OVERFLOW) fenv.__fpcr &= ~__fpcr_trap_overflow; + if (new_excepts & FE_UNDERFLOW) fenv.__fpcr &= ~__fpcr_trap_underflow; + if (new_excepts & FE_INEXACT) fenv.__fpcr &= ~__fpcr_trap_inexact; +#else +#error "MacOS on unknown architecture" +#endif + + return fesetenv(&fenv); + }; + +#define HAVE_FEDISABLEEXCEPT 1 +#endif // not defined HAVE_FEDISABLEEXCEPT +#endif // mac + +#ifdef HAVE_FEENABLEEXCEPT + // this uses a glibc extension to trap on individual exceptions + int enable_excepts = 0; +#ifdef FE_DIVBYZERO + enable_excepts |= FE_DIVBYZERO; +#endif +#ifdef FE_INVALID + enable_excepts |= FE_INVALID; +#endif +#ifdef FE_OVERFLOW + enable_excepts |= FE_OVERFLOW; +#endif + feenableexcept(enable_excepts); +#endif + +#ifdef HAVE_FEDISABLEEXCEPT + // this uses a glibc extension to not trap on individual exceptions + int disable_excepts = 0; +#ifdef FE_UNDERFLOW + disable_excepts |= FE_UNDERFLOW; +#endif +#ifdef FE_INEXACT + disable_excepts |= FE_INEXACT; +#endif + fedisableexcept(disable_excepts); +#endif + } +} + 
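For context on the polyfill above: on glibc `feenableexcept` is available natively, so the new `ttg::initialize_fpe()` amounts to unmasking the three "hard" IEEE exceptions while leaving FE_UNDERFLOW and FE_INEXACT masked. A minimal standalone sketch of that behavior (not part of this patch; assumes a glibc system, since `feenableexcept` is a GNU extension):

```cpp
// Sketch: roughly what ttg::initialize_fpe() arranges at startup on glibc.
#ifndef _GNU_SOURCE
#define _GNU_SOURCE 1  // glibc: expose feenableexcept() in <fenv.h>
#endif
#include <fenv.h>
#include <cstdio>

int main() {
  // unmask the "hard" errors; FE_UNDERFLOW and FE_INEXACT stay masked
  feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);

  volatile double zero = 0.0;   // volatile prevents constant folding
  double r = 1.0 / zero;        // raises SIGFPE here instead of quietly yielding inf
  std::printf("%f\n", r);       // never reached
  return 0;
}
```

Per the @warning added to bug.h below, this must happen before any threads are spawned: the floating-point environment is per-thread state that new threads inherit from their creator.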
////////////////////////////////////////////////////////////////////// // static variables diff --git a/ttg/ttg/util/bug.h b/ttg/ttg/util/bug.h index f4e983bd2..1404fa01a 100644 --- a/ttg/ttg/util/bug.h +++ b/ttg/ttg/util/bug.h @@ -251,6 +251,15 @@ namespace ttg { } // namespace detail + /// @brief Initializes floating-point exception handling. + /// + /// Enables (if available) FE_DIVBYZERO, FE_INVALID, and FE_OVERFLOW; + /// FE_UNDERFLOW and FE_INEXACT are disabled (if available). + /// @warning This should be called from the main thread *before* any threads + /// have been created (i.e. before madness::initialize()), + /// so that all threads inherit the same floating point environment. + void initialize_fpe(); + /** * The Debugger class describes what should be done when a catastrophic * error causes unexpected program termination. It can try things such as @@ -277,12 +286,11 @@ namespace ttg { static std::shared_ptr<Debugger> default_debugger_; - /** prints out a backtrace + /** prints out a backtrace to `std::cout` * * @param prefix this string will be prepended at the beginning of each line * of Backtrace * @param reason optional string specifying the reason for traceback - * @return backtrace */ static void __traceback(const std::string &prefix, const char *reason = nullptr); diff --git a/ttg/ttg/util/env.cpp b/ttg/ttg/util/env.cpp index a4de47b09..12d1d6051 100644 --- a/ttg/ttg/util/env.cpp +++ b/ttg/ttg/util/env.cpp @@ -30,5 +30,16 @@ namespace ttg { return static_cast<int>(result); } + bool force_device_comm() { + bool result = false; + const char* ttg_force_device_comm_cstr = std::getenv("TTG_FORCE_DEVICE_COMM"); + if (ttg_force_device_comm_cstr) { + const auto result_int = std::atoi(ttg_force_device_comm_cstr); + if (result_int) { + result = true; + } + } + return result; + } } // namespace detail } // namespace ttg diff --git a/ttg/ttg/util/env.h b/ttg/ttg/util/env.h index 7681480ef..4e07bff7b 100644 --- a/ttg/ttg/util/env.h +++ b/ttg/ttg/util/env.h @@ -16,6 +16,14 @@ namespace ttg { /// @post `num_threads()>0` int num_threads(); + /// Override whether TTG should attempt to communicate to and from device buffers. + /// TTG will attempt to query device support from the underlying MPI implementation (e.g., + /// using the unofficial extension MPIX_Query_cuda_support). However, since not all MPI implementations + /// support this extension, users can force the use of device buffers in communication by setting + /// `TTG_FORCE_DEVICE_COMM` to a non-zero value. + /// @return true if the user wants to force the use of device-side buffers in communication. + bool force_device_comm(); + } // namespace detail } // namespace ttg diff --git a/ttg/ttg/util/iovec.h b/ttg/ttg/util/iovec.h new file mode 100644 index 000000000..ad20d066d --- /dev/null +++ b/ttg/ttg/util/iovec.h @@ -0,0 +1,20 @@ +#ifndef TTG_UTIL_IOVEC_H_ +#define TTG_UTIL_IOVEC_H_ + +#include <cstddef> + +namespace ttg { + + /** + * Used to describe transfer payload in types using the \sa SplitMetadataDescriptor. + */ + struct iovec { + /// The number of bytes to read from / write to the memory location given by `data`. + std::size_t num_bytes; + /// Pointer to the data to be read from / written to.
+ void* data; + }; + +} // ttg + +#endif // TTG_UTIL_IOVEC_H_ diff --git a/ttg/ttg/util/meta.h b/ttg/ttg/util/meta.h index c19776118..b7bb31690 100644 --- a/ttg/ttg/util/meta.h +++ b/ttg/ttg/util/meta.h @@ -6,6 +6,9 @@ #include "ttg/util/span.h" #include "ttg/util/typelist.h" +#include "ttg/buffer.h" +#include "ttg/ptr.h" +#include "ttg/devicescratch.h" namespace ttg { @@ -290,6 +293,53 @@ namespace ttg { template constexpr bool is_any_nonconst_lvalue_reference_v> = is_any_nonconst_lvalue_reference_v; + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // device type traits + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + template + struct is_ptr : std::false_type + { }; + + template + struct is_ptr> : std::true_type + { }; + + template + constexpr bool is_ptr_v = is_ptr::value; + + template + struct is_buffer : std::false_type + { }; + + template + struct is_buffer> : std::true_type + { }; + + template + struct is_buffer> : std::true_type + { }; + + template + constexpr bool is_buffer_v = is_buffer::value; + + template + struct is_devicescratch : std::false_type + { }; + + template + struct is_devicescratch> : std::true_type + { }; + + template + struct is_devicescratch> : std::true_type + { }; + + template + constexpr bool is_devicescratch_v = is_devicescratch::value; + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // typelist metafunctions //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -549,6 +599,15 @@ namespace ttg { template constexpr bool is_tuple_v = is_tuple::value; + template + struct is_span : std::false_type {}; + + template + struct is_span> : std::true_type {}; + + template + constexpr bool is_span_v = is_span::value; + template