diff --git a/CMakeLists.txt b/CMakeLists.txt
index 28490557f..5406877c8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -59,6 +59,7 @@ option(TTG_ENABLE_LEVEL_ZERO "Whether to TTG will look for Intel oneAPI Level Ze
 option(TTG_EXAMPLES "Whether to build examples" OFF)
 option(TTG_ENABLE_ASAN "Whether to enable address sanitizer" OFF)
+option(TTG_ENABLE_COROUTINES "Whether to enable C++ coroutines, needed for accelerator device support" ON)
 option(TTG_FETCH_BOOST "Whether to fetch+build Boost, if missing" OFF)
 option(TTG_IGNORE_BUNDLED_EXTERNALS "Whether to skip installation and use of bundled external dependencies (Boost.CallableTraits)" OFF)
 option(TTG_ENABLE_TRACE "Whether to enable ttg::trace() output" OFF)
@@ -94,8 +95,26 @@ endif (BUILD_TESTING)
 ###########################
 # Boost
 include("${PROJECT_SOURCE_DIR}/cmake/modules/FindOrFetchBoost.cmake")
-# C++ coroutines
-find_package(CXXStdCoroutine MODULE REQUIRED COMPONENTS Final Experimental)
+
+if (TTG_ENABLE_COROUTINES)
+  set(SKIP_COROUTINE_DETECTION FALSE)
+  # C++ coroutines, check for broken GCC releases and skip if one is found
+  if (${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU")
+    if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 11.4.0)
+      set(SKIP_COROUTINE_DETECTION TRUE)
+    elseif(${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.1.0 AND ${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 12.3.0)
+      set(SKIP_COROUTINE_DETECTION TRUE)
+    endif()
+    if (SKIP_COROUTINE_DETECTION)
+      message(WARNING "GCC with broken Coroutine support detected, disabling Coroutine support. At least GCC 11.4, 12.3, or 13.1 required.")
+    endif(SKIP_COROUTINE_DETECTION)
+  endif(${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU")
+
+  if (NOT SKIP_COROUTINE_DETECTION)
+    find_package(CXXStdCoroutine MODULE REQUIRED COMPONENTS Final Experimental)
+    set(TTG_HAVE_COROUTINE CXXStdCoroutine_FOUND CACHE BOOL "True if the compiler has coroutine support")
+  endif(NOT SKIP_COROUTINE_DETECTION)
+endif(TTG_ENABLE_COROUTINES)
 
 ##########################
diff --git a/cmake/modules/ExternalDependenciesVersions.cmake b/cmake/modules/ExternalDependenciesVersions.cmake
index fd96e5816..43042cacd 100644
--- a/cmake/modules/ExternalDependenciesVersions.cmake
+++ b/cmake/modules/ExternalDependenciesVersions.cmake
@@ -4,7 +4,7 @@ set(TTG_TRACKED_VG_CMAKE_KIT_TAG 7ea2d4d3f8854b9e417f297fd74d6fc49aa13fd5) # used to provide "real" FindOrFetchBoost
 set(TTG_TRACKED_CATCH2_VERSION 3.5.0)
 set(TTG_TRACKED_MADNESS_TAG 2eb3bcf0138127ee2dbc651f1aabd3e9b0def4e3)
-set(TTG_TRACKED_PARSEC_TAG 0b3140f58ad9dc78a3d64da9fd73ecc7f443ece7)
+set(TTG_TRACKED_PARSEC_TAG 58f8f3089ecad2e8ee50e80a9586e05ce8873b1c)
 set(TTG_TRACKED_BTAS_TAG 4e8f5233aa7881dccdfcc37ce07128833926d3c2)
 set(TTG_TRACKED_TILEDARRAY_TAG 493c109379a1b64ddd5ef59f7e33b95633b68d73)
diff --git a/cmake/modules/FindOrFetchPARSEC.cmake b/cmake/modules/FindOrFetchPARSEC.cmake
index 7b164019f..b3fd5faa3 100644
--- a/cmake/modules/FindOrFetchPARSEC.cmake
+++ b/cmake/modules/FindOrFetchPARSEC.cmake
@@ -17,7 +17,7 @@ if (NOT TARGET PaRSEC::parsec)
   FetchContent_Declare(
           PARSEC
-          GIT_REPOSITORY https://github.com/devreal/parsec-1.git
+          GIT_REPOSITORY https://github.com/ICLDisco/parsec.git
           GIT_TAG ${TTG_TRACKED_PARSEC_TAG}
   )
   FetchContent_MakeAvailable(PARSEC)
diff --git a/examples/potrf/pmw.h b/examples/potrf/pmw.h
index 0f8d75d7d..25f01933c 100644
--- a/examples/potrf/pmw.h
+++ b/examples/potrf/pmw.h
@@ -102,6 +102,14 @@ class PaRSECMatrixWrapper {
            (pm->uplo == PARSEC_MATRIX_UPPER && col >= row);
   }
 
+  int P() const {
+    return pm->grid.rows;
+  }
+
+  int Q() const {
+    return pm->grid.cols;
+  }
+
   PaRSECMatrixT* parsec() {
     return pm;
   }
@@ -132,7 +140,7 @@ class PaRSECMatrixWrapper {
 };
 
 template
-using MatrixT = PaRSECMatrixWrapper;
+using MatrixT = PaRSECMatrixWrapper;
 
 static auto make_load_tt(MatrixT &A, ttg::Edge> &toop, bool defer_write) {
diff --git a/examples/potrf/potrf.h b/examples/potrf/potrf.h
index f6ba6e147..60daef72f 100644
--- a/examples/potrf/potrf.h
+++ b/examples/potrf/potrf.h
@@ -674,10 +674,22 @@ namespace potrf {
     auto keymap1 = [&](const Key1& key) { return A.rank_of(key[0], key[0]); };
 
     auto keymap2a = [&](const Key2& key) { return A.rank_of(key[0], key[1]); };
-    auto keymap2b = [&](const Key2& key) { return A.rank_of(key[0], key[0]); };
+    auto keymap2b = [&](const Key2& key) { return A.rank_of(key[1], key[1]); };
 
     auto keymap3 = [&](const Key3& key) { return A.rank_of(key[0], key[1]); };
 
+    /**
+     * Device map hints: we try to keep tiles on one row on the same device to minimize
+     * data movement between devices. This provides hints for load-balancing up front
+     * and avoids movement of the TRSM result to GEMM tasks.
+     */
+    auto devmap1 = [&](const Key1& key) { return (key[0] / A.P()) % ttg::device::num_devices(); };
+
+    auto devmap2a = [&](const Key2& key) { return (key[0] / A.P()) % ttg::device::num_devices(); };
+    auto devmap2b = [&](const Key2& key) { return (key[1] / A.P()) % ttg::device::num_devices(); };
+
+    auto devmap3 = [&](const Key3& key) { return (key[0] / A.P()) % ttg::device::num_devices(); };
+
     ttg::Edge> syrk_potrf("syrk_potrf"), disp_potrf("disp_potrf");
 
     ttg::Edge> potrf_trsm("potrf_trsm"), trsm_syrk("trsm_syrk"), gemm_trsm("gemm_trsm"),
@@ -692,18 +704,30 @@ namespace potrf {
     auto tt_potrf = make_potrf(A, disp_potrf, syrk_potrf, potrf_trsm, output);
     tt_potrf->set_keymap(keymap1);
     tt_potrf->set_defer_writer(defer_write);
+#ifdef ENABLE_DEVICE_KERNEL
+    tt_potrf->set_devicemap(devmap1);
+#endif // 0
 
     auto tt_trsm = make_trsm(A, disp_trsm, potrf_trsm, gemm_trsm, trsm_syrk, trsm_gemm_row, trsm_gemm_col, output);
     tt_trsm->set_keymap(keymap2a);
     tt_trsm->set_defer_writer(defer_write);
+#ifdef ENABLE_DEVICE_KERNEL
+    tt_trsm->set_devicemap(devmap2a);
+#endif // 0
 
     auto tt_syrk = make_syrk(A, disp_syrk, trsm_syrk, syrk_syrk, syrk_potrf, syrk_syrk);
     tt_syrk->set_keymap(keymap2b);
     tt_syrk->set_defer_writer(defer_write);
+#ifdef ENABLE_DEVICE_KERNEL
+    tt_syrk->set_devicemap(devmap2b);
+#endif // 0
 
     auto tt_gemm = make_gemm(A, disp_gemm, trsm_gemm_row, trsm_gemm_col, gemm_gemm, gemm_trsm, gemm_gemm);
     tt_gemm->set_keymap(keymap3);
     tt_gemm->set_defer_writer(defer_write);
+#ifdef ENABLE_DEVICE_KERNEL
+    tt_gemm->set_devicemap(devmap3);
+#endif // 0
 
     /* Priorities taken from DPLASMA */
     auto nt = A.cols();
diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index 275f3fdd8..7f25dd6fe 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -12,12 +12,14 @@ set(ut_libs Catch2::Catch2)
 # coroutine tests
 # we definitely have TARGET std::coroutine
-list(APPEND ut_src fibonacci-coro.cc)
-list(APPEND ut_src device_coro.cc)
-if (TTG_HAVE_CUDA)
-  list(APPEND ut_src cuda_kernel.cu)
-endif(TTG_HAVE_CUDA)
-list(APPEND ut_libs std::coroutine)
+if (CXXStdCoroutine_FOUND)
+  list(APPEND ut_src fibonacci-coro.cc)
+  list(APPEND ut_src device_coro.cc)
+  if (TTG_HAVE_CUDA)
+    list(APPEND ut_src cuda_kernel.cu)
+  endif(TTG_HAVE_CUDA)
+  list(APPEND ut_libs std::coroutine)
+endif(CXXStdCoroutine_FOUND)
 
 add_ttg_executable(core-unittests-ttg "${ut_src}"
                    LINK_LIBRARIES "${ut_libs}"
                    COMPILE_DEFINITIONS "CATCH_CONFIG_NO_POSIX_SIGNALS=1" )
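The device maps added to examples/potrf/potrf.h above compute their hint as (tile row / P) % num_devices: the tile rows owned by one rank sit P apart in the 2D block-cyclic layout, so dividing by P enumerates them densely and the modulo folds them round-robin onto the rank-local devices, keeping a whole tile row on one device. The following is a minimal standalone sketch of just that arithmetic; the P = 2 and two-device values are made-up illustration inputs, not taken from the patch.

#include <cstdio>

// Round-robin row-to-device hint, as used by devmap1/devmap2a/devmap3 above:
// tile rows owned by one rank are P apart, so row / P enumerates them densely
// and % num_devices spreads them over that rank's local devices.
static int devmap_hint(int row, int P, int num_devices) {
  return (row / P) % num_devices;
}

int main() {
  const int P = 2;            // assumed process-grid rows (A.P() in the example)
  const int num_devices = 2;  // assumed ttg::device::num_devices()
  for (int row = 0; row < 8; ++row)
    std::printf("tile row %d -> device hint %d\n", row, devmap_hint(row, P, num_devices));
  return 0;
}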
diff --git a/ttg/CMakeLists.txt b/ttg/CMakeLists.txt
index 900df6519..b1fa72947 100644
--- a/ttg/CMakeLists.txt
+++ b/ttg/CMakeLists.txt
@@ -114,7 +114,6 @@ if (TTG_ENABLE_TRACE)
 endif (TTG_ENABLE_TRACE)
 if (TARGET std::coroutine)
   list(APPEND ttg-deps std::coroutine)
-  list(APPEND ttg-defs "TTG_HAS_COROUTINE=1")
   list(APPEND ttg-util-headers
       ${CMAKE_CURRENT_SOURCE_DIR}/ttg/coroutine.h
       )
@@ -208,6 +207,7 @@ endif(TARGET Boost::serialization)
 if (TARGET MADworld)
   set(ttg-mad-headers
       ${CMAKE_CURRENT_SOURCE_DIR}/ttg/madness/buffer.h
+      ${CMAKE_CURRENT_SOURCE_DIR}/ttg/madness/device.h
       ${CMAKE_CURRENT_SOURCE_DIR}/ttg/madness/fwd.h
       ${CMAKE_CURRENT_SOURCE_DIR}/ttg/madness/import.h
       ${CMAKE_CURRENT_SOURCE_DIR}/ttg/madness/ttg.h
diff --git a/ttg/ttg/config.in.h b/ttg/ttg/config.in.h
index 51e58b4a2..106f0c58a 100644
--- a/ttg/ttg/config.in.h
+++ b/ttg/ttg/config.in.h
@@ -11,6 +11,9 @@
 /** the C++ namespace containing the coroutine API */
 #define TTG_CXX_COROUTINE_NAMESPACE @CXX_COROUTINE_NAMESPACE@
 
+/** whether the compiler supports C++ coroutines */
+#cmakedefine TTG_HAVE_COROUTINE
+
 /** whether TTG has CUDA language support */
 #cmakedefine TTG_HAVE_CUDA
diff --git a/ttg/ttg/coroutine.h b/ttg/ttg/coroutine.h
index 81d5b1657..dc1ed8e5b 100644
--- a/ttg/ttg/coroutine.h
+++ b/ttg/ttg/coroutine.h
@@ -6,11 +6,14 @@
 #define TTG_COROUTINE_H
 
 #include "ttg/config.h"
+
+#ifdef TTG_HAVE_COROUTINE
 #include TTG_CXX_COROUTINE_HEADER
 
 #include
 #include
+
 namespace ttg {
 
   // import std coroutine API into ttg namespace
@@ -227,4 +230,6 @@ namespace ttg {
 
 }  // namespace ttg
 
+#endif  // TTG_HAVE_COROUTINE
+
 #endif  // TTG_COROUTINE_H
diff --git a/ttg/ttg/device/device.h b/ttg/ttg/device/device.h
index 6690982f6..244e9c944 100644
--- a/ttg/ttg/device/device.h
+++ b/ttg/ttg/device/device.h
@@ -2,6 +2,8 @@
 
 #include "ttg/config.h"
 #include "ttg/execution.h"
+#include "ttg/impl_selector.h"
+#include "ttg/fwd.h"
 
@@ -180,3 +182,9 @@ namespace ttg::device {
   }
 }  // namespace ttg
 #endif  // defined(TTG_HAVE_HIP)
+
+namespace ttg::device {
+  inline int num_devices() {
+    return TTG_IMPL_NS::num_devices();
+  }
+}
diff --git a/ttg/ttg/device/task.h b/ttg/ttg/device/task.h
index d95e0d1eb..8e2d14cfc 100644
--- a/ttg/ttg/device/task.h
+++ b/ttg/ttg/device/task.h
@@ -9,6 +9,8 @@
 #include "ttg/impl_selector.h"
 #include "ttg/ptr.h"
 
+#ifdef TTG_HAVE_COROUTINE
+
 namespace ttg::device {
 
   namespace detail {
@@ -632,6 +634,8 @@ namespace ttg::device {
   bool device_reducer::completed() { return base_type::promise().state() == ttg::device::detail::TTG_DEVICE_CORO_COMPLETE; }
 #endif // 0
 
-}  // namespace ttg::devie
+}  // namespace ttg::device
+
+#endif  // TTG_HAVE_COROUTINE
 
 #endif  // TTG_DEVICE_TASK_H
diff --git a/ttg/ttg/madness/device.h b/ttg/ttg/madness/device.h
new file mode 100644
index 000000000..e13321194
--- /dev/null
+++ b/ttg/ttg/madness/device.h
@@ -0,0 +1,9 @@
+#ifndef TTG_MADNESS_DEVICE_H
+#define TTG_MADNESS_DEVICE_H
+
+namespace ttg_madness {
+  /* no device support in MADNESS */
+  inline int num_devices() { return 0; }
+}
+
+#endif // TTG_MADNESS_DEVICE_H
\ No newline at end of file
diff --git a/ttg/ttg/madness/fwd.h b/ttg/ttg/madness/fwd.h
index 0d340db0b..6bee1a832 100644
--- a/ttg/ttg/madness/fwd.h
+++ b/ttg/ttg/madness/fwd.h
@@ -77,6 +77,8 @@ namespace ttg_madness {
   template
   inline void mark_device_out(std::tuple &b);
 
+  inline int num_devices();
+
 }  // namespace ttg_madness
 
 #endif  // TTG_MADNESS_FWD_H
diff --git a/ttg/ttg/madness/ttg.h b/ttg/ttg/madness/ttg.h
index 5fab6f6dd..5d2360cfb 100644
--- a/ttg/ttg/madness/ttg.h
+++ b/ttg/ttg/madness/ttg.h
@@ -13,6 +13,7 @@
 #include "ttg/base/keymap.h"
 #include "ttg/base/tt.h"
 #include "ttg/func.h"
+#include "ttg/madness/device.h"
 #include "ttg/runtimes.h"
 #include "ttg/tt.h"
 #include "ttg/util/bug.h"
@@ -23,9 +24,7 @@
 #include "ttg/util/meta/callable.h"
 #include "ttg/util/void.h"
 #include "ttg/world.h"
-#ifdef TTG_HAS_COROUTINE
 #include "ttg/coroutine.h"
-#endif
 
 #include
 #include
@@ -302,10 +301,10 @@ namespace ttg_madness {
       derivedT *derived;  // Pointer to derived class instance
       bool pull_terminals_invoked = false;
       std::conditional_t, ttg::Void, keyT> key;  // Task key
-#ifdef TTG_HAS_COROUTINE
+#ifdef TTG_HAVE_COROUTINE
       void *suspended_task_address = nullptr;  // if not null the function is suspended
       ttg::TaskCoroutineID coroutine_id = ttg::TaskCoroutineID::Invalid;
-#endif
+#endif  // TTG_HAVE_COROUTINE
 
       /// makes a tuple of references out of tuple of
       template
@@ -335,11 +334,11 @@ namespace ttg_madness {
         ttT::threaddata.call_depth++;
 
         void *suspended_task_address =
-#ifdef TTG_HAS_COROUTINE
+#ifdef TTG_HAVE_COROUTINE
             this->suspended_task_address;  // non-null = need to resume the task
-#else
+#else  // TTG_HAVE_COROUTINE
             nullptr;
-#endif
+#endif  // TTG_HAVE_COROUTINE
         if (suspended_task_address == nullptr) {  // task is a coroutine that has not started or an ordinary function
           // ttg::print("starting task");
           if constexpr (!ttg::meta::is_void_v && !ttg::meta::is_empty_tuple_v) {
@@ -361,7 +360,7 @@ namespace ttg_madness {
           } else  // unreachable
             ttg::abort();
         } else {  // resume suspended coroutine
-#ifdef TTG_HAS_COROUTINE
+#ifdef TTG_HAVE_COROUTINE
           auto ret = static_cast(ttg::coroutine_handle::from_address(suspended_task_address));
           assert(ret.ready());
           ret.resume();
@@ -372,9 +371,9 @@ namespace ttg_madness {
             // leave suspended_task_address as is
           }
           this->suspended_task_address = suspended_task_address;
-#else
+#else  // TTG_HAVE_COROUTINE
           ttg::abort();  // should not happen
-#endif
+#endif  // TTG_HAVE_COROUTINE
         }
         ttT::threaddata.call_depth--;
 
@@ -383,7 +382,7 @@ namespace ttg_madness {
         //   ttg::print("finishing task",ttT::threaddata.call_depth);
         // }
 
-#ifdef TTG_HAS_COROUTINE
+#ifdef TTG_HAVE_COROUTINE
         if (suspended_task_address) {
           // TODO implement handling of suspended coroutines properly
@@ -411,7 +410,7 @@ namespace ttg_madness {
             ttg::abort();
           }
         }
-#endif  // TTG_HAS_COROUTINE
+#endif  // TTG_HAVE_COROUTINE
       }
 
       virtual ~TTArgs() {}  // Will be deleted via TaskInterface*
diff --git a/ttg/ttg/make_tt.h b/ttg/ttg/make_tt.h
index 1711d8556..81897b816 100644
--- a/ttg/ttg/make_tt.h
+++ b/ttg/ttg/make_tt.h
@@ -149,7 +149,7 @@ class CallableWrapTTArgs
   std::conditional_t, std::add_pointer_t, noref_funcT> func;
 
   using op_return_type =
-#ifdef TTG_HAS_COROUTINE
+#ifdef TTG_HAVE_COROUTINE
       std::conditional_t,
                          ttg::coroutine_handle,
 #ifdef TTG_HAVE_DEVICE
@@ -160,9 +160,9 @@ class CallableWrapTTArgs
                          void
 #endif  // TTG_HAVE_DEVICE
                          >;
-#else  // TTG_HAS_COROUTINE
+#else  // TTG_HAVE_COROUTINE
       void;
-#endif  // TTG_HAS_COROUTINE
+#endif  // TTG_HAVE_COROUTINE
 
  public:
   static constexpr bool have_cuda_op = (space == ttg::ExecutionSpace::CUDA);
@@ -176,7 +176,7 @@ class CallableWrapTTArgs
     static_assert(std::is_same_v, returnT>,
                   "CallableWrapTTArgs: returnT does not match the actual return type of funcT");
     if constexpr (!std::is_void_v) {  // protect from compiling for void returnT
-#ifdef TTG_HAS_COROUTINE
+#ifdef TTG_HAVE_COROUTINE
       if constexpr (std::is_same_v) {
         ttg::coroutine_handle coro_handle;
         // if task completed destroy it
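The TTG_HAVE_COROUTINE code paths above park a suspended task as a raw void* (suspended_task_address) and later rebuild the handle with from_address() before resuming it. Below is a minimal, self-contained sketch of that standard-library pattern only; the resumable/work types are invented for illustration and are not TTG's task or promise types.

#include <coroutine>
#include <cstdio>

// A trivial coroutine whose handle can be parked as a type-erased void*.
struct resumable {
  struct promise_type {
    resumable get_return_object() {
      return {std::coroutine_handle<promise_type>::from_promise(*this)};
    }
    std::suspend_always initial_suspend() { return {}; }
    std::suspend_always final_suspend() noexcept { return {}; }
    void return_void() {}
    void unhandled_exception() {}
  };
  std::coroutine_handle<promise_type> handle;
};

resumable work() {
  std::puts("first resume");
  co_await std::suspend_always{};  // suspend; the caller stashes the address
  std::puts("second resume");
}

int main() {
  void* suspended_task_address = work().handle.address();  // park as void*
  auto h = std::coroutine_handle<>::from_address(suspended_task_address);
  h.resume();              // runs until the co_await
  h.resume();              // finishes the coroutine body
  if (h.done()) h.destroy();
  return 0;
}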
diff --git a/ttg/ttg/parsec/buffer.h b/ttg/ttg/parsec/buffer.h
index 98b14eb12..6e609a158 100644
--- a/ttg/ttg/parsec/buffer.h
+++ b/ttg/ttg/parsec/buffer.h
@@ -329,6 +329,14 @@ struct Buffer : public detail::ttg_parsec_data_wrapper_t
     //          << " parsec_data " << m_data.get() << std::endl;
   }
 
+  void prefer_device(ttg::device::Device dev) {
+    /* only set device if the host has the latest copy as otherwise we might end up with a stale copy */
+    if (dev.is_device() && this->parsec_data()->owner_device == 0) {
+      parsec_advise_data_on_device(this->parsec_data(), detail::ttg_device_to_parsec_device(dev),
+                                   PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE);
+    }
+  }
+
   /* serialization support */
 #ifdef TTG_SERIALIZATION_SUPPORTS_BOOST
diff --git a/ttg/ttg/parsec/device.h b/ttg/ttg/parsec/device.h
index 77722b1c1..9f8ada05c 100644
--- a/ttg/ttg/parsec/device.h
+++ b/ttg/ttg/parsec/device.h
@@ -2,6 +2,7 @@
 #define TTG_PARSEC_DEVICE_H
 
 #include "ttg/device/device.h"
+#include
 
 namespace ttg_parsec {
 
@@ -35,6 +36,12 @@ namespace ttg_parsec {
     }
 
   } // namespace detail
+
+  inline
+  int num_devices() {
+    return parsec_nb_devices - detail::first_device_id;
+  }
+
 } // namespace ttg_parsec
 
 #endif // TTG_PARSEC_DEVICE_H
\ No newline at end of file
diff --git a/ttg/ttg/parsec/fwd.h b/ttg/ttg/parsec/fwd.h
index d5bc8931e..0cd798e87 100644
--- a/ttg/ttg/parsec/fwd.h
+++ b/ttg/ttg/parsec/fwd.h
@@ -82,6 +82,8 @@ namespace ttg_parsec {
   template
   inline void mark_device_out(std::tuple &b);
 
+  inline int num_devices();
+
 #if 0
   template
   inline std::pair>...>> get_ptr(Args&&... args);
diff --git a/ttg/ttg/parsec/task.h b/ttg/ttg/parsec/task.h
index 5b23d53af..656117b94 100644
--- a/ttg/ttg/parsec/task.h
+++ b/ttg/ttg/parsec/task.h
@@ -15,6 +15,7 @@ namespace ttg_parsec {
       parsec_flow_t* flows = nullptr;
       parsec_gpu_exec_stream_t* stream = nullptr;
       parsec_device_gpu_module_t* device = nullptr;
+      parsec_task_class_t task_class; // copy of the taskclass
     };
 
     template
@@ -57,9 +58,10 @@ namespace ttg_parsec {
    }
 
    inline
-   ttg_parsec_data_flags operator|=(ttg_parsec_data_flags lhs, ttg_parsec_data_flags rhs) {
+   ttg_parsec_data_flags operator|=(ttg_parsec_data_flags& lhs, ttg_parsec_data_flags rhs) {
     using flags_type = std::underlying_type::type;
-    return ttg_parsec_data_flags(static_cast(lhs) | static_cast(rhs));
+    lhs = ttg_parsec_data_flags(static_cast(lhs) | static_cast(rhs));
+    return lhs;
    }
 
    inline
@@ -68,6 +70,13 @@ namespace ttg_parsec {
     return static_cast(lhs) & static_cast(rhs);
    }
 
+   inline
+   ttg_parsec_data_flags operator&=(ttg_parsec_data_flags& lhs, ttg_parsec_data_flags rhs) {
+    using flags_type = std::underlying_type::type;
+    lhs = ttg_parsec_data_flags(static_cast(lhs) & static_cast(rhs));
+    return lhs;
+   }
+
    inline
    bool operator!(ttg_parsec_data_flags lhs) {
     using flags_type = std::underlying_type::type;
@@ -146,10 +155,14 @@ namespace ttg_parsec {
       : data_count(data_count)
       , copies(copies)
       , defer_writer(defer_writer) {
+        PARSEC_OBJ_CONSTRUCT(&parsec_task, parsec_task_t);
        PARSEC_LIST_ITEM_SINGLETON(&parsec_task.super);
        parsec_task.mempool_owner = mempool;
        parsec_task.task_class = task_class;
        parsec_task.priority = 0;
+
+        // TODO: can we avoid this?
+        for (int i = 0; i < MAX_PARAM_COUNT; ++i) { this->parsec_task.data[i].data_in = nullptr; }
      }
 
      parsec_ttg_task_base_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class,
@@ -161,6 +174,7 @@ namespace ttg_parsec {
       , copies(copies)
       , release_task_cb(release_fn)
       , defer_writer(defer_writer) {
+        PARSEC_OBJ_CONSTRUCT(&parsec_task, parsec_task_t);
        PARSEC_LIST_ITEM_SINGLETON(&parsec_task.super);
        parsec_task.mempool_owner = mempool;
        parsec_task.task_class = task_class;
@@ -168,6 +182,9 @@ namespace ttg_parsec {
        parsec_task.taskpool = taskpool;
        parsec_task.priority = priority;
        parsec_task.chore_mask = 1<<0;
+
+        // TODO: can we avoid this?
+        for (int i = 0; i < MAX_PARAM_COUNT; ++i) { this->parsec_task.data[i].data_in = nullptr; }
      }
 
    public:
@@ -185,7 +202,7 @@ namespace ttg_parsec {
      TT* tt = nullptr;
      key_type key;
      std::array streams;
-#ifdef TTG_HAS_COROUTINE
+#ifdef TTG_HAVE_COROUTINE
      void* suspended_task_address = nullptr;  // if not null the function is suspended
      ttg::TaskCoroutineID coroutine_id = ttg::TaskCoroutineID::Invalid;
 #endif
@@ -234,6 +251,15 @@ namespace ttg_parsec {
        }
      }
 
+      template
+      parsec_hook_return_t invoke_evaluate() {
+        if constexpr (Space == ttg::ExecutionSpace::Host) {
+          return PARSEC_HOOK_RETURN_DONE;
+        } else {
+          return TT::template device_static_evaluate(&this->parsec_task);
+        }
+      }
+
      parsec_key_t pkey() { return reinterpret_cast(&key); }
    };
@@ -242,7 +268,7 @@ namespace ttg_parsec {
      static constexpr size_t num_streams = TT::numins;
      TT* tt = nullptr;
      std::array streams;
-#ifdef TTG_HAS_COROUTINE
+#ifdef TTG_HAVE_COROUTINE
      void* suspended_task_address = nullptr;  // if not null the function is suspended
      ttg::TaskCoroutineID coroutine_id = ttg::TaskCoroutineID::Invalid;
 #endif
diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h
index f4653f962..73672d285 100644
--- a/ttg/ttg/parsec/ttg.h
+++ b/ttg/ttg/parsec/ttg.h
@@ -12,6 +12,8 @@
  * This may cause deadlocks, so use with caution. */
 #define TTG_PARSEC_DEFER_WRITER false
 
+#include "ttg/config.h"
+
 #include "ttg/impl_selector.h"
 
 /* include ttg header to make symbols available in case this header is included directly */
@@ -740,6 +742,7 @@ namespace ttg_parsec {
    inline void transfer_ownership_impl(ttg_data_copy_t *copy, int device) {
      if constexpr(!std::is_const_v>) {
        copy->transfer_ownership(PARSEC_FLOW_ACCESS_RW, device);
+        copy->inc_current_version();
      }
    }
@@ -765,7 +768,8 @@ namespace ttg_parsec {
        parsec_ttg_task_t *me = (parsec_ttg_task_t *)parsec_task;
        return me->template invoke_op();
      } else {
-        throw std::runtime_error("PaRSEC CUDA hook invoked on a TT that does not support CUDA operations!");
+        std::cerr << "CUDA hook called without having a CUDA op!" << std::endl;
+        return PARSEC_HOOK_RETURN_ERROR;
      }
    }
@@ -775,7 +779,8 @@ namespace ttg_parsec {
        parsec_ttg_task_t *me = (parsec_ttg_task_t *)parsec_task;
        return me->template invoke_op();
      } else {
-        throw std::runtime_error("PaRSEC HIP hook invoked on a TT that does not support HIP operations!");
+        std::cerr << "HIP hook called without having a HIP op!" << std::endl;
+        return PARSEC_HOOK_RETURN_ERROR;
      }
    }
@@ -785,10 +790,43 @@ namespace ttg_parsec {
        parsec_ttg_task_t *me = (parsec_ttg_task_t *)parsec_task;
        return me->template invoke_op();
      } else {
-        throw std::runtime_error("PaRSEC HIP hook invoked on a TT that does not support HIP operations!");
+        std::cerr << "L0 hook called without having a L0 op!" << std::endl;
+        return PARSEC_HOOK_RETURN_ERROR;
+      }
+    }
+
+
+    template
+    inline parsec_hook_return_t evaluate_cuda(const parsec_task_t *parsec_task) {
+      if constexpr(TT::derived_has_cuda_op()) {
+        parsec_ttg_task_t *me = (parsec_ttg_task_t *)parsec_task;
+        return me->template invoke_evaluate();
+      } else {
+        return PARSEC_HOOK_RETURN_NEXT;
+      }
+    }
+
+    template
+    inline parsec_hook_return_t evaluate_hip(const parsec_task_t *parsec_task) {
+      if constexpr(TT::derived_has_hip_op()) {
+        parsec_ttg_task_t *me = (parsec_ttg_task_t *)parsec_task;
+        return me->template invoke_evaluate();
+      } else {
+        return PARSEC_HOOK_RETURN_NEXT;
+      }
+    }
+
+    template
+    inline parsec_hook_return_t evaluate_level_zero(const parsec_task_t *parsec_task) {
+      if constexpr(TT::derived_has_level_zero_op()) {
+        parsec_ttg_task_t *me = (parsec_ttg_task_t *)parsec_task;
+        return me->template invoke_evaluate();
+      } else {
+        return PARSEC_HOOK_RETURN_NEXT;
      }
    }
+
    template
    class rma_delayed_activate {
      std::vector _keylist;
@@ -1263,6 +1301,7 @@ namespace ttg_parsec {
    ttg::World world;
    ttg::meta::detail::keymap_t keymap;
    ttg::meta::detail::keymap_t priomap;
+    ttg::meta::detail::keymap_t devicemap;
    // For now use same type for unary/streaming input terminals, and stream reducers assigned at runtime
    ttg::meta::detail::input_reducers_t input_reducers;  //!< Reducers for the input terminals (empty = expect single value)
@@ -1450,36 +1489,95 @@
    }
 
    template
-    static parsec_hook_return_t device_static_op(parsec_task_t* parsec_task) {
-      static_assert(derived_has_device_op());
-
-      int dev_index;
-      double ratio = 1.0;
+    static parsec_hook_return_t device_static_evaluate(parsec_task_t* parsec_task) {
 
      task_t *task = (task_t*)parsec_task;
-      parsec_execution_stream_s *es = task->tt->world.impl().execution_stream();
+      if (task->dev_ptr->gpu_task == nullptr) {
+
+        /* set up a device task */
+        parsec_gpu_task_t *gpu_task;
+        /* PaRSEC wants to free the gpu_task, because F***K ownerships */
+        gpu_task = static_cast(std::calloc(1, sizeof(*gpu_task)));
+        PARSEC_OBJ_CONSTRUCT(gpu_task, parsec_list_item_t);
+        gpu_task->ec = parsec_task;
+        gpu_task->task_type = 0; // user task
+        gpu_task->last_data_check_epoch = -1; // used internally
+        gpu_task->pushout = 0;
+        gpu_task->submit = &TT::device_static_submit;
+
+        // one way to force the task device
+        // currently this will probably break all of PaRSEC if this hint
+        // does not match where the data is located, not really useful for us
+        // instead we set a hint on the data if there is no hint set yet
+        //parsec_task->selected_device = ...;
+
+        /* set the gpu_task so it's available in register_device_memory */
+        task->dev_ptr->gpu_task = gpu_task;
+
+        /* TODO: is this the right place to set the mask? */
+        task->parsec_task.chore_mask = PARSEC_DEV_ALL;
+
+        /* copy over the task class, because that's what we need */
+        task->dev_ptr->task_class = *task->parsec_task.task_class;
+
+        // first invocation of the coroutine to get the coroutine handle
+        static_op(parsec_task);
+
+        /* when we come back here, the flows in gpu_task are set (see register_device_memory) */
+
+        parsec_task_class_t& tc = task->dev_ptr->task_class;
+
+        // input flows are set up during register_device_memory as part of the first invocation above
+        for (int i = 0; i < MAX_PARAM_COUNT; ++i) {
+          tc.in[i]  = gpu_task->flow[i];
+          tc.out[i] = gpu_task->flow[i];
+        }
+        tc.nb_flows = MAX_PARAM_COUNT;
+
+        /* set the device hint on the data */
+        TT *tt = task->tt;
+        if (tt->devicemap) {
+          int parsec_dev;
+          if constexpr (std::is_void_v) {
+            parsec_dev = detail::ttg_device_to_parsec_device(tt->devicemap());
+          } else {
+            parsec_dev = detail::ttg_device_to_parsec_device(tt->devicemap(task->key));
+          }
+          for (int i = 0; i < MAX_PARAM_COUNT; ++i) {
+            /* only set on mutable data since we have exclusive access */
+            if (tc.in[i]->flow_flags & PARSEC_FLOW_ACCESS_WRITE) {
+              parsec_data_t *data = parsec_task->data[i].data_in->original;
+              /* only set the preferred device if the host has the latest copy
+               * as otherwise we may end up with the wrong data if there is a newer
+               * version on a different device. Also, keep fingers crossed. */
+              if (data->owner_device == 0) {
+                parsec_advise_data_on_device(data, parsec_dev, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE);
+              }
+            }
+          }
+        }
 
-      //std::cout << "device_static_op: task " << parsec_task << std::endl;
+        /* set the new task class that contains the flows */
+        task->parsec_task.task_class = &task->dev_ptr->task_class;
 
-      /* set up a device task */
-      parsec_gpu_task_t *gpu_task;
-      /* PaRSEC wants to free the gpu_task, because F***K ownerships */
-      gpu_task = static_cast(std::calloc(1, sizeof(*gpu_task)));
-      PARSEC_OBJ_CONSTRUCT(gpu_task, parsec_list_item_t);
-      gpu_task->ec = parsec_task;
-      gpu_task->task_type = 0; // user task
-      gpu_task->load = 1; // TODO: can we do better?
-      gpu_task->last_data_check_epoch = -1; // used internally
-      gpu_task->pushout = 0;
-      gpu_task->submit = &TT::device_static_submit;
+        /* select this one */
+        return PARSEC_HOOK_RETURN_DONE;
+      }
 
-      /* set the gpu_task so it's available in register_device_memory */
-      task->dev_ptr->gpu_task = gpu_task;
+      std::cerr << "EVALUATE called on task with assigned GPU task!" << std::endl;
-      // first invocation of the coroutine to get the coroutine handle
-      static_op(parsec_task);
+      /* not sure if this might happen*/
+      return PARSEC_HOOK_RETURN_ERROR;
 
-      /* when we come back here, the flows in gpu_task are set (see register_device_memory) */
+    }
+
+    template
+    static parsec_hook_return_t device_static_op(parsec_task_t* parsec_task) {
+      static_assert(derived_has_device_op());
+
+      /* when we come in here we have a device assigned and are ready to go */
+
+      task_t *task = (task_t*)parsec_task;
 
      if (nullptr == task->suspended_task_address) {
        /* short-cut in case the task returned immediately */
@@ -1495,40 +1593,12 @@
      /* for now make sure we're waiting for transfers and the coro hasn't skipped this step */
      assert(dev_data.state() == ttg::device::detail::TTG_DEVICE_CORO_WAIT_TRANSFER);
 
-      /* set up a temporary task-class to correctly specify the flows */
-      parsec_task_class_t tc = *task->parsec_task.task_class;
-
-      tc.name = task->parsec_task.task_class->name;
-      // input flows are set up during register_device_memory as part of the first invocation above
-      for (int i = 0; i < MAX_PARAM_COUNT; ++i) {
-        tc.in[i]  = gpu_task->flow[i];
-        tc.out[i] = gpu_task->flow[i];
-      }
-      tc.nb_flows = MAX_PARAM_COUNT;
-
-      /* swap in the new task class */
-      const parsec_task_class_t* tmp = task->parsec_task.task_class;
-      *const_cast(&task->parsec_task.task_class) = &tc;
-
-      /* TODO: is this the right place to set the mask? */
-      task->parsec_task.chore_mask = PARSEC_DEV_ALL;
-      /* get a device and come back if we need another one */
-      int64_t task_load = 1;
-      dev_index = parsec_get_best_device(parsec_task, &task_load);
-
-      /* swap back the original task class */
-      task->parsec_task.task_class = tmp;
-
-      gpu_task->load = task_load;
-      assert(dev_index >= 0);
-      if (!parsec_mca_device_is_gpu(dev_index)) {
-        return PARSEC_HOOK_RETURN_NEXT;  /* Fall back */
-      }
-
-      parsec_device_gpu_module_t *device = (parsec_device_gpu_module_t*)parsec_mca_device_get(dev_index);
+      parsec_device_gpu_module_t *device = (parsec_device_gpu_module_t*)task->parsec_task.selected_device;
      assert(NULL != device);
      task->dev_ptr->device = device;
+      parsec_gpu_task_t *gpu_task = task->dev_ptr->gpu_task;
+      parsec_execution_stream_s *es = task->tt->world.impl().execution_stream();
 
      switch(device->super.type) {
@@ -1575,11 +1645,11 @@
      task_t *task = (task_t*)parsec_task;
 
      void* suspended_task_address =
-#ifdef TTG_HAS_COROUTINE
+#ifdef TTG_HAVE_COROUTINE
        task->suspended_task_address;  // non-null = need to resume the task
-#else
+#else  // TTG_HAVE_COROUTINE
        nullptr;
-#endif
+#endif  // TTG_HAVE_COROUTINE
      //std::cout << "static_op: suspended_task_address " << suspended_task_address << std::endl;
      if (suspended_task_address == nullptr) {  // task is a coroutine that has not started or an ordinary function
@@ -1611,9 +1681,9 @@
      } else {  // resume the suspended coroutine
 
+#ifdef TTG_HAVE_COROUTINE
        assert(task->coroutine_id != ttg::TaskCoroutineID::Invalid);
-#ifdef TTG_HAS_COROUTINE
 #ifdef TTG_HAVE_DEVICE
        if (task->coroutine_id == ttg::TaskCoroutineID::DeviceTask) {
          ttg::device::Task coro = ttg::device::detail::device_task_handle_type::from_address(suspended_task_address);
@@ -1654,14 +1724,14 @@
          }
          task->tt->set_outputs_tls_ptr(old_output_tls_ptr);
          detail::parsec_ttg_caller = nullptr;
+          task->suspended_task_address = suspended_task_address;
        } else
          ttg::abort();  // unrecognized task id
-#else  // TTG_HAS_COROUTINE
-ttg::abort();  // should not happen
-#endif  // TTG_HAS_COROUTINE
+#else  // TTG_HAVE_COROUTINE
+        ttg::abort();  // should not happen
+#endif  // TTG_HAVE_COROUTINE
      }
-      task->suspended_task_address = suspended_task_address;
 
      if (suspended_task_address == nullptr) {
        ttT *baseobj = task->tt;
@@ -1682,11 +1752,11 @@
      task_t *task = static_cast(parsec_task);
 
      void* suspended_task_address =
-#ifdef TTG_HAS_COROUTINE
+#ifdef TTG_HAVE_COROUTINE
        task->suspended_task_address;  // non-null = need to resume the task
-#else
+#else  // TTG_HAVE_COROUTINE
        nullptr;
-#endif
+#endif  // TTG_HAVE_COROUTINE
      if (suspended_task_address == nullptr) {  // task is a coroutine that has not started or an ordinary function
        ttT *baseobj = (ttT *)task->object_ptr;
        derivedT *obj = (derivedT *)task->object_ptr;
@@ -1701,7 +1771,7 @@
        detail::parsec_ttg_caller = NULL;
      } else {
-#ifdef TTG_HAS_COROUTINE
+#ifdef TTG_HAVE_COROUTINE
        auto ret = static_cast(ttg::coroutine_handle::from_address(suspended_task_address));
        assert(ret.ready());
        ret.resume();
@@ -1712,9 +1782,9 @@
        else {  // not yet completed
          // leave suspended_task_address as is
        }
-#else
+#else  // TTG_HAVE_COROUTINE
        ttg::abort();  // should not happen
-#endif
+#endif  // TTG_HAVE_COROUTINE
      }
      task->suspended_task_address = suspended_task_address;
@@ -3352,11 +3422,11 @@
      if (!need_pushout) {
        bool device_supported = false;
        if constexpr (derived_has_cuda_op()) {
-          device_supported = !world.impl().mpi_support(ttg::ExecutionSpace::CUDA);
+          device_supported = world.impl().mpi_support(ttg::ExecutionSpace::CUDA);
        } else if constexpr (derived_has_hip_op()) {
-          device_supported = !world.impl().mpi_support(ttg::ExecutionSpace::HIP);
+          device_supported = world.impl().mpi_support(ttg::ExecutionSpace::HIP);
        } else if constexpr (derived_has_level_zero_op()) {
-          device_supported = !world.impl().mpi_support(ttg::ExecutionSpace::L0);
+          device_supported = world.impl().mpi_support(ttg::ExecutionSpace::L0);
        }
        /* if MPI supports the device we don't care whether we have remote peers
         * because we can send from the device directly */
@@ -3379,7 +3449,6 @@
      auto remote_check = [&](){
        auto world = ttg_default_execution_context();
        int rank = world.rank();
-        uint64_t pos = 0;
        bool remote = keylist.end() != std::find_if(keylist.begin(), keylist.end(), [&](const Key &key) { return keymap(key) != rank; });
        return remote;
@@ -3628,6 +3697,7 @@
      task_t *task = (task_t*)parsec_task;
 
+#ifdef TTG_HAVE_COROUTINE
      /* if we still have a coroutine handle we invoke it one more time to get the sends/broadcasts */
      if (task->suspended_task_address) {
        assert(task->coroutine_id != ttg::TaskCoroutineID::Invalid);
@@ -3658,6 +3728,7 @@
        /* the coroutine should have completed and we cannot access the promise anymore */
        task->suspended_task_address = nullptr;
      }
+#endif  // TTG_HAVE_COROUTINE
 
      /* release our data copies */
      for (int i = 0; i < task->data_count; i++) {
@@ -3739,29 +3810,31 @@
      if constexpr (derived_has_cuda_op()) {
        self.incarnations = (__parsec_chore_t *)malloc(3 * sizeof(__parsec_chore_t));
        ((__parsec_chore_t *)self.incarnations)[0].type = PARSEC_DEV_CUDA;
-        ((__parsec_chore_t *)self.incarnations)[0].evaluate = NULL;
+        ((__parsec_chore_t *)self.incarnations)[0].evaluate = &detail::evaluate_cuda;
        ((__parsec_chore_t *)self.incarnations)[0].hook = &detail::hook_cuda;
        ((__parsec_chore_t *)self.incarnations)[1].type = PARSEC_DEV_NONE;
        ((__parsec_chore_t *)self.incarnations)[1].evaluate = NULL;
        ((__parsec_chore_t *)self.incarnations)[1].hook = NULL;
-      } else if (derived_has_hip_op()) {
+      } else if constexpr (derived_has_hip_op()) {
        self.incarnations = (__parsec_chore_t *)malloc(3 * sizeof(__parsec_chore_t));
        ((__parsec_chore_t *)self.incarnations)[0].type = PARSEC_DEV_HIP;
-        ((__parsec_chore_t *)self.incarnations)[0].evaluate = NULL;
+        ((__parsec_chore_t *)self.incarnations)[0].evaluate = &detail::evaluate_hip;
        ((__parsec_chore_t *)self.incarnations)[0].hook = &detail::hook_hip;
        ((__parsec_chore_t *)self.incarnations)[1].type = PARSEC_DEV_NONE;
        ((__parsec_chore_t *)self.incarnations)[1].evaluate = NULL;
        ((__parsec_chore_t *)self.incarnations)[1].hook = NULL;
-      } else if (derived_has_level_zero_op()) {
+#if defined(PARSEC_HAVE_DEV_LEVEL_ZERO_SUPPORT)
+      } else if constexpr (derived_has_level_zero_op()) {
        self.incarnations = (__parsec_chore_t *)malloc(3 * sizeof(__parsec_chore_t));
        ((__parsec_chore_t *)self.incarnations)[0].type = PARSEC_DEV_LEVEL_ZERO;
-        ((__parsec_chore_t *)self.incarnations)[0].evaluate = NULL;
+        ((__parsec_chore_t *)self.incarnations)[0].evaluate = &detail::evaluate_level_zero;
        ((__parsec_chore_t *)self.incarnations)[0].hook = &detail::hook_level_zero;
        ((__parsec_chore_t *)self.incarnations)[1].type = PARSEC_DEV_NONE;
        ((__parsec_chore_t *)self.incarnations)[1].evaluate = NULL;
        ((__parsec_chore_t *)self.incarnations)[1].hook = NULL;
+#endif // PARSEC_HAVE_DEV_LEVEL_ZERO_SUPPORT
      } else {
        self.incarnations = (__parsec_chore_t *)malloc(2 * sizeof(__parsec_chore_t));
        ((__parsec_chore_t *)self.incarnations)[0].type = PARSEC_DEV_CPU;
@@ -4156,6 +4229,37 @@
      priomap = std::forward(pm);
    }
 
+    /// device map setter
+    /// The device map provides a hint on which device a task should execute.
+    /// TTG may not be able to honor the request and the corresponding task
+    /// may execute on a different device.
+    /// @arg pm a function that provides a hint on which device the task should execute.
+    template
+    void set_devicemap(Devicemap&& dm) {
+      static_assert(derived_has_device_op(), "Device map only allowed on device-enabled TT!");
+      if constexpr (std::is_same_v()))>) {
+        // dm returns a Device
+        devicemap = std::forward(dm);
+      } else {
+        // convert dm return into a Device
+        devicemap = [=](const keyT& key) {
+          if constexpr (derived_has_cuda_op()) {
+            return ttg::device::Device(dm(key), ttg::ExecutionSpace::CUDA);
+          } else if constexpr (derived_has_hip_op()) {
+            return ttg::device::Device(dm(key), ttg::ExecutionSpace::HIP);
+          } else if constexpr (derived_has_level_zero_op()) {
+            return ttg::device::Device(dm(key), ttg::ExecutionSpace::L0);
+          } else {
+            throw std::runtime_error("Unknown device type!");
+          }
+        };
+      }
+    }
+
+    /// device map accessor
+    /// @return the device map
+    auto get_devicemap() { return devicemap; }
+
    // Register the static_op function to associate it to instance_id
    void register_static_op_function(void) {
      int rank;
@@ -4266,8 +4370,6 @@ struct ttg::detail::value_copy_handler {
      /* this copy won't be modified anymore so mark it as read-only */
      copy->reset_readers();
    }
-    /* the value was potentially changed, so increment version */
-    copy->inc_current_version();
  }
  /* We're coming from a writer so mark the data as modified.
   * That way we can force a pushout in prepare_send if we move to read-only tasks (needed by PaRSEC).
   */
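For reference, the set_devicemap() hint added above is what the potrf example earlier in this patch calls under ENABLE_DEVICE_KERNEL. The fragment below restates that usage in isolation; it is not a complete program (tt_trsm, Key2 and A come from examples/potrf), and an integer return value is converted to a ttg::device::Device in the TT's execution space as implemented in the setter.

// Hint (not a guarantee) of the device each TRSM task should run on,
// mirroring examples/potrf/potrf.h above.
auto devmap2a = [&](const Key2& key) {
  return (key[0] / A.P()) % ttg::device::num_devices();
};
#ifdef ENABLE_DEVICE_KERNEL
tt_trsm->set_devicemap(devmap2a);  // the runtime may still place the task elsewhere
#endif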
diff --git a/ttg/ttg/tt.h b/ttg/ttg/tt.h
index 7024776aa..6b3972984 100644
--- a/ttg/ttg/tt.h
+++ b/ttg/ttg/tt.h
@@ -1,12 +1,13 @@
 #ifndef TTG_TT_H
 #define TTG_TT_H
 
+#include "ttg/config.h"
 #include "ttg/fwd.h"
 
 #include "ttg/base/tt.h"
 #include "ttg/edge.h"
 
-#ifdef TTG_HAS_COROUTINE
+#ifdef TTG_HAVE_COROUTINE
 #include "ttg/coroutine.h"
 #endif
 
@@ -96,9 +97,9 @@ namespace ttg {
     ttg::World get_world() const override final { return tts[0]->get_world(); }
 
-    void fence() { tts[0]->fence(); }
+    void fence() override { tts[0]->fence(); }
 
-    void make_executable() {
+    void make_executable() override {
       for (auto &op : tts) op->make_executable();
     }
 
@@ -176,7 +177,7 @@
 }  // namespace ttg
 
 #ifndef TTG_PROCESS_TT_OP_RETURN
-#ifdef TTG_HAS_COROUTINE
+#ifdef TTG_HAVE_COROUTINE
 #define TTG_PROCESS_TT_OP_RETURN(result, id, invoke)                            \
   {                                                                             \
     using return_type = decltype(invoke);                                       \
@@ -196,7 +197,7 @@
     }                                                                           \
   }
 #else
-#define TTG_PROCESS_TT_OP_RETURN(result, invoke) invoke
+#define TTG_PROCESS_TT_OP_RETURN(result, id, invoke) invoke
 #endif
 #else
 #error "TTG_PROCESS_TT_OP_RETURN already defined in ttg/tt.h, check your header guards"
diff --git a/ttg/ttg/util/meta.h b/ttg/ttg/util/meta.h
index b7bb31690..f3af03152 100644
--- a/ttg/ttg/util/meta.h
+++ b/ttg/ttg/util/meta.h
@@ -848,18 +848,18 @@ namespace ttg {
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // keymap_t = std::function, protected against void key
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    template
+    template
    struct keymap;
-    template
-    struct keymap>> {
-      using type = std::function;
+    template
+    struct keymap>> {
+      using type = std::function;
    };
-    template
-    struct keymap>> {
-      using type = std::function;
+    template
+    struct keymap>> {
+      using type = std::function;
    };
-    template
-    using keymap_t = typename keymap::type;
+    template
+    using keymap_t = typename keymap::type;
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // input_reducers_t = std::tuple<