diff --git a/CMakeLists.txt b/CMakeLists.txt
index 28490557f..5406877c8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -59,6 +59,7 @@ option(TTG_ENABLE_LEVEL_ZERO "Whether to TTG will look for Intel oneAPI Level Ze
 option(TTG_EXAMPLES "Whether to build examples" OFF)
 option(TTG_ENABLE_ASAN "Whether to enable address sanitizer" OFF)
+option(TTG_ENABLE_COROUTINES "Whether to enable C++ coroutines, needed for accelerator device support" ON)
 option(TTG_FETCH_BOOST "Whether to fetch+build Boost, if missing" OFF)
 option(TTG_IGNORE_BUNDLED_EXTERNALS "Whether to skip installation and use of bundled external dependencies (Boost.CallableTraits)" OFF)
 option(TTG_ENABLE_TRACE "Whether to enable ttg::trace() output" OFF)
@@ -94,8 +95,26 @@ endif (BUILD_TESTING)
 ###########################
 # Boost
 include("${PROJECT_SOURCE_DIR}/cmake/modules/FindOrFetchBoost.cmake")
-# C++ coroutines
-find_package(CXXStdCoroutine MODULE REQUIRED COMPONENTS Final Experimental)
+
+if (TTG_ENABLE_COROUTINES)
+  set(SKIP_COROUTINE_DETECTION FALSE)
+  # C++ coroutines, check for broken GCC releases and skip if one is found
+  if (${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU")
+    if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 11.4.0)
+      set(SKIP_COROUTINE_DETECTION TRUE)
+    elseif(${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.1.0 AND ${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 12.3.0)
+      set(SKIP_COROUTINE_DETECTION TRUE)
+    endif()
+    if (SKIP_COROUTINE_DETECTION)
+      message(WARNING "GCC with broken Coroutine support detected, disabling Coroutine support. At least GCC 11.4, 12.3, or 13.1 required.")
+    endif(SKIP_COROUTINE_DETECTION)
+  endif(${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU")
+
+  if (NOT SKIP_COROUTINE_DETECTION)
+    find_package(CXXStdCoroutine MODULE REQUIRED COMPONENTS Final Experimental)
+    set(TTG_HAVE_COROUTINE CXXStdCoroutine_FOUND CACHE BOOL "True if the compiler has coroutine support")
+  endif(NOT SKIP_COROUTINE_DETECTION)
+endif(TTG_ENABLE_COROUTINES)
 
 ##########################
diff --git a/cmake/modules/ExternalDependenciesVersions.cmake b/cmake/modules/ExternalDependenciesVersions.cmake
index fd96e5816..43042cacd 100644
--- a/cmake/modules/ExternalDependenciesVersions.cmake
+++ b/cmake/modules/ExternalDependenciesVersions.cmake
@@ -4,7 +4,7 @@ set(TTG_TRACKED_VG_CMAKE_KIT_TAG 7ea2d4d3f8854b9e417f297fd74d6fc49aa13fd5) # used to provide "real" FindOrFetchBoost
 set(TTG_TRACKED_CATCH2_VERSION 3.5.0)
 set(TTG_TRACKED_MADNESS_TAG 2eb3bcf0138127ee2dbc651f1aabd3e9b0def4e3)
-set(TTG_TRACKED_PARSEC_TAG 0b3140f58ad9dc78a3d64da9fd73ecc7f443ece7)
+set(TTG_TRACKED_PARSEC_TAG 58f8f3089ecad2e8ee50e80a9586e05ce8873b1c)
 set(TTG_TRACKED_BTAS_TAG 4e8f5233aa7881dccdfcc37ce07128833926d3c2)
 set(TTG_TRACKED_TILEDARRAY_TAG 493c109379a1b64ddd5ef59f7e33b95633b68d73)
diff --git a/cmake/modules/FindOrFetchPARSEC.cmake b/cmake/modules/FindOrFetchPARSEC.cmake
index 7b164019f..b3fd5faa3 100644
--- a/cmake/modules/FindOrFetchPARSEC.cmake
+++ b/cmake/modules/FindOrFetchPARSEC.cmake
@@ -17,7 +17,7 @@ if (NOT TARGET PaRSEC::parsec)
   FetchContent_Declare(
           PARSEC
-          GIT_REPOSITORY https://github.com/devreal/parsec-1.git
+          GIT_REPOSITORY https://github.com/ICLDisco/parsec.git
           GIT_TAG ${TTG_TRACKED_PARSEC_TAG}
   )
   FetchContent_MakeAvailable(PARSEC)
diff --git a/examples/potrf/pmw.h b/examples/potrf/pmw.h
index 0f8d75d7d..25f01933c 100644
--- a/examples/potrf/pmw.h
+++ b/examples/potrf/pmw.h
@@ -102,6 +102,14 @@ class PaRSECMatrixWrapper {
            (pm->uplo == PARSEC_MATRIX_UPPER && col >= row);
   }
 
+  int P() const {
+    return pm->grid.rows;
+  }
+
+  int Q() const {
+    return pm->grid.cols;
+  }
+
   PaRSECMatrixT* parsec() {
     return pm;
   }
@@ -132,7 +140,7 @@ class PaRSECMatrixWrapper {
 };
 
 template
-using MatrixT = PaRSECMatrixWrapper;
+using MatrixT = PaRSECMatrixWrapper;
 
 static auto make_load_tt(MatrixT &A, ttg::Edge> &toop, bool defer_write) {
diff --git a/examples/potrf/potrf.h b/examples/potrf/potrf.h
index f6ba6e147..60daef72f 100644
--- a/examples/potrf/potrf.h
+++ b/examples/potrf/potrf.h
@@ -674,10 +674,22 @@ namespace potrf {
     auto keymap1 = [&](const Key1& key) { return A.rank_of(key[0], key[0]); };
 
     auto keymap2a = [&](const Key2& key) { return A.rank_of(key[0], key[1]); };
-    auto keymap2b = [&](const Key2& key) { return A.rank_of(key[0], key[0]); };
+    auto keymap2b = [&](const Key2& key) { return A.rank_of(key[1], key[1]); };
 
     auto keymap3 = [&](const Key3& key) { return A.rank_of(key[0], key[1]); };
 
+    /**
+     * Device map hints: we try to keep tiles on one row on the same device to minimize
+     * data movement between devices. This provides hints for load-balancing up front
+     * and avoids movement of the TRSM result to GEMM tasks.
+     */
+    auto devmap1 = [&](const Key1& key) { return (key[0] / A.P()) % ttg::device::num_devices(); };
+
+    auto devmap2a = [&](const Key2& key) { return (key[0] / A.P()) % ttg::device::num_devices(); };
+    auto devmap2b = [&](const Key2& key) { return (key[1] / A.P()) % ttg::device::num_devices(); };
+
+    auto devmap3 = [&](const Key3& key) { return (key[0] / A.P()) % ttg::device::num_devices(); };
+
     ttg::Edge> syrk_potrf("syrk_potrf"), disp_potrf("disp_potrf");
 
     ttg::Edge> potrf_trsm("potrf_trsm"), trsm_syrk("trsm_syrk"), gemm_trsm("gemm_trsm"),
@@ -692,18 +704,30 @@ namespace potrf {
     auto tt_potrf = make_potrf(A, disp_potrf, syrk_potrf, potrf_trsm, output);
     tt_potrf->set_keymap(keymap1);
     tt_potrf->set_defer_writer(defer_write);
+#ifdef ENABLE_DEVICE_KERNEL
+    tt_potrf->set_devicemap(devmap1);
+#endif // 0
 
     auto tt_trsm = make_trsm(A, disp_trsm, potrf_trsm, gemm_trsm, trsm_syrk, trsm_gemm_row, trsm_gemm_col, output);
     tt_trsm->set_keymap(keymap2a);
     tt_trsm->set_defer_writer(defer_write);
+#ifdef ENABLE_DEVICE_KERNEL
+    tt_trsm->set_devicemap(devmap2a);
+#endif // 0
 
     auto tt_syrk = make_syrk(A, disp_syrk, trsm_syrk, syrk_syrk, syrk_potrf, syrk_syrk);
     tt_syrk->set_keymap(keymap2b);
     tt_syrk->set_defer_writer(defer_write);
+#ifdef ENABLE_DEVICE_KERNEL
+    tt_syrk->set_devicemap(devmap2b);
+#endif // 0
 
     auto tt_gemm = make_gemm(A, disp_gemm, trsm_gemm_row, trsm_gemm_col, gemm_gemm, gemm_trsm, gemm_gemm);
     tt_gemm->set_keymap(keymap3);
     tt_gemm->set_defer_writer(defer_write);
+#ifdef ENABLE_DEVICE_KERNEL
+    tt_gemm->set_devicemap(devmap3);
+#endif // 0
 
     /* Priorities taken from DPLASMA */
     auto nt = A.cols();
diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index 275f3fdd8..7f25dd6fe 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -12,12 +12,14 @@ set(ut_libs Catch2::Catch2)
 # coroutine tests
 # we definitely have TARGET std::coroutine
-list(APPEND ut_src fibonacci-coro.cc)
-list(APPEND ut_src device_coro.cc)
-if (TTG_HAVE_CUDA)
-  list(APPEND ut_src cuda_kernel.cu)
-endif(TTG_HAVE_CUDA)
-list(APPEND ut_libs std::coroutine)
+if (CXXStdCoroutine_FOUND)
+  list(APPEND ut_src fibonacci-coro.cc)
+  list(APPEND ut_src device_coro.cc)
+  if (TTG_HAVE_CUDA)
+    list(APPEND ut_src cuda_kernel.cu)
+  endif(TTG_HAVE_CUDA)
+  list(APPEND ut_libs std::coroutine)
+endif(CXXStdCoroutine_FOUND)
 
 add_ttg_executable(core-unittests-ttg "${ut_src}"
                    LINK_LIBRARIES "${ut_libs}"
                    COMPILE_DEFINITIONS "CATCH_CONFIG_NO_POSIX_SIGNALS=1" )
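The device maps added to examples/potrf/potrf.h above compute their hint as (tile row / P) % num_devices: the tile rows owned by one rank sit P apart in the 2D block-cyclic layout, so dividing by P enumerates them densely and the modulo folds them round-robin onto the rank-local devices, keeping a whole tile row on one device. The following is a minimal standalone sketch of just that arithmetic; the P = 2 and two-device values are made-up illustration inputs, not taken from the patch.

#include <cstdio>

// Round-robin row-to-device hint, as used by devmap1/devmap2a/devmap3 above:
// tile rows owned by one rank are P apart, so row / P enumerates them densely
// and % num_devices spreads them over that rank's local devices.
static int devmap_hint(int row, int P, int num_devices) {
  return (row / P) % num_devices;
}

int main() {
  const int P = 2;            // assumed process-grid rows (A.P() in the example)
  const int num_devices = 2;  // assumed ttg::device::num_devices()
  for (int row = 0; row < 8; ++row)
    std::printf("tile row %d -> device hint %d\n", row, devmap_hint(row, P, num_devices));
  return 0;
}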
diff --git a/ttg/CMakeLists.txt b/ttg/CMakeLists.txt
index 900df6519..b1fa72947 100644
--- a/ttg/CMakeLists.txt
+++ b/ttg/CMakeLists.txt
@@ -114,7 +114,6 @@ if (TTG_ENABLE_TRACE)
 endif (TTG_ENABLE_TRACE)
 if (TARGET std::coroutine)
   list(APPEND ttg-deps std::coroutine)
-  list(APPEND ttg-defs "TTG_HAS_COROUTINE=1")
   list(APPEND ttg-util-headers
       ${CMAKE_CURRENT_SOURCE_DIR}/ttg/coroutine.h
       )
@@ -208,6 +207,7 @@ endif(TARGET Boost::serialization)
 if (TARGET MADworld)
   set(ttg-mad-headers
       ${CMAKE_CURRENT_SOURCE_DIR}/ttg/madness/buffer.h
+      ${CMAKE_CURRENT_SOURCE_DIR}/ttg/madness/device.h
       ${CMAKE_CURRENT_SOURCE_DIR}/ttg/madness/fwd.h
       ${CMAKE_CURRENT_SOURCE_DIR}/ttg/madness/import.h
       ${CMAKE_CURRENT_SOURCE_DIR}/ttg/madness/ttg.h
diff --git a/ttg/ttg/config.in.h b/ttg/ttg/config.in.h
index 51e58b4a2..106f0c58a 100644
--- a/ttg/ttg/config.in.h
+++ b/ttg/ttg/config.in.h
@@ -11,6 +11,9 @@
 /** the C++ namespace containing the coroutine API */
 #define TTG_CXX_COROUTINE_NAMESPACE @CXX_COROUTINE_NAMESPACE@
 
+/** whether the compiler supports C++ coroutines */
+#cmakedefine TTG_HAVE_COROUTINE
+
 /** whether TTG has CUDA language support */
 #cmakedefine TTG_HAVE_CUDA
diff --git a/ttg/ttg/coroutine.h b/ttg/ttg/coroutine.h
index 81d5b1657..dc1ed8e5b 100644
--- a/ttg/ttg/coroutine.h
+++ b/ttg/ttg/coroutine.h
@@ -6,11 +6,14 @@
 #define TTG_COROUTINE_H
 
 #include "ttg/config.h"
+
+#ifdef TTG_HAVE_COROUTINE
 #include TTG_CXX_COROUTINE_HEADER
 
 #include
 #include
+
 namespace ttg {
 
   // import std coroutine API into ttg namespace
@@ -227,4 +230,6 @@ namespace ttg {
 
 }  // namespace ttg
 
+#endif  // TTG_HAVE_COROUTINE
+
 #endif  // TTG_COROUTINE_H
diff --git a/ttg/ttg/device/device.h b/ttg/ttg/device/device.h
index 6690982f6..244e9c944 100644
--- a/ttg/ttg/device/device.h
+++ b/ttg/ttg/device/device.h
@@ -2,6 +2,8 @@
 
 #include "ttg/config.h"
 #include "ttg/execution.h"
+#include "ttg/impl_selector.h"
+#include "ttg/fwd.h"
 
@@ -180,3 +182,9 @@ namespace ttg::device {
   }
 }  // namespace ttg
 #endif  // defined(TTG_HAVE_HIP)
+
+namespace ttg::device {
+  inline int num_devices() {
+    return TTG_IMPL_NS::num_devices();
+  }
+}
diff --git a/ttg/ttg/device/task.h b/ttg/ttg/device/task.h
index d95e0d1eb..8e2d14cfc 100644
--- a/ttg/ttg/device/task.h
+++ b/ttg/ttg/device/task.h
@@ -9,6 +9,8 @@
 #include "ttg/impl_selector.h"
 #include "ttg/ptr.h"
 
+#ifdef TTG_HAVE_COROUTINE
+
 namespace ttg::device {
 
   namespace detail {
@@ -632,6 +634,8 @@ namespace ttg::device {
   bool device_reducer::completed() { return base_type::promise().state() == ttg::device::detail::TTG_DEVICE_CORO_COMPLETE; }
 #endif // 0
 
-}  // namespace ttg::devie
+}  // namespace ttg::device
+
+#endif  // TTG_HAVE_COROUTINE
 
 #endif  // TTG_DEVICE_TASK_H
diff --git a/ttg/ttg/madness/device.h b/ttg/ttg/madness/device.h
new file mode 100644
index 000000000..e13321194
--- /dev/null
+++ b/ttg/ttg/madness/device.h
@@ -0,0 +1,9 @@
+#ifndef TTG_MADNESS_DEVICE_H
+#define TTG_MADNESS_DEVICE_H
+
+namespace ttg_madness {
+  /* no device support in MADNESS */
+  inline int num_devices() { return 0; }
+}
+
+#endif // TTG_MADNESS_DEVICE_H
\ No newline at end of file
diff --git a/ttg/ttg/madness/fwd.h b/ttg/ttg/madness/fwd.h
index 0d340db0b..6bee1a832 100644
--- a/ttg/ttg/madness/fwd.h
+++ b/ttg/ttg/madness/fwd.h
@@ -77,6 +77,8 @@ namespace ttg_madness {
   template
   inline void mark_device_out(std::tuple &b);
 
+  inline int num_devices();
+
 }  // namespace ttg_madness
 
 #endif  // TTG_MADNESS_FWD_H
diff --git a/ttg/ttg/madness/ttg.h b/ttg/ttg/madness/ttg.h
index 5fab6f6dd..5d2360cfb 100644
--- a/ttg/ttg/madness/ttg.h
+++ b/ttg/ttg/madness/ttg.h
@@ -13,6 +13,7 @@
 #include "ttg/base/keymap.h"
 #include "ttg/base/tt.h"
 #include "ttg/func.h"
+#include "ttg/madness/device.h"
 #include "ttg/runtimes.h"
 #include "ttg/tt.h"
 #include "ttg/util/bug.h"
@@ -23,9 +24,7 @@
 #include "ttg/util/meta/callable.h"
 #include "ttg/util/void.h"
 #include "ttg/world.h"
-#ifdef TTG_HAS_COROUTINE
 #include "ttg/coroutine.h"
-#endif
 
 #include
 #include
@@ -302,10 +301,10 @@ namespace ttg_madness {
       derivedT *derived;  // Pointer to derived class instance
       bool pull_terminals_invoked = false;
       std::conditional_t, ttg::Void, keyT> key;  // Task key
-#ifdef TTG_HAS_COROUTINE
+#ifdef TTG_HAVE_COROUTINE
       void *suspended_task_address = nullptr;  // if not null the function is suspended
       ttg::TaskCoroutineID coroutine_id = ttg::TaskCoroutineID::Invalid;
-#endif
+#endif  // TTG_HAVE_COROUTINE
 
       /// makes a tuple of references out of tuple of
       template
@@ -335,11 +334,11 @@ namespace ttg_madness {
         ttT::threaddata.call_depth++;
 
         void *suspended_task_address =
-#ifdef TTG_HAS_COROUTINE
+#ifdef TTG_HAVE_COROUTINE
             this->suspended_task_address;  // non-null = need to resume the task
-#else
+#else  // TTG_HAVE_COROUTINE
             nullptr;
-#endif
+#endif  // TTG_HAVE_COROUTINE
         if (suspended_task_address == nullptr) {  // task is a coroutine that has not started or an ordinary function
           // ttg::print("starting task");
           if constexpr (!ttg::meta::is_void_v && !ttg::meta::is_empty_tuple_v) {
@@ -361,7 +360,7 @@ namespace ttg_madness {
           } else  // unreachable
             ttg::abort();
         } else {  // resume suspended coroutine
-#ifdef TTG_HAS_COROUTINE
+#ifdef TTG_HAVE_COROUTINE
           auto ret = static_cast(ttg::coroutine_handle::from_address(suspended_task_address));
           assert(ret.ready());
           ret.resume();
@@ -372,9 +371,9 @@ namespace ttg_madness {
             // leave suspended_task_address as is
           }
           this->suspended_task_address = suspended_task_address;
-#else
+#else  // TTG_HAVE_COROUTINE
           ttg::abort();  // should not happen
-#endif
+#endif  // TTG_HAVE_COROUTINE
         }
         ttT::threaddata.call_depth--;
 
@@ -383,7 +382,7 @@ namespace ttg_madness {
         //   ttg::print("finishing task",ttT::threaddata.call_depth);
         // }
 
-#ifdef TTG_HAS_COROUTINE
+#ifdef TTG_HAVE_COROUTINE
         if (suspended_task_address) {
           // TODO implement handling of suspended coroutines properly
@@ -411,7 +410,7 @@ namespace ttg_madness {
             ttg::abort();
           }
         }
-#endif  // TTG_HAS_COROUTINE
+#endif  // TTG_HAVE_COROUTINE
       }
 
       virtual ~TTArgs() {}  // Will be deleted via TaskInterface*
diff --git a/ttg/ttg/make_tt.h b/ttg/ttg/make_tt.h
index 1711d8556..81897b816 100644
--- a/ttg/ttg/make_tt.h
+++ b/ttg/ttg/make_tt.h
@@ -149,7 +149,7 @@ class CallableWrapTTArgs
   std::conditional_t, std::add_pointer_t, noref_funcT> func;
 
   using op_return_type =
-#ifdef TTG_HAS_COROUTINE
+#ifdef TTG_HAVE_COROUTINE
       std::conditional_t,
                          ttg::coroutine_handle,
 #ifdef TTG_HAVE_DEVICE
@@ -160,9 +160,9 @@ class CallableWrapTTArgs
                          void
 #endif  // TTG_HAVE_DEVICE
                          >;
-#else  // TTG_HAS_COROUTINE
+#else  // TTG_HAVE_COROUTINE
       void;
-#endif  // TTG_HAS_COROUTINE
+#endif  // TTG_HAVE_COROUTINE
 
  public:
   static constexpr bool have_cuda_op = (space == ttg::ExecutionSpace::CUDA);
@@ -176,7 +176,7 @@ class CallableWrapTTArgs
     static_assert(std::is_same_v, returnT>,
                   "CallableWrapTTArgs: returnT does not match the actual return type of funcT");
     if constexpr (!std::is_void_v) {  // protect from compiling for void returnT
-#ifdef TTG_HAS_COROUTINE
+#ifdef TTG_HAVE_COROUTINE
       if constexpr (std::is_same_v) {
         ttg::coroutine_handle coro_handle;
         // if task completed destroy it
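The TTG_HAVE_COROUTINE code paths above park a suspended task as a raw void* (suspended_task_address) and later rebuild the handle with from_address() before resuming it. Below is a minimal, self-contained sketch of that standard-library pattern only; the resumable/work types are invented for illustration and are not TTG's task or promise types.

#include <coroutine>
#include <cstdio>

// A trivial coroutine whose handle can be parked as a type-erased void*.
struct resumable {
  struct promise_type {
    resumable get_return_object() {
      return {std::coroutine_handle<promise_type>::from_promise(*this)};
    }
    std::suspend_always initial_suspend() { return {}; }
    std::suspend_always final_suspend() noexcept { return {}; }
    void return_void() {}
    void unhandled_exception() {}
  };
  std::coroutine_handle<promise_type> handle;
};

resumable work() {
  std::puts("first resume");
  co_await std::suspend_always{};  // suspend; the caller stashes the address
  std::puts("second resume");
}

int main() {
  void* suspended_task_address = work().handle.address();  // park as void*
  auto h = std::coroutine_handle<>::from_address(suspended_task_address);
  h.resume();              // runs until the co_await
  h.resume();              // finishes the coroutine body
  if (h.done()) h.destroy();
  return 0;
}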
diff --git a/ttg/ttg/parsec/buffer.h b/ttg/ttg/parsec/buffer.h
index 98b14eb12..6e609a158 100644
--- a/ttg/ttg/parsec/buffer.h
+++ b/ttg/ttg/parsec/buffer.h
@@ -329,6 +329,14 @@ struct Buffer : public detail::ttg_parsec_data_wrapper_t
     //          << " parsec_data " << m_data.get() << std::endl;
   }
 
+  void prefer_device(ttg::device::Device dev) {
+    /* only set device if the host has the latest copy as otherwise we might end up with a stale copy */
+    if (dev.is_device() && this->parsec_data()->owner_device == 0) {
+      parsec_advise_data_on_device(this->parsec_data(), detail::ttg_device_to_parsec_device(dev),
+                                   PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE);
+    }
+  }
+
   /* serialization support */
 #ifdef TTG_SERIALIZATION_SUPPORTS_BOOST
diff --git a/ttg/ttg/parsec/device.h b/ttg/ttg/parsec/device.h
index 77722b1c1..9f8ada05c 100644
--- a/ttg/ttg/parsec/device.h
+++ b/ttg/ttg/parsec/device.h
@@ -2,6 +2,7 @@
 #define TTG_PARSEC_DEVICE_H
 
 #include "ttg/device/device.h"
+#include
 
 namespace ttg_parsec {
 
@@ -35,6 +36,12 @@ namespace ttg_parsec {
     }
 
   } // namespace detail
+
+  inline
+  int num_devices() {
+    return parsec_nb_devices - detail::first_device_id;
+  }
+
 } // namespace ttg_parsec
 
 #endif // TTG_PARSEC_DEVICE_H
\ No newline at end of file
diff --git a/ttg/ttg/parsec/fwd.h b/ttg/ttg/parsec/fwd.h
index d5bc8931e..0cd798e87 100644
--- a/ttg/ttg/parsec/fwd.h
+++ b/ttg/ttg/parsec/fwd.h
@@ -82,6 +82,8 @@ namespace ttg_parsec {
   template
   inline void mark_device_out(std::tuple &b);
 
+  inline int num_devices();
+
 #if 0
   template
   inline std::pair>...>> get_ptr(Args&&... args);
diff --git a/ttg/ttg/parsec/task.h b/ttg/ttg/parsec/task.h
index 5b23d53af..656117b94 100644
--- a/ttg/ttg/parsec/task.h
+++ b/ttg/ttg/parsec/task.h
@@ -15,6 +15,7 @@ namespace ttg_parsec {
       parsec_flow_t* flows = nullptr;
       parsec_gpu_exec_stream_t* stream = nullptr;
       parsec_device_gpu_module_t* device = nullptr;
+      parsec_task_class_t task_class; // copy of the taskclass
     };
 
     template
@@ -57,9 +58,10 @@ namespace ttg_parsec {
    }
 
    inline
-   ttg_parsec_data_flags operator|=(ttg_parsec_data_flags lhs, ttg_parsec_data_flags rhs) {
+   ttg_parsec_data_flags operator|=(ttg_parsec_data_flags& lhs, ttg_parsec_data_flags rhs) {
     using flags_type = std::underlying_type::type;
-    return ttg_parsec_data_flags(static_cast(lhs) | static_cast(rhs));
+    lhs = ttg_parsec_data_flags(static_cast(lhs) | static_cast(rhs));
+    return lhs;
    }
 
    inline
@@ -68,6 +70,13 @@ namespace ttg_parsec {
     return static_cast(lhs) & static_cast(rhs);
    }
 
+   inline
+   ttg_parsec_data_flags operator&=(ttg_parsec_data_flags& lhs, ttg_parsec_data_flags rhs) {
+    using flags_type = std::underlying_type::type;
+    lhs = ttg_parsec_data_flags(static_cast(lhs) & static_cast(rhs));
+    return lhs;
+   }
+
    inline
    bool operator!(ttg_parsec_data_flags lhs) {
     using flags_type = std::underlying_type::type;
@@ -146,10 +155,14 @@ namespace ttg_parsec {
       : data_count(data_count)
       , copies(copies)
       , defer_writer(defer_writer) {
+        PARSEC_OBJ_CONSTRUCT(&parsec_task, parsec_task_t);
        PARSEC_LIST_ITEM_SINGLETON(&parsec_task.super);
        parsec_task.mempool_owner = mempool;
        parsec_task.task_class = task_class;
        parsec_task.priority = 0;
+
+        // TODO: can we avoid this?
+        for (int i = 0; i < MAX_PARAM_COUNT; ++i) { this->parsec_task.data[i].data_in = nullptr; }
      }
 
      parsec_ttg_task_base_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class,
@@ -161,6 +174,7 @@ namespace ttg_parsec {
       , copies(copies)
       , release_task_cb(release_fn)
       , defer_writer(defer_writer) {
+        PARSEC_OBJ_CONSTRUCT(&parsec_task, parsec_task_t);
        PARSEC_LIST_ITEM_SINGLETON(&parsec_task.super);
        parsec_task.mempool_owner = mempool;
        parsec_task.task_class = task_class;
@@ -168,6 +182,9 @@ namespace ttg_parsec {
        parsec_task.taskpool = taskpool;
        parsec_task.priority = priority;
        parsec_task.chore_mask = 1<<0;
+
+        // TODO: can we avoid this?
+        for (int i = 0; i < MAX_PARAM_COUNT; ++i) { this->parsec_task.data[i].data_in = nullptr; }
      }
 
    public:
@@ -185,7 +202,7 @@ namespace ttg_parsec {
      TT* tt = nullptr;
      key_type key;
      std::array streams;
-#ifdef TTG_HAS_COROUTINE
+#ifdef TTG_HAVE_COROUTINE
      void* suspended_task_address = nullptr;  // if not null the function is suspended
      ttg::TaskCoroutineID coroutine_id = ttg::TaskCoroutineID::Invalid;
 #endif
@@ -234,6 +251,15 @@ namespace ttg_parsec {
        }
      }
 
+      template
+      parsec_hook_return_t invoke_evaluate() {
+        if constexpr (Space == ttg::ExecutionSpace::Host) {
+          return PARSEC_HOOK_RETURN_DONE;
+        } else {
+          return TT::template device_static_evaluate(&this->parsec_task);
+        }
+      }
+
      parsec_key_t pkey() { return reinterpret_cast(&key); }
    };
@@ -242,7 +268,7 @@ namespace ttg_parsec {
      static constexpr size_t num_streams = TT::numins;
      TT* tt = nullptr;
      std::array streams;
-#ifdef TTG_HAS_COROUTINE
+#ifdef TTG_HAVE_COROUTINE
      void* suspended_task_address = nullptr;  // if not null the function is suspended
      ttg::TaskCoroutineID coroutine_id = ttg::TaskCoroutineID::Invalid;
 #endif
diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h
index f4653f962..73672d285 100644
--- a/ttg/ttg/parsec/ttg.h
+++ b/ttg/ttg/parsec/ttg.h
@@ -12,6 +12,8 @@
  * This may cause deadlocks, so use with caution. */
 #define TTG_PARSEC_DEFER_WRITER false
 
+#include "ttg/config.h"
+
 #include "ttg/impl_selector.h"
 
 /* include ttg header to make symbols available in case this header is included directly */
@@ -740,6 +742,7 @@ namespace ttg_parsec {
    inline void transfer_ownership_impl(ttg_data_copy_t *copy, int device) {
      if constexpr(!std::is_const_v>) {
        copy->transfer_ownership(PARSEC_FLOW_ACCESS_RW, device);
+        copy->inc_current_version();
      }
    }
@@ -765,7 +768,8 @@ namespace ttg_parsec {
        parsec_ttg_task_t *me = (parsec_ttg_task_t *)parsec_task;
        return me->template invoke_op();
      } else {
-        throw std::runtime_error("PaRSEC CUDA hook invoked on a TT that does not support CUDA operations!");
+        std::cerr << "CUDA hook called without having a CUDA op!" << std::endl;
+        return PARSEC_HOOK_RETURN_ERROR;
      }
    }
@@ -775,7 +779,8 @@ namespace ttg_parsec {
        parsec_ttg_task_t *me = (parsec_ttg_task_t *)parsec_task;
        return me->template invoke_op();
      } else {
-        throw std::runtime_error("PaRSEC HIP hook invoked on a TT that does not support HIP operations!");
+        std::cerr << "HIP hook called without having a HIP op!" << std::endl;
+        return PARSEC_HOOK_RETURN_ERROR;
      }
    }
@@ -785,10 +790,43 @@ namespace ttg_parsec {
        parsec_ttg_task_t *me = (parsec_ttg_task_t *)parsec_task;
        return me->template invoke_op();
      } else {
-        throw std::runtime_error("PaRSEC HIP hook invoked on a TT that does not support HIP operations!");
+        std::cerr << "L0 hook called without having a L0 op!" << std::endl;
+        return PARSEC_HOOK_RETURN_ERROR;
+      }
+    }
+
+
+    template
+    inline parsec_hook_return_t evaluate_cuda(const parsec_task_t *parsec_task) {
+      if constexpr(TT::derived_has_cuda_op()) {
+        parsec_ttg_task_t *me = (parsec_ttg_task_t *)parsec_task;
+        return me->template invoke_evaluate();
+      } else {
+        return PARSEC_HOOK_RETURN_NEXT;
+      }
+    }
+
+    template
+    inline parsec_hook_return_t evaluate_hip(const parsec_task_t *parsec_task) {
+      if constexpr(TT::derived_has_hip_op()) {
+        parsec_ttg_task_t *me = (parsec_ttg_task_t *)parsec_task;
+        return me->template invoke_evaluate();
+      } else {
+        return PARSEC_HOOK_RETURN_NEXT;
+      }
+    }
+
+    template
+    inline parsec_hook_return_t evaluate_level_zero(const parsec_task_t *parsec_task) {
+      if constexpr(TT::derived_has_level_zero_op()) {
+        parsec_ttg_task_t *me = (parsec_ttg_task_t *)parsec_task;
+        return me->template invoke_evaluate();
+      } else {
+        return PARSEC_HOOK_RETURN_NEXT;
      }
    }
+
    template
    class rma_delayed_activate {
      std::vector _keylist;
@@ -1263,6 +1301,7 @@ namespace ttg_parsec {
    ttg::World world;
    ttg::meta::detail::keymap_t keymap;
    ttg::meta::detail::keymap_t priomap;
+    ttg::meta::detail::keymap_t devicemap;
    // For now use same type for unary/streaming input terminals, and stream reducers assigned at runtime
    ttg::meta::detail::input_reducers_t input_reducers;  //!< Reducers for the input terminals (empty = expect single value)
@@ -1450,36 +1489,95 @@
    }
 
    template
-    static parsec_hook_return_t device_static_op(parsec_task_t* parsec_task) {
-      static_assert(derived_has_device_op());
-
-      int dev_index;
-      double ratio = 1.0;
+    static parsec_hook_return_t device_static_evaluate(parsec_task_t* parsec_task) {
 
      task_t *task = (task_t*)parsec_task;
-      parsec_execution_stream_s *es = task->tt->world.impl().execution_stream();
+      if (task->dev_ptr->gpu_task == nullptr) {
+
+        /* set up a device task */
+        parsec_gpu_task_t *gpu_task;
+        /* PaRSEC wants to free the gpu_task, because F***K ownerships */
+        gpu_task = static_cast(std::calloc(1, sizeof(*gpu_task)));
+        PARSEC_OBJ_CONSTRUCT(gpu_task, parsec_list_item_t);
+        gpu_task->ec = parsec_task;
+        gpu_task->task_type = 0; // user task
+        gpu_task->last_data_check_epoch = -1; // used internally
+        gpu_task->pushout = 0;
+        gpu_task->submit = &TT::device_static_submit;
+
+        // one way to force the task device
+        // currently this will probably break all of PaRSEC if this hint
+        // does not match where the data is located, not really useful for us
+        // instead we set a hint on the data if there is no hint set yet
+        //parsec_task->selected_device = ...;
+
+        /* set the gpu_task so it's available in register_device_memory */
+        task->dev_ptr->gpu_task = gpu_task;
+
+        /* TODO: is this the right place to set the mask? */
+        task->parsec_task.chore_mask = PARSEC_DEV_ALL;
+
+        /* copy over the task class, because that's what we need */
+        task->dev_ptr->task_class = *task->parsec_task.task_class;
+
+        // first invocation of the coroutine to get the coroutine handle
+        static_op(parsec_task);
+
+        /* when we come back here, the flows in gpu_task are set (see register_device_memory) */
+
+        parsec_task_class_t& tc = task->dev_ptr->task_class;
+
+        // input flows are set up during register_device_memory as part of the first invocation above
+        for (int i = 0; i < MAX_PARAM_COUNT; ++i) {
+          tc.in[i]  = gpu_task->flow[i];
+          tc.out[i] = gpu_task->flow[i];
+        }
+        tc.nb_flows = MAX_PARAM_COUNT;
+
+        /* set the device hint on the data */
+        TT *tt = task->tt;
+        if (tt->devicemap) {
+          int parsec_dev;
+          if constexpr (std::is_void_v) {
+            parsec_dev = detail::ttg_device_to_parsec_device(tt->devicemap());
+          } else {
+            parsec_dev = detail::ttg_device_to_parsec_device(tt->devicemap(task->key));
+          }
+          for (int i = 0; i < MAX_PARAM_COUNT; ++i) {
+            /* only set on mutable data since we have exclusive access */
+            if (tc.in[i]->flow_flags & PARSEC_FLOW_ACCESS_WRITE) {
+              parsec_data_t *data = parsec_task->data[i].data_in->original;
+              /* only set the preferred device if the host has the latest copy
+               * as otherwise we may end up with the wrong data if there is a newer
+               * version on a different device. Also, keep fingers crossed. */
+              if (data->owner_device == 0) {
+                parsec_advise_data_on_device(data, parsec_dev, PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE);
+              }
+            }
+          }
+        }
 
-      //std::cout << "device_static_op: task " << parsec_task << std::endl;
+        /* set the new task class that contains the flows */
+        task->parsec_task.task_class = &task->dev_ptr->task_class;
 
-      /* set up a device task */
-      parsec_gpu_task_t *gpu_task;
-      /* PaRSEC wants to free the gpu_task, because F***K ownerships */
-      gpu_task = static_cast(std::calloc(1, sizeof(*gpu_task)));
-      PARSEC_OBJ_CONSTRUCT(gpu_task, parsec_list_item_t);
-      gpu_task->ec = parsec_task;
-      gpu_task->task_type = 0; // user task
-      gpu_task->load = 1; // TODO: can we do better?
-      gpu_task->last_data_check_epoch = -1; // used internally
-      gpu_task->pushout = 0;
-      gpu_task->submit = &TT::device_static_submit;
+        /* select this one */
+        return PARSEC_HOOK_RETURN_DONE;
+      }
 
-      /* set the gpu_task so it's available in register_device_memory */
-      task->dev_ptr->gpu_task = gpu_task;
+      std::cerr << "EVALUATE called on task with assigned GPU task!" << std::endl;
-      // first invocation of the coroutine to get the coroutine handle
-      static_op(parsec_task);
+      /* not sure if this might happen*/
+      return PARSEC_HOOK_RETURN_ERROR;
 
-      /* when we come back here, the flows in gpu_task are set (see register_device_memory) */
+    }
+
+    template
+    static parsec_hook_return_t device_static_op(parsec_task_t* parsec_task) {
+      static_assert(derived_has_device_op());
+
+      /* when we come in here we have a device assigned and are ready to go */
+
+      task_t *task = (task_t*)parsec_task;
 
      if (nullptr == task->suspended_task_address) {
        /* short-cut in case the task returned immediately */
@@ -1495,40 +1593,12 @@
      /* for now make sure we're waiting for transfers and the coro hasn't skipped this step */
      assert(dev_data.state() == ttg::device::detail::TTG_DEVICE_CORO_WAIT_TRANSFER);
 
-      /* set up a temporary task-class to correctly specify the flows */
-      parsec_task_class_t tc = *task->parsec_task.task_class;
-
-      tc.name = task->parsec_task.task_class->name;
-      // input flows are set up during register_device_memory as part of the first invocation above
-      for (int i = 0; i < MAX_PARAM_COUNT; ++i) {
-        tc.in[i]  = gpu_task->flow[i];
-        tc.out[i] = gpu_task->flow[i];
-      }
-      tc.nb_flows = MAX_PARAM_COUNT;
-
-      /* swap in the new task class */
-      const parsec_task_class_t* tmp = task->parsec_task.task_class;
-      *const_cast(&task->parsec_task.task_class) = &tc;
-
-      /* TODO: is this the right place to set the mask? */
-      task->parsec_task.chore_mask = PARSEC_DEV_ALL;
-      /* get a device and come back if we need another one */
-      int64_t task_load = 1;
-      dev_index = parsec_get_best_device(parsec_task, &task_load);
-
-      /* swap back the original task class */
-      task->parsec_task.task_class = tmp;
-
-      gpu_task->load = task_load;
-      assert(dev_index >= 0);
-      if (!parsec_mca_device_is_gpu(dev_index)) {
-        return PARSEC_HOOK_RETURN_NEXT;  /* Fall back */
-      }
-
-      parsec_device_gpu_module_t *device = (parsec_device_gpu_module_t*)parsec_mca_device_get(dev_index);
+      parsec_device_gpu_module_t *device = (parsec_device_gpu_module_t*)task->parsec_task.selected_device;
      assert(NULL != device);
      task->dev_ptr->device = device;
+      parsec_gpu_task_t *gpu_task = task->dev_ptr->gpu_task;
+      parsec_execution_stream_s *es = task->tt->world.impl().execution_stream();
 
      switch(device->super.type) {
@@ -1575,11 +1645,11 @@
      task_t *task = (task_t*)parsec_task;
 
      void* suspended_task_address =
-#ifdef TTG_HAS_COROUTINE
+#ifdef TTG_HAVE_COROUTINE
        task->suspended_task_address;  // non-null = need to resume the task
-#else
+#else  // TTG_HAVE_COROUTINE
        nullptr;
-#endif
+#endif  // TTG_HAVE_COROUTINE
      //std::cout << "static_op: suspended_task_address " << suspended_task_address << std::endl;
      if (suspended_task_address == nullptr) {  // task is a coroutine that has not started or an ordinary function
@@ -1611,9 +1681,9 @@
      } else {  // resume the suspended coroutine
 
+#ifdef TTG_HAVE_COROUTINE
        assert(task->coroutine_id != ttg::TaskCoroutineID::Invalid);
-#ifdef TTG_HAS_COROUTINE
 #ifdef TTG_HAVE_DEVICE
        if (task->coroutine_id == ttg::TaskCoroutineID::DeviceTask) {
          ttg::device::Task coro = ttg::device::detail::device_task_handle_type::from_address(suspended_task_address);
@@ -1654,14 +1724,14 @@
          }
          task->tt->set_outputs_tls_ptr(old_output_tls_ptr);
          detail::parsec_ttg_caller = nullptr;
+          task->suspended_task_address = suspended_task_address;
        } else
          ttg::abort();  // unrecognized task id
-#else  // TTG_HAS_COROUTINE
-ttg::abort();  // should not happen
-#endif  // TTG_HAS_COROUTINE
+#else  // TTG_HAVE_COROUTINE
+        ttg::abort();  // should not happen
+#endif  // TTG_HAVE_COROUTINE
      }
-      task->suspended_task_address = suspended_task_address;
 
      if (suspended_task_address == nullptr) {
        ttT *baseobj = task->tt;
@@ -1682,11 +1752,11 @@
      task_t *task = static_cast(parsec_task);
 
      void* suspended_task_address =
-#ifdef TTG_HAS_COROUTINE
+#ifdef TTG_HAVE_COROUTINE
        task->suspended_task_address;  // non-null = need to resume the task
-#else
+#else  // TTG_HAVE_COROUTINE
        nullptr;
-#endif
+#endif  // TTG_HAVE_COROUTINE
      if (suspended_task_address == nullptr) {  // task is a coroutine that has not started or an ordinary function
        ttT *baseobj = (ttT *)task->object_ptr;
        derivedT *obj = (derivedT *)task->object_ptr;
@@ -1701,7 +1771,7 @@
        detail::parsec_ttg_caller = NULL;
      } else {
-#ifdef TTG_HAS_COROUTINE
+#ifdef TTG_HAVE_COROUTINE
        auto ret = static_cast(ttg::coroutine_handle::from_address(suspended_task_address));
        assert(ret.ready());
        ret.resume();
@@ -1712,9 +1782,9 @@
        else {  // not yet completed
          // leave suspended_task_address as is
        }
-#else
+#else  // TTG_HAVE_COROUTINE
        ttg::abort();  // should not happen
-#endif
+#endif  // TTG_HAVE_COROUTINE
      }
      task->suspended_task_address = suspended_task_address;
@@ -3352,11 +3422,11 @@
      if (!need_pushout) {
        bool device_supported = false;
        if constexpr (derived_has_cuda_op()) {
-          device_supported = !world.impl().mpi_support(ttg::ExecutionSpace::CUDA);
+          device_supported = world.impl().mpi_support(ttg::ExecutionSpace::CUDA);
        } else if constexpr (derived_has_hip_op()) {
-          device_supported = !world.impl().mpi_support(ttg::ExecutionSpace::HIP);
+          device_supported = world.impl().mpi_support(ttg::ExecutionSpace::HIP);
        } else if constexpr (derived_has_level_zero_op()) {
-          device_supported = !world.impl().mpi_support(ttg::ExecutionSpace::L0);
+          device_supported = world.impl().mpi_support(ttg::ExecutionSpace::L0);
        }
        /* if MPI supports the device we don't care whether we have remote peers
         * because we can send from the device directly */
@@ -3379,7 +3449,6 @@
      auto remote_check = [&](){
        auto world = ttg_default_execution_context();
        int rank = world.rank();
-        uint64_t pos = 0;
        bool remote = keylist.end() != std::find_if(keylist.begin(), keylist.end(), [&](const Key &key) { return keymap(key) != rank; });
        return remote;
@@ -3628,6 +3697,7 @@
      task_t *task = (task_t*)parsec_task;
 
+#ifdef TTG_HAVE_COROUTINE
      /* if we still have a coroutine handle we invoke it one more time to get the sends/broadcasts */
      if (task->suspended_task_address) {
        assert(task->coroutine_id != ttg::TaskCoroutineID::Invalid);
@@ -3658,6 +3728,7 @@
        /* the coroutine should have completed and we cannot access the promise anymore */
        task->suspended_task_address = nullptr;
      }
+#endif  // TTG_HAVE_COROUTINE
 
      /* release our data copies */
      for (int i = 0; i < task->data_count; i++) {
@@ -3739,29 +3810,31 @@
      if constexpr (derived_has_cuda_op()) {
        self.incarnations = (__parsec_chore_t *)malloc(3 * sizeof(__parsec_chore_t));
        ((__parsec_chore_t *)self.incarnations)[0].type = PARSEC_DEV_CUDA;
-        ((__parsec_chore_t *)self.incarnations)[0].evaluate = NULL;
+        ((__parsec_chore_t *)self.incarnations)[0].evaluate = &detail::evaluate_cuda;
        ((__parsec_chore_t *)self.incarnations)[0].hook = &detail::hook_cuda;
        ((__parsec_chore_t *)self.incarnations)[1].type = PARSEC_DEV_NONE;
        ((__parsec_chore_t *)self.incarnations)[1].evaluate = NULL;
        ((__parsec_chore_t *)self.incarnations)[1].hook = NULL;
-      } else if (derived_has_hip_op()) {
+      } else if constexpr (derived_has_hip_op()) {
        self.incarnations = (__parsec_chore_t *)malloc(3 * sizeof(__parsec_chore_t));
        ((__parsec_chore_t *)self.incarnations)[0].type = PARSEC_DEV_HIP;
-        ((__parsec_chore_t *)self.incarnations)[0].evaluate = NULL;
+        ((__parsec_chore_t *)self.incarnations)[0].evaluate = &detail::evaluate_hip;
        ((__parsec_chore_t *)self.incarnations)[0].hook = &detail::hook_hip;
        ((__parsec_chore_t *)self.incarnations)[1].type = PARSEC_DEV_NONE;
        ((__parsec_chore_t *)self.incarnations)[1].evaluate = NULL;
        ((__parsec_chore_t *)self.incarnations)[1].hook = NULL;
-      } else if (derived_has_level_zero_op()) {
+#if defined(PARSEC_HAVE_DEV_LEVEL_ZERO_SUPPORT)
+      } else if constexpr (derived_has_level_zero_op()) {
        self.incarnations = (__parsec_chore_t *)malloc(3 * sizeof(__parsec_chore_t));
        ((__parsec_chore_t *)self.incarnations)[0].type = PARSEC_DEV_LEVEL_ZERO;
-        ((__parsec_chore_t *)self.incarnations)[0].evaluate = NULL;
+        ((__parsec_chore_t *)self.incarnations)[0].evaluate = &detail::evaluate_level_zero;
        ((__parsec_chore_t *)self.incarnations)[0].hook = &detail::hook_level_zero;
        ((__parsec_chore_t *)self.incarnations)[1].type = PARSEC_DEV_NONE;
        ((__parsec_chore_t *)self.incarnations)[1].evaluate = NULL;
        ((__parsec_chore_t *)self.incarnations)[1].hook = NULL;
+#endif // PARSEC_HAVE_DEV_LEVEL_ZERO_SUPPORT
      } else {
        self.incarnations = (__parsec_chore_t *)malloc(2 * sizeof(__parsec_chore_t));
        ((__parsec_chore_t *)self.incarnations)[0].type = PARSEC_DEV_CPU;
@@ -4156,6 +4229,37 @@
      priomap = std::forward(pm);
    }
 
+    /// device map setter
+    /// The device map provides a hint on which device a task should execute.
+    /// TTG may not be able to honor the request and the corresponding task
+    /// may execute on a different device.
+    /// @arg pm a function that provides a hint on which device the task should execute.
+    template
+    void set_devicemap(Devicemap&& dm) {
+      static_assert(derived_has_device_op(), "Device map only allowed on device-enabled TT!");
+      if constexpr (std::is_same_v()))>) {
+        // dm returns a Device
+        devicemap = std::forward(dm);
+      } else {
+        // convert dm return into a Device
+        devicemap = [=](const keyT& key) {
+          if constexpr (derived_has_cuda_op()) {
+            return ttg::device::Device(dm(key), ttg::ExecutionSpace::CUDA);
+          } else if constexpr (derived_has_hip_op()) {
+            return ttg::device::Device(dm(key), ttg::ExecutionSpace::HIP);
+          } else if constexpr (derived_has_level_zero_op()) {
+            return ttg::device::Device(dm(key), ttg::ExecutionSpace::L0);
+          } else {
+            throw std::runtime_error("Unknown device type!");
+          }
+        };
+      }
+    }
+
+    /// device map accessor
+    /// @return the device map
+    auto get_devicemap() { return devicemap; }
+
    // Register the static_op function to associate it to instance_id
    void register_static_op_function(void) {
      int rank;
@@ -4266,8 +4370,6 @@ struct ttg::detail::value_copy_handler {
      /* this copy won't be modified anymore so mark it as read-only */
      copy->reset_readers();
    }
-    /* the value was potentially changed, so increment version */
-    copy->inc_current_version();
  }
  /* We're coming from a writer so mark the data as modified.
   * That way we can force a pushout in prepare_send if we move to read-only tasks (needed by PaRSEC).
   */
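For reference, the set_devicemap() hint added above is what the potrf example earlier in this patch calls under ENABLE_DEVICE_KERNEL. The fragment below restates that usage in isolation; it is not a complete program (tt_trsm, Key2 and A come from examples/potrf), and an integer return value is converted to a ttg::device::Device in the TT's execution space as implemented in the setter.

// Hint (not a guarantee) of the device each TRSM task should run on,
// mirroring examples/potrf/potrf.h above.
auto devmap2a = [&](const Key2& key) {
  return (key[0] / A.P()) % ttg::device::num_devices();
};
#ifdef ENABLE_DEVICE_KERNEL
tt_trsm->set_devicemap(devmap2a);  // the runtime may still place the task elsewhere
#endif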
diff --git a/ttg/ttg/tt.h b/ttg/ttg/tt.h
index 7024776aa..6b3972984 100644
--- a/ttg/ttg/tt.h
+++ b/ttg/ttg/tt.h
@@ -1,12 +1,13 @@
 #ifndef TTG_TT_H
 #define TTG_TT_H
 
+#include "ttg/config.h"
 #include "ttg/fwd.h"
 
 #include "ttg/base/tt.h"
 #include "ttg/edge.h"
 
-#ifdef TTG_HAS_COROUTINE
+#ifdef TTG_HAVE_COROUTINE
 #include "ttg/coroutine.h"
 #endif
 
@@ -96,9 +97,9 @@ namespace ttg {
     ttg::World get_world() const override final { return tts[0]->get_world(); }
 
-    void fence() { tts[0]->fence(); }
+    void fence() override { tts[0]->fence(); }
 
-    void make_executable() {
+    void make_executable() override {
       for (auto &op : tts) op->make_executable();
     }
 
@@ -176,7 +177,7 @@
 }  // namespace ttg
 
 #ifndef TTG_PROCESS_TT_OP_RETURN
-#ifdef TTG_HAS_COROUTINE
+#ifdef TTG_HAVE_COROUTINE
 #define TTG_PROCESS_TT_OP_RETURN(result, id, invoke)                            \
   {                                                                             \
     using return_type = decltype(invoke);                                       \
@@ -196,7 +197,7 @@
     }                                                                           \
   }
 #else
-#define TTG_PROCESS_TT_OP_RETURN(result, invoke) invoke
+#define TTG_PROCESS_TT_OP_RETURN(result, id, invoke) invoke
 #endif
 #else
 #error "TTG_PROCESS_TT_OP_RETURN already defined in ttg/tt.h, check your header guards"
diff --git a/ttg/ttg/util/meta.h b/ttg/ttg/util/meta.h
index b7bb31690..f3af03152 100644
--- a/ttg/ttg/util/meta.h
+++ b/ttg/ttg/util/meta.h
@@ -848,18 +848,18 @@ namespace ttg {
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // keymap_t = std::function, protected against void key
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    template
+    template
    struct keymap;
-    template
-    struct keymap>> {
-      using type = std::function;
+    template
+    struct keymap>> {
+      using type = std::function;
    };
-    template
-    struct keymap>> {
-      using type = std::function;
+    template
+    struct keymap>> {
+      using type = std::function;
    };
-    template
-    using keymap_t = typename keymap::type;
+    template
+    using keymap_t = typename keymap::type;
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // input_reducers_t = std::tuple<