From 835955e64e3882c32ef1a355e94a907fb4a2ddce Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 13 Oct 2022 16:21:17 -0400 Subject: [PATCH 001/259] Mock of ttg::make_device_tt and views Signed-off-by: Joseph Schuchart --- ttg/ttg/make_device_tt.h | 51 ++++++++++++++++++++++++++++++ ttg/ttg/make_tt.h | 2 ++ ttg/ttg/view.h | 68 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 121 insertions(+) create mode 100644 ttg/ttg/make_device_tt.h create mode 100644 ttg/ttg/view.h diff --git a/ttg/ttg/make_device_tt.h b/ttg/ttg/make_device_tt.h new file mode 100644 index 000000000..63f4c78eb --- /dev/null +++ b/ttg/ttg/make_device_tt.h @@ -0,0 +1,51 @@ +// to be #include'd within runtime::ttg namespace + +#ifndef TTG_MAKE_DEVICE_TT_H +#define TTG_MAKE_DEVICE_TT_H + +#include +#include + +#include "ttg/edge.h" + +template +auto make_tt(DevViewFuncT &&view_func, + DevKernelFuncT &&kernel_func, + DevOutFuncT &&out_func, + const std::tuple...> &inedges, + const std::tuple &outedges, const std::string &name = "wrapper", + const std::vector &innames = std::vector(sizeof...(input_edge_valuesT), "input"), + const std::vector &outnames = std::vector(sizeof...(output_edgesT), "output")) { + + // TODO: +} + + + +template +auto make_tt(HostFuncT &&host_func, + DevViewFuncT &&view_func, + DevKernelFuncT &&kernel_func, + DevOutFuncT &&out_func, + const std::tuple...> &inedges, + const std::tuple &outedges, const std::string &name = "wrapper", + const std::vector &innames = std::vector(sizeof...(input_edge_valuesT), "input"), + const std::vector &outnames = std::vector(sizeof...(output_edgesT), "output")) { + + // TODO: +} + +#endif // TTG_MAKE_DEVICE_TT_H diff --git a/ttg/ttg/make_tt.h b/ttg/ttg/make_tt.h index eabd3c34e..491d32e01 100644 --- a/ttg/ttg/make_tt.h +++ b/ttg/ttg/make_tt.h @@ -517,4 +517,6 @@ template (std::forward(func), inedges, outedges, name, innames, outnames); } +#include "ttg/make_device_tt.h" + #endif // TTG_MAKE_TT_H diff --git a/ttg/ttg/view.h b/ttg/ttg/view.h new file mode 100644 index 000000000..1eee6856e --- /dev/null +++ b/ttg/ttg/view.h @@ -0,0 +1,68 @@ +#ifndef TTG_VIEW_H +#define TTG_VIEW_H + +#include +#include "ttg/util/iovec.h" + +namespace ttg { + + namespace detail { + + template + struct typed_iov { + T* ptr; + std::size_t size; + }; + + template + struct typed_iovs { + std::tuple> iovs; + }; + + } // namespace detail + + template + struct view_t { + + view_t(HostT& obj, detail::typed_iovs iovs) + : m_obj(obj) + , m_iovs(iovs) + { } + + template + auto get_device_ptr() { + return std::get(m_iovs).ptr; + } + + template + std::size_t get_device_size() { + return std::get(m_iovs).size; + } + + HostT& get_host_object() { + return m_obj; + } + + private: + HostT& m_obj; + detail::typed_iovs m_iovs; + + }; + + template + auto make_view(HostT&& obj, detail::typed_iov iovs) { + /* TODO: allocate memory on the device and transfer the data to it */ + return view_t(obj, std::move(iovs)); + } + + template + auto new_view(HostT&& obj, detail::typed_iov iovs) { + /* TODO: allocate memory on the device, no copying needed */ + return view_t(obj, std::move(iovs)); + } + + +} // namespace ttg + + +#endif // TTG_VIEW_H From 7b6319b0fa80eca3b74664e6a93ef78cea959cef Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Fri, 14 Oct 2022 14:34:00 -0400 Subject: [PATCH 002/259] Add new header to CMake files Signed-off-by: Joseph Schuchart --- ttg/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ttg/CMakeLists.txt b/ttg/CMakeLists.txt index 96429ceca..b77a264f8 100644 
--- a/ttg/CMakeLists.txt +++ b/ttg/CMakeLists.txt @@ -53,6 +53,8 @@ set(ttg-impl-headers ${CMAKE_CURRENT_SOURCE_DIR}/ttg/traverse.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/world.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/make_tt.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/make_device_tt.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/view.h ) set(ttg-headers ${CMAKE_CURRENT_SOURCE_DIR}/ttg.h From 25d8c3c343f5a0d038ee387320ccdffc49303b01 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 20 Oct 2022 15:57:00 -0400 Subject: [PATCH 003/259] Add device_mock example to compile the ttg device mock API Signed-off-by: Joseph Schuchart --- examples/CMakeLists.txt | 2 + examples/device_mock/device_mock.cc | 194 ++++++++++++++++++ ttg/ttg/make_device_tt.h | 49 ++--- ttg/ttg/parsec/ttg.h | 2 + .../serialization/splitmd_data_descriptor.h | 12 +- ttg/ttg/util/iovec.h | 21 ++ ttg/ttg/view.h | 35 ++-- 7 files changed, 262 insertions(+), 53 deletions(-) create mode 100644 examples/device_mock/device_mock.cc create mode 100644 ttg/ttg/util/iovec.h diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 4fc51ce3d..2ca2e3214 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -37,6 +37,8 @@ add_ttg_executable(testing_dtrtri potrf/testing_dtrtri.cc LINK_LIBRARIES lapackp add_ttg_executable(testing_dlauum potrf/testing_dlauum.cc LINK_LIBRARIES lapackpp) add_ttg_executable(testing_dpoinv potrf/testing_dpoinv.cc LINK_LIBRARIES lapackpp) +add_ttg_executable(device_mock device_mock/device_mock.cc LINK_LIBRARIES MADworld) + if (OpenMP_CXX_FOUND AND TARGET std::execution) add_ttg_executable(fw-apsp-df floyd-warshall/floyd_warshall_df.cc LINK_LIBRARIES OpenMP::OpenMP_CXX std::execution MADworld) endif () diff --git a/examples/device_mock/device_mock.cc b/examples/device_mock/device_mock.cc new file mode 100644 index 000000000..9dd11e832 --- /dev/null +++ b/examples/device_mock/device_mock.cc @@ -0,0 +1,194 @@ +//#define TTG_USE_PARSEC 1 + +#ifdef TTG_USE_PARSEC +#endif // TTG_USE_PARSEC + +#define USE_PARSEC_PROF_API 0 + +#include +#include "../matrixtile.h" + +#include +#include +#include +#include +#include +// needed for madness::hashT and xterm_debug +#include + +using Key2 = std::tuple; + +using Key3 = std::tuple; + +/* number of tiles */ +#define KT 100 + +static void +dplasma_dprint_tile( int m, int n, + const parsec_tiled_matrix_dc_t* descA, + const double *M ); + + +template +auto make_gemm(ttg::Edge>& A, + ttg::Edge>& B, + ttg::Edge>& output_result) +{ + + ttg::Edge> C; + auto f_cpu = [=](const Key3& key, + const MatrixTile& A, + const MatrixTile& B, + MatrixTile& C, + std::tuple>, + ttg::Out>>& out) + { + int m = std::get<0>(key); + int n = std::get<1>(key); + int k = std::get<2>(key); + + /* + if(k == 0) { + dlprng(C.data(), 1789, A.mb()*B.nb()); + } + dgemm(A.data(), A.mb(), A.nb(), + B.data(), B.mb(), B.nb(), + 1.0, + C.data(), C.nb(), C.nb()); + */ + + if( k == KT-1 || C.data()[0] < 1e-9 ) { + ttg::send<0>(Key2{m, n}, std::move(C)); + } else { + ttg::send<1>(Key3{m, n, k+1}, std::move(C)); + } + }; + + auto f_gpu_host_views = [=](const Key3& key, + const MatrixTile& A, + const MatrixTile& B, + MatrixTile& C) + { + ttg::View, const T> dev_A = ttg::make_view( A, std::make_tuple(ttg::span(A.data(), A.size())) ); + ttg::View, const T> dev_B = ttg::make_view( B, std::make_tuple(ttg::span(B.data(), B.size())) ); + ttg::View, T> dev_C; + ttg::View dev_tmp; + T *host_tmp = new(T); + dev_tmp = ttg::new_view( *host_tmp, std::make_tuple(ttg::span(host_tmp, 1)) ); // dev_tmp is a promise of 1 T on the device, 
associated with host_tmp + + int k = std::get<2>(key); + if(0 == k) { + // view_new tells the runtime system that the device view needs to be allocated but doesn't need to be + // initialized with C.data(). However, C.data() is still associated with the device memory, so if the + // runtime system evicts that data from the device, it will be first copied back into C.data(). + dev_C = ttg::new_view( C, std::make_tuple(ttg::span(C.data(), C.size())) ); + } else { + dev_C = ttg::make_view( C, std::make_tuple(ttg::span(C.data(), C.size())) ); + } + + return std::make_tuple(dev_A, dev_B, dev_C, dev_tmp); + }; + + auto f_gpu_kernel = [=](const Key3& key, + ttg::View, const T>& dev_A, + ttg::View, const T>& dev_B, + ttg::View, T>& dev_C, + ttg::View& dev_tmp) + { + int k = std::get<2>(key); + + const MatrixTile& A = dev_A.get_host_object(); + const MatrixTile& B = dev_B.get_host_object(); + MatrixTile& C = dev_C.get_host_object(); + T& host_tmp = dev_tmp.get_host_object(); + auto beta = 1.0; + if(k == 0) { + //cublasDplrng(dev_C.get(0), C.mb(), C.nb()); + } + + /* + cublasDgemm(dev_A.get(0), A.mb(), A.nb(), + dev_B.get(0), B.mb(), B.nb(), + beta, + dev_C.get(0), C.mb(), C.nb()); + + cudaMemcpyAsync(&dev_C.get(0)[0], host_tmp, sizeof(T), cudaDeviceToHost); + */ + }; + + auto f_gpu_output_flows = [=](const Key3& key, + const MatrixTile& A, + const MatrixTile& B, + MatrixTile& C, + T& host_tmp, + std::tuple>, + ttg::Out>>& out) + { + int m = std::get<0>(key); + int n = std::get<1>(key); + int k = std::get<2>(key); + + if( k == KT-1 || host_tmp < 1e-9 ) { + ttg::send<0>(Key2{m, n}, std::move(C)); + } else { + ttg::send<1>(Key3{m, n, k+1}, std::move(C)); + } + delete &host_tmp; + }; + + /* If we only have GPU */ + auto gemm_tt = ttg::make_device_tt(f_gpu_host_views, f_gpu_kernel, f_gpu_output_flows, ttg::ExecutionSpace::CUDA, + ttg::edges(A, B), ttg::edges(output_result, C), + "GEMM", {"A", "B"}, {"output_result", "C"}); + +#if 0 + /* Alternative: to get both type of tasklets: */ + auto gemm_tt = ttg::make_device_tt(f_cpu, f_gpu_host_views, f_gpu_kernel, f_gpu_output_flows, ttg::ExecutionSpace::CUDA, + ttg::edges(A, B), ttg::edges(output_result, C), + "GEMM", {"A", "B"}, {"output_result", "C"}); +#endif + return gemm_tt; +} + +int main(int argc, char **argv) +{ + + std::chrono::time_point beg, end; + int N = 1024; + int M = N; + int NB = 128; + int check = 0; + int nthreads = -1; + const char* prof_filename = nullptr; + + if (argc > 1) { + N = M = atoi(argv[1]); + } + + if (argc > 2) { + NB = atoi(argv[2]); + } + + if (argc > 3) { + check = atoi(argv[3]); + } + + if (argc > 4) { + nthreads = atoi(argv[4]); + } + + ttg::initialize(argc, argv, nthreads); + + auto world = ttg::default_execution_context(); + + ttg::Edge> edge_a, edge_b; + ttg::Edge> edge_out; + + auto gemm_tt = make_gemm(edge_a, edge_b, edge_out); + + + + ttg::finalize(); + return 0; +} + diff --git a/ttg/ttg/make_device_tt.h b/ttg/ttg/make_device_tt.h index 63f4c78eb..6639f679f 100644 --- a/ttg/ttg/make_device_tt.h +++ b/ttg/ttg/make_device_tt.h @@ -1,51 +1,46 @@ // to be #include'd within runtime::ttg namespace -#ifndef TTG_MAKE_DEVICE_TT_H -#define TTG_MAKE_DEVICE_TT_H -#include -#include - -#include "ttg/edge.h" - -template -auto make_tt(DevViewFuncT &&view_func, - DevKernelFuncT &&kernel_func, - DevOutFuncT &&out_func, - const std::tuple...> &inedges, - const std::tuple &outedges, const std::string &name = "wrapper", - const std::vector &innames = std::vector(sizeof...(input_edge_valuesT), "input"), - const std::vector &outnames = 
std::vector(sizeof...(output_edgesT), "output")) { +auto make_device_tt(DevViewFuncT &&view_func, + DevKernelFuncT &&kernel_func, + DevOutFuncT &&out_func, + ttg::ExecutionSpace space, + const std::tuple...> &inedges, + const std::tuple &outedges, const std::string &name = "wrapper", + const std::vector &innames = std::vector(sizeof...(input_edge_valuesT), "input"), + const std::vector &outnames = std::vector(sizeof...(output_edgesT), "output")) { // TODO: + + return ttg::Void(); } -template -auto make_tt(HostFuncT &&host_func, - DevViewFuncT &&view_func, - DevKernelFuncT &&kernel_func, - DevOutFuncT &&out_func, - const std::tuple...> &inedges, - const std::tuple &outedges, const std::string &name = "wrapper", - const std::vector &innames = std::vector(sizeof...(input_edge_valuesT), "input"), - const std::vector &outnames = std::vector(sizeof...(output_edgesT), "output")) { +auto make_device_tt(HostFuncT &&host_func, + DevViewFuncT &&view_func, + DevKernelFuncT &&kernel_func, + DevOutFuncT &&out_func, + ttg::ExecutionSpace space, + const std::tuple...> &inedges, + const std::tuple &outedges, const std::string &name = "wrapper", + const std::vector &innames = std::vector(sizeof...(input_edge_valuesT), "input"), + const std::vector &outnames = std::vector(sizeof...(output_edgesT), "output")) { // TODO: + return ttg::Void(); } -#endif // TTG_MAKE_DEVICE_TT_H diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index b88874b0e..ef56f4e82 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -92,6 +92,8 @@ void parsec_taskpool_termination_detected(parsec_taskpool_t *tp); int parsec_add_fetch_runtime_task(parsec_taskpool_t *tp, int tasks); } +#include "ttg/view.h" + namespace ttg_parsec { typedef void (*static_set_arg_fct_type)(void *, size_t, ttg::TTBase *); typedef std::pair static_set_arg_fct_call_t; diff --git a/ttg/ttg/serialization/splitmd_data_descriptor.h b/ttg/ttg/serialization/splitmd_data_descriptor.h index 8edfe20d6..46bdb7b76 100644 --- a/ttg/ttg/serialization/splitmd_data_descriptor.h +++ b/ttg/ttg/serialization/splitmd_data_descriptor.h @@ -3,20 +3,10 @@ #include #include "ttg/util/meta.h" +#include "ttg/util/iovec.h" namespace ttg { - /** - * Used to describe transfer payload in types using the \sa SplitMetadataDescriptor. - * @c data Pointer to the data to be read from / written to. - * @c num_bytes The number of bytes to read from / write to the memory location - * \sa data. - */ - struct iovec { - size_t num_bytes; - void* data; - }; - /** * SplitMetadataDescriptor is a serialization descriptor provided by the user * for a user-specified type. It should contain the following public member diff --git a/ttg/ttg/util/iovec.h b/ttg/ttg/util/iovec.h new file mode 100644 index 000000000..2a8d672b3 --- /dev/null +++ b/ttg/ttg/util/iovec.h @@ -0,0 +1,21 @@ +#ifndef TTG_UTIL_IOVEC_H_ +#define TTG_UTIL_IOVEC_H_ + +#include + +namespace ttg { + + /** + * Used to describe transfer payload in types using the \sa SplitMetadataDescriptor. + * @member data Pointer to the data to be read from / written to. + * @member num_bytes The number of bytes to read from / write to the memory location + * \sa data. 
+ */ + struct iovec { + std::size_t num_bytes; + void* data; + }; + +} // ttg + +#endif // TTG_UTIL_IOVEC_H_ diff --git a/ttg/ttg/view.h b/ttg/ttg/view.h index 1eee6856e..73a8bf5e0 100644 --- a/ttg/ttg/view.h +++ b/ttg/ttg/view.h @@ -22,43 +22,48 @@ namespace ttg { } // namespace detail template - struct view_t { + struct View { - view_t(HostT& obj, detail::typed_iovs iovs) - : m_obj(obj) - , m_iovs(iovs) + View() + : m_obj(nullptr) + , m_spans(ttg::span(nullptr, std::size_t{0})...) + {} + + View(HostT& obj, std::tuple...> spans) + : m_obj(&obj) + , m_spans(std::move(spans)) { } template auto get_device_ptr() { - return std::get(m_iovs).ptr; + return std::get(m_spans).data(); } template std::size_t get_device_size() { - return std::get(m_iovs).size; + return std::get(m_spans).size(); } HostT& get_host_object() { - return m_obj; + return *m_obj; } private: - HostT& m_obj; - detail::typed_iovs m_iovs; + HostT* m_obj; + std::tuple...> m_spans; }; - template - auto make_view(HostT&& obj, detail::typed_iov iovs) { + template + auto make_view(HostT& obj, std::tuple spans) { /* TODO: allocate memory on the device and transfer the data to it */ - return view_t(obj, std::move(iovs)); + return View(obj, std::move(spans)); } - template - auto new_view(HostT&& obj, detail::typed_iov iovs) { + template + auto new_view(HostT& obj, std::tuple spans) { /* TODO: allocate memory on the device, no copying needed */ - return view_t(obj, std::move(iovs)); + return View(obj, std::move(spans)); } From 831bc0f09a790b6dda79263611846437481e7315 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 20 Oct 2022 16:03:24 -0400 Subject: [PATCH 004/259] [example/device_mock] added missing #include --- examples/device_mock/device_mock.cc | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/examples/device_mock/device_mock.cc b/examples/device_mock/device_mock.cc index 9dd11e832..ff37bf922 100644 --- a/examples/device_mock/device_mock.cc +++ b/examples/device_mock/device_mock.cc @@ -1,11 +1,6 @@ -//#define TTG_USE_PARSEC 1 - -#ifdef TTG_USE_PARSEC -#endif // TTG_USE_PARSEC - -#define USE_PARSEC_PROF_API 0 #include +#include #include "../matrixtile.h" #include From 5618e86b1c6275841efa76a1ddb7d3c1cb55b30b Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 20 Oct 2022 16:04:13 -0400 Subject: [PATCH 005/259] [example/device_mock] declare as off-limits to clang-format --- examples/device_mock/device_mock.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/device_mock/device_mock.cc b/examples/device_mock/device_mock.cc index ff37bf922..ca180a596 100644 --- a/examples/device_mock/device_mock.cc +++ b/examples/device_mock/device_mock.cc @@ -1,3 +1,4 @@ +// clang-format off #include #include @@ -186,4 +187,3 @@ int main(int argc, char **argv) ttg::finalize(); return 0; } - From 1e8fb69270a839f4c1ef77cb341f0f9c8cbbe555 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 20 Oct 2022 16:23:36 -0400 Subject: [PATCH 006/259] [example/device_mock] cleanup --- examples/device_mock/device_mock.cc | 6 ------ 1 file changed, 6 deletions(-) diff --git a/examples/device_mock/device_mock.cc b/examples/device_mock/device_mock.cc index ca180a596..971d1c3d1 100644 --- a/examples/device_mock/device_mock.cc +++ b/examples/device_mock/device_mock.cc @@ -19,12 +19,6 @@ using Key3 = std::tuple; /* number of tiles */ #define KT 100 -static void -dplasma_dprint_tile( int m, int n, - const parsec_tiled_matrix_dc_t* descA, - const double *M ); - - template auto make_gemm(ttg::Edge>& A, 
ttg::Edge>& B, From 4d2b9373a7818e3c3818962c2965c0a5c416dd9a Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 20 Oct 2022 17:35:43 -0400 Subject: [PATCH 007/259] Partial implementation of ttg::make_device_tt Signed-off-by: Joseph Schuchart --- ttg/ttg/make_device_tt.h | 49 ++++++++++++++++++++++++++++++++++++++-- ttg/ttg/view.h | 4 ++++ 2 files changed, 51 insertions(+), 2 deletions(-) diff --git a/ttg/ttg/make_device_tt.h b/ttg/ttg/make_device_tt.h index 6639f679f..65e9740f0 100644 --- a/ttg/ttg/make_device_tt.h +++ b/ttg/ttg/make_device_tt.h @@ -22,6 +22,38 @@ auto make_device_tt(DevViewFuncT &&view_func, } +namespace detail { + template + void invoke_unpacked_views(FuncT&& func, const keyT& key, std::tuple& views, std::index_sequence) { + func(key, std::get(views)...); + } + + template + void allocate_view_on_device(ttg::View& view, std::index_sequence) { + + /* TODO: allocate memory on device */ + + /* TODO: copy data to device */ + + allocate_view_on_device(view, std::index_sequence); + } + + template + void allocate_on_device(std::tuple& views, std::index_sequence) { + + auto& view = std::get(views); + allocate_view_on_device(view, std::make_index_sequence()); + + allocate_on_device(); + } + + template + void allocate_on_device(std::tuple& views, std::index_sequence) { + + } + +} // namespace detail + template &innames = std::vector(sizeof...(input_edge_valuesT), "input"), const std::vector &outnames = std::vector(sizeof...(output_edgesT), "output")) { - // TODO: - return ttg::Void(); + + + auto taskfn = [=](const keyT& key, auto... args){ + auto views = view_func(key, args...); + /* 1) allocate memory on device */ + auto device_view = views; + /* 2) move data from views to device */ + /* 3) call kernel function */ + detail::invoke_unpacked_views(key, views, std::index_sequence_for()); + /* 4) move data back out into host objects */ + /* 5) call output function */ + out_func(key, args...); + }; + + return make_tt(taskfn, inedges, outedges, innames, outnames); } diff --git a/ttg/ttg/view.h b/ttg/ttg/view.h index 73a8bf5e0..13c6b0e2a 100644 --- a/ttg/ttg/view.h +++ b/ttg/ttg/view.h @@ -48,6 +48,10 @@ namespace ttg { return *m_obj; } + constexpr static std::size_t size() { + return std::tuple_size_v; + } + private: HostT* m_obj; std::tuple...> m_spans; From 11c4d4d769b2e3a595c695921cce1c5934250634 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 26 Oct 2022 07:04:35 -0400 Subject: [PATCH 008/259] make device_mock compile again, and no need for MAWworld for no reason --- examples/device_mock/device_mock.cc | 2 -- ttg/ttg/make_device_tt.h | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/device_mock/device_mock.cc b/examples/device_mock/device_mock.cc index 971d1c3d1..e0d6d6c07 100644 --- a/examples/device_mock/device_mock.cc +++ b/examples/device_mock/device_mock.cc @@ -9,8 +9,6 @@ #include #include #include -// needed for madness::hashT and xterm_debug -#include using Key2 = std::tuple; diff --git a/ttg/ttg/make_device_tt.h b/ttg/ttg/make_device_tt.h index 65e9740f0..008285935 100644 --- a/ttg/ttg/make_device_tt.h +++ b/ttg/ttg/make_device_tt.h @@ -35,7 +35,7 @@ namespace detail { /* TODO: copy data to device */ - allocate_view_on_device(view, std::index_sequence); + allocate_view_on_device(view, std::index_sequence{}); } template From fa6359ac7ac60eda3121d69fa3cd776a8fa0485b Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 26 Oct 2022 11:45:26 -0400 Subject: [PATCH 009/259] Encapsulate callables passed to make_device_tt mock in a 
regular task Signed-off-by: Joseph Schuchart --- ttg/ttg/make_device_tt.h | 270 ++++++++++++++++++++++++++++++++------- ttg/ttg/view.h | 27 +++- 2 files changed, 249 insertions(+), 48 deletions(-) diff --git a/ttg/ttg/make_device_tt.h b/ttg/ttg/make_device_tt.h index 008285935..f5425e334 100644 --- a/ttg/ttg/make_device_tt.h +++ b/ttg/ttg/make_device_tt.h @@ -1,6 +1,170 @@ +#ifndef TTG_MAKE_DEVICE_TT_H +#define TTG_MAKE_DEVICE_TT_H + // to be #include'd within runtime::ttg namespace +namespace detail { +#ifdef TTG_USE_CUDA + inline thread_local cudaStream_t* ts_stream = nullptr; +#endif // TTG_USE_CUDA + + template + inline void invoke_with_unpacked_views(FuncT&& func, const keyT& key, std::tuple& views, std::index_sequence) { +#ifdef TTG_USE_CUDA + func(key, std::get(views)..., ts_stream); +#else // TTG_USE_CUDA + func(key, std::get(views)...); +#endif // TTG_USE_CUDA + } + + /* TODO: extract host objects from views */ + template + struct host_obj_type; + + template + struct host_obj_type> { + using type = std::tuple; + }; + + template + using host_obj_type_t = typename host_obj_type::type; + + template + inline void invoke_out_with_unpacked_views(FuncT&& func, const keyT& key, std::tuple views, std::index_sequence) { + func(key, std::get(views).get_host_object()...); + } + + template + inline void create_view_on_device(const ttg::View& view, std::tuple...>& dev_spans, std::index_sequence) { + + /* fill in pointers for the device -- we're relying on managed memory for this simple wrapper */ + typename std::tuple_element_t::span_tuple_type>::element_type *ptr; + size_t size; + ptr = view.template get_device_ptr(); + size = view.template get_device_size(); + //cudaMalloc(&ptr, span.size_bytes()); + std::get(dev_spans) = ttg::span(ptr, size); + + /* copy data to device */ + //cudaMemcpy(ptr, span.data(), span.size_bytes(), cudaMemcpyHostToDevice); +#ifdef TTG_USE_CUDA + cudaMemPrefetchAsync(span.data(), span.size_bytes(), 0, *ts_stream); +#endif // TTG_USE_CUDA + if constexpr(sizeof...(Is) > 0) { + create_view_on_device(view, dev_spans, std::index_sequence{}); + } + } + + template + inline void create_on_device(std::tuple& views, std::tuple& dev_views, std::index_sequence) { + + using view_tuple_t = typename std::tuple; + auto& view = std::get(views); + typename std::tuple_element_t::span_tuple_type dev_spans; + create_view_on_device(view, dev_spans, std::make_index_sequence::size()>()); + + /* set the view for the device */ + std::get(dev_views) = ttg::make_view(view.get_host_object(), dev_spans); + if constexpr(sizeof...(Is) > 0) { + create_on_device(views, dev_views, std::index_sequence{}); + } + } + + template + inline void sync_view_to_host(const ttg::View& view, std::tuple...>& dev_spans, std::index_sequence) { + /* prefetch back to host */ + + void *ptr; + auto& span = std::get(dev_spans); + + /* prefetch data from device */ +#ifdef TTG_USE_CUDA + cudaMemPrefetchAsync(span.data(), span.size_bytes(), cudaCpuDeviceId, *ts_stream); +#endif // TTG_USE_CUDA + + if constexpr(sizeof...(Is) > 0) { + sync_view_to_host(view, dev_spans, std::index_sequence{}); + } + } + + template + inline void sync_back_to_host(std::tuple& views, std::tuple& dev_views, std::index_sequence) { + + auto& view = std::get(views); + using view_type = std::remove_reference_t; + typename std::tuple_element_t>::span_tuple_type dev_spans; + sync_view_to_host(view, dev_spans, std::make_index_sequence()); + + if constexpr(sizeof...(Is) > 0) { + sync_back_to_host(views, dev_views, std::index_sequence{}); + } + } + + 
template + auto make_device_tt_helper(DevViewFuncT &&view_func, + DevKernelFuncT &&kernel_func, + DevOutFuncT &&out_func, + ttg::ExecutionSpace space, + const std::tuple...> &inedges, + const std::tuple &outedges, + const std::string &name, + const std::vector &innames, + const std::vector &outnames, + const ttg::typelist& full_input_args) { + + using output_terminals_type = typename ttg::edges_to_output_terminals>::type; + + auto taskfn = [=](const keyT& key, Args... args) mutable { + +#ifdef TTG_USE_CUDA + if (nullptr == ts_stream) { + ts_stream = new cudaStream_t(); + cudaStreamCreate(ts_stream); + } +#endif // TTG_USE_CUDA + + auto views = view_func(key, std::forward(args)...); + using view_tuple_t = std::remove_reference_t; + constexpr std::size_t view_tuple_size = std::tuple_size_v; + /* 1) allocate memory on device */ + auto device_views = views; + /* 2) move data from views to device */ + if constexpr(std::tuple_size_v > 0) { + create_on_device(views, device_views, std::make_index_sequence()); + } + /* 3) call kernel function */ + detail::invoke_with_unpacked_views(kernel_func, key, device_views, std::make_index_sequence()); + /* 4) move data back out into host objects */ + if constexpr(std::tuple_size_v > 0) { + sync_back_to_host(views, device_views, std::make_index_sequence()); + } + #ifdef TTG_USE_CUDA + /* wait for the */ + cudaStreamSynchronize(*ts_stream); + #endif // TTG_USE_CUDA + /* 5) call output function */ + detail::invoke_out_with_unpacked_views(out_func, key, views, std::make_index_sequence()); + }; + + using wrapT = typename CallableWrapTTArgsAsTypelist>::type; + + return std::make_unique(std::move(taskfn), inedges, outedges, name, innames, outnames); + + } + + +} // namespace detail + + template &innames = std::vector(sizeof...(input_edge_valuesT), "input"), const std::vector &outnames = std::vector(sizeof...(output_edgesT), "output")) { - // TODO: - - return ttg::Void(); -} - - -namespace detail { - template - void invoke_unpacked_views(FuncT&& func, const keyT& key, std::tuple& views, std::index_sequence) { - func(key, std::get(views)...); - } - - template - void allocate_view_on_device(ttg::View& view, std::index_sequence) { + using output_terminals_type = typename ttg::edges_to_output_terminals>::type; - /* TODO: allocate memory on device */ + constexpr auto void_key = ttg::meta::is_void_v; - /* TODO: copy data to device */ + // gross list of candidate argument types + using gross_candidate_func_args_t = ttg::meta::typelist< + ttg::meta::candidate_argument_bindings_t>, + ttg::meta::candidate_argument_bindings_t::value_type>..., + ttg::meta::typelist>; - allocate_view_on_device(view, std::index_sequence{}); - } + // net list of candidate argument types excludes the empty typelists for void arguments + using candidate_func_args_t = ttg::meta::filter_t; - template - void allocate_on_device(std::tuple& views, std::index_sequence) { + // gross argument typelist for invoking func, can include void for optional args + constexpr static auto func_is_generic = ttg::meta::is_generic_callable_v; + using gross_func_args_t = decltype(ttg::meta::compute_arg_binding_types_r(view_func, candidate_func_args_t{})); + constexpr auto DETECTED_HOW_TO_INVOKE_GENERIC_FUNC = + func_is_generic ? 
!std::is_same_v> : true; + static_assert(DETECTED_HOW_TO_INVOKE_GENERIC_FUNC, + "ttd::make_tt(func, inedges, ...): could not detect how to invoke generic callable func, either the " + "signature of func " + "is faulty, or inedges does match the expected list of types, or both"); - auto& view = std::get(views); - allocate_view_on_device(view, std::make_index_sequence()); + // net argument typelist + using func_args_t = ttg::meta::drop_void_t; + constexpr auto num_args = std::tuple_size_v; - allocate_on_device(); - } + // if given task id, make sure it's passed via const lvalue ref + constexpr bool TASK_ID_PASSED_AS_CONST_LVALUE_REF = + !void_key ? ttg::meta::probe_first_v : true; + static_assert(TASK_ID_PASSED_AS_CONST_LVALUE_REF, + "ttg::make_tt(func, ...): if given to func, the task id must be passed by const lvalue ref"); - template - void allocate_on_device(std::tuple& views, std::index_sequence) { + // if given out-terminal tuple, make sure it's passed via nonconst lvalue ref + constexpr bool have_outterm_tuple = + func_is_generic ? !ttg::meta::is_last_void_v + : ttg::meta::probe_last_v; + constexpr bool OUTTERM_TUPLE_PASSED_AS_NONCONST_LVALUE_REF = + have_outterm_tuple ? ttg::meta::probe_last_v : true; + static_assert( + OUTTERM_TUPLE_PASSED_AS_NONCONST_LVALUE_REF, + "ttg::make_tt(func, ...): if given to func, the output terminal tuple must be passed by nonconst lvalue ref"); - } - -} // namespace detail + // TT needs actual types of arguments to func ... extract them and pass to CallableWrapTTArgs + using input_edge_value_types = ttg::meta::typelist...>; + // input_args_t = {input_valuesT&&...} + using input_args_t = typename ttg::meta::take_first_n< + typename ttg::meta::drop_first_n::type, + std::tuple_size_v - (void_key ? 0 : 1) - (have_outterm_tuple ? 1 : 0)>::type; + constexpr auto NO_ARGUMENTS_PASSED_AS_NONCONST_LVALUE_REF = + !ttg::meta::is_any_nonconst_lvalue_reference_v; + static_assert( + NO_ARGUMENTS_PASSED_AS_NONCONST_LVALUE_REF, + "ttg::make_tt(func, inedges, outedges): one or more arguments to func can only be passed by nonconst lvalue " + "ref; this is illegal, should only pass arguments as const lavlue ref or (nonconst) rvalue ref"); + using decayed_input_args_t = ttg::meta::decayed_typelist_t; + // 3. full_input_args_t = edge-types with non-void types replaced by input_args_t + using full_input_args_t = ttg::meta::replace_nonvoid_t; + return detail::make_device_tt_helper(std::forward(view_func), + std::forward(kernel_func), + std::forward(out_func), + space, inedges, outedges, name, innames, outnames, + full_input_args_t{}); +} +#if 0 template &innames = std::vector(sizeof...(input_edge_valuesT), "input"), const std::vector &outnames = std::vector(sizeof...(output_edgesT), "output")) { - - - auto taskfn = [=](const keyT& key, auto... 
args){ - auto views = view_func(key, args...); - /* 1) allocate memory on device */ - auto device_view = views; - /* 2) move data from views to device */ - /* 3) call kernel function */ - detail::invoke_unpacked_views(key, views, std::index_sequence_for()); - /* 4) move data back out into host objects */ - /* 5) call output function */ - out_func(key, args...); - }; - - return make_tt(taskfn, inedges, outedges, innames, outnames); + /* drop the host function */ + return make_device_tt(view_func, kernel_func, out_func, space, inedges, outedges, name, innames, outnames); } - +#endif // 0 +#endif // TTG_MAKE_DEVICE_TT_H diff --git a/ttg/ttg/view.h b/ttg/ttg/view.h index 13c6b0e2a..1723482c7 100644 --- a/ttg/ttg/view.h +++ b/ttg/ttg/view.h @@ -24,23 +24,40 @@ namespace ttg { template struct View { + using span_tuple_type = std::tuple...>; + using host_type = HostT; + + using view_type = View; + View() : m_obj(nullptr) , m_spans(ttg::span(nullptr, std::size_t{0})...) {} - View(HostT& obj, std::tuple...> spans) + View(HostT& obj, span_tuple_type spans) : m_obj(&obj) , m_spans(std::move(spans)) { } + View(view_type&&) = default; + + View(const view_type&) = default; + + view_type& operator=(view_type&&) = default; + view_type& operator=(const view_type&) = default; + template auto get_device_ptr() { return std::get(m_spans).data(); } template - std::size_t get_device_size() { + const auto get_device_ptr() const { + return std::get(m_spans).data(); + } + + template + std::size_t get_device_size() const { return std::get(m_spans).size(); } @@ -48,13 +65,17 @@ namespace ttg { return *m_obj; } + const HostT& get_host_object() const { + return *m_obj; + } + constexpr static std::size_t size() { return std::tuple_size_v; } private: HostT* m_obj; - std::tuple...> m_spans; + span_tuple_type m_spans; }; From 0317069de76edaaafe594392450c014a729d3b59 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 26 Oct 2022 11:50:49 -0400 Subject: [PATCH 010/259] Use the new multiindex key type in device mock example Signed-off-by: Joseph Schuchart --- examples/device_mock/device_mock.cc | 49 +++++++++++++++-------------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/examples/device_mock/device_mock.cc b/examples/device_mock/device_mock.cc index e0d6d6c07..21bbbc7b3 100644 --- a/examples/device_mock/device_mock.cc +++ b/examples/device_mock/device_mock.cc @@ -10,16 +10,19 @@ #include #include -using Key2 = std::tuple; +#include "ttg/util/meta.h" -using Key3 = std::tuple; +#include + +using Key2 = ttg::MultiIndex<2>; +using Key3 = ttg::MultiIndex<3>; /* number of tiles */ #define KT 100 template -auto make_gemm(ttg::Edge>& A, - ttg::Edge>& B, +auto make_gemm(ttg::Edge>& A, + ttg::Edge>& B, ttg::Edge>& output_result) { @@ -31,9 +34,9 @@ auto make_gemm(ttg::Edge>& A, std::tuple>, ttg::Out>>& out) { - int m = std::get<0>(key); - int n = std::get<1>(key); - int k = std::get<2>(key); + int m = key[0]; + int n = key[1]; + int k = key[2]; /* if(k == 0) { @@ -54,8 +57,8 @@ auto make_gemm(ttg::Edge>& A, auto f_gpu_host_views = [=](const Key3& key, const MatrixTile& A, - const MatrixTile& B, - MatrixTile& C) + const MatrixTile& B, + MatrixTile&& C) { ttg::View, const T> dev_A = ttg::make_view( A, std::make_tuple(ttg::span(A.data(), A.size())) ); ttg::View, const T> dev_B = ttg::make_view( B, std::make_tuple(ttg::span(B.data(), B.size())) ); @@ -64,7 +67,7 @@ auto make_gemm(ttg::Edge>& A, T *host_tmp = new(T); dev_tmp = ttg::new_view( *host_tmp, std::make_tuple(ttg::span(host_tmp, 1)) ); // dev_tmp is a 
promise of 1 T on the device, associated with host_tmp - int k = std::get<2>(key); + int k = key[2]; if(0 == k) { // view_new tells the runtime system that the device view needs to be allocated but doesn't need to be // initialized with C.data(). However, C.data() is still associated with the device memory, so if the @@ -83,7 +86,7 @@ auto make_gemm(ttg::Edge>& A, ttg::View, T>& dev_C, ttg::View& dev_tmp) { - int k = std::get<2>(key); + int k = key[2]; const MatrixTile& A = dev_A.get_host_object(); const MatrixTile& B = dev_B.get_host_object(); @@ -105,16 +108,14 @@ auto make_gemm(ttg::Edge>& A, }; auto f_gpu_output_flows = [=](const Key3& key, - const MatrixTile& A, - const MatrixTile& B, - MatrixTile& C, - T& host_tmp, - std::tuple>, - ttg::Out>>& out) + const MatrixTile& A, + const MatrixTile& B, + MatrixTile& C, + T& host_tmp) { - int m = std::get<0>(key); - int n = std::get<1>(key); - int k = std::get<2>(key); + int m = key[0]; + int n = key[1]; + int k = key[2]; if( k == KT-1 || host_tmp < 1e-9 ) { ttg::send<0>(Key2{m, n}, std::move(C)); @@ -124,9 +125,11 @@ auto make_gemm(ttg::Edge>& A, delete &host_tmp; }; + //ttg::meta::type_printer x; + /* If we only have GPU */ - auto gemm_tt = ttg::make_device_tt(f_gpu_host_views, f_gpu_kernel, f_gpu_output_flows, ttg::ExecutionSpace::CUDA, - ttg::edges(A, B), ttg::edges(output_result, C), + auto gemm_tt = ttg::make_device_tt(f_gpu_host_views, f_gpu_kernel, f_gpu_output_flows, ttg::ExecutionSpace::CUDA, + ttg::edges(A, B, C), ttg::edges(output_result, C), "GEMM", {"A", "B"}, {"output_result", "C"}); #if 0 @@ -169,7 +172,7 @@ int main(int argc, char **argv) auto world = ttg::default_execution_context(); - ttg::Edge> edge_a, edge_b; + ttg::Edge> edge_a, edge_b; ttg::Edge> edge_out; auto gemm_tt = make_gemm(edge_a, edge_b, edge_out); From ffef761c5503d753e7f0663cc7796aa533c63dcb Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 26 Oct 2022 11:53:34 -0400 Subject: [PATCH 011/259] Remove superfluous todos The views on the device will be handled by the backend, not in ttg::make_view Signed-off-by: Joseph Schuchart --- ttg/ttg/view.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/ttg/ttg/view.h b/ttg/ttg/view.h index 1723482c7..404f453b1 100644 --- a/ttg/ttg/view.h +++ b/ttg/ttg/view.h @@ -81,13 +81,11 @@ namespace ttg { template auto make_view(HostT& obj, std::tuple spans) { - /* TODO: allocate memory on the device and transfer the data to it */ return View(obj, std::move(spans)); } template auto new_view(HostT& obj, std::tuple spans) { - /* TODO: allocate memory on the device, no copying needed */ return View(obj, std::move(spans)); } From 65668f4fae53691151dd234afbc540118d299e58 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 26 Oct 2022 15:28:49 -0400 Subject: [PATCH 012/259] Provide a scope for views and replace ttg::span with ttg::ViewSpan [ttg|std]::span is not a good fit because it carries a size template argument that we don't care about but makes handling more difficult. Plus, we need to encode the scope for each span. Three scopes are available: 1) SyncIn: copy host data to device before invoking the kernel callable. 2) SyncOut: copy the device data back to the host before invoking the output callable. 3) Allocate: allocate but do not synchronize in or out. Both SyncIn and SyncOut will allocate sufficient memory on the device. 
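For illustration, a view callable written against this API might look like the following sketch (names such as MatrixTile and Key3 are taken from the device_mock example above; the exact View/ViewSpan signatures follow the mock headers in this patch and may still change):

    auto view_fn = [](const Key3& key, const MatrixTile<double>& A, MatrixTile<double>&& C) {
      // SyncIn (the default): A.data() is copied to the device before the kernel callable runs
      auto dev_A = ttg::make_view(A, ttg::ViewSpan(A.data(), A.size()));
      // Allocate: device memory is reserved for C, but C.data() is not copied in
      auto dev_C = ttg::make_view(C, ttg::ViewSpan(C.data(), C.size(), ttg::ViewScope::Allocate));
      // SyncOut: the device-side value is copied back into *host_tmp before the output callable runs
      double* host_tmp = new double();
      auto dev_tmp = ttg::make_view(*host_tmp, ttg::ViewSpan(host_tmp, 1, ttg::ViewScope::SyncOut));
      return std::make_tuple(dev_A, dev_C, dev_tmp);
    };

The kernel callable then receives these views, and the output callable sees the host objects after any SyncOut spans have been synchronized back.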
Signed-off-by: Joseph Schuchart
---
 examples/device_mock/device_mock.cc | 16 +++--
 ttg/ttg/make_device_tt.h            | 43 +++++++------
 ttg/ttg/util/meta.h                 |  9 +++
 ttg/ttg/view.h                      | 94 ++++++++++++++++++++---------
 4 files changed, 108 insertions(+), 54 deletions(-)

diff --git a/examples/device_mock/device_mock.cc b/examples/device_mock/device_mock.cc
index 21bbbc7b3..daa6decbd 100644
--- a/examples/device_mock/device_mock.cc
+++ b/examples/device_mock/device_mock.cc
@@ -60,21 +60,25 @@ auto make_gemm(ttg::Edge>& A,
                                  const MatrixTile& B,
                                  MatrixTile&& C)
   {
-    ttg::View, const T> dev_A = ttg::make_view( A, std::make_tuple(ttg::span(A.data(), A.size())) );
-    ttg::View, const T> dev_B = ttg::make_view( B, std::make_tuple(ttg::span(B.data(), B.size())) );
+    // The default ViewScope::SyncIn scope tells the runtime that the data should be copied
+    // to the device before the kernel callable is invoked.
+    ttg::View, const T> dev_A = ttg::make_view( A, ttg::ViewSpan(A.data(), A.size()) );
+    ttg::View, const T> dev_B = ttg::make_view( B, ttg::ViewSpan(B.data(), B.size()) );
     ttg::View, T> dev_C;
     ttg::View dev_tmp;
     T *host_tmp = new(T);
-    dev_tmp = ttg::new_view( *host_tmp, std::make_tuple(ttg::span(host_tmp, 1)) ); // dev_tmp is a promise of 1 T on the device, associated with host_tmp
+    // ViewScope::SyncOut tells the runtime system that the view should be synchronized back to the
+    // host before invoking the output callable.
+    dev_tmp = ttg::make_view( *host_tmp, ttg::ViewSpan(host_tmp, 1, ttg::ViewScope::SyncOut) );

     int k = key[2];
     if(0 == k) {
-      // view_new tells the runtime system that the device view needs to be allocated but doesn't need to be
+      // ViewScope::Allocate tells the runtime system that the device view needs to be allocated but doesn't need to be
       // initialized with C.data(). However, C.data() is still associated with the device memory, so if the
       // runtime system evicts that data from the device, it will be first copied back into C.data().
- dev_C = ttg::new_view( C, std::make_tuple(ttg::span(C.data(), C.size())) ); + dev_C = ttg::make_view( C, ttg::ViewSpan(C.data(), C.size(), ttg::ViewScope::Allocate) ); } else { - dev_C = ttg::make_view( C, std::make_tuple(ttg::span(C.data(), C.size())) ); + dev_C = ttg::make_view( C, ttg::ViewSpan(C.data(), C.size()) ); } return std::make_tuple(dev_A, dev_B, dev_C, dev_tmp); diff --git a/ttg/ttg/make_device_tt.h b/ttg/ttg/make_device_tt.h index f5425e334..dbfbdc933 100644 --- a/ttg/ttg/make_device_tt.h +++ b/ttg/ttg/make_device_tt.h @@ -36,7 +36,9 @@ namespace detail { } template - inline void create_view_on_device(const ttg::View& view, std::tuple...>& dev_spans, std::index_sequence) { + inline void create_view_on_device(const ttg::View& view, + std::tuple...>& dev_spans, + std::index_sequence) { /* fill in pointers for the device -- we're relying on managed memory for this simple wrapper */ typename std::tuple_element_t::span_tuple_type>::element_type *ptr; @@ -44,18 +46,26 @@ namespace detail { ptr = view.template get_device_ptr(); size = view.template get_device_size(); //cudaMalloc(&ptr, span.size_bytes()); - std::get(dev_spans) = ttg::span(ptr, size); + std::get(dev_spans) = ttg::ViewSpan(ptr, size, view.template get_scope()); /* copy data to device */ //cudaMemcpy(ptr, span.data(), span.size_bytes(), cudaMemcpyHostToDevice); + if (view.template get_span().is_sync_in()) { #ifdef TTG_USE_CUDA - cudaMemPrefetchAsync(span.data(), span.size_bytes(), 0, *ts_stream); + cudaMemPrefetchAsync(span.data(), span.size_bytes(), 0, *ts_stream); #endif // TTG_USE_CUDA + } + if constexpr(sizeof...(Is) > 0) { create_view_on_device(view, dev_spans, std::index_sequence{}); } } + template + auto make_view_from_tuple(HostT& obj, std::tuple...>& spans, std::index_sequence) { + return ttg::make_view(obj, std::get(spans)...); + } + template inline void create_on_device(std::tuple& views, std::tuple& dev_views, std::index_sequence) { @@ -65,39 +75,36 @@ namespace detail { create_view_on_device(view, dev_spans, std::make_index_sequence::size()>()); /* set the view for the device */ - std::get(dev_views) = ttg::make_view(view.get_host_object(), dev_spans); + std::get(dev_views) = make_view_from_tuple(view.get_host_object(), dev_spans, std::make_index_sequence>{}); if constexpr(sizeof...(Is) > 0) { create_on_device(views, dev_views, std::index_sequence{}); } } - template - inline void sync_view_to_host(const ttg::View& view, std::tuple...>& dev_spans, std::index_sequence) { + template + inline void sync_view_to_host(ttg::View& dev_view, std::index_sequence) { /* prefetch back to host */ - - void *ptr; - auto& span = std::get(dev_spans); + auto span = dev_view.template get_span(); /* prefetch data from device */ + if (span.is_sync_out()) { #ifdef TTG_USE_CUDA - cudaMemPrefetchAsync(span.data(), span.size_bytes(), cudaCpuDeviceId, *ts_stream); + cudaMemPrefetchAsync(span.data(), span.size_bytes(), cudaCpuDeviceId, *ts_stream); #endif // TTG_USE_CUDA + } if constexpr(sizeof...(Is) > 0) { - sync_view_to_host(view, dev_spans, std::index_sequence{}); + sync_view_to_host(dev_view, std::index_sequence{}); } } template - inline void sync_back_to_host(std::tuple& views, std::tuple& dev_views, std::index_sequence) { + inline void sync_back_to_host(std::tuple& dev_views, std::index_sequence) { - auto& view = std::get(views); - using view_type = std::remove_reference_t; - typename std::tuple_element_t>::span_tuple_type dev_spans; - sync_view_to_host(view, dev_spans, std::make_index_sequence()); + 
sync_view_to_host(std::get(dev_views), std::make_index_sequence>::size()>());
     if constexpr(sizeof...(Is) > 0) {
       sync_back_to_host(dev_views, std::index_sequence{});
     }
   }

diff --git a/ttg/ttg/util/meta.h b/ttg/ttg/util/meta.h
index c19776118..37b43e4f4 100644
--- a/ttg/ttg/util/meta.h
+++ b/ttg/ttg/util/meta.h
@@ -549,6 +549,15 @@ namespace ttg {
     template
     constexpr bool is_tuple_v = is_tuple::value;

+    template
+    struct is_span : std::false_type {};
+
+    template
+    struct is_span> : std::true_type {};
+
+    template
+    constexpr bool is_span_v = is_span::value;
+
     template