diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index 68a36b4d0..155d5107a 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -88,7 +88,7 @@ jobs: # and build directories, but this is only available with CMake 3.13 and higher. # The CMake binaries on the Github Actions machines are (as of this writing) 3.12 run: | - cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE $BUILD_CONFIG || (cat CMakeFiles/CMakeOutput.log && cat CMakeFiles/CMakeError.log) + cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE $BUILD_CONFIG || (cat CMakeFiles/CMakeConfigureLog.yaml) - name: Build working-directory: ${{github.workspace}}/build @@ -113,11 +113,11 @@ jobs: working-directory: ${{github.workspace}}/build shell: bash run: | - cmake -S $GITHUB_WORKSPACE/doc/dox/dev/devsamp/helloworld -B test_install_devsamp_helloworld -DCMAKE_PREFIX_PATH=${{github.workspace}}/install || (cat test_install_devsamp_helloworld/CMakeFiles/CMakeOutput.log && cat test_install_devsamp_helloworld/CMakeFiles/CMakeError.log) + cmake -S $GITHUB_WORKSPACE/doc/dox/dev/devsamp/helloworld -B test_install_devsamp_helloworld -DCMAKE_PREFIX_PATH=${{github.workspace}}/install || (cat /home/runner/work/ttg/ttg/install/lib/cmake/ttg/ttg-config.cmake && test_install_devsamp_helloworld/CMakeFiles/CMakeConfigureLog.yaml) cmake --build test_install_devsamp_helloworld $MPIEXEC -n 2 test_install_devsamp_helloworld/helloworld-parsec $MPIEXEC -n 2 test_install_devsamp_helloworld/helloworld-mad - cmake -S $GITHUB_WORKSPACE/doc/dox/dev/devsamp/fibonacci -B test_install_devsamp_fibonacci -DCMAKE_PREFIX_PATH=${{github.workspace}}/install || (cat test_install_devsamp_fibonacci/CMakeFiles/CMakeOutput.log && cat test_install_devsamp_fibonacci/CMakeFiles/CMakeError.log) + cmake -S $GITHUB_WORKSPACE/doc/dox/dev/devsamp/fibonacci -B test_install_devsamp_fibonacci -DCMAKE_PREFIX_PATH=${{github.workspace}}/install || (cat /home/runner/work/ttg/ttg/install/lib/cmake/ttg/ttg-config.cmake && cat test_install_devsamp_fibonacci/CMakeFiles/CMakeConfigureLog.yaml) cmake --build test_install_devsamp_fibonacci $MPIEXEC -n 2 test_install_devsamp_fibonacci/fibonacci-parsec cmake -E make_directory test_install_userexamples @@ -130,7 +130,7 @@ jobs: add_ttg_executable(iterative $GITHUB_WORKSPACE/doc/dox/user/examples/iterative.cc NOT_EXCLUDE_FROM_ALL) add_ttg_executable(distributed $GITHUB_WORKSPACE/doc/dox/user/examples/distributed.cc NOT_EXCLUDE_FROM_ALL) EOF - cmake -S test_install_userexamples -B test_install_userexamples/build -DCMAKE_PREFIX_PATH=${{github.workspace}}/install || (cat test_install_userexamples/CMakeFiles/CMakeOutput.log && cat test_install_userexamples/CMakeFiles/CMakeError.log) + cmake -S test_install_userexamples -B test_install_userexamples/build -DCMAKE_PREFIX_PATH=${{github.workspace}}/install || (cat /home/runner/work/ttg/ttg/install/lib/cmake/ttg/ttg-config.cmake && cat test_install_devsamp_fibonacci/CMakeFiles/CMakeConfigureLog.yaml) cmake --build test_install_userexamples/build - name: Build+Deploy Dox diff --git a/CMakeLists.txt b/CMakeLists.txt index 5406877c8..63b0d2bfa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -60,7 +60,7 @@ option(TTG_EXAMPLES "Whether to build examples" OFF) option(TTG_ENABLE_ASAN "Whether to enable address sanitizer" OFF) option(TTG_ENABLE_COROUTINES "Whether to enable C++ coroutines, needed for accelerator device support" ON) -option(TTG_FETCH_BOOST "Whether to fetch+build Boost, if missing" OFF) +option(TTG_FETCH_BOOST "Whether to fetch+build Boost, if missing" ON) option(TTG_IGNORE_BUNDLED_EXTERNALS "Whether to skip installation and use of bundled external dependencies (Boost.CallableTraits)" OFF) option(TTG_ENABLE_TRACE "Whether to enable ttg::trace() output" OFF) # See https://medium.com/@alasher/colored-c-compiler-output-with-ninja-clang-gcc-10bfe7f2b949 diff --git a/INSTALL.md b/INSTALL.md index b8974f927..b9497ac0a 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -63,14 +63,14 @@ TTG includes several examples that may require additional prerequisites. These a ## useful cmake cache variables: -| Variable |Default | Description | -|--------------------------------------|--------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `TTG_ENABLE_CUDA` | `OFF` | whether to enable CUDA device support | -| `TTG_ENABLE_HIP` | `OFF` | whether to enable HIP/ROCm device support | -| `TTG_ENABLE_LEVEL_ZERO` | `OFF` | whether to enable Intel oneAPI Level Zero device support | -| `BUILD_TESTING` | `ON` | whether target `check-ttg` and its relatives will actually build and run unit tests | -| `TTG_EXAMPLES` | `OFF` | whether target `check-ttg` and its relatives will actually build and run examples; setting this to `ON` will cause detection of several optional prerequisites, and (if missing) building from source | -| `TTG_ENABLE_TRACE` | `OFF` | setting this to `ON` will enable the ability to instrument TTG code for tracing (see `ttg::trace()`, etc.); if this is set to `OFF`, `ttg::trace()` is a no-op | -| `TTG_PARSEC_USE_BOOST_SERIALIZATION` | `OFF` | whether to use Boost.Serialization for serialization for the PaRSEC backend; if this is set to `OFF`, PaRSEC backend will only be able to use trivially-copyable data types or, if MADNESS backend is available, MADNESS-serializable types. | -| `TTG_FETCH_BOOST` | `OFF` | whether to download and build Boost automatically, if missing | -| `TTG_IGNORE_BUNDLED_EXTERNALS` | `OFF` | whether to install and use bundled external dependencies (currently, only Boost.CallableTraits) | +| Variable | Default | Description | +|--------------------------------------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `TTG_ENABLE_CUDA` | `OFF` | whether to enable CUDA device support | +| `TTG_ENABLE_HIP` | `OFF` | whether to enable HIP/ROCm device support | +| `TTG_ENABLE_LEVEL_ZERO` | `OFF` | whether to enable Intel oneAPI Level Zero device support | +| `BUILD_TESTING` | `ON` | whether target `check-ttg` and its relatives will actually build and run unit tests | +| `TTG_EXAMPLES` | `OFF` | whether target `check-ttg` and its relatives will actually build and run examples; setting this to `ON` will cause detection of several optional prerequisites, and (if missing) building from source | +| `TTG_ENABLE_TRACE` | `OFF` | setting this to `ON` will enable the ability to instrument TTG code for tracing (see `ttg::trace()`, etc.); if this is set to `OFF`, `ttg::trace()` is a no-op | +| `TTG_PARSEC_USE_BOOST_SERIALIZATION` | `OFF` | whether to use Boost.Serialization for serialization for the PaRSEC backend; if this is set to `OFF`, PaRSEC backend will only be able to use trivially-copyable data types or, if MADNESS backend is available, MADNESS-serializable types. | +| `TTG_FETCH_BOOST` | `ON` | whether to download and build Boost automatically, if missing | +| `TTG_IGNORE_BUNDLED_EXTERNALS` | `OFF` | whether to install and use bundled external dependencies (currently, only Boost.CallableTraits) | diff --git a/cmake/modules/ExternalDependenciesVersions.cmake b/cmake/modules/ExternalDependenciesVersions.cmake index 1429de0bf..49f66fe0a 100644 --- a/cmake/modules/ExternalDependenciesVersions.cmake +++ b/cmake/modules/ExternalDependenciesVersions.cmake @@ -1,12 +1,17 @@ # for each dependency track both current and previous id (the variable for the latter must contain PREVIOUS) # to be able to auto-update them -set(TTG_TRACKED_VG_CMAKE_KIT_TAG 092efee765e039b02e0a9aaf013c12fc3c4e89cf) # used to provide "real" FindOrFetchBoost +set(TTG_TRACKED_VG_CMAKE_KIT_TAG d1b34157c349cf0a7c2f149b7704a682d53f6486) # provides FindOrFetchLinalgPP and "real" FindOrFetchBoost set(TTG_TRACKED_CATCH2_VERSION 3.5.0) -set(TTG_TRACKED_MADNESS_TAG 2eb3bcf0138127ee2dbc651f1aabd3e9b0def4e3) +set(TTG_TRACKED_MADNESS_TAG 93a9a5cec2a8fa87fba3afe8056607e6062a9058) set(TTG_TRACKED_PARSEC_TAG 58f8f3089ecad2e8ee50e80a9586e05ce8873b1c) -set(TTG_TRACKED_BTAS_TAG 4e8f5233aa7881dccdfcc37ce07128833926d3c2) -set(TTG_TRACKED_TILEDARRAY_TAG 493c109379a1b64ddd5ef59f7e33b95633b68d73) +set(TTG_TRACKED_BTAS_TAG c25b0a11d2a76190bfb13fa72f9e9dc3e57c3c2f) +set(TTG_TRACKED_TILEDARRAY_TAG 5944bdba3266a3fa19f1809c8e2accf3dad4d815) # need Boost.CallableTraits (header only, part of Boost 1.66 released in Dec 2017) for wrap.h to work -set(TTG_OLDEST_BOOST_VERSION 1.66) +# BUT if will be building examples, inherit the oldest version from the pickiest Boost consumer (TA and/or BSPMM) +if (TTG_EXAMPLES) + set(TTG_OLDEST_BOOST_VERSION 1.81) +else() + set(TTG_OLDEST_BOOST_VERSION 1.66) +endif() diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index c34e3c07e..b5d1ff622 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -20,7 +20,7 @@ if (TARGET tiledarray) add_ttg_executable(testing_dpoinv potrf/testing_dpoinv.cc LINK_LIBRARIES tiledarray lapackpp) if (TARGET CUDA::cublas) - add_ttg_executable(bspmm-cuda spmm/spmm_cuda.cc + add_ttg_executable(bspmm-cuda spmm/spmm.cc LINK_LIBRARIES tiledarray TiledArray_Eigen BTAS CUDA::cublas COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2;TTG_ENABLE_CUDA=1 RUNTIMES "parsec") @@ -32,7 +32,7 @@ if (TARGET tiledarray) RUNTIMES "parsec") endif(TARGET CUDA::cusolver) elseif (TARGET roc::hipblas) - add_ttg_executable(bspmm-hip spmm/spmm_cuda.cc + add_ttg_executable(bspmm-hip spmm/spmm.cc LINK_LIBRARIES tiledarray TiledArray_Eigen roc::hipblas COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2;TTG_ENABLE_HIP=1 RUNTIMES "parsec") @@ -43,7 +43,7 @@ if (TARGET tiledarray) RUNTIMES "parsec") endif(TARGET roc::hipsolver) elseif (TARGET MKL::MKL_DPCPP) - add_ttg_executable(bspmm-lz spmm/spmm_cuda.cc + add_ttg_executable(bspmm-lz spmm/spmm.cc LINK_LIBRARIES tiledarray TiledArray_Eigen BTAS MKL::MKL_DPCPP level_zero::ze_loader m COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2;TTG_ENABLE_LEVEL_ZERO=1 RUNTIMES "parsec") diff --git a/examples/matrixtile.h b/examples/matrixtile.h index a759b1c3c..203c58bf2 100644 --- a/examples/matrixtile.h +++ b/examples/matrixtile.h @@ -10,23 +10,27 @@ #include -#include +#include #if defined(TILEDARRAY_HAS_DEVICE) -#define ALLOCATOR TiledArray::device_pinned_allocator +template +using Allocator = TiledArray::device_pinned_allocator; inline void allocator_init(int argc, char **argv) { // initialize MADNESS so that TA allocators can be created #if defined(TTG_PARSEC_IMPORTED) madness::ParsecRuntime::initialize_with_existing_context(ttg::default_execution_context().impl().context()); -#endif // TTG_PARSEC_IMPORTED madness::initialize(argc, argv, /* nthread = */ 1, /* quiet = */ true); +#endif // TTG_PARSEC_IMPORTED } inline void allocator_fini() { +#if defined(TTG_PARSEC_IMPORTED) madness::finalize(); +#endif // TTG_PARSEC_IMPORTED } #else // TILEDARRAY_HAS_DEVICE -#define ALLOCATOR std::allocator +template +using Allocator = std::allocator; inline void allocator_init(int argc, char **argv) { } @@ -34,13 +38,13 @@ inline void allocator_fini() { } #endif // TILEDARRAY_HAS_DEVICE -template -class MatrixTile : public ttg::TTValue> { +template > +class MatrixTile : public ttg::TTValue> { public: using metadata_t = typename std::tuple; - using buffer_t = typename ttg::Buffer; - using ttvalue_type = ttg::TTValue>; + using buffer_t = typename ttg::Buffer; + using ttvalue_type = ttg::TTValue>; private: buffer_t _buffer; @@ -87,15 +91,15 @@ class MatrixTile : public ttg::TTValue> { , _lda(lda) { } - MatrixTile(MatrixTile&& other) = default; + MatrixTile(MatrixTile&& other) = default; - MatrixTile& operator=(MatrixTile&& other) = default; + MatrixTile& operator=(MatrixTile&& other) = default; /* Deep copy ctor und op are not needed for PO since tiles will never be read * and written concurrently. Hence shallow copies are enough, will all * receiving tasks sharing tile data. Re-enable this once the PaRSEC backend * can handle data sharing without excessive copying */ - MatrixTile(const MatrixTile& other) + MatrixTile(const MatrixTile& other) : ttvalue_type() , _buffer(other._lda*other._cols) , _rows(other._rows) @@ -108,7 +112,7 @@ class MatrixTile : public ttg::TTValue> { std::copy_n(other.data(), _lda * _cols, this->data()); } - MatrixTile& operator=(const MatrixTile& other) { + MatrixTile& operator=(const MatrixTile& other) { this->_rows = other._rows; this->_cols = other._cols; this->_lda = other._lda; @@ -166,7 +170,7 @@ class MatrixTile : public ttg::TTValue> { } #endif // DEBUG_TILES_VALUES - friend std::ostream& operator<<(std::ostream& o, MatrixTile const& tt) { + friend std::ostream& operator<<(std::ostream& o, MatrixTile const& tt) { auto ptr = tt.data(); o << std::endl << " "; o << "MatrixTile<" << typeid(T).name() << ">{ rows=" << tt.rows() << " cols=" << tt.cols() << " ld=" << tt.lda(); diff --git a/examples/potrf/potrf.h b/examples/potrf/potrf.h index d3a92cb2f..4e5e3c194 100644 --- a/examples/potrf/potrf.h +++ b/examples/potrf/potrf.h @@ -95,7 +95,7 @@ namespace potrf { ttg::Edge>& output_result) { using T = typename MatrixT::element_type; #if defined(ENABLE_DEVICE_KERNEL) - auto iallocator = std::make_shared>(); + auto iallocator = std::make_shared>(); //std::cout << "Creating CUDA POTRF task " << std::endl; auto f_dev = [=, iallocator = std::move(iallocator)] (const Key1& key, MatrixTile&& tile_kk, @@ -669,7 +669,8 @@ namespace potrf { template auto make_potrf_ttg(MatrixT& A, ttg::Edge>& input, - ttg::Edge>& output, bool defer_write) { + ttg::Edge>& output, bool defer_write, + bool enable_device_map = true) { using T = typename MatrixT::element_type; auto keymap1 = [&](const Key1& key) { return A.rank_of(key[0], key[0]); }; @@ -705,28 +706,36 @@ namespace potrf { tt_potrf->set_keymap(keymap1); tt_potrf->set_defer_writer(defer_write); #ifdef ENABLE_DEVICE_KERNEL - tt_potrf->set_devicemap(devmap1); + if (enable_device_map) { + tt_potrf->set_devicemap(devmap1); + } #endif // 0 auto tt_trsm = make_trsm(A, disp_trsm, potrf_trsm, gemm_trsm, trsm_syrk, trsm_gemm_row, trsm_gemm_col, output); tt_trsm->set_keymap(keymap2a); tt_trsm->set_defer_writer(defer_write); #ifdef ENABLE_DEVICE_KERNEL - tt_trsm->set_devicemap(devmap2a); + if (enable_device_map) { + tt_trsm->set_devicemap(devmap2a); + } #endif // 0 auto tt_syrk = make_syrk(A, disp_syrk, trsm_syrk, syrk_syrk, syrk_potrf, syrk_syrk); tt_syrk->set_keymap(keymap2b); tt_syrk->set_defer_writer(defer_write); #ifdef ENABLE_DEVICE_KERNEL - tt_syrk->set_devicemap(devmap2b); + if (enable_device_map) { + tt_syrk->set_devicemap(devmap2b); + } #endif // 0 auto tt_gemm = make_gemm(A, disp_gemm, trsm_gemm_row, trsm_gemm_col, gemm_gemm, gemm_trsm, gemm_gemm); tt_gemm->set_keymap(keymap3); tt_gemm->set_defer_writer(defer_write); #ifdef ENABLE_DEVICE_KERNEL - tt_gemm->set_devicemap(devmap3); + if (enable_device_map) { + tt_gemm->set_devicemap(devmap3); + } #endif // 0 /* Priorities taken from DPLASMA */ diff --git a/examples/potrf/testing_dpotrf.cc b/examples/potrf/testing_dpotrf.cc index 2a5f6dacb..781cc417c 100644 --- a/examples/potrf/testing_dpotrf.cc +++ b/examples/potrf/testing_dpotrf.cc @@ -61,6 +61,9 @@ int main(int argc, char **argv) bool check = !cmdOptionExists(argv+1, argv+argc, "-x"); bool cow_hint = !cmdOptionExists(argv+1, argv+argc, "-w"); + /* whether we set a device mapping */ + bool enable_device_map = !cmdOptionExists(argv, argv+argc, "--default-device-map"); + // TODO: need to filter out our arguments to make parsec happy ttg::initialize(1, argv, nthreads); @@ -130,7 +133,7 @@ int main(int argc, char **argv) init_tt->set_keymap([&]() {return world.rank();}); auto plgsy_ttg = make_plgsy_ttg(A, N, random_seed, startup, topotrf, cow_hint); - auto potrf_ttg = potrf::make_potrf_ttg(A, topotrf, result, cow_hint); + auto potrf_ttg = potrf::make_potrf_ttg(A, topotrf, result, cow_hint, enable_device_map); auto result_ttg = make_result_ttg(A, result, cow_hint); auto connected = make_graph_executable(init_tt.get()); diff --git a/examples/spmm/devicegemm.h b/examples/spmm/devicegemm.h new file mode 100644 index 000000000..bdfc99562 --- /dev/null +++ b/examples/spmm/devicegemm.h @@ -0,0 +1,81 @@ + +#if defined(TTG_ENABLE_LEVEL_ZERO) +#include +#include +#endif + +#include "../devblas_helper.h" + + +template +inline void device_gemm(Blk &C, const Blk &A, const Blk &B) { + using blk_t = Blk; + using T = typename blk_t::value_type; + static_assert(std::is_same_v || std::is_same_v); + static const T alpha = 1.0; + static const T beta = 1.0; + // make sure all memory is on the device + // TODO: A and B are read-only so the owner device will be 0. How to fix? + //assert(A.b.get_current_device() != 0); + //assert(B.b.get_current_device() != 0); + auto device = ttg::device::current_device(); + assert(device.is_device()); +#if defined(TTG_ENABLE_CUDA) + if constexpr (std::is_same_v) { + cublasDgemm(cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, C.extent(0), C.extent(1), A.extent(1), + &alpha, A.b.current_device_ptr(), A.extent(0), B.b.current_device_ptr(), B.extent(0), &beta, + C.b.current_device_ptr(), C.extent(0)); + } + else if constexpr (std::is_same_v) { + cublasSgemm(cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, C.extent(0), C.extent(1), A.extent(1), + &alpha, A.b.current_device_ptr(), A.extent(0), B.b.current_device_ptr(), B.extent(0), &beta, + C.b.current_device_ptr(), C.extent(0)); + } +#elif defined(TTG_ENABLE_HIP) + if constexpr (std::is_same_v) { + hipblasDgemm(hipblas_handle(), + HIPBLAS_OP_N, HIPBLAS_OP_N, + C.extent(0), C.extent(1), A.extent(1), &alpha, + A.b.current_device_ptr(), A.extent(0), + B.b.current_device_ptr(), B.extent(0), &beta, + C.b.current_device_ptr(), C.extent(0)); + } else if constexpr (std::is_same_v) { + hipblasSgemm(hipblas_handle(), + HIPBLAS_OP_N, HIPBLAS_OP_N, + C.extent(0), C.extent(1), A.extent(1), &alpha, + A.b.current_device_ptr(), A.extent(0), + B.b.current_device_ptr(), B.extent(0), &beta, + C.b.current_device_ptr(), C.extent(0)); + } +#elif defined(TTG_ENABLE_LEVEL_ZERO) + +#if defined(DEBUG_SYNCHRONOUS) + try { +#endif /* DEBUG_SYNCHRONOUS */ + cl::sycl::event gemm_event; + gemm_event = oneapi::mkl::blas::gemm(ttg::device::current_stream(), + oneapi::mkl::transpose::N, oneapi::mkl::transpose::N, + C.extent(0), C.extent(1), A.extent(1), + alpha, A.b.current_device_ptr(), A.extent(0), + B.b.current_device_ptr(), B.extent(0), + beta, C.b.current_device_ptr(), C.extent(0)); +#if defined(DEBUG_SYNCHRONOUS) + gemm_event.wait(); + } catch (const oneapi::mkl::invalid_argument &e) { + std::cerr << "OneAPI MKL BLAS GEMM throws invalid argument exception" << std::endl; + } catch (const oneapi::mkl::unsupported_device &e) { + std::cerr << "OneAPI MKL BLAS GEMM throws unsuported device exception" << std::endl; + } catch (const oneapi::mkl::host_bad_alloc &e) { + std::cerr << "OneAPI MKL BLAS GEMM throws host bad allocation exception" << std::endl; + } catch (const oneapi::mkl::device_bad_alloc &e) { + std::cerr << "OneAPI MKL BLAS GEMM throws device bad allocation exception" << std::endl; + } catch (const oneapi::mkl::unimplemented &e) { + std::cerr << "OneAPI MKL BLAS GEMM throws unimplemented exception" << std::endl; + } catch (const std::exception& e) { + std::cerr << "OneAPI MKL BLAS GEMM throws unexpected exception" << std::endl; + } catch (...) { + std::cerr << "OneAPI MKL BLAS GEMM throws unexpected exception that is also badly formatted..." << std::endl; + } +#endif /* DEBUG_SYNCHRONOUS */ +#endif +} diff --git a/examples/spmm/devicetensor.h b/examples/spmm/devicetensor.h new file mode 100644 index 000000000..d5a0f6d9f --- /dev/null +++ b/examples/spmm/devicetensor.h @@ -0,0 +1,218 @@ +#ifndef HAVE_DEVICETENSOR_H +#define HAVE_DEVICETENSOR_H + +#include + +#if __has_include() +#pragma message("C Preprocessor got here!") +#include +#ifdef BTAS_IS_USABLE +#include +#include +#include +#include +#include "../devblas_helper.h" +#include // need to initialize MADNESS purely for the purposes of TA allocators +#else +#warning "found btas/features.h but Boost.Iterators is missing, hence BTAS is unusable ... add -I/path/to/boost" +#endif +#endif + +#if defined(BTAS_IS_USABLE) + +/** + * Derives from btas::Tensor and wraps a ttg::Buffer + * to enable device support in SPMM. The ttg::Buffer + * does not own the host memory but mananages the device + * memory. + */ +template +struct DeviceTensor : public ttg::TTValue> + , public btas::Tensor<_T, _Range, _Storage> { + using tensor_type = typename btas::Tensor<_T, _Range, _Storage>; + using ttvalue_type = typename ttg::TTValue>; + ttg::Buffer<_T> b; // does not own the host buffer + + using value_type = typename tensor_type::value_type; + using size_type = typename tensor_type::size_type; + using storage_type = typename tensor_type::storage_type; + using range_type = typename tensor_type::range_type; + + + public: + DeviceTensor() = default; + ~DeviceTensor() = default; + + /// constructor with index extent + template + explicit DeviceTensor(const size_type& first, const _args&... rest) + : ttvalue_type() + , tensor_type(first, rest...) + , b(this->size() ? this->data() : nullptr, this->size()) + { } + + /// construct from \c range, allocate data, but not initialized + template + explicit DeviceTensor(const Range& range, typename std::enable_if::value>::type* = 0) + : ttvalue_type() + , tensor_type(range) + , b(this->size() ? this->data() : nullptr, this->size()) + { } + + /// construct from \c range object, set all elements to \c v + template + DeviceTensor(const Range& range, value_type v, typename std::enable_if::value>::type* = 0) + : ttvalue_type() + , tensor_type(range) + , b(this->size() ? this->data() : nullptr, this->size()) + { } + + /// construct from \c range object, copy elements from \c vec + template + DeviceTensor(const Range& range, U* vec, typename std::enable_if::value>::type* = 0) + : ttvalue_type() + , tensor_type(range, vec) + , b(this->size() ? this->data() : nullptr, this->size()) + { } + + /// construct from \c range and \c storage + template + DeviceTensor(const Range& range, const Storage& storage, + typename std::enable_if::value & not std::is_same::value & + not std::is_same::value>::type* = 0) + : ttvalue_type() + , tensor_type(range, storage) + , b(this->size() ? this->data() : nullptr, this->size()) + { } + + /// copy-copy-construct from \c range and \c storage + DeviceTensor(const range_type& range, const storage_type& storage) + : ttvalue_type() + , tensor_type(range, storage) + , b(this->size() ? this->data() : nullptr, this->size()) + { } + + /// copy-move-construct from \c range and \c storage + DeviceTensor(const range_type& range, storage_type&& storage) + : ttvalue_type() + , tensor_type(range, std::forward(storage)) + , b(this->size() ? this->data() : nullptr, this->size()) + { } + + /// move-construct from \c range and \c storage + DeviceTensor(range_type&& range, storage_type&& storage) + : ttvalue_type() + , tensor_type(std::forward(range), std::forward(storage)) + , b(this->size() ? this->data() : nullptr, this->size()) + { } + + /// Construct an evaluated tensor + + /// This constructor will allocate memory for \c range.area() elements. Each element + /// will be initialized as: + /// \code + /// for(auto&& idx: range) + /// (*this)[idx] = op(*(it++)); + /// \endcode + /// \tparam Range An input Range type. + /// \tparam InIter An input iterator type. + /// \tparam Op A unary operation type + /// \param range the input range type + /// \param first An input iterator for the argument + /// \param op The unary operation to be applied to the argument data + template + DeviceTensor(const Range& range, InIter it, const Op& op, + typename std::enable_if::value>::type* = 0) + : ttvalue_type() + , tensor_type(range, it, op) + , b(this->size() ? this->data() : nullptr, this->size()) + { } + + /// copy constructor + /// It will accept Tensors and TensorViews + template ::value>::type> + DeviceTensor(const _Tensor& x) noexcept + : ttvalue_type() + , tensor_type(x.clone()) + , b(this->size() ? this->data() : nullptr, this->size()) + { + //std::cout << "DeviceTensor tensor_type copy ctor" << std::endl; + } + + /// copy constructor: devicebuf cannot be copied, so deleted + DeviceTensor(const DeviceTensor& x) noexcept + : ttvalue_type(x) + , tensor_type(x.clone()) + , b(this->size() ? this->data() : nullptr, this->size()) + { + //std::cout << "DeviceTensor copy ctor" << std::endl; + } + + /// move constructor + DeviceTensor(tensor_type&& x) noexcept + : ttvalue_type() + , tensor_type(std::move(x)) + , b(this->size() ? this->data() : nullptr, this->size()) + { + //std::cout << "DeviceTensor tensor_type move ctor" << std::endl; + } + + DeviceTensor(DeviceTensor&& x) noexcept + : ttvalue_type(std::move(x)) + , tensor_type(static_cast(x)) + , b(std::move(x.b)) + { + assert(this->data() == b.host_ptr()); + //std::cout << "DeviceTensor move ctor" << std::endl; + } + + /// copy assignment operator + template ::value && + not std::is_same::value>::type> + DeviceTensor& operator=(const _Tensor& x) noexcept { + tensor_type::operator=(x.clone()); + b.reset(this->size() ? this->data() : nullptr, this->size()); + //std::cout << "DeviceTensor tensor_type copy operator" << std::endl; + return *this; + } + + /// copy assignment operator + template ::value>::type, + class = typename std::enable_if< + std::is_same::value>::type> + DeviceTensor& operator=(const _Tensor& x) noexcept { + tensor_type::operator=(x.clone()); + b.reset(this->size() ? this->data() : nullptr, this->size()); + //std::cout << "DeviceTensor tensor_type copy operator" << std::endl; + return *this; + } + + /// copy assignment: devicebuf cannot be copied, deleted + DeviceTensor& operator=(const DeviceTensor& x) noexcept { + ttvalue_type::operator=(x); + tensor_type::operator=(x.clone()); + b.reset(this->size() ? this->data() : nullptr, this->size()); + //std::cout << "DeviceTensor copy operator" << std::endl; + return *this; + } + + /// move assignment operator + DeviceTensor& operator=(DeviceTensor&& x) noexcept { + ttvalue_type::operator=(std::move(x)); + tensor_type::operator=(static_cast(x)); + b = std::move(x.b); + //std::cout << "DeviceTensor move ctor" << std::endl; + return *this; + } + + using tensor_type::begin; + using tensor_type::cbegin; + using tensor_type::end; + using tensor_type::cend; + +}; + +#endif // defined(BTAS_IS_USABLE) + +#endif // HAVE_DEVICETENSOR_H \ No newline at end of file diff --git a/examples/spmm/spmm.cc b/examples/spmm/spmm.cc index 3760c9946..f29ae9088 100644 --- a/examples/spmm/spmm.cc +++ b/examples/spmm/spmm.cc @@ -29,8 +29,7 @@ #endif #include "ttg.h" - -using namespace ttg; +#include "../ttg_matrix.h" #include "ttg/util/future.h" @@ -39,12 +38,41 @@ using namespace ttg; #include "ttg/util/bug.h" +#include "devicetensor.h" +#include "devicegemm.h" + +using namespace ttg; + +#if defined(TTG_ENABLE_CUDA) +#define HAVE_SPMM_DEVICE 1 +static constexpr ttg::ExecutionSpace space = ttg::ExecutionSpace::CUDA; +#elif defined(TTG_ENABLE_HIP) +#define HAVE_SPMM_DEVICE 1 +static constexpr ttg::ExecutionSpace space = ttg::ExecutionSpace::HIP; +#elif defined(TTG_ENABLE_LEVEL_ZERO) +#define HAVE_SPMM_DEVICE 1 +static constexpr ttg::ExecutionSpace space = ttg::ExecutionSpace::L0; +#else +static constexpr ttg::ExecutionSpace space = ttg::ExecutionSpace::Host; +#endif + +/* set to true to automatically release constraints + * this removes the ability to control the window + * size and is equal to a window size of 1 */ +#define USE_AUTO_CONSTRAINT false + #if defined(BLOCK_SPARSE_GEMM) && defined(BTAS_IS_USABLE) using scalar_t = double; + +#if HAVE_SPMM_DEVICE +using blk_t = DeviceTensor>, + btas::Handle::shared_ptr>>; +#else // HAVE_SPMM_DEVICE using blk_t = btas::Tensor, btas::Handle::shared_ptr>>; +#endif // HAVE_SPMM_DEVICE -//#include -//static std::atomic reduce_count = 0; #if defined(TTG_USE_PARSEC) namespace ttg { @@ -101,7 +129,7 @@ using blk_t = double; template using SpMatrix = Eigen::SparseMatrix; template -using SpMatrixTriplet = Eigen::Triplet; // {row,col,value} +using SpMatrixTriplet = ttg::matrix::Triplet; // {row,col,value} #if defined(BLOCK_SPARSE_GEMM) && defined(BTAS_IS_USABLE) @@ -121,7 +149,7 @@ namespace btas { } template - btas::Tensor gemm(btas::Tensor &&C, const btas::Tensor &A, + void gemm(btas::Tensor &C, const btas::Tensor &A, const btas::Tensor &B) { using array = btas::DEFAULT::index; if (C.empty()) { // first contribution to C = allocate it and gemm with beta=0 @@ -131,11 +159,11 @@ namespace btas { else { // subsequent contributions to C = gemm with beta=1 btas::contract_222(1.0, A, array{1, 2}, B, array{2, 3}, 1.0, C, array{1, 3}, false, false); } - return std::move(C); + //return std::move(C); } } // namespace btas #endif // BTAS_IS_USABLE -double gemm(double C, double A, double B) { return C + A * B; } +inline void gemm(double& C, const double A, const double B) { C += A * B; } // template // struct colmajor_layout; @@ -206,7 +234,9 @@ class Read_SpMatrix : public TT, const auto i = it.row(); // IF the receiver uses the same keymap, these sends are local if (rank == this->ij_keymap_(Key<2>(std::initializer_list({i, j})))) { - ::send<0>(Key<2>(std::initializer_list({i, j})), it.value(), out); + ::send<0>(Key<2>(std::initializer_list({i, j})), + ttg::persistent(it.value()), + out); } } } @@ -230,14 +260,14 @@ class Write_SpMatrix : public TT, std::tuple<>, Write_SpMatrix, ttg: , write_back_(write_back) { } - void op(const Key<2> &key, typename baseT::input_values_tuple_type &&elem, std::tuple<> &) { + void op(const Key<2> &key, typename baseT::input_refs_tuple_type &&elem, std::tuple<> &) { if (write_back_) { std::lock_guard lock(mtx_); ttg::trace("rank =", default_execution_context().rank(), "/ thread_id =", reinterpret_cast(pthread_self()), "spmm.cc Write_SpMatrix wrote {", key[0], ",", key[1], "} = ", baseT::template get<0>(elem), " in ", static_cast(&matrix_), " with mutex @", static_cast(&mtx_), " for object @", static_cast(this)); - values_.emplace_back(key[0], key[1], baseT::template get<0>(elem)); + values_.emplace_back(key[0], key[1], std::move(baseT::template get<0>(elem))); } } @@ -269,7 +299,9 @@ class Write_SpMatrix : public TT, std::tuple<>, Write_SpMatrix, ttg: /// @tparam KeyMap2 maps {i,j} to processor /// @tparam KeyMap3 maps {i,j,k} to processor -template &)>, typename Keymap3 = std::function &)>, +template &)>, + typename Keymap3 = std::function &)>, typename Blk = blk_t> class SpMM25D { public: @@ -282,24 +314,31 @@ class SpMM25D { const std::vector> &b_cols_of_row, const std::vector> &b_rows_of_col, const std::vector &mTiles, const std::vector &nTiles, const std::vector &kTiles, Keymap2 ij_keymap, Keymap3 ijk_keymap, - long R, long parallel_bcasts = 1) + long P, long Q, long R, long parallel_bcasts = 1, bool enable_device_map = true) : a_cols_of_row_(a_cols_of_row) , b_rows_of_col_(b_rows_of_col) , a_rows_of_col_(a_rows_of_col) , b_cols_of_row_(b_cols_of_row) + , k_cnt_(a_rows_of_col_.size()+1) , ij_keymap_(std::move(ij_keymap)) , ijk_keymap_(std::move(ijk_keymap)) - , parallel_bcasts_(parallel_bcasts) { + , parallel_bcasts_(std::max(parallel_bcasts, 1L)) + , enable_device_map_(enable_device_map) { Edge, void> a_ctl, b_ctl; Edge, int> a_rowctl, b_colctl; // TODO: can we have multiple control inputs per TT? - bcast_a_ = std::make_unique(a, a_ctl, a_rowctl, local_a_ijk_, a_rows_of_col_, a_cols_of_row_, b_cols_of_row_, - ij_keymap_, ijk_keymap_, parallel_bcasts_); + auto constraint = ttg::make_shared_constraint>>(USE_AUTO_CONSTRAINT); + bcast_a_ = std::make_unique(a, local_a_ijk_, b_cols_of_row_, ij_keymap_, ijk_keymap_); + // add constraint with external mapper: key[1] represents `k` + bcast_a_->add_constraint(constraint, [](const Key<2>& key){ return key[1]; }); local_bcast_a_ = std::make_unique(local_a_ijk_, a_ijk_, b_cols_of_row_, ijk_keymap_); - bcast_b_ = std::make_unique(b, b_ctl, b_colctl, local_b_ijk_, a_rows_of_col_, b_cols_of_row_, b_rows_of_col_, - ij_keymap_, ijk_keymap_, parallel_bcasts_); + bcast_b_ = std::make_unique(b, local_b_ijk_, a_rows_of_col_, ij_keymap_, ijk_keymap_); + // add constraint with external mapper: key[0] represents `k` + bcast_b_->add_constraint(constraint, [](const Key<2>& key){ return key[0]; }); local_bcast_b_ = std::make_unique(local_b_ijk_, b_ijk_, a_rows_of_col_, ijk_keymap_); - multiplyadd_ = std::make_unique(a_ijk_, b_ijk_, c_ijk_, c_ij_p_, a_cols_of_row_, - b_rows_of_col_, mTiles, nTiles, ijk_keymap_); + multiplyadd_ = std::make_unique>(a_ijk_, b_ijk_, c_ijk_, c_ij_p_, a_cols_of_row_, + b_rows_of_col_, mTiles, nTiles, ijk_keymap_, constraint, + k_cnt_, P, Q, parallel_bcasts_, enable_device_map_); + reduce_c_ = std::make_unique(c_ij_p_, c, ij_keymap_); reduce_c_->template set_input_reducer<0>( [&](Blk &c_ij, const Blk &c_ij_p) { @@ -314,6 +353,11 @@ class SpMM25D { auto world = ttg::default_execution_context(); const auto my_rank = world.rank(); std::vector c_ij_procmask(world.size(), false); + std::vector first_k_map(world.size(), std::numeric_limits::max()); + std::size_t max_k = a_rows_of_col_.size(); + std::vector k_cnt; + k_cnt.resize(a_rows_of_col_.size(), 0); + int release_k = 0; for (auto i = 0ul; i != a_cols_of_row_.size(); ++i) { if (a_cols_of_row_[i].empty()) continue; for (auto j = 0ul; j != b_rows_of_col_.size(); ++j) { @@ -326,7 +370,10 @@ class SpMM25D { while (have_k) { const auto pR = ijk_keymap_(Key<3>{i, j, k}); assert(pR < c_ij_procmask.size()); + k_cnt[k]++; c_ij_procmask[pR] = true; + // find the first k that is needed from us by this rank + first_k_map[pR] = std::min(first_k_map[pR], k); /* get next k */ std::tie(k, have_k) = multiplyadd_->compute_next_k(i, j, k); } @@ -338,61 +385,21 @@ class SpMM25D { } } - /* kick off the first broadcast in each row of A - * this is used to enforce strict ordering within a row of A */ - for (int i = 0; i < a_cols_of_row_.size(); ++i) { - for (int k : a_cols_of_row_[i]) { - auto key = Key<2>(i, k); - if (world.rank() == ij_keymap_(key)) { - bcast_a_->template in<1>()->send(key, 0); - break; - } - } - } - - /* initial ctl input for a number of bcasts for A - * this is used to limit the number of concurrent bcasts */ - int to_start = parallel_bcasts; - for (int k = 0; - 0 < to_start && k < a_rows_of_col_.size(); - ++k) { - for (auto i : a_rows_of_col_[k]) { - auto key = Key<2>(i, k); - if (world.rank() == ij_keymap_(key)) { - //std::cout << "SPMM kick off BcastA " << key << std::endl; - bcast_a_->template in<2>()->sendk(key); - if (0 == --to_start) break; - } - } - } - - /* kick off the first broadcast in each column of B - * this is used to enforce strict ordering within a column of B */ - for (int j = 0; j < b_rows_of_col_.size(); ++j) { - for (int k : b_rows_of_col_[j]) { - auto key = Key<2>(k, j); - if (world.rank() == ij_keymap_(key)) { - //std::cout << "BcastB kick off " << key << std::endl; - bcast_b_->template in<1>()->send(key, 0); - break; - } - } + k_cnt.push_back(1); // we always want to release the last k + assert(k_cnt.size() == k_cnt_.size()); + // copy into atomic counters + std::size_t i = 0; + for (auto c : k_cnt) { + assert(i < k_cnt_.size()); + k_cnt_[i++].store(c, std::memory_order_relaxed); } - /* initial ctl input for bcasts for B */ - to_start = parallel_bcasts; - for (int k = 0; - 0 < to_start && k < b_cols_of_row_.size(); - ++k) { - for (auto j : b_cols_of_row_[k]) { - auto key = Key<2>(k, j); - if (world.rank() == ij_keymap_(key)) { - //std::cout << "SPMM kick off BcastB " << key << std::endl; - bcast_b_->template in<2>()->sendk(key); - if (0 == --to_start) break; - } - } - } + /* release the first parallel_bcasts_ k that are non-zero */ + auto k_cnt_iter = k_cnt.begin(); + do { + k_cnt_iter = std::find_if(k_cnt_iter, k_cnt.end(), [](auto c){ return c > 0; }); + } while (++k_cnt_iter != k_cnt.end() && std::distance(k_cnt_iter, k_cnt.end()) < parallel_bcasts_); + constraint->release(std::distance(k_cnt.begin(), k_cnt_iter)); TTGUNUSED(bcast_a_); TTGUNUSED(bcast_b_); @@ -402,7 +409,7 @@ class SpMM25D { /// Locally broadcast `A[i][k]` assigned to this processor `p` to matmul tasks `{i,j,k}` for all `j` such that /// `B[k][j]` exists AND `k` contribution to `C[i][j]` is assigned to this processor - class LocalBcastA : public TT, std::tuple, Blk>>, LocalBcastA, ttg::typelist> { + class LocalBcastA : public TT, std::tuple, Blk>>, LocalBcastA, ttg::typelist> { public: using baseT = typename LocalBcastA::ttT; @@ -439,32 +446,24 @@ class SpMM25D { }; // class LocalBcastA /// broadcast `A[i][k]` to all processors which will contain at least one `C[i][j]` such that `B[k][j]` exists - class BcastA : public TT, std::tuple, Blk>, Out, int>, Out, void>>, BcastA, ttg::typelist> { + class BcastA : public TT, std::tuple, Blk>>, BcastA, ttg::typelist> { public: using baseT = typename BcastA::ttT; - BcastA(Edge, Blk> &a_ik, Edge, void> &ctl, - Edge, int> &rowctl, Edge, Blk> &a_ikp, - const std::vector> &a_rows_of_col, - const std::vector> &a_cols_of_row, + BcastA(Edge, Blk> &a_ik, Edge, Blk> &a_ikp, const std::vector> &b_cols_of_row, - const Keymap2 &ij_keymap, const Keymap3 &ijk_keymap, - const int parallel_bcasts) - : baseT(edges(a_ik, rowctl, ctl), edges(a_ikp, rowctl, ctl), "SpMM25D::bcast_a", {"a_ik", "rowctl", "ctl"}, {"a_ikp", "rowctl", "ctl"}, ij_keymap) - , a_rows_of_col_(a_rows_of_col) - , a_cols_of_row_(a_cols_of_row) + const Keymap2 &ij_keymap, const Keymap3 &ijk_keymap) + : baseT(edges(a_ik), edges(a_ikp), "SpMM25D::bcast_a", {"a_ik"}, {"a_ikp"}, ij_keymap) , b_cols_of_row_(b_cols_of_row) - , ijk_keymap_(ijk_keymap) - , ij_keymap_(ij_keymap) - , parallel_bcasts_(parallel_bcasts) { + , ijk_keymap_(ijk_keymap) { this->set_priomap([](const Key<2>& key){ return std::numeric_limits::max() - key[0]; }); } - void op(const Key<2> &ik, typename baseT::input_values_tuple_type &&a_ik, - std::tuple, Blk>, Out, int>, Out, void>> &outs) { + void op(const Key<2> &ik, typename baseT::input_refs_tuple_type &&a_ik, + std::tuple, Blk>> &outs) { const auto i = ik[0]; // row const auto k = ik[1]; // col ttg::trace("BcastA(", i, ", ", k, ")"); @@ -482,86 +481,18 @@ class SpMM25D { ikp_keys.emplace_back(Key<3>({i, k, p})); procmap[p] = true; } - // TODO: debug - //if (p != world.rank() && ij_keymap_(Key<2>{k, j}) != p) { - // std::cout << "[" << world.rank() << "] BCAST A " << ik << " for C update " << Key<3>({i, k, p}) << " on " << p << " has B from " << ij_keymap_(Key<2>{k, j}) << std::endl; - //} } ::broadcast<0>(ikp_keys, std::move(baseT::template get<0>(a_ik)), outs); - - /* enable the next broadcast on this row */ - int row = i; - int col = k; - auto rowit = std::find(a_cols_of_row_[row].begin(), a_cols_of_row_[row].end(), col); - for (++rowit; rowit != a_cols_of_row_[row].end(); ++rowit) { - Key<2> key = {row, *rowit}; - if (world.rank() == this->get_keymap()(key)) { - ::send<1>(key, std::move(baseT::template get<1>(a_ik)), outs); - break; - } - } - - - /* enable next broadcast through a control message - * we don't check whether this tile is in B here, this is - * done inside the next task (see above) - * we walk the matrix A column-major in an attempt to send from top to bottom, left to right */ - long to_skip = parallel_bcasts_; - - auto colit = std::find(a_rows_of_col_[col].begin(), a_rows_of_col_[col].end(), row); - ++colit; // skip to next row - do { - for (; colit != a_rows_of_col_[col].end(); ++colit) { - Key<2> key = {*colit, col}; - if (world.rank() == this->get_keymap()(key)) { - if (0 == --to_skip) { - //std::cout << "BcastA sending to " << key << " from " << ik << std::endl; - ::sendk<2>(key, outs); - return; - } - } - } - /* nothing for us in this column, move on to the next column */ - if (++col < a_rows_of_col_.size()) { - colit = a_rows_of_col_[col].begin(); - } else { - break; - } - } while (1); - -#if 0 - do { - for (; it != a_cols_of_row_[i].end(); ++it) { - Key<2> key = {i, *it}; - if (world.rank() == this->get_keymap()(key)) { - if (0 == --to_skip) { - ::sendk<1>(key, outs); - return; - } - } - } - if ((i+1) < num_rows) { - it = a_cols_of_row_[++i].begin(); - } else { - break; - } - } while (1); -#endif // 0 } private: - //const std::vector> &a_cols_of_row_; - const std::vector> &a_rows_of_col_; - const std::vector> &a_cols_of_row_; const std::vector> &b_cols_of_row_; const Keymap3 &ijk_keymap_; - const Keymap2 &ij_keymap_; - const int parallel_bcasts_; }; // class BcastA /// Locally broadcast `B[k][j]` assigned to this processor `p` to matmul tasks `{i,j,k}` for all `k` such that /// `A[i][k]` exists AND `k` contribution to `C[i][j]` is assigned to this processor - class LocalBcastB : public TT, std::tuple, Blk>>, LocalBcastB, ttg::typelist> { + class LocalBcastB : public TT, std::tuple, Blk>>, LocalBcastB, ttg::typelist> { public: using baseT = typename LocalBcastB::ttT; @@ -597,30 +528,24 @@ class SpMM25D { }; // class LocalBcastB /// broadcast `B[k][j]` to all processors which will contain at least one `C[i][j]` such that `A[i][k]` exists - class BcastB : public TT, std::tuple, Blk>, Out, int>, Out, void>>, BcastB, ttg::typelist> { + class BcastB : public TT, std::tuple, Blk>>, BcastB, ttg::typelist> { public: using baseT = typename BcastB::ttT; - BcastB(Edge, Blk> &b_kj, Edge, void> ctl, Edge, int> colctl, Edge, Blk> &b_kjp, + BcastB(Edge, Blk> &b_kj, Edge, Blk> &b_kjp, const std::vector> &a_rows_of_col, - const std::vector> &b_cols_of_row, - const std::vector> &b_rows_of_col, - const Keymap2 &ij_keymap, const Keymap3 &ijk_keymap, - const int parallel_bcasts) - : baseT(edges(b_kj, colctl, ctl), edges(b_kjp, colctl, ctl), "SpMM25D::bcast_b", {"b_kj", "colctl", "ctl"}, {"b_kjp", "colctl", "ctl"}, ij_keymap) + const Keymap2 &ij_keymap, const Keymap3 &ijk_keymap) + : baseT(edges(b_kj), edges(b_kjp), "SpMM25D::bcast_b", {"b_kj"}, {"b_kjp"}, ij_keymap) , a_rows_of_col_(a_rows_of_col) - , b_cols_of_row_(b_cols_of_row) - , b_rows_of_col_(b_rows_of_col) , ijk_keymap_(ijk_keymap) - , parallel_bcasts_(parallel_bcasts) { this->set_priomap([](const Key<2>& key){ return std::numeric_limits::max() - key[1]; }); } - void op(const Key<2> &kj, typename baseT::input_values_tuple_type &&b_kj, - std::tuple, Blk>, Out, int>, Out, void>> &outs) { + void op(const Key<2> &kj, typename baseT::input_refs_tuple_type &&b_kj, + std::tuple, Blk>> &outs) { const auto k = kj[0]; // row const auto j = kj[1]; // col // broadcast b_kj to all processors which will contain at least one c_ij such that a_ik exists @@ -639,100 +564,75 @@ class SpMM25D { } } ::broadcast<0>(kjp_keys, std::move(baseT::template get<0>(b_kj)), outs); - - /* enable the next broadcast on this row */ - int row = k; - int col = j; - auto colit = std::find(b_rows_of_col_[col].begin(), b_rows_of_col_[col].end(), row); - for (++colit; colit != b_rows_of_col_[col].end(); ++colit) { - Key<2> key = {*colit, col}; - if (world.rank() == this->get_keymap()(key)) { - //std::cout << "BcastB kick off " << key << std::endl; - ::send<1>(key, std::move(baseT::template get<1>(b_kj)), outs); - break; - } - } - - /* enable next broadcast through a control message - * we don't check whether this tile is in A here, this is - * done inside the next task (see above) - * we run across a row to enable broadcasts */ - long to_skip = parallel_bcasts_; - - // iterator over the current row - auto rowit = std::find(b_cols_of_row_[row].begin(), b_cols_of_row_[row].end(), col); - ++rowit; // skip to next col - do { - for (; rowit != b_cols_of_row_[row].end(); ++rowit) { - Key<2> key = {row, *rowit}; - if (world.rank() == this->get_keymap()(key)) { - if (0 == --to_skip) { - //std::cout << "BcastB sending to " << key << " from " << kj << " pb " << parallel_bcasts_ << std::endl; - ::sendk<2>(key, outs); - return; - } else { - //std::cout << "BcastB skipping " << key << " from " << kj << " pb " << parallel_bcasts_ << std::endl; - } - } - } - /* nothing for us in this row, move on to the next row */ - if (++row != b_cols_of_row_.size()) { - rowit = b_cols_of_row_[row].begin(); - } else { - break; - } - } while (1); - - -#if 0 - std::size_t num_rows = b_cols_of_row_.size(); - auto it = std::find(b_cols_of_row_[k].begin(), b_cols_of_row_[k].end(), j); - ++it; // skip the current tile - long to_skip = parallel_bcasts_; - do { - for (; it != b_cols_of_row_[k].end(); ++it) { - Key<2> key = {k, *it}; - if (world.rank() == this->get_keymap()(key)) { - if (0 == --to_skip) { - ::sendk<1>(key, outs); - return; - } - } - } - if ((k+1) < num_rows) { - it = b_cols_of_row_[++k].begin(); - } else { - break; - } - } while (1); -#endif // 0 } private: const std::vector> &a_rows_of_col_; - const std::vector> &b_cols_of_row_; - const std::vector> &b_rows_of_col_; const Keymap3 &ijk_keymap_; - const int parallel_bcasts_; }; // class BcastB /// multiply task has 3 input flows: a_ijk, b_ijk, and c_ijk, c_ijk contains the running total for this layer of the /// 3-D process grid only - class MultiplyAdd : public TT, std::tuple, Blk>, Out, Blk>>, MultiplyAdd, + template + class MultiplyAdd : public TT, std::tuple, Blk>, Out, Blk>>, MultiplyAdd, ttg::typelist> { + static constexpr const bool is_device_space = (Space_ != ttg::ExecutionSpace::Host); + using task_return_type = std::conditional_t; + + void release_next_k(long k) { + assert(k_cnt_.size() > k); + long cnt = k_cnt_[k].fetch_sub(1, std::memory_order_relaxed)-1; + assert(cnt >= 0); + if (0 == cnt) { + auto release_k = k; + auto bcasts_ahead = parallel_bcasts_; + // this was the last gemm in this k, find the next one to release + while (++release_k < k_cnt_.size() && + (0 == k_cnt_[release_k].load(std::memory_order_relaxed) + || --bcasts_ahead > 0)) + { } + constraint->release(release_k); + } + } + + public: using baseT = typename MultiplyAdd::ttT; + /* communicate to the runtime which device we support (if any) */ + static constexpr bool have_cuda_op = (Space_ == ttg::ExecutionSpace::CUDA); + static constexpr bool have_hip_op = (Space_ == ttg::ExecutionSpace::HIP); + static constexpr bool have_level_zero_op = (Space_ == ttg::ExecutionSpace::L0); + MultiplyAdd(Edge, Blk> &a_ijk, Edge, Blk> &b_ijk, Edge, Blk> &c_ijk, Edge, Blk> &c, const std::vector> &a_cols_of_row, const std::vector> &b_rows_of_col, const std::vector &mTiles, - const std::vector &nTiles, const Keymap3 &ijk_keymap) + const std::vector &nTiles, const Keymap3 &ijk_keymap, + std::shared_ptr>> constraint, + std::vector>& k_cnt, + long P, long Q, + std::size_t parallel_bcasts, bool enable_device_map) : baseT(edges(a_ijk, b_ijk, c_ijk), edges(c, c_ijk), "SpMM25D::MultiplyAdd", {"a_ijk", "b_ijk", "c_ijk"}, {"c_ij", "c_ijk"}, ijk_keymap) , a_cols_of_row_(a_cols_of_row) - , b_rows_of_col_(b_rows_of_col) { + , b_rows_of_col_(b_rows_of_col) + , k_cnt_(k_cnt) + , constraint(std::move(constraint)) + , parallel_bcasts_(parallel_bcasts) { this->set_priomap([=,this](const Key<3> &ijk) { return this->prio(ijk); }); // map a key to an integral priority value - + if constexpr (is_device_space) { + if (enable_device_map) { + int num_devices = ttg::device::num_devices(); + int gp = std::sqrt(num_devices); + int gq = num_devices / gp; + this->set_devicemap( + [P,Q,gp,gq,num_devices](const Key<3> &ijk){ + // TODO: include the number of rows/columns in this formula + auto device = (((ijk[0]/P)%gp)*gq) + (ijk[1]/Q)%gq; + return device; + }); + } + } // for each {i,j} determine first k that contributes AND belongs to this node, // initialize input {i,j,first_k} flow to 0 for (auto i = 0ul; i != a_cols_of_row_.size(); ++i) { @@ -760,8 +660,8 @@ class SpMM25D { } } - void op(const Key<3> &ijk, typename baseT::input_refs_tuple_type &&_ijk, - std::tuple, Blk>, Out, Blk>> &result) { + task_return_type op(const Key<3> &ijk, typename baseT::input_refs_tuple_type &&_ijk, + std::tuple, Blk>, Out, Blk>> &result) { const auto i = ijk[0]; const auto j = ijk[1]; const auto k = ijk[2]; // k==l same because 000 will always be on layer 0, 001 will be accessed on layer 1 @@ -774,25 +674,64 @@ class SpMM25D { " C[", i, "][", j, "] += A[", i, "][", k, "] by B[", k, "][", j, "], next_k? ", (have_next_k ? std::to_string(next_k) : "does not exist")); + // release the constraint on the next round of broadcasts + release_next_k(k); + const blk_t& A = baseT::template get<0>(_ijk); + const blk_t& B = baseT::template get<1>(_ijk); + blk_t& C = baseT::template get<2>(_ijk); + +#if defined(BLOCK_SPARSE_GEMM) + if (C.empty()) { + C = blk_t(btas::Range(A.range().extent(0), B.range().extent(1)), 0.0); + } +#endif // BLOCK_SPARSE_GEMM + +#ifdef HAVE_SPMM_DEVICE + /* pull all buffers onto the device */ + co_await ttg::device::select(A.b, B.b, C.b); + + /* everything is on the device, call the gemm */ + device_gemm(C, A, B); + + // pass the running total to the next flow, if needed + // otherwise write to the result flow + if (have_next_k) { + co_await ttg::device::forward(ttg::device::send<1>( + Key<3>({i, j, next_k}), + std::move(C), + result)); + } else { // done with all local contributions to C[i][j], reduce with others on the process to which C[i][j] + // belongs + co_await ttg::device::forward(ttg::device::send<0>( + Key<2>({i, j}), + std::move(C), + result)); + } +#else // HAVE_SPMM_DEVICE + gemm(C, A, B); // compute the contrib, pass the running total to the next flow, if needed // otherwise write to the result flow if (have_next_k) { ::send<1>( Key<3>({i, j, next_k}), - gemm(std::move(baseT::template get<2>(_ijk)), baseT::template get<0>(_ijk), baseT::template get<1>(_ijk)), + std::move(C), result); } else { // done with all local contributions to C[i][j], reduce with others on the process to which C[i][j] // belongs ::send<0>( Key<2>({i, j}), - gemm(std::move(baseT::template get<2>(_ijk)), baseT::template get<0>(_ijk), baseT::template get<1>(_ijk)), + std::move(C), result); } +#endif // HAVE_SPMM_DEVICE } private: const std::vector> &a_cols_of_row_; const std::vector> &b_rows_of_col_; + std::vector>& k_cnt_; + std::shared_ptr>> constraint; + std::size_t parallel_bcasts_; /* Compute the length of the remaining sequence on that tile */ int32_t prio(const Key<3> &key) const { @@ -936,11 +875,13 @@ class SpMM25D { std::unique_ptr local_bcast_a_; std::unique_ptr bcast_b_; std::unique_ptr local_bcast_b_; - std::unique_ptr multiplyadd_; + std::unique_ptr> multiplyadd_; std::unique_ptr reduce_c_; + std::vector> k_cnt_; Keymap2 ij_keymap_; Keymap3 ijk_keymap_; long parallel_bcasts_; + bool enable_device_map_; }; class Control : public TT>>, Control> { @@ -1139,7 +1080,7 @@ static void initSpRmat(const std::function &)> &keymap, const c boost::minstd_rand gen(seed); boost::rmat_iterator> rmat_it(gen, N, E, a, b, c, d); - using triplet_t = Eigen::Triplet; + using triplet_t = ttg::matrix::Triplet; std::vector A_elements; for (int i = 0; i < N; i++) { nnz++; @@ -1175,7 +1116,7 @@ static void initSpHardCoded(const std::function &)> &keymap, Sp C.resize(m, n); // We initialize the same matrices on all the ranks, but we will use only the local part // following the keymap - using triplet_t = Eigen::Triplet; + using triplet_t = ttg::matrix::Triplet; std::vector A_elements; A_elements.emplace_back(0, 1, 12.3); A_elements.emplace_back(0, 2, 10.7); @@ -1222,7 +1163,7 @@ static void initBlSpHardCoded(const std::function &)> &keymap, int rank = ttg::default_execution_context().rank(); - using triplet_t = Eigen::Triplet; + using triplet_t = ttg::matrix::Triplet; std::vector A_elements; std::vector Aref_elements; #if defined(BTAS_IS_USABLE) @@ -1388,7 +1329,7 @@ static void initBlSpRandom(const std::function &)> &keymap, siz std::mt19937 genv(seed + 1); std::uniform_int_distribution<> dist(minTs, maxTs); // randomly pick any value in the range minTs, maxTs - using triplet_t = Eigen::Triplet; + using triplet_t = ttg::matrix::Triplet; std::vector A_elements; std::vector B_elements; std::vector Aref_elements; @@ -1512,7 +1453,7 @@ static void timed_measurement(SpMatrix<> &A, SpMatrix<> &B, const std::function< const std::vector> &b_cols_of_row, const std::vector> &b_rows_of_col, std::vector &mTiles, std::vector &nTiles, std::vector &kTiles, int M, int N, int K, int minTs, - int maxTs, int P, int Q, int R, int parallel_bcasts) { + int maxTs, int P, int Q, int R, int parallel_bcasts, bool enable_device_map) { int MT = (int)A.rows(); int NT = (int)B.cols(); int KT = (int)A.cols(); @@ -1539,7 +1480,7 @@ static void timed_measurement(SpMatrix<> &A, SpMatrix<> &B, const std::function< assert(!has_value(c_status)); // SpMM25D a_times_b(world, eA, eB, eC, A, B); SpMM25D<> a_times_b(eA, eB, eC, A, B, a_cols_of_row, a_rows_of_col, b_cols_of_row, b_rows_of_col, - mTiles, nTiles, kTiles, ij_keymap, ijk_keymap, R, parallel_bcasts); + mTiles, nTiles, kTiles, ij_keymap, ijk_keymap, P, Q, R, parallel_bcasts, enable_device_map); TTGUNUSED(a); TTGUNUSED(b); TTGUNUSED(a_times_b); @@ -1566,12 +1507,12 @@ static void timed_measurement(SpMatrix<> &A, SpMatrix<> &B, const std::function< std::string rt("Unkown???"); #endif if (ttg::default_execution_context().rank() == 0) { - std::cout << "TTG-" << rt << " PxQxR= " << P << " " << Q << " " << R << " 1 average_NB= " << avg_nb << " M= " << M + std::cout << "TTG-" << rt << " PxQxR= " << P << " " << Q << " " << R << " " << ttg::device::num_devices() + << " average_NB= " << avg_nb << " M= " << M << " N= " << N << " K= " << K << " t= " << minTs << " T=" << maxTs << " Tiling= " << tiling_type << " A_density= " << Adensity << " B_density= " << Bdensity << " gflops= " << gflops << " seconds= " << tc << " gflops/s= " << gflops / tc << std::endl; } - //std::cout << "num reductions " << reduce_count.load() << " tiles " << MT*KT << std::endl; } #if !defined(BLOCK_SPARSE_GEMM) @@ -1635,20 +1576,6 @@ int main(int argc, char **argv) { bool timing; double gflops; - // warm up silicon by calling gemm a few times -#ifdef BTAS_IS_USABLE - for (int i = 0; i < 20; i++) { - using baseT = typename btas::Tensor; - btas::Tensor> At(30, 30); - btas::Tensor> Bt(30, 30); - btas::Tensor> Ct(30, 30); - At.fill(1.0); - Bt.fill(2.0); - Ct.fill(3.0); - btas::gemm(std::move(Ct), Bt, At); - } -#endif // BTAS_IS_USABLE - int cores = -1; std::string nbCoreStr(getCmdOption(argv, argv + argc, "-c")); cores = parseOption(nbCoreStr, cores); @@ -1659,6 +1586,12 @@ int main(int argc, char **argv) { initialize(1, argv, cores); } +#if defined(BTAS_IS_USABLE) && defined(TTG_PARSEC_IMPORTED) + // initialize MADNESS so that TA allocators can be created + madness::ParsecRuntime::initialize_with_existing_context(ttg::default_execution_context().impl().context()); + madness::initialize(argc, argv, /* nthread = */ 1, /* quiet = */ true); +#endif // BTAS_IS_USABLE + std::string debugStr(getCmdOption(argv, argv + argc, "-d")); auto debug = (unsigned int)parseOption(debugStr, 0); @@ -1710,6 +1643,9 @@ int main(int argc, char **argv) { parallel_bcasts = std::stol(pStr); } + /* whether we set a device mapping */ + bool enable_device_map = !cmdOptionExists(argv, argv+argc, "--default-device-map"); + std::string PStr(getCmdOption(argv, argv + argc, "-P")); P = parseOption(PStr, P); std::string QStr(getCmdOption(argv, argv + argc, "-Q")); @@ -1842,6 +1778,9 @@ int main(int argc, char **argv) { minTs = parseOption(minTsStr, 32); std::string maxTsStr(getCmdOption(argv, argv + argc, "-T")); maxTs = parseOption(maxTsStr, 256); + if (minTs >= maxTs) { + maxTs = minTs; + } std::string avgStr(getCmdOption(argv, argv + argc, "-a")); double avg = parseOption(avgStr, 0.3); timing = (check == 0); @@ -1867,9 +1806,17 @@ int main(int argc, char **argv) { // Start up engine execute(); for (int nrun = 0; nrun < nb_runs; nrun++) { +#if TTG_USE_PARSEC + /* flush all PaRSEC memory */ + parsec_devices_release_memory(); +#endif // TTG_USE_PARSEC timed_measurement(A, B, ij_keymap, ijk_keymap, tiling_type, gflops, avg_nb, Adensity, Bdensity, a_cols_of_row, a_rows_of_col, b_cols_of_row, b_rows_of_col, mTiles, - nTiles, kTiles, M, N, K, minTs, maxTs, P, Q, R, parallel_bcasts); + nTiles, kTiles, M, N, K, minTs, maxTs, P, Q, R, parallel_bcasts, enable_device_map); +#if TTG_USE_PARSEC + /* reset PaRSEC's load tracking */ + parsec_devices_reset_load(default_execution_context().impl().context()); +#endif // TTG_USE_PARSEC } } else { // flow graph needs to exist on every node @@ -1890,7 +1837,7 @@ int main(int argc, char **argv) { assert(!has_value(c_status)); // SpMM25D a_times_b(world, eA, eB, eC, A, B); SpMM25D<> a_times_b(eA, eB, eC, A, B, a_cols_of_row, a_rows_of_col, b_cols_of_row, - b_rows_of_col, mTiles, nTiles, kTiles, ij_keymap, ijk_keymap, R); + b_rows_of_col, mTiles, nTiles, kTiles, ij_keymap, ijk_keymap, P, Q, R); TTGUNUSED(a_times_b); // calling the Dot constructor with 'true' argument disables the type if (default_execution_context().rank() == 0) std::cout << Dot{/*disable_type=*/true}(&control) << std::endl; @@ -1940,6 +1887,10 @@ int main(int argc, char **argv) { } } +#if defined(BTAS_IS_USABLE) && defined(TTG_PARSEC_IMPORTED) + madness::finalize(); +#endif // BTAS_IS_USABLE + ttg_finalize(); return 0; diff --git a/examples/spmm/spmm_cuda.cc b/examples/spmm/spmm_cuda.cc deleted file mode 100644 index 469aef8f7..000000000 --- a/examples/spmm/spmm_cuda.cc +++ /dev/null @@ -1,1969 +0,0 @@ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#if __has_include() -#pragma message("C Preprocessor got here!") -#include -#ifdef BTAS_IS_USABLE -#include -#include -#include -#include -#include "../devblas_helper.h" -#include // need to initialize MADNESS purely for the purposes of TA allocators -#else -#warning "found btas/features.h but Boost.Iterators is missing, hence BTAS is unusable ... add -I/path/to/boost" -#endif -#endif - -#include -#include -#if !defined(BLOCK_SPARSE_GEMM) -#include -#include -#include -#endif - -#include "ttg.h" - -#include "../devblas_helper.h" - -using namespace ttg; - -#include "ttg/util/future.h" - -#include "ttg/util/multiindex.h" - -#include "ttg/util/bug.h" - -#include "ttg/serialization/std/pair.h" - -#if defined(TTG_ENABLE_LEVEL_ZERO) -#include -#include -#endif - -#if defined(BLOCK_SPARSE_GEMM) && defined(BTAS_IS_USABLE) - -template -struct DeviceTensor : public ttg::TTValue> - , public btas::Tensor<_T, _Range, _Storage> { - using tensor_type = typename btas::Tensor<_T, _Range, _Storage>; - using ttvalue_type = typename ttg::TTValue>; - ttg::Buffer<_T> b; // does not own the host buffer - - using value_type = typename tensor_type::value_type; - using size_type = typename tensor_type::size_type; - using storage_type = typename tensor_type::storage_type; - using range_type = typename tensor_type::range_type; - - - public: - DeviceTensor() = default; - ~DeviceTensor() = default; - - /// constructor with index extent - template - explicit DeviceTensor(const size_type& first, const _args&... rest) - : ttvalue_type() - , tensor_type(first, rest...) - , b(this->size() ? this->data() : nullptr, this->size()) - { } - - /// construct from \c range, allocate data, but not initialized - template - explicit DeviceTensor(const Range& range, typename std::enable_if::value>::type* = 0) - : ttvalue_type() - , tensor_type(range) - , b(this->size() ? this->data() : nullptr, this->size()) - { } - - /// construct from \c range object, set all elements to \c v - template - DeviceTensor(const Range& range, value_type v, typename std::enable_if::value>::type* = 0) - : ttvalue_type() - , tensor_type(range) - , b(this->size() ? this->data() : nullptr, this->size()) - { } - - /// construct from \c range object, copy elements from \c vec - template - DeviceTensor(const Range& range, U* vec, typename std::enable_if::value>::type* = 0) - : ttvalue_type() - , tensor_type(range, vec) - , b(this->size() ? this->data() : nullptr, this->size()) - { } - - /// construct from \c range and \c storage - template - DeviceTensor(const Range& range, const Storage& storage, - typename std::enable_if::value & not std::is_same::value & - not std::is_same::value>::type* = 0) - : ttvalue_type() - , tensor_type(range, storage) - , b(this->size() ? this->data() : nullptr, this->size()) - { } - - /// copy-copy-construct from \c range and \c storage - DeviceTensor(const range_type& range, const storage_type& storage) - : ttvalue_type() - , tensor_type(range, storage) - , b(this->size() ? this->data() : nullptr, this->size()) - { } - - /// copy-move-construct from \c range and \c storage - DeviceTensor(const range_type& range, storage_type&& storage) - : ttvalue_type() - , tensor_type(range, std::forward(storage)) - , b(this->size() ? this->data() : nullptr, this->size()) - { } - - /// move-construct from \c range and \c storage - DeviceTensor(range_type&& range, storage_type&& storage) - : ttvalue_type() - , tensor_type(std::forward(range), std::forward(storage)) - , b(this->size() ? this->data() : nullptr, this->size()) - { } - - /// Construct an evaluated tensor - - /// This constructor will allocate memory for \c range.area() elements. Each element - /// will be initialized as: - /// \code - /// for(auto&& idx: range) - /// (*this)[idx] = op(*(it++)); - /// \endcode - /// \tparam Range An input Range type. - /// \tparam InIter An input iterator type. - /// \tparam Op A unary operation type - /// \param range the input range type - /// \param first An input iterator for the argument - /// \param op The unary operation to be applied to the argument data - template - DeviceTensor(const Range& range, InIter it, const Op& op, - typename std::enable_if::value>::type* = 0) - : ttvalue_type() - , tensor_type(range, it, op) - , b(this->size() ? this->data() : nullptr, this->size()) - { } - - /// copy constructor - /// It will accept Tensors and TensorViews - template ::value>::type> - DeviceTensor(const _Tensor& x) noexcept - : ttvalue_type() - , tensor_type(x.clone()) - , b(this->size() ? this->data() : nullptr, this->size()) - { - //std::cout << "DeviceTensor tensor_type copy ctor" << std::endl; - } - - /// copy constructor: devicebuf cannot be copied, so deleted - DeviceTensor(const DeviceTensor& x) noexcept - : ttvalue_type(x) - , tensor_type(x.clone()) - , b(this->size() ? this->data() : nullptr, this->size()) - { - //std::cout << "DeviceTensor copy ctor" << std::endl; - } - - /// move constructor - DeviceTensor(tensor_type&& x) noexcept - : ttvalue_type() - , tensor_type(std::move(x)) - , b(this->size() ? this->data() : nullptr, this->size()) - { - //std::cout << "DeviceTensor tensor_type move ctor" << std::endl; - } - - DeviceTensor(DeviceTensor&& x) noexcept - : ttvalue_type(std::move(x)) - , tensor_type(std::move(x)) - /* Grrrr, moving a Tensor does not guarantee to move the pointer */ - , b((this->size() == 0 || - this->data() == x.b.host_ptr()) ? std::move(x.b) - : ttg::Buffer<_T>(this->size() ? this->data() - : nullptr, - this->size())) - { - assert(this->data() == b.host_ptr()); - //std::cout << "DeviceTensor move ctor" << std::endl; - } - - /// copy assignment operator - template ::value && - not std::is_same::value>::type> - DeviceTensor& operator=(const _Tensor& x) noexcept { - tensor_type::operator=(x.clone()); - b.reset(this->size() ? this->data() : nullptr, this->size()); - //std::cout << "DeviceTensor tensor_type copy operator" << std::endl; - return *this; - } - - /// copy assignment operator - template ::value>::type, - class = typename std::enable_if< - std::is_same::value>::type> - DeviceTensor& operator=(const _Tensor& x) noexcept { - tensor_type::operator=(x.clone()); - b.reset(this->size() ? this->data() : nullptr, this->size()); - //std::cout << "DeviceTensor tensor_type copy operator" << std::endl; - return *this; - } - - /// copy assignment: devicebuf cannot be copied, deleted - DeviceTensor& operator=(const DeviceTensor& x) noexcept { - ttvalue_type::operator=(x); - tensor_type::operator=(x.clone()); - b.reset(this->size() ? this->data() : nullptr, this->size()); - //std::cout << "DeviceTensor copy operator" << std::endl; - return *this; - } - - /// move assignment operator - DeviceTensor& operator=(DeviceTensor&& x) noexcept { - ttvalue_type::operator=(std::move(x)); - tensor_type::operator=(std::move(x)); - if (this->size() == 0 || this->data() == x.b.host_ptr()){ - b = std::move(x.b); - } else { - b = ttg::Buffer<_T>(this->size() ? this->data() : nullptr, this->size()); - } - //std::swap(x.b, b); - //std::cout << "DeviceTensor move ctor" << std::endl; - return *this; - } - - using tensor_type::begin; - using tensor_type::cbegin; - using tensor_type::end; - using tensor_type::cend; - -}; - -using scalar_t = double; -#if defined(TTG_ENABLE_CUDA) || defined(TTG_ENABLE_HIP) -using blk_t = DeviceTensor>, - btas::Handle::shared_ptr>>; -#else -using blk_t = DeviceTensor, btas::Handle::shared_ptr>>; -#endif - - -//inline blk_t operator*(const blk_t &A, const blk_t &B) { -// blk_t::tensor_type c; -// btas::contract(1.0, A, {1, 2}, B, {2, 3}, 0.0, c, {1, 3}); -// return blk_t(std::move(c)); -//} - -/* TODO: call CUDA gemm here */ -template -static void device_gemm(Blk &C, const Blk &A, const Blk &B) { - using blk_t = Blk; - using T = typename blk_t::value_type; - static_assert(std::is_same_v || std::is_same_v); - static const T alpha = 1.0; - static const T beta = 1.0; - // make sure all memory is on the device - // TODO: A and B are read-only so the owner device will be 0. How to fix? - //assert(A.b.get_current_device() != 0); - //assert(B.b.get_current_device() != 0); - auto device = ttg::device::current_device(); - assert(device.is_device()); -#if defined(TTG_ENABLE_CUDA) - if constexpr (std::is_same_v) { - cublasDgemm(cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, C.extent(0), C.extent(1), A.extent(1), - &alpha, A.b.current_device_ptr(), A.extent(0), B.b.current_device_ptr(), B.extent(0), &beta, - C.b.current_device_ptr(), C.extent(0)); - } - else if constexpr (std::is_same_v) { - cublasSgemm(cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, C.extent(0), C.extent(1), A.extent(1), - &alpha, A.b.current_device_ptr(), A.extent(0), B.b.current_device_ptr(), B.extent(0), &beta, - C.b.current_device_ptr(), C.extent(0)); - } -#elif defined(TTG_ENABLE_HIP) - if constexpr (std::is_same_v) { - hipblasDgemm(hipblas_handle(), - HIPBLAS_OP_N, HIPBLAS_OP_N, - C.extent(0), C.extent(1), A.extent(1), &alpha, - A.b.current_device_ptr(), A.extent(0), - B.b.current_device_ptr(), B.extent(0), &beta, - C.b.current_device_ptr(), C.extent(0)); - } else if constexpr (std::is_same_v) { - hipblasSgemm(hipblas_handle(), - HIPBLAS_OP_N, HIPBLAS_OP_N, - C.extent(0), C.extent(1), A.extent(1), &alpha, - A.b.current_device_ptr(), A.extent(0), - B.b.current_device_ptr(), B.extent(0), &beta, - C.b.current_device_ptr(), C.extent(0)); - } -#elif defined(TTG_ENABLE_LEVEL_ZERO) - -#if defined(DEBUG_SYNCHRONOUS) - try { -#endif /* DEBUG_SYNCHRONOUS */ - cl::sycl::event gemm_event; - gemm_event = oneapi::mkl::blas::gemm(ttg::device::current_stream(), - oneapi::mkl::transpose::N, oneapi::mkl::transpose::N, - C.extent(0), C.extent(1), A.extent(1), - alpha, A.b.current_device_ptr(), A.extent(0), - B.b.current_device_ptr(), B.extent(0), - beta, C.b.current_device_ptr(), C.extent(0)); -#if defined(DEBUG_SYNCHRONOUS) - gemm_event.wait(); - } catch (const oneapi::mkl::invalid_argument &e) { - std::cerr << "OneAPI MKL BLAS GEMM throws invalid argument exception" << std::endl; - } catch (const oneapi::mkl::unsupported_device &e) { - std::cerr << "OneAPI MKL BLAS GEMM throws unsuported device exception" << std::endl; - } catch (const oneapi::mkl::host_bad_alloc &e) { - std::cerr << "OneAPI MKL BLAS GEMM throws host bad allocation exception" << std::endl; - } catch (const oneapi::mkl::device_bad_alloc &e) { - std::cerr << "OneAPI MKL BLAS GEMM throws device bad allocation exception" << std::endl; - } catch (const oneapi::mkl::unimplemented &e) { - std::cerr << "OneAPI MKL BLAS GEMM throws unimplemented exception" << std::endl; - } catch (const std::exception& e) { - std::cerr << "OneAPI MKL BLAS GEMM throws unexpected exception" << std::endl; - } catch (...) { - std::cerr << "OneAPI MKL BLAS GEMM throws unexpected exception that is also badly formatted..." << std::endl; - } -#endif /* DEBUG_SYNCHRONOUS */ -#endif -} - -#if defined(TTG_USE_PARSEC) -namespace ttg { - template <> - struct SplitMetadataDescriptor { - // TODO: this is a quick and dirty approach. - // - blk_t could have any number of dimensions, this code only works for 2 dim blocks - // - we use Blk{} to send a control flow in some tasks below, these blocks have only - // 1 dimension (of size 0), to code this, we set the second dimension to 0 in our - // quick and dirty linearization, then have a case when we create the object - // - when we create the object with the metadata, we use a constructor that initializes - // the data to 0, which is useless: the data could be left uninitialized - static auto get_metadata(const blk_t &b) { - std::pair dim{0, 0}; - if (!b.empty()) { - assert(b.range().extent().size() == 2); - std::get<0>(dim) = (int)b.range().extent(0); - std::get<1>(dim) = (int)b.range().extent(1); - } - return dim; - } - static auto get_data(blk_t &b) { - using T = typename blk_t::value_type; - if (!b.empty()) - return boost::container::small_vector(1, iovec{b.size() * sizeof(T), b.data()}); - else - return boost::container::small_vector{}; - } - static auto create_from_metadata(const std::pair &meta) { - if (meta != std::pair{0, 0}) // N.B. allocate only, do not fill with zeroes - return blk_t(btas::Range(std::get<0>(meta), std::get<1>(meta))); - else - return blk_t{}; - } - }; -} // namespace ttg -#endif /* TTG_USE_PARSEC */ - -// declare btas::Tensor serializable by Boost -#include "ttg/serialization/backends/boost.h" -namespace ttg::detail { - // BTAS defines all of its Boost serializers in boost::serialization namespace ... as explained in - // ttg/serialization/boost.h such functions are not detectable via SFINAE, so must explicitly define serialization - // traits here - template - inline static constexpr bool is_boost_serializable_v = is_boost_archive_v; - template - inline static constexpr bool is_boost_serializable_v = is_boost_archive_v; -} // namespace ttg::detail - -#else -using blk_t = double; -#endif -template -using SpMatrix = Eigen::SparseMatrix; -template -using SpMatrixTriplet = Eigen::Triplet; // {row,col,value} - -#if defined(BLOCK_SPARSE_GEMM) && defined(BTAS_IS_USABLE) - -#if __has_include() - -#include - -#endif // __has_include() - -namespace btas { - template - inline btas::Tensor operator*(const btas::Tensor &A, - const btas::Tensor &B) { - btas::Tensor C; - btas::contract(1.0, A, {1, 2}, B, {2, 3}, 0.0, C, {1, 3}); - return C; - } - - template - btas::Tensor gemm(btas::Tensor &&C, const btas::Tensor &A, - const btas::Tensor &B) { - using array = btas::DEFAULT::index; - if (C.empty()) { // first contribution to C = allocate it and gemm with beta=0 - C = btas::Tensor(btas::Range(A.range().extent(0), B.range().extent(1))); - btas::contract_222(1.0, A, array{1, 2}, B, array{2, 3}, 0.0, C, array{1, 3}, false, false); - } - else { // subsequent contributions to C = gemm with beta=1 - btas::contract_222(1.0, A, array{1, 2}, B, array{2, 3}, 1.0, C, array{1, 3}, false, false); - } - return std::move(C); - } -} // namespace btas -#endif // BTAS_IS_USABLE -double gemm(double C, double A, double B) { return C + A * B; } - -// template -// struct colmajor_layout; -// template -// struct colmajor_layout<_Scalar, Eigen::ColMajor, _StorageIndex> : public std::true_type {}; -// template -// struct colmajor_layout<_Scalar, Eigen::RowMajor, _StorageIndex> : public std::false_type {}; - -template -using Key = MultiIndex; - -/// maps {i,j} to rank within first (R=0) layer of the 3-d process grid -inline int ij2rank(int i, int j, int P, int Q) { - std::vector vec; - int p = (i % P); - int q = (j % Q); - int rank = (q * P) + p; - return rank; -} - -/// maps {i,j,k} to rank within a 3-d process grid -inline int ijk2rank(int i, int j, int k, int P, int Q, int R) { - std::vector vec; - int p = (i % P); - int q = (j % Q); - int l = (k % R); - int rank = (l * P * Q) + (q * P) + p; - return rank; -} - -// flow data from an existing SpMatrix on rank 0 -template &)>> -class Read_SpMatrix : public TT, std::tuple, Blk>>, Read_SpMatrix, ttg::typelist> { - public: - using baseT = typename Read_SpMatrix::ttT; - Read_SpMatrix(const char *label, const SpMatrix &matrix, Edge> &ctl, Edge, Blk> &out, - Keymap &ij_keymap) - : baseT(edges(ctl), edges(out), std::string("read_spmatrix(") + label + ")", {"ctl"}, {std::string(label) + "ij"}, - ij_keymap) - , matrix_(matrix) {} - - void op(const Key<2> &, std::tuple, Blk>> &out) { - auto rank = ttg::default_execution_context().rank(); - for (int k = 0; k < matrix_.outerSize(); ++k) { - for (typename SpMatrix::InnerIterator it(matrix_, k); it; ++it) { - if (rank == this->get_keymap()(Key<2>(std::initializer_list({it.row(), it.col()})))) - ::send<0>(Key<2>(std::initializer_list({it.row(), it.col()})), ttg::persistent(it.value()), out); - } - } - } - - private: - const SpMatrix &matrix_; -}; - -// flow (move?) data into an existing SpMatrix on rank 0 -template -class Write_SpMatrix : public TT, std::tuple<>, Write_SpMatrix, ttg::typelist> { - public: - using baseT = typename Write_SpMatrix::ttT; - - template - Write_SpMatrix(SpMatrix &matrix, Edge, Blk> &in, Keymap2 &&ij_keymap, bool write_back = false) - : baseT(edges(in), edges(), "write_spmatrix", {"Cij"}, {}, ij_keymap) - , matrix_(matrix) - , write_back(write_back) - { } - - void op(const Key<2> &key, typename baseT::input_refs_tuple_type &&elem, std::tuple<> &) { - - if (write_back) { - std::lock_guard lock(mtx_); - ttg::trace("rank =", default_execution_context().rank(), - "/ thread_id =", reinterpret_cast(pthread_self()), "spmm.cc Write_SpMatrix wrote {", - key[0], ",", key[1], "} = ", baseT::template get<0>(elem), " in ", static_cast(&matrix_), - " with mutex @", static_cast(&mtx_), " for object @", static_cast(this)); - values_.emplace_back(key[0], key[1], std::move(baseT::template get<0>(elem))); - } - } - - /// grab completion status as a future - /// \note cannot be called once this is executable - const std::shared_future &status() const { - assert(!this->is_executable()); - if (!completion_status_) { // if not done yet, register completion work with the world - auto promise = std::make_shared>(); - completion_status_ = std::make_shared>(promise->get_future()); - ttg_register_status(this->get_world(), std::move(promise)); - ttg_register_callback(this->get_world(), - [this]() { this->matrix_.setFromTriplets(this->values_.begin(), this->values_.end()); }); - } else { // if done already, commit the result - this->matrix_.setFromTriplets(this->values_.begin(), this->values_.end()); - } - return *completion_status_; - } - - private: - std::mutex mtx_; - SpMatrix &matrix_; - std::vector> values_; - mutable std::shared_ptr> completion_status_; - bool write_back = false; -}; - -/// sparse mm via 2.5D SUMMA - -/// @tparam KeyMap2 maps {i,j} to processor -/// @tparam KeyMap3 maps {i,j,k} to processor -template &)>, typename Keymap3 = std::function &)>, - typename Blk = blk_t> -class SpMM25D { - public: - /// @param ij_keymap maps {i,j} to process, specifies distribution of tiles of A, B, and C - /// @param ijk_keymap maps {i,j,k} to process, controls distribution of tasks performing C[i][j] += A[i][k]*B[k][j] - /// @param R the number of "layers" in the 3-D process grid - SpMM25D(Edge, Blk> &a, Edge, Blk> &b, Edge, Blk> &c, const SpMatrix &a_mat, - const SpMatrix &b_mat, const std::vector> &a_rowidx_to_colidx, - const std::vector> &a_colidx_to_rowidx, - const std::vector> &b_rowidx_to_colidx, - const std::vector> &b_colidx_to_rowidx, const std::vector &mTiles, - const std::vector &nTiles, const std::vector &kTiles, Keymap2 ij_keymap, Keymap3 ijk_keymap, long R) - : a_rowidx_to_colidx_(a_rowidx_to_colidx) - , b_colidx_to_rowidx_(b_colidx_to_rowidx) - , a_colidx_to_rowidx_(a_colidx_to_rowidx) - , b_rowidx_to_colidx_(b_rowidx_to_colidx) - , ij_keymap_(std::move(ij_keymap)) - , ijk_keymap_(std::move(ijk_keymap)) { - bcast_a_ = std::make_unique(a, local_a_ijk_, b_rowidx_to_colidx_, ij_keymap_, ijk_keymap_); - local_bcast_a_ = std::make_unique(local_a_ijk_, a_ijk_, b_rowidx_to_colidx_, ijk_keymap_); - bcast_b_ = std::make_unique(b, local_b_ijk_, a_colidx_to_rowidx_, ij_keymap_, ijk_keymap_); - local_bcast_b_ = std::make_unique(local_b_ijk_, b_ijk_, a_colidx_to_rowidx_, ijk_keymap_); - multiplyadd_ = std::make_unique(a_ijk_, b_ijk_, c_ijk_, c_ij_p_, a_rowidx_to_colidx_, - b_colidx_to_rowidx_, mTiles, nTiles, ijk_keymap_); - reduce_c_ = std::make_unique(c_ij_p_, c, ij_keymap_); - reduce_c_->template set_input_reducer<0>([](Blk &c_ij, const Blk &c_ij_p) { c_ij = c_ij + c_ij_p; }); - // compute how many contributions each C[i][j] should expect ... MultiplyAdd already does this, but need a way to - // send message from each process p to the process owning C[i][j] to expect a contribution from it for now replicate - // this logic ... - // TODO: do this in MultiplyAdd (need to allreduce this info so that everyone has it) - // N.B. only need to set stream size on the rank that will accumulate the C[i][j] contribution - const auto my_rank = ttg::default_execution_context().rank(); - for (auto i = 0ul; i != a_rowidx_to_colidx_.size(); ++i) { - if (a_rowidx_to_colidx_[i].empty()) continue; - for (auto j = 0ul; j != b_colidx_to_rowidx_.size(); ++j) { - if (b_colidx_to_rowidx_[j].empty()) continue; - - if (ij_keymap_(Key<2>{i, j}) == my_rank) { - decltype(i) k; - bool have_k; - std::tie(k, have_k) = multiplyadd_->compute_first_k(i, j); - std::vector c_ij_procmask(R, false); - if (have_k) { - const auto pR = k % R; // k values are distributed round-robin among the layers of the 3-D grid - assert(pR < c_ij_procmask.size()); - c_ij_procmask[pR] = true; - while (have_k) { - std::tie(k, have_k) = multiplyadd_->compute_next_k(i, j, k); - if (have_k) { - const auto pR = k % R; - assert(pR < c_ij_procmask.size()); - c_ij_procmask[pR] = true; - } - } - } - const auto c_ij_nprocs = std::count_if(c_ij_procmask.begin(), c_ij_procmask.end(), [](bool b) { return b; }); - if (c_ij_nprocs > 0) reduce_c_->template set_argstream_size<0>(Key<2>{i, j}, c_ij_nprocs); - } - } - } - - TTGUNUSED(bcast_a_); - TTGUNUSED(bcast_b_); - TTGUNUSED(multiplyadd_); - TTGUNUSED(reduce_c_); - } - - /// Locally broadcast `A[i][k]` assigned to this processor `p` to matmul tasks `{i,j,k}` for all `j` such that - /// `B[k][j]` exists AND `k` contribution to `C[i][j]` is assigned to this processor - class LocalBcastA : public TT, std::tuple, Blk>>, LocalBcastA, ttg::typelist> { - public: - using baseT = typename LocalBcastA::ttT; - - LocalBcastA(Edge, Blk> &a, Edge, Blk> &a_ijk, - const std::vector> &b_rowidx_to_colidx, const Keymap3 &ijk_keymap) - : baseT(edges(a), edges(a_ijk), "SpMM25D::local_bcast_a", {"a_ikp"}, {"a_ijk"}, - [](const Key<3> &ikp) { return ikp[2]; }) - , b_rowidx_to_colidx_(b_rowidx_to_colidx) - , ijk_keymap_(ijk_keymap) {} - - void op(const Key<3> &ikp, typename baseT::input_refs_tuple_type &&a_ik, std::tuple, Blk>> &a_ijk) { - const auto i = ikp[0]; - const auto k = ikp[1]; - const auto p = ikp[2]; - - auto world = default_execution_context(); - assert(p == world.rank()); - ttg::trace("LocalBcastA(", i, ", ", k, ", ", p, ")"); - if (k >= b_rowidx_to_colidx_.size()) return; - // local broadcast a_ik to all {i,j,k} such that b_kj exists - std::vector> ijk_keys; - for (auto &j : b_rowidx_to_colidx_[k]) { - if (ijk_keymap_(Key<3>({i, j, k})) == world.rank()) { - ttg::trace("Broadcasting A[", i, "][", k, "] on proc ", p, " to j=", j); - ijk_keys.emplace_back(Key<3>({i, j, k})); - } - } - ::broadcast<0>(ijk_keys, std::move(baseT::template get<0>(a_ik)), a_ijk); - } - - private: - const std::vector> &b_rowidx_to_colidx_; - const Keymap3 &ijk_keymap_; - }; // class LocalBcastA - - /// broadcast `A[i][k]` to all processors which will contain at least one `C[i][j]` such that `B[k][j]` exists - class BcastA : public TT, std::tuple, Blk>>, BcastA, ttg::typelist> { - public: - using baseT = typename BcastA::ttT; - - BcastA(Edge, Blk> &a_ik, Edge, Blk> &a_ikp, const std::vector> &b_rowidx_to_colidx, - const Keymap2 &ij_keymap, const Keymap3 &ijk_keymap) - : baseT(edges(a_ik), edges(a_ikp), "SpMM25D::bcast_a", {"a_ik"}, {"a_ikp"}, ij_keymap) - , b_rowidx_to_colidx_(b_rowidx_to_colidx) - , ijk_keymap_(ijk_keymap) {} - - void op(const Key<2> &ik, typename baseT::input_refs_tuple_type &&a_ik, std::tuple, Blk>> &a_ikp) { - const auto i = ik[0]; - const auto k = ik[1]; - ttg::trace("BcastA(", i, ", ", k, ")"); - std::vector> ikp_keys; - - if (k >= b_rowidx_to_colidx_.size()) return; - auto world = default_execution_context(); - std::vector procmap(world.size()); - for (auto &j : b_rowidx_to_colidx_[k]) { - const long p = ijk_keymap_(Key<3>( - {i, j, k})); // N.B. in 2.5D SUMMA different k contributions to C[i][j] are computed on different nodes - if (!procmap[p]) { - ttg::trace("Broadcasting A[", i, "][", k, "] to proc ", p); - ikp_keys.emplace_back(Key<3>({i, k, p})); - procmap[p] = true; - } - } - ::broadcast<0>(ikp_keys, std::move(baseT::template get<0>(a_ik)), a_ikp); - } - - private: - const std::vector> &b_rowidx_to_colidx_; - const Keymap3 &ijk_keymap_; - }; // class BcastA - - /// Locally broadcast `B[k][j]` assigned to this processor `p` to matmul tasks `{i,j,k}` for all `k` such that - /// `A[i][k]` exists AND `k` contribution to `C[i][j]` is assigned to this processor - class LocalBcastB : public TT, std::tuple, Blk>>, LocalBcastB, ttg::typelist> { - public: - using baseT = typename LocalBcastB::ttT; - - LocalBcastB(Edge, Blk> &b_kjp, Edge, Blk> &b_ijk, - const std::vector> &a_colidx_to_rowidx, const Keymap3 &ijk_keymap) - : baseT(edges(b_kjp), edges(b_ijk), "SpMM25D::local_bcast_b", {"b_kjp"}, {"b_ijk"}, - [](const Key<3> &kjp) { return kjp[2]; }) - , a_colidx_to_rowidx_(a_colidx_to_rowidx) - , ijk_keymap_(ijk_keymap) {} - - void op(const Key<3> &kjp, typename baseT::input_refs_tuple_type &&b_kj, std::tuple, Blk>> &b_ijk) { - const auto k = kjp[0]; - const auto j = kjp[1]; - const auto p = kjp[2]; - auto world = default_execution_context(); - assert(p == world.rank()); - ttg::trace("BcastB(", k, ", ", j, ", ", p, ")"); - if (k >= a_colidx_to_rowidx_.size()) return; - // broadcast b_kj to all ijk for which c_ij is on this processor and a_ik exists - std::vector> ijk_keys; - for (auto &i : a_colidx_to_rowidx_[k]) { - if (ijk_keymap_(Key<3>({i, j, k})) == world.rank()) { - ttg::trace("Broadcasting B[", k, "][", j, "] on proc ", p, " to i=", i); - ijk_keys.emplace_back(Key<3>({i, j, k})); - } - } - ::broadcast<0>(ijk_keys, std::move(baseT::template get<0>(b_kj)), b_ijk); - } - - private: - const std::vector> &a_colidx_to_rowidx_; - const Keymap3 &ijk_keymap_; - }; // class LocalBcastB - - /// broadcast `B[k][j]` to all processors which will contain at least one `C[i][j]` such that `A[i][k]` exists - class BcastB : public TT, std::tuple, Blk>>, BcastB, ttg::typelist> { - public: - using baseT = typename BcastB::ttT; - - BcastB(Edge, Blk> &b_kj, Edge, Blk> &b_kjp, const std::vector> &a_colidx_to_rowidx, - const Keymap2 &ij_keymap, const Keymap3 &ijk_keymap) - : baseT(edges(b_kj), edges(b_kjp), "SpMM25D::bcast_b", {"b_kj"}, {"b_kjp"}, ij_keymap) - , a_colidx_to_rowidx_(a_colidx_to_rowidx) - , ijk_keymap_(ijk_keymap) {} - - void op(const Key<2> &kj, typename baseT::input_refs_tuple_type &&b_kj, std::tuple, Blk>> &b_kjp) { - const auto k = kj[0]; - const auto j = kj[1]; - // broadcast b_kj to all processors which will contain at least one c_ij such that a_ik exists - std::vector> kjp_keys; - ttg::trace("BcastB(", k, ", ", j, ")"); - if (k >= a_colidx_to_rowidx_.size()) return; - auto world = default_execution_context(); - std::vector procmap(world.size()); - for (auto &i : a_colidx_to_rowidx_[k]) { - long p = ijk_keymap_(Key<3>({i, j, k})); - if (!procmap[p]) { - ttg::trace("Broadcasting B[", k, "][", j, "] to proc ", p); - kjp_keys.emplace_back(Key<3>({k, j, p})); - procmap[p] = true; - } - } - ::broadcast<0>(kjp_keys, std::move(baseT::template get<0>(b_kj)), b_kjp); - } - - private: - const std::vector> &a_colidx_to_rowidx_; - const Keymap3 &ijk_keymap_; - }; // class BcastB - - /// multiply task has 3 input flows: a_ijk, b_ijk, and c_ijk, c_ijk contains the running total for this kayer of the - /// 3-D process grid only - class MultiplyAdd : public TT, std::tuple, Blk>, Out, Blk>>, MultiplyAdd, - ttg::typelist> { - public: - using baseT = typename MultiplyAdd::ttT; - -#if defined(TTG_ENABLE_CUDA) - static constexpr bool have_cuda_op = true; -#warning SPMM using CUDA implementation -#elif defined(TTG_ENABLE_HIP) - static constexpr bool have_hip_op = true; -#warning SPMM using HIP implementation -#elif defined(TTG_ENABLE_LEVEL_ZERO) - static constexpr bool have_level_zero_op = true; -#warning SPMM using LEVEL_ZERO implementation -#else -#error No valid device implementation found! -#endif - - MultiplyAdd(Edge, Blk> &a_ijk, Edge, Blk> &b_ijk, Edge, Blk> &c_ijk, Edge, Blk> &c, - const std::vector> &a_rowidx_to_colidx, - const std::vector> &b_colidx_to_rowidx, const std::vector &mTiles, - const std::vector &nTiles, const Keymap3 &ijk_keymap) - : baseT(edges(a_ijk, b_ijk, c_ijk), edges(c, c_ijk), "SpMM25D::MultiplyAdd", {"a_ijk", "b_ijk", "c_ijk"}, - {"c_ij", "c_ijk"}, ijk_keymap) - , a_rowidx_to_colidx_(a_rowidx_to_colidx) - , b_colidx_to_rowidx_(b_colidx_to_rowidx) { - this->set_priomap([this](const Key<3> &ijk) { return this->prio(ijk); }); // map a key to an integral priority value - auto num_devices = ttg::device::num_devices(); - this->set_devicemap( - [num_devices](const Key<3> &ijk){ - return ((((uint64_t)ijk[0]) << 32) + ijk[1]) % num_devices; - }); - // for each {i,j} determine first k that contributes AND belongs to this node, - // initialize input {i,j,first_k} flow to 0 - for (auto i = 0ul; i != a_rowidx_to_colidx_.size(); ++i) { - if (a_rowidx_to_colidx_[i].empty()) continue; - for (auto j = 0ul; j != b_colidx_to_rowidx_.size(); ++j) { - if (b_colidx_to_rowidx_[j].empty()) continue; - - const auto p = ttg::default_execution_context().rank(); - decltype(i) k; - bool have_k; - std::tie(k, have_k) = compute_first_k(i, j, p); - if (have_k) { - ttg::trace("Initializing C[", i, "][", j, "] on process ", p, " to zero"); -#if BLOCK_SPARSE_GEMM - Blk zero(btas::Range(mTiles[i], nTiles[j]), 0.0); -#else - Blk zero{0.0}; -#endif - this->template in<2>()->send(Key<3>({i, j, k}), zero); - } else { - if (tracing() && a_rowidx_to_colidx_.size() * b_colidx_to_rowidx_.size() < 400) - ttg::print("C[", i, "][", j, "] is empty"); - } - } - } - } - - ttg::device::Task op(const Key<3> &ijk, typename baseT::input_refs_tuple_type &&_ijk, - std::tuple, Blk>, Out, Blk>> &result) { - const auto i = ijk[0]; - const auto j = ijk[1]; - const auto k = ijk[2]; // k==l same because 000 will always be on layer 0, 001 will be accessed on layer 1 - const auto p = ttg::default_execution_context().rank(); - long next_k; - bool have_next_k; - - const blk_t& A = baseT::template get<0>(_ijk); - const blk_t& B = baseT::template get<1>(_ijk); - blk_t& C = baseT::template get<2>(_ijk); - - if (C.empty()) { - C = blk_t(btas::Range(A.range().extent(0), B.range().extent(1)), 0.0); - } - - /* pull all buffers onto the device */ - co_await ttg::device::select(A.b, B.b, C.b); - - /* everything is on the device, call the gemm */ - device_gemm(C, A, B); - - /* compute next k while the kernel is running */ - std::tie(next_k, have_next_k) = compute_next_k(i, j, k, p); - ttg::trace("Rank ", ttg::default_execution_context().rank(), - " :" - " C[", - i, "][", j, "] += A[", i, "][", k, "] by B[", k, "][", j, "], next_k? ", - (have_next_k ? std::to_string(next_k) : "does not exist")); - - /* wait for the kernel to complete */ - co_await ttg::device::wait(); - - - // compute the contrib, pass the running total to the next flow, if needed - // otherwise write to the result flow - if (have_next_k) { - co_await ttg::device::forward(ttg::device::send<1>( - Key<3>({i, j, next_k}), - std::move(C), - result)); - } else { // done with all local contributions to C[i][j], reduce with others on the process to which C[i][j] - // belongs - co_await ttg::device::forward(ttg::device::send<0>( - Key<2>({i, j}), - std::move(C), - result)); - } - } - - private: - const std::vector> &a_rowidx_to_colidx_; - const std::vector> &b_colidx_to_rowidx_; - - /* Compute the length of the remaining sequence on that tile */ - int32_t prio(const Key<3> &key) const { - const auto i = key[0]; - const auto j = key[1]; - const auto k = key[2]; - int32_t len = -1; // will be incremented at least once - long next_k = k; - bool have_next_k; - do { - std::tie(next_k, have_next_k) = compute_next_k(i, j, next_k); // here I know how many 'k' I have with same ij - ++len; - } while (have_next_k); - return len; - } - - public: // to be able to reuse this logic in SpMM25D - // given {i,j} return first k such that A[i][k] and B[k][j] exist - std::tuple compute_first_k(long i, long j) const { - const auto &a_k_range = a_rowidx_to_colidx_.at(i); - auto a_iter = a_k_range.begin(); - auto a_iter_fence = a_k_range.end(); - if (a_iter == a_iter_fence) return std::make_tuple(-1, false); - const auto &b_k_range = b_colidx_to_rowidx_.at(j); - auto b_iter = b_k_range.begin(); - auto b_iter_fence = b_k_range.end(); - if (b_iter == b_iter_fence) return std::make_tuple(-1, false); - - { - auto a_colidx = *a_iter; // pointing to next kth element - auto b_rowidx = *b_iter; - while (a_colidx != b_rowidx) { - if (a_colidx < b_rowidx) { - ++a_iter; - if (a_iter == a_iter_fence) return std::make_tuple(-1, false); - a_colidx = *a_iter; - } else { - ++b_iter; - if (b_iter == b_iter_fence) return std::make_tuple(-1, false); - b_rowidx = *b_iter; - } - } - return std::make_tuple(a_colidx, true); // returned true for kth element exist and also returns next k since - // a_colidx points to ++a_iter, if not reaches to fence - } - assert(false); - } - - // given {i,j,k} such that A[i][k] and B[k][j] exist - // return next k such that this condition holds - std::tuple compute_next_k(long i, long j, long k) const { - const auto &a_k_range = a_rowidx_to_colidx_.at(i); - auto a_iter_fence = a_k_range.end(); - auto a_iter = std::find(a_k_range.begin(), a_iter_fence, k); - assert(a_iter != a_iter_fence); - const auto &b_k_range = b_colidx_to_rowidx_.at(j); - auto b_iter_fence = b_k_range.end(); - auto b_iter = std::find(b_k_range.begin(), b_iter_fence, k); - assert(b_iter != b_iter_fence); - while (a_iter != a_iter_fence && b_iter != b_iter_fence) { - ++a_iter; - ++b_iter; - if (a_iter == a_iter_fence || b_iter == b_iter_fence) return std::make_tuple(-1, false); - auto a_colidx = *a_iter; - auto b_rowidx = *b_iter; - while (a_colidx != b_rowidx) { - if (a_colidx < b_rowidx) { - ++a_iter; - if (a_iter == a_iter_fence) return std::make_tuple(-1, false); - a_colidx = *a_iter; - } else { - ++b_iter; - if (b_iter == b_iter_fence) return std::make_tuple(-1, false); - b_rowidx = *b_iter; - } - } - return std::make_tuple(a_colidx, true); - } - ttg::abort(); // unreachable - return std::make_tuple(0, false); - } - - // given {i,j} return first k such that A[i][k] and B[k][j] exist AND ijk_keymap_(i,j,k) == p - std::tuple compute_first_k(long i, long j, long p) const { - long first_k = 0; - bool have_k = false; - std::tie(first_k, have_k) = compute_first_k(i, j); - while (have_k) { - if (this->get_keymap()(Key<3>{i, j, first_k}) == p) - return {first_k, true}; - else - std::tie(first_k, have_k) = compute_next_k(i, j, first_k); - } - return {0, false}; - } - - // given {i,j,k} such that A[i][k] and B[k][j] exist - // return next k such that this condition holds AND ijk_keymap_(i,j,k) == p - std::tuple compute_next_k(long i, long j, long k, long p) const { - long next_k = 0; - bool have_k = false; - std::tie(next_k, have_k) = compute_next_k(i, j, k); - while (have_k) { - if (this->get_keymap()(Key<3>{i, j, next_k}) == p) - return {next_k, true}; - else - std::tie(next_k, have_k) = compute_next_k(i, j, next_k); - } - return {0, false}; - } - - }; // MultiplyAdd - - /// reduces contributions to `C[i][j]` produced on different layers of the 3-d process grid - class ReduceC : public TT, std::tuple, Blk>>, ReduceC, ttg::typelist> { - public: - using baseT = typename ReduceC::ttT; - - ReduceC(Edge, Blk> &c_ij_p, Edge, Blk> &c_ij, const Keymap2 &ij_keymap) - : baseT(edges(c_ij_p), edges(c_ij), "SpMM25D::reduce_c", {"c_ij(p)"}, {"c_ij"}, ij_keymap) {} - - void op(const Key<2> &ij, typename baseT::input_refs_tuple_type &&c_ij_p, std::tuple, Blk>> &c_ij) { - ttg::trace("ReduceC(", ij[0], ", ", ij[1], ")"); - ::send<0>(ij, std::move(baseT::template get<0>(c_ij_p)), c_ij); - } - }; // class ReduceC - - private: - Edge, Blk> a_ijk_; - Edge, Blk> local_a_ijk_; - Edge, Blk> b_ijk_; - Edge, Blk> local_b_ijk_; - Edge, Blk> c_ijk_; - Edge, Blk> c_ij_p_; - const std::vector> &a_rowidx_to_colidx_; - const std::vector> &b_colidx_to_rowidx_; - const std::vector> &a_colidx_to_rowidx_; - const std::vector> &b_rowidx_to_colidx_; - std::unique_ptr bcast_a_; - std::unique_ptr local_bcast_a_; - std::unique_ptr bcast_b_; - std::unique_ptr local_bcast_b_; - std::unique_ptr multiplyadd_; - std::unique_ptr reduce_c_; - Keymap2 ij_keymap_; - Keymap3 ijk_keymap_; -}; - -class Control : public TT>>, Control> { - using baseT = typename Control::ttT; - int P; - int Q; - - public: - explicit Control(Edge> &ctl) : baseT(edges(), edges(ctl), "Control", {}, {"ctl"}), P(0), Q(0) {} - - void op(std::tuple>> &out) const { - for (int p = 0; p < P; p++) { - for (int q = 0; q < Q; q++) { - ttg::trace("Control: start computing on process {", p, ", ", q, "}"); - ::sendk<0>(Key<2>{p, q}, out); - } - } - } - - void start(const int _p, const int _q) { - P = _p; - Q = _q; - invoke(); - } -}; - -#ifdef BTAS_IS_USABLE -template -std::tuple norms(const btas::Tensor &t) { - T_ norm_2_square = 0.0; - T_ norm_inf = 0.0; - for (auto k : t) { - norm_2_square += k * k; - norm_inf = std::max(norm_inf, std::abs(k)); - } - return std::make_tuple(norm_2_square, norm_inf); -} -#endif - -std::tuple norms(double t) { return std::make_tuple(t * t, std::abs(t)); } - -template -std::tuple norms(const SpMatrix &A) { - double norm_2_square = 0.0; - double norm_inf = 0.0; - for (int i = 0; i < A.outerSize(); ++i) { - for (typename SpMatrix::InnerIterator it(A, i); it; ++it) { - // cout << 1+it.row() << "\t"; // row index - // cout << 1+it.col() << "\t"; // col index (here it is equal to k) - // cout << it.value() << endl; - auto& elem = it.value(); - double elem_norm_2_square, elem_norm_inf; - std::tie(elem_norm_2_square, elem_norm_inf) = norms(elem); - norm_2_square += elem_norm_2_square; - norm_inf = std::max(norm_inf, elem_norm_inf); - } - } - return std::make_tuple(norm_2_square, norm_inf); -} - -char *getCmdOption(char **begin, char **end, const std::string &option) { - static char *empty = (char *)""; - char **itr = std::find(begin, end, option); - if (itr != end && ++itr != end) return *itr; - return empty; -} - -bool cmdOptionExists(char **begin, char **end, const std::string &option) { - return std::find(begin, end, option) != end; -} - -int cmdOptionIndex(char **begin, char **end, const std::string &option) { - char **itr = std::find(begin, end, option); - if (itr != end) return (int)(itr - begin); - return -1; -} - -static int parseOption(std::string &option, int default_value) { - size_t pos; - std::string token; - int N = default_value; - if (option.length() == 0) return N; - pos = option.find(':'); - if (pos == std::string::npos) { - pos = option.length(); - } - token = option.substr(0, pos); - N = std::stoi(token); - option.erase(0, pos + 1); - return N; -} - -static long parseOption(std::string &option, long default_value) { - size_t pos; - std::string token; - long N = default_value; - if (option.length() == 0) return N; - pos = option.find(':'); - if (pos == std::string::npos) { - pos = option.length(); - } - token = option.substr(0, pos); - N = std::stol(token); - option.erase(0, pos + 1); - return N; -} - -static double parseOption(std::string &option, double default_value = 0.25) { - size_t pos; - std::string token; - double N = default_value; - if (option.length() == 0) return N; - pos = option.find(':'); - if (pos == std::string::npos) { - pos = option.length(); - } - token = option.substr(0, pos); - N = std::stod(token); - option.erase(0, pos + 1); - return N; -} - -#if !defined(BLOCK_SPARSE_GEMM) -static void initSpMatrixMarket(const std::function &)> &keymap, const char *filename, SpMatrix<> &A, - SpMatrix<> &B, SpMatrix<> &C, int &M, int &N, int &K) { - std::vector sizes; - // We load the entire matrix on each rank, but we only use the local part for the GEMM - // loadMarket() is the eigan fuction to load matrix from a file - if (!loadMarket(A, filename)) { - std::cerr << "Failed to load " << filename << ", bailing out..." << std::endl; - ttg::ttg_abort(); - } - if (0 == ttg::default_execution_context().rank()) { - std::cout << "##MatrixMarket file " << filename << " -- " << A.rows() << " x " << A.cols() << " -- " << A.nonZeros() - << " nnz (density: " << (float)A.nonZeros() / (float)A.rows() / (float)A.cols() << ")" << std::endl; - } - if (A.rows() != A.cols()) { - B = A.transpose(); - } else { - B = A; - } - - C.resize(A.rows(), B.cols()); - M = (int)A.rows(); - N = (int)C.cols(); - K = (int)A.cols(); -} - -static void initSpRmat(const std::function &)> &keymap, const char *opt, SpMatrix<> &A, SpMatrix<> &B, - SpMatrix<> &C, int &M, int &N, int &K, unsigned long seed) { - int E; - double a = 0.25, b = 0.25, c = 0.25, d = 0.25; - size_t nnz = 0; - - if (nullptr == opt) { - std::cerr << "Usage: -rmat <#nodes>[:<#edges>[:[::[[:]]]]]" << std::endl; - exit(1); - } - std::string token; - std::string option = std::string(opt); - N = parseOption(option, -1); - K = N; - M = N; - - // We build the entire sparse matrix on each rank, but use only the local part - // on a given rank, according to keymap - A.resize(N, N); - - E = parseOption(option, (int)(0.01 * N * N)); - a = parseOption(option, a); - b = parseOption(option, b); - c = parseOption(option, c); - d = parseOption(option, d); - - if (ttg::default_execution_context().rank() == 0) { - std::cout << "#R-MAT: " << N << " nodes, " << E << " edges, a/b/c/d = " << a << "/" << b << "/" << c << "/" << d - << std::endl; - } - - boost::minstd_rand gen(seed); - boost::rmat_iterator> rmat_it(gen, N, E, a, b, c, d); - - using triplet_t = Eigen::Triplet; - std::vector A_elements; - for (int i = 0; i < N; i++) { - nnz++; - A_elements.emplace_back(i, i, 1.0); - } - for (int i = 0; i < E; i++) { - auto x = *rmat_it++; - if (x.first != x.second) { - A_elements.emplace_back(x.first, x.second, 1.0); - nnz++; - } - } - A.setFromTriplets(A_elements.begin(), A_elements.end()); - - B = A; - C.resize(N, N); - - if (ttg::default_execution_context().rank() == 0) { - std::cout << "#R-MAT: " << E << " nonzero elements, density: " << (double)nnz / (double)N / (double)N << std::endl; - } -} - -static void initSpHardCoded(const std::function &)> &keymap, SpMatrix<> &A, SpMatrix<> &B, - SpMatrix<> &C, int &m, int &n, int &k) { - m = 2; - n = 3; - k = 4; - - std::cout << "#HardCoded A, B, C" << std::endl; - A.resize(m, k); - B.resize(k, n); - C.resize(m, n); - // We initialize the same matrices on all the ranks, but we will use only the local part - // following the keymap - using triplet_t = Eigen::Triplet; - std::vector A_elements; - A_elements.emplace_back(0, 1, 12.3); - A_elements.emplace_back(0, 2, 10.7); - A_elements.emplace_back(0, 3, -2.3); - A_elements.emplace_back(1, 0, -0.3); - A_elements.emplace_back(1, 2, 1.2); - A.setFromTriplets(A_elements.begin(), A_elements.end()); - - std::vector B_elements; - B_elements.emplace_back(0, 0, 12.3); - B_elements.emplace_back(1, 0, 10.7); - B_elements.emplace_back(3, 0, -2.3); - B_elements.emplace_back(1, 1, -0.3); - B_elements.emplace_back(1, 2, 1.2); - B_elements.emplace_back(2, 2, 7.2); - B_elements.emplace_back(3, 2, 0.2); - B.setFromTriplets(B_elements.begin(), B_elements.end()); -} - -#else -static void initBlSpHardCoded(const std::function &)> &keymap, SpMatrix<> &A, SpMatrix<> &B, - SpMatrix<> &C, SpMatrix<> &Aref, SpMatrix<> &Bref, bool buildRefs, - std::vector &mTiles, std::vector &nTiles, std::vector &kTiles, - std::vector> &a_rowidx_to_colidx, - std::vector> &a_colidx_to_rowidx, - std::vector> &b_rowidx_to_colidx, - std::vector> &b_colidx_to_rowidx, int &m, int &n, int &k) { - m = 2; - n = 3; - k = 4; - - std::cout << "#HardCoded A, B, C" << std::endl; - A.resize(m, k); - B.resize(k, n); - C.resize(m, n); - if (buildRefs) { - Aref.resize(m, k); - Bref.resize(k, n); - } - - for (int mt = 0; mt < m; mt++) mTiles.push_back(128); - for (int nt = 0; nt < n; nt++) nTiles.push_back(196); - for (int kt = 0; kt < k; kt++) kTiles.push_back(256); - - int rank = ttg::default_execution_context().rank(); - - using triplet_t = Eigen::Triplet; - std::vector A_elements; - std::vector Aref_elements; -#if defined(BTAS_IS_USABLE) - if (keymap({0, 1}) == rank) { - A_elements.emplace_back(0, 1, blk_t(btas::Range(128, 256), 12.3)); - } - if (keymap({0, 2}) == rank) { - A_elements.emplace_back(0, 2, blk_t(btas::Range(128, 256), 10.7)); - } - if (keymap({0, 3}) == rank) { - A_elements.emplace_back(0, 3, blk_t(btas::Range(128, 256), -2.3)); - } - if (keymap({1, 0}) == rank) { - A_elements.emplace_back(1, 0, blk_t(btas::Range(128, 256), -0.3)); - } - if (keymap({1, 2}) == rank) { - A_elements.emplace_back(1, 2, blk_t(btas::Range(128, 256), 1.2)); - } - if (buildRefs && rank == 0) { - Aref_elements.emplace_back(0, 1, blk_t(btas::Range(128, 256), 12.3)); - Aref_elements.emplace_back(0, 2, blk_t(btas::Range(128, 256), 10.7)); - Aref_elements.emplace_back(0, 3, blk_t(btas::Range(128, 256), -2.3)); - Aref_elements.emplace_back(1, 0, blk_t(btas::Range(128, 256), -0.3)); - Aref_elements.emplace_back(1, 2, blk_t(btas::Range(128, 256), 1.2)); - } -#else - if ((buildRefs && rank == 0) || keymap({0, 1}) == rank) { - A_elements.emplace_back(0, 1, 12.3); - } - if ((buildRefs && rank == 0) || keymap({0, 2}) == rank) { - A_elements.emplace_back(0, 2, 10.7); - } - if ((buildRefs && rank == 0) || keymap({0, 3}) == rank) { - A_elements.emplace_back(0, 3, -2.3); - } - if ((buildRefs && rank == 0) || keymap({1, 0}) == rank) { - A_elements.emplace_back(1, 0, -0.3); - } - if ((buildRefs && rank == 0) || keymap({1, 2}) == rank) { - A_elements.emplace_back(1, 2, .2); - } - if (buildRefs && rank == 0) { - Aref_elements.emplace_back(0, 1, 12.3); - Aref_elements.emplace_back(0, 2, 10.7); - Aref_elements.emplace_back(0, 3, -2.3); - Aref_elements.emplace_back(1, 0, -0.3); - Aref_elements.emplace_back(1, 2, .2); - } -#endif - a_rowidx_to_colidx.resize(2); - a_rowidx_to_colidx[0].emplace_back(1); // A[0][1] - a_rowidx_to_colidx[0].emplace_back(2); // A[0][2] - a_rowidx_to_colidx[0].emplace_back(3); // A[0][3] - a_rowidx_to_colidx[1].emplace_back(0); // A[1][0] - a_rowidx_to_colidx[1].emplace_back(2); // A[1][2] - - a_colidx_to_rowidx.resize(4); - a_colidx_to_rowidx[0].emplace_back(1); // A[1][0] - a_colidx_to_rowidx[1].emplace_back(0); // A[0][1] - a_colidx_to_rowidx[2].emplace_back(0); // A[0][2] - a_colidx_to_rowidx[2].emplace_back(1); // A[1][2] - a_colidx_to_rowidx[3].emplace_back(0); // A[0][3] - - A.setFromTriplets(A_elements.begin(), A_elements.end()); - - if (buildRefs && 0 == rank) { - Aref.setFromTriplets(Aref_elements.begin(), Aref_elements.end()); - } - - std::vector B_elements; - std::vector Bref_elements; -#if defined(BTAS_IS_USABLE) - if (keymap({0, 0}) == rank) { - B_elements.emplace_back(0, 0, blk_t(btas::Range(256, 196), 12.3)); - } - if (keymap({1, 0}) == rank) { - B_elements.emplace_back(1, 0, blk_t(btas::Range(256, 196), 10.7)); - } - if (keymap({3, 0}) == rank) { - B_elements.emplace_back(3, 0, blk_t(btas::Range(256, 196), -2.3)); - } - if (keymap({1, 1}) == rank) { - B_elements.emplace_back(1, 1, blk_t(btas::Range(256, 196), -0.3)); - } - if (keymap({1, 2}) == rank) { - B_elements.emplace_back(1, 2, blk_t(btas::Range(256, 196), 1.2)); - } - if (keymap({2, 2}) == rank) { - B_elements.emplace_back(2, 2, blk_t(btas::Range(256, 196), 7.2)); - } - if (keymap({3, 2}) == rank) { - B_elements.emplace_back(3, 2, blk_t(btas::Range(256, 196), 0.2)); - } - if (buildRefs && rank == 0) { - Bref_elements.emplace_back(0, 0, blk_t(btas::Range(256, 196), 12.3)); - Bref_elements.emplace_back(1, 0, blk_t(btas::Range(256, 196), 10.7)); - Bref_elements.emplace_back(3, 0, blk_t(btas::Range(256, 196), -2.3)); - Bref_elements.emplace_back(1, 1, blk_t(btas::Range(256, 196), -0.3)); - Bref_elements.emplace_back(1, 2, blk_t(btas::Range(256, 196), 1.2)); - Bref_elements.emplace_back(2, 2, blk_t(btas::Range(256, 196), 7.2)); - Bref_elements.emplace_back(3, 2, blk_t(btas::Range(256, 196), 0.2)); - } -#else - if (keymap({0, 0}) == rank) { - B_elements.emplace_back(0, 0, 12.3); - } - if (keymap({1, 0}) == rank) { - B_elements.emplace_back(1, 0, 10.7); - } - if (keymap({3, 0}) == rank) { - B_elements.emplace_back(3, 0, -2.3); - } - if (keymap({1, 1}) == rank) { - B_elements.emplace_back(1, 1, -0.3); - } - if (keymap({1, 2}) == rank) { - B_elements.emplace_back(1, 2, 1.2); - } - if (keymap({2, 2}) == rank) { - B_elements.emplace_back(2, 2, 7.2); - } - if (keymap({3, 2}) == rank) { - B_elements.emplace_back(3, 2, 0.2); - } -#endif - b_rowidx_to_colidx.resize(4); - b_rowidx_to_colidx[0].emplace_back(0); // B[0][0] - b_rowidx_to_colidx[1].emplace_back(0); // B[1][0] - b_rowidx_to_colidx[1].emplace_back(1); // B[1][1] - b_rowidx_to_colidx[1].emplace_back(2); // B[1][2] - b_rowidx_to_colidx[2].emplace_back(2); // B[2][2] - b_rowidx_to_colidx[3].emplace_back(0); // B[3][0] - b_rowidx_to_colidx[3].emplace_back(2); // B[3][2] - - b_colidx_to_rowidx.resize(3); - b_colidx_to_rowidx[0].emplace_back(0); // B[0][0] - b_colidx_to_rowidx[0].emplace_back(1); // B[1][0] - b_colidx_to_rowidx[0].emplace_back(3); // B[3][0] - b_colidx_to_rowidx[1].emplace_back(1); // B[1][1] - b_colidx_to_rowidx[2].emplace_back(1); // B[1][2] - b_colidx_to_rowidx[2].emplace_back(2); // B[2][2] - b_colidx_to_rowidx[2].emplace_back(3); // A[3][2] - - B.setFromTriplets(B_elements.begin(), B_elements.end()); - if (buildRefs && 0 == rank) { - Bref.setFromTriplets(Bref_elements.begin(), Bref_elements.end()); - } -} - -#if defined(BTAS_IS_USABLE) -static void initBlSpRandom(const std::function &)> &keymap, size_t M, size_t N, size_t K, int minTs, - int maxTs, double avgDensity, SpMatrix<> &A, SpMatrix<> &B, SpMatrix<> &Aref, - SpMatrix<> &Bref, bool buildRefs, std::vector &mTiles, std::vector &nTiles, - std::vector &kTiles, std::vector> &a_rowidx_to_colidx, - std::vector> &a_colidx_to_rowidx, - std::vector> &b_rowidx_to_colidx, - std::vector> &b_colidx_to_rowidx, double &average_tile_size, - double &Adensity, double &Bdensity, unsigned int seed) { - int rank = ttg::default_execution_context().rank(); - - int ts; - std::mt19937 gen(seed); - std::mt19937 genv(seed + 1); - - std::uniform_int_distribution<> dist(minTs, maxTs); // randomly pick any value in the range minTs, maxTs - using triplet_t = Eigen::Triplet; - std::vector A_elements; - std::vector B_elements; - std::vector Aref_elements; - std::vector Bref_elements; - - for (int m = 0; m < M; m += ts) { - ts = dist(gen); - if (ts > M - m) ts = M - m; - mTiles.push_back(ts); - } - for (int n = 0; n < N; n += ts) { - ts = dist(gen); - if (ts > N - n) ts = N - n; - nTiles.push_back(ts); - } - for (int k = 0; k < K; k += ts) { - ts = dist(gen); - if (ts > K - k) ts = K - k; - kTiles.push_back(ts); - } - - A.resize(mTiles.size(), kTiles.size()); - B.resize(kTiles.size(), nTiles.size()); - if (buildRefs) { - Aref.resize(mTiles.size(), kTiles.size()); - Bref.resize(kTiles.size(), nTiles.size()); - } - - std::uniform_int_distribution<> mDist(0, mTiles.size() - 1); - std::uniform_int_distribution<> nDist(0, nTiles.size() - 1); - std::uniform_int_distribution<> kDist(0, kTiles.size() - 1); - std::uniform_real_distribution<> vDist(-1.0, 1.0); - - size_t filling = 0; - size_t avg_nb = 0; - int avg_nb_nb = 0; - - struct tuple_hash : public std::unary_function, std::size_t> { - std::size_t operator()(const std::tuple &k) const { - return static_cast(std::get<0>(k)) | (static_cast(std::get<1>(k)) << 32); - } - }; - - std::unordered_set, tuple_hash> fills; - - fills.clear(); - while ((double)filling / (double)(M * K) < avgDensity) { - int mt = mDist(gen); - int kt = kDist(gen); - - if (fills.find({mt, kt}) != fills.end()) continue; - fills.insert({mt, kt}); - - if (mt >= a_rowidx_to_colidx.size()) a_rowidx_to_colidx.resize(mt + 1); - a_rowidx_to_colidx[mt].emplace_back(kt); - if (kt >= a_colidx_to_rowidx.size()) a_colidx_to_rowidx.resize(kt + 1); - a_colidx_to_rowidx[kt].emplace_back(mt); - - filling += mTiles[mt] * kTiles[kt]; - avg_nb += mTiles[mt] * kTiles[kt]; - avg_nb_nb++; - double value = vDist(genv); - if (0 == rank && buildRefs) Aref_elements.emplace_back(mt, kt, blk_t(btas::Range(mTiles[mt], kTiles[kt]), value)); - if (rank != keymap({mt, kt})) continue; - A_elements.emplace_back(mt, kt, blk_t(btas::Range(mTiles[mt], kTiles[kt]), value)); - } - for (auto &row : a_rowidx_to_colidx) { - std::sort(row.begin(), row.end()); - } - for (auto &col : a_colidx_to_rowidx) { - std::sort(col.begin(), col.end()); - } - A.setFromTriplets(A_elements.begin(), A_elements.end()); - Adensity = (double)filling / (double)(M * K); - if (0 == rank && buildRefs) Aref.setFromTriplets(Aref_elements.begin(), Aref_elements.end()); - - filling = 0; - fills.clear(); - while ((double)filling / (double)(K * N) < avgDensity) { - int nt = nDist(gen); - int kt = kDist(gen); - - if (fills.find({kt, nt}) != fills.end()) continue; - fills.insert({kt, nt}); - - if (kt >= b_rowidx_to_colidx.size()) b_rowidx_to_colidx.resize(kt + 1); - b_rowidx_to_colidx[kt].emplace_back(nt); - if (nt >= b_colidx_to_rowidx.size()) b_colidx_to_rowidx.resize(nt + 1); - b_colidx_to_rowidx[nt].emplace_back(kt); - - filling += kTiles[kt] * nTiles[nt]; - avg_nb += kTiles[kt] * nTiles[nt]; - avg_nb_nb++; - double value = vDist(genv); - if (0 == rank && buildRefs) Bref_elements.emplace_back(kt, nt, blk_t(btas::Range(kTiles[kt], nTiles[nt]), value)); - if (rank != keymap({kt, nt})) continue; - B_elements.emplace_back(kt, nt, blk_t(btas::Range(kTiles[kt], nTiles[nt]), value)); - } - for (auto &row : b_rowidx_to_colidx) { - std::sort(row.begin(), row.end()); - } - for (auto &col : b_colidx_to_rowidx) { - std::sort(col.begin(), col.end()); - } - B.setFromTriplets(B_elements.begin(), B_elements.end()); - Bdensity = (double)filling / (double)(K * N); - if (0 == rank && buildRefs) Bref.setFromTriplets(Bref_elements.begin(), Bref_elements.end()); - fills.clear(); - - average_tile_size = (double)avg_nb / avg_nb_nb; -} -#endif - -#endif - -static void timed_measurement(SpMatrix<> &A, SpMatrix<> &B, const std::function &)> &ij_keymap, - const std::function &)> &ijk_keymap, const std::string &tiling_type, - double gflops, double avg_nb, double Adensity, double Bdensity, - const std::vector> &a_rowidx_to_colidx, - const std::vector> &a_colidx_to_rowidx, - const std::vector> &b_rowidx_to_colidx, - const std::vector> &b_colidx_to_rowidx, std::vector &mTiles, - std::vector &nTiles, std::vector &kTiles, int M, int N, int K, int minTs, - int maxTs, int P, int Q, int R) { - int MT = (int)A.rows(); - int NT = (int)B.cols(); - int KT = (int)A.cols(); - assert(KT == B.rows()); - - SpMatrix<> C; - C.resize(MT, NT); - - // flow graph needs to exist on every node - Edge> ctl("control"); - Control control(ctl); - Edge, blk_t> eA, eB; - Edge, blk_t> eC; - - Read_SpMatrix a("A", A, ctl, eA, ij_keymap); - Read_SpMatrix b("B", B, ctl, eB, ij_keymap); - Write_SpMatrix<> c(C, eC, ij_keymap); - auto &c_status = c.status(); - assert(!has_value(c_status)); - // SpMM25D a_times_b(world, eA, eB, eC, A, B); - SpMM25D<> a_times_b(eA, eB, eC, A, B, a_rowidx_to_colidx, a_colidx_to_rowidx, b_rowidx_to_colidx, b_colidx_to_rowidx, - mTiles, nTiles, kTiles, ij_keymap, ijk_keymap, R); - TTGUNUSED(a); - TTGUNUSED(b); - TTGUNUSED(a_times_b); - - auto connected = make_graph_executable(&control); - assert(connected); - TTGUNUSED(connected); - - struct timeval start { - 0 - }, end{0}, diff{0}; - gettimeofday(&start, nullptr); - // ready, go! need only 1 kick, so must be done by 1 thread only - if (ttg::default_execution_context().rank() == 0) control.start(P, Q); - fence(); - gettimeofday(&end, nullptr); - timersub(&end, &start, &diff); - double tc = (double)diff.tv_sec + (double)diff.tv_usec / 1e6; -#if defined(TTG_USE_MADNESS) - std::string rt("MAD"); -#elif defined(TTG_USE_PARSEC) - std::string rt("PARSEC"); -#else - std::string rt("Unkown???"); -#endif - if (ttg::default_execution_context().rank() == 0) { - std::cout << "TTG-" << rt << " PxQxR= " << P << " " << Q << " " << R << " 1 average_NB= " << avg_nb << " M= " << M - << " N= " << N << " K= " << K << " t= " << minTs << " T=" << maxTs << " Tiling= " << tiling_type - << " A_density= " << Adensity << " B_density= " << Bdensity << " gflops= " << gflops << " seconds= " << tc - << " gflops/s= " << gflops / tc << std::endl; - } -} - -#if !defined(BLOCK_SPARSE_GEMM) -static void make_rowidx_to_colidx_from_eigen(const SpMatrix<> &mat, std::vector> &r2c) { - for (int k = 0; k < mat.outerSize(); ++k) { // cols, if col-major, rows otherwise - for (typename SpMatrix::InnerIterator it(mat, k); it; ++it) { - const long row = it.row(); - const long col = it.col(); - if (row >= r2c.size()) r2c.resize(row + 1); - r2c[row].push_back(col); - } - } - // Sort each vector of column indices, as we pushed them in an arbitrary order - for (auto &row : r2c) { - std::sort(row.begin(), row.end()); - } -} - -static void make_colidx_to_rowidx_from_eigen(const SpMatrix<> &mat, std::vector> &c2r) { - for (int k = 0; k < mat.outerSize(); ++k) { // cols, if col-major, rows otherwise - for (typename SpMatrix::InnerIterator it(mat, k); it; ++it) { - const long row = it.row(); - const long col = it.col(); - - if (col >= c2r.size()) c2r.resize(col + 1); - c2r[col].push_back(row); - } - // Sort each vector of row indices, as we pushed them in an arbitrary order - for (auto &col : c2r) { - std::sort(col.begin(), col.end()); - } - } -} -#endif - -static double compute_gflops(const std::vector> &a_r2c, const std::vector> &b_r2c, - const std::vector &mTiles, const std::vector &nTiles, - const std::vector &kTiles) { - unsigned long flops = 0; - for (auto i = 0; i < a_r2c.size(); i++) { - for (auto kk = 0; kk < a_r2c[i].size(); kk++) { - auto k = a_r2c[i][kk]; - if (k > b_r2c.size()) continue; - for (auto jj = 0; jj < b_r2c[k].size(); jj++) { - auto j = b_r2c[k][jj]; - flops += static_cast(mTiles[i]) * nTiles[j] * kTiles[k]; - } - } - } - return 2.0 * (double)flops / 1e9; -} - -int main(int argc, char **argv) { - bool timing; - double gflops; - - // warm up silicon by calling gemm a few times -#ifdef BTAS_IS_USABLE - for (int i = 0; i < 20; i++) { - using baseT = typename btas::Tensor; - btas::Tensor> At(30, 30); - btas::Tensor> Bt(30, 30); - btas::Tensor> Ct(30, 30); - At.fill(1.0); - Bt.fill(2.0); - Ct.fill(3.0); - btas::gemm(std::move(Ct), Bt, At); - } -#endif // BTAS_IS_USABLE - -// static volatile int debug_signal = 0; -// std::cout << "Waiting on debug signal (int*)" << &debug_signal << std::endl; -// while (!debug_signal) {} - - - int cores = -1; - std::string nbCoreStr(getCmdOption(argv, argv + argc, "-c")); - cores = parseOption(nbCoreStr, cores); - - if (int dashdash = cmdOptionIndex(argv, argv + argc, "--") > -1) { - initialize(argc - dashdash, argv + dashdash, cores); - } else { - initialize(1, argv, cores); - } - -#ifdef BTAS_IS_USABLE - // initialize MADNESS so that TA allocators can be created - madness::ParsecRuntime::initialize_with_existing_context(ttg::default_execution_context().impl().context()); - madness::initialize(argc, argv, /* nthread = */ 1, /* quiet = */ true); -#endif // BTAS_IS_USABLE - - std::string debugStr(getCmdOption(argv, argv + argc, "-d")); - auto debug = (unsigned int)parseOption(debugStr, 0); - - if (debug & (1 << 1)) { - using ttg::Debugger; - auto debugger = std::make_shared(); - Debugger::set_default_debugger(debugger); - debugger->set_exec(argv[0]); - debugger->set_prefix(ttg::default_execution_context().rank()); - // debugger->set_cmd("lldb_xterm"); - debugger->set_cmd("gdb_xterm"); - } - - int mpi_size = ttg::default_execution_context().size(); - int mpi_rank = ttg::default_execution_context().rank(); - int best_pqc = mpi_size; - int P, Q, R; - for (int c = 1; c <= (int)cbrt(mpi_size); c++) { - for (int p = 1; p <= (int)sqrt(mpi_size / c); p++) { - if ((mpi_size % (p * c)) == 0) { - int q = mpi_size / (p * c); - if (abs(c - p - q) <= best_pqc) { - best_pqc = abs(c - p - q); - P = p; - Q = q; - R = c; - } - } - } - // ttg::launch_lldb(ttg::default_execution_context().rank(), argv[0]); - - { - if (debug & (1 << 0)) { - ttg::trace_on(); - TTBase::set_trace_all(true); - } - - SpMatrix<> A, B, C, Aref, Bref; - std::string tiling_type; - int M = 0, N = 0, K = 0; - int minTs = 0, maxTs = 0; - - double avg_nb = nan("undefined"); - double Adensity = nan("undefined"); - double Bdensity = nan("undefined"); - - std::string PStr(getCmdOption(argv, argv + argc, "-P")); - P = parseOption(PStr, P); - std::string QStr(getCmdOption(argv, argv + argc, "-Q")); - Q = parseOption(QStr, Q); - // to make code behave like 2D summa if R not given - std::string RStr(getCmdOption(argv, argv + argc, "-R")); - R = parseOption(RStr, 1); - - if (P * Q * R != mpi_size) { - if (!cmdOptionExists(argv, argv + argc, "-Q") && (mpi_size % (P * R) == 0)) - Q = mpi_size / (P * R); - else if (!cmdOptionExists(argv, argv + argc, "-P") && (mpi_size % (Q * R)) == 0) - P = mpi_size / (Q * R); - else if (!cmdOptionExists(argv, argv + argc, "-R") && (mpi_size % (Q * P)) == 0) - R = mpi_size / (Q * P); - else { - if (0 == mpi_rank) { - std::cerr << P << "x" << Q << "x" << R << " is not a valid process grid -- bailing out" << std::endl; - MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE); - } - } - } - - auto ij_keymap = [P, Q](const Key<2> &ij) { - int i = (int)ij[0]; - int j = (int)ij[1]; - int r = ij2rank(i, j, P, Q); - return r; - }; - - auto ijk_keymap = [P, Q, R](const Key<3> &ijk) { - int i = (int)ijk[0]; - int j = (int)ijk[1]; - int k = (int)ijk[2]; - int r = ijk2rank(i, j, k, P, Q, R); - return r; - }; - - std::string seedStr(getCmdOption(argv, argv + argc, "-s")); - unsigned int seed = parseOption(seedStr, 0); - if (seed == 0) { - std::random_device rd; - seed = rd(); - if (0 == ttg::default_execution_context().rank()) std::cerr << "#Random seeded with " << seed << std::endl; - } - ttg_broadcast(ttg::default_execution_context(), seed, 0); - - std::vector mTiles; - std::vector nTiles; - std::vector kTiles; - std::vector> a_rowidx_to_colidx; - std::vector> a_colidx_to_rowidx; - std::vector> b_rowidx_to_colidx; - std::vector> b_colidx_to_rowidx; - - std::string checkStr(getCmdOption(argv, argv + argc, "-x")); - int check = parseOption(checkStr, !(argc >= 2)); - timing = (check == 0); - -#if !defined(BLOCK_SPARSE_GEMM) - if (cmdOptionExists(argv, argv + argc, "-mm")) { - char *filename = getCmdOption(argv, argv + argc, "-mm"); - tiling_type = filename; - initSpMatrixMarket(ij_keymap, filename, A, B, C, M, N, K); - } else if (cmdOptionExists(argv, argv + argc, "-rmat")) { - char *opt = getCmdOption(argv, argv + argc, "-rmat"); - tiling_type = "RandomSparseMatrix"; - initSpRmat(ij_keymap, opt, A, B, C, M, N, K, seed); - } else { - tiling_type = "HardCodedSparseMatrix"; - initSpHardCoded(ij_keymap, A, B, C, M, N, K); - } - - if (check) { - // We don't generate the sparse matrices in distributed, so Aref and Bref can - // just point to the same matrix, or be a local copy. - Aref = A; - Bref = B; - } - - // We still need to build the metadata from the matrices. - make_rowidx_to_colidx_from_eigen(A, a_rowidx_to_colidx); - make_colidx_to_rowidx_from_eigen(A, a_colidx_to_rowidx); - make_rowidx_to_colidx_from_eigen(B, b_rowidx_to_colidx); - make_colidx_to_rowidx_from_eigen(B, b_colidx_to_rowidx); - // This is only needed to compute the flops - for (int mt = 0; mt < M; mt++) mTiles.emplace_back(1); - for (int nt = 0; nt < N; nt++) nTiles.emplace_back(1); - for (int kt = 0; kt < K; kt++) kTiles.emplace_back(1); -#else - if (argc >= 2) { - std::string Mstr(getCmdOption(argv, argv + argc, "-M")); - M = parseOption(Mstr, 1200); - std::string Nstr(getCmdOption(argv, argv + argc, "-N")); - N = parseOption(Nstr, M); - std::string Kstr(getCmdOption(argv, argv + argc, "-K")); - K = parseOption(Kstr, N); - std::string minTsStr(getCmdOption(argv, argv + argc, "-t")); - minTs = parseOption(minTsStr, 64); - std::string maxTsStr(getCmdOption(argv, argv + argc, "-T")); - maxTs = parseOption(maxTsStr, minTs); - std::string avgStr(getCmdOption(argv, argv + argc, "-a")); - double avg = parseOption(avgStr, 0.3); - timing = (check == 0); - tiling_type = "RandomIrregularTiling"; - initBlSpRandom(ij_keymap, M, N, K, minTs, maxTs, avg, A, B, Aref, Bref, check, mTiles, nTiles, kTiles, - a_rowidx_to_colidx, a_colidx_to_rowidx, b_rowidx_to_colidx, b_colidx_to_rowidx, avg_nb, Adensity, - Bdensity, seed); - - C.resize(mTiles.size(), nTiles.size()); - } else { - tiling_type = "HardCodedBlockSparseMatrix"; - initBlSpHardCoded(ij_keymap, A, B, C, Aref, Bref, true, mTiles, nTiles, kTiles, a_rowidx_to_colidx, - a_colidx_to_rowidx, b_rowidx_to_colidx, b_colidx_to_rowidx, M, N, K); - } -#endif // !defined(BLOCK_SPARSE_GEMM) - - gflops = compute_gflops(a_rowidx_to_colidx, b_rowidx_to_colidx, mTiles, nTiles, kTiles); - - std::string nbrunStr(getCmdOption(argv, argv + argc, "-n")); - int nb_runs = parseOption(nbrunStr, 1); - - if (timing) { - // Start up engine - execute(); - for (int nrun = 0; nrun < nb_runs; nrun++) { - parsec_devices_release_memory(); - timed_measurement(A, B, ij_keymap, ijk_keymap, tiling_type, gflops, avg_nb, Adensity, Bdensity, - a_rowidx_to_colidx, a_colidx_to_rowidx, b_rowidx_to_colidx, b_colidx_to_rowidx, mTiles, - nTiles, kTiles, M, N, K, minTs, maxTs, P, Q, R); - parsec_devices_reset_load(default_execution_context().impl().context()); - } - } else { - // flow graph needs to exist on every node - // N.B. to validate C we need it on node 0! - auto keymap_write = [](const Key<2> &key) { return 0; }; - Edge> ctl("control"); - Control control(ctl); - Edge, blk_t> eA, eB, eC; - Read_SpMatrix a("A", A, ctl, eA, ij_keymap); - Read_SpMatrix b("B", B, ctl, eB, ij_keymap); - Write_SpMatrix<> c(C, eC, keymap_write, true); - auto &c_status = c.status(); - assert(!has_value(c_status)); - // SpMM25D a_times_b(world, eA, eB, eC, A, B); - SpMM25D<> a_times_b(eA, eB, eC, A, B, a_rowidx_to_colidx, a_colidx_to_rowidx, b_rowidx_to_colidx, - b_colidx_to_rowidx, mTiles, nTiles, kTiles, ij_keymap, ijk_keymap, R); - TTGUNUSED(a_times_b); - // calling the Dot constructor with 'true' argument disables the type - if (default_execution_context().rank() == 0) std::cout << Dot{/*disable_type=*/true}(&control) << std::endl; - - // ready to run! - auto connected = make_graph_executable(&control); - assert(connected); - TTGUNUSED(connected); - - // ready, go! need only 1 kick, so must be done by 1 thread only - if (ttg::default_execution_context().rank() == 0) control.start(P, Q); - - execute(); - fence(); - - // validate C=A*B against the reference output - assert(has_value(c_status)); - if (ttg::default_execution_context().rank() == 0) { - SpMatrix<> Cref = Aref * Bref; - - double norm_2_square, norm_inf; - std::tie(norm_2_square, norm_inf) = norms(Cref - C); - std::cout << "||Cref - C||_2 = " << std::sqrt(norm_2_square) << std::endl; - std::cout << "||Cref - C||_\\infty = " << norm_inf << std::endl; - if (norm_inf > 1e-9) { - std::cout << "Cref:\n" << Cref << std::endl; - std::cout << "C:\n" << C << std::endl; - ttg_abort(); - } - } - - // validate Acopy=A against the reference output - // assert(has_value(copy_status)); - // if (ttg::default_execution_context().rank() == 0) { - // double norm_2_square, norm_inf; - // std::tie(norm_2_square, norm_inf) = norms(Acopy - A); - // std::cout << "||Acopy - A||_2 = " << std::sqrt(norm_2_square) << std::endl; - // std::cout << "||Acopy - A||_\\infty = " << norm_inf << std::endl; - // if (::ttg::tracing()) { - // std::cout << "Acopy (" << static_cast(&Acopy) << "):\n" << Acopy << std::endl; - // std::cout << "A (" << static_cast(&A) << "):\n" << A << std::endl; - // } - // if (norm_inf != 0) { - // ttg_abort(); - // } - // } - } - } - -#ifdef BTAS_IS_USABLE - madness::finalize(); -#endif // BTAS_IS_USABLE - ttg_finalize(); - return 0; - } -} diff --git a/examples/ttg_matrix.h b/examples/ttg_matrix.h index 3478e2aa4..e0d6cb3f6 100644 --- a/examples/ttg_matrix.h +++ b/examples/ttg_matrix.h @@ -8,11 +8,53 @@ #include #include "ttg/serialization/std/vector.h" +#include "ttg/util/multiindex.h" + +#include namespace ttg { namespace matrix { + enum class ReadOnceTriplet { yes, no }; + + /// element of a sparse matrix = {row index, col index, value} + + /// move-capable replacement for Eigen::Triplet + template::StorageIndex > + class Triplet { + public: + Triplet() = default; + Triplet(const Triplet&) = default; + Triplet(Triplet&&) = default; + Triplet& operator=(const Triplet&) = default; + Triplet& operator=(Triplet&&) = default; + + Triplet(StorageIndex r, const StorageIndex c, const Value& v) + : m_row(r), m_col(c), m_value(v) + {} + Triplet(StorageIndex r, const StorageIndex c, Value&& v = Value{}) + : m_row(r), m_col(c), m_value(std::move(v)) + {} + + /** \returns the row index of the element */ + const StorageIndex& row() const { return m_row; } + + /** \returns the column index of the element */ + const StorageIndex& col() const { return m_col; } + + /** \returns the value of the element */ + std::conditional_t value() const { + if constexpr (ReadOnce == ReadOnceTriplet::yes) + return std::move(m_value); + else + return m_value; + } + protected: + StorageIndex m_row = -1, m_col = -1; + mutable Value m_value; + }; + // matrix shape = maps {column,row} index to {row,column} indices class Shape : public std::vector> { using base_t = std::vector>; @@ -192,22 +234,22 @@ namespace ttg { } // compute shape of an existing SpMatrix on rank 0 - template + template class ReadShape : public TT>, ReadShape, ttg::typelist> { public: using baseT = typename ReadShape::ttT; static constexpr const int owner = 0; // where data resides - ReadShape(const char *label, const SpMatrix &matrix, Edge &in, Edge &out) + ReadShape(const char *label, const Eigen::SparseMatrix &matrix, Edge &in, Edge &out) : baseT(edges(in), edges(out), std::string("read_spmatrix_shape(") + label + ")", {"ctl"}, {std::string("shape[") + label + "]"}, /* keymap */ []() { return owner; }) , matrix_(matrix) {} - void op(std::tuple> &out) { ::sendv<0>(Shape(matrix_), out); } + void op(std::tuple> &out) { ttg::sendv<0>(Shape(matrix_), out); } private: - const SpMatrix &matrix_; + const Eigen::SparseMatrix &matrix_; }; // flow data from an existing SpMatrix on rank 0 @@ -217,38 +259,38 @@ namespace ttg { // but will only be efficient if can do random access (slow with CSC format used by Eigen matrices) // - this could be generalized to read efficiently from a distributed data structure // Use Read_SpMatrix if need to read all data from a data structure localized on 1 process - template - class Read : public TT, std::tuple, Blk>>, Read, ttg::typelist> { + template + class Read : public TT, std::tuple, Blk>>, Read, ttg::typelist> { public: - using baseT = TT, std::tuple, Blk>>, Read, void>; + using baseT = TT, std::tuple, Blk>>, Read, void>; static constexpr const int owner = 0; // where data resides - Read(const char *label, const SpMatrix &matrix, Edge, void> &in, Edge, Blk> &out) + Read(const char *label, const Eigen::SparseMatrix &matrix, Edge, void> &in, Edge, Blk> &out) : baseT(edges(in), edges(out), std::string("read_spmatrix(") + label + ")", {"ctl[ij]"}, {std::string(label) + "[ij]"}, /* keymap */ [](auto key) { return owner; }) , matrix_(matrix) {} - void op(const Key<2> &key, std::tuple, Blk>> &out) { + void op(const MultiIndex<2> &key, std::tuple, Blk>> &out) { // random access in CSC format is inefficient, this is only to demonstrate the way to go for hash-based storage // for whatever reason coeffRef does not work on a const SpMatrix& - ::send<0>(key, static_cast(const_cast &>(matrix_).coeffRef(key[0], key[1])), out); + ttg::send<0>(key, static_cast(const_cast &>(matrix_).coeffRef(key[0], key[1])), out); } private: - const SpMatrix &matrix_; + const Eigen::SparseMatrix &matrix_; }; // WriteShape commits shape to an existing SpMatrix on rank 0 and sends it on // since SpMatrix supports random inserts there is no need to commit the shape into the matrix, other than get the // dimensions - template + template class WriteShape : public TT>, WriteShape, ttg::typelist> { public: using baseT = typename WriteShape::ttT; static constexpr const int owner = 0; // where data resides - WriteShape(const char *label, SpMatrix &matrix, Edge &in, Edge &out) + WriteShape(const char *label, Eigen::SparseMatrix &matrix, Edge &in, Edge &out) : baseT(edges(in), edges(out), std::string("write_spmatrix_shape(") + label + ")", {std::string("shape_in[") + label + "]"}, {std::string("shape_out[") + label + "]"}, /* keymap */ []() { return owner; }) @@ -256,28 +298,28 @@ namespace ttg { void op(typename baseT::input_values_tuple_type &&ins, std::tuple> &out) { const auto &shape = baseT::template get<0>(ins); - ::ttg::trace("Resizing ", static_cast(&matrix_)); + ttg::trace("Resizing ", static_cast(&matrix_)); matrix_.resize(shape.nrows(), shape.ncols()); - ::sendv<0>(shape, out); + ttg::sendv<0>(shape, out); } private: - SpMatrix &matrix_; + Eigen::SparseMatrix &matrix_; }; // flow (move?) data into an existing SpMatrix on rank 0 - template - class Write : public TT, std::tuple<>, Write, Blk, ttg::typelist> { + template + class Write : public TT, std::tuple<>, Write, ttg::typelist> { public: using baseT = typename Write::ttT; - Write(const char *label, SpMatrix &matrix, Edge, Blk> &data_in, Edge, void> &ctl_in) + Write(const char *label, Eigen::SparseMatrix &matrix, Edge, Blk> &data_in, Edge, void> &ctl_in) : baseT(edges(data_in, ctl_in), edges(), std::string("write_spmatrix(") + label + ")", {std::string(label) + "[ij]", std::string("ctl[ij]")}, {}, /* keymap */ [](auto key) { return 0; }) , matrix_(matrix) {} - void op(const Key<2> &key, typename baseT::input_values_tuple_type &&elem, std::tuple<> &) { + void op(const MultiIndex<2> &key, typename baseT::input_values_tuple_type &&elem, std::tuple<> &) { std::lock_guard lock(mtx_); ttg::trace("rank =", default_execution_context().rank(), "/ thread_id =", reinterpret_cast(pthread_self()), @@ -309,8 +351,8 @@ namespace ttg { private: std::mutex mtx_; - SpMatrix &matrix_; - std::vector> values_; + Eigen::SparseMatrix &matrix_; + std::vector> values_; mutable std::shared_ptr> completion_status_; }; @@ -325,28 +367,28 @@ namespace ttg { /* keymap */ []() { return owner; }) {} void op(typename baseT::input_values_tuple_type &&ins, std::tuple> &out) { - ::sendv<0>(Shape::add(baseT::template get<0>(ins), baseT::template get<1>(ins)), out); + ttg::sendv<0>(Shape::add(baseT::template get<0>(ins), baseT::template get<1>(ins)), out); } }; // pushes all blocks given by the shape - class Push : public TT, void>>, Push, ttg::typelist> { + class Push : public TT, void>>, Push, ttg::typelist> { public: using baseT = typename Push::ttT; static constexpr const int owner = 0; // where data resides - Push(const char *label, Edge &in, Edge, void> &out) + Push(const char *label, Edge &in, Edge, void> &out) : baseT(edges(in), edges(out), std::string("push_spmatrix(") + label + ")", {std::string("shape[") + label + "]"}, {"ctl[ij]"}, /* keymap */ []() { return owner; }) {} - void op(typename baseT::input_values_tuple_type &&ins, std::tuple, void>> &out) { + void op(typename baseT::input_values_tuple_type &&ins, std::tuple, void>> &out) { const auto &shape = baseT::get<0>(ins); if (shape.type() == Shape::Type::col2row) { long colidx = 0; for (const auto &col : shape) { for (const auto rowidx : col) { - ::sendk<0>(Key<2>({rowidx, colidx}), out); + ttg::sendk<0>(MultiIndex<2>({rowidx, colidx}), out); } ++colidx; } @@ -354,7 +396,7 @@ namespace ttg { long rowidx = 0; for (const auto &row : shape) { for (const auto colidx : row) { - ::sendk<0>(Key<2>({rowidx, colidx}), out); + ttg::sendk<0>(MultiIndex<2>({rowidx, colidx}), out); } ++rowidx; } @@ -372,9 +414,9 @@ namespace ttg { class Matrix { public: using shape_t = matrix::Shape; - using data_edge_t = Edge, T>; + using data_edge_t = Edge, T>; using shape_edge_t = Edge; - using ctl_edge_t = Edge, void>; + using ctl_edge_t = Edge, void>; Matrix() = default; @@ -403,7 +445,7 @@ namespace ttg { /// @return an std::future object indicating the status; @c destination_matrix is ready if calling has_value() /// on the return value of this function is true. /// @note up to the user to ensure completion before reading destination_matrix - auto operator>>(SpMatrix &destination_matrix) { + auto operator>>(Eigen::SparseMatrix &destination_matrix) { // shape writer writes shape to destination_matrix ttg_register_ptr(world_, std::make_shared>("Matrix.WriteShape", destination_matrix, shape_edge_, shape_writer_push_edge_)); diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index e1fb7d685..a33bc3c94 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -21,6 +21,8 @@ if (CXXStdCoroutine_FOUND) list(APPEND ut_libs std::coroutine) endif(CXXStdCoroutine_FOUND) +list(APPEND ut_src constraints.cc) + add_ttg_executable(core-unittests-ttg "${ut_src}" LINK_LIBRARIES "${ut_libs}" COMPILE_DEFINITIONS "CATCH_CONFIG_NO_POSIX_SIGNALS=1" ) # serialization test: probes serialization via all supported serialization methods (MADNESS, Boost::serialization) that are available diff --git a/tests/unit/constraints.cc b/tests/unit/constraints.cc new file mode 100644 index 000000000..70003b945 --- /dev/null +++ b/tests/unit/constraints.cc @@ -0,0 +1,113 @@ +#include +#include + +#include "ttg.h" + +#include "ttg/serialization/std/pair.h" +#include "ttg/util/hash/std/pair.h" +#include "ttg/util/multiindex.h" + +using Key = ttg::MultiIndex<2>; + +TEST_CASE("constraints", "") { + + SECTION("manual") { + ttg::Edge e; + auto world = ttg::default_execution_context(); + std::atomic check_ord = 1; + std::atomic cnt = 10; + auto constraint = ttg::make_shared_constraint([](const Key& k){ return k[1]; }); + auto tt = ttg::make_tt([&](const Key& key, const int& value){ + std::cout << "key " << key[0] << ", " << key[1] << " check_ord " << check_ord << " cnt " << cnt << std::endl; + CHECK((key[1] == check_ord)); + if (--cnt == 0) { + cnt = 10; + check_ord++; + std::cout << "key " << key[0] << " releasing next ord " << check_ord << std::endl; + constraint->release(check_ord); + } + }, ttg::edges(e), ttg::edges()); + // every process executes all tasks + tt->set_keymap([&](const Key&){ return world.rank(); }); + tt->add_constraint(constraint); + constraint->stop(); + constraint->release(1); + + auto bcast = ttg::make_tt([&](){ + std::vector keys; + // loop iteration order intentionally reversed + for (int i = 10; i > 0; --i) { + for (int j = 10; j > 0; --j) { + keys.push_back(Key{i, j}); + } + } + ttg::broadcast<0>(std::move(keys), 0); + + // explicit start here to ensure absolute order + constraint->start(); + }, ttg::edges(), ttg::edges(e)); + bcast->set_keymap([&](){ return world.rank(); }); + + /** + * Constraints are currently only implemented in the PaRSEC backend. + * Codes using constraints will still compile but they will not + * affect the execution order in other backends. + */ +#ifdef TTG_USE_PARSEC + make_graph_executable(bcast); + ttg::execute(ttg::default_execution_context()); + bcast->invoke(); + + ttg::ttg_fence(ttg::default_execution_context()); +#endif // TTG_USE_PARSEC + + } + + + SECTION("automatic") { + ttg::Edge e; + auto world = ttg::default_execution_context(); + std::atomic last_ord = 0; + auto constraint = ttg::make_shared_constraint( + [](const Key& k){ return k[1]; }, true); + auto tt = ttg::make_tt([&](const Key& key, const int& value){ + int check_ord = last_ord; + std::cout << "key " << key[0] << ", " << key[1] << " check_ord " << check_ord << std::endl; + CHECK(((key[1] == check_ord) || (key[1] == check_ord+1))); + last_ord = key[1]; + }, ttg::edges(e), ttg::edges()); + // every process executes all tasks + tt->set_keymap([&](const Key&){ return world.rank(); }); + tt->add_constraint(constraint); + constraint->stop(); + + auto bcast = ttg::make_tt([&](){ + std::vector keys; + // loop iteration order intentionally reversed + for (int i = 10; i > 0; --i) { + for (int j = 10; j > 0; --j) { + keys.push_back(Key{i, j}); + } + } + ttg::broadcast<0>(std::move(keys), 0); + + // explicit start here to ensure absolute order + constraint->start(); + }, ttg::edges(), ttg::edges(e)); + bcast->set_keymap([&](){ return world.rank(); }); + + /** + * Constraints are currently only implemented in the PaRSEC backend. + * Codes using constraints will still compile but they will not + * affect the execution order in other backends. + */ +#ifdef TTG_USE_PARSEC + make_graph_executable(bcast); + ttg::execute(ttg::default_execution_context()); + bcast->invoke(); + + ttg::ttg_fence(ttg::default_execution_context()); +#endif // TTG_USE_PARSEC + + } +} // TEST_CASE("streams") diff --git a/ttg/CMakeLists.txt b/ttg/CMakeLists.txt index 272f005ab..329697ed2 100644 --- a/ttg/CMakeLists.txt +++ b/ttg/CMakeLists.txt @@ -46,6 +46,7 @@ configure_file( set(ttg-impl-headers ${CMAKE_CURRENT_SOURCE_DIR}/ttg/broadcast.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/buffer.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/constraint.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/devicescope.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/devicescratch.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/edge.h diff --git a/ttg/ttg.h b/ttg/ttg.h index 50a891c1d..8dd86a636 100644 --- a/ttg/ttg.h +++ b/ttg/ttg.h @@ -26,6 +26,7 @@ #include "ttg/util/print.h" #include "ttg/world.h" +#include "ttg/constraint.h" #include "ttg/edge.h" #include "ttg/ptr.h" diff --git a/ttg/ttg/constraint.h b/ttg/ttg/constraint.h new file mode 100644 index 000000000..ab3671601 --- /dev/null +++ b/ttg/ttg/constraint.h @@ -0,0 +1,415 @@ +#ifndef TTG_CONSTRAINT_H +#define TTG_CONSTRAINT_H + +#include +#include +#include +#include +#include + +#include "ttg/util/span.h" + +#ifdef TTG_USE_BUNDLED_BOOST_CALLABLE_TRAITS +#include +#else +#include +#endif + +namespace ttg { + + template + struct ConstraintBase { + using key_type = Key; + using listener_t = std::function&)>; + + ConstraintBase() + { } + + ConstraintBase(ConstraintBase&& cb) + : m_listeners(std::move(cb.m_listeners)) + { } + + ConstraintBase(const ConstraintBase& cb) + : m_listeners(cb.m_listeners) + {} + + ConstraintBase& operator=(ConstraintBase&& cb) { + m_listeners = std::move(cb.m_listeners); + } + ConstraintBase& operator=(const ConstraintBase& cb) { + m_listeners = cb.m_listeners; + } + + virtual ~ConstraintBase() = default; + + void add_listener(listener_t l, ttg::TTBase *tt) { + auto g = this->lock_guard(); + m_listeners.insert_or_assign(tt, std::move(l)); + } + + void notify_listener(const ttg::span& keys, ttg::TTBase* tt) { + auto& release = m_listeners[tt]; + release(keys); + } + + protected: + + auto lock_guard() { + return std::lock_guard{m_mtx}; + } + + private: + std::map m_listeners; + std::mutex m_mtx; + }; + + template, + typename Mapper = ttg::Void> + struct SequencedKeysConstraint : public ConstraintBase { + + using key_type = std::conditional_t, ttg::Void, Key>; + using ordinal_type = Ordinal; + using keymap_t = std::function; + using compare_t = Compare; + using base_t = ConstraintBase; + + static_assert((!Compare{}(Ordinal{}, Ordinal{})), "Comparator must provide strict ordering."); + + protected: + struct sequence_elem_t { + std::map> m_keys; + + sequence_elem_t() = default; + sequence_elem_t(sequence_elem_t&&) = default; + sequence_elem_t(const sequence_elem_t&) = default; + sequence_elem_t& operator=(sequence_elem_t&&) = default; + sequence_elem_t& operator=(const sequence_elem_t&) = default; + + void add_key(const key_type& key, ttg::TTBase* tt) { + auto it = m_keys.find(tt); + if (it == m_keys.end()) { + m_keys.insert(std::make_pair(tt, std::vector{key})); + } else { + it->second.push_back(key); + } + } + }; + + bool comp_equal(const Ordinal& a, const Ordinal& b) const { + return (!m_order(a, b) && !m_order(b, a)); + } + + bool eligible(const Ordinal& ord) const { + return m_order(ord, m_current) || comp_equal(ord, m_current); + } + + bool check_key_impl(const key_type& key, Ordinal ord, ttg::TTBase *tt) { + if (!m_stopped) { + if (eligible(ord)) { + // key should be executed + if (m_auto_release) { // only needed for auto-release + m_active.fetch_add(1, std::memory_order_relaxed); + // revert the current ordinal to the lower ordinal + m_current = ord; + } + return true; + } else if (m_sequence.empty() && m_auto_release && 0 == m_active.load(std::memory_order_relaxed)) { + // there are no keys (active or blocked) so we execute to avoid a deadlock + // we don't change the current ordinal because there may be lower ordinals coming in later + // NOTE: there is a race condition between the check here and the increment above. + // This is mostly benign as it can lead to out-of-sequence released tasks. + // Avoiding this race would incur significant overheads. + m_active.fetch_add(1, std::memory_order_relaxed); + return true; + } + } + // key should be deferred + auto g = this->lock_guard(); + if (!m_stopped && eligible(ord)) { + // someone released this ordinal while we took the lock + return true; + } + auto it = m_sequence.find(ord); + if (it == m_sequence.end()) { + auto [iter, success] = m_sequence.insert(std::make_pair(ord, sequence_elem_t{})); + assert(success); + it = iter; + } + it->second.add_key(key, tt); + return false; + } + + + void complete_key_impl() { + if (m_auto_release) { + auto active = m_active.fetch_sub(1, std::memory_order_relaxed) - 1; + if (0 == active) { + release_next(); + } + } + } + + // used in the auto case + void release_next() { + if (this->m_stopped) { + // don't release tasks if we're stopped + return; + } + // trigger the next sequence + sequence_elem_t elem; + { + // extract the next sequence + auto g = this->lock_guard(); + auto it = this->m_sequence.begin(); // sequence is ordered by ordinal + if (it == this->m_sequence.end()) { + return; // nothing to be done + } + this->m_current = it->first; + elem = std::move(it->second); + this->m_sequence.erase(it); + } + + for (auto& seq : elem.m_keys) { + // account for the newly active keys + this->m_active.fetch_add(seq.second.size(), std::memory_order_relaxed); + this->notify_listener(ttg::span(seq.second.data(), seq.second.size()), seq.first); + } + } + + + // used in the non-auto case + void release_next(ordinal_type ord, bool force_check = false) { + if (this->m_stopped) { + // don't release tasks if we're stopped but remember that this was released + this->m_current = ord; + return; + } + if (!force_check && eligible(ord)) { + return; // already at the provided ordinal, nothing to be done + } + // trigger the next sequence(s) (m_sequence is ordered by ordinal) + std::vector seqs; + { + auto g = this->lock_guard(); + // set current ordinal + this->m_current = ord; + for (auto it = this->m_sequence.begin(); it != this->m_sequence.end();) { + if (!eligible(it->first)) break; + // extract the next sequence + this->m_current = it->first; + seqs.push_back(std::move(it->second)); + it = this->m_sequence.erase(it); + } + } + for (auto& elem : seqs) { + for (auto& e : elem.m_keys) { + // account for the newly active keys + this->notify_listener(ttg::span(e.second.data(), e.second.size()), e.first); + } + } + } + + public: + + /** + * Used for external key mapper. + */ + SequencedKeysConstraint(bool auto_release = false) + : base_t() + , m_auto_release(auto_release) + { } + + template + requires(std::is_invocable_v) + SequencedKeysConstraint(Mapper_&& map, bool auto_release = false) + : base_t() + , m_map(std::forward(map)) + , m_auto_release(auto_release) + { } + + SequencedKeysConstraint(SequencedKeysConstraint&& skc) = default; + + SequencedKeysConstraint(const SequencedKeysConstraint& skc) = default; + + SequencedKeysConstraint& operator=(SequencedKeysConstraint&& skc) = default; + + SequencedKeysConstraint& operator=(const SequencedKeysConstraint& skc) = default; + + virtual ~SequencedKeysConstraint() = default; + + /* Check whether the key may be executed. + * Returns true if the key may be executed. + * Otherwise, returns false and */ + template + std::enable_if_t && !ttg::meta::is_void_v, bool> + check(const key_type& key, ttg::TTBase *tt) { + ordinal_type ord = m_map(key); + return this->check_key_impl(key, ord, tt); + } + + template + std::enable_if_t && ttg::meta::is_void_v, bool> + check(const key_type& key, Ordinal ord, ttg::TTBase *tt) { + return this->check_key_impl(key, ord, tt); + } + + template + std::enable_if_t && !ttg::meta::is_void_v, bool> + check(ttg::TTBase *tt) { + return this->check_key_impl(ttg::Void{}, m_map(), tt); + } + + template + std::enable_if_t && ttg::meta::is_void_v, bool> + check(ordinal_type ord, ttg::TTBase *tt) { + return this->check_key_impl(ttg::Void{}, ord, tt); + } + + template + std::enable_if_t && !ttg::meta::is_void_v> + complete(const key_type& key, ttg::TTBase *tt) { + this->complete_key_impl(); + } + + template + std::enable_if_t && ttg::meta::is_void_v> + complete(const key_type& key, Ordinal ord, ttg::TTBase *tt) { + this->complete_key_impl(); + } + + template + std::enable_if_t && ttg::meta::is_void_v> + complete(Ordinal ord, ttg::TTBase *tt) { + this->complete_key_impl(); + } + + template + std::enable_if_t && !ttg::meta::is_void_v> + complete(ttg::TTBase *tt) { + this->complete_key_impl(); + } + + /** + * Stop all execution. Call \c start to resume. + * This constraint is not stopped by default so calls to \c start + * are only necessary if explictily stopped. + */ + void stop() { + m_stopped = true; + } + + /** + * Start execution. + * This constraint is not stopped by default so calls to \c start + * are only necessary if explictily stopped. + */ + void start() { + if (m_stopped) { + m_stopped = false; + if (m_auto_release) { + release_next(); + } else { + auto ord = m_current; + // release the first set of available keys if none were set explicitly + if (ord == std::numeric_limits::min() && + this->m_sequence.begin() != this->m_sequence.end()) { + ord = this->m_sequence.begin()->first; + } + release_next(ord, true); // force the check for a next release even if the current ordinal hasn't changed + } + } + } + + /** + * Release tasks up to the ordinal. The provided ordinal is ignored if `auto_release` is enabled. + */ + void release(ordinal_type ord = 0) { + if (m_auto_release) { + // last key for this ordinal, release the next + // the provided ordinal is ignored + release_next(); + } else { + release_next(ord); + } + } + + bool is_auto() const { + return m_auto_release; + } + + + protected: + std::map m_sequence; + ordinal_type m_current = std::numeric_limits::min(); + [[no_unique_address]] + Mapper m_map; + [[no_unique_address]] + compare_t m_order; + std::atomic m_active; + bool m_stopped = false; + bool m_auto_release = false; + }; + + // deduction guides: take type of first argument to Mapper as the key type + // TODO: can we use the TTG callable_args instead? + template>>>>> + SequencedKeysConstraint(Mapper&&) + -> SequencedKeysConstraint< + std::decay_t>>, + std::decay_t>, + std::less>>, + std::enable_if_t>>>, Mapper> + >; + + template>>>>> + SequencedKeysConstraint(Mapper&&, bool) + -> SequencedKeysConstraint< + std::decay_t>>, + std::decay_t>, + std::less>>, + std::enable_if_t>>>, Mapper> + >; + + template + SequencedKeysConstraint(SequencedKeysConstraint&&) + -> SequencedKeysConstraint; + + template + SequencedKeysConstraint(const SequencedKeysConstraint&) + -> SequencedKeysConstraint; + + /** + * Make a constraint that can be shared between multiple TT instances. + * Overload for incomplete templated constraint types. + * + * Example: + * // SequencedKeysConstraint is incomplete + * auto c = ttg::make_shared_constraint([](Key& k){ return k[0]; }); + * auto tt_a = ttg::make_tt(...); + * tt_a->add_constraint(c); + * auto tt_b = ttg::make_tt(...); + * tt_b->add_constraint(c); + * + * -> the constraint will handle keys from both tt_a and tt_b. Both TTs must have the same key type. + */ + template typename Constraint, typename... Args> + auto make_shared_constraint(Args&&... args) { + return std::make_shared(args)...))>(std::forward(args)...); + } + + /** + * Make a constraint that can be shared between multiple TT instances. + * Overload for complete constraint types. + */ + template + auto make_shared_constraint(Args&&... args) { + return std::make_shared(std::forward(args)...); + } + + + +} // namespace ttg + +#endif // TTG_CONSTRAINT_H \ No newline at end of file diff --git a/ttg/ttg/madness/ttg.h b/ttg/ttg/madness/ttg.h index 85dee437f..d84f29dcc 100644 --- a/ttg/ttg/madness/ttg.h +++ b/ttg/ttg/madness/ttg.h @@ -1206,6 +1206,23 @@ namespace ttg_madness { priomap = std::forward(pm); } + /// add a constraint + /// the constraint must provide a valid override of `check_key(key)` + template + void add_constraint(Constraint&& c) { + /* currently a noop */ + } + + template + void add_constraint(std::shared_ptr c, Mapper&& map) { + /* currently a noop */ + } + + template + void add_constraint(Constraint c, Mapper&& map) { + /* currently a noop */ + } + /// implementation of TTBase::make_executable() void make_executable() override { TTBase::make_executable(); diff --git a/ttg/ttg/madness/ttvalue.h b/ttg/ttg/madness/ttvalue.h index ad53ee5f8..a171c3c71 100644 --- a/ttg/ttg/madness/ttvalue.h +++ b/ttg/ttg/madness/ttvalue.h @@ -9,6 +9,11 @@ namespace ttg_madness { /* empty */ }; + template + inline auto persistent(ValueT&& value) { + return std::forward(value); + } + } // namespace ttg_madness #endif // TTG_MADNESS_TTVALUE_H diff --git a/ttg/ttg/parsec/task.h b/ttg/ttg/parsec/task.h index 58ee528ad..f29ca8ecb 100644 --- a/ttg/ttg/parsec/task.h +++ b/ttg/ttg/parsec/task.h @@ -202,7 +202,7 @@ namespace ttg_parsec { struct parsec_ttg_task_t : public parsec_ttg_task_base_t { using key_type = typename TT::key_type; static constexpr size_t num_streams = TT::numins; - /* device tasks may have to store more copies than it's inputs as their sends are aggregated */ + /* device tasks may have to store more copies than # of its inputs as their sends are aggregated */ static constexpr size_t num_copies = TT::derived_has_device_op() ? static_cast(MAX_PARAM_COUNT) : (num_streams+1); TT* tt = nullptr; @@ -215,8 +215,9 @@ namespace ttg_parsec { device_state_t dev_state; ttg_data_copy_t *copies[num_copies] = { nullptr }; // the data copies tracked by this task - parsec_ttg_task_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class) - : parsec_ttg_task_base_t(mempool, task_class, num_streams, copies) { + parsec_ttg_task_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class, TT *tt_ptr) + : parsec_ttg_task_base_t(mempool, task_class, num_streams, copies) + , tt(tt_ptr) { tt_ht_item.key = pkey(); this->dev_ptr = this->dev_state.dev_ptr(); // We store the hash of the key and the address where it can be found in locals considered as a scratchpad @@ -282,8 +283,9 @@ namespace ttg_parsec { ttg_data_copy_t *copies[num_streams+1] = { nullptr }; // the data copies tracked by this task // +1 for the copy needed during send/bcast - parsec_ttg_task_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class) - : parsec_ttg_task_base_t(mempool, task_class, num_streams, copies) { + parsec_ttg_task_t(parsec_thread_mempool_t *mempool, parsec_task_class_t *task_class, TT *tt_ptr) + : parsec_ttg_task_base_t(mempool, task_class, num_streams, copies) + , tt(tt_ptr) { tt_ht_item.key = pkey(); this->dev_ptr = this->dev_state.dev_ptr(); } diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index 3009891e1..47ab79068 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -22,6 +22,7 @@ #include "ttg/base/keymap.h" #include "ttg/base/tt.h" #include "ttg/base/world.h" +#include "ttg/constraint.h" #include "ttg/edge.h" #include "ttg/execution.h" #include "ttg/func.h" @@ -1173,6 +1174,7 @@ namespace ttg_parsec { protected: // static std::map function_id_to_instance; parsec_hash_table_t tasks_table; + parsec_hash_table_t task_constraint_table; parsec_task_class_t self; }; @@ -1342,6 +1344,9 @@ namespace ttg_parsec { bool m_defer_writer = TTG_PARSEC_DEFER_WRITER; + std::vector> constraints_check; + std::vector> constraints_complete; + public: ttg::World get_world() const override final { return world; } @@ -1992,7 +1997,7 @@ namespace ttg_parsec { task_t *dummy; parsec_execution_stream_s *es = world.impl().execution_stream(); parsec_thread_mempool_t *mempool = get_task_mempool(); - dummy = new (parsec_thread_mempool_allocate(mempool)) task_t(mempool, &this->self); + dummy = new (parsec_thread_mempool_allocate(mempool)) task_t(mempool, &this->self, this); dummy->set_dummy(true); // TODO: do we need to copy static_stream_goal in dummy? @@ -2568,6 +2573,83 @@ namespace ttg_parsec { } } + bool check_constraints(task_t *task) { + bool constrained = false; + if (constraints_check.size() > 0) { + if constexpr (ttg::meta::is_void_v) { + constrained = !constraints_check[0](); + } else { + constrained = !constraints_check[0](task->key); + } + } + if (constrained) { + // store the task so we can later access it once it is released + parsec_hash_table_insert(&task_constraint_table, &task->tt_ht_item); + } + return !constrained; + } + + template + std::enable_if_t, void> release_constraint(std::size_t cid) { + // check the next constraint, if any + assert(cid < constraints_check.size()); + bool release = true; + for (std::size_t i = cid+1; i < constraints_check.size(); i++) { + if (!constraints_check[i]()) { + release = false; + break; + } + } + if (release) { + // no constraint blocked us + task_t *task; + parsec_key_t hk = 0; + task = (task_t*)parsec_hash_table_remove(&task_constraint_table, hk); + assert(task != nullptr); + auto &world_impl = world.impl(); + parsec_execution_stream_t *es = world_impl.execution_stream(); + parsec_task_t *vp_task_rings[1] = { &task->parsec_task }; + __parsec_schedule_vp(es, vp_task_rings, 0); + } + } + + template + std::enable_if_t, void> release_constraint(std::size_t cid, const std::span& keys) { + assert(cid < constraints_check.size()); + parsec_task_t *task_ring = nullptr; + for (auto& key : keys) { + task_t *task; + bool release = true; + for (std::size_t i = cid+1; i < constraints_check.size(); i++) { + if (!constraints_check[i](key)) { + release = false; + break; + } + } + + if (release) { + // no constraint blocked this task, so go ahead and release + auto hk = reinterpret_cast(&key); + task = (task_t*)parsec_hash_table_remove(&task_constraint_table, hk); + assert(task != nullptr); + if (task_ring == nullptr) { + /* the first task is set directly */ + task_ring = &task->parsec_task; + } else { + /* push into the ring */ + parsec_list_item_ring_push_sorted(&task_ring->super, &task->parsec_task.super, + offsetof(parsec_task_t, priority)); + } + } + } + if (nullptr != task_ring) { + auto &world_impl = world.impl(); + parsec_execution_stream_t *es = world_impl.execution_stream(); + parsec_task_t *vp_task_rings[1] = { task_ring }; + __parsec_schedule_vp(es, vp_task_rings, 0); + } + } + void release_task(task_t *task, parsec_task_t **task_ring = nullptr) { constexpr const bool keyT_is_Void = ttg::meta::is_void_v; @@ -2597,16 +2679,19 @@ namespace ttg_parsec { } } if (task->remove_from_hash) parsec_hash_table_remove(&tasks_table, hk); - if (nullptr == task_ring) { - parsec_task_t *vp_task_rings[1] = { &task->parsec_task }; - __parsec_schedule_vp(es, vp_task_rings, 0); - } else if (*task_ring == nullptr) { - /* the first task is set directly */ - *task_ring = &task->parsec_task; - } else { - /* push into the ring */ - parsec_list_item_ring_push_sorted(&(*task_ring)->super, &task->parsec_task.super, - offsetof(parsec_task_t, priority)); + + if (check_constraints(task)) { + if (nullptr == task_ring) { + parsec_task_t *vp_task_rings[1] = { &task->parsec_task }; + __parsec_schedule_vp(es, vp_task_rings, 0); + } else if (*task_ring == nullptr) { + /* the first task is set directly */ + *task_ring = &task->parsec_task; + } else { + /* push into the ring */ + parsec_list_item_ring_push_sorted(&(*task_ring)->super, &task->parsec_task.super, + offsetof(parsec_task_t, priority)); + } } } else if constexpr (!ttg::meta::is_void_v) { if ((baseobj->num_pullins + count == numins) && baseobj->is_lazy_pull()) { @@ -3766,6 +3851,14 @@ namespace ttg_parsec { detail::release_data_copy(copy); task->copies[i] = nullptr; } + + for (auto& c : task->tt->constraints_complete) { + if constexpr(std::is_void_v) { + c(); + } else { + c(task->key); + } + } return PARSEC_HOOK_RETURN_DONE; } @@ -3920,6 +4013,9 @@ namespace ttg_parsec { parsec_hash_table_init(&tasks_table, offsetof(detail::parsec_ttg_task_base_t, tt_ht_item), 8, tasks_hash_fcts, NULL); + + parsec_hash_table_init(&task_constraint_table, offsetof(detail::parsec_ttg_task_base_t, tt_ht_item), 8, tasks_hash_fcts, + NULL); } template , @@ -4289,6 +4385,57 @@ namespace ttg_parsec { /// @return the device map auto get_devicemap() { return devicemap; } + /// add a shared constraint + /// the constraint must provide a valid override of `check_key(key)` + template + void add_constraint(std::shared_ptr c) { + std::size_t cid = constraints_check.size(); + if constexpr(ttg::meta::is_void_v) { + c->add_listener([this, cid](){ this->release_constraint(cid); }, this); + constraints_check.push_back([c, this](){ return c->check(this); }); + constraints_complete.push_back([c, this](const keyT& key){ c->complete(this); return true; }); + } else { + c->add_listener([this, cid](const std::span& keys){ this->release_constraint(cid, keys); }, this); + constraints_check.push_back([c, this](const keyT& key){ return c->check(key, this); }); + constraints_complete.push_back([c, this](const keyT& key){ c->complete(key, this); return true; }); + } + } + + /// add a constraint + /// the constraint must provide a valid override of `check_key(key)` + template + void add_constraint(Constraint&& c) { + // need to make this a shared_ptr since it's shared between different callbacks + this->add_constraint(std::make_shared(std::forward(c))); + } + + /// add a shared constraint + /// the constraint must provide a valid override of `check_key(key, map(key))` + /// ths overload can be used to provide different key mapping functions for each TT + template + void add_constraint(std::shared_ptr c, Mapper&& map) { + static_assert(std::is_same_v); + std::size_t cid = constraints_check.size(); + if constexpr(ttg::meta::is_void_v) { + c->add_listener([this, cid](){ this->release_constraint(cid); }, this); + constraints_check.push_back([map, c, this](){ return c->check(map(), this); }); + constraints_complete.push_back([map, c, this](){ c->complete(map(), this); return true; }); + } else { + c->add_listener([this, cid](const std::span& keys){ this->release_constraint(cid, keys); }, this); + constraints_check.push_back([map, c, this](const keyT& key){ return c->check(key, map(key), this); }); + constraints_complete.push_back([map, c, this](const keyT& key){ c->complete(key, map(key), this); return true; }); + } + } + + /// add a shared constraint + /// the constraint must provide a valid override of `check_key(key, map(key))` + /// ths overload can be used to provide different key mapping functions for each TT + template + void add_constraint(Constraint c, Mapper&& map) { + // need to make this a shared_ptr since it's shared between different callbacks + this->add_constraint(std::make_shared(std::forward(c)), std::forward(map)); + } + // Register the static_op function to associate it to instance_id void register_static_op_function(void) { int rank; @@ -4425,8 +4572,13 @@ struct ttg::detail::value_copy_handler { bool inserted = ttg_parsec::detail::add_copy_to_task(copy, caller); assert(inserted); copy_to_remove = copy; // we want to remove the copy from the task once done sending - do_release = false; // we don't release the copy since we didn't allocate it + do_release = true; // we don't release the copy since we didn't allocate it copy->add_ref(); // add a reference so that TTG does not attempt to delete this object + copy->add_ref(); // add another reference so that TTG never attempts to free this copy + if (copy->num_readers() == 0) { + /* add at least one reader (the current task) */ + copy->increment_readers(); + } } return vref.value_ref; } diff --git a/ttg/ttg/parsec/ttg_data_copy.h b/ttg/ttg/parsec/ttg_data_copy.h index 351bc7a2c..fbd58c64e 100644 --- a/ttg/ttg/parsec/ttg_data_copy.h +++ b/ttg/ttg/parsec/ttg_data_copy.h @@ -572,15 +572,12 @@ namespace ttg_parsec { inline ttg_parsec_data_wrapper_t& ttg_parsec_data_wrapper_t::operator=(ttg_parsec_data_wrapper_t&& other) { m_data = std::move(other.m_data); - /* check whether the owning ttg_data_copy has already moved us */ - if (other.m_ttg_copy != m_ttg_copy) { - /* remove from old ttg copy */ - other.remove_from_owner(); - - if (nullptr != m_ttg_copy) { - /* register with the new ttg_copy */ - m_ttg_copy->add_device_data(this); - } + /* remove from old ttg copy */ + other.remove_from_owner(); + + if (nullptr != m_ttg_copy) { + /* register with the new ttg_copy */ + m_ttg_copy->add_device_data(this); } return *this; } diff --git a/ttg/ttg/parsec/ttvalue.h b/ttg/ttg/parsec/ttvalue.h index b93f1687f..b5b6aa982 100644 --- a/ttg/ttg/parsec/ttvalue.h +++ b/ttg/ttg/parsec/ttvalue.h @@ -91,9 +91,11 @@ namespace ttg_parsec { template inline auto persistent(ValueT&& value) { - static_assert(std::is_base_of_v>, std::decay_t>, - "ttg::persistent can only be used on types derived from ttg::TTValue"); - return detail::persistent_value_ref{value}; + if constexpr (std::is_base_of_v>, std::decay_t>) { + return detail::persistent_value_ref{value}; + } else { + return std::forward(value); + } } } // namespace ttg_parsec diff --git a/ttg/ttg/util/meta.h b/ttg/ttg/util/meta.h index f3af03152..c595369b0 100644 --- a/ttg/ttg/util/meta.h +++ b/ttg/ttg/util/meta.h @@ -912,6 +912,21 @@ namespace ttg { template using prepare_send_callback_t = typename prepare_send_callback::type; + template + struct constraint_callback; + + template + struct constraint_callback>> { + using type = std::function; + }; + + template + struct constraint_callback>> { + using type = std::function; + }; + + template + using constraint_callback_t = typename constraint_callback::type; } // namespace detail