Commit

Merge pull request #273 from devreal/spmm_constraints
Implement simple sequenced keys constraint
evaleev authored Nov 15, 2024
2 parents 982975a + 7048ab5 commit 6f7c82d
Showing 25 changed files with 1,412 additions and 2,346 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/cmake.yml
@@ -88,7 +88,7 @@ jobs:
# and build directories, but this is only available with CMake 3.13 and higher.
# The CMake binaries on the GitHub Actions machines are (as of this writing) 3.12
run: |
cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE $BUILD_CONFIG || (cat CMakeFiles/CMakeOutput.log && cat CMakeFiles/CMakeError.log)
cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE $BUILD_CONFIG || (cat CMakeFiles/CMakeConfigureLog.yaml)
- name: Build
working-directory: ${{github.workspace}}/build
@@ -113,11 +113,11 @@ jobs:
working-directory: ${{github.workspace}}/build
shell: bash
run: |
cmake -S $GITHUB_WORKSPACE/doc/dox/dev/devsamp/helloworld -B test_install_devsamp_helloworld -DCMAKE_PREFIX_PATH=${{github.workspace}}/install || (cat test_install_devsamp_helloworld/CMakeFiles/CMakeOutput.log && cat test_install_devsamp_helloworld/CMakeFiles/CMakeError.log)
cmake -S $GITHUB_WORKSPACE/doc/dox/dev/devsamp/helloworld -B test_install_devsamp_helloworld -DCMAKE_PREFIX_PATH=${{github.workspace}}/install || (cat /home/runner/work/ttg/ttg/install/lib/cmake/ttg/ttg-config.cmake && cat test_install_devsamp_helloworld/CMakeFiles/CMakeConfigureLog.yaml)
cmake --build test_install_devsamp_helloworld
$MPIEXEC -n 2 test_install_devsamp_helloworld/helloworld-parsec
$MPIEXEC -n 2 test_install_devsamp_helloworld/helloworld-mad
cmake -S $GITHUB_WORKSPACE/doc/dox/dev/devsamp/fibonacci -B test_install_devsamp_fibonacci -DCMAKE_PREFIX_PATH=${{github.workspace}}/install || (cat test_install_devsamp_fibonacci/CMakeFiles/CMakeOutput.log && cat test_install_devsamp_fibonacci/CMakeFiles/CMakeError.log)
cmake -S $GITHUB_WORKSPACE/doc/dox/dev/devsamp/fibonacci -B test_install_devsamp_fibonacci -DCMAKE_PREFIX_PATH=${{github.workspace}}/install || (cat /home/runner/work/ttg/ttg/install/lib/cmake/ttg/ttg-config.cmake && cat test_install_devsamp_fibonacci/CMakeFiles/CMakeConfigureLog.yaml)
cmake --build test_install_devsamp_fibonacci
$MPIEXEC -n 2 test_install_devsamp_fibonacci/fibonacci-parsec
cmake -E make_directory test_install_userexamples
@@ -130,7 +130,7 @@ jobs:
add_ttg_executable(iterative $GITHUB_WORKSPACE/doc/dox/user/examples/iterative.cc NOT_EXCLUDE_FROM_ALL)
add_ttg_executable(distributed $GITHUB_WORKSPACE/doc/dox/user/examples/distributed.cc NOT_EXCLUDE_FROM_ALL)
EOF
cmake -S test_install_userexamples -B test_install_userexamples/build -DCMAKE_PREFIX_PATH=${{github.workspace}}/install || (cat test_install_userexamples/CMakeFiles/CMakeOutput.log && cat test_install_userexamples/CMakeFiles/CMakeError.log)
cmake -S test_install_userexamples -B test_install_userexamples/build -DCMAKE_PREFIX_PATH=${{github.workspace}}/install || (cat /home/runner/work/ttg/ttg/install/lib/cmake/ttg/ttg-config.cmake && cat test_install_userexamples/CMakeFiles/CMakeConfigureLog.yaml)
cmake --build test_install_userexamples/build
- name: Build+Deploy Dox
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -60,7 +60,7 @@ option(TTG_EXAMPLES "Whether to build examples" OFF)
option(TTG_ENABLE_ASAN "Whether to enable address sanitizer" OFF)

option(TTG_ENABLE_COROUTINES "Whether to enable C++ coroutines, needed for accelerator device support" ON)
option(TTG_FETCH_BOOST "Whether to fetch+build Boost, if missing" OFF)
option(TTG_FETCH_BOOST "Whether to fetch+build Boost, if missing" ON)
option(TTG_IGNORE_BUNDLED_EXTERNALS "Whether to skip installation and use of bundled external dependencies (Boost.CallableTraits)" OFF)
option(TTG_ENABLE_TRACE "Whether to enable ttg::trace() output" OFF)
# See https://medium.com/@alasher/colored-c-compiler-output-with-ninja-clang-gcc-10bfe7f2b949
22 changes: 11 additions & 11 deletions INSTALL.md
@@ -63,14 +63,14 @@ TTG includes several examples that may require additional prerequisites. These a

## useful cmake cache variables:

| Variable |Default | Description |
|--------------------------------------|--------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `TTG_ENABLE_CUDA` | `OFF` | whether to enable CUDA device support |
| `TTG_ENABLE_HIP` | `OFF` | whether to enable HIP/ROCm device support |
| `TTG_ENABLE_LEVEL_ZERO` | `OFF` | whether to enable Intel oneAPI Level Zero device support |
| `BUILD_TESTING` | `ON` | whether target `check-ttg` and its relatives will actually build and run unit tests |
| `TTG_EXAMPLES` | `OFF` | whether target `check-ttg` and its relatives will actually build and run examples; setting this to `ON` will cause detection of several optional prerequisites, and (if missing) building from source |
| `TTG_ENABLE_TRACE` | `OFF` | setting this to `ON` will enable the ability to instrument TTG code for tracing (see `ttg::trace()`, etc.); if this is set to `OFF`, `ttg::trace()` is a no-op |
| `TTG_PARSEC_USE_BOOST_SERIALIZATION` | `OFF` | whether to use Boost.Serialization for serialization for the PaRSEC backend; if this is set to `OFF`, PaRSEC backend will only be able to use trivially-copyable data types or, if MADNESS backend is available, MADNESS-serializable types. |
| `TTG_FETCH_BOOST` | `OFF` | whether to download and build Boost automatically, if missing |
| `TTG_IGNORE_BUNDLED_EXTERNALS` | `OFF` | whether to install and use bundled external dependencies (currently, only Boost.CallableTraits) |
| Variable | Default | Description |
|--------------------------------------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `TTG_ENABLE_CUDA` | `OFF` | whether to enable CUDA device support |
| `TTG_ENABLE_HIP` | `OFF` | whether to enable HIP/ROCm device support |
| `TTG_ENABLE_LEVEL_ZERO` | `OFF` | whether to enable Intel oneAPI Level Zero device support |
| `BUILD_TESTING` | `ON` | whether target `check-ttg` and its relatives will actually build and run unit tests |
| `TTG_EXAMPLES` | `OFF` | whether target `check-ttg` and its relatives will actually build and run examples; setting this to `ON` will cause detection of several optional prerequisites, and (if missing) building from source |
| `TTG_ENABLE_TRACE` | `OFF` | setting this to `ON` will enable the ability to instrument TTG code for tracing (see `ttg::trace()`, etc.); if this is set to `OFF`, `ttg::trace()` is a no-op |
| `TTG_PARSEC_USE_BOOST_SERIALIZATION` | `OFF` | whether to use Boost.Serialization for serialization for the PaRSEC backend; if this is set to `OFF`, PaRSEC backend will only be able to use trivially-copyable data types or, if MADNESS backend is available, MADNESS-serializable types. |
| `TTG_FETCH_BOOST` | `ON` | whether to download and build Boost automatically, if missing |
| `TTG_IGNORE_BUNDLED_EXTERNALS` | `OFF` | whether to install and use bundled external dependencies (currently, only Boost.CallableTraits) |
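
For illustration, these cache variables are passed to CMake at configure time via `-D`. A minimal sketch, with placeholder source/build paths and assuming an existing Boost installation discoverable by CMake:

    # configure TTG with examples and tracing enabled, but without auto-fetching Boost
    cmake -S /path/to/ttg -B build \
      -DTTG_EXAMPLES=ON \
      -DTTG_ENABLE_TRACE=ON \
      -DTTG_FETCH_BOOST=OFF
    # BUILD_TESTING defaults to ON, so the check-ttg target builds and runs the unit tests
    cmake --build build --target check-ttg
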
15 changes: 10 additions & 5 deletions cmake/modules/ExternalDependenciesVersions.cmake
@@ -1,12 +1,17 @@
# for each dependency track both current and previous id (the variable for the latter must contain PREVIOUS)
# to be able to auto-update them

set(TTG_TRACKED_VG_CMAKE_KIT_TAG 092efee765e039b02e0a9aaf013c12fc3c4e89cf) # used to provide "real" FindOrFetchBoost
set(TTG_TRACKED_VG_CMAKE_KIT_TAG d1b34157c349cf0a7c2f149b7704a682d53f6486) # provides FindOrFetchLinalgPP and "real" FindOrFetchBoost
set(TTG_TRACKED_CATCH2_VERSION 3.5.0)
set(TTG_TRACKED_MADNESS_TAG 2eb3bcf0138127ee2dbc651f1aabd3e9b0def4e3)
set(TTG_TRACKED_MADNESS_TAG 93a9a5cec2a8fa87fba3afe8056607e6062a9058)
set(TTG_TRACKED_PARSEC_TAG 58f8f3089ecad2e8ee50e80a9586e05ce8873b1c)
set(TTG_TRACKED_BTAS_TAG 4e8f5233aa7881dccdfcc37ce07128833926d3c2)
set(TTG_TRACKED_TILEDARRAY_TAG 493c109379a1b64ddd5ef59f7e33b95633b68d73)
set(TTG_TRACKED_BTAS_TAG c25b0a11d2a76190bfb13fa72f9e9dc3e57c3c2f)
set(TTG_TRACKED_TILEDARRAY_TAG 5944bdba3266a3fa19f1809c8e2accf3dad4d815)

# need Boost.CallableTraits (header only, part of Boost 1.66 released in Dec 2017) for wrap.h to work
set(TTG_OLDEST_BOOST_VERSION 1.66)
# BUT if we will be building examples, inherit the oldest version from the pickiest Boost consumer (TA and/or BSPMM)
if (TTG_EXAMPLES)
set(TTG_OLDEST_BOOST_VERSION 1.81)
else()
set(TTG_OLDEST_BOOST_VERSION 1.66)
endif()
6 changes: 3 additions & 3 deletions examples/CMakeLists.txt
@@ -20,7 +20,7 @@ if (TARGET tiledarray)
add_ttg_executable(testing_dpoinv potrf/testing_dpoinv.cc LINK_LIBRARIES tiledarray lapackpp)

if (TARGET CUDA::cublas)
add_ttg_executable(bspmm-cuda spmm/spmm_cuda.cc
add_ttg_executable(bspmm-cuda spmm/spmm.cc
LINK_LIBRARIES tiledarray TiledArray_Eigen BTAS CUDA::cublas
COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2;TTG_ENABLE_CUDA=1
RUNTIMES "parsec")
@@ -32,7 +32,7 @@ if (TARGET tiledarray)
RUNTIMES "parsec")
endif(TARGET CUDA::cusolver)
elseif (TARGET roc::hipblas)
add_ttg_executable(bspmm-hip spmm/spmm_cuda.cc
add_ttg_executable(bspmm-hip spmm/spmm.cc
LINK_LIBRARIES tiledarray TiledArray_Eigen roc::hipblas
COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2;TTG_ENABLE_HIP=1
RUNTIMES "parsec")
@@ -43,7 +43,7 @@ if (TARGET tiledarray)
RUNTIMES "parsec")
endif(TARGET roc::hipsolver)
elseif (TARGET MKL::MKL_DPCPP)
add_ttg_executable(bspmm-lz spmm/spmm_cuda.cc
add_ttg_executable(bspmm-lz spmm/spmm.cc
LINK_LIBRARIES tiledarray TiledArray_Eigen BTAS MKL::MKL_DPCPP level_zero::ze_loader m
COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2;TTG_ENABLE_LEVEL_ZERO=1
RUNTIMES "parsec")
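
As an aside, with the device variants now built from the shared spmm/spmm.cc source, each bspmm executable is an ordinary CMake target. A hedged sketch, assuming a configured build tree in ./build and that add_ttg_executable names the target after its first argument (a runtime suffix such as -parsec may be appended):

    # build only the CUDA block-sparse GEMM example
    cmake --build build --target bspmm-cuda
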
30 changes: 17 additions & 13 deletions examples/matrixtile.h
@@ -10,37 +10,41 @@
#include <ttg/serialization/splitmd_data_descriptor.h>


#include <TiledArray/device/allocators.h>
#include <TiledArray/external/device.h>
#if defined(TILEDARRAY_HAS_DEVICE)
#define ALLOCATOR TiledArray::device_pinned_allocator<T>
template<typename T>
using Allocator = TiledArray::device_pinned_allocator<T>;

inline void allocator_init(int argc, char **argv) {
// initialize MADNESS so that TA allocators can be created
#if defined(TTG_PARSEC_IMPORTED)
madness::ParsecRuntime::initialize_with_existing_context(ttg::default_execution_context().impl().context());
#endif // TTG_PARSEC_IMPORTED
madness::initialize(argc, argv, /* nthread = */ 1, /* quiet = */ true);
#endif // TTG_PARSEC_IMPORTED
}

inline void allocator_fini() {
#if defined(TTG_PARSEC_IMPORTED)
madness::finalize();
#endif // TTG_PARSEC_IMPORTED
}
#else // TILEDARRAY_HAS_DEVICE
#define ALLOCATOR std::allocator<T>
template<typename T>
using Allocator = std::allocator<T>;

inline void allocator_init(int argc, char **argv) { }

inline void allocator_fini() { }

#endif // TILEDARRAY_HAS_DEVICE

template <typename T, class Allocator = ALLOCATOR>
class MatrixTile : public ttg::TTValue<MatrixTile<T, Allocator>> {
template <typename T, class AllocatorT = Allocator<T>>
class MatrixTile : public ttg::TTValue<MatrixTile<T, AllocatorT>> {
public:
using metadata_t = typename std::tuple<std::size_t, std::size_t, std::size_t>;

using buffer_t = typename ttg::Buffer<T, Allocator>;
using ttvalue_type = ttg::TTValue<MatrixTile<T, Allocator>>;
using buffer_t = typename ttg::Buffer<T, AllocatorT>;
using ttvalue_type = ttg::TTValue<MatrixTile<T, AllocatorT>>;

private:
buffer_t _buffer;
@@ -87,15 +91,15 @@ class MatrixTile : public ttg::TTValue<MatrixTile<T, Allocator>> {
, _lda(lda)
{ }

MatrixTile(MatrixTile<T, Allocator>&& other) = default;
MatrixTile(MatrixTile<T, AllocatorT>&& other) = default;

MatrixTile& operator=(MatrixTile<T, Allocator>&& other) = default;
MatrixTile& operator=(MatrixTile<T, AllocatorT>&& other) = default;

/* Deep copy ctor and op are not needed for PO since tiles will never be read
* and written concurrently. Hence shallow copies are enough, with all
* receiving tasks sharing tile data. Re-enable this once the PaRSEC backend
* can handle data sharing without excessive copying */
MatrixTile(const MatrixTile<T, Allocator>& other)
MatrixTile(const MatrixTile<T, AllocatorT>& other)
: ttvalue_type()
, _buffer(other._lda*other._cols)
, _rows(other._rows)
@@ -108,7 +112,7 @@ class MatrixTile : public ttg::TTValue<MatrixTile<T, Allocator>> {
std::copy_n(other.data(), _lda * _cols, this->data());
}

MatrixTile& operator=(const MatrixTile<T, Allocator>& other) {
MatrixTile& operator=(const MatrixTile<T, AllocatorT>& other) {
this->_rows = other._rows;
this->_cols = other._cols;
this->_lda = other._lda;
@@ -166,7 +170,7 @@ class MatrixTile : public ttg::TTValue<MatrixTile<T, Allocator>> {
}
#endif // DEBUG_TILES_VALUES

friend std::ostream& operator<<(std::ostream& o, MatrixTile<T> const& tt) {
friend std::ostream& operator<<(std::ostream& o, MatrixTile<T, AllocatorT> const& tt) {
auto ptr = tt.data();
o << std::endl << " ";
o << "MatrixTile<" << typeid(T).name() << ">{ rows=" << tt.rows() << " cols=" << tt.cols() << " ld=" << tt.lda();
21 changes: 15 additions & 6 deletions examples/potrf/potrf.h
@@ -95,7 +95,7 @@ namespace potrf {
ttg::Edge<Key2, MatrixTile<typename MatrixT::element_type>>& output_result) {
using T = typename MatrixT::element_type;
#if defined(ENABLE_DEVICE_KERNEL)
auto iallocator = std::make_shared<TiledArray::device_pinned_allocator<int>>();
auto iallocator = std::make_shared<Allocator<int>>();
//std::cout << "Creating CUDA POTRF task " << std::endl;
auto f_dev = [=, iallocator = std::move(iallocator)]
(const Key1& key, MatrixTile<T>&& tile_kk,
@@ -669,7 +669,8 @@ namespace potrf {

template <typename MatrixT>
auto make_potrf_ttg(MatrixT& A, ttg::Edge<Key2, MatrixTile<typename MatrixT::element_type>>& input,
ttg::Edge<Key2, MatrixTile<typename MatrixT::element_type>>& output, bool defer_write) {
ttg::Edge<Key2, MatrixTile<typename MatrixT::element_type>>& output, bool defer_write,
bool enable_device_map = true) {
using T = typename MatrixT::element_type;
auto keymap1 = [&](const Key1& key) { return A.rank_of(key[0], key[0]); };

@@ -705,28 +706,36 @@ namespace potrf {
tt_potrf->set_keymap(keymap1);
tt_potrf->set_defer_writer(defer_write);
#ifdef ENABLE_DEVICE_KERNEL
tt_potrf->set_devicemap(devmap1);
if (enable_device_map) {
tt_potrf->set_devicemap(devmap1);
}
#endif // ENABLE_DEVICE_KERNEL

auto tt_trsm = make_trsm(A, disp_trsm, potrf_trsm, gemm_trsm, trsm_syrk, trsm_gemm_row, trsm_gemm_col, output);
tt_trsm->set_keymap(keymap2a);
tt_trsm->set_defer_writer(defer_write);
#ifdef ENABLE_DEVICE_KERNEL
tt_trsm->set_devicemap(devmap2a);
if (enable_device_map) {
tt_trsm->set_devicemap(devmap2a);
}
#endif // ENABLE_DEVICE_KERNEL

auto tt_syrk = make_syrk(A, disp_syrk, trsm_syrk, syrk_syrk, syrk_potrf, syrk_syrk);
tt_syrk->set_keymap(keymap2b);
tt_syrk->set_defer_writer(defer_write);
#ifdef ENABLE_DEVICE_KERNEL
tt_syrk->set_devicemap(devmap2b);
if (enable_device_map) {
tt_syrk->set_devicemap(devmap2b);
}
#endif // ENABLE_DEVICE_KERNEL

auto tt_gemm = make_gemm(A, disp_gemm, trsm_gemm_row, trsm_gemm_col, gemm_gemm, gemm_trsm, gemm_gemm);
tt_gemm->set_keymap(keymap3);
tt_gemm->set_defer_writer(defer_write);
#ifdef ENABLE_DEVICE_KERNEL
tt_gemm->set_devicemap(devmap3);
if (enable_device_map) {
tt_gemm->set_devicemap(devmap3);
}
#endif // ENABLE_DEVICE_KERNEL

/* Priorities taken from DPLASMA */
5 changes: 4 additions & 1 deletion examples/potrf/testing_dpotrf.cc
@@ -61,6 +61,9 @@ int main(int argc, char **argv)
bool check = !cmdOptionExists(argv+1, argv+argc, "-x");
bool cow_hint = !cmdOptionExists(argv+1, argv+argc, "-w");

/* whether we set a device mapping */
bool enable_device_map = !cmdOptionExists(argv, argv+argc, "--default-device-map");

// TODO: need to filter out our arguments to make parsec happy
ttg::initialize(1, argv, nthreads);

@@ -130,7 +133,7 @@ int main(int argc, char **argv)
init_tt->set_keymap([&]() {return world.rank();});

auto plgsy_ttg = make_plgsy_ttg(A, N, random_seed, startup, topotrf, cow_hint);
auto potrf_ttg = potrf::make_potrf_ttg(A, topotrf, result, cow_hint);
auto potrf_ttg = potrf::make_potrf_ttg(A, topotrf, result, cow_hint, enable_device_map);
auto result_ttg = make_result_ttg(A, result, cow_hint);

auto connected = make_graph_executable(init_tt.get());
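
For context, a usage sketch of the new flag; the binary name and mpiexec invocation are illustrative (a runtime suffix such as -parsec may be appended, and other program options are omitted):

    # default: the potrf/trsm/syrk/gemm TTs install their custom device maps
    mpiexec -n 2 ./testing_dpotrf
    # with --default-device-map, set_devicemap() is skipped and the runtime's
    # default device assignment is used instead
    mpiexec -n 2 ./testing_dpotrf --default-device-map
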
(diffs for the remaining changed files are not shown)
