From e07140e648f8f7becee6cc8b077ed5bdbdba0a83 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 14 Jul 2023 00:01:25 -0400 Subject: [PATCH] removed cublas v1 + SPMM tiles use pinned allocator ... does not compile yet --- examples/CMakeLists.txt | 2 +- examples/spmm/cuda_gemm.cc | 29 ---------------------------- examples/spmm/cuda_gemm.h | 24 ----------------------- examples/spmm/spmm.cc | 5 +++-- ttg/ttg/device/cublas_helper.cpp | 33 ++++++++++++++++++++------------ ttg/ttg/device/cublas_helper.h | 3 ++- ttg/ttg/parsec/ttg.h | 4 ++-- 7 files changed, 29 insertions(+), 71 deletions(-) delete mode 100644 examples/spmm/cuda_gemm.cc delete mode 100644 examples/spmm/cuda_gemm.h diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index d72712b81..8750204a9 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -16,7 +16,7 @@ if (TARGET eigen3) if (NOT TARGET tiledarray) message(FATAL_ERROR "TiledArray is not found; it is required for CUDA-based block-sparse SPMM") endif() - add_ttg_executable(bspmm-cuda spmm/spmm_cuda.cc spmm/cuda_gemm.cc + add_ttg_executable(bspmm-cuda spmm/spmm_cuda.cc LINK_LIBRARIES tiledarray eigen3 BTAS Boost::boost CUDA::cublas COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2 RUNTIMES "parsec") diff --git a/examples/spmm/cuda_gemm.cc b/examples/spmm/cuda_gemm.cc deleted file mode 100644 index 68987f64f..000000000 --- a/examples/spmm/cuda_gemm.cc +++ /dev/null @@ -1,29 +0,0 @@ - -#include - - -void my_cublas_init_because_cublas_is_stupid() { - cublasInit(); -} - -void my_cublas_shutdown_because_cublas_is_stupid() { - cublasShutdown(); -} - -void my_cublas_dgemm_because_cublas_is_stupid( - char transa, - char transb, - int m, - int n, - int k, - double alpha, - const double* A, - int lda, - const double* B, - int ldb, - double beta, - double* C, - int ldc) -{ - cublasDgemm(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); -} diff --git a/examples/spmm/cuda_gemm.h b/examples/spmm/cuda_gemm.h deleted file mode 100644 index cc6aba6c8..000000000 --- a/examples/spmm/cuda_gemm.h +++ /dev/null @@ -1,24 +0,0 @@ - - -/* We need wrap cublas v1 functions because some header (blaspp) - * includes cublas_v2 and cublas is braindead and overwrites the v1 - * signatures. What a clusterfuck. */ - -void my_cublas_init_because_cublas_is_stupid(); - -void my_cublas_shutdown_because_cublas_is_stupid(); - -void my_cublas_dgemm_because_cublas_is_stupid( - char transa, - char transb, - int m, - int n, - int k, - double alpha, - const double* A, - int lda, - const double* B, - int ldb, - double beta, - double* C, - int ldc); \ No newline at end of file diff --git a/examples/spmm/spmm.cc b/examples/spmm/spmm.cc index 4403c1012..1610477b7 100644 --- a/examples/spmm/spmm.cc +++ b/examples/spmm/spmm.cc @@ -14,8 +14,9 @@ #include #ifdef BTAS_IS_USABLE #include -#include #include +#include +#include #else #warning "found btas/features.h but Boost.Iterators is missing, hence BTAS is unusable ... add -I/path/to/boost" #endif @@ -40,7 +41,7 @@ using namespace ttg; #include "ttg/util/bug.h" #if defined(BLOCK_SPARSE_GEMM) && defined(BTAS_IS_USABLE) -using blk_t = btas::Tensor, btas::Handle::shared_ptr>>; +using blk_t = btas::Tensor>, btas::Handle::shared_ptr>>; #if defined(TTG_USE_PARSEC) namespace ttg { diff --git a/ttg/ttg/device/cublas_helper.cpp b/ttg/ttg/device/cublas_helper.cpp index 0f77c2a5f..f86eda05d 100644 --- a/ttg/ttg/device/cublas_helper.cpp +++ b/ttg/ttg/device/cublas_helper.cpp @@ -1,23 +1,32 @@ -#ifdef TTG_HAVE_CUDART -#include -#endif // TTG_HAVE_CUDART - -#include -#include - #include "ttg/config.h" #include "ttg/device/cublas_helper.h" +#include +#include +#include + namespace ttg::detail { -/* shim wrapper to work around the fact that cublas - * deliberately breaks its API depending on the order - * in which header are included */ +#ifdef TTG_HAVE_CUDART +/// \brief Returns the cuBLAS handle to be used for launching cuBLAS kernels from the current thread +/// \return the cuBLAS handle for the current thread +inline const cublasHandle_t& cublas_get_handle() { + static thread_local std::optional handle; + if (!handle.has_value()) { + auto status = cublasCreate_v2(&handle.emplace()); + if (CUBLAS_STATUS_SUCCESS != status) { + throw std::runtime_error("cublasCreate_v2 failed"); + } + } + return *handle; +} +#endif // TTG_HAVE_CUDART + void cublas_set_kernel_stream(cudaStream_t stream) { #ifdef TTG_HAVE_CUDART - cublasStatus_t status = cublasSetKernelStream(stream); + cublasStatus_t status = cublasSetStream_v2(cublas_get_handle(), stream); if (CUBLAS_STATUS_SUCCESS != status) { - throw std::runtime_error("cublasSetKernelStream failed"); + throw std::runtime_error("cublasSetStream_v2 failed"); } #else throw std::runtime_error("Support for cublas missing during installation!"); diff --git a/ttg/ttg/device/cublas_helper.h b/ttg/ttg/device/cublas_helper.h index 51c13601c..5aae3df9b 100644 --- a/ttg/ttg/device/cublas_helper.h +++ b/ttg/ttg/device/cublas_helper.h @@ -4,10 +4,11 @@ #include "ttg/config.h" #ifdef TTG_HAVE_CUDART -#include +#include namespace ttg::detail { +/// \brief Returns the current CUDA stream used by cuBLAS void cublas_set_kernel_stream(cudaStream_t stream); } // namespace ttg::detail diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index 2d40b7064..59575f13b 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -1362,8 +1362,8 @@ namespace ttg_parsec { /* TODO: is this the right place to set the mask? */ task->parsec_task.chore_mask = PARSEC_DEV_ALL; /* get a device and come back if we need another one */ - int64_t task_load = 1; - dev_index = parsec_get_best_device(parsec_task, &task_load); + double task_load = 1.; + dev_index = parsec_get_best_device(parsec_task, task_load); assert(dev_index >= 0); if (dev_index < 2) { return PARSEC_HOOK_RETURN_NEXT; /* Fall back */