From e07140e648f8f7becee6cc8b077ed5bdbdba0a83 Mon Sep 17 00:00:00 2001
From: Eduard Valeyev <eduard@valeyev.net>
Date: Fri, 14 Jul 2023 00:01:25 -0400
Subject: [PATCH] removed cublas v1 + SPMM tiles use pinned allocator ... does
 not compile yet

---
 examples/CMakeLists.txt          |  2 +-
 examples/spmm/cuda_gemm.cc       | 29 ----------------------------
 examples/spmm/cuda_gemm.h        | 24 -----------------------
 examples/spmm/spmm.cc            |  5 +++--
 ttg/ttg/device/cublas_helper.cpp | 33 ++++++++++++++++++++------------
 ttg/ttg/device/cublas_helper.h   |  3 ++-
 ttg/ttg/parsec/ttg.h             |  4 ++--
 7 files changed, 29 insertions(+), 71 deletions(-)
 delete mode 100644 examples/spmm/cuda_gemm.cc
 delete mode 100644 examples/spmm/cuda_gemm.h
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index d72712b81..8750204a9 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -16,7 +16,7 @@ if (TARGET eigen3)
           if (NOT TARGET tiledarray)
               message(FATAL_ERROR "TiledArray is not found; it is required for CUDA-based block-sparse SPMM")
           endif()
-          add_ttg_executable(bspmm-cuda spmm/spmm_cuda.cc spmm/cuda_gemm.cc
+          add_ttg_executable(bspmm-cuda spmm/spmm_cuda.cc
                              LINK_LIBRARIES tiledarray eigen3 BTAS Boost::boost CUDA::cublas
                              COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2
                              RUNTIMES "parsec")
diff --git a/examples/spmm/cuda_gemm.cc b/examples/spmm/cuda_gemm.cc
deleted file mode 100644
index 68987f64f..000000000
--- a/examples/spmm/cuda_gemm.cc
+++ /dev/null
@@ -1,29 +0,0 @@
-
-#include <cublas.h>
-
-
-void my_cublas_init_because_cublas_is_stupid() {
-    cublasInit();
-}
-
-void my_cublas_shutdown_because_cublas_is_stupid() {
-    cublasShutdown();
-}
-
-void my_cublas_dgemm_because_cublas_is_stupid(
-    char transa,
-    char transb,
-    int m,
-    int n,
-    int k,
-    double alpha,
-    const double* A,
-    int lda,
-    const double* B,
-    int ldb,
-    double beta,
-    double* C,
-    int ldc)
-{
-  cublasDgemm(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-}
diff --git a/examples/spmm/cuda_gemm.h b/examples/spmm/cuda_gemm.h
deleted file mode 100644
index cc6aba6c8..000000000
--- a/examples/spmm/cuda_gemm.h
+++ /dev/null
@@ -1,24 +0,0 @@
-
-
-/* We need wrap cublas v1 functions because some header (blaspp)
- * includes cublas_v2 and cublas is braindead and overwrites the v1
- * signatures. What a clusterfuck. */
-
-void my_cublas_init_because_cublas_is_stupid();
-
-void my_cublas_shutdown_because_cublas_is_stupid();
-
-void my_cublas_dgemm_because_cublas_is_stupid(
-    char transa,
-    char transb,
-    int m,
-    int n,
-    int k,
-    double alpha,
-    const double* A,
-    int lda,
-    const double* B,
-    int ldb,
-    double beta,
-    double* C,
-    int ldc);
\ No newline at end of file
diff --git a/examples/spmm/spmm.cc b/examples/spmm/spmm.cc
index 4403c1012..1610477b7 100644
--- a/examples/spmm/spmm.cc
+++ b/examples/spmm/spmm.cc
@@ -14,8 +14,9 @@
 #include <btas/features.h>
 #ifdef BTAS_IS_USABLE
 #include <btas/btas.h>
-#include <btas/optimize/contract.h>
 #include <btas/util/mohndle.h>
+#include <btas/optimize/contract.h>
+#include <TiledArray/cuda/allocators.h>
 #else
 #warning "found btas/features.h but Boost.Iterators is missing, hence BTAS is unusable ... add -I/path/to/boost"
 #endif
@@ -40,7 +41,7 @@ using namespace ttg;
 #include "ttg/util/bug.h"
 
 #if defined(BLOCK_SPARSE_GEMM) && defined(BTAS_IS_USABLE)
-using blk_t = btas::Tensor<double, btas::DEFAULT::range, btas::mohndle<btas::varray<double>, btas::Handle::shared_ptr>>;
+using blk_t = btas::Tensor<double, btas::DEFAULT::range, btas::mohndle<btas::varray<double, TiledArray::cuda_pinned_allocator<double>>, btas::Handle::shared_ptr>>;
 
 #if defined(TTG_USE_PARSEC)
 namespace ttg {
diff --git a/ttg/ttg/device/cublas_helper.cpp b/ttg/ttg/device/cublas_helper.cpp
index 0f77c2a5f..f86eda05d 100644
--- a/ttg/ttg/device/cublas_helper.cpp
+++ b/ttg/ttg/device/cublas_helper.cpp
@@ -1,23 +1,32 @@
-#ifdef TTG_HAVE_CUDART
-#include <cublas.h>
-#endif // TTG_HAVE_CUDART
-
-#include <exception>
-#include <stdexcept>
-
 #include "ttg/config.h"
 #include "ttg/device/cublas_helper.h"
 
+#include <memory>
+#include <stdexcept>
+#include <optional>
+
 namespace ttg::detail {
 
-/* shim wrapper to work around the fact that cublas
- * deliberately breaks its API depending on the order
- * in which header are included */
+#ifdef TTG_HAVE_CUDART
+/// \brief Returns the cuBLAS handle to be used for launching cuBLAS kernels from the current thread
+/// \return the cuBLAS handle for the current thread
+inline const cublasHandle_t& cublas_get_handle() {
+  static thread_local std::optional<cublasHandle_t> handle;
+  if (!handle.has_value()) {
+    auto status = cublasCreate_v2(&handle.emplace());
+    if (CUBLAS_STATUS_SUCCESS != status) {
+      throw std::runtime_error("cublasCreate_v2 failed");
+    }
+  }
+  return *handle;
+}
+#endif // TTG_HAVE_CUDART
+
 void cublas_set_kernel_stream(cudaStream_t stream) {
 #ifdef TTG_HAVE_CUDART
-    cublasStatus_t status = cublasSetKernelStream(stream);
+    cublasStatus_t status = cublasSetStream_v2(cublas_get_handle(), stream);
     if (CUBLAS_STATUS_SUCCESS != status) {
-        throw std::runtime_error("cublasSetKernelStream failed");
+        throw std::runtime_error("cublasSetStream_v2 failed");
     }
 #else
     throw std::runtime_error("Support for cublas missing during installation!");
diff --git a/ttg/ttg/device/cublas_helper.h b/ttg/ttg/device/cublas_helper.h
index 51c13601c..5aae3df9b 100644
--- a/ttg/ttg/device/cublas_helper.h
+++ b/ttg/ttg/device/cublas_helper.h
@@ -4,10 +4,11 @@
 #include "ttg/config.h"
 
 #ifdef TTG_HAVE_CUDART
-#include <cublas.h>
+#include <cublas_v2.h>
 
 namespace ttg::detail {
 
+/// \brief Returns the current CUDA stream used by cuBLAS
 void cublas_set_kernel_stream(cudaStream_t stream);
 
 } // namespace ttg::detail
diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h
index 2d40b7064..59575f13b 100644
--- a/ttg/ttg/parsec/ttg.h
+++ b/ttg/ttg/parsec/ttg.h
@@ -1362,8 +1362,8 @@ namespace ttg_parsec {
       /* TODO: is this the right place to set the mask? */
       task->parsec_task.chore_mask = PARSEC_DEV_ALL;
       /* get a device and come back if we need another one */
-      int64_t task_load = 1;
-      dev_index = parsec_get_best_device(parsec_task, &task_load);
+      double task_load = 1.;
+      dev_index = parsec_get_best_device(parsec_task, task_load);
       assert(dev_index >= 0);
       if (dev_index < 2) {
           return PARSEC_HOOK_RETURN_NEXT; /* Fall back */