Skip to content

Commit

Permalink
Add USM support
Browse files Browse the repository at this point in the history
  • Loading branch information
aacostadiaz committed Aug 11, 2023
1 parent 48a5361 commit 37be9e4
Show file tree
Hide file tree
Showing 165 changed files with 6,697 additions and 3,738 deletions.
19 changes: 14 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -175,14 +175,23 @@ of multiple BLAS operations.

## API description

This section references all the supported operations and their interface.
This section references all the supported operations and their interface. The
library follows the [oneAPI MKL BLAS specification](https://spec.oneapi.io/versions/latest/elements/oneMKL/source/domains/blas/blas.html)
as a reference for its API. Both the USM and Buffer APIs are supported; however,
the group APIs for USM are not. Compiling the library with a mix of USM and
Buffer arguments is not supported; instead we stick to the aforementioned
reference specification.

All operations take as their first argument a reference to the SB_Handle, a
`blas::SB_Handle` created with a `sycl::queue`. The return value is usually an
array of SYCL events (except for some operations that can return a scalar or
`blas::SB_Handle` created with a `sycl::queue`. The last argument for all operators
is a vector of dependencies of type `cl::sycl::event` (empty by default). The return value
is usually an array of SYCL events (except for some operations that can return a scalar or
a tuple). The containers for the vectors and matrices (and scalars written by
the BLAS operations) are iterator buffers that can be created with
`make_sycl_iterator_buffer`.
the BLAS operations) can be either raw USM pointers or iterator buffers, created
with a call to `cl::sycl::malloc_device` or `make_sycl_iterator_buffer` respectively.

USM support in SYCL-BLAS is limited to device-allocated memory only; `shared` and
`host` USM allocations are not supported.

We recommend checking the [samples](samples) to get started with SYCL-BLAS. It
is better to be familiar with BLAS:
Expand Down
57 changes: 39 additions & 18 deletions benchmark/syclblas/blas1/asum.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
constexpr blas_benchmark::utils::Level1Op benchmark_op =
blas_benchmark::utils::Level1Op::asum;

template <typename scalar_t>
template <typename scalar_t, blas::helper::AllocType mem_alloc>
void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size,
bool* success) {
// initialize the state label
Expand All @@ -40,6 +40,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size,
size);

blas::SB_Handle& sb_handle = *sb_handle_ptr;
auto q = sb_handle.get_queue();

// Create data
std::vector<scalar_t> v1 = blas_benchmark::utils::random_data<scalar_t>(size);
Expand All @@ -49,19 +50,24 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size,
std::transform(std::begin(v1), std::end(v1), std::begin(v1),
[=](scalar_t x) { return x / v1.size(); });

scalar_t vr;
auto inx = blas::helper::allocate<mem_alloc, scalar_t>(size, q);
auto inr = blas::helper::allocate<mem_alloc, scalar_t>(1, q);

auto copy_x = blas::helper::copy_to_device<scalar_t>(q, v1.data(), inx, size);

auto inx = blas::make_sycl_iterator_buffer<scalar_t>(v1, size);
auto inr = blas::make_sycl_iterator_buffer<scalar_t>(&vr, 1);
sb_handle.wait({copy_x});

#ifdef BLAS_VERIFY_BENCHMARK
// Run a first time with a verification of the results
scalar_t vr_ref = reference_blas::asum(size, v1.data(), 1);
scalar_t vr_temp = 0;
{
auto vr_temp_gpu = blas::make_sycl_iterator_buffer<scalar_t>(&vr_temp, 1);
auto event = _asum(sb_handle, size, inx, 1, vr_temp_gpu);
sb_handle.wait(event);
auto vr_temp_gpu = blas::helper::allocate<mem_alloc, scalar_t>(1, q);
auto asum_event = _asum(sb_handle, size, inx, 1, vr_temp_gpu);
sb_handle.wait(asum_event);
auto copy_output = blas::helper::copy_to_host(q, vr_temp_gpu, &vr_temp, 1);
sb_handle.wait(copy_output);
blas::helper::deallocate<mem_alloc>(vr_temp_gpu, q);
}

if (!utils::almost_equal<scalar_t>(vr_temp, vr_ref)) {
Expand Down Expand Up @@ -100,30 +106,45 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size,
state.counters["bytes_processed"]);

blas_benchmark::utils::calc_avg_counters(state);
}

template <typename scalar_t>
void register_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr,
bool* success) {
auto asum_params = blas_benchmark::utils::get_blas1_params(args);
blas::helper::deallocate<mem_alloc>(inx, q);
blas::helper::deallocate<mem_alloc>(inr, q);
}

for (auto size : asum_params) {
template <typename scalar_t, blas::helper::AllocType mem_alloc>
void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success,
std::string mem_type,
std::vector<blas1_param_t> params) {
for (auto size : params) {
auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr,
index_t size, bool* success) {
run<scalar_t>(st, sb_handle_ptr, size, success);
run<scalar_t, mem_alloc>(st, sb_handle_ptr, size, success);
};

benchmark::RegisterBenchmark(
blas_benchmark::utils::get_name<benchmark_op, scalar_t>(
size, blas_benchmark::utils::MEM_TYPE_BUFFER)
.c_str(),
size, mem_type).c_str(),
BM_lambda, sb_handle_ptr, size, success)
->UseRealTime();
}
}

template <typename scalar_t>
void register_benchmark(blas_benchmark::Args& args,
                        blas::SB_Handle* sb_handle_ptr, bool* success) {
  // Read the asum benchmark sizes once and reuse them for every memory type.
  const auto params = blas_benchmark::utils::get_blas1_params(args);

  // Buffer-backed benchmarks are always available.
  register_benchmark<scalar_t, blas::helper::AllocType::buffer>(
      sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, params);
#ifdef SB_ENABLE_USM
  // USM benchmarks exist only when the library is built with USM support.
  register_benchmark<scalar_t, blas::helper::AllocType::usm>(
      sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, params);
#endif
}

namespace blas_benchmark {
void create_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr,
bool* success) {
void create_benchmark(blas_benchmark::Args& args,
blas::SB_Handle* sb_handle_ptr, bool* success) {
BLAS_REGISTER_BENCHMARK(args, sb_handle_ptr, success);
}
} // namespace blas_benchmark
62 changes: 45 additions & 17 deletions benchmark/syclblas/blas1/axpy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
constexpr blas_benchmark::utils::Level1Op benchmark_op =
blas_benchmark::utils::Level1Op::axpy;

template <typename scalar_t>
template <typename scalar_t, blas::helper::AllocType mem_alloc>
void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size,
bool* success) {
// initialize the state label
Expand All @@ -40,14 +40,20 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size,
size);

blas::SB_Handle& sb_handle = *sb_handle_ptr;
auto q = sb_handle.get_queue();

// Create data
std::vector<scalar_t> v1 = blas_benchmark::utils::random_data<scalar_t>(size);
std::vector<scalar_t> v2 = blas_benchmark::utils::random_data<scalar_t>(size);
auto alpha = blas_benchmark::utils::random_scalar<scalar_t>();

auto inx = blas::make_sycl_iterator_buffer<scalar_t>(v1, size);
auto iny = blas::make_sycl_iterator_buffer<scalar_t>(v2, size);
auto inx = blas::helper::allocate<mem_alloc, scalar_t>(size, q);
auto iny = blas::helper::allocate<mem_alloc, scalar_t>(size, q);

auto copy_x = blas::helper::copy_to_device<scalar_t>(q, v1.data(), inx, size);
auto copy_y = blas::helper::copy_to_device<scalar_t>(q, v2.data(), iny, size);

sb_handle.wait({copy_x, copy_y});

#ifdef BLAS_VERIFY_BENCHMARK
// Run a first time with a verification of the results
Expand All @@ -56,9 +62,17 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size,
y_ref.data(), 1);
std::vector<scalar_t> y_temp = v2;
{
auto y_temp_gpu = blas::make_sycl_iterator_buffer<scalar_t>(y_temp, size);
auto event = _axpy(sb_handle, size, alpha, inx, 1, y_temp_gpu, 1);
sb_handle.wait(event);
auto y_temp_gpu = blas::helper::allocate<mem_alloc, scalar_t>(size, q);
auto copy_temp = blas::helper::copy_to_device<scalar_t>(q, y_temp.data(),
y_temp_gpu, size);
sb_handle.wait(copy_temp);
auto axpy_event = _axpy(sb_handle, size, alpha, inx, 1, y_temp_gpu, 1);
sb_handle.wait(axpy_event);
auto copy_output =
blas::helper::copy_to_host(q, y_temp_gpu, y_temp.data(), size);
sb_handle.wait(copy_output);

blas::helper::deallocate<mem_alloc>(y_temp_gpu, q);
}

std::ostringstream err_stream;
Expand Down Expand Up @@ -97,30 +111,44 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size,
state.counters["bytes_processed"]);

blas_benchmark::utils::calc_avg_counters(state);
}

template <typename scalar_t>
void register_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr,
bool* success) {
auto axpy_params = blas_benchmark::utils::get_blas1_params(args);
blas::helper::deallocate<mem_alloc>(inx, q);
blas::helper::deallocate<mem_alloc>(iny, q);
}

for (auto size : axpy_params) {
template <typename scalar_t, blas::helper::AllocType mem_alloc>
void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success,
std::string mem_type,
std::vector<blas1_param_t> params) {
for (auto size : params) {
auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr,
index_t size, bool* success) {
run<scalar_t>(st, sb_handle_ptr, size, success);
run<scalar_t, mem_alloc>(st, sb_handle_ptr, size, success);
};
benchmark::RegisterBenchmark(
blas_benchmark::utils::get_name<benchmark_op, scalar_t>(
size, blas_benchmark::utils::MEM_TYPE_BUFFER)
.c_str(),
size, mem_type).c_str(),
BM_lambda, sb_handle_ptr, size, success)
->UseRealTime();
}
}

template <typename scalar_t>
void register_benchmark(blas_benchmark::Args& args,
                        blas::SB_Handle* sb_handle_ptr, bool* success) {
  // Read the axpy benchmark sizes once and reuse them for every memory type.
  const auto params = blas_benchmark::utils::get_blas1_params(args);

  // Buffer-backed benchmarks are always available.
  register_benchmark<scalar_t, blas::helper::AllocType::buffer>(
      sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, params);
#ifdef SB_ENABLE_USM
  // USM benchmarks exist only when the library is built with USM support.
  register_benchmark<scalar_t, blas::helper::AllocType::usm>(
      sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, params);
#endif
}

namespace blas_benchmark {
void create_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr,
bool* success) {
void create_benchmark(blas_benchmark::Args& args,
blas::SB_Handle* sb_handle_ptr, bool* success) {
BLAS_REGISTER_BENCHMARK(args, sb_handle_ptr, success);
}
} // namespace blas_benchmark
67 changes: 51 additions & 16 deletions benchmark/syclblas/blas1/copy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
constexpr blas_benchmark::utils::Level1Op benchmark_op =
blas_benchmark::utils::Level1Op::copy;

template <typename scalar_t>
template <typename scalar_t, blas::helper::AllocType mem_alloc>
void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size,
index_t incx, index_t incy, bool* success) {
// initialize the state label
Expand All @@ -39,6 +39,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size,
blas_benchmark::utils::Level1Op::copy, scalar_t>(state, size);

blas::SB_Handle& sb_handle = *sb_handle_ptr;
auto q = sb_handle.get_queue();

auto size_x = size * incx;
auto size_y = size * incy;
Expand All @@ -47,18 +48,36 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size,
std::vector<scalar_t> y =
blas_benchmark::utils::random_data<scalar_t>(size_y);

auto x_gpu = blas::make_sycl_iterator_buffer<scalar_t>(x, size_x);
auto y_gpu = blas::make_sycl_iterator_buffer<scalar_t>(y, size_y);
auto x_gpu = blas::helper::allocate<mem_alloc, scalar_t>(size_x, q);
auto y_gpu = blas::helper::allocate<mem_alloc, scalar_t>(size_y, q);

auto copy_x =
blas::helper::copy_to_device<scalar_t>(q, x.data(), x_gpu, size_x);
auto copy_y =
blas::helper::copy_to_device<scalar_t>(q, y.data(), y_gpu, size_y);

sb_handle.wait({copy_x, copy_y});

#ifdef BLAS_VERIFY_BENCHMARK
// Run a first time with a verification of the results
std::vector<scalar_t> y_ref = y;
reference_blas::copy(size, x.data(), incx, y_ref.data(), incy);
std::vector<scalar_t> y_temp = y;
{
auto y_temp_gpu = blas::make_sycl_iterator_buffer<scalar_t>(y_temp, size_y);
auto event = _copy(sb_handle, size, x_gpu, incx, y_temp_gpu, incy);
sb_handle.wait(event);
auto y_temp_gpu = blas::helper::allocate<mem_alloc, scalar_t>(size_y, q);
auto copy_temp = blas::helper::copy_to_device<scalar_t>(q, y_temp.data(),
y_temp_gpu, size_y);
sb_handle.wait(copy_temp);
auto copy_event = blas::_copy<blas::SB_Handle, index_t, decltype(x_gpu),
decltype(y_temp_gpu), index_t>(
sb_handle, size, x_gpu, incx, y_temp_gpu, incy);
sb_handle.wait(copy_event);

auto copy_out = blas::helper::copy_to_host<scalar_t>(q, y_temp_gpu,
y_temp.data(), size_y);
sb_handle.wait(copy_out);

blas::helper::deallocate<mem_alloc>(y_temp_gpu, q);
}

std::ostringstream err_stream;
Expand All @@ -70,7 +89,9 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size,
#endif

auto blas_method_def = [&]() -> std::vector<cl::sycl::event> {
auto event = _copy(sb_handle, size, x_gpu, incx, y_gpu, incy);
auto event =
blas::_copy<blas::SB_Handle, index_t, decltype(x_gpu), decltype(y_gpu),
index_t>(sb_handle, size, x_gpu, incx, y_gpu, incy);
sb_handle.wait(event);
return event;
};
Expand All @@ -96,14 +117,16 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size,
state.counters["bytes_processed"]);

blas_benchmark::utils::calc_avg_counters(state);
}

template <typename scalar_t>
void register_benchmark(blas_benchmark::Args& args,
blas::SB_Handle* sb_handle_ptr, bool* success) {
auto copy_params = blas_benchmark::utils::get_copy_params<scalar_t>(args);
blas::helper::deallocate<mem_alloc>(x_gpu, q);
blas::helper::deallocate<mem_alloc>(y_gpu, q);
}

for (auto p : copy_params) {
template <typename scalar_t, blas::helper::AllocType mem_alloc>
void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success,
std::string mem_type,
std::vector<copy_param_t<scalar_t>> params) {
for (auto p : params) {
index_t size, incx, incy;
scalar_t unused; // Work around a dpcpp compiler bug
// (https://github.com/intel/llvm/issues/7075)
Expand All @@ -112,17 +135,29 @@ void register_benchmark(blas_benchmark::Args& args,
auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr,
index_t size, index_t incx, index_t incy,
bool* success) {
run<scalar_t>(st, sb_handle_ptr, size, incx, incy, success);
run<scalar_t, mem_alloc>(st, sb_handle_ptr, size, incx, incy, success);
};
benchmark::RegisterBenchmark(
blas_benchmark::utils::get_name<benchmark_op, scalar_t>(
size, incx, incy, blas_benchmark::utils::MEM_TYPE_BUFFER)
.c_str(),
size, incx, incy, mem_type).c_str(),
BM_lambda, sb_handle_ptr, size, incx, incy, success)
->UseRealTime();
}
}

template <typename scalar_t>
void register_benchmark(blas_benchmark::Args& args,
                        blas::SB_Handle* sb_handle_ptr, bool* success) {
  // Read the copy benchmark parameters once and reuse them for every memory
  // type.
  const auto params = blas_benchmark::utils::get_copy_params<scalar_t>(args);

  // Buffer-backed benchmarks are always available.
  register_benchmark<scalar_t, blas::helper::AllocType::buffer>(
      sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, params);
#ifdef SB_ENABLE_USM
  // USM benchmarks exist only when the library is built with USM support.
  register_benchmark<scalar_t, blas::helper::AllocType::usm>(
      sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, params);
#endif
}

namespace blas_benchmark {
void create_benchmark(blas_benchmark::Args& args,
blas::SB_Handle* sb_handle_ptr, bool* success) {
Expand Down
Loading

0 comments on commit 37be9e4

Please sign in to comment.