From fac276dc123ac6303b117bfecb232f892b469c2e Mon Sep 17 00:00:00 2001
From: "romain.biessy" <romain.biessy@codeplay.com>
Date: Fri, 6 Sep 2024 17:47:30 +0200
Subject: [PATCH] [SPARSE] Add support for cuSPARSE backend

---
 CMakeLists.txt                                |   8 +-
 cmake/FindCompiler.cmake                      |   4 +-
 docs/building_the_project_with_dpcpp.rst      |   8 +-
 docs/domains/sparse_linear_algebra.rst        | 136 +++++
 .../compile_time_dispatching/CMakeLists.txt   |  21 +-
 .../sparse_blas_spmv_usm_mklcpu_cusparse.cpp  | 291 ++++++++++
 .../run_time_dispatching/CMakeLists.txt       |   3 +
 include/oneapi/mkl/detail/backends.hpp        |  29 +-
 include/oneapi/mkl/detail/backends_table.hpp  |   6 +
 include/oneapi/mkl/sparse_blas.hpp            |   3 +
 .../cusparse/onemkl_sparse_blas_cusparse.hpp  |  35 ++
 .../detail/cusparse/sparse_blas_ct.hpp        |  40 ++
 src/config.hpp.in                             |   1 +
 src/sparse_blas/backends/CMakeLists.txt       |   4 +
 .../backends/cusparse/CMakeLists.txt          |  85 +++
 .../backends/cusparse/cusparse_error.hpp      | 100 ++++
 .../cusparse/cusparse_global_handle.hpp       |  63 +++
 .../backends/cusparse/cusparse_handles.cpp    | 520 ++++++++++++++++++
 .../backends/cusparse/cusparse_handles.hpp    |  78 +++
 .../backends/cusparse/cusparse_helper.hpp     | 165 ++++++
 .../cusparse/cusparse_scope_handle.cpp        | 147 +++++
 .../cusparse/cusparse_scope_handle.hpp        |  93 ++++
 .../backends/cusparse/cusparse_task.hpp       | 382 +++++++++++++
 .../backends/cusparse/cusparse_wrappers.cpp   |  32 ++
 .../cusparse/operations/cusparse_spmm.cpp     | 296 ++++++++++
 .../cusparse/operations/cusparse_spmv.cpp     | 323 +++++++++++
 .../cusparse/operations/cusparse_spsv.cpp     | 263 +++++++++
 .../backends/mkl_common/mkl_dispatch.hpp      |  37 ++
 .../backends/mkl_common/mkl_handles.cxx       | 161 ++----
 .../backends/mkl_common/mkl_handles.hpp       |   2 +
 .../backends/mkl_common/mkl_spmm.cxx          |  30 +-
 .../backends/mkl_common/mkl_spmv.cxx          |  27 +-
 .../backends/mkl_common/mkl_spsv.cxx          |  17 +-
 .../backends/mklcpu/mklcpu_handles.cpp        |   2 +-
 .../backends/mklcpu/mklcpu_operations.cpp     |   4 +-
 .../backends/mklgpu/mklgpu_handles.cpp        |   2 +-
 .../backends/mklgpu/mklgpu_operations.cpp     |   4 +-
 src/sparse_blas/common_op_verification.hpp    | 142 +++++
 src/sparse_blas/generic_container.hpp         |  67 ++-
 src/sparse_blas/macros.hpp                    |  81 +++
 src/sparse_blas/sycl_helper.hpp               |  80 +++
 tests/unit_tests/CMakeLists.txt               |   5 +
 tests/unit_tests/include/test_helper.hpp      |  10 +
 tests/unit_tests/main_test.cpp                |   3 +-
 .../sparse_blas/include/test_common.hpp       |  42 +-
 .../sparse_blas/source/sparse_spmm_buffer.cpp |   6 +-
 .../sparse_blas/source/sparse_spmm_usm.cpp    |   6 +-
 .../sparse_blas/source/sparse_spmv_buffer.cpp |   6 +-
 .../sparse_blas/source/sparse_spmv_usm.cpp    |   6 +-
 .../sparse_blas/source/sparse_spsv_buffer.cpp |  11 +-
 .../sparse_blas/source/sparse_spsv_usm.cpp    |  11 +-
 51 files changed, 3635 insertions(+), 263 deletions(-)
 create mode 100644 examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu_cusparse.cpp
 create mode 100644 include/oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp
 create mode 100644 include/oneapi/mkl/sparse_blas/detail/cusparse/sparse_blas_ct.hpp
 create mode 100644 src/sparse_blas/backends/cusparse/CMakeLists.txt
 create mode 100644 src/sparse_blas/backends/cusparse/cusparse_error.hpp
 create mode 100644 src/sparse_blas/backends/cusparse/cusparse_global_handle.hpp
 create mode 100644 src/sparse_blas/backends/cusparse/cusparse_handles.cpp
 create mode 100644 src/sparse_blas/backends/cusparse/cusparse_handles.hpp
 create mode 100644 src/sparse_blas/backends/cusparse/cusparse_helper.hpp
 create mode 100644 src/sparse_blas/backends/cusparse/cusparse_scope_handle.cpp
 create mode 100644 src/sparse_blas/backends/cusparse/cusparse_scope_handle.hpp
 create mode 100644 src/sparse_blas/backends/cusparse/cusparse_task.hpp
 create mode 100644 src/sparse_blas/backends/cusparse/cusparse_wrappers.cpp
 create mode 100644 src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp
 create mode 100644 src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp
 create mode 100644 src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp
 create mode 100644 src/sparse_blas/backends/mkl_common/mkl_dispatch.hpp
 create mode 100644 src/sparse_blas/common_op_verification.hpp
 create mode 100644 src/sparse_blas/sycl_helper.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1bd39f188..39ec0f053 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -60,6 +60,9 @@ option(ENABLE_CUFFT_BACKEND "Enable the cuFFT backend for the DFT interface" OFF
 option(ENABLE_ROCFFT_BACKEND "Enable the rocFFT backend for the DFT interface" OFF)
 option(ENABLE_PORTFFT_BACKEND "Enable the portFFT DFT backend for the DFT interface. Cannot be used with other DFT backends." OFF)
 
+# sparse
+option(ENABLE_CUSPARSE_BACKEND "Enable the cuSPARSE backend for the SPARSE_BLAS interface" OFF)
+
 set(ONEMKL_SYCL_IMPLEMENTATION "dpc++" CACHE STRING "Name of the SYCL compiler")
 set(HIP_TARGETS "" CACHE STRING "Target HIP architectures")
 
@@ -102,7 +105,8 @@ if(ENABLE_MKLGPU_BACKEND
   list(APPEND DOMAINS_LIST "dft")
 endif()
 if(ENABLE_MKLCPU_BACKEND
-        OR ENABLE_MKLGPU_BACKEND)
+        OR ENABLE_MKLGPU_BACKEND
+        OR ENABLE_CUSPARSE_BACKEND)
   list(APPEND DOMAINS_LIST "sparse_blas")
 endif()
 
@@ -129,7 +133,7 @@ if(CMAKE_CXX_COMPILER OR NOT ONEMKL_SYCL_IMPLEMENTATION STREQUAL "dpc++")
     string(REPLACE "\\" "/" CMAKE_CXX_COMPILER ${CMAKE_CXX_COMPILER})
   endif()
 else()
-  if(ENABLE_CUBLAS_BACKEND OR ENABLE_CURAND_BACKEND OR ENABLE_CUSOLVER_BACKEND OR ENABLE_CUFFT_BACKEND
+  if(ENABLE_CUBLAS_BACKEND OR ENABLE_CURAND_BACKEND OR ENABLE_CUSOLVER_BACKEND OR ENABLE_CUFFT_BACKEND OR ENABLE_CUSPARSE_BACKEND
     OR ENABLE_ROCBLAS_BACKEND OR ENABLE_ROCRAND_BACKEND OR ENABLE_ROCSOLVER_BACKEND OR ENABLE_ROCFFT_BACKEND)
     set(CMAKE_CXX_COMPILER "clang++")
   elseif(ENABLE_MKLGPU_BACKEND)
diff --git a/cmake/FindCompiler.cmake b/cmake/FindCompiler.cmake
index 556211999..8aefc2623 100644
--- a/cmake/FindCompiler.cmake
+++ b/cmake/FindCompiler.cmake
@@ -37,7 +37,7 @@ if(is_dpcpp)
     # Check if the Nvidia target is supported. PortFFT uses this for choosing default configuration.
     check_cxx_compiler_flag("-fsycl -fsycl-targets=nvptx64-nvidia-cuda" dpcpp_supports_nvptx64)
 
-    if(ENABLE_CURAND_BACKEND OR ENABLE_CUSOLVER_BACKEND)
+    if(ENABLE_CURAND_BACKEND OR ENABLE_CUSOLVER_BACKEND OR ENABLE_CUSPARSE_BACKEND)
       list(APPEND UNIX_INTERFACE_COMPILE_OPTIONS
         -fsycl-targets=nvptx64-nvidia-cuda -fsycl-unnamed-lambda)
       list(APPEND UNIX_INTERFACE_LINK_OPTIONS
@@ -51,7 +51,7 @@ if(is_dpcpp)
         -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend 
 	--offload-arch=${HIP_TARGETS})
     endif()
-    if(ENABLE_CURAND_BACKEND OR ENABLE_CUSOLVER_BACKEND OR ENABLE_ROCBLAS_BACKEND
+    if(ENABLE_CURAND_BACKEND OR ENABLE_CUSOLVER_BACKEND OR ENABLE_CUSPARSE_BACKEND OR ENABLE_ROCBLAS_BACKEND
 	    OR ENABLE_ROCRAND_BACKEND OR ENABLE_ROCSOLVER_BACKEND)
       set_target_properties(ONEMKL::SYCL::SYCL PROPERTIES
         INTERFACE_COMPILE_OPTIONS "${UNIX_INTERFACE_COMPILE_OPTIONS}"
diff --git a/docs/building_the_project_with_dpcpp.rst b/docs/building_the_project_with_dpcpp.rst
index e33a78046..808644792 100644
--- a/docs/building_the_project_with_dpcpp.rst
+++ b/docs/building_the_project_with_dpcpp.rst
@@ -104,6 +104,9 @@ The most important supported build options are:
    * - ENABLE_CURAND_BACKEND
      - True, False
      - False     
+   * - ENABLE_CUSPARSE_BACKEND
+     - True, False
+     - False     
    * - ENABLE_NETLIB_BACKEND
      - True, False
      - False     
@@ -183,8 +186,8 @@ Building for CUDA
 ^^^^^^^^^^^^^^^^^
 
 The CUDA backends can be enabled with ``ENABLE_CUBLAS_BACKEND``,
-``ENABLE_CUFFT_BACKEND``, ``ENABLE_CURAND_BACKEND``, and
-``ENABLE_CUSOLVER_BACKEND``.
+``ENABLE_CUFFT_BACKEND``, ``ENABLE_CURAND_BACKEND``,
+``ENABLE_CUSOLVER_BACKEND``, and ``ENABLE_CUSPARSE_BACKEND``.
 
 No additional parameters are required for using CUDA libraries. In most cases,
 the CUDA libraries should be found automatically by CMake.
@@ -356,6 +359,7 @@ disabled using the Ninja build system:
       -DENABLE_CUBLAS_BACKEND=True \
       -DENABLE_CUSOLVER_BACKEND=True \
       -DENABLE_CURAND_BACKEND=True \
+      -DENABLE_CUSPARSE_BACKEND=True \
       -DBUILD_FUNCTIONAL_TESTS=False
 
 ``$ONEMKL_DIR`` points at the oneMKL source directly. The x86 CPU (``MKLCPU``)
diff --git a/docs/domains/sparse_linear_algebra.rst b/docs/domains/sparse_linear_algebra.rst
index eab5afd56..acff0380f 100644
--- a/docs/domains/sparse_linear_algebra.rst
+++ b/docs/domains/sparse_linear_algebra.rst
@@ -38,3 +38,139 @@ Currently known limitations:
   ``oneapi::mkl::unimplemented`` exception.
 - Scalar parameters ``alpha`` and ``beta`` should be host pointers to prevent
   synchronizations and copies to the host.
+
+
+cuSPARSE backend
+----------------
+
+Currently known limitations:
+
+- Using ``spmv`` with a ``type_view`` other than ``matrix_descr::general`` will
+  throw an ``oneapi::mkl::unimplemented`` exception.
+- The COO format requires the indices to be sorted by row. See the `cuSPARSE
+  documentation
+  <https://docs.nvidia.com/cuda/cusparse/index.html#coordinate-coo>`_.
+
+
+Operation algorithms mapping
+----------------------------
+
+The following tables describe how a oneMKL SYCL Interface algorithm maps to the
+backend's algorithms. Refer to the backend's documentation for a more detailed
+explanation of the algorithms.
+
+Backends with no equivalent algorithms will fallback to the backend's default
+behavior.
+
+
+spmm
+^^^^
+
+.. list-table::
+   :header-rows: 1
+   :widths: 10 30 45
+
+   * - Value
+     - Description
+     - Backend equivalent
+   * - ``default_optimize_alg``
+     - Default algorithm.
+     - | MKL: none
+       | cuSPARSE: ``CUSPARSE_SPMM_ALG_DEFAULT``
+   * - ``no_optimize_alg``
+     - Default algorithm but may skip some optimizations. Useful only if an
+       operation with the same configuration is run once.
+     - | MKL: none
+       | cuSPARSE: ``CUSPARSE_SPMM_ALG_DEFAULT``
+   * - ``coo_alg1``
+     - Should provide best performance for COO format, small ``nnz`` and
+       column-major layout.
+     - | MKL: none
+       | cuSPARSE: ``CUSPARSE_SPMM_COO_ALG1``
+   * - ``coo_alg2``
+     - Should provide best performance for COO format and column-major layout.
+       Produces deterministic results.
+     - | MKL: none
+       | cuSPARSE: ``CUSPARSE_SPMM_COO_ALG2``
+   * - ``coo_alg3``
+     - Should provide best performance for COO format and large ``nnz``.
+     - | MKL: none
+       | cuSPARSE: ``CUSPARSE_SPMM_COO_ALG3``
+   * - ``coo_alg4``
+     - Should provide best performance for COO format and row-major layout.
+     - | MKL: none
+       | cuSPARSE: ``CUSPARSE_SPMM_COO_ALG4``
+   * - ``csr_alg1``
+     - Should provide best performance for CSR format and column-major layout.
+     - | MKL: none
+       | cuSPARSE: ``CUSPARSE_SPMM_CSR_ALG1``
+   * - ``csr_alg2``
+     - Should provide best performance for CSR format and row-major layout.
+     - | MKL: none
+       | cuSPARSE: ``CUSPARSE_SPMM_CSR_ALG2``
+   * - ``csr_alg3``
+     - Deterministic algorithm for CSR format.
+     - | MKL: none
+       | cuSPARSE: ``CUSPARSE_SPMM_CSR_ALG3``
+
+
+spmv
+^^^^
+
+.. list-table::
+   :header-rows: 1
+   :widths: 10 30 45
+
+   * - Value
+     - Description
+     - Backend equivalent
+   * - ``default_alg``
+     - Default algorithm.
+     - | MKL: none
+       | cuSPARSE: ``CUSPARSE_SPMV_ALG_DEFAULT``
+   * - ``no_optimize_alg``
+     - Default algorithm but may skip some optimizations. Useful only if an
+       operation with the same configuration is run once.
+     - | MKL: none
+       | cuSPARSE: ``CUSPARSE_SPMM_ALG_DEFAULT``
+   * - ``coo_alg1``
+     - Default algorithm for COO format.
+     - | MKL: none
+       | cuSPARSE: ``CUSPARSE_SPMV_COO_ALG1``
+   * - ``coo_alg2``
+     - Deterministic algorithm for COO format.
+     - | MKL: none
+       | cuSPARSE: ``CUSPARSE_SPMV_COO_ALG2``
+   * - ``csr_alg1``
+     - Default algorithm for CSR format.
+     - | MKL: none
+       | cuSPARSE: ``CUSPARSE_SPMV_CSR_ALG1``
+   * - ``csr_alg2``
+     - Deterministic algorithm for CSR format.
+     - | MKL: none
+       | cuSPARSE: ``CUSPARSE_SPMV_CSR_ALG2``
+   * - ``csr_alg3``
+     - LRB variant of the algorithm for CSR format.
+     - | MKL: none
+       | cuSPARSE: none
+
+
+spsv
+^^^^
+
+.. list-table::
+   :header-rows: 1
+   :widths: 10 30 45
+
+   * - Value
+     - Description
+     - Backend equivalent
+   * - ``default_optimize_alg``
+     - Default algorithm.
+     - | MKL: none
+       | cuSPARSE: ``CUSPARSE_SPMM_ALG_DEFAULT``
+   * - ``no_optimize_alg``
+     - Default algorithm but may skip some optimizations. Useful only if an
+       operation with the same configuration is run once.
+     - | MKL: none
+       | cuSPARSE: ``CUSPARSE_SPMM_ALG_DEFAULT``
diff --git a/examples/sparse_blas/compile_time_dispatching/CMakeLists.txt b/examples/sparse_blas/compile_time_dispatching/CMakeLists.txt
index 5dbbba8a4..a38f4ebd4 100644
--- a/examples/sparse_blas/compile_time_dispatching/CMakeLists.txt
+++ b/examples/sparse_blas/compile_time_dispatching/CMakeLists.txt
@@ -18,27 +18,24 @@
 #===============================================================================
 
 #Build object from all sources
-set(SPARSE_BLAS_BACKENDS "")
-
-if(ENABLE_MKLCPU_BACKEND)
-  list(APPEND SPARSE_BLAS_BACKENDS "mklcpu")
+set(SPARSE_CT_SOURCES "")
+if(ENABLE_MKLCPU_BACKEND AND ENABLE_CUSPARSE_BACKEND)
+  list(APPEND SPARSE_CT_SOURCES "sparse_blas_spmv_usm_mklcpu_cusparse")
 endif()
 
 include(WarningsUtils)
 
-foreach(backend ${SPARSE_BLAS_BACKENDS})
-  set(EXAMPLE_NAME example_sparse_blas_spmv_usm_${backend})
-  add_executable(${EXAMPLE_NAME} sparse_blas_spmv_usm_${backend}.cpp)
-  target_include_directories(${EXAMPLE_NAME}
+foreach(sparse_ct_source ${SPARSE_CT_SOURCES})
+  add_executable(${sparse_ct_source} ${sparse_ct_source}.cpp)
+  target_include_directories(${sparse_ct_source}
       PUBLIC ${PROJECT_SOURCE_DIR}/examples/include
       PUBLIC ${PROJECT_SOURCE_DIR}/include
       PUBLIC ${CMAKE_BINARY_DIR}/bin
   )
 
-  add_dependencies(${EXAMPLE_NAME} onemkl_sparse_blas_${backend})
-  target_link_libraries(${EXAMPLE_NAME} PRIVATE ONEMKL::SYCL::SYCL onemkl_sparse_blas_${backend})
+  target_link_libraries(${sparse_ct_source} PRIVATE ONEMKL::SYCL::SYCL onemkl_sparse_blas_mklcpu onemkl_sparse_blas_cusparse)
 
   # Register example as ctest
-  add_test(NAME sparse_blas/EXAMPLE/CT/sparse_blas_spmv_usm_${backend} COMMAND ${EXAMPLE_NAME})
-endforeach(backend)
+  add_test(NAME sparse_blas/EXAMPLE/CT/${sparse_ct_source} COMMAND ${sparse_ct_source})
+endforeach(sparse_ct_source)
 
diff --git a/examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu_cusparse.cpp b/examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu_cusparse.cpp
new file mode 100644
index 000000000..d025539f8
--- /dev/null
+++ b/examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu_cusparse.cpp
@@ -0,0 +1,291 @@
+/*******************************************************************************
+* Copyright 2023 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+/*
+*
+*  Content:
+*       This example demonstrates use of DPCPP API oneapi::mkl::sparse::spmv
+*       using unified shared memory to perform general sparse matrix-vector
+*       multiplication on a INTEL CPU SYCL device.
+*
+*       y = alpha * op(A) * x + beta * y
+*
+*       where op() is defined by one of
+*
+*           oneapi::mkl::transpose::{nontrans,trans,conjtrans}
+*
+*
+*       This example demonstrates only single precision (float) data type for
+*       spmv matrix data
+*
+*
+*******************************************************************************/
+
+// stl includes
+#include <iostream>
+#include <vector>
+
+#if __has_include(<sycl/sycl.hpp>)
+#include <sycl/sycl.hpp>
+#else
+#include <CL/sycl.hpp>
+#endif
+#include "oneapi/mkl.hpp"
+
+#include "example_helper.hpp"
+
+//
+// Main example for Sparse Matrix-Vector Multiply consisting of
+// initialization of A matrix, x and y vectors as well as
+// scalars alpha and beta.  Then the product
+//
+// y = alpha * op(A) * x + beta * y
+//
+// is performed and finally the results are post processed.
+//
+template <typename fpType, typename intType, typename selectorType>
+int run_sparse_matrix_vector_multiply_example(const selectorType &selector) {
+    auto queue = selector.get_queue();
+
+    // Matrix data size
+    intType size = 4;
+    intType nrows = size * size * size;
+
+    // Set scalar fpType values
+    fpType alpha = set_fp_value(fpType(1.0));
+    fpType beta = set_fp_value(fpType(0.0));
+
+    intType *ia, *ja;
+    fpType *a, *x, *y, *z;
+    std::size_t sizea = static_cast<std::size_t>(27 * nrows);
+    std::size_t sizeja = static_cast<std::size_t>(27 * nrows);
+    std::size_t sizeia = static_cast<std::size_t>(nrows + 1);
+    std::size_t sizevec = static_cast<std::size_t>(nrows);
+
+    ia = (intType *)sycl::malloc_shared(sizeia * sizeof(intType), queue);
+    ja = (intType *)sycl::malloc_shared(sizeja * sizeof(intType), queue);
+    a = (fpType *)sycl::malloc_shared(sizea * sizeof(fpType), queue);
+    x = (fpType *)sycl::malloc_shared(sizevec * sizeof(fpType), queue);
+    y = (fpType *)sycl::malloc_shared(sizevec * sizeof(fpType), queue);
+    z = (fpType *)sycl::malloc_shared(sizevec * sizeof(fpType), queue);
+
+    if (!ia || !ja || !a || !x || !y || !z) {
+        throw std::runtime_error("Failed to allocate USM memory");
+    }
+
+    intType nnz = generate_sparse_matrix<fpType, intType>(size, ia, ja, a);
+
+    // Init vectors x and y
+    for (int i = 0; i < nrows; i++) {
+        x[i] = set_fp_value(fpType(1.0));
+        y[i] = set_fp_value(fpType(0.0));
+        z[i] = set_fp_value(fpType(0.0));
+    }
+
+    std::vector<intType *> int_ptr_vec;
+    int_ptr_vec.push_back(ia);
+    int_ptr_vec.push_back(ja);
+    std::vector<fpType *> fp_ptr_vec;
+    fp_ptr_vec.push_back(a);
+    fp_ptr_vec.push_back(x);
+    fp_ptr_vec.push_back(y);
+    fp_ptr_vec.push_back(z);
+
+    //
+    // Execute Matrix Multiply
+    //
+
+    oneapi::mkl::transpose transA = oneapi::mkl::transpose::nontrans;
+    oneapi::mkl::sparse::spmv_alg alg = oneapi::mkl::sparse::spmv_alg::default_alg;
+    oneapi::mkl::sparse::matrix_view A_view;
+
+    std::cout << "\n\t\tsparse::spmv parameters:\n";
+    std::cout << "\t\t\ttransA = "
+              << (transA == oneapi::mkl::transpose::nontrans
+                      ? "nontrans"
+                      : (transA == oneapi::mkl::transpose::trans ? "trans" : "conjtrans"))
+              << std::endl;
+    std::cout << "\t\t\tnrows = " << nrows << std::endl;
+    std::cout << "\t\t\talpha = " << alpha << ", beta = " << beta << std::endl;
+
+    // Create and initialize handle for a Sparse Matrix in CSR format
+    oneapi::mkl::sparse::matrix_handle_t A_handle = nullptr;
+    oneapi::mkl::sparse::init_csr_matrix(selector, &A_handle, nrows, nrows, nnz,
+                                         oneapi::mkl::index_base::zero, ia, ja, a);
+
+    // Create and initialize dense vector handles
+    oneapi::mkl::sparse::dense_vector_handle_t x_handle = nullptr;
+    oneapi::mkl::sparse::dense_vector_handle_t y_handle = nullptr;
+    oneapi::mkl::sparse::init_dense_vector(selector, &x_handle, sizevec, x);
+    oneapi::mkl::sparse::init_dense_vector(selector, &y_handle, sizevec, y);
+
+    // Create operation descriptor
+    oneapi::mkl::sparse::spmv_descr_t descr = nullptr;
+    oneapi::mkl::sparse::init_spmv_descr(selector, &descr);
+
+    // Allocate external workspace
+    std::size_t workspace_size = 0;
+    oneapi::mkl::sparse::spmv_buffer_size(selector, transA, &alpha, A_view, A_handle, x_handle,
+                                          &beta, y_handle, alg, descr, workspace_size);
+    void *workspace = sycl::malloc_device(workspace_size, queue);
+
+    // Optimize spmv
+    auto ev_opt =
+        oneapi::mkl::sparse::spmv_optimize(selector, transA, &alpha, A_view, A_handle, x_handle,
+                                           &beta, y_handle, alg, descr, workspace);
+
+    // Run spmv
+    auto ev_spmv = oneapi::mkl::sparse::spmv(selector, transA, &alpha, A_view, A_handle, x_handle,
+                                             &beta, y_handle, alg, descr, { ev_opt });
+
+    // Release handles and descriptor
+    std::vector<sycl::event> release_events;
+    release_events.push_back(
+        oneapi::mkl::sparse::release_dense_vector(selector, x_handle, { ev_spmv }));
+    release_events.push_back(
+        oneapi::mkl::sparse::release_dense_vector(selector, y_handle, { ev_spmv }));
+    release_events.push_back(
+        oneapi::mkl::sparse::release_sparse_matrix(selector, A_handle, { ev_spmv }));
+    release_events.push_back(oneapi::mkl::sparse::release_spmv_descr(selector, descr, { ev_spmv }));
+    for (auto event : release_events) {
+        event.wait_and_throw();
+    }
+
+    //
+    // Post Processing
+    //
+
+    fpType *res = y;
+    const bool isConj = (transA == oneapi::mkl::transpose::conjtrans);
+    for (intType row = 0; row < nrows; row++) {
+        z[row] *= beta;
+    }
+    for (intType row = 0; row < nrows; row++) {
+        fpType tmp = alpha * x[row];
+        for (intType i = ia[row]; i < ia[row + 1]; i++) {
+            if constexpr (is_complex<fpType>()) {
+                z[ja[i]] += tmp * (isConj ? std::conj(a[i]) : a[i]);
+            }
+            else {
+                z[ja[i]] += tmp * a[i];
+            }
+        }
+    }
+
+    bool good = true;
+    for (intType row = 0; row < nrows; row++) {
+        good &= check_result(res[row], z[row], nrows, row);
+    }
+
+    std::cout << "\n\t\t sparse::spmv example " << (good ? "passed" : "failed") << "\n\tFinished"
+              << std::endl;
+
+    free_vec(fp_ptr_vec, queue);
+    free_vec(int_ptr_vec, queue);
+
+    if (!good)
+        return 1;
+
+    return 0;
+}
+
+//
+// Description of example setup, apis used and supported floating point type
+// precisions
+//
+void print_example_banner() {
+    std::cout << "" << std::endl;
+    std::cout << "########################################################################"
+              << std::endl;
+    std::cout << "# Sparse Matrix-Vector Multiply Example: " << std::endl;
+    std::cout << "# " << std::endl;
+    std::cout << "# y = alpha * op(A) * x + beta * y" << std::endl;
+    std::cout << "# " << std::endl;
+    std::cout << "# where A is a sparse matrix in CSR format, x and y are "
+                 "dense vectors"
+              << std::endl;
+    std::cout << "# and alpha, beta are floating point type precision scalars." << std::endl;
+    std::cout << "# " << std::endl;
+    std::cout << "# Using apis:" << std::endl;
+    std::cout << "#   sparse::spmv" << std::endl;
+    std::cout << "# " << std::endl;
+    std::cout << "# Using single precision (float) data type" << std::endl;
+    std::cout << "# " << std::endl;
+    std::cout << "# Running on both Intel CPU and Nvidia GPU devices" << std::endl;
+    std::cout << "# " << std::endl;
+    std::cout << "########################################################################"
+              << std::endl;
+    std::cout << std::endl;
+}
+
+//
+// Main entry point for example
+//
+int main(int /*argc*/, char ** /*argv*/) {
+    print_example_banner();
+
+    auto exception_handler = [](sycl::exception_list exceptions) {
+        for (std::exception_ptr const &e : exceptions) {
+            try {
+                std::rethrow_exception(e);
+            }
+            catch (sycl::exception const &e) {
+                std::cout << "Caught asynchronous SYCL "
+                             "exception during sparse::spmv:\n"
+                          << e.what() << std::endl;
+            }
+        }
+    };
+
+    try {
+        sycl::queue cpu_queue(sycl::cpu_selector_v, exception_handler);
+        sycl::queue gpu_queue(sycl::gpu_selector_v, exception_handler);
+        unsigned int vendor_id = gpu_queue.get_info<sycl::info::device::vendor_id>();
+        if (vendor_id != NVIDIA_ID) {
+            std::cerr << "FAILED: NVIDIA GPU device not found" << std::endl;
+            return 1;
+        }
+        oneapi::mkl::backend_selector<oneapi::mkl::backend::mklcpu> cpu_selector{ cpu_queue };
+        oneapi::mkl::backend_selector<oneapi::mkl::backend::cusparse> gpu_selector{ gpu_queue };
+
+        std::cout << "Running Sparse BLAS SPMV USM example on:" << std::endl;
+        std::cout << "\tCPU device: " << cpu_queue.get_info<sycl::info::device::name>()
+                  << std::endl;
+        std::cout << "\tGPU device: " << gpu_queue.get_info<sycl::info::device::name>()
+                  << std::endl;
+        std::cout << "Running with single precision real data type:" << std::endl;
+
+        run_sparse_matrix_vector_multiply_example<float, std::int32_t>(cpu_selector);
+        run_sparse_matrix_vector_multiply_example<float, std::int32_t>(gpu_selector);
+        std::cout << "Sparse BLAS SPMV USM example ran OK on MKLCPU and CUSPARSE." << std::endl;
+    }
+    catch (sycl::exception const &e) {
+        std::cerr << "Caught synchronous SYCL exception during Sparse SPMV:" << std::endl;
+        std::cerr << "\t" << e.what() << std::endl;
+        std::cerr << "\tSYCL error code: " << e.code().value() << std::endl;
+        return 1;
+    }
+    catch (std::exception const &e) {
+        std::cerr << "Caught std::exception during Sparse SPMV:" << std::endl;
+        std::cerr << "\t" << e.what() << std::endl;
+        return 1;
+    }
+
+    return 0;
+}
diff --git a/examples/sparse_blas/run_time_dispatching/CMakeLists.txt b/examples/sparse_blas/run_time_dispatching/CMakeLists.txt
index 398f3e0f2..f09daf819 100644
--- a/examples/sparse_blas/run_time_dispatching/CMakeLists.txt
+++ b/examples/sparse_blas/run_time_dispatching/CMakeLists.txt
@@ -33,6 +33,9 @@ endif()
 if(ENABLE_MKLGPU_BACKEND)
   list(APPEND DEVICE_FILTERS "level_zero:gpu")
 endif()
+if(ENABLE_CUSPARSE_BACKEND)
+  list(APPEND DEVICE_FILTERS "cuda:gpu")
+endif()
 
 message(STATUS "ONEAPI_DEVICE_SELECTOR will be set to the following value(s): [${DEVICE_FILTERS}] for run-time dispatching examples")
 
diff --git a/include/oneapi/mkl/detail/backends.hpp b/include/oneapi/mkl/detail/backends.hpp
index 32b7c2614..216a6feba 100644
--- a/include/oneapi/mkl/detail/backends.hpp
+++ b/include/oneapi/mkl/detail/backends.hpp
@@ -40,20 +40,31 @@ enum class backend {
     cufft,
     rocfft,
     portfft,
+    cusparse,
     unsupported
 };
 
 typedef std::map<backend, std::string> backendmap;
 
-static backendmap backend_map = {
-    { backend::mklcpu, "mklcpu" },       { backend::mklgpu, "mklgpu" },
-    { backend::cublas, "cublas" },       { backend::cusolver, "cusolver" },
-    { backend::curand, "curand" },       { backend::netlib, "netlib" },
-    { backend::rocblas, "rocblas" },     { backend::rocrand, "rocrand" },
-    { backend::rocsolver, "rocsolver" }, { backend::portblas, "portblas" },
-    { backend::cufft, "cufft" },         { backend::rocfft, "rocfft" },
-    { backend::portfft, "portfft" },     { backend::unsupported, "unsupported" }
-};
+// clang-format alternate the formatting depending on the parity of the number of backends
+// It is disabled to reduce noise
+// clang-format off
+static backendmap backend_map = { { backend::mklcpu, "mklcpu" },
+                                  { backend::mklgpu, "mklgpu" },
+                                  { backend::cublas, "cublas" },
+                                  { backend::cusolver, "cusolver" },
+                                  { backend::curand, "curand" },
+                                  { backend::netlib, "netlib" },
+                                  { backend::rocblas, "rocblas" },
+                                  { backend::rocrand, "rocrand" },
+                                  { backend::rocsolver, "rocsolver" },
+                                  { backend::portblas, "portblas" },
+                                  { backend::cufft, "cufft" },
+                                  { backend::rocfft, "rocfft" },
+                                  { backend::portfft, "portfft" },
+                                  { backend::cusparse, "cusparse" },
+                                  { backend::unsupported, "unsupported" } };
+// clang-format on
 
 } //namespace mkl
 } //namespace oneapi
diff --git a/include/oneapi/mkl/detail/backends_table.hpp b/include/oneapi/mkl/detail/backends_table.hpp
index 8e68674cc..8a79c5c06 100644
--- a/include/oneapi/mkl/detail/backends_table.hpp
+++ b/include/oneapi/mkl/detail/backends_table.hpp
@@ -186,6 +186,12 @@ static std::map<domain, std::map<device, std::vector<const char*>>> libraries =
           {
 #ifdef ENABLE_MKLGPU_BACKEND
               LIB_NAME("sparse_blas_mklgpu")
+#endif
+          } },
+        { device::nvidiagpu,
+          {
+#ifdef ENABLE_CUSPARSE_BACKEND
+              LIB_NAME("sparse_blas_cusparse")
 #endif
           } } } },
 };
diff --git a/include/oneapi/mkl/sparse_blas.hpp b/include/oneapi/mkl/sparse_blas.hpp
index 912a20eb8..73e6753c7 100644
--- a/include/oneapi/mkl/sparse_blas.hpp
+++ b/include/oneapi/mkl/sparse_blas.hpp
@@ -34,6 +34,9 @@
 #ifdef ENABLE_MKLGPU_BACKEND
 #include "sparse_blas/detail/mklgpu/sparse_blas_ct.hpp"
 #endif
+#ifdef ENABLE_CUSPARSE_BACKEND
+#include "sparse_blas/detail/cusparse/sparse_blas_ct.hpp"
+#endif
 
 #include "sparse_blas/detail/sparse_blas_rt.hpp"
 
diff --git a/include/oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp b/include/oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp
new file mode 100644
index 000000000..6de2802f1
--- /dev/null
+++ b/include/oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp
@@ -0,0 +1,35 @@
+/***************************************************************************
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+*
+**************************************************************************/
+
+#ifndef _ONEMKL_SPARSE_BLAS_DETAIL_CUSPARSE_ONEMKL_SPARSE_BLAS_CUSPARSE_HPP_
+#define _ONEMKL_SPARSE_BLAS_DETAIL_CUSPARSE_ONEMKL_SPARSE_BLAS_CUSPARSE_HPP_
+
+#include "oneapi/mkl/detail/export.hpp"
+#include "oneapi/mkl/sparse_blas/detail/helper_types.hpp"
+#include "oneapi/mkl/sparse_blas/types.hpp"
+
+namespace oneapi::mkl::sparse::cusparse {
+
+namespace detail = oneapi::mkl::sparse::detail;
+
+#include "oneapi/mkl/sparse_blas/detail/onemkl_sparse_blas_backends.hxx"
+
+} // namespace oneapi::mkl::sparse::cusparse
+
+#endif // _ONEMKL_SPARSE_BLAS_DETAIL_CUSPARSE_ONEMKL_SPARSE_BLAS_CUSPARSE_HPP_
diff --git a/include/oneapi/mkl/sparse_blas/detail/cusparse/sparse_blas_ct.hpp b/include/oneapi/mkl/sparse_blas/detail/cusparse/sparse_blas_ct.hpp
new file mode 100644
index 000000000..11abb9a6f
--- /dev/null
+++ b/include/oneapi/mkl/sparse_blas/detail/cusparse/sparse_blas_ct.hpp
@@ -0,0 +1,40 @@
+/***************************************************************************
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+*
+**************************************************************************/
+
+#ifndef _ONEMKL_SPARSE_BLAS_DETAIL_CUSPARSE_SPARSE_BLAS_CT_HPP_
+#define _ONEMKL_SPARSE_BLAS_DETAIL_CUSPARSE_SPARSE_BLAS_CT_HPP_
+
+#include "oneapi/mkl/detail/backends.hpp"
+#include "oneapi/mkl/detail/backend_selector.hpp"
+
+#include "onemkl_sparse_blas_cusparse.hpp"
+
+namespace oneapi {
+namespace mkl {
+namespace sparse {
+
+#define BACKEND cusparse
+#include "oneapi/mkl/sparse_blas/detail/sparse_blas_ct.hxx"
+#undef BACKEND
+
+} //namespace sparse
+} //namespace mkl
+} //namespace oneapi
+
+#endif // _ONEMKL_SPARSE_BLAS_DETAIL_CUSPARSE_SPARSE_BLAS_CT_HPP_
diff --git a/src/config.hpp.in b/src/config.hpp.in
index 5698abf9b..fd55006a6 100644
--- a/src/config.hpp.in
+++ b/src/config.hpp.in
@@ -24,6 +24,7 @@
 #cmakedefine ENABLE_CUFFT_BACKEND
 #cmakedefine ENABLE_CURAND_BACKEND
 #cmakedefine ENABLE_CUSOLVER_BACKEND
+#cmakedefine ENABLE_CUSPARSE_BACKEND
 #cmakedefine ENABLE_MKLCPU_BACKEND
 #cmakedefine ENABLE_MKLGPU_BACKEND
 #cmakedefine ENABLE_NETLIB_BACKEND
diff --git a/src/sparse_blas/backends/CMakeLists.txt b/src/sparse_blas/backends/CMakeLists.txt
index 294040808..baae9445d 100644
--- a/src/sparse_blas/backends/CMakeLists.txt
+++ b/src/sparse_blas/backends/CMakeLists.txt
@@ -27,3 +27,7 @@ endif()
 if(ENABLE_MKLGPU_BACKEND)
   add_subdirectory(mklgpu)
 endif()
+
+if(ENABLE_CUSPARSE_BACKEND)
+  add_subdirectory(cusparse)
+endif()
diff --git a/src/sparse_blas/backends/cusparse/CMakeLists.txt b/src/sparse_blas/backends/cusparse/CMakeLists.txt
new file mode 100644
index 000000000..60bbaf35f
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/CMakeLists.txt
@@ -0,0 +1,85 @@
+#===============================================================================
+# Copyright 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+#
+# SPDX-License-Identifier: Apache-2.0
+#===============================================================================
+
+set(LIB_NAME onemkl_sparse_blas_cusparse)
+set(LIB_OBJ ${LIB_NAME}_obj)
+
+include(WarningsUtils)
+
+add_library(${LIB_NAME})
+add_library(${LIB_OBJ} OBJECT
+  cusparse_handles.cpp
+  cusparse_scope_handle.cpp
+  operations/cusparse_spmm.cpp
+  operations/cusparse_spmv.cpp
+  operations/cusparse_spsv.cpp
+  $<$<BOOL:${BUILD_SHARED_LIBS}>: cusparse_wrappers.cpp>
+)
+add_dependencies(onemkl_backend_libs_sparse_blas ${LIB_NAME})
+
+target_include_directories(${LIB_OBJ}
+  PRIVATE ${PROJECT_SOURCE_DIR}/include
+          ${PROJECT_SOURCE_DIR}/src
+          ${CMAKE_BINARY_DIR}/bin
+          ${ONEMKL_GENERATED_INCLUDE_PATH}
+)
+
+target_compile_options(${LIB_OBJ} PRIVATE ${ONEMKL_BUILD_COPT})
+
+if (${CMAKE_VERSION} VERSION_LESS "3.17.0")
+  find_package(CUDA 12.2 REQUIRED)
+  target_include_directories(${LIB_OBJ} PRIVATE ${CUDA_INCLUDE_DIRS})
+  target_link_libraries(${LIB_OBJ} PUBLIC cuda rt ${CUDA_cusparse_LIBRARY})
+else()
+  find_package(CUDAToolkit 12.2 REQUIRED)
+  target_link_libraries(${LIB_OBJ} PRIVATE CUDA::cusparse CUDA::cudart CUDA::cuda_driver)
+endif()
+
+target_link_libraries(${LIB_OBJ}
+  PUBLIC ONEMKL::SYCL::SYCL
+  PRIVATE onemkl_warnings
+)
+
+set_target_properties(${LIB_OBJ} PROPERTIES
+  POSITION_INDEPENDENT_CODE ON
+)
+target_link_libraries(${LIB_NAME} PUBLIC ${LIB_OBJ})
+
+#Set oneMKL libraries as not transitive for dynamic
+if(BUILD_SHARED_LIBS)
+  set_target_properties(${LIB_NAME} PROPERTIES
+    INTERFACE_LINK_LIBRARIES ONEMKL::SYCL::SYCL
+  )
+endif()
+
+# Add major version to the library
+set_target_properties(${LIB_NAME} PROPERTIES
+  SOVERSION ${PROJECT_VERSION_MAJOR}
+)
+
+# Add dependencies rpath to the library
+list(APPEND CMAKE_BUILD_RPATH $<TARGET_FILE_DIR:${LIB_NAME}>)
+
+# Add the library to install package
+install(TARGETS ${LIB_OBJ} EXPORT oneMKLTargets)
+install(TARGETS ${LIB_NAME} EXPORT oneMKLTargets
+  RUNTIME DESTINATION bin
+  ARCHIVE DESTINATION lib
+  LIBRARY DESTINATION lib
+)
diff --git a/src/sparse_blas/backends/cusparse/cusparse_error.hpp b/src/sparse_blas/backends/cusparse/cusparse_error.hpp
new file mode 100644
index 000000000..7d6bf45d7
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/cusparse_error.hpp
@@ -0,0 +1,100 @@
+/***************************************************************************
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+*
+**************************************************************************/
+
+#ifndef _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_ERROR_HPP_
+#define _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_ERROR_HPP_
+
+#include <string>
+
+#include <cuda.h>
+#include <cusparse.h>
+
+#include "oneapi/mkl/exceptions.hpp"
+
+namespace oneapi::mkl::sparse::cusparse {
+
+inline std::string cuda_result_to_str(CUresult result) {
+    switch (result) {
+#define ONEMKL_CUSPARSE_CASE(STATUS) \
+    case STATUS: return #STATUS
+        ONEMKL_CUSPARSE_CASE(CUDA_SUCCESS);
+        ONEMKL_CUSPARSE_CASE(CUDA_ERROR_NOT_PERMITTED);
+        ONEMKL_CUSPARSE_CASE(CUDA_ERROR_INVALID_CONTEXT);
+        ONEMKL_CUSPARSE_CASE(CUDA_ERROR_INVALID_DEVICE);
+        ONEMKL_CUSPARSE_CASE(CUDA_ERROR_INVALID_VALUE);
+        ONEMKL_CUSPARSE_CASE(CUDA_ERROR_OUT_OF_MEMORY);
+        ONEMKL_CUSPARSE_CASE(CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES);
+        default: return "<unknown>";
+    }
+}
+
+#define CUDA_ERROR_FUNC(func, ...)                                                  \
+    do {                                                                            \
+        auto res = func(__VA_ARGS__);                                               \
+        if (res != CUDA_SUCCESS) {                                                  \
+            throw oneapi::mkl::exception("sparse_blas", #func,                      \
+                                         "cuda error: " + cuda_result_to_str(res)); \
+        }                                                                           \
+    } while (0)
+
+inline std::string cusparse_status_to_str(cusparseStatus_t status) {
+    switch (status) {
+#define ONEMKL_CUSPARSE_CASE(STATUS) \
+    case STATUS: return #STATUS
+        ONEMKL_CUSPARSE_CASE(CUSPARSE_STATUS_SUCCESS);
+        ONEMKL_CUSPARSE_CASE(CUSPARSE_STATUS_NOT_INITIALIZED);
+        ONEMKL_CUSPARSE_CASE(CUSPARSE_STATUS_ALLOC_FAILED);
+        ONEMKL_CUSPARSE_CASE(CUSPARSE_STATUS_INVALID_VALUE);
+        ONEMKL_CUSPARSE_CASE(CUSPARSE_STATUS_ARCH_MISMATCH);
+        ONEMKL_CUSPARSE_CASE(CUSPARSE_STATUS_EXECUTION_FAILED);
+        ONEMKL_CUSPARSE_CASE(CUSPARSE_STATUS_INTERNAL_ERROR);
+        ONEMKL_CUSPARSE_CASE(CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED);
+        ONEMKL_CUSPARSE_CASE(CUSPARSE_STATUS_NOT_SUPPORTED);
+        ONEMKL_CUSPARSE_CASE(CUSPARSE_STATUS_INSUFFICIENT_RESOURCES);
+#undef ONEMKL_CUSPARSE_CASE
+        default: return "<unknown>";
+    }
+}
+
+inline void check_status(cusparseStatus_t status, const std::string& function,
+                         std::string error_str = "") {
+    if (status != CUSPARSE_STATUS_SUCCESS) {
+        if (!error_str.empty()) {
+            error_str += "; ";
+        }
+        error_str += "cuSPARSE status: " + cusparse_status_to_str(status);
+        switch (status) {
+            case CUSPARSE_STATUS_NOT_SUPPORTED:
+                throw oneapi::mkl::unimplemented("sparse_blas", function, error_str);
+            case CUSPARSE_STATUS_INVALID_VALUE:
+                throw oneapi::mkl::invalid_argument("sparse_blas", function, error_str);
+            default: throw oneapi::mkl::exception("sparse_blas", function, error_str);
+        }
+    }
+}
+
+#define CUSPARSE_ERR_FUNC(func, ...)     \
+    do {                                 \
+        auto status = func(__VA_ARGS__); \
+        check_status(status, #func);     \
+    } while (0)
+
+} // namespace oneapi::mkl::sparse::cusparse
+
+#endif // _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_ERROR_HPP_
diff --git a/src/sparse_blas/backends/cusparse/cusparse_global_handle.hpp b/src/sparse_blas/backends/cusparse/cusparse_global_handle.hpp
new file mode 100644
index 000000000..59e582a65
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/cusparse_global_handle.hpp
@@ -0,0 +1,63 @@
+/***************************************************************************
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+*
+**************************************************************************/
+
+#ifndef _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_GLOBAL_HANDLE_HPP_
+#define _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_GLOBAL_HANDLE_HPP_
+
+/**
+ * @file Similar to blas_handle.hpp
+ * Provides a map from a ur_context_handle_t (or equivalent) to a cusparseHandle_t.
+ * @see cusparse_scope_handle.hpp
+*/
+
+#include <atomic>
+#include <unordered_map>
+
+namespace oneapi::mkl::sparse::cusparse {
+
+template <typename T>
+struct cusparse_global_handle {
+    using handle_container_t = std::unordered_map<T, std::atomic<cusparseHandle_t> *>;
+    handle_container_t cusparse_global_handle_mapper_{};
+
+    ~cusparse_global_handle() noexcept(false) {
+        for (auto &handle_pair : cusparse_global_handle_mapper_) {
+            if (handle_pair.second != nullptr) {
+                auto handle = handle_pair.second->exchange(nullptr);
+                if (handle != nullptr) {
+                    CUSPARSE_ERR_FUNC(cusparseDestroy, handle);
+                    handle = nullptr;
+                }
+                else {
+                    // if the handle is nullptr it means the handle was already
+                    // destroyed by the ContextCallback and we're free to delete the
+                    // atomic object.
+                    delete handle_pair.second;
+                }
+
+                handle_pair.second = nullptr;
+            }
+        }
+        cusparse_global_handle_mapper_.clear();
+    }
+};
+
+} // namespace oneapi::mkl::sparse::cusparse
+
+#endif // _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_GLOBAL_HANDLE_HPP_
diff --git a/src/sparse_blas/backends/cusparse/cusparse_handles.cpp b/src/sparse_blas/backends/cusparse/cusparse_handles.cpp
new file mode 100644
index 000000000..de7236110
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/cusparse_handles.cpp
@@ -0,0 +1,520 @@
+/***************************************************************************
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+*
+**************************************************************************/
+
+#include "oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp"
+
+#include "cusparse_error.hpp"
+#include "cusparse_helper.hpp"
+#include "cusparse_handles.hpp"
+#include "cusparse_task.hpp"
+#include "sparse_blas/macros.hpp"
+
+namespace oneapi::mkl::sparse::cusparse {
+
+/**
+ * In this file CusparseScopedContextHandler are used to ensure that a cusparseHandle_t is created before any other cuSPARSE call, as required by the specification.
+*/
+
+// Dense vector
+template <typename fpType>
+void init_dense_vector(sycl::queue &queue, dense_vector_handle_t *p_dvhandle, std::int64_t size,
+                       sycl::buffer<fpType, 1> val) {
+    auto event = queue.submit([&](sycl::handler &cgh) {
+        auto acc = val.template get_access<sycl::access::mode::read_write>(cgh);
+        submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) {
+            // Ensure that a cusparse handle is created before any other cuSPARSE function is called.
+            sc.get_handle(queue);
+            auto cuda_value_type = CudaEnumType<fpType>::value;
+            cusparseDnVecDescr_t cu_dvhandle;
+            CUSPARSE_ERR_FUNC(cusparseCreateDnVec, &cu_dvhandle, size, sc.get_mem(acc),
+                              cuda_value_type);
+            *p_dvhandle = new dense_vector_handle(cu_dvhandle, val, size);
+        });
+    });
+    event.wait_and_throw();
+}
+
+template <typename fpType>
+void init_dense_vector(sycl::queue &queue, dense_vector_handle_t *p_dvhandle, std::int64_t size,
+                       fpType *val) {
+    auto event = queue.submit([&](sycl::handler &cgh) {
+        submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) {
+            // Ensure that a cusparse handle is created before any other cuSPARSE function is called.
+            sc.get_handle(queue);
+            auto cuda_value_type = CudaEnumType<fpType>::value;
+            cusparseDnVecDescr_t cu_dvhandle;
+            CUSPARSE_ERR_FUNC(cusparseCreateDnVec, &cu_dvhandle, size, sc.get_mem(val),
+                              cuda_value_type);
+            *p_dvhandle = new dense_vector_handle(cu_dvhandle, val, size);
+        });
+    });
+    event.wait_and_throw();
+}
+
+template <typename fpType>
+void set_dense_vector_data(sycl::queue &queue, dense_vector_handle_t dvhandle, std::int64_t size,
+                           sycl::buffer<fpType, 1> val) {
+    detail::check_can_reset_value_handle<fpType>(__func__, dvhandle, true);
+    auto event = queue.submit([&](sycl::handler &cgh) {
+        auto acc = val.template get_access<sycl::access::mode::read_write>(cgh);
+        submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) {
+            // Ensure that a cusparse handle is created before any other cuSPARSE function is called.
+            sc.get_handle(queue);
+            if (dvhandle->size != size) {
+                CUSPARSE_ERR_FUNC(cusparseDestroyDnVec, dvhandle->backend_handle);
+                auto cuda_value_type = CudaEnumType<fpType>::value;
+                CUSPARSE_ERR_FUNC(cusparseCreateDnVec, &dvhandle->backend_handle, size,
+                                  sc.get_mem(acc), cuda_value_type);
+                dvhandle->size = size;
+            }
+            else {
+                CUSPARSE_ERR_FUNC(cusparseDnVecSetValues, dvhandle->backend_handle,
+                                  sc.get_mem(acc));
+            }
+            dvhandle->set_buffer(val);
+        });
+    });
+    event.wait_and_throw();
+}
+
+template <typename fpType>
+void set_dense_vector_data(sycl::queue &queue, dense_vector_handle_t dvhandle, std::int64_t size,
+                           fpType *val) {
+    detail::check_can_reset_value_handle<fpType>(__func__, dvhandle, false);
+    auto event = queue.submit([&](sycl::handler &cgh) {
+        submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) {
+            // Ensure that a cusparse handle is created before any other cuSPARSE function is called.
+            sc.get_handle(queue);
+            if (dvhandle->size != size) {
+                CUSPARSE_ERR_FUNC(cusparseDestroyDnVec, dvhandle->backend_handle);
+                auto cuda_value_type = CudaEnumType<fpType>::value;
+                CUSPARSE_ERR_FUNC(cusparseCreateDnVec, &dvhandle->backend_handle, size,
+                                  sc.get_mem(val), cuda_value_type);
+                dvhandle->size = size;
+            }
+            else {
+                CUSPARSE_ERR_FUNC(cusparseDnVecSetValues, dvhandle->backend_handle,
+                                  sc.get_mem(val));
+            }
+            dvhandle->set_usm_ptr(val);
+        });
+    });
+    event.wait_and_throw();
+}
+
+FOR_EACH_FP_TYPE(INSTANTIATE_DENSE_VECTOR_FUNCS);
+
+sycl::event release_dense_vector(sycl::queue &queue, dense_vector_handle_t dvhandle,
+                                 const std::vector<sycl::event> &dependencies) {
+    // Use dispatch_submit_impl_fp to ensure the backend's handle is kept alive as long as the buffer is used
+    auto functor = [=](CusparseScopedContextHandler &) {
+        CUSPARSE_ERR_FUNC(cusparseDestroyDnVec, dvhandle->backend_handle);
+        delete dvhandle;
+    };
+    return dispatch_submit_impl_fp(__func__, queue, dependencies, functor, dvhandle);
+}
+
+// Dense matrix
+template <typename fpType>
+void init_dense_matrix(sycl::queue &queue, dense_matrix_handle_t *p_dmhandle, std::int64_t num_rows,
+                       std::int64_t num_cols, std::int64_t ld, layout dense_layout,
+                       sycl::buffer<fpType, 1> val) {
+    auto event = queue.submit([&](sycl::handler &cgh) {
+        auto acc = val.template get_access<sycl::access::mode::read_write>(cgh);
+        submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) {
+            // Ensure that a cusparse handle is created before any other cuSPARSE function is called.
+            sc.get_handle(queue);
+            auto cuda_value_type = CudaEnumType<fpType>::value;
+            auto cuda_order = get_cuda_order(dense_layout);
+            cusparseDnMatDescr_t cu_dmhandle;
+            CUSPARSE_ERR_FUNC(cusparseCreateDnMat, &cu_dmhandle, num_rows, num_cols, ld,
+                              sc.get_mem(acc), cuda_value_type, cuda_order);
+            *p_dmhandle =
+                new dense_matrix_handle(cu_dmhandle, val, num_rows, num_cols, ld, dense_layout);
+        });
+    });
+    event.wait_and_throw();
+}
+
+template <typename fpType>
+void init_dense_matrix(sycl::queue &queue, dense_matrix_handle_t *p_dmhandle, std::int64_t num_rows,
+                       std::int64_t num_cols, std::int64_t ld, layout dense_layout, fpType *val) {
+    auto event = queue.submit([&](sycl::handler &cgh) {
+        submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) {
+            // Ensure that a cusparse handle is created before any other cuSPARSE function is called.
+            sc.get_handle(queue);
+            auto cuda_value_type = CudaEnumType<fpType>::value;
+            auto cuda_order = get_cuda_order(dense_layout);
+            cusparseDnMatDescr_t cu_dmhandle;
+            CUSPARSE_ERR_FUNC(cusparseCreateDnMat, &cu_dmhandle, num_rows, num_cols, ld,
+                              sc.get_mem(val), cuda_value_type, cuda_order);
+            *p_dmhandle =
+                new dense_matrix_handle(cu_dmhandle, val, num_rows, num_cols, ld, dense_layout);
+        });
+    });
+    event.wait_and_throw();
+}
+
+template <typename fpType>
+void set_dense_matrix_data(sycl::queue &queue, dense_matrix_handle_t dmhandle,
+                           std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld,
+                           oneapi::mkl::layout dense_layout, sycl::buffer<fpType, 1> val) {
+    detail::check_can_reset_value_handle<fpType>(__func__, dmhandle, true);
+    auto event = queue.submit([&](sycl::handler &cgh) {
+        auto acc = val.template get_access<sycl::access::mode::read_write>(cgh);
+        submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) {
+            // Ensure that a cusparse handle is created before any other cuSPARSE function is called.
+            sc.get_handle(queue);
+            if (dmhandle->num_rows != num_rows || dmhandle->num_cols != num_cols ||
+                dmhandle->ld != ld || dmhandle->dense_layout != dense_layout) {
+                CUSPARSE_ERR_FUNC(cusparseDestroyDnMat, dmhandle->backend_handle);
+                auto cuda_value_type = CudaEnumType<fpType>::value;
+                auto cuda_order = get_cuda_order(dense_layout);
+                CUSPARSE_ERR_FUNC(cusparseCreateDnMat, &dmhandle->backend_handle, num_rows,
+                                  num_cols, ld, sc.get_mem(acc), cuda_value_type, cuda_order);
+                dmhandle->num_rows = num_rows;
+                dmhandle->num_cols = num_cols;
+                dmhandle->ld = ld;
+                dmhandle->dense_layout = dense_layout;
+            }
+            else {
+                CUSPARSE_ERR_FUNC(cusparseDnMatSetValues, dmhandle->backend_handle,
+                                  sc.get_mem(acc));
+            }
+            dmhandle->set_buffer(val);
+        });
+    });
+    event.wait_and_throw();
+}
+
+template <typename fpType>
+void set_dense_matrix_data(sycl::queue &queue, dense_matrix_handle_t dmhandle,
+                           std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld,
+                           oneapi::mkl::layout dense_layout, fpType *val) {
+    detail::check_can_reset_value_handle<fpType>(__func__, dmhandle, false);
+    auto event = queue.submit([&](sycl::handler &cgh) {
+        submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) {
+            // Ensure that a cusparse handle is created before any other cuSPARSE function is called.
+            sc.get_handle(queue);
+            if (dmhandle->num_rows != num_rows || dmhandle->num_cols != num_cols ||
+                dmhandle->ld != ld || dmhandle->dense_layout != dense_layout) {
+                CUSPARSE_ERR_FUNC(cusparseDestroyDnMat, dmhandle->backend_handle);
+                auto cuda_value_type = CudaEnumType<fpType>::value;
+                auto cuda_order = get_cuda_order(dense_layout);
+                CUSPARSE_ERR_FUNC(cusparseCreateDnMat, &dmhandle->backend_handle, num_rows,
+                                  num_cols, ld, sc.get_mem(val), cuda_value_type, cuda_order);
+                dmhandle->num_rows = num_rows;
+                dmhandle->num_cols = num_cols;
+                dmhandle->ld = ld;
+                dmhandle->dense_layout = dense_layout;
+            }
+            else {
+                CUSPARSE_ERR_FUNC(cusparseDnMatSetValues, dmhandle->backend_handle,
+                                  sc.get_mem(val));
+            }
+            dmhandle->set_usm_ptr(val);
+        });
+    });
+    event.wait_and_throw();
+}
+
+FOR_EACH_FP_TYPE(INSTANTIATE_DENSE_MATRIX_FUNCS);
+
+sycl::event release_dense_matrix(sycl::queue &queue, dense_matrix_handle_t dmhandle,
+                                 const std::vector<sycl::event> &dependencies) {
+    // Use dispatch_submit_impl_fp to ensure the backend's handle is kept alive as long as the buffer is used
+    auto functor = [=](CusparseScopedContextHandler &) {
+        CUSPARSE_ERR_FUNC(cusparseDestroyDnMat, dmhandle->backend_handle);
+        delete dmhandle;
+    };
+    return dispatch_submit_impl_fp(__func__, queue, dependencies, functor, dmhandle);
+}
+
+// COO matrix
+template <typename fpType, typename intType>
+void init_coo_matrix(sycl::queue &queue, matrix_handle_t *p_smhandle, std::int64_t num_rows,
+                     std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index,
+                     sycl::buffer<intType, 1> row_ind, sycl::buffer<intType, 1> col_ind,
+                     sycl::buffer<fpType, 1> val) {
+    auto event = queue.submit([&](sycl::handler &cgh) {
+        auto row_acc = row_ind.template get_access<sycl::access::mode::read_write>(cgh);
+        auto col_acc = col_ind.template get_access<sycl::access::mode::read_write>(cgh);
+        auto val_acc = val.template get_access<sycl::access::mode::read_write>(cgh);
+        submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) {
+            // Ensure that a cusparse handle is created before any other cuSPARSE function is called.
+            sc.get_handle(queue);
+            auto cuda_index_type = CudaIndexEnumType<intType>::value;
+            auto cuda_index_base = get_cuda_index_base(index);
+            auto cuda_value_type = CudaEnumType<fpType>::value;
+            cusparseSpMatDescr_t cu_smhandle;
+            CUSPARSE_ERR_FUNC(cusparseCreateCoo, &cu_smhandle, num_rows, num_cols, nnz,
+                              sc.get_mem(row_acc), sc.get_mem(col_acc), sc.get_mem(val_acc),
+                              cuda_index_type, cuda_index_base, cuda_value_type);
+            *p_smhandle = new matrix_handle(cu_smhandle, row_ind, col_ind, val, num_rows, num_cols,
+                                            nnz, index);
+        });
+    });
+    event.wait_and_throw();
+}
+
+template <typename fpType, typename intType>
+void init_coo_matrix(sycl::queue &queue, matrix_handle_t *p_smhandle, std::int64_t num_rows,
+                     std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index,
+                     intType *row_ind, intType *col_ind, fpType *val) {
+    auto event = queue.submit([&](sycl::handler &cgh) {
+        submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) {
+            // Ensure that a cusparse handle is created before any other cuSPARSE function is called.
+            sc.get_handle(queue);
+            auto cuda_index_type = CudaIndexEnumType<intType>::value;
+            auto cuda_index_base = get_cuda_index_base(index);
+            auto cuda_value_type = CudaEnumType<fpType>::value;
+            cusparseSpMatDescr_t cu_smhandle;
+            CUSPARSE_ERR_FUNC(cusparseCreateCoo, &cu_smhandle, num_rows, num_cols, nnz,
+                              sc.get_mem(row_ind), sc.get_mem(col_ind), sc.get_mem(val),
+                              cuda_index_type, cuda_index_base, cuda_value_type);
+            *p_smhandle = new matrix_handle(cu_smhandle, row_ind, col_ind, val, num_rows, num_cols,
+                                            nnz, index);
+        });
+    });
+    event.wait_and_throw();
+}
+
+template <typename fpType, typename intType>
+void set_coo_matrix_data(sycl::queue &queue, matrix_handle_t smhandle, std::int64_t num_rows,
+                         std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index,
+                         sycl::buffer<intType, 1> row_ind, sycl::buffer<intType, 1> col_ind,
+                         sycl::buffer<fpType, 1> val) {
+    detail::check_can_reset_sparse_handle<fpType, intType>(__func__, smhandle, true);
+    auto event = queue.submit([&](sycl::handler &cgh) {
+        auto row_acc = row_ind.template get_access<sycl::access::mode::read_write>(cgh);
+        auto col_acc = col_ind.template get_access<sycl::access::mode::read_write>(cgh);
+        auto val_acc = val.template get_access<sycl::access::mode::read_write>(cgh);
+        submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) {
+            // Ensure that a cusparse handle is created before any other cuSPARSE function is called.
+            sc.get_handle(queue);
+            if (smhandle->num_rows != num_rows || smhandle->num_cols != num_cols ||
+                smhandle->nnz != nnz || smhandle->index != index) {
+                CUSPARSE_ERR_FUNC(cusparseDestroySpMat, smhandle->backend_handle);
+                auto cuda_index_type = CudaIndexEnumType<intType>::value;
+                auto cuda_index_base = get_cuda_index_base(index);
+                auto cuda_value_type = CudaEnumType<fpType>::value;
+                CUSPARSE_ERR_FUNC(cusparseCreateCoo, &smhandle->backend_handle, num_rows, num_cols,
+                                  nnz, sc.get_mem(row_acc), sc.get_mem(col_acc),
+                                  sc.get_mem(val_acc), cuda_index_type, cuda_index_base,
+                                  cuda_value_type);
+                smhandle->num_rows = num_rows;
+                smhandle->num_cols = num_cols;
+                smhandle->nnz = nnz;
+                smhandle->index = index;
+            }
+            else {
+                CUSPARSE_ERR_FUNC(cusparseCooSetPointers, smhandle->backend_handle,
+                                  sc.get_mem(row_acc), sc.get_mem(col_acc), sc.get_mem(val_acc));
+            }
+            smhandle->row_container.set_buffer(row_ind);
+            smhandle->col_container.set_buffer(col_ind);
+            smhandle->value_container.set_buffer(val);
+        });
+    });
+    event.wait_and_throw();
+}
+
+template <typename fpType, typename intType>
+void set_coo_matrix_data(sycl::queue &queue, matrix_handle_t smhandle, std::int64_t num_rows,
+                         std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index,
+                         intType *row_ind, intType *col_ind, fpType *val) {
+    detail::check_can_reset_sparse_handle<fpType, intType>(__func__, smhandle, false);
+    auto event = queue.submit([&](sycl::handler &cgh) {
+        submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) {
+            // Ensure that a cusparse handle is created before any other cuSPARSE function is called.
+            sc.get_handle(queue);
+            if (smhandle->num_rows != num_rows || smhandle->num_cols != num_cols ||
+                smhandle->nnz != nnz || smhandle->index != index) {
+                CUSPARSE_ERR_FUNC(cusparseDestroySpMat, smhandle->backend_handle);
+                auto cuda_index_type = CudaIndexEnumType<intType>::value;
+                auto cuda_index_base = get_cuda_index_base(index);
+                auto cuda_value_type = CudaEnumType<fpType>::value;
+                CUSPARSE_ERR_FUNC(cusparseCreateCoo, &smhandle->backend_handle, num_rows, num_cols,
+                                  nnz, sc.get_mem(row_ind), sc.get_mem(col_ind), sc.get_mem(val),
+                                  cuda_index_type, cuda_index_base, cuda_value_type);
+                smhandle->num_rows = num_rows;
+                smhandle->num_cols = num_cols;
+                smhandle->nnz = nnz;
+                smhandle->index = index;
+            }
+            else {
+                CUSPARSE_ERR_FUNC(cusparseCooSetPointers, smhandle->backend_handle,
+                                  sc.get_mem(row_ind), sc.get_mem(col_ind), sc.get_mem(val));
+            }
+            smhandle->row_container.set_usm_ptr(row_ind);
+            smhandle->col_container.set_usm_ptr(col_ind);
+            smhandle->value_container.set_usm_ptr(val);
+        });
+    });
+    event.wait_and_throw();
+}
+
+FOR_EACH_FP_AND_INT_TYPE(INSTANTIATE_COO_MATRIX_FUNCS);
+
+// CSR matrix
+template <typename fpType, typename intType>
+void init_csr_matrix(sycl::queue &queue, matrix_handle_t *p_smhandle, std::int64_t num_rows,
+                     std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index,
+                     sycl::buffer<intType, 1> row_ptr, sycl::buffer<intType, 1> col_ind,
+                     sycl::buffer<fpType, 1> val) {
+    auto event = queue.submit([&](sycl::handler &cgh) {
+        auto row_acc = row_ptr.template get_access<sycl::access::mode::read_write>(cgh);
+        auto col_acc = col_ind.template get_access<sycl::access::mode::read_write>(cgh);
+        auto val_acc = val.template get_access<sycl::access::mode::read_write>(cgh);
+        submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) {
+            // Ensure that a cusparse handle is created before any other cuSPARSE function is called.
+            sc.get_handle(queue);
+            auto cuda_index_type = CudaIndexEnumType<intType>::value;
+            auto cuda_index_base = get_cuda_index_base(index);
+            auto cuda_value_type = CudaEnumType<fpType>::value;
+            cusparseSpMatDescr_t cu_smhandle;
+            CUSPARSE_ERR_FUNC(cusparseCreateCsr, &cu_smhandle, num_rows, num_cols, nnz,
+                              sc.get_mem(row_acc), sc.get_mem(col_acc), sc.get_mem(val_acc),
+                              cuda_index_type, cuda_index_type, cuda_index_base, cuda_value_type);
+            *p_smhandle = new matrix_handle(cu_smhandle, row_ptr, col_ind, val, num_rows, num_cols,
+                                            nnz, index);
+        });
+    });
+    event.wait_and_throw();
+}
+
+template <typename fpType, typename intType>
+void init_csr_matrix(sycl::queue &queue, matrix_handle_t *p_smhandle, std::int64_t num_rows,
+                     std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index,
+                     intType *row_ptr, intType *col_ind, fpType *val) {
+    auto event = queue.submit([&](sycl::handler &cgh) {
+        submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) {
+            // Ensure that a cusparse handle is created before any other cuSPARSE function is called.
+            sc.get_handle(queue);
+            auto cuda_index_type = CudaIndexEnumType<intType>::value;
+            auto cuda_index_base = get_cuda_index_base(index);
+            auto cuda_value_type = CudaEnumType<fpType>::value;
+            cusparseSpMatDescr_t cu_smhandle;
+            CUSPARSE_ERR_FUNC(cusparseCreateCsr, &cu_smhandle, num_rows, num_cols, nnz,
+                              sc.get_mem(row_ptr), sc.get_mem(col_ind), sc.get_mem(val),
+                              cuda_index_type, cuda_index_type, cuda_index_base, cuda_value_type);
+            *p_smhandle = new matrix_handle(cu_smhandle, row_ptr, col_ind, val, num_rows, num_cols,
+                                            nnz, index);
+        });
+    });
+    event.wait_and_throw();
+}
+
+template <typename fpType, typename intType>
+void set_csr_matrix_data(sycl::queue &queue, matrix_handle_t smhandle, std::int64_t num_rows,
+                         std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index,
+                         sycl::buffer<intType, 1> row_ptr, sycl::buffer<intType, 1> col_ind,
+                         sycl::buffer<fpType, 1> val) {
+    detail::check_can_reset_sparse_handle<fpType, intType>(__func__, smhandle, true);
+    auto event = queue.submit([&](sycl::handler &cgh) {
+        auto row_acc = row_ptr.template get_access<sycl::access::mode::read_write>(cgh);
+        auto col_acc = col_ind.template get_access<sycl::access::mode::read_write>(cgh);
+        auto val_acc = val.template get_access<sycl::access::mode::read_write>(cgh);
+        submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) {
+            // Ensure that a cusparse handle is created before any other cuSPARSE function is called.
+            sc.get_handle(queue);
+            if (smhandle->num_rows != num_rows || smhandle->num_cols != num_cols ||
+                smhandle->nnz != nnz || smhandle->index != index) {
+                CUSPARSE_ERR_FUNC(cusparseDestroySpMat, smhandle->backend_handle);
+                auto cuda_index_type = CudaIndexEnumType<intType>::value;
+                auto cuda_index_base = get_cuda_index_base(index);
+                auto cuda_value_type = CudaEnumType<fpType>::value;
+                CUSPARSE_ERR_FUNC(cusparseCreateCsr, &smhandle->backend_handle, num_rows, num_cols,
+                                  nnz, sc.get_mem(row_acc), sc.get_mem(col_acc),
+                                  sc.get_mem(val_acc), cuda_index_type, cuda_index_type,
+                                  cuda_index_base, cuda_value_type);
+                smhandle->num_rows = num_rows;
+                smhandle->num_cols = num_cols;
+                smhandle->nnz = nnz;
+                smhandle->index = index;
+            }
+            else {
+                CUSPARSE_ERR_FUNC(cusparseCsrSetPointers, smhandle->backend_handle,
+                                  sc.get_mem(row_acc), sc.get_mem(col_acc), sc.get_mem(val_acc));
+            }
+            smhandle->row_container.set_buffer(row_ptr);
+            smhandle->col_container.set_buffer(col_ind);
+            smhandle->value_container.set_buffer(val);
+        });
+    });
+    event.wait_and_throw();
+}
+
+template <typename fpType, typename intType>
+void set_csr_matrix_data(sycl::queue &queue, matrix_handle_t smhandle, std::int64_t num_rows,
+                         std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index,
+                         intType *row_ptr, intType *col_ind, fpType *val) {
+    detail::check_can_reset_sparse_handle<fpType, intType>(__func__, smhandle, false);
+    auto event = queue.submit([&](sycl::handler &cgh) {
+        submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) {
+            // Ensure that a cusparse handle is created before any other cuSPARSE function is called.
+            sc.get_handle(queue);
+            if (smhandle->num_rows != num_rows || smhandle->num_cols != num_cols ||
+                smhandle->nnz != nnz || smhandle->index != index) {
+                CUSPARSE_ERR_FUNC(cusparseDestroySpMat, smhandle->backend_handle);
+                auto cuda_index_type = CudaIndexEnumType<intType>::value;
+                auto cuda_index_base = get_cuda_index_base(index);
+                auto cuda_value_type = CudaEnumType<fpType>::value;
+                CUSPARSE_ERR_FUNC(cusparseCreateCsr, &smhandle->backend_handle, num_rows, num_cols,
+                                  nnz, sc.get_mem(row_ptr), sc.get_mem(col_ind), sc.get_mem(val),
+                                  cuda_index_type, cuda_index_type, cuda_index_base,
+                                  cuda_value_type);
+                smhandle->num_rows = num_rows;
+                smhandle->num_cols = num_cols;
+                smhandle->nnz = nnz;
+                smhandle->index = index;
+            }
+            else {
+                CUSPARSE_ERR_FUNC(cusparseCsrSetPointers, smhandle->backend_handle,
+                                  sc.get_mem(row_ptr), sc.get_mem(col_ind), sc.get_mem(val));
+            }
+            smhandle->row_container.set_usm_ptr(row_ptr);
+            smhandle->col_container.set_usm_ptr(col_ind);
+            smhandle->value_container.set_usm_ptr(val);
+        });
+    });
+    event.wait_and_throw();
+}
+
+FOR_EACH_FP_AND_INT_TYPE(INSTANTIATE_CSR_MATRIX_FUNCS);
+
+sycl::event release_sparse_matrix(sycl::queue &queue, matrix_handle_t smhandle,
+                                  const std::vector<sycl::event> &dependencies) {
+    // Use dispatch_submit to ensure the backend's handle is kept alive as long as the buffers are used
+    auto functor = [=](CusparseScopedContextHandler &) {
+        CUSPARSE_ERR_FUNC(cusparseDestroySpMat, smhandle->backend_handle);
+        delete smhandle;
+    };
+    return dispatch_submit(__func__, queue, dependencies, functor, smhandle);
+}
+
+// Matrix property
+bool set_matrix_property(sycl::queue &, matrix_handle_t smhandle, matrix_property property) {
+    // No equivalent in cuSPARSE
+    // Store the matrix property internally for future usages
+    smhandle->set_matrix_property(property);
+    return false;
+}
+
+} // namespace oneapi::mkl::sparse::cusparse
diff --git a/src/sparse_blas/backends/cusparse/cusparse_handles.hpp b/src/sparse_blas/backends/cusparse/cusparse_handles.hpp
new file mode 100644
index 000000000..ac22d33ae
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/cusparse_handles.hpp
@@ -0,0 +1,78 @@
+/***************************************************************************
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+*
+**************************************************************************/
+
+#ifndef _ONEMKL_SRC_SPARSE_BLAS_BACKENDS_CUSPARSE_HANDLES_HPP_
+#define _ONEMKL_SRC_SPARSE_BLAS_BACKENDS_CUSPARSE_HANDLES_HPP_
+
+#include <cusparse.h>
+
+#include "sparse_blas/generic_container.hpp"
+
+namespace oneapi::mkl::sparse {
+
+// Complete the definition of incomplete types dense_vector_handle, dense_matrix_handle and matrix_handle.
+
+struct dense_vector_handle : public detail::generic_dense_vector_handle<cusparseDnVecDescr_t> {
+    template <typename T>
+    dense_vector_handle(cusparseDnVecDescr_t cu_descr, T* value_ptr, std::int64_t size)
+            : detail::generic_dense_vector_handle<cusparseDnVecDescr_t>(cu_descr, value_ptr, size) {
+    }
+
+    template <typename T>
+    dense_vector_handle(cusparseDnVecDescr_t cu_descr, const sycl::buffer<T, 1> value_buffer,
+                        std::int64_t size)
+            : detail::generic_dense_vector_handle<cusparseDnVecDescr_t>(cu_descr, value_buffer,
+                                                                        size) {}
+};
+
+struct dense_matrix_handle : public detail::generic_dense_matrix_handle<cusparseDnMatDescr_t> {
+    template <typename T>
+    dense_matrix_handle(cusparseDnMatDescr_t cu_descr, T* value_ptr, std::int64_t num_rows,
+                        std::int64_t num_cols, std::int64_t ld, layout dense_layout)
+            : detail::generic_dense_matrix_handle<cusparseDnMatDescr_t>(
+                  cu_descr, value_ptr, num_rows, num_cols, ld, dense_layout) {}
+
+    template <typename T>
+    dense_matrix_handle(cusparseDnMatDescr_t cu_descr, const sycl::buffer<T, 1> value_buffer,
+                        std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld,
+                        layout dense_layout)
+            : detail::generic_dense_matrix_handle<cusparseDnMatDescr_t>(
+                  cu_descr, value_buffer, num_rows, num_cols, ld, dense_layout) {}
+};
+
+struct matrix_handle : public detail::generic_sparse_handle<cusparseSpMatDescr_t> {
+    template <typename fpType, typename intType>
+    matrix_handle(cusparseSpMatDescr_t cu_descr, intType* row_ptr, intType* col_ptr,
+                  fpType* value_ptr, std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz,
+                  oneapi::mkl::index_base index)
+            : detail::generic_sparse_handle<cusparseSpMatDescr_t>(
+                  cu_descr, row_ptr, col_ptr, value_ptr, num_rows, num_cols, nnz, index) {}
+
+    template <typename fpType, typename intType>
+    matrix_handle(cusparseSpMatDescr_t cu_descr, const sycl::buffer<intType, 1> row_buffer,
+                  const sycl::buffer<intType, 1> col_buffer,
+                  const sycl::buffer<fpType, 1> value_buffer, std::int64_t num_rows,
+                  std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index)
+            : detail::generic_sparse_handle<cusparseSpMatDescr_t>(
+                  cu_descr, row_buffer, col_buffer, value_buffer, num_rows, num_cols, nnz, index) {}
+};
+
+} // namespace oneapi::mkl::sparse
+
+#endif // _ONEMKL_SRC_SPARSE_BLAS_BACKENDS_CUSPARSE_HANDLES_HPP_
diff --git a/src/sparse_blas/backends/cusparse/cusparse_helper.hpp b/src/sparse_blas/backends/cusparse/cusparse_helper.hpp
new file mode 100644
index 000000000..b392071f5
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/cusparse_helper.hpp
@@ -0,0 +1,165 @@
+/***************************************************************************
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+*
+**************************************************************************/
+#ifndef _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_HELPER_HPP_
+#define _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_HELPER_HPP_
+
+#include <complex>
+#include <cstdint>
+#include <limits>
+#include <string>
+
+#include <cusparse.h>
+
+#include "oneapi/mkl/sparse_blas/types.hpp"
+#include "sparse_blas/enum_data_types.hpp"
+#include "sparse_blas/sycl_helper.hpp"
+#include "cusparse_error.hpp"
+
+namespace oneapi::mkl::sparse::cusparse {
+
+template <typename T>
+struct CudaEnumType;
+template <>
+struct CudaEnumType<float> {
+    static constexpr cudaDataType_t value = CUDA_R_32F;
+};
+template <>
+struct CudaEnumType<double> {
+    static constexpr cudaDataType_t value = CUDA_R_64F;
+};
+template <>
+struct CudaEnumType<std::complex<float>> {
+    static constexpr cudaDataType_t value = CUDA_C_32F;
+};
+template <>
+struct CudaEnumType<std::complex<double>> {
+    static constexpr cudaDataType_t value = CUDA_C_64F;
+};
+
+template <typename T>
+struct CudaIndexEnumType;
+template <>
+struct CudaIndexEnumType<std::int32_t> {
+    static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_32I;
+};
+template <>
+struct CudaIndexEnumType<std::int64_t> {
+    static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_64I;
+};
+
+template <typename E>
+inline std::string cast_enum_to_str(E e) {
+    return std::to_string(static_cast<char>(e));
+}
+
+inline cudaDataType_t get_cuda_value_type(detail::data_type onemkl_data_type) {
+    switch (onemkl_data_type) {
+        case detail::data_type::real_fp32: return CUDA_R_32F;
+        case detail::data_type::real_fp64: return CUDA_R_64F;
+        case detail::data_type::complex_fp32: return CUDA_C_32F;
+        case detail::data_type::complex_fp64: return CUDA_C_64F;
+        default:
+            throw oneapi::mkl::invalid_argument(
+                "sparse_blas", "get_cuda_value_type",
+                "Invalid data type: " + cast_enum_to_str(onemkl_data_type));
+    }
+}
+
+inline cusparseOrder_t get_cuda_order(layout l) {
+    switch (l) {
+        case layout::row_major: return CUSPARSE_ORDER_ROW;
+        case layout::col_major: return CUSPARSE_ORDER_COL;
+        default:
+            throw oneapi::mkl::invalid_argument("sparse_blas", "get_cuda_order",
+                                                "Unknown layout: " + cast_enum_to_str(l));
+    }
+}
+
+inline cusparseIndexBase_t get_cuda_index_base(index_base index) {
+    switch (index) {
+        case index_base::zero: return CUSPARSE_INDEX_BASE_ZERO;
+        case index_base::one: return CUSPARSE_INDEX_BASE_ONE;
+        default:
+            throw oneapi::mkl::invalid_argument("sparse_blas", "get_cuda_index_base",
+                                                "Unknown index_base: " + cast_enum_to_str(index));
+    }
+}
+
+/// Return the CUDA transpose operation from a oneMKL type.
+/// Do not conjugate for real types to avoid an invalid argument.
+inline cusparseOperation_t get_cuda_operation(detail::data_type type, transpose op) {
+    switch (op) {
+        case transpose::nontrans: return CUSPARSE_OPERATION_NON_TRANSPOSE;
+        case transpose::trans: return CUSPARSE_OPERATION_TRANSPOSE;
+        case transpose::conjtrans:
+            return (type == detail::data_type::complex_fp32 ||
+                    type == detail::data_type::complex_fp64)
+                       ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
+                       : CUSPARSE_OPERATION_TRANSPOSE;
+        default:
+            throw oneapi::mkl::invalid_argument(
+                "sparse_blas", "get_cuda_operation",
+                "Unknown transpose operation: " + cast_enum_to_str(op));
+    }
+}
+
+inline auto get_cuda_uplo(uplo uplo_val) {
+    switch (uplo_val) {
+        case uplo::upper: return CUSPARSE_FILL_MODE_UPPER;
+        case uplo::lower: return CUSPARSE_FILL_MODE_LOWER;
+        default:
+            throw oneapi::mkl::invalid_argument("sparse_blas", "get_cuda_uplo",
+                                                "Unknown uplo: " + cast_enum_to_str(uplo_val));
+    }
+}
+
+inline auto get_cuda_diag(diag diag_val) {
+    switch (diag_val) {
+        case diag::nonunit: return CUSPARSE_DIAG_TYPE_NON_UNIT;
+        case diag::unit: return CUSPARSE_DIAG_TYPE_UNIT;
+        default:
+            throw oneapi::mkl::invalid_argument("sparse_blas", "get_cuda_diag",
+                                                "Unknown diag: " + cast_enum_to_str(diag_val));
+    }
+}
+
+inline void set_matrix_attributes(const std::string& func_name, cusparseSpMatDescr_t cu_a,
+                                  oneapi::mkl::sparse::matrix_view A_view) {
+    auto cu_fill_mode = get_cuda_uplo(A_view.uplo_view);
+    auto status = cusparseSpMatSetAttribute(cu_a, CUSPARSE_SPMAT_FILL_MODE, &cu_fill_mode,
+                                            sizeof(cu_fill_mode));
+    check_status(status, func_name + "/set_uplo");
+
+    auto cu_diag_type = get_cuda_diag(A_view.diag_view);
+    status = cusparseSpMatSetAttribute(cu_a, CUSPARSE_SPMAT_DIAG_TYPE, &cu_diag_type,
+                                       sizeof(cu_diag_type));
+    check_status(status, func_name + "/set_diag");
+}
+
+/**
+ * cuSPARSE requires to set the pointer mode for scalars parameters (typically alpha and beta).
+ */
+inline void set_pointer_mode(cusparseHandle_t cu_handle, bool is_ptr_host_accessible) {
+    cusparseSetPointerMode(cu_handle, is_ptr_host_accessible ? CUSPARSE_POINTER_MODE_HOST
+                                                             : CUSPARSE_POINTER_MODE_DEVICE);
+}
+
+} // namespace oneapi::mkl::sparse::cusparse
+
+#endif //_ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_HELPER_HPP_
diff --git a/src/sparse_blas/backends/cusparse/cusparse_scope_handle.cpp b/src/sparse_blas/backends/cusparse/cusparse_scope_handle.cpp
new file mode 100644
index 000000000..c25c7c92f
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/cusparse_scope_handle.cpp
@@ -0,0 +1,147 @@
+/***************************************************************************
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+*
+**************************************************************************/
+
+/**
+ * @file Similar to cublas_scope_handle.cpp
+*/
+
+#include "cusparse_scope_handle.hpp"
+
+namespace oneapi::mkl::sparse::cusparse {
+
+/**
+ * Inserts a new element in the map if its key is unique. This new element
+ * is constructed in place using args as the arguments for the construction
+ * of a value_type (which is an object of a pair type). The insertion only
+ * takes place if no other element in the container has a key equivalent to
+ * the one being emplaced (keys in a map container are unique).
+ */
+#ifdef ONEAPI_ONEMKL_PI_INTERFACE_REMOVED
+thread_local cusparse_global_handle<ur_context_handle_t>
+    CusparseScopedContextHandler::handle_helper = cusparse_global_handle<ur_context_handle_t>{};
+#else
+thread_local cusparse_global_handle<pi_context> CusparseScopedContextHandler::handle_helper =
+    cusparse_global_handle<pi_context>{};
+#endif
+
+CusparseScopedContextHandler::CusparseScopedContextHandler(sycl::queue queue,
+                                                           sycl::interop_handle &ih)
+        : ih(ih),
+          needToRecover_(false) {
+    placedContext_ = new sycl::context(queue.get_context());
+    auto cudaDevice = ih.get_native_device<sycl::backend::ext_oneapi_cuda>();
+    CUcontext desired;
+    CUDA_ERROR_FUNC(cuCtxGetCurrent, &original_);
+    CUDA_ERROR_FUNC(cuDevicePrimaryCtxRetain, &desired, cudaDevice);
+    if (original_ != desired) {
+        // Sets the desired context as the active one for the thread
+        CUDA_ERROR_FUNC(cuCtxSetCurrent, desired);
+        // No context is installed and the suggested context is primary
+        // This is the most common case. We can activate the context in the
+        // thread and leave it there until all the PI context referring to the
+        // same underlying CUDA primary context are destroyed. This emulates
+        // the behaviour of the CUDA runtime api, and avoids costly context
+        // switches. No action is required on this side of the if.
+        needToRecover_ = !(original_ == nullptr);
+    }
+}
+
+CusparseScopedContextHandler::~CusparseScopedContextHandler() noexcept(false) {
+    if (needToRecover_) {
+        CUDA_ERROR_FUNC(cuCtxSetCurrent, original_);
+    }
+    delete placedContext_;
+}
+
+void ContextCallback(void *userData) {
+    auto *ptr = static_cast<std::atomic<cusparseHandle_t> *>(userData);
+    if (!ptr) {
+        return;
+    }
+    auto handle = ptr->exchange(nullptr);
+    if (handle != nullptr) {
+        CUSPARSE_ERR_FUNC(cusparseDestroy, handle);
+        handle = nullptr;
+    }
+    else {
+        // if the handle is nullptr it means the handle was already destroyed by
+        // the cusparse_global_handle destructor and we're free to delete the atomic
+        // object.
+        delete ptr;
+    }
+}
+
+std::pair<cusparseHandle_t, CUstream> CusparseScopedContextHandler::get_handle_and_stream(
+    const sycl::queue &queue) {
+    auto cudaDevice = ih.get_native_device<sycl::backend::ext_oneapi_cuda>();
+    CUcontext desired;
+    CUDA_ERROR_FUNC(cuDevicePrimaryCtxRetain, &desired, cudaDevice);
+#ifdef ONEAPI_ONEMKL_PI_INTERFACE_REMOVED
+    auto piPlacedContext_ = reinterpret_cast<ur_context_handle_t>(desired);
+#else
+    auto piPlacedContext_ = reinterpret_cast<pi_context>(desired);
+#endif
+    CUstream streamId = get_stream(queue);
+    auto it = handle_helper.cusparse_global_handle_mapper_.find(piPlacedContext_);
+    if (it != handle_helper.cusparse_global_handle_mapper_.end()) {
+        if (it->second == nullptr) {
+            handle_helper.cusparse_global_handle_mapper_.erase(it);
+        }
+        else {
+            auto handle = it->second->load();
+            if (handle != nullptr) {
+                cudaStream_t currentStreamId;
+                CUSPARSE_ERR_FUNC(cusparseGetStream, handle, &currentStreamId);
+                if (currentStreamId != streamId) {
+                    CUSPARSE_ERR_FUNC(cusparseSetStream, handle, streamId);
+                }
+                return { handle, streamId };
+            }
+            else {
+                handle_helper.cusparse_global_handle_mapper_.erase(it);
+            }
+        }
+    }
+
+    cusparseHandle_t handle;
+    CUSPARSE_ERR_FUNC(cusparseCreate, &handle);
+    CUSPARSE_ERR_FUNC(cusparseSetStream, handle, streamId);
+
+    auto insert_iter = handle_helper.cusparse_global_handle_mapper_.insert(
+        std::make_pair(piPlacedContext_, new std::atomic<cusparseHandle_t>(handle)));
+
+    sycl::detail::pi::contextSetExtendedDeleter(*placedContext_, ContextCallback,
+                                                insert_iter.first->second);
+
+    return { handle, streamId };
+}
+
+cusparseHandle_t CusparseScopedContextHandler::get_handle(const sycl::queue &queue) {
+    return get_handle_and_stream(queue).first;
+}
+
+CUstream CusparseScopedContextHandler::get_stream(const sycl::queue &queue) {
+    return sycl::get_native<sycl::backend::ext_oneapi_cuda>(queue);
+}
+
+sycl::context CusparseScopedContextHandler::get_context(const sycl::queue &queue) {
+    return queue.get_context();
+}
+
+} // namespace oneapi::mkl::sparse::cusparse
diff --git a/src/sparse_blas/backends/cusparse/cusparse_scope_handle.hpp b/src/sparse_blas/backends/cusparse/cusparse_scope_handle.hpp
new file mode 100644
index 000000000..b56bb07cf
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/cusparse_scope_handle.hpp
@@ -0,0 +1,93 @@
+/***************************************************************************
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+*
+**************************************************************************/
+#ifndef _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_SCOPE_HANDLE_HPP_
+#define _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_SCOPE_HANDLE_HPP_
+
+/**
+ * @file Similar to cublas_scope_handle.hpp
+*/
+
+#if __has_include(<sycl/sycl.hpp>)
+#include <sycl/sycl.hpp>
+#else
+#include <CL/sycl.hpp>
+#endif
+
+// After Plugin Interface removal in DPC++ ur.hpp is the new include
+#if __has_include(<sycl/detail/ur.hpp>) && !defined(ONEAPI_ONEMKL_PI_INTERFACE_REMOVED)
+#define ONEAPI_ONEMKL_PI_INTERFACE_REMOVED
+#endif
+
+#include <thread>
+
+#include "cusparse_error.hpp"
+#include "cusparse_global_handle.hpp"
+#include "cusparse_helper.hpp"
+
+namespace oneapi::mkl::sparse::cusparse {
+
+class CusparseScopedContextHandler {
+    CUcontext original_;
+    sycl::context *placedContext_;
+    sycl::interop_handle &ih;
+    bool needToRecover_;
+
+#ifdef ONEAPI_ONEMKL_PI_INTERFACE_REMOVED
+    static thread_local cusparse_global_handle<ur_context_handle_t> handle_helper;
+#else
+    static thread_local cusparse_global_handle<pi_context> handle_helper;
+#endif
+
+    CUstream get_stream(const sycl::queue &queue);
+    sycl::context get_context(const sycl::queue &queue);
+
+public:
+    CusparseScopedContextHandler(sycl::queue queue, sycl::interop_handle &ih);
+
+    ~CusparseScopedContextHandler() noexcept(false);
+
+    /**
+     * @brief get_handle: creates the handle by implicitly impose the advice
+     * given by nvidia for creating a cusparse_global_handle. (e.g. one cuStream per device
+     * per thread).
+     * @param queue sycl queue.
+     * @return a pair of: cusparseHandle_t a handle to construct cusparse routines; and a CUDA stream
+     */
+    std::pair<cusparseHandle_t, CUstream> get_handle_and_stream(const sycl::queue &queue);
+
+    /// See get_handle_and_stream
+    cusparseHandle_t get_handle(const sycl::queue &queue);
+
+    // This is a work-around function for reinterpret_casting the memory. This
+    // will be fixed when SYCL-2020 has been implemented for Pi backend.
+    template <typename AccT>
+    inline void *get_mem(AccT acc) {
+        auto cudaPtr = ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(acc);
+        return reinterpret_cast<void *>(cudaPtr);
+    }
+
+    template <typename T>
+    inline void *get_mem(T *ptr) {
+        return reinterpret_cast<void *>(ptr);
+    }
+};
+
+} // namespace oneapi::mkl::sparse::cusparse
+
+#endif //_ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_SCOPE_HANDLE_HPP_
diff --git a/src/sparse_blas/backends/cusparse/cusparse_task.hpp b/src/sparse_blas/backends/cusparse/cusparse_task.hpp
new file mode 100644
index 000000000..e839c5100
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/cusparse_task.hpp
@@ -0,0 +1,382 @@
+/***************************************************************************
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+*
+**************************************************************************/
+
+#ifndef _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_TASKS_HPP_
+#define _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_TASKS_HPP_
+
+#include "cusparse_handles.hpp"
+#include "cusparse_scope_handle.hpp"
+
+/// This file provide a helper function to submit host_task using buffers or USM seamlessly
+
+namespace oneapi::mkl::sparse::cusparse {
+
+template <typename T, typename Container>
+auto get_value_accessor(sycl::handler &cgh, Container container) {
+    auto buffer_ptr =
+        reinterpret_cast<sycl::buffer<T, 1> *>(container->value_container.buffer_ptr.get());
+    return buffer_ptr->template get_access<sycl::access::mode::read_write>(cgh);
+}
+
+template <typename T, typename... Ts>
+auto get_fp_accessors(sycl::handler &cgh, Ts... containers) {
+    return std::array<sycl::accessor<T, 1>, sizeof...(containers)>{ get_value_accessor<T>(
+        cgh, containers)... };
+}
+
+template <typename T>
+auto get_row_accessor(sycl::handler &cgh, matrix_handle_t smhandle) {
+    auto buffer_ptr =
+        reinterpret_cast<sycl::buffer<T, 1> *>(smhandle->row_container.buffer_ptr.get());
+    return buffer_ptr->template get_access<sycl::access::mode::read_write>(cgh);
+}
+
+template <typename T>
+auto get_col_accessor(sycl::handler &cgh, matrix_handle_t smhandle) {
+    auto buffer_ptr =
+        reinterpret_cast<sycl::buffer<T, 1> *>(smhandle->col_container.buffer_ptr.get());
+    return buffer_ptr->template get_access<sycl::access::mode::read_write>(cgh);
+}
+
+template <typename T>
+auto get_int_accessors(sycl::handler &cgh, matrix_handle_t smhandle) {
+    return std::array<sycl::accessor<T, 1>, 2>{ get_row_accessor<T>(cgh, smhandle),
+                                                get_col_accessor<T>(cgh, smhandle) };
+}
+
+template <typename Functor, typename... CaptureOnlyAcc>
+void submit_host_task(sycl::handler &cgh, sycl::queue &queue, Functor functor,
+                      CaptureOnlyAcc... capture_only_accessors) {
+    // Only capture the accessors to ensure the dependencies are properly handled
+    // The accessors's pointer have already been set to the native container types in previous functions
+    cgh.host_task([functor, queue, capture_only_accessors...](sycl::interop_handle ih) {
+        auto unused = std::make_tuple(capture_only_accessors...);
+        (void)unused;
+        auto sc = CusparseScopedContextHandler(queue, ih);
+        functor(sc);
+    });
+}
+
+template <typename Functor, typename... CaptureOnlyAcc>
+void submit_host_task_with_acc(sycl::handler &cgh, sycl::queue &queue, Functor functor,
+                               sycl::accessor<std::uint8_t> workspace_placeholder_acc,
+                               CaptureOnlyAcc... capture_only_accessors) {
+    // Only capture the accessors to ensure the dependencies are properly handled
+    // The accessors's pointer have already been set to the native container types in previous functions
+    cgh.require(workspace_placeholder_acc);
+    cgh.host_task([functor, queue, workspace_placeholder_acc,
+                   capture_only_accessors...](sycl::interop_handle ih) {
+        auto unused = std::make_tuple(capture_only_accessors...);
+        (void)unused;
+        auto sc = CusparseScopedContextHandler(queue, ih);
+        functor(sc, workspace_placeholder_acc);
+    });
+}
+
+template <typename Functor, typename... CaptureOnlyAcc>
+void submit_native_command_ext(sycl::handler &cgh, sycl::queue &queue, Functor functor,
+                               const std::vector<sycl::event> &dependencies,
+                               CaptureOnlyAcc... capture_only_accessors) {
+    // Only capture the accessors to ensure the dependencies are properly handled
+    // The accessors's pointer have already been set to the native container types in previous functions
+#ifdef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND
+    cgh.ext_codeplay_enqueue_native_command(
+        [functor, queue, dependencies, capture_only_accessors...](sycl::interop_handle ih) {
+            auto unused = std::make_tuple(capture_only_accessors...);
+            (void)unused;
+            auto sc = CusparseScopedContextHandler(queue, ih);
+            // The functor using ext_codeplay_enqueue_native_command need to
+            // explicitly wait on the events for the SPARSE domain. The
+            // extension ext_codeplay_enqueue_native_command is used to launch
+            // the compute operation which depends on the previous optimize
+            // step. In cuSPARSE the optimize step is synchronous but it is
+            // asynchronous in oneMKL Interface. The optimize step may not use
+            // the CUDA stream which would make it impossible for
+            // ext_codeplay_enqueue_native_command to automatically ensure it
+            // has completed before the compute function starts. These waits are
+            // used to ensure the optimize step has completed before starting
+            // the computation.
+            for (auto event : dependencies) {
+                event.wait();
+            }
+            functor(sc);
+        });
+#else
+    (void)dependencies;
+    submit_host_task(cgh, queue, functor, capture_only_accessors...);
+#endif
+}
+
+template <typename Functor, typename... CaptureOnlyAcc>
+void submit_native_command_ext_with_acc(sycl::handler &cgh, sycl::queue &queue, Functor functor,
+                                        const std::vector<sycl::event> &dependencies,
+                                        sycl::accessor<std::uint8_t> workspace_placeholder_acc,
+                                        CaptureOnlyAcc... capture_only_accessors) {
+    // Only capture the accessors to ensure the dependencies are properly handled
+    // The accessors's pointer have already been set to the native container types in previous functions
+#ifdef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND
+    cgh.require(workspace_placeholder_acc);
+    cgh.ext_codeplay_enqueue_native_command([functor, queue, dependencies,
+                                             workspace_placeholder_acc,
+                                             capture_only_accessors...](sycl::interop_handle ih) {
+        auto unused = std::make_tuple(capture_only_accessors...);
+        (void)unused;
+        auto sc = CusparseScopedContextHandler(queue, ih);
+        // The functor using ext_codeplay_enqueue_native_command need to
+        // explicitly wait on the events for the SPARSE domain. The
+        // extension ext_codeplay_enqueue_native_command is used to launch
+        // the compute operation which depends on the previous optimize
+        // step. In cuSPARSE the optimize step is synchronous but it is
+        // asynchronous in oneMKL Interface. The optimize step may not use
+        // the CUDA stream which would make it impossible for
+        // ext_codeplay_enqueue_native_command to automatically ensure it
+        // has completed before the compute function starts. These waits are
+        // used to ensure the optimize step has completed before starting
+        // the computation.
+        for (auto event : dependencies) {
+            event.wait();
+        }
+        functor(sc, workspace_placeholder_acc);
+    });
+#else
+    (void)dependencies;
+    submit_host_task_with_acc(cgh, queue, functor, workspace_placeholder_acc,
+                              capture_only_accessors...);
+#endif
+}
+
+/// Helper submit functions to capture all accessors from the generic containers
+/// \p other_containers and ensure the dependencies of buffers are respected.
+/// The accessors are not directly used as the underlying data pointer has
+/// already been captured in previous functions.
+/// \p workspace_placeholder_acc is a placeholder accessor that will be bound to
+/// the cgh if not empty and given to the functor as a last argument.
+/// \p UseWorkspace must be true to use the placeholder accessor.
+/// \p UseEnqueueNativeCommandExt controls whether host_task are used or the
+/// extension ext_codeplay_enqueue_native_command is used to launch tasks. The
+/// extension should only be used for asynchronous functions using native
+/// backend's functions.
+template <bool UseWorkspace, bool UseEnqueueNativeCommandExt, typename Functor, typename... Ts>
+sycl::event dispatch_submit_impl_fp_int(const std::string &function_name, sycl::queue queue,
+                                        const std::vector<sycl::event> &dependencies,
+                                        Functor functor, matrix_handle_t sm_handle,
+                                        sycl::accessor<std::uint8_t, 1> workspace_placeholder_acc,
+                                        Ts... other_containers) {
+    if (sm_handle->all_use_buffer()) {
+        detail::data_type value_type = sm_handle->get_value_type();
+        detail::data_type int_type = sm_handle->get_int_type();
+
+#define ONEMKL_CUSPARSE_SUBMIT(FP_TYPE, INT_TYPE)                                                  \
+    return queue.submit([&](sycl::handler &cgh) {                                                  \
+        cgh.depends_on(dependencies);                                                              \
+        auto fp_accs = get_fp_accessors<FP_TYPE>(cgh, sm_handle, other_containers...);             \
+        auto int_accs = get_int_accessors<INT_TYPE>(cgh, sm_handle);                               \
+        if constexpr (UseWorkspace) {                                                              \
+            if constexpr (UseEnqueueNativeCommandExt) {                                            \
+                submit_native_command_ext_with_acc(cgh, queue, functor, dependencies,              \
+                                                   workspace_placeholder_acc, fp_accs, int_accs);  \
+            }                                                                                      \
+            else {                                                                                 \
+                submit_host_task_with_acc(cgh, queue, functor, workspace_placeholder_acc, fp_accs, \
+                                          int_accs);                                               \
+            }                                                                                      \
+        }                                                                                          \
+        else {                                                                                     \
+            (void)workspace_placeholder_acc;                                                       \
+            if constexpr (UseEnqueueNativeCommandExt) {                                            \
+                submit_native_command_ext(cgh, queue, functor, dependencies, fp_accs, int_accs);   \
+            }                                                                                      \
+            else {                                                                                 \
+                submit_host_task(cgh, queue, functor, fp_accs, int_accs);                          \
+            }                                                                                      \
+        }                                                                                          \
+    })
+#define ONEMKL_CUSPARSE_SUBMIT_INT(FP_TYPE)            \
+    if (int_type == detail::data_type::int32) {        \
+        ONEMKL_CUSPARSE_SUBMIT(FP_TYPE, std::int32_t); \
+    }                                                  \
+    else if (int_type == detail::data_type::int64) {   \
+        ONEMKL_CUSPARSE_SUBMIT(FP_TYPE, std::int64_t); \
+    }
+
+        if (value_type == detail::data_type::real_fp32) {
+            ONEMKL_CUSPARSE_SUBMIT_INT(float)
+        }
+        else if (value_type == detail::data_type::real_fp64) {
+            ONEMKL_CUSPARSE_SUBMIT_INT(double)
+        }
+        else if (value_type == detail::data_type::complex_fp32) {
+            ONEMKL_CUSPARSE_SUBMIT_INT(std::complex<float>)
+        }
+        else if (value_type == detail::data_type::complex_fp64) {
+            ONEMKL_CUSPARSE_SUBMIT_INT(std::complex<double>)
+        }
+
+#undef ONEMKL_CUSPARSE_SUBMIT_INT
+#undef ONEMKL_CUSPARSE_SUBMIT
+
+        throw oneapi::mkl::exception("sparse_blas", function_name,
+                                     "Could not dispatch buffer kernel to a supported type");
+    }
+    else {
+        // USM submit does not need to capture accessors
+        if constexpr (!UseWorkspace) {
+            return queue.submit([&](sycl::handler &cgh) {
+                cgh.depends_on(dependencies);
+                if constexpr (UseEnqueueNativeCommandExt) {
+                    submit_native_command_ext(cgh, queue, functor, dependencies);
+                }
+                else {
+                    submit_host_task(cgh, queue, functor);
+                }
+            });
+        }
+        else {
+            throw oneapi::mkl::exception("sparse_blas", function_name,
+                                         "Internal error: Cannot use accessor workspace with USM");
+        }
+    }
+}
+
+/// Similar to dispatch_submit_impl_fp_int but only dispatches the host_task based on the floating point value type.
+template <typename Functor, typename ContainerT>
+sycl::event dispatch_submit_impl_fp(const std::string &function_name, sycl::queue queue,
+                                    const std::vector<sycl::event> &dependencies, Functor functor,
+                                    ContainerT container_handle) {
+    if (container_handle->all_use_buffer()) {
+        detail::data_type value_type = container_handle->get_value_type();
+
+#define ONEMKL_CUSPARSE_SUBMIT(FP_TYPE)                                  \
+    return queue.submit([&](sycl::handler &cgh) {                        \
+        cgh.depends_on(dependencies);                                    \
+        auto fp_accs = get_fp_accessors<FP_TYPE>(cgh, container_handle); \
+        submit_host_task(cgh, queue, functor, fp_accs);                  \
+    })
+
+        if (value_type == detail::data_type::real_fp32) {
+            ONEMKL_CUSPARSE_SUBMIT(float);
+        }
+        else if (value_type == detail::data_type::real_fp64) {
+            ONEMKL_CUSPARSE_SUBMIT(double);
+        }
+        else if (value_type == detail::data_type::complex_fp32) {
+            ONEMKL_CUSPARSE_SUBMIT(std::complex<float>);
+        }
+        else if (value_type == detail::data_type::complex_fp64) {
+            ONEMKL_CUSPARSE_SUBMIT(std::complex<double>);
+        }
+
+#undef ONEMKL_CUSPARSE_SUBMIT
+
+        throw oneapi::mkl::exception("sparse_blas", function_name,
+                                     "Could not dispatch buffer kernel to a supported type");
+    }
+    else {
+        return queue.submit([&](sycl::handler &cgh) {
+            cgh.depends_on(dependencies);
+            submit_host_task(cgh, queue, functor);
+        });
+    }
+}
+
+/// Helper function for dispatch_submit_impl_fp_int
+template <typename Functor, typename... Ts>
+sycl::event dispatch_submit(const std::string &function_name, sycl::queue queue, Functor functor,
+                            matrix_handle_t sm_handle,
+                            sycl::accessor<std::uint8_t, 1> workspace_placeholder_acc,
+                            Ts... other_containers) {
+    constexpr bool UseWorkspace = true;
+    constexpr bool UseEnqueueNativeCommandExt = false;
+    return dispatch_submit_impl_fp_int<UseWorkspace, UseEnqueueNativeCommandExt>(
+        function_name, queue, {}, functor, sm_handle, workspace_placeholder_acc,
+        other_containers...);
+}
+
+/// Helper function for dispatch_submit_impl_fp_int
+template <typename Functor, typename... Ts>
+sycl::event dispatch_submit(const std::string &function_name, sycl::queue queue,
+                            const std::vector<sycl::event> &dependencies, Functor functor,
+                            matrix_handle_t sm_handle, Ts... other_containers) {
+    constexpr bool UseWorkspace = false;
+    constexpr bool UseEnqueueNativeCommandExt = false;
+    return dispatch_submit_impl_fp_int<UseWorkspace, UseEnqueueNativeCommandExt>(
+        function_name, queue, dependencies, functor, sm_handle, {}, other_containers...);
+}
+
+/// Helper function for dispatch_submit_impl_fp_int
+template <typename Functor, typename... Ts>
+sycl::event dispatch_submit(const std::string &function_name, sycl::queue queue, Functor functor,
+                            matrix_handle_t sm_handle, Ts... other_containers) {
+    constexpr bool UseWorkspace = false;
+    constexpr bool UseEnqueueNativeCommandExt = false;
+    return dispatch_submit_impl_fp_int<UseWorkspace, UseEnqueueNativeCommandExt>(
+        function_name, queue, {}, functor, sm_handle, {}, other_containers...);
+}
+
+/// Helper function for dispatch_submit_impl_fp_int
+template <typename Functor, typename... Ts>
+sycl::event dispatch_submit_native_ext(const std::string &function_name, sycl::queue queue,
+                                       Functor functor, matrix_handle_t sm_handle,
+                                       sycl::accessor<std::uint8_t, 1> workspace_placeholder_acc,
+                                       Ts... other_containers) {
+    constexpr bool UseWorkspace = true;
+#ifdef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND
+    constexpr bool UseEnqueueNativeCommandExt = true;
+#else
+    constexpr bool UseEnqueueNativeCommandExt = false;
+#endif
+    return dispatch_submit_impl_fp_int<UseWorkspace, UseEnqueueNativeCommandExt>(
+        function_name, queue, {}, functor, sm_handle, workspace_placeholder_acc,
+        other_containers...);
+}
+
+/// Helper function for dispatch_submit_impl_fp_int
+template <typename Functor, typename... Ts>
+sycl::event dispatch_submit_native_ext(const std::string &function_name, sycl::queue queue,
+                                       const std::vector<sycl::event> &dependencies,
+                                       Functor functor, matrix_handle_t sm_handle,
+                                       Ts... other_containers) {
+    constexpr bool UseWorkspace = false;
+#ifdef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND
+    constexpr bool UseEnqueueNativeCommandExt = true;
+#else
+    constexpr bool UseEnqueueNativeCommandExt = false;
+#endif
+    return dispatch_submit_impl_fp_int<UseWorkspace, UseEnqueueNativeCommandExt>(
+        function_name, queue, dependencies, functor, sm_handle, {}, other_containers...);
+}
+
+/// Helper function for dispatch_submit_impl_fp_int
+template <typename Functor, typename... Ts>
+sycl::event dispatch_submit_native_ext(const std::string &function_name, sycl::queue queue,
+                                       Functor functor, matrix_handle_t sm_handle,
+                                       Ts... other_containers) {
+    constexpr bool UseWorkspace = false;
+#ifdef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND
+    constexpr bool UseEnqueueNativeCommandExt = true;
+#else
+    constexpr bool UseEnqueueNativeCommandExt = false;
+#endif
+    return dispatch_submit_impl_fp_int<UseWorkspace, UseEnqueueNativeCommandExt>(
+        function_name, queue, {}, functor, sm_handle, {}, other_containers...);
+}
+
+} // namespace oneapi::mkl::sparse::cusparse
+
+#endif // _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_TASKS_HPP_
diff --git a/src/sparse_blas/backends/cusparse/cusparse_wrappers.cpp b/src/sparse_blas/backends/cusparse/cusparse_wrappers.cpp
new file mode 100644
index 000000000..278aec296
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/cusparse_wrappers.cpp
@@ -0,0 +1,32 @@
+/***************************************************************************
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+*
+**************************************************************************/
+
+#include "oneapi/mkl/sparse_blas/types.hpp"
+
+#include "oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp"
+
+#include "sparse_blas/function_table.hpp"
+
+#define WRAPPER_VERSION 1
+#define BACKEND         cusparse
+
+extern "C" sparse_blas_function_table_t mkl_sparse_blas_table = {
+    WRAPPER_VERSION,
+#include "sparse_blas/backends/backend_wrappers.cxx"
+};
diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp
new file mode 100644
index 000000000..09fe0515e
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp
@@ -0,0 +1,296 @@
+/***************************************************************************
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+*
+**************************************************************************/
+
+#include "oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp"
+
+#include "sparse_blas/backends/cusparse/cusparse_error.hpp"
+#include "sparse_blas/backends/cusparse/cusparse_helper.hpp"
+#include "sparse_blas/backends/cusparse/cusparse_task.hpp"
+#include "sparse_blas/backends/cusparse/cusparse_handles.hpp"
+#include "sparse_blas/common_op_verification.hpp"
+#include "sparse_blas/macros.hpp"
+#include "sparse_blas/matrix_view_comparison.hpp"
+#include "sparse_blas/sycl_helper.hpp"
+
+namespace oneapi::mkl::sparse {
+
+// Complete the definition of the incomplete type
+struct spmm_descr {
+    detail::generic_container workspace;
+    std::size_t temp_buffer_size = 0;
+    bool buffer_size_called = false;
+    bool optimized_called = false;
+    oneapi::mkl::transpose last_optimized_opA;
+    oneapi::mkl::transpose last_optimized_opB;
+    oneapi::mkl::sparse::matrix_view last_optimized_A_view;
+    oneapi::mkl::sparse::matrix_handle_t last_optimized_A_handle;
+    oneapi::mkl::sparse::dense_matrix_handle_t last_optimized_B_handle;
+    oneapi::mkl::sparse::dense_matrix_handle_t last_optimized_C_handle;
+    oneapi::mkl::sparse::spmm_alg last_optimized_alg;
+};
+
+} // namespace oneapi::mkl::sparse
+
+namespace oneapi::mkl::sparse::cusparse {
+
+void init_spmm_descr(sycl::queue& /*queue*/, spmm_descr_t* p_spmm_descr) {
+    *p_spmm_descr = new spmm_descr();
+}
+
+sycl::event release_spmm_descr(sycl::queue& queue, spmm_descr_t spmm_descr,
+                               const std::vector<sycl::event>& dependencies) {
+    return detail::submit_release(queue, spmm_descr, dependencies);
+}
+
+inline auto get_cuda_spmm_alg(spmm_alg alg) {
+    switch (alg) {
+        case spmm_alg::coo_alg1: return CUSPARSE_SPMM_COO_ALG1;
+        case spmm_alg::coo_alg2: return CUSPARSE_SPMM_COO_ALG2;
+        case spmm_alg::coo_alg3: return CUSPARSE_SPMM_COO_ALG3;
+        case spmm_alg::coo_alg4: return CUSPARSE_SPMM_COO_ALG4;
+        case spmm_alg::csr_alg1: return CUSPARSE_SPMM_CSR_ALG1;
+        case spmm_alg::csr_alg2: return CUSPARSE_SPMM_CSR_ALG2;
+        case spmm_alg::csr_alg3: return CUSPARSE_SPMM_CSR_ALG3;
+        default: return CUSPARSE_SPMM_ALG_DEFAULT;
+    }
+}
+
+inline void fallback_alg_if_needed(oneapi::mkl::sparse::spmm_alg& alg, oneapi::mkl::transpose opA,
+                                   oneapi::mkl::transpose opB) {
+    if (alg == oneapi::mkl::sparse::spmm_alg::csr_alg3 &&
+        (opA != oneapi::mkl::transpose::nontrans || opB == oneapi::mkl::transpose::conjtrans)) {
+        // Avoid warnings printed on std::cerr
+        alg = oneapi::mkl::sparse::spmm_alg::default_alg;
+    }
+}
+
+void spmm_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB,
+                      const void* alpha, oneapi::mkl::sparse::matrix_view A_view,
+                      oneapi::mkl::sparse::matrix_handle_t A_handle,
+                      oneapi::mkl::sparse::dense_matrix_handle_t B_handle, const void* beta,
+                      oneapi::mkl::sparse::dense_matrix_handle_t C_handle,
+                      oneapi::mkl::sparse::spmm_alg alg,
+                      oneapi::mkl::sparse::spmm_descr_t spmm_descr, std::size_t& temp_buffer_size) {
+    bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha);
+    bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta);
+    detail::check_valid_spmm_common(__func__, A_view, A_handle, B_handle, C_handle,
+                                    is_alpha_host_accessible, is_beta_host_accessible);
+    fallback_alg_if_needed(alg, opA, opB);
+    auto functor = [=, &temp_buffer_size](CusparseScopedContextHandler& sc) {
+        auto cu_handle = sc.get_handle(queue);
+        auto cu_a = A_handle->backend_handle;
+        auto cu_b = B_handle->backend_handle;
+        auto cu_c = C_handle->backend_handle;
+        auto type = A_handle->value_container.data_type;
+        auto cu_op_a = get_cuda_operation(type, opA);
+        auto cu_op_b = get_cuda_operation(type, opB);
+        auto cu_type = get_cuda_value_type(type);
+        auto cu_alg = get_cuda_spmm_alg(alg);
+        set_pointer_mode(cu_handle, is_alpha_host_accessible);
+        auto status = cusparseSpMM_bufferSize(cu_handle, cu_op_a, cu_op_b, alpha, cu_a, cu_b, beta,
+                                              cu_c, cu_type, cu_alg, &temp_buffer_size);
+        check_status(status, __func__);
+    };
+    auto event = dispatch_submit(__func__, queue, functor, A_handle, B_handle, C_handle);
+    event.wait_and_throw();
+    spmm_descr->temp_buffer_size = temp_buffer_size;
+    spmm_descr->buffer_size_called = true;
+}
+
+inline void common_spmm_optimize(
+    oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, bool is_alpha_host_accessible,
+    oneapi::mkl::sparse::matrix_view A_view, oneapi::mkl::sparse::matrix_handle_t A_handle,
+    oneapi::mkl::sparse::dense_matrix_handle_t B_handle, bool is_beta_host_accessible,
+    oneapi::mkl::sparse::dense_matrix_handle_t C_handle, oneapi::mkl::sparse::spmm_alg alg,
+    oneapi::mkl::sparse::spmm_descr_t spmm_descr) {
+    detail::check_valid_spmm_common("spmm_optimize", A_view, A_handle, B_handle, C_handle,
+                                    is_alpha_host_accessible, is_beta_host_accessible);
+    if (!spmm_descr->buffer_size_called) {
+        throw mkl::uninitialized("sparse_blas", "spmm_optimize",
+                                 "spmm_buffer_size must be called before spmm_optimize.");
+    }
+    spmm_descr->optimized_called = true;
+    spmm_descr->last_optimized_opA = opA;
+    spmm_descr->last_optimized_opB = opB;
+    spmm_descr->last_optimized_A_view = A_view;
+    spmm_descr->last_optimized_A_handle = A_handle;
+    spmm_descr->last_optimized_B_handle = B_handle;
+    spmm_descr->last_optimized_C_handle = C_handle;
+    spmm_descr->last_optimized_alg = alg;
+}
+
+void spmm_optimize_impl(cusparseHandle_t cu_handle, oneapi::mkl::transpose opA,
+                        oneapi::mkl::transpose opB, const void* alpha,
+                        oneapi::mkl::sparse::matrix_handle_t A_handle,
+                        oneapi::mkl::sparse::dense_matrix_handle_t B_handle, const void* beta,
+                        oneapi::mkl::sparse::dense_matrix_handle_t C_handle,
+                        oneapi::mkl::sparse::spmm_alg alg, void* workspace_ptr,
+                        bool is_alpha_host_accessible) {
+    auto cu_a = A_handle->backend_handle;
+    auto cu_b = B_handle->backend_handle;
+    auto cu_c = C_handle->backend_handle;
+    auto type = A_handle->value_container.data_type;
+    auto cu_op_a = get_cuda_operation(type, opA);
+    auto cu_op_b = get_cuda_operation(type, opB);
+    auto cu_type = get_cuda_value_type(type);
+    auto cu_alg = get_cuda_spmm_alg(alg);
+    set_pointer_mode(cu_handle, is_alpha_host_accessible);
+    auto status = cusparseSpMM_preprocess(cu_handle, cu_op_a, cu_op_b, alpha, cu_a, cu_b, beta,
+                                          cu_c, cu_type, cu_alg, workspace_ptr);
+    check_status(status, "optimize_spmm");
+}
+
+void spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB,
+                   const void* alpha, oneapi::mkl::sparse::matrix_view A_view,
+                   oneapi::mkl::sparse::matrix_handle_t A_handle,
+                   oneapi::mkl::sparse::dense_matrix_handle_t B_handle, const void* beta,
+                   oneapi::mkl::sparse::dense_matrix_handle_t C_handle,
+                   oneapi::mkl::sparse::spmm_alg alg, oneapi::mkl::sparse::spmm_descr_t spmm_descr,
+                   sycl::buffer<std::uint8_t, 1> workspace) {
+    bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha);
+    bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta);
+    if (!A_handle->all_use_buffer()) {
+        detail::throw_incompatible_container(__func__);
+    }
+    common_spmm_optimize(opA, opB, is_alpha_host_accessible, A_view, A_handle, B_handle,
+                         is_beta_host_accessible, C_handle, alg, spmm_descr);
+    // Copy the buffer to extend its lifetime until the descriptor is free'd.
+    spmm_descr->workspace.set_buffer_untyped(workspace);
+    if (alg == oneapi::mkl::sparse::spmm_alg::no_optimize_alg || workspace.size() == 0) {
+        // cusparseSpMM_preprocess cannot be called if the workspace is empty
+        return;
+    }
+    fallback_alg_if_needed(alg, opA, opB);
+    auto functor = [=](CusparseScopedContextHandler& sc,
+                       sycl::accessor<std::uint8_t> workspace_acc) {
+        auto cu_handle = sc.get_handle(queue);
+        auto workspace_ptr = sc.get_mem(workspace_acc);
+        spmm_optimize_impl(cu_handle, opA, opB, alpha, A_handle, B_handle, beta, C_handle, alg,
+                           workspace_ptr, is_alpha_host_accessible);
+    };
+
+    sycl::accessor<std::uint8_t, 1> workspace_placeholder_acc(workspace);
+    dispatch_submit(__func__, queue, functor, A_handle, workspace_placeholder_acc, B_handle,
+                    C_handle);
+}
+
+sycl::event spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA,
+                          oneapi::mkl::transpose opB, const void* alpha,
+                          oneapi::mkl::sparse::matrix_view A_view,
+                          oneapi::mkl::sparse::matrix_handle_t A_handle,
+                          oneapi::mkl::sparse::dense_matrix_handle_t B_handle, const void* beta,
+                          oneapi::mkl::sparse::dense_matrix_handle_t C_handle,
+                          oneapi::mkl::sparse::spmm_alg alg,
+                          oneapi::mkl::sparse::spmm_descr_t spmm_descr, void* workspace,
+                          const std::vector<sycl::event>& dependencies) {
+    bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha);
+    bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta);
+    if (A_handle->all_use_buffer()) {
+        detail::throw_incompatible_container(__func__);
+    }
+    common_spmm_optimize(opA, opB, is_alpha_host_accessible, A_view, A_handle, B_handle,
+                         is_beta_host_accessible, C_handle, alg, spmm_descr);
+    spmm_descr->workspace.usm_ptr = workspace;
+    if (alg == oneapi::mkl::sparse::spmm_alg::no_optimize_alg || workspace == nullptr) {
+        // cusparseSpMM_preprocess cannot be called if the workspace is empty
+        return detail::collapse_dependencies(queue, dependencies);
+    }
+    fallback_alg_if_needed(alg, opA, opB);
+    auto functor = [=](CusparseScopedContextHandler& sc) {
+        auto cu_handle = sc.get_handle(queue);
+        spmm_optimize_impl(cu_handle, opA, opB, alpha, A_handle, B_handle, beta, C_handle, alg,
+                           workspace, is_alpha_host_accessible);
+    };
+
+    return dispatch_submit(__func__, queue, dependencies, functor, A_handle, B_handle, C_handle);
+}
+
+sycl::event spmm(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB,
+                 const void* alpha, oneapi::mkl::sparse::matrix_view A_view,
+                 oneapi::mkl::sparse::matrix_handle_t A_handle,
+                 oneapi::mkl::sparse::dense_matrix_handle_t B_handle, const void* beta,
+                 oneapi::mkl::sparse::dense_matrix_handle_t C_handle,
+                 oneapi::mkl::sparse::spmm_alg alg, oneapi::mkl::sparse::spmm_descr_t spmm_descr,
+                 const std::vector<sycl::event>& dependencies) {
+    bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha);
+    bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta);
+    detail::check_valid_spmm_common(__func__, A_view, A_handle, B_handle, C_handle,
+                                    is_alpha_host_accessible, is_beta_host_accessible);
+    if (A_handle->all_use_buffer() != spmm_descr->workspace.use_buffer()) {
+        detail::throw_incompatible_container(__func__);
+    }
+
+    if (!spmm_descr->optimized_called) {
+        throw mkl::uninitialized("sparse_blas", __func__,
+                                 "spmm_optimize must be called before spmm.");
+    }
+    CHECK_DESCR_MATCH(spmm_descr, opA, "spmm_optimize");
+    CHECK_DESCR_MATCH(spmm_descr, opB, "spmm_optimize");
+    CHECK_DESCR_MATCH(spmm_descr, A_view, "spmm_optimize");
+    CHECK_DESCR_MATCH(spmm_descr, A_handle, "spmm_optimize");
+    CHECK_DESCR_MATCH(spmm_descr, B_handle, "spmm_optimize");
+    CHECK_DESCR_MATCH(spmm_descr, C_handle, "spmm_optimize");
+    CHECK_DESCR_MATCH(spmm_descr, alg, "spmm_optimize");
+
+    fallback_alg_if_needed(alg, opA, opB);
+    auto compute_functor = [=](CusparseScopedContextHandler& sc, void* workspace_ptr) {
+        auto [cu_handle, cu_stream] = sc.get_handle_and_stream(queue);
+        auto cu_a = A_handle->backend_handle;
+        auto cu_b = B_handle->backend_handle;
+        auto cu_c = C_handle->backend_handle;
+        auto type = A_handle->value_container.data_type;
+        auto cu_op_a = get_cuda_operation(type, opA);
+        auto cu_op_b = get_cuda_operation(type, opB);
+        auto cu_type = get_cuda_value_type(type);
+        auto cu_alg = get_cuda_spmm_alg(alg);
+        set_pointer_mode(cu_handle, is_alpha_host_accessible);
+        auto status = cusparseSpMM(cu_handle, cu_op_a, cu_op_b, alpha, cu_a, cu_b, beta, cu_c,
+                                   cu_type, cu_alg, workspace_ptr);
+        check_status(status, __func__);
+#ifndef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND
+        CUDA_ERROR_FUNC(cuStreamSynchronize, cu_stream);
+#endif
+    };
+    if (A_handle->all_use_buffer() && spmm_descr->temp_buffer_size > 0) {
+        // The accessor can only be bound to the cgh if the buffer size is
+        // greater than 0
+        auto functor_buffer = [=](CusparseScopedContextHandler& sc,
+                                  sycl::accessor<std::uint8_t> workspace_acc) {
+            auto workspace_ptr = sc.get_mem(workspace_acc);
+            compute_functor(sc, workspace_ptr);
+        };
+        sycl::accessor<std::uint8_t, 1> workspace_placeholder_acc(
+            spmm_descr->workspace.get_buffer<std::uint8_t>());
+        return dispatch_submit_native_ext(__func__, queue, functor_buffer, A_handle,
+                                          workspace_placeholder_acc, B_handle, C_handle);
+    }
+    else {
+        // The same dispatch_submit can be used for USM or buffers if no
+        // workspace accessor is needed, workspace_ptr will be a nullptr in the
+        // latter case.
+        auto workspace_ptr = spmm_descr->workspace.usm_ptr;
+        auto functor_usm = [=](CusparseScopedContextHandler& sc) {
+            compute_functor(sc, workspace_ptr);
+        };
+        return dispatch_submit_native_ext(__func__, queue, dependencies, functor_usm, A_handle,
+                                          B_handle, C_handle);
+    }
+}
+
+} // namespace oneapi::mkl::sparse::cusparse
diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp
new file mode 100644
index 000000000..e06f84695
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp
@@ -0,0 +1,323 @@
+/***************************************************************************
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+*
+**************************************************************************/
+
+#include "oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp"
+
+#include "sparse_blas/backends/cusparse/cusparse_error.hpp"
+#include "sparse_blas/backends/cusparse/cusparse_helper.hpp"
+#include "sparse_blas/backends/cusparse/cusparse_task.hpp"
+#include "sparse_blas/backends/cusparse/cusparse_handles.hpp"
+#include "sparse_blas/common_op_verification.hpp"
+#include "sparse_blas/macros.hpp"
+#include "sparse_blas/matrix_view_comparison.hpp"
+#include "sparse_blas/sycl_helper.hpp"
+
+namespace oneapi::mkl::sparse {
+
+// Complete the definition of the incomplete type
+struct spmv_descr {
+    detail::generic_container workspace;
+    std::size_t temp_buffer_size = 0;
+    bool buffer_size_called = false;
+    bool optimized_called = false;
+    oneapi::mkl::transpose last_optimized_opA;
+    oneapi::mkl::sparse::matrix_view last_optimized_A_view;
+    oneapi::mkl::sparse::matrix_handle_t last_optimized_A_handle;
+    oneapi::mkl::sparse::dense_vector_handle_t last_optimized_x_handle;
+    oneapi::mkl::sparse::dense_vector_handle_t last_optimized_y_handle;
+    oneapi::mkl::sparse::spmv_alg last_optimized_alg;
+};
+
+} // namespace oneapi::mkl::sparse
+
+namespace oneapi::mkl::sparse::cusparse {
+
+void init_spmv_descr(sycl::queue & /*queue*/, spmv_descr_t *p_spmv_descr) {
+    *p_spmv_descr = new spmv_descr();
+}
+
+sycl::event release_spmv_descr(sycl::queue &queue, spmv_descr_t spmv_descr,
+                               const std::vector<sycl::event> &dependencies) {
+    return detail::submit_release(queue, spmv_descr, dependencies);
+}
+
+inline auto get_cuda_spmv_alg(spmv_alg alg) {
+    switch (alg) {
+        case spmv_alg::coo_alg1: return CUSPARSE_SPMV_COO_ALG1;
+        case spmv_alg::coo_alg2: return CUSPARSE_SPMV_COO_ALG2;
+        case spmv_alg::csr_alg1: return CUSPARSE_SPMV_CSR_ALG1;
+        case spmv_alg::csr_alg2: return CUSPARSE_SPMV_CSR_ALG2;
+        default: return CUSPARSE_SPMV_ALG_DEFAULT;
+    }
+}
+
+void check_valid_spmv(const std::string &function_name, oneapi::mkl::transpose opA,
+                      oneapi::mkl::sparse::matrix_view A_view,
+                      oneapi::mkl::sparse::matrix_handle_t A_handle,
+                      oneapi::mkl::sparse::dense_vector_handle_t x_handle,
+                      oneapi::mkl::sparse::dense_vector_handle_t y_handle,
+                      bool is_alpha_host_accessible, bool is_beta_host_accessible) {
+    detail::check_valid_spmv_common(function_name, opA, A_view, A_handle, x_handle, y_handle,
+                                    is_alpha_host_accessible, is_beta_host_accessible);
+    if (A_view.type_view != oneapi::mkl::sparse::matrix_descr::general) {
+        throw mkl::unimplemented(
+            "sparse_blas", function_name,
+            "The backend does not support spmv with a `type_view` other than `matrix_descr::general`.");
+    }
+}
+
+void spmv_buffer_size(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha,
+                      oneapi::mkl::sparse::matrix_view A_view,
+                      oneapi::mkl::sparse::matrix_handle_t A_handle,
+                      oneapi::mkl::sparse::dense_vector_handle_t x_handle, const void *beta,
+                      oneapi::mkl::sparse::dense_vector_handle_t y_handle,
+                      oneapi::mkl::sparse::spmv_alg alg,
+                      oneapi::mkl::sparse::spmv_descr_t spmv_descr, std::size_t &temp_buffer_size) {
+    bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha);
+    bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta);
+    check_valid_spmv(__func__, opA, A_view, A_handle, x_handle, y_handle, is_alpha_host_accessible,
+                     is_beta_host_accessible);
+    auto functor = [=, &temp_buffer_size](CusparseScopedContextHandler &sc) {
+        auto cu_handle = sc.get_handle(queue);
+        auto cu_a = A_handle->backend_handle;
+        auto cu_x = x_handle->backend_handle;
+        auto cu_y = y_handle->backend_handle;
+        auto type = A_handle->value_container.data_type;
+        auto cu_op = get_cuda_operation(type, opA);
+        auto cu_type = get_cuda_value_type(type);
+        auto cu_alg = get_cuda_spmv_alg(alg);
+        set_pointer_mode(cu_handle, is_alpha_host_accessible);
+        auto status = cusparseSpMV_bufferSize(cu_handle, cu_op, alpha, cu_a, cu_x, beta, cu_y,
+                                              cu_type, cu_alg, &temp_buffer_size);
+        check_status(status, __func__);
+    };
+    auto event = dispatch_submit(__func__, queue, functor, A_handle, x_handle, y_handle);
+    event.wait_and_throw();
+    spmv_descr->temp_buffer_size = temp_buffer_size;
+    spmv_descr->buffer_size_called = true;
+}
+
+inline void common_spmv_optimize(oneapi::mkl::transpose opA, bool is_alpha_host_accessible,
+                                 oneapi::mkl::sparse::matrix_view A_view,
+                                 oneapi::mkl::sparse::matrix_handle_t A_handle,
+                                 oneapi::mkl::sparse::dense_vector_handle_t x_handle,
+                                 bool is_beta_host_accessible,
+                                 oneapi::mkl::sparse::dense_vector_handle_t y_handle,
+                                 oneapi::mkl::sparse::spmv_alg alg,
+                                 oneapi::mkl::sparse::spmv_descr_t spmv_descr) {
+    check_valid_spmv("spmv_optimize", opA, A_view, A_handle, x_handle, y_handle,
+                     is_alpha_host_accessible, is_beta_host_accessible);
+    if (!spmv_descr->buffer_size_called) {
+        throw mkl::uninitialized("sparse_blas", "spmv_optimize",
+                                 "spmv_buffer_size must be called before spmv_optimize.");
+    }
+    spmv_descr->optimized_called = true;
+    spmv_descr->last_optimized_opA = opA;
+    spmv_descr->last_optimized_A_view = A_view;
+    spmv_descr->last_optimized_A_handle = A_handle;
+    spmv_descr->last_optimized_x_handle = x_handle;
+    spmv_descr->last_optimized_y_handle = y_handle;
+    spmv_descr->last_optimized_alg = alg;
+}
+
+#if CUSPARSE_VERSION >= 12300
+// cusparseSpMV_preprocess was added in cuSPARSE 12.3.0.142 (CUDA 12.4)
+void spmv_optimize_impl(cusparseHandle_t cu_handle, oneapi::mkl::transpose opA, const void *alpha,
+                        oneapi::mkl::sparse::matrix_handle_t A_handle,
+                        oneapi::mkl::sparse::dense_vector_handle_t x_handle, const void *beta,
+                        oneapi::mkl::sparse::dense_vector_handle_t y_handle,
+                        oneapi::mkl::sparse::spmv_alg alg, void *workspace_ptr,
+                        bool is_alpha_host_accessible) {
+    auto cu_a = A_handle->backend_handle;
+    auto cu_x = x_handle->backend_handle;
+    auto cu_y = y_handle->backend_handle;
+    auto type = A_handle->value_container.data_type;
+    auto cu_op = get_cuda_operation(type, opA);
+    auto cu_type = get_cuda_value_type(type);
+    auto cu_alg = get_cuda_spmv_alg(alg);
+    set_pointer_mode(cu_handle, is_alpha_host_accessible);
+    auto status = cusparseSpMV_preprocess(cu_handle, cu_op, alpha, cu_a, cu_x, beta, cu_y, cu_type,
+                                          cu_alg, workspace_ptr);
+    check_status(status, "optimize_spmv");
+}
+#endif
+
+void spmv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha,
+                   oneapi::mkl::sparse::matrix_view A_view,
+                   oneapi::mkl::sparse::matrix_handle_t A_handle,
+                   oneapi::mkl::sparse::dense_vector_handle_t x_handle, const void *beta,
+                   oneapi::mkl::sparse::dense_vector_handle_t y_handle,
+                   oneapi::mkl::sparse::spmv_alg alg, oneapi::mkl::sparse::spmv_descr_t spmv_descr,
+                   sycl::buffer<std::uint8_t, 1> workspace) {
+    bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha);
+    bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta);
+    if (!A_handle->all_use_buffer()) {
+        detail::throw_incompatible_container(__func__);
+    }
+    common_spmv_optimize(opA, is_alpha_host_accessible, A_view, A_handle, x_handle,
+                         is_beta_host_accessible, y_handle, alg, spmv_descr);
+    // Copy the buffer to extend its lifetime until the descriptor is free'd.
+    spmv_descr->workspace.set_buffer_untyped(workspace);
+    if (alg == oneapi::mkl::sparse::spmv_alg::no_optimize_alg) {
+        return;
+    }
+
+#if CUSPARSE_VERSION < 12300
+    // cusparseSpMV_preprocess was added in cuSPARSE 12.3.0.142 (CUDA 12.4)
+    return;
+#else
+    if (spmv_descr->temp_buffer_size > 0) {
+        auto functor = [=](CusparseScopedContextHandler &sc,
+                           sycl::accessor<std::uint8_t> workspace_acc) {
+            auto cu_handle = sc.get_handle(queue);
+            auto workspace_ptr = sc.get_mem(workspace_acc);
+            spmv_optimize_impl(cu_handle, opA, alpha, A_handle, x_handle, beta, y_handle, alg,
+                               workspace_ptr, is_alpha_host_accessible);
+        };
+
+        // The accessor can only be bound to the cgh if the buffer size is
+        // greater than 0
+        sycl::accessor<std::uint8_t, 1> workspace_placeholder_acc(workspace);
+        dispatch_submit(__func__, queue, functor, A_handle, workspace_placeholder_acc, x_handle,
+                        y_handle);
+    }
+    else {
+        auto functor = [=](CusparseScopedContextHandler &sc) {
+            auto cu_handle = sc.get_handle(queue);
+            spmv_optimize_impl(cu_handle, opA, alpha, A_handle, x_handle, beta, y_handle, alg,
+                               nullptr, is_alpha_host_accessible);
+        };
+        dispatch_submit(__func__, queue, functor, A_handle, x_handle, y_handle);
+    }
+#endif
+}
+
+sycl::event spmv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha,
+                          oneapi::mkl::sparse::matrix_view A_view,
+                          oneapi::mkl::sparse::matrix_handle_t A_handle,
+                          oneapi::mkl::sparse::dense_vector_handle_t x_handle, const void *beta,
+                          oneapi::mkl::sparse::dense_vector_handle_t y_handle,
+                          oneapi::mkl::sparse::spmv_alg alg,
+                          oneapi::mkl::sparse::spmv_descr_t spmv_descr, void *workspace,
+                          const std::vector<sycl::event> &dependencies) {
+    bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha);
+    bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta);
+    if (A_handle->all_use_buffer()) {
+        detail::throw_incompatible_container(__func__);
+    }
+    common_spmv_optimize(opA, is_alpha_host_accessible, A_view, A_handle, x_handle,
+                         is_beta_host_accessible, y_handle, alg, spmv_descr);
+    spmv_descr->workspace.usm_ptr = workspace;
+    if (alg == oneapi::mkl::sparse::spmv_alg::no_optimize_alg) {
+        return detail::collapse_dependencies(queue, dependencies);
+    }
+
+#if CUSPARSE_VERSION < 12300
+    // cusparseSpMV_preprocess was added in cuSPARSE 12.3.0.142 (CUDA 12.4)
+    return detail::collapse_dependencies(queue, dependencies);
+#else
+    auto functor = [=](CusparseScopedContextHandler &sc) {
+        auto cu_handle = sc.get_handle(queue);
+        spmv_optimize_impl(cu_handle, opA, alpha, A_handle, x_handle, beta, y_handle, alg,
+                           workspace, is_alpha_host_accessible);
+    };
+    return dispatch_submit(__func__, queue, dependencies, functor, A_handle, x_handle, y_handle);
+#endif
+}
+
+sycl::event spmv(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha,
+                 oneapi::mkl::sparse::matrix_view A_view,
+                 oneapi::mkl::sparse::matrix_handle_t A_handle,
+                 oneapi::mkl::sparse::dense_vector_handle_t x_handle, const void *beta,
+                 oneapi::mkl::sparse::dense_vector_handle_t y_handle,
+                 oneapi::mkl::sparse::spmv_alg alg, oneapi::mkl::sparse::spmv_descr_t spmv_descr,
+                 const std::vector<sycl::event> &dependencies) {
+    bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha);
+    bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta);
+    check_valid_spmv(__func__, opA, A_view, A_handle, x_handle, y_handle, is_alpha_host_accessible,
+                     is_beta_host_accessible);
+    if (A_handle->all_use_buffer() != spmv_descr->workspace.use_buffer()) {
+        detail::throw_incompatible_container(__func__);
+    }
+
+    if (!spmv_descr->optimized_called) {
+        throw mkl::uninitialized("sparse_blas", __func__,
+                                 "spmv_optimize must be called before spmv.");
+    }
+    CHECK_DESCR_MATCH(spmv_descr, opA, "spmv_optimize");
+    CHECK_DESCR_MATCH(spmv_descr, A_view, "spmv_optimize");
+    CHECK_DESCR_MATCH(spmv_descr, A_handle, "spmv_optimize");
+    CHECK_DESCR_MATCH(spmv_descr, x_handle, "spmv_optimize");
+    CHECK_DESCR_MATCH(spmv_descr, y_handle, "spmv_optimize");
+    CHECK_DESCR_MATCH(spmv_descr, alg, "spmv_optimize");
+
+    auto compute_functor = [=](CusparseScopedContextHandler &sc, void *workspace_ptr) {
+        auto [cu_handle, cu_stream] = sc.get_handle_and_stream(queue);
+        auto cu_a = A_handle->backend_handle;
+        auto cu_x = x_handle->backend_handle;
+        auto cu_y = y_handle->backend_handle;
+        auto type = A_handle->value_container.data_type;
+        auto cu_op = get_cuda_operation(type, opA);
+        auto cu_type = get_cuda_value_type(type);
+        auto cu_alg = get_cuda_spmv_alg(alg);
+        // Workaround issue with captured alpha and beta causing a segfault inside cuSPARSE
+        // Copy alpha and beta locally in the largest data value type and use the local pointer
+        cuDoubleComplex local_alpha, local_beta;
+        const void *alpha_ptr = alpha, *beta_ptr = beta;
+        if (is_alpha_host_accessible) {
+            local_alpha = *reinterpret_cast<const cuDoubleComplex *>(alpha_ptr);
+            local_beta = *reinterpret_cast<const cuDoubleComplex *>(beta_ptr);
+            alpha_ptr = &local_alpha;
+            beta_ptr = &local_beta;
+        }
+        set_pointer_mode(cu_handle, is_alpha_host_accessible);
+        auto status = cusparseSpMV(cu_handle, cu_op, alpha_ptr, cu_a, cu_x, beta_ptr, cu_y, cu_type,
+                                   cu_alg, workspace_ptr);
+        check_status(status, __func__);
+#ifndef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND
+        CUDA_ERROR_FUNC(cuStreamSynchronize, cu_stream);
+#endif
+    };
+    if (A_handle->all_use_buffer() && spmv_descr->temp_buffer_size > 0) {
+        // The accessor can only be bound to the cgh if the buffer size is
+        // greater than 0
+        auto functor_buffer = [=](CusparseScopedContextHandler &sc,
+                                  sycl::accessor<std::uint8_t> workspace_acc) {
+            auto workspace_ptr = sc.get_mem(workspace_acc);
+            compute_functor(sc, workspace_ptr);
+        };
+        sycl::accessor<std::uint8_t, 1> workspace_placeholder_acc(
+            spmv_descr->workspace.get_buffer<std::uint8_t>());
+        return dispatch_submit_native_ext(__func__, queue, functor_buffer, A_handle,
+                                          workspace_placeholder_acc, x_handle, y_handle);
+    }
+    else {
+        // The same dispatch_submit can be used for USM or buffers if no
+        // workspace accessor is needed, workspace_ptr will be a nullptr in the
+        // latter case.
+        auto workspace_ptr = spmv_descr->workspace.usm_ptr;
+        auto functor_usm = [=](CusparseScopedContextHandler &sc) {
+            compute_functor(sc, workspace_ptr);
+        };
+        return dispatch_submit_native_ext(__func__, queue, dependencies, functor_usm, A_handle,
+                                          x_handle, y_handle);
+    }
+}
+
+} // namespace oneapi::mkl::sparse::cusparse
diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp
new file mode 100644
index 000000000..2f124caad
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp
@@ -0,0 +1,263 @@
+/***************************************************************************
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+*
+**************************************************************************/
+
+#include "oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp"
+
+#include "sparse_blas/backends/cusparse/cusparse_error.hpp"
+#include "sparse_blas/backends/cusparse/cusparse_helper.hpp"
+#include "sparse_blas/backends/cusparse/cusparse_task.hpp"
+#include "sparse_blas/backends/cusparse/cusparse_handles.hpp"
+#include "sparse_blas/common_op_verification.hpp"
+#include "sparse_blas/macros.hpp"
+#include "sparse_blas/matrix_view_comparison.hpp"
+#include "sparse_blas/sycl_helper.hpp"
+
+namespace oneapi::mkl::sparse {
+
+// Complete the definition of the incomplete type
+struct spsv_descr {
+    cusparseSpSVDescr_t cu_descr;
+    detail::generic_container workspace;
+    bool buffer_size_called = false;
+    bool optimized_called = false;
+    oneapi::mkl::transpose last_optimized_opA;
+    oneapi::mkl::sparse::matrix_view last_optimized_A_view;
+    oneapi::mkl::sparse::matrix_handle_t last_optimized_A_handle;
+    oneapi::mkl::sparse::dense_vector_handle_t last_optimized_x_handle;
+    oneapi::mkl::sparse::dense_vector_handle_t last_optimized_y_handle;
+    oneapi::mkl::sparse::spsv_alg last_optimized_alg;
+};
+
+} // namespace oneapi::mkl::sparse
+
+namespace oneapi::mkl::sparse::cusparse {
+
+void init_spsv_descr(sycl::queue & /*queue*/, spsv_descr_t *p_spsv_descr) {
+    *p_spsv_descr = new spsv_descr();
+    CUSPARSE_ERR_FUNC(cusparseSpSV_createDescr, &(*p_spsv_descr)->cu_descr);
+}
+
+sycl::event release_spsv_descr(sycl::queue &queue, spsv_descr_t spsv_descr,
+                               const std::vector<sycl::event> &dependencies) {
+    // Use dispatch_submit to ensure the backend's descriptor is kept alive as long as the buffers are used
+    auto functor = [=](CusparseScopedContextHandler &) {
+        CUSPARSE_ERR_FUNC(cusparseSpSV_destroyDescr, spsv_descr->cu_descr);
+        delete spsv_descr;
+    };
+    return dispatch_submit(__func__, queue, dependencies, functor,
+                           spsv_descr->last_optimized_A_handle, spsv_descr->last_optimized_x_handle,
+                           spsv_descr->last_optimized_y_handle);
+}
+
+inline auto get_cuda_spsv_alg(spsv_alg /*alg*/) {
+    return CUSPARSE_SPSV_ALG_DEFAULT;
+}
+
+void spsv_buffer_size(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha,
+                      oneapi::mkl::sparse::matrix_view A_view,
+                      oneapi::mkl::sparse::matrix_handle_t A_handle,
+                      oneapi::mkl::sparse::dense_vector_handle_t x_handle,
+                      oneapi::mkl::sparse::dense_vector_handle_t y_handle,
+                      oneapi::mkl::sparse::spsv_alg alg,
+                      oneapi::mkl::sparse::spsv_descr_t spsv_descr, std::size_t &temp_buffer_size) {
+    bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha);
+    detail::check_valid_spsv_common(__func__, A_view, A_handle, x_handle, y_handle,
+                                    is_alpha_host_accessible);
+    auto functor = [=, &temp_buffer_size](CusparseScopedContextHandler &sc) {
+        auto cu_handle = sc.get_handle(queue);
+        auto cu_a = A_handle->backend_handle;
+        auto cu_x = x_handle->backend_handle;
+        auto cu_y = y_handle->backend_handle;
+        auto type = A_handle->value_container.data_type;
+        set_matrix_attributes(__func__, cu_a, A_view);
+        auto cu_op = get_cuda_operation(type, opA);
+        auto cu_type = get_cuda_value_type(type);
+        auto cu_alg = get_cuda_spsv_alg(alg);
+        auto cu_descr = spsv_descr->cu_descr;
+        set_pointer_mode(cu_handle, is_alpha_host_accessible);
+        auto status = cusparseSpSV_bufferSize(cu_handle, cu_op, alpha, cu_a, cu_x, cu_y, cu_type,
+                                              cu_alg, cu_descr, &temp_buffer_size);
+        check_status(status, __func__);
+    };
+    auto event = dispatch_submit(__func__, queue, functor, A_handle, x_handle, y_handle);
+    event.wait_and_throw();
+    spsv_descr->buffer_size_called = true;
+}
+
+inline void common_spsv_optimize(oneapi::mkl::transpose opA, bool is_alpha_host_accessible,
+                                 oneapi::mkl::sparse::matrix_view A_view,
+                                 oneapi::mkl::sparse::matrix_handle_t A_handle,
+                                 oneapi::mkl::sparse::dense_vector_handle_t x_handle,
+                                 oneapi::mkl::sparse::dense_vector_handle_t y_handle,
+                                 oneapi::mkl::sparse::spsv_alg alg,
+                                 oneapi::mkl::sparse::spsv_descr_t spsv_descr) {
+    detail::check_valid_spsv_common("spsv_optimize", A_view, A_handle, x_handle, y_handle,
+                                    is_alpha_host_accessible);
+    if (!spsv_descr->buffer_size_called) {
+        throw mkl::uninitialized("sparse_blas", "spsv_optimize",
+                                 "spsv_buffer_size must be called before spsv_optimize.");
+    }
+    spsv_descr->optimized_called = true;
+    spsv_descr->last_optimized_opA = opA;
+    spsv_descr->last_optimized_A_view = A_view;
+    spsv_descr->last_optimized_A_handle = A_handle;
+    spsv_descr->last_optimized_x_handle = x_handle;
+    spsv_descr->last_optimized_y_handle = y_handle;
+    spsv_descr->last_optimized_alg = alg;
+}
+
+void spsv_optimize_impl(cusparseHandle_t cu_handle, oneapi::mkl::transpose opA, const void *alpha,
+                        oneapi::mkl::sparse::matrix_view A_view,
+                        oneapi::mkl::sparse::matrix_handle_t A_handle,
+                        oneapi::mkl::sparse::dense_vector_handle_t x_handle,
+                        oneapi::mkl::sparse::dense_vector_handle_t y_handle,
+                        oneapi::mkl::sparse::spsv_alg alg,
+                        oneapi::mkl::sparse::spsv_descr_t spsv_descr, void *workspace_ptr,
+                        bool is_alpha_host_accessible) {
+    auto cu_a = A_handle->backend_handle;
+    auto cu_x = x_handle->backend_handle;
+    auto cu_y = y_handle->backend_handle;
+    auto type = A_handle->value_container.data_type;
+    set_matrix_attributes("optimize_spsv", cu_a, A_view);
+    auto cu_op = get_cuda_operation(type, opA);
+    auto cu_type = get_cuda_value_type(type);
+    auto cu_alg = get_cuda_spsv_alg(alg);
+    auto cu_descr = spsv_descr->cu_descr;
+    set_pointer_mode(cu_handle, is_alpha_host_accessible);
+    auto status = cusparseSpSV_analysis(cu_handle, cu_op, alpha, cu_a, cu_x, cu_y, cu_type, cu_alg,
+                                        cu_descr, workspace_ptr);
+    check_status(status, "optimize_spsv");
+}
+
+void spsv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha,
+                   oneapi::mkl::sparse::matrix_view A_view,
+                   oneapi::mkl::sparse::matrix_handle_t A_handle,
+                   oneapi::mkl::sparse::dense_vector_handle_t x_handle,
+                   oneapi::mkl::sparse::dense_vector_handle_t y_handle,
+                   oneapi::mkl::sparse::spsv_alg alg, oneapi::mkl::sparse::spsv_descr_t spsv_descr,
+                   sycl::buffer<std::uint8_t, 1> workspace) {
+    bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha);
+    if (!A_handle->all_use_buffer()) {
+        detail::throw_incompatible_container(__func__);
+    }
+    common_spsv_optimize(opA, is_alpha_host_accessible, A_view, A_handle, x_handle, y_handle, alg,
+                         spsv_descr);
+    // Ignore spsv_alg::no_optimize_alg as this step is mandatory for cuSPARSE
+    // Copy the buffer to extend its lifetime until the descriptor is free'd.
+    spsv_descr->workspace.set_buffer_untyped(workspace);
+
+    if (workspace.size() > 0) {
+        auto functor = [=](CusparseScopedContextHandler &sc,
+                           sycl::accessor<std::uint8_t> workspace_acc) {
+            auto cu_handle = sc.get_handle(queue);
+            auto workspace_ptr = sc.get_mem(workspace_acc);
+            spsv_optimize_impl(cu_handle, opA, alpha, A_view, A_handle, x_handle, y_handle, alg,
+                               spsv_descr, workspace_ptr, is_alpha_host_accessible);
+        };
+
+        // The accessor can only be bound to the cgh if the buffer size is
+        // greater than 0
+        sycl::accessor<std::uint8_t, 1> workspace_placeholder_acc(workspace);
+        dispatch_submit(__func__, queue, functor, A_handle, workspace_placeholder_acc, x_handle,
+                        y_handle);
+    }
+    else {
+        auto functor = [=](CusparseScopedContextHandler &sc) {
+            auto cu_handle = sc.get_handle(queue);
+            spsv_optimize_impl(cu_handle, opA, alpha, A_view, A_handle, x_handle, y_handle, alg,
+                               spsv_descr, nullptr, is_alpha_host_accessible);
+        };
+
+        dispatch_submit(__func__, queue, functor, A_handle, x_handle, y_handle);
+    }
+}
+
+sycl::event spsv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha,
+                          oneapi::mkl::sparse::matrix_view A_view,
+                          oneapi::mkl::sparse::matrix_handle_t A_handle,
+                          oneapi::mkl::sparse::dense_vector_handle_t x_handle,
+                          oneapi::mkl::sparse::dense_vector_handle_t y_handle,
+                          oneapi::mkl::sparse::spsv_alg alg,
+                          oneapi::mkl::sparse::spsv_descr_t spsv_descr, void *workspace,
+                          const std::vector<sycl::event> &dependencies) {
+    bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha);
+    if (A_handle->all_use_buffer()) {
+        detail::throw_incompatible_container(__func__);
+    }
+    common_spsv_optimize(opA, is_alpha_host_accessible, A_view, A_handle, x_handle, y_handle, alg,
+                         spsv_descr);
+    // Ignore spsv_alg::no_optimize_alg as this step is mandatory for cuSPARSE
+    auto functor = [=](CusparseScopedContextHandler &sc) {
+        auto cu_handle = sc.get_handle(queue);
+        spsv_optimize_impl(cu_handle, opA, alpha, A_view, A_handle, x_handle, y_handle, alg,
+                           spsv_descr, workspace, is_alpha_host_accessible);
+    };
+    // No need to store the workspace USM pointer as the backend stores it already
+    return dispatch_submit(__func__, queue, dependencies, functor, A_handle, x_handle, y_handle);
+}
+
+sycl::event spsv(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha,
+                 oneapi::mkl::sparse::matrix_view A_view,
+                 oneapi::mkl::sparse::matrix_handle_t A_handle,
+                 oneapi::mkl::sparse::dense_vector_handle_t x_handle,
+                 oneapi::mkl::sparse::dense_vector_handle_t y_handle,
+                 oneapi::mkl::sparse::spsv_alg alg, oneapi::mkl::sparse::spsv_descr_t spsv_descr,
+                 const std::vector<sycl::event> &dependencies) {
+    bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha);
+    detail::check_valid_spsv_common(__func__, A_view, A_handle, x_handle, y_handle,
+                                    is_alpha_host_accessible);
+    if (A_handle->all_use_buffer() != spsv_descr->workspace.use_buffer()) {
+        detail::throw_incompatible_container(__func__);
+    }
+
+    if (!spsv_descr->optimized_called) {
+        throw mkl::uninitialized("sparse_blas", __func__,
+                                 "spsv_optimize must be called before spsv.");
+    }
+    CHECK_DESCR_MATCH(spsv_descr, opA, "spsv_optimize");
+    CHECK_DESCR_MATCH(spsv_descr, A_view, "spsv_optimize");
+    CHECK_DESCR_MATCH(spsv_descr, A_handle, "spsv_optimize");
+    CHECK_DESCR_MATCH(spsv_descr, x_handle, "spsv_optimize");
+    CHECK_DESCR_MATCH(spsv_descr, y_handle, "spsv_optimize");
+    CHECK_DESCR_MATCH(spsv_descr, alg, "spsv_optimize");
+
+    auto functor = [=](CusparseScopedContextHandler &sc) {
+        auto [cu_handle, cu_stream] = sc.get_handle_and_stream(queue);
+        auto cu_a = A_handle->backend_handle;
+        auto cu_x = x_handle->backend_handle;
+        auto cu_y = y_handle->backend_handle;
+        auto type = A_handle->value_container.data_type;
+        set_matrix_attributes(__func__, cu_a, A_view);
+        auto cu_op = get_cuda_operation(type, opA);
+        auto cu_type = get_cuda_value_type(type);
+        auto cu_alg = get_cuda_spsv_alg(alg);
+        auto cu_descr = spsv_descr->cu_descr;
+        set_pointer_mode(cu_handle, is_alpha_host_accessible);
+        auto status = cusparseSpSV_solve(cu_handle, cu_op, alpha, cu_a, cu_x, cu_y, cu_type, cu_alg,
+                                         cu_descr);
+        check_status(status, __func__);
+#ifndef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND
+        CUDA_ERROR_FUNC(cuStreamSynchronize, cu_stream);
+#endif
+    };
+    return dispatch_submit_native_ext(__func__, queue, dependencies, functor, A_handle, x_handle,
+                                      y_handle);
+}
+
+} // namespace oneapi::mkl::sparse::cusparse
diff --git a/src/sparse_blas/backends/mkl_common/mkl_dispatch.hpp b/src/sparse_blas/backends/mkl_common/mkl_dispatch.hpp
new file mode 100644
index 000000000..28c628438
--- /dev/null
+++ b/src/sparse_blas/backends/mkl_common/mkl_dispatch.hpp
@@ -0,0 +1,37 @@
+/***************************************************************************
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+*
+**************************************************************************/
+
+#ifndef _ONEMKL_SRC_SPARSE_BLAS_BACKENDS_MKL_COMMON_MKL_DISPATCH_HPP_
+#define _ONEMKL_SRC_SPARSE_BLAS_BACKENDS_MKL_COMMON_MKL_DISPATCH_HPP_
+
+/// Convert \p value_type to template type argument and use it to call \p op_functor.
+#define DISPATCH_MKL_OPERATION(function_name, value_type, op_functor, ...)                         \
+    switch (value_type) {                                                                          \
+        case detail::data_type::real_fp32: return op_functor<float>(__VA_ARGS__);                  \
+        case detail::data_type::real_fp64: return op_functor<double>(__VA_ARGS__);                 \
+        case detail::data_type::complex_fp32: return op_functor<std::complex<float>>(__VA_ARGS__); \
+        case detail::data_type::complex_fp64:                                                      \
+            return op_functor<std::complex<double>>(__VA_ARGS__);                                  \
+        default:                                                                                   \
+            throw oneapi::mkl::exception(                                                          \
+                "sparse_blas", function_name,                                                      \
+                "Internal error: unsupported type " + data_type_to_str(value_type));               \
+    }
+
+#endif // _ONEMKL_SRC_SPARSE_BLAS_BACKENDS_MKL_COMMON_MKL_DISPATCH_HPP_
diff --git a/src/sparse_blas/backends/mkl_common/mkl_handles.cxx b/src/sparse_blas/backends/mkl_common/mkl_handles.cxx
index 3ae84ca64..7550625eb 100644
--- a/src/sparse_blas/backends/mkl_common/mkl_handles.cxx
+++ b/src/sparse_blas/backends/mkl_common/mkl_handles.cxx
@@ -32,27 +32,11 @@ void init_dense_vector(sycl::queue & /*queue*/,
     *p_dvhandle = new oneapi::mkl::sparse::dense_vector_handle(val, size);
 }
 
-template <typename fpType, typename InternalHandleT>
-void check_can_reset_value_handle(const std::string &function_name,
-                                  InternalHandleT *internal_handle, bool expect_buffer) {
-    if (internal_handle->get_value_type() != detail::get_data_type<fpType>()) {
-        throw oneapi::mkl::invalid_argument(
-            "sparse_blas", function_name,
-            "Incompatible data types expected " +
-                data_type_to_str(internal_handle->get_value_type()) + " but got " +
-                data_type_to_str(detail::get_data_type<fpType>()));
-    }
-    if (internal_handle->all_use_buffer() != expect_buffer) {
-        throw oneapi::mkl::invalid_argument(
-            "sparse_blas", function_name, "Cannot change the container type between buffer or USM");
-    }
-}
-
 template <typename fpType>
 void set_dense_vector_data(sycl::queue & /*queue*/,
                            oneapi::mkl::sparse::dense_vector_handle_t dvhandle, std::int64_t size,
                            sycl::buffer<fpType, 1> val) {
-    check_can_reset_value_handle<fpType>(__func__, dvhandle, true);
+    detail::check_can_reset_value_handle<fpType>(__func__, dvhandle, true);
     dvhandle->size = size;
     dvhandle->set_buffer(val);
 }
@@ -61,26 +45,12 @@ template <typename fpType>
 void set_dense_vector_data(sycl::queue & /*queue*/,
                            oneapi::mkl::sparse::dense_vector_handle_t dvhandle, std::int64_t size,
                            fpType *val) {
-    check_can_reset_value_handle<fpType>(__func__, dvhandle, false);
+    detail::check_can_reset_value_handle<fpType>(__func__, dvhandle, false);
     dvhandle->size = size;
     dvhandle->set_usm_ptr(val);
 }
 
-#define INSTANTIATE_DENSE_VECTOR_FUNCS(FP_TYPE, FP_SUFFIX)                            \
-    template void init_dense_vector<FP_TYPE>(                                         \
-        sycl::queue & queue, oneapi::mkl::sparse::dense_vector_handle_t * p_dvhandle, \
-        std::int64_t size, sycl::buffer<FP_TYPE, 1> val);                             \
-    template void init_dense_vector<FP_TYPE>(                                         \
-        sycl::queue & queue, oneapi::mkl::sparse::dense_vector_handle_t * p_dvhandle, \
-        std::int64_t size, FP_TYPE * val);                                            \
-    template void set_dense_vector_data<FP_TYPE>(                                     \
-        sycl::queue & queue, oneapi::mkl::sparse::dense_vector_handle_t dvhandle,     \
-        std::int64_t size, sycl::buffer<FP_TYPE, 1> val);                             \
-    template void set_dense_vector_data<FP_TYPE>(                                     \
-        sycl::queue & queue, oneapi::mkl::sparse::dense_vector_handle_t dvhandle,     \
-        std::int64_t size, FP_TYPE * val)
 FOR_EACH_FP_TYPE(INSTANTIATE_DENSE_VECTOR_FUNCS);
-#undef INSTANTIATE_DENSE_VECTOR_FUNCS
 
 sycl::event release_dense_vector(sycl::queue &queue,
                                  oneapi::mkl::sparse::dense_vector_handle_t dvhandle,
@@ -112,7 +82,7 @@ void set_dense_matrix_data(sycl::queue & /*queue*/,
                            oneapi::mkl::sparse::dense_matrix_handle_t dmhandle,
                            std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld,
                            oneapi::mkl::layout dense_layout, sycl::buffer<fpType, 1> val) {
-    check_can_reset_value_handle<fpType>(__func__, dmhandle, true);
+    detail::check_can_reset_value_handle<fpType>(__func__, dmhandle, true);
     dmhandle->num_rows = num_rows;
     dmhandle->num_cols = num_cols;
     dmhandle->ld = ld;
@@ -125,7 +95,7 @@ void set_dense_matrix_data(sycl::queue & /*queue*/,
                            oneapi::mkl::sparse::dense_matrix_handle_t dmhandle,
                            std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld,
                            oneapi::mkl::layout dense_layout, fpType *val) {
-    check_can_reset_value_handle<fpType>(__func__, dmhandle, false);
+    detail::check_can_reset_value_handle<fpType>(__func__, dmhandle, false);
     dmhandle->num_rows = num_rows;
     dmhandle->num_cols = num_cols;
     dmhandle->ld = ld;
@@ -133,25 +103,7 @@ void set_dense_matrix_data(sycl::queue & /*queue*/,
     dmhandle->set_usm_ptr(val);
 }
 
-#define INSTANTIATE_DENSE_MATRIX_FUNCS(FP_TYPE, FP_SUFFIX)                            \
-    template void init_dense_matrix<FP_TYPE>(                                         \
-        sycl::queue & queue, oneapi::mkl::sparse::dense_matrix_handle_t * p_dmhandle, \
-        std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld,                \
-        oneapi::mkl::layout dense_layout, sycl::buffer<FP_TYPE, 1> val);              \
-    template void init_dense_matrix<FP_TYPE>(                                         \
-        sycl::queue & queue, oneapi::mkl::sparse::dense_matrix_handle_t * p_dmhandle, \
-        std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld,                \
-        oneapi::mkl::layout dense_layout, FP_TYPE * val);                             \
-    template void set_dense_matrix_data<FP_TYPE>(                                     \
-        sycl::queue & queue, oneapi::mkl::sparse::dense_matrix_handle_t dmhandle,     \
-        std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld,                \
-        oneapi::mkl::layout dense_layout, sycl::buffer<FP_TYPE, 1> val);              \
-    template void set_dense_matrix_data<FP_TYPE>(                                     \
-        sycl::queue & queue, oneapi::mkl::sparse::dense_matrix_handle_t dmhandle,     \
-        std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld,                \
-        oneapi::mkl::layout dense_layout, FP_TYPE * val)
 FOR_EACH_FP_TYPE(INSTANTIATE_DENSE_MATRIX_FUNCS);
-#undef INSTANTIATE_DENSE_MATRIX_FUNCS
 
 sycl::event release_dense_matrix(sycl::queue &queue,
                                  oneapi::mkl::sparse::dense_matrix_handle_t dmhandle,
@@ -167,7 +119,8 @@ void init_coo_matrix(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_t *p
                      sycl::buffer<intType, 1> col_ind, sycl::buffer<fpType, 1> val) {
     oneapi::mkl::sparse::matrix_handle_t mkl_handle;
     oneapi::mkl::sparse::init_matrix_handle(&mkl_handle);
-    auto internal_smhandle = new detail::sparse_matrix_handle(mkl_handle, row_ind, col_ind, val);
+    auto internal_smhandle = new detail::sparse_matrix_handle(mkl_handle, row_ind, col_ind, val,
+                                                              num_rows, num_cols, nnz, index);
     // The backend handle must use the buffers from the internal handle as they will be kept alive until the handle is released.
     oneapi::mkl::sparse::set_coo_data(queue, mkl_handle, static_cast<intType>(num_rows),
                                       static_cast<intType>(num_cols), static_cast<intType>(nnz),
@@ -184,7 +137,8 @@ void init_coo_matrix(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_t *p
                      fpType *val) {
     oneapi::mkl::sparse::matrix_handle_t mkl_handle;
     oneapi::mkl::sparse::init_matrix_handle(&mkl_handle);
-    auto internal_smhandle = new detail::sparse_matrix_handle(mkl_handle, row_ind, col_ind, val);
+    auto internal_smhandle = new detail::sparse_matrix_handle(mkl_handle, row_ind, col_ind, val,
+                                                              num_rows, num_cols, nnz, index);
     auto event = oneapi::mkl::sparse::set_coo_data(
         queue, mkl_handle, static_cast<intType>(num_rows), static_cast<intType>(num_cols),
         static_cast<intType>(nnz), index, row_ind, col_ind, val);
@@ -192,32 +146,17 @@ void init_coo_matrix(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_t *p
     *p_smhandle = reinterpret_cast<oneapi::mkl::sparse::matrix_handle_t>(internal_smhandle);
 }
 
-template <typename fpType, typename intType>
-void check_can_reset_sparse_handle(const std::string &function_name,
-                                   detail::sparse_matrix_handle *internal_smhandle,
-                                   bool expect_buffer) {
-    check_can_reset_value_handle<fpType>(function_name, internal_smhandle, expect_buffer);
-    if (internal_smhandle->get_int_type() != detail::get_data_type<intType>()) {
-        throw oneapi::mkl::invalid_argument(
-            "sparse_blas", function_name,
-            "Incompatible data types expected " +
-                data_type_to_str(internal_smhandle->get_int_type()) + " but got " +
-                data_type_to_str(detail::get_data_type<intType>()));
-    }
-    if (!internal_smhandle->can_be_reset) {
-        throw mkl::unimplemented(
-            "sparse_blas/mkl", function_name,
-            "Reseting the matrix handle's data after it was used in a computation is not supported.");
-    }
-}
-
 template <typename fpType, typename intType>
 void set_coo_matrix_data(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_t smhandle,
                          std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz,
                          oneapi::mkl::index_base index, sycl::buffer<intType, 1> row_ind,
                          sycl::buffer<intType, 1> col_ind, sycl::buffer<fpType, 1> val) {
     auto internal_smhandle = detail::get_internal_handle(smhandle);
-    check_can_reset_sparse_handle<fpType, intType>(__func__, internal_smhandle, true);
+    detail::check_can_reset_sparse_handle<fpType, intType>(__func__, internal_smhandle, true);
+    internal_smhandle->num_rows = num_rows;
+    internal_smhandle->num_cols = num_cols;
+    internal_smhandle->nnz = nnz;
+    internal_smhandle->index = index;
     internal_smhandle->row_container.set_buffer(row_ind);
     internal_smhandle->col_container.set_buffer(col_ind);
     internal_smhandle->value_container.set_buffer(val);
@@ -236,7 +175,11 @@ void set_coo_matrix_data(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_
                          oneapi::mkl::index_base index, intType *row_ind, intType *col_ind,
                          fpType *val) {
     auto internal_smhandle = detail::get_internal_handle(smhandle);
-    check_can_reset_sparse_handle<fpType, intType>(__func__, internal_smhandle, false);
+    detail::check_can_reset_sparse_handle<fpType, intType>(__func__, internal_smhandle, false);
+    internal_smhandle->num_rows = num_rows;
+    internal_smhandle->num_cols = num_cols;
+    internal_smhandle->nnz = nnz;
+    internal_smhandle->index = index;
     internal_smhandle->row_container.set_usm_ptr(row_ind);
     internal_smhandle->col_container.set_usm_ptr(col_ind);
     internal_smhandle->value_container.set_usm_ptr(val);
@@ -246,37 +189,18 @@ void set_coo_matrix_data(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_
     event.wait_and_throw();
 }
 
-#define INSTANTIATE_COO_MATRIX_FUNCS(FP_TYPE, FP_SUFFIX, INT_TYPE, INT_SUFFIX)                     \
-    template void init_coo_matrix<FP_TYPE, INT_TYPE>(                                              \
-        sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t * p_smhandle,                    \
-        std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz,                            \
-        oneapi::mkl::index_base index, sycl::buffer<INT_TYPE, 1> row_ind,                          \
-        sycl::buffer<INT_TYPE, 1> col_ind, sycl::buffer<FP_TYPE, 1> val);                          \
-    template void init_coo_matrix<FP_TYPE, INT_TYPE>(                                              \
-        sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t * p_smhandle,                    \
-        std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz,                            \
-        oneapi::mkl::index_base index, INT_TYPE * row_ind, INT_TYPE * col_ind, FP_TYPE * val);     \
-    template void set_coo_matrix_data<FP_TYPE, INT_TYPE>(                                          \
-        sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, \
-        std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index,                    \
-        sycl::buffer<INT_TYPE, 1> row_ind, sycl::buffer<INT_TYPE, 1> col_ind,                      \
-        sycl::buffer<FP_TYPE, 1> val);                                                             \
-    template void set_coo_matrix_data<FP_TYPE, INT_TYPE>(                                          \
-        sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, \
-        std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index,                    \
-        INT_TYPE * row_ind, INT_TYPE * col_ind, FP_TYPE * val)
 FOR_EACH_FP_AND_INT_TYPE(INSTANTIATE_COO_MATRIX_FUNCS);
-#undef INSTANTIATE_COO_MATRIX_FUNCS
 
 // CSR matrix
 template <typename fpType, typename intType>
 void init_csr_matrix(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_t *p_smhandle,
-                     std::int64_t num_rows, std::int64_t num_cols, std::int64_t /*nnz*/,
+                     std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz,
                      oneapi::mkl::index_base index, sycl::buffer<intType, 1> row_ptr,
                      sycl::buffer<intType, 1> col_ind, sycl::buffer<fpType, 1> val) {
     oneapi::mkl::sparse::matrix_handle_t mkl_handle;
     oneapi::mkl::sparse::init_matrix_handle(&mkl_handle);
-    auto internal_smhandle = new detail::sparse_matrix_handle(mkl_handle, row_ptr, col_ind, val);
+    auto internal_smhandle = new detail::sparse_matrix_handle(mkl_handle, row_ptr, col_ind, val,
+                                                              num_rows, num_cols, nnz, index);
     // The backend deduces nnz from row_ptr.
     // The backend handle must use the buffers from the internal handle as they will be kept alive until the handle is released.
     oneapi::mkl::sparse::set_csr_data(queue, mkl_handle, static_cast<intType>(num_rows),
@@ -289,12 +213,13 @@ void init_csr_matrix(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_t *p
 
 template <typename fpType, typename intType>
 void init_csr_matrix(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_t *p_smhandle,
-                     std::int64_t num_rows, std::int64_t num_cols, std::int64_t /*nnz*/,
+                     std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz,
                      oneapi::mkl::index_base index, intType *row_ptr, intType *col_ind,
                      fpType *val) {
     oneapi::mkl::sparse::matrix_handle_t mkl_handle;
     oneapi::mkl::sparse::init_matrix_handle(&mkl_handle);
-    auto internal_smhandle = new detail::sparse_matrix_handle(mkl_handle, row_ptr, col_ind, val);
+    auto internal_smhandle = new detail::sparse_matrix_handle(mkl_handle, row_ptr, col_ind, val,
+                                                              num_rows, num_cols, nnz, index);
     // The backend deduces nnz from row_ptr.
     auto event = oneapi::mkl::sparse::set_csr_data(
         queue, mkl_handle, static_cast<intType>(num_rows), static_cast<intType>(num_cols), index,
@@ -305,11 +230,15 @@ void init_csr_matrix(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_t *p
 
 template <typename fpType, typename intType>
 void set_csr_matrix_data(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_t smhandle,
-                         std::int64_t num_rows, std::int64_t num_cols, std::int64_t /*nnz*/,
+                         std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz,
                          oneapi::mkl::index_base index, sycl::buffer<intType, 1> row_ptr,
                          sycl::buffer<intType, 1> col_ind, sycl::buffer<fpType, 1> val) {
     auto internal_smhandle = detail::get_internal_handle(smhandle);
-    check_can_reset_sparse_handle<fpType, intType>(__func__, internal_smhandle, true);
+    detail::check_can_reset_sparse_handle<fpType, intType>(__func__, internal_smhandle, true);
+    internal_smhandle->num_rows = num_rows;
+    internal_smhandle->num_cols = num_cols;
+    internal_smhandle->nnz = nnz;
+    internal_smhandle->index = index;
     internal_smhandle->row_container.set_buffer(row_ptr);
     internal_smhandle->col_container.set_buffer(col_ind);
     internal_smhandle->value_container.set_buffer(val);
@@ -325,11 +254,15 @@ void set_csr_matrix_data(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_
 
 template <typename fpType, typename intType>
 void set_csr_matrix_data(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_t smhandle,
-                         std::int64_t num_rows, std::int64_t num_cols, std::int64_t /*nnz*/,
+                         std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz,
                          oneapi::mkl::index_base index, intType *row_ptr, intType *col_ind,
                          fpType *val) {
     auto internal_smhandle = detail::get_internal_handle(smhandle);
-    check_can_reset_sparse_handle<fpType, intType>(__func__, internal_smhandle, false);
+    detail::check_can_reset_sparse_handle<fpType, intType>(__func__, internal_smhandle, false);
+    internal_smhandle->num_rows = num_rows;
+    internal_smhandle->num_cols = num_cols;
+    internal_smhandle->nnz = nnz;
+    internal_smhandle->index = index;
     internal_smhandle->row_container.set_usm_ptr(row_ptr);
     internal_smhandle->col_container.set_usm_ptr(col_ind);
     internal_smhandle->value_container.set_usm_ptr(val);
@@ -340,27 +273,7 @@ void set_csr_matrix_data(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_
     event.wait_and_throw();
 }
 
-#define INSTANTIATE_CSR_MATRIX_FUNCS(FP_TYPE, FP_SUFFIX, INT_TYPE, INT_SUFFIX)                     \
-    template void init_csr_matrix<FP_TYPE, INT_TYPE>(                                              \
-        sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t * p_smhandle,                    \
-        std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz,                            \
-        oneapi::mkl::index_base index, sycl::buffer<INT_TYPE, 1> row_ptr,                          \
-        sycl::buffer<INT_TYPE, 1> col_ind, sycl::buffer<FP_TYPE, 1> val);                          \
-    template void init_csr_matrix<FP_TYPE, INT_TYPE>(                                              \
-        sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t * p_smhandle,                    \
-        std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz,                            \
-        oneapi::mkl::index_base index, INT_TYPE * row_ptr, INT_TYPE * col_ind, FP_TYPE * val);     \
-    template void set_csr_matrix_data<FP_TYPE, INT_TYPE>(                                          \
-        sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, \
-        std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index,                    \
-        sycl::buffer<INT_TYPE, 1> row_ptr, sycl::buffer<INT_TYPE, 1> col_ind,                      \
-        sycl::buffer<FP_TYPE, 1> val);                                                             \
-    template void set_csr_matrix_data<FP_TYPE, INT_TYPE>(                                          \
-        sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, \
-        std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index,                    \
-        INT_TYPE * row_ptr, INT_TYPE * col_ind, FP_TYPE * val)
 FOR_EACH_FP_AND_INT_TYPE(INSTANTIATE_CSR_MATRIX_FUNCS);
-#undef INSTANTIATE_CSR_MATRIX_FUNCS
 
 // Common sparse matrix functions
 sycl::event release_sparse_matrix(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_t smhandle,
@@ -369,7 +282,7 @@ sycl::event release_sparse_matrix(sycl::queue &queue, oneapi::mkl::sparse::matri
     // Asynchronously release the backend's handle followed by the internal handle.
     auto event = oneapi::mkl::sparse::release_matrix_handle(
         queue, &internal_smhandle->backend_handle, dependencies);
-    return detail::submit_release(queue, internal_smhandle, event);
+    return detail::submit_release(queue, internal_smhandle, { event });
 }
 
 bool set_matrix_property(sycl::queue & /*queue*/, oneapi::mkl::sparse::matrix_handle_t smhandle,
diff --git a/src/sparse_blas/backends/mkl_common/mkl_handles.hpp b/src/sparse_blas/backends/mkl_common/mkl_handles.hpp
index efadd72e7..24a61ce5e 100644
--- a/src/sparse_blas/backends/mkl_common/mkl_handles.hpp
+++ b/src/sparse_blas/backends/mkl_common/mkl_handles.hpp
@@ -26,6 +26,8 @@
 #include <oneapi/mkl/spblas.hpp>
 
 #include "sparse_blas/generic_container.hpp"
+#include "sparse_blas/macros.hpp"
+#include "sparse_blas/sycl_helper.hpp"
 
 namespace oneapi::mkl::sparse {
 
diff --git a/src/sparse_blas/backends/mkl_common/mkl_spmm.cxx b/src/sparse_blas/backends/mkl_common/mkl_spmm.cxx
index dad611252..acde45cb4 100644
--- a/src/sparse_blas/backends/mkl_common/mkl_spmm.cxx
+++ b/src/sparse_blas/backends/mkl_common/mkl_spmm.cxx
@@ -50,35 +50,9 @@ void check_valid_spmm(const std::string &function_name, oneapi::mkl::transpose o
                       oneapi::mkl::sparse::dense_matrix_handle_t B_handle,
                       oneapi::mkl::sparse::dense_matrix_handle_t C_handle,
                       bool is_alpha_host_accessible, bool is_beta_host_accessible) {
-    THROW_IF_NULLPTR(function_name, A_handle);
-    THROW_IF_NULLPTR(function_name, B_handle);
-    THROW_IF_NULLPTR(function_name, C_handle);
-
     auto internal_A_handle = detail::get_internal_handle(A_handle);
-    detail::check_all_containers_compatible(function_name, internal_A_handle, B_handle, C_handle);
-    if (internal_A_handle->all_use_buffer()) {
-        detail::check_ptr_is_host_accessible("spmm", "alpha", is_alpha_host_accessible);
-        detail::check_ptr_is_host_accessible("spmm", "beta", is_beta_host_accessible);
-    }
-    if (is_alpha_host_accessible != is_beta_host_accessible) {
-        throw mkl::invalid_argument(
-            "sparse_blas", function_name,
-            "Alpha and beta must both be placed on host memory or device memory.");
-    }
-    if (B_handle->dense_layout != C_handle->dense_layout) {
-        throw mkl::invalid_argument("sparse_blas", function_name,
-                                    "B and C matrices must used the same layout.");
-    }
-
-    if (A_view.type_view != oneapi::mkl::sparse::matrix_descr::general) {
-        throw mkl::invalid_argument("sparse_blas", function_name,
-                                    "Matrix view's type must be `matrix_descr::general`.");
-    }
-
-    if (A_view.diag_view != oneapi::mkl::diag::nonunit) {
-        throw mkl::invalid_argument("sparse_blas", function_name,
-                                    "Matrix's diag_view must be `nonunit`.");
-    }
+    detail::check_valid_spmm_common(function_name, A_view, internal_A_handle, B_handle, C_handle,
+                                    is_alpha_host_accessible, is_beta_host_accessible);
 
 #if BACKEND == gpu
     detail::data_type data_type = internal_A_handle->get_value_type();
diff --git a/src/sparse_blas/backends/mkl_common/mkl_spmv.cxx b/src/sparse_blas/backends/mkl_common/mkl_spmv.cxx
index d2332286b..cba197848 100644
--- a/src/sparse_blas/backends/mkl_common/mkl_spmv.cxx
+++ b/src/sparse_blas/backends/mkl_common/mkl_spmv.cxx
@@ -49,32 +49,9 @@ void check_valid_spmv(const std::string &function_name, oneapi::mkl::transpose o
                       oneapi::mkl::sparse::dense_vector_handle_t x_handle,
                       oneapi::mkl::sparse::dense_vector_handle_t y_handle,
                       bool is_alpha_host_accessible, bool is_beta_host_accessible) {
-    THROW_IF_NULLPTR(function_name, A_handle);
-    THROW_IF_NULLPTR(function_name, x_handle);
-    THROW_IF_NULLPTR(function_name, y_handle);
-
     auto internal_A_handle = detail::get_internal_handle(A_handle);
-    detail::check_all_containers_compatible(function_name, internal_A_handle, x_handle, y_handle);
-    if (internal_A_handle->all_use_buffer()) {
-        detail::check_ptr_is_host_accessible("spmv", "alpha", is_alpha_host_accessible);
-        detail::check_ptr_is_host_accessible("spmv", "beta", is_beta_host_accessible);
-    }
-    if (is_alpha_host_accessible != is_beta_host_accessible) {
-        throw mkl::invalid_argument(
-            "sparse_blas", function_name,
-            "Alpha and beta must both be placed on host memory or device memory.");
-    }
-    if (A_view.type_view == oneapi::mkl::sparse::matrix_descr::diagonal) {
-        throw mkl::invalid_argument("sparse_blas", function_name,
-                                    "Matrix view's type cannot be diagonal.");
-    }
-
-    if (A_view.type_view != oneapi::mkl::sparse::matrix_descr::triangular &&
-        A_view.diag_view == oneapi::mkl::diag::unit) {
-        throw mkl::invalid_argument(
-            "sparse_blas", function_name,
-            "`unit` diag_view can only be used with a triangular type_view.");
-    }
+    detail::check_valid_spmv_common(__func__, opA, A_view, internal_A_handle, x_handle, y_handle,
+                                    is_alpha_host_accessible, is_beta_host_accessible);
 
     if ((A_view.type_view == oneapi::mkl::sparse::matrix_descr::symmetric ||
          A_view.type_view == oneapi::mkl::sparse::matrix_descr::hermitian) &&
diff --git a/src/sparse_blas/backends/mkl_common/mkl_spsv.cxx b/src/sparse_blas/backends/mkl_common/mkl_spsv.cxx
index 7ef5b3c39..01575ac36 100644
--- a/src/sparse_blas/backends/mkl_common/mkl_spsv.cxx
+++ b/src/sparse_blas/backends/mkl_common/mkl_spsv.cxx
@@ -49,11 +49,10 @@ void check_valid_spsv(const std::string &function_name, oneapi::mkl::transpose o
                       oneapi::mkl::sparse::dense_vector_handle_t x_handle,
                       oneapi::mkl::sparse::dense_vector_handle_t y_handle,
                       bool is_alpha_host_accessible, oneapi::mkl::sparse::spsv_alg alg) {
-    THROW_IF_NULLPTR(function_name, A_handle);
-    THROW_IF_NULLPTR(function_name, x_handle);
-    THROW_IF_NULLPTR(function_name, y_handle);
-
     auto internal_A_handle = detail::get_internal_handle(A_handle);
+    detail::check_valid_spsv_common(function_name, A_view, internal_A_handle, x_handle, y_handle,
+                                    is_alpha_host_accessible);
+
     if (alg == oneapi::mkl::sparse::spsv_alg::no_optimize_alg &&
         !internal_A_handle->has_matrix_property(oneapi::mkl::sparse::matrix_property::sorted)) {
         throw mkl::unimplemented(
@@ -72,16 +71,6 @@ void check_valid_spsv(const std::string &function_name, oneapi::mkl::transpose o
 #else
     (void)opA;
 #endif // BACKEND
-
-    detail::check_all_containers_compatible(function_name, internal_A_handle, x_handle, y_handle);
-    if (A_view.type_view != matrix_descr::triangular) {
-        throw mkl::invalid_argument("sparse_blas", function_name,
-                                    "Matrix view's type must be `matrix_descr::triangular`.");
-    }
-
-    if (internal_A_handle->all_use_buffer()) {
-        detail::check_ptr_is_host_accessible("spsv", "alpha", is_alpha_host_accessible);
-    }
 }
 
 void spsv_buffer_size(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha,
diff --git a/src/sparse_blas/backends/mklcpu/mklcpu_handles.cpp b/src/sparse_blas/backends/mklcpu/mklcpu_handles.cpp
index a6ea51629..0aaf91b25 100644
--- a/src/sparse_blas/backends/mklcpu/mklcpu_handles.cpp
+++ b/src/sparse_blas/backends/mklcpu/mklcpu_handles.cpp
@@ -19,7 +19,7 @@
 
 #include "oneapi/mkl/sparse_blas/detail/mklcpu/onemkl_sparse_blas_mklcpu.hpp"
 
-#include "sparse_blas/backends/mkl_common/mkl_helper.hpp"
+#include "sparse_blas/backends/mkl_common/mkl_dispatch.hpp"
 #include "sparse_blas/backends/mkl_common/mkl_handles.hpp"
 
 namespace oneapi::mkl::sparse::mklcpu {
diff --git a/src/sparse_blas/backends/mklcpu/mklcpu_operations.cpp b/src/sparse_blas/backends/mklcpu/mklcpu_operations.cpp
index 0929a7ef4..ebc8ceecf 100644
--- a/src/sparse_blas/backends/mklcpu/mklcpu_operations.cpp
+++ b/src/sparse_blas/backends/mklcpu/mklcpu_operations.cpp
@@ -17,10 +17,12 @@
 *
 **************************************************************************/
 
+#include "sparse_blas/backends/mkl_common/mkl_dispatch.hpp"
 #include "sparse_blas/backends/mkl_common/mkl_handles.hpp"
-#include "sparse_blas/backends/mkl_common/mkl_helper.hpp"
+#include "sparse_blas/common_op_verification.hpp"
 #include "sparse_blas/macros.hpp"
 #include "sparse_blas/matrix_view_comparison.hpp"
+#include "sparse_blas/sycl_helper.hpp"
 
 #include "oneapi/mkl/sparse_blas/detail/mklcpu/onemkl_sparse_blas_mklcpu.hpp"
 
diff --git a/src/sparse_blas/backends/mklgpu/mklgpu_handles.cpp b/src/sparse_blas/backends/mklgpu/mklgpu_handles.cpp
index 7cb9853a7..648fed66e 100644
--- a/src/sparse_blas/backends/mklgpu/mklgpu_handles.cpp
+++ b/src/sparse_blas/backends/mklgpu/mklgpu_handles.cpp
@@ -19,8 +19,8 @@
 
 #include "oneapi/mkl/sparse_blas/detail/mklgpu/onemkl_sparse_blas_mklgpu.hpp"
 
+#include "sparse_blas/backends/mkl_common/mkl_dispatch.hpp"
 #include "sparse_blas/backends/mkl_common/mkl_handles.hpp"
-#include "sparse_blas/backends/mkl_common/mkl_helper.hpp"
 
 namespace oneapi::mkl::sparse::mklgpu {
 
diff --git a/src/sparse_blas/backends/mklgpu/mklgpu_operations.cpp b/src/sparse_blas/backends/mklgpu/mklgpu_operations.cpp
index be5e0c0aa..1102306dc 100644
--- a/src/sparse_blas/backends/mklgpu/mklgpu_operations.cpp
+++ b/src/sparse_blas/backends/mklgpu/mklgpu_operations.cpp
@@ -17,10 +17,12 @@
 *
 **************************************************************************/
 
+#include "sparse_blas/backends/mkl_common/mkl_dispatch.hpp"
 #include "sparse_blas/backends/mkl_common/mkl_handles.hpp"
-#include "sparse_blas/backends/mkl_common/mkl_helper.hpp"
+#include "sparse_blas/common_op_verification.hpp"
 #include "sparse_blas/macros.hpp"
 #include "sparse_blas/matrix_view_comparison.hpp"
+#include "sparse_blas/sycl_helper.hpp"
 
 #include "oneapi/mkl/sparse_blas/detail/mklgpu/onemkl_sparse_blas_mklgpu.hpp"
 
diff --git a/src/sparse_blas/common_op_verification.hpp b/src/sparse_blas/common_op_verification.hpp
new file mode 100644
index 000000000..e496c725e
--- /dev/null
+++ b/src/sparse_blas/common_op_verification.hpp
@@ -0,0 +1,142 @@
+/***************************************************************************
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+*
+**************************************************************************/
+
+#ifndef _ONEMKL_SRC_SPARSE_BLAS_COMMON_OP_VERIFICATION_HPP_
+#define _ONEMKL_SRC_SPARSE_BLAS_COMMON_OP_VERIFICATION_HPP_
+
+#include <string>
+
+#if __has_include(<sycl/sycl.hpp>)
+#include <sycl/sycl.hpp>
+#else
+#include <CL/sycl.hpp>
+#endif
+
+#include "oneapi/mkl/sparse_blas/types.hpp"
+#include "macros.hpp"
+
+namespace oneapi::mkl::sparse::detail {
+
+/// Throw an exception if the scalar is not accessible in the host
+inline void check_ptr_is_host_accessible(const std::string &function_name,
+                                         const std::string &scalar_name,
+                                         bool is_ptr_accessible_on_host) {
+    if (!is_ptr_accessible_on_host) {
+        throw mkl::invalid_argument(
+            "sparse_blas", function_name,
+            "Scalar " + scalar_name + " must be accessible on the host for buffer functions.");
+    }
+}
+
+template <typename InternalSparseMatHandleT>
+void check_valid_spmm_common(const std::string &function_name,
+                             oneapi::mkl::sparse::matrix_view A_view,
+                             InternalSparseMatHandleT internal_A_handle,
+                             oneapi::mkl::sparse::dense_matrix_handle_t B_handle,
+                             oneapi::mkl::sparse::dense_matrix_handle_t C_handle,
+                             bool is_alpha_host_accessible, bool is_beta_host_accessible) {
+    THROW_IF_NULLPTR(function_name, internal_A_handle);
+    THROW_IF_NULLPTR(function_name, B_handle);
+    THROW_IF_NULLPTR(function_name, C_handle);
+
+    check_all_containers_compatible(function_name, internal_A_handle, B_handle, C_handle);
+    if (internal_A_handle->all_use_buffer()) {
+        check_ptr_is_host_accessible("spmm", "alpha", is_alpha_host_accessible);
+        check_ptr_is_host_accessible("spmm", "beta", is_beta_host_accessible);
+    }
+    if (is_alpha_host_accessible != is_beta_host_accessible) {
+        throw mkl::invalid_argument(
+            "sparse_blas", function_name,
+            "Alpha and beta must both be placed on host memory or device memory.");
+    }
+    if (B_handle->dense_layout != C_handle->dense_layout) {
+        throw mkl::invalid_argument("sparse_blas", function_name,
+                                    "B and C matrices must used the same layout.");
+    }
+
+    if (A_view.type_view != oneapi::mkl::sparse::matrix_descr::general) {
+        throw mkl::invalid_argument("sparse_blas", function_name,
+                                    "Matrix view's type must be `matrix_descr::general`.");
+    }
+
+    if (A_view.diag_view != oneapi::mkl::diag::nonunit) {
+        throw mkl::invalid_argument("sparse_blas", function_name,
+                                    "Matrix's diag_view must be `nonunit`.");
+    }
+}
+
+template <typename InternalSparseMatHandleT>
+void check_valid_spmv_common(const std::string &function_name, oneapi::mkl::transpose /*opA*/,
+                             oneapi::mkl::sparse::matrix_view A_view,
+                             InternalSparseMatHandleT internal_A_handle,
+                             oneapi::mkl::sparse::dense_vector_handle_t x_handle,
+                             oneapi::mkl::sparse::dense_vector_handle_t y_handle,
+                             bool is_alpha_host_accessible, bool is_beta_host_accessible) {
+    THROW_IF_NULLPTR(function_name, internal_A_handle);
+    THROW_IF_NULLPTR(function_name, x_handle);
+    THROW_IF_NULLPTR(function_name, y_handle);
+
+    check_all_containers_compatible(function_name, internal_A_handle, x_handle, y_handle);
+    if (internal_A_handle->all_use_buffer()) {
+        check_ptr_is_host_accessible("spmv", "alpha", is_alpha_host_accessible);
+        check_ptr_is_host_accessible("spmv", "beta", is_beta_host_accessible);
+    }
+    if (is_alpha_host_accessible != is_beta_host_accessible) {
+        throw mkl::invalid_argument(
+            "sparse_blas", function_name,
+            "Alpha and beta must both be placed on host memory or device memory.");
+    }
+    if (A_view.type_view == oneapi::mkl::sparse::matrix_descr::diagonal) {
+        throw mkl::invalid_argument("sparse_blas", function_name,
+                                    "Matrix view's type cannot be diagonal.");
+    }
+
+    if (A_view.type_view != oneapi::mkl::sparse::matrix_descr::triangular &&
+        A_view.diag_view == oneapi::mkl::diag::unit) {
+        throw mkl::invalid_argument(
+            "sparse_blas", function_name,
+            "`unit` diag_view can only be used with a triangular type_view.");
+    }
+}
+
+template <typename InternalSparseMatHandleT>
+void check_valid_spsv_common(const std::string &function_name,
+                             oneapi::mkl::sparse::matrix_view A_view,
+                             InternalSparseMatHandleT internal_A_handle,
+                             oneapi::mkl::sparse::dense_vector_handle_t x_handle,
+                             oneapi::mkl::sparse::dense_vector_handle_t y_handle,
+                             bool is_alpha_host_accessible) {
+    THROW_IF_NULLPTR(function_name, internal_A_handle);
+    THROW_IF_NULLPTR(function_name, x_handle);
+    THROW_IF_NULLPTR(function_name, y_handle);
+
+    check_all_containers_compatible(function_name, internal_A_handle, x_handle, y_handle);
+    if (A_view.type_view != matrix_descr::triangular) {
+        throw mkl::invalid_argument("sparse_blas", function_name,
+                                    "Matrix view's type must be `matrix_descr::triangular`.");
+    }
+
+    if (internal_A_handle->all_use_buffer()) {
+        check_ptr_is_host_accessible("spsv", "alpha", is_alpha_host_accessible);
+    }
+}
+
+} // namespace oneapi::mkl::sparse::detail
+
+#endif // _ONEMKL_SRC_SPARSE_BLAS_COMMON_OP_VERIFICATION_HPP_
\ No newline at end of file
diff --git a/src/sparse_blas/generic_container.hpp b/src/sparse_blas/generic_container.hpp
index 53bd50837..5fe2b1ab2 100644
--- a/src/sparse_blas/generic_container.hpp
+++ b/src/sparse_blas/generic_container.hpp
@@ -61,6 +61,10 @@ struct generic_container {
               buffer_ptr(std::make_shared<sycl::buffer<T, 1>>(buffer)),
               data_type(get_data_type<T>()) {}
 
+    bool use_buffer() const {
+        return static_cast<bool>(buffer_ptr);
+    }
+
     template <typename T>
     void set_usm_ptr(T* ptr) {
         usm_ptr = ptr;
@@ -108,7 +112,7 @@ struct generic_dense_handle {
               value_container(value_buffer) {}
 
     bool all_use_buffer() const {
-        return static_cast<bool>(value_container.buffer_ptr);
+        return value_container.use_buffer();
     }
 
     data_type get_value_type() const {
@@ -210,34 +214,47 @@ struct generic_sparse_handle {
     generic_container col_container;
     generic_container value_container;
 
+    std::int64_t num_rows;
+    std::int64_t num_cols;
+    std::int64_t nnz;
+    oneapi::mkl::index_base index;
     std::int32_t properties_mask;
     bool can_be_reset;
 
     template <typename fpType, typename intType>
     generic_sparse_handle(BackendHandleT backend_handle, intType* row_ptr, intType* col_ptr,
-                          fpType* value_ptr)
+                          fpType* value_ptr, std::int64_t num_rows, std::int64_t num_cols,
+                          std::int64_t nnz, oneapi::mkl::index_base index)
             : backend_handle(backend_handle),
               row_container(generic_container(row_ptr)),
               col_container(generic_container(col_ptr)),
               value_container(generic_container(value_ptr)),
+              num_rows(num_rows),
+              num_cols(num_cols),
+              nnz(nnz),
+              index(index),
               properties_mask(0),
               can_be_reset(true) {}
 
     template <typename fpType, typename intType>
     generic_sparse_handle(BackendHandleT backend_handle, const sycl::buffer<intType, 1> row_buffer,
                           const sycl::buffer<intType, 1> col_buffer,
-                          const sycl::buffer<fpType, 1> value_buffer)
+                          const sycl::buffer<fpType, 1> value_buffer, std::int64_t num_rows,
+                          std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index)
             : backend_handle(backend_handle),
               row_container(row_buffer),
               col_container(col_buffer),
               value_container(value_buffer),
+              num_rows(num_rows),
+              num_cols(num_cols),
+              nnz(nnz),
+              index(index),
               properties_mask(0),
               can_be_reset(true) {}
 
     bool all_use_buffer() const {
-        return static_cast<bool>(value_container.buffer_ptr) &&
-               static_cast<bool>(row_container.buffer_ptr) &&
-               static_cast<bool>(col_container.buffer_ptr);
+        return value_container.use_buffer() && row_container.use_buffer() &&
+               col_container.use_buffer();
     }
 
     data_type get_value_type() const {
@@ -321,12 +338,38 @@ void check_all_containers_compatible(const std::string& function_name,
     }
 }
 
-template <typename T, typename DependenciesT>
-sycl::event submit_release(sycl::queue& queue, T* ptr, const DependenciesT& dependencies) {
-    return queue.submit([&](sycl::handler& cgh) {
-        cgh.depends_on(dependencies);
-        cgh.host_task([=]() { delete ptr; });
-    });
+template <typename fpType, typename InternalHandleT>
+void check_can_reset_value_handle(const std::string& function_name,
+                                  InternalHandleT* internal_handle, bool expect_buffer) {
+    if (internal_handle->get_value_type() != detail::get_data_type<fpType>()) {
+        throw oneapi::mkl::invalid_argument(
+            "sparse_blas", function_name,
+            "Incompatible data types expected " +
+                data_type_to_str(internal_handle->get_value_type()) + " but got " +
+                data_type_to_str(detail::get_data_type<fpType>()));
+    }
+    if (internal_handle->all_use_buffer() != expect_buffer) {
+        throw oneapi::mkl::invalid_argument(
+            "sparse_blas", function_name, "Cannot change the container type between buffer or USM");
+    }
+}
+
+template <typename fpType, typename intType, typename InternalHandleT>
+void check_can_reset_sparse_handle(const std::string& function_name,
+                                   InternalHandleT* internal_smhandle, bool expect_buffer) {
+    check_can_reset_value_handle<fpType>(function_name, internal_smhandle, expect_buffer);
+    if (internal_smhandle->get_int_type() != detail::get_data_type<intType>()) {
+        throw oneapi::mkl::invalid_argument(
+            "sparse_blas", function_name,
+            "Incompatible data types expected " +
+                data_type_to_str(internal_smhandle->get_int_type()) + " but got " +
+                data_type_to_str(detail::get_data_type<intType>()));
+    }
+    if (!internal_smhandle->can_be_reset) {
+        throw mkl::unimplemented(
+            "sparse_blas", function_name,
+            "The backend does not support reseting the matrix handle's data after it was used in a computation.");
+    }
 }
 
 } // namespace oneapi::mkl::sparse::detail
diff --git a/src/sparse_blas/macros.hpp b/src/sparse_blas/macros.hpp
index 7eba01390..9eb769736 100644
--- a/src/sparse_blas/macros.hpp
+++ b/src/sparse_blas/macros.hpp
@@ -36,10 +36,91 @@
     FOR_EACH_FP_AND_INT_TYPE_HELPER(DEFINE_MACRO, std::int32_t, _i32); \
     FOR_EACH_FP_AND_INT_TYPE_HELPER(DEFINE_MACRO, std::int64_t, _i64)
 
+#define INSTANTIATE_DENSE_VECTOR_FUNCS(FP_TYPE, FP_SUFFIX)                            \
+    template void init_dense_vector<FP_TYPE>(                                         \
+        sycl::queue & queue, oneapi::mkl::sparse::dense_vector_handle_t * p_dvhandle, \
+        std::int64_t size, sycl::buffer<FP_TYPE, 1> val);                             \
+    template void init_dense_vector<FP_TYPE>(                                         \
+        sycl::queue & queue, oneapi::mkl::sparse::dense_vector_handle_t * p_dvhandle, \
+        std::int64_t size, FP_TYPE * val);                                            \
+    template void set_dense_vector_data<FP_TYPE>(                                     \
+        sycl::queue & queue, oneapi::mkl::sparse::dense_vector_handle_t dvhandle,     \
+        std::int64_t size, sycl::buffer<FP_TYPE, 1> val);                             \
+    template void set_dense_vector_data<FP_TYPE>(                                     \
+        sycl::queue & queue, oneapi::mkl::sparse::dense_vector_handle_t dvhandle,     \
+        std::int64_t size, FP_TYPE * val)
+
+#define INSTANTIATE_DENSE_MATRIX_FUNCS(FP_TYPE, FP_SUFFIX)                            \
+    template void init_dense_matrix<FP_TYPE>(                                         \
+        sycl::queue & queue, oneapi::mkl::sparse::dense_matrix_handle_t * p_dmhandle, \
+        std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld,                \
+        oneapi::mkl::layout dense_layout, sycl::buffer<FP_TYPE, 1> val);              \
+    template void init_dense_matrix<FP_TYPE>(                                         \
+        sycl::queue & queue, oneapi::mkl::sparse::dense_matrix_handle_t * p_dmhandle, \
+        std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld,                \
+        oneapi::mkl::layout dense_layout, FP_TYPE * val);                             \
+    template void set_dense_matrix_data<FP_TYPE>(                                     \
+        sycl::queue & queue, oneapi::mkl::sparse::dense_matrix_handle_t dmhandle,     \
+        std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld,                \
+        oneapi::mkl::layout dense_layout, sycl::buffer<FP_TYPE, 1> val);              \
+    template void set_dense_matrix_data<FP_TYPE>(                                     \
+        sycl::queue & queue, oneapi::mkl::sparse::dense_matrix_handle_t dmhandle,     \
+        std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld,                \
+        oneapi::mkl::layout dense_layout, FP_TYPE * val)
+
+#define INSTANTIATE_COO_MATRIX_FUNCS(FP_TYPE, FP_SUFFIX, INT_TYPE, INT_SUFFIX)                     \
+    template void init_coo_matrix<FP_TYPE, INT_TYPE>(                                              \
+        sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t * p_smhandle,                    \
+        std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz,                            \
+        oneapi::mkl::index_base index, sycl::buffer<INT_TYPE, 1> row_ind,                          \
+        sycl::buffer<INT_TYPE, 1> col_ind, sycl::buffer<FP_TYPE, 1> val);                          \
+    template void init_coo_matrix<FP_TYPE, INT_TYPE>(                                              \
+        sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t * p_smhandle,                    \
+        std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz,                            \
+        oneapi::mkl::index_base index, INT_TYPE * row_ind, INT_TYPE * col_ind, FP_TYPE * val);     \
+    template void set_coo_matrix_data<FP_TYPE, INT_TYPE>(                                          \
+        sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, \
+        std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index,                    \
+        sycl::buffer<INT_TYPE, 1> row_ind, sycl::buffer<INT_TYPE, 1> col_ind,                      \
+        sycl::buffer<FP_TYPE, 1> val);                                                             \
+    template void set_coo_matrix_data<FP_TYPE, INT_TYPE>(                                          \
+        sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, \
+        std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index,                    \
+        INT_TYPE * row_ind, INT_TYPE * col_ind, FP_TYPE * val)
+
+#define INSTANTIATE_CSR_MATRIX_FUNCS(FP_TYPE, FP_SUFFIX, INT_TYPE, INT_SUFFIX)                     \
+    template void init_csr_matrix<FP_TYPE, INT_TYPE>(                                              \
+        sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t * p_smhandle,                    \
+        std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz,                            \
+        oneapi::mkl::index_base index, sycl::buffer<INT_TYPE, 1> row_ptr,                          \
+        sycl::buffer<INT_TYPE, 1> col_ind, sycl::buffer<FP_TYPE, 1> val);                          \
+    template void init_csr_matrix<FP_TYPE, INT_TYPE>(                                              \
+        sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t * p_smhandle,                    \
+        std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz,                            \
+        oneapi::mkl::index_base index, INT_TYPE * row_ptr, INT_TYPE * col_ind, FP_TYPE * val);     \
+    template void set_csr_matrix_data<FP_TYPE, INT_TYPE>(                                          \
+        sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, \
+        std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index,                    \
+        sycl::buffer<INT_TYPE, 1> row_ptr, sycl::buffer<INT_TYPE, 1> col_ind,                      \
+        sycl::buffer<FP_TYPE, 1> val);                                                             \
+    template void set_csr_matrix_data<FP_TYPE, INT_TYPE>(                                          \
+        sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, \
+        std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index,                    \
+        INT_TYPE * row_ptr, INT_TYPE * col_ind, FP_TYPE * val)
+
 #define THROW_IF_NULLPTR(FUNC_NAME, PTR)                                       \
     if (!(PTR)) {                                                              \
         throw mkl::uninitialized("sparse_blas", FUNC_NAME,                     \
                                  std::string(#PTR) + " must not be nullptr."); \
     }
 
+#define CHECK_DESCR_MATCH(descr, argument, optimize_func_name)                                    \
+    do {                                                                                          \
+        if (descr->last_optimized_##argument != argument) {                                       \
+            throw mkl::invalid_argument(                                                          \
+                "sparse_blas", __func__,                                                          \
+                #argument " argument must match with the previous call to " #optimize_func_name); \
+        }                                                                                         \
+    } while (0)
+
 #endif // _ONEMKL_SPARSE_BLAS_MACROS_HPP_
diff --git a/src/sparse_blas/sycl_helper.hpp b/src/sparse_blas/sycl_helper.hpp
new file mode 100644
index 000000000..67580159c
--- /dev/null
+++ b/src/sparse_blas/sycl_helper.hpp
@@ -0,0 +1,80 @@
+/***************************************************************************
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+*
+**************************************************************************/
+
+#ifndef _ONEMKL_SRC_SPARSE_BLAS_SYCL_HELPER_HPP_
+#define _ONEMKL_SRC_SPARSE_BLAS_SYCL_HELPER_HPP_
+
+#if __has_include(<sycl/sycl.hpp>)
+#include <sycl/sycl.hpp>
+#else
+#include <CL/sycl.hpp>
+#endif
+
+namespace oneapi::mkl::sparse::detail {
+
+/// Return whether a pointer is accessible on the host
+template <typename T>
+inline bool is_ptr_accessible_on_host(sycl::queue queue, const T *host_or_device_ptr) {
+    auto alloc_type = sycl::get_pointer_type(host_or_device_ptr, queue.get_context());
+    return alloc_type == sycl::usm::alloc::host || alloc_type == sycl::usm::alloc::shared ||
+           alloc_type == sycl::usm::alloc::unknown;
+}
+
+/// Return a scalar on the host from a pointer to host or device memory
+template <typename T>
+inline T get_scalar_on_host(sycl::queue &queue, const T *host_or_device_ptr,
+                            bool is_ptr_accessible_on_host) {
+    if (is_ptr_accessible_on_host) {
+        return *host_or_device_ptr;
+    }
+    T scalar;
+    auto event = queue.copy(host_or_device_ptr, &scalar, 1);
+    event.wait_and_throw();
+    return scalar;
+}
+
+/// Submit the release of \p ptr in a host_task waiting on the dependencies
+template <typename T>
+sycl::event submit_release(sycl::queue &queue, T *ptr,
+                           const std::vector<sycl::event> &dependencies) {
+    return queue.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(dependencies);
+        cgh.host_task([=]() { delete ptr; });
+    });
+}
+
+/// Merge multiple event dependencies into one
+inline sycl::event collapse_dependencies(sycl::queue &queue,
+                                         const std::vector<sycl::event> &dependencies) {
+    if (dependencies.empty()) {
+        return {};
+    }
+    else if (dependencies.size() == 1) {
+        return dependencies[0];
+    }
+
+    return queue.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(dependencies);
+        cgh.host_task([=]() {});
+    });
+}
+
+} // namespace oneapi::mkl::sparse::detail
+
+#endif // _ONEMKL_SRC_SPARSE_BLAS_SYCL_HELPER_HPP_
diff --git a/tests/unit_tests/CMakeLists.txt b/tests/unit_tests/CMakeLists.txt
index e7fe8e110..5fc56d04a 100644
--- a/tests/unit_tests/CMakeLists.txt
+++ b/tests/unit_tests/CMakeLists.txt
@@ -178,6 +178,11 @@ foreach(domain ${TARGET_DOMAINS})
     list(APPEND ONEMKL_LIBRARIES_${domain} onemkl_dft_portfft)
   endif()
 
+  if(domain STREQUAL "sparse_blas" AND ENABLE_CUSPARSE_BACKEND)
+    add_dependencies(test_main_${domain}_ct onemkl_${domain}_cusparse)
+    list(APPEND ONEMKL_LIBRARIES_${domain} onemkl_${domain}_cusparse)
+  endif()
+
   target_link_libraries(test_main_${domain}_ct PUBLIC
       gtest
       gtest_main
diff --git a/tests/unit_tests/include/test_helper.hpp b/tests/unit_tests/include/test_helper.hpp
index ad215761f..5457079e0 100644
--- a/tests/unit_tests/include/test_helper.hpp
+++ b/tests/unit_tests/include/test_helper.hpp
@@ -176,6 +176,13 @@
 #define TEST_RUN_PORTFFT_SELECT(q, func, ...)
 #endif
 
+#ifdef ENABLE_CUSPARSE_BACKEND
+#define TEST_RUN_NVIDIAGPU_CUSPARSE_SELECT(q, func, ...) \
+    func(oneapi::mkl::backend_selector<oneapi::mkl::backend::cusparse>{ q }, __VA_ARGS__)
+#else
+#define TEST_RUN_NVIDIAGPU_CUSPARSE_SELECT(q, func, ...)
+#endif
+
 #ifndef __HIPSYCL__
 #define CHECK_HOST_OR_CPU(q) q.get_device().is_cpu()
 #else
@@ -268,6 +275,9 @@
             if (vendor_id == INTEL_ID) {                                   \
                 TEST_RUN_INTELGPU_SELECT(q, func, __VA_ARGS__);            \
             }                                                              \
+            else if (vendor_id == NVIDIA_ID) {                             \
+                TEST_RUN_NVIDIAGPU_CUSPARSE_SELECT(q, func, __VA_ARGS__);  \
+            }                                                              \
         }                                                                  \
     } while (0);
 
diff --git a/tests/unit_tests/main_test.cpp b/tests/unit_tests/main_test.cpp
index bac3f8c83..fc208da09 100644
--- a/tests/unit_tests/main_test.cpp
+++ b/tests/unit_tests/main_test.cpp
@@ -122,7 +122,8 @@ int main(int argc, char** argv) {
 #endif
 #if !defined(ENABLE_CUBLAS_BACKEND) && !defined(ENABLE_CURAND_BACKEND) &&                \
     !defined(ENABLE_CUSOLVER_BACKEND) && !defined(ENABLE_PORTBLAS_BACKEND_NVIDIA_GPU) && \
-    !defined(ENABLE_CUFFT_BACKEND) && !defined(ENABLE_PORTFFT_BACKEND)
+    !defined(ENABLE_CUFFT_BACKEND) && !defined(ENABLE_PORTFFT_BACKEND) &&                \
+    !defined(ENABLE_CUSPARSE_BACKEND)
                         if (dev.is_gpu() && vendor_id == NVIDIA_ID)
                             continue;
 #endif
diff --git a/tests/unit_tests/sparse_blas/include/test_common.hpp b/tests/unit_tests/sparse_blas/include/test_common.hpp
index c11255a9a..a02f91789 100644
--- a/tests/unit_tests/sparse_blas/include/test_common.hpp
+++ b/tests/unit_tests/sparse_blas/include/test_common.hpp
@@ -332,13 +332,18 @@ intType generate_random_matrix(sparse_matrix_format_t format, const intType nrow
     throw std::runtime_error("Unsupported sparse format");
 }
 
+inline bool require_coo_sorted_by_row(sycl::queue queue) {
+    auto vendor_id = oneapi::mkl::get_device_id(queue);
+    return vendor_id == oneapi::mkl::device::nvidiagpu;
+}
+
 /// Shuffle the 3arrays CSR or COO representation (ia, ja, values)
 /// of any sparse matrix.
 /// In CSR format, the elements within a row are shuffled without changing ia.
 /// In COO format, all the elements are shuffled.
 template <typename fpType, typename intType>
-void shuffle_sparse_matrix(sparse_matrix_format_t format, intType indexing, intType *ia,
-                           intType *ja, fpType *a, intType nnz, std::size_t nrows) {
+void shuffle_sparse_matrix(sycl::queue queue, sparse_matrix_format_t format, intType indexing,
+                           intType *ia, intType *ja, fpType *a, intType nnz, std::size_t nrows) {
     if (format == sparse_matrix_format_t::CSR) {
         for (std::size_t i = 0; i < nrows; ++i) {
             intType nnz_row = ia[i + 1] - ia[i];
@@ -351,12 +356,33 @@ void shuffle_sparse_matrix(sparse_matrix_format_t format, intType indexing, intT
         }
     }
     else if (format == sparse_matrix_format_t::COO) {
-        for (std::size_t i = 0; i < static_cast<std::size_t>(nnz); ++i) {
-            intType q = std::rand() % nnz;
-            // Swap elements i and q
-            std::swap(ia[q], ia[i]);
-            std::swap(ja[q], ja[i]);
-            std::swap(a[q], a[i]);
+        if (require_coo_sorted_by_row(queue)) {
+            std::size_t linear_idx = 0;
+            for (std::size_t i = 0; i < nrows; ++i) {
+                // Count the number of non-zero elements for the given row
+                std::size_t nnz_row = 1;
+                while (linear_idx + nnz_row < static_cast<std::size_t>(nnz) &&
+                       ia[linear_idx] == ia[linear_idx + nnz_row]) {
+                    ++nnz_row;
+                }
+                for (std::size_t j = 0; j < nnz_row; ++j) {
+                    // Swap elements within the same row
+                    std::size_t q = linear_idx + (static_cast<std::size_t>(std::rand()) % nnz_row);
+                    // Swap elements j and q
+                    std::swap(ja[q], ja[linear_idx + j]);
+                    std::swap(a[q], a[linear_idx + j]);
+                }
+                linear_idx += nnz_row;
+            }
+        }
+        else {
+            for (std::size_t i = 0; i < static_cast<std::size_t>(nnz); ++i) {
+                intType q = std::rand() % nnz;
+                // Swap elements i and q
+                std::swap(ia[q], ia[i]);
+                std::swap(ja[q], ja[i]);
+                std::swap(a[q], a[i]);
+            }
         }
     }
     else {
diff --git a/tests/unit_tests/sparse_blas/source/sparse_spmm_buffer.cpp b/tests/unit_tests/sparse_blas/source/sparse_spmm_buffer.cpp
index b6f9e1185..df6fb850b 100644
--- a/tests/unit_tests/sparse_blas/source/sparse_spmm_buffer.cpp
+++ b/tests/unit_tests/sparse_blas/source/sparse_spmm_buffer.cpp
@@ -74,8 +74,8 @@ int test_spmm(sycl::device *dev, sparse_matrix_format_t format, intType nrows_A,
 
     // Shuffle ordering of column indices/values to test sortedness
     if (!is_sorted) {
-        shuffle_sparse_matrix(format, indexing, ia_host.data(), ja_host.data(), a_host.data(), nnz,
-                              static_cast<std::size_t>(nrows_A));
+        shuffle_sparse_matrix(main_queue, format, indexing, ia_host.data(), ja_host.data(),
+                              a_host.data(), nnz, static_cast<std::size_t>(nrows_A));
     }
 
     auto ia_buf = make_buffer(ia_host);
@@ -120,7 +120,7 @@ int test_spmm(sycl::device *dev, sparse_matrix_format_t format, intType nrows_A,
                 format, nrows_A, ncols_A, density_A_matrix, indexing, ia_host, ja_host, a_host,
                 is_symmetric);
             if (!is_sorted) {
-                shuffle_sparse_matrix(format, indexing, ia_host.data(), ja_host.data(),
+                shuffle_sparse_matrix(main_queue, format, indexing, ia_host.data(), ja_host.data(),
                                       a_host.data(), reset_nnz, static_cast<std::size_t>(nrows_A));
             }
             if (reset_nnz > nnz) {
diff --git a/tests/unit_tests/sparse_blas/source/sparse_spmm_usm.cpp b/tests/unit_tests/sparse_blas/source/sparse_spmm_usm.cpp
index 5778430a6..7d30426c4 100644
--- a/tests/unit_tests/sparse_blas/source/sparse_spmm_usm.cpp
+++ b/tests/unit_tests/sparse_blas/source/sparse_spmm_usm.cpp
@@ -70,8 +70,8 @@ int test_spmm(sycl::device *dev, sparse_matrix_format_t format, intType nrows_A,
 
     // Shuffle ordering of column indices/values to test sortedness
     if (!is_sorted) {
-        shuffle_sparse_matrix(format, indexing, ia_host.data(), ja_host.data(), a_host.data(), nnz,
-                              static_cast<std::size_t>(nrows_A));
+        shuffle_sparse_matrix(main_queue, format, indexing, ia_host.data(), ja_host.data(),
+                              a_host.data(), nnz, static_cast<std::size_t>(nrows_A));
     }
 
     auto ia_usm_uptr = malloc_device_uptr<intType>(main_queue, ia_host.size());
@@ -153,7 +153,7 @@ int test_spmm(sycl::device *dev, sparse_matrix_format_t format, intType nrows_A,
                 format, nrows_A, ncols_A, density_A_matrix, indexing, ia_host, ja_host, a_host,
                 is_symmetric);
             if (!is_sorted) {
-                shuffle_sparse_matrix(format, indexing, ia_host.data(), ja_host.data(),
+                shuffle_sparse_matrix(main_queue, format, indexing, ia_host.data(), ja_host.data(),
                                       a_host.data(), reset_nnz, static_cast<std::size_t>(nrows_A));
             }
             if (reset_nnz > nnz) {
diff --git a/tests/unit_tests/sparse_blas/source/sparse_spmv_buffer.cpp b/tests/unit_tests/sparse_blas/source/sparse_spmv_buffer.cpp
index 3d99f9e94..e03c09ebe 100644
--- a/tests/unit_tests/sparse_blas/source/sparse_spmv_buffer.cpp
+++ b/tests/unit_tests/sparse_blas/source/sparse_spmv_buffer.cpp
@@ -67,8 +67,8 @@ int test_spmv(sycl::device *dev, sparse_matrix_format_t format, intType nrows_A,
 
     // Shuffle ordering of column indices/values to test sortedness
     if (!is_sorted) {
-        shuffle_sparse_matrix(format, indexing, ia_host.data(), ja_host.data(), a_host.data(), nnz,
-                              static_cast<std::size_t>(nrows_A));
+        shuffle_sparse_matrix(main_queue, format, indexing, ia_host.data(), ja_host.data(),
+                              a_host.data(), nnz, static_cast<std::size_t>(nrows_A));
     }
 
     auto ia_buf = make_buffer(ia_host);
@@ -110,7 +110,7 @@ int test_spmv(sycl::device *dev, sparse_matrix_format_t format, intType nrows_A,
                 format, nrows_A, ncols_A, density_A_matrix, indexing, ia_host, ja_host, a_host,
                 is_symmetric);
             if (!is_sorted) {
-                shuffle_sparse_matrix(format, indexing, ia_host.data(), ja_host.data(),
+                shuffle_sparse_matrix(main_queue, format, indexing, ia_host.data(), ja_host.data(),
                                       a_host.data(), reset_nnz, static_cast<std::size_t>(nrows_A));
             }
             if (reset_nnz > nnz) {
diff --git a/tests/unit_tests/sparse_blas/source/sparse_spmv_usm.cpp b/tests/unit_tests/sparse_blas/source/sparse_spmv_usm.cpp
index ded92a770..eb54f6a5d 100644
--- a/tests/unit_tests/sparse_blas/source/sparse_spmv_usm.cpp
+++ b/tests/unit_tests/sparse_blas/source/sparse_spmv_usm.cpp
@@ -63,8 +63,8 @@ int test_spmv(sycl::device *dev, sparse_matrix_format_t format, intType nrows_A,
 
     // Shuffle ordering of column indices/values to test sortedness
     if (!is_sorted) {
-        shuffle_sparse_matrix(format, indexing, ia_host.data(), ja_host.data(), a_host.data(), nnz,
-                              static_cast<std::size_t>(nrows_A));
+        shuffle_sparse_matrix(main_queue, format, indexing, ia_host.data(), ja_host.data(),
+                              a_host.data(), nnz, static_cast<std::size_t>(nrows_A));
     }
 
     auto ia_usm_uptr = malloc_device_uptr<intType>(main_queue, ia_host.size());
@@ -145,7 +145,7 @@ int test_spmv(sycl::device *dev, sparse_matrix_format_t format, intType nrows_A,
                 format, nrows_A, ncols_A, density_A_matrix, indexing, ia_host, ja_host, a_host,
                 is_symmetric);
             if (!is_sorted) {
-                shuffle_sparse_matrix(format, indexing, ia_host.data(), ja_host.data(),
+                shuffle_sparse_matrix(main_queue, format, indexing, ia_host.data(), ja_host.data(),
                                       a_host.data(), reset_nnz, static_cast<std::size_t>(nrows_A));
             }
             if (reset_nnz > nnz) {
diff --git a/tests/unit_tests/sparse_blas/source/sparse_spsv_buffer.cpp b/tests/unit_tests/sparse_blas/source/sparse_spsv_buffer.cpp
index 6b276dff4..b64219b9a 100644
--- a/tests/unit_tests/sparse_blas/source/sparse_spsv_buffer.cpp
+++ b/tests/unit_tests/sparse_blas/source/sparse_spsv_buffer.cpp
@@ -70,8 +70,8 @@ int test_spsv(sycl::device *dev, sparse_matrix_format_t format, intType m, doubl
 
     // Shuffle ordering of column indices/values to test sortedness
     if (!is_sorted) {
-        shuffle_sparse_matrix(format, indexing, ia_host.data(), ja_host.data(), a_host.data(), nnz,
-                              mu);
+        shuffle_sparse_matrix(main_queue, format, indexing, ia_host.data(), ja_host.data(),
+                              a_host.data(), nnz, mu);
     }
 
     auto ia_buf = make_buffer(ia_host);
@@ -110,7 +110,7 @@ int test_spsv(sycl::device *dev, sparse_matrix_format_t format, intType m, doubl
                 format, m, m, density_A_matrix, indexing, ia_host, ja_host, a_host, is_symmetric,
                 require_diagonal);
             if (!is_sorted) {
-                shuffle_sparse_matrix(format, indexing, ia_host.data(), ja_host.data(),
+                shuffle_sparse_matrix(main_queue, format, indexing, ia_host.data(), ja_host.data(),
                                       a_host.data(), reset_nnz, mu);
             }
             if (reset_nnz > nnz) {
@@ -170,8 +170,11 @@ int test_spsv(sycl::device *dev, sparse_matrix_format_t format, intType m, doubl
                                 y_ref_host.data());
 
     // Compare the results of reference implementation and DPC++ implementation.
+    // Increase default relative error margin for tests that lead to large numeric values.
+    double abs_error_factor = 10;
+    double rel_error_factor = 1E5;
     auto y_acc = y_buf.get_host_access(sycl::read_only);
-    bool valid = check_equal_vector(y_acc, y_ref_host);
+    bool valid = check_equal_vector(y_acc, y_ref_host, abs_error_factor, rel_error_factor);
 
     return static_cast<int>(valid);
 }
diff --git a/tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp b/tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp
index 3b58db914..be427d011 100644
--- a/tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp
+++ b/tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp
@@ -66,8 +66,8 @@ int test_spsv(sycl::device *dev, sparse_matrix_format_t format, intType m, doubl
 
     // Shuffle ordering of column indices/values to test sortedness
     if (!is_sorted) {
-        shuffle_sparse_matrix(format, indexing, ia_host.data(), ja_host.data(), a_host.data(), nnz,
-                              mu);
+        shuffle_sparse_matrix(main_queue, format, indexing, ia_host.data(), ja_host.data(),
+                              a_host.data(), nnz, mu);
     }
 
     auto ia_usm_uptr = malloc_device_uptr<intType>(main_queue, ia_host.size());
@@ -141,7 +141,7 @@ int test_spsv(sycl::device *dev, sparse_matrix_format_t format, intType m, doubl
                 format, m, m, density_A_matrix, indexing, ia_host, ja_host, a_host, is_symmetric,
                 require_diagonal);
             if (!is_sorted) {
-                shuffle_sparse_matrix(format, indexing, ia_host.data(), ja_host.data(),
+                shuffle_sparse_matrix(main_queue, format, indexing, ia_host.data(), ja_host.data(),
                                       a_host.data(), reset_nnz, mu);
             }
             if (reset_nnz > nnz) {
@@ -218,8 +218,11 @@ int test_spsv(sycl::device *dev, sparse_matrix_format_t format, intType m, doubl
                                 y_ref_host.data());
 
     // Compare the results of reference implementation and DPC++ implementation.
+    // Increase default relative error margin for tests that lead to large numeric values.
+    double abs_error_factor = 10;
+    double rel_error_factor = 1E5;
     ev_copy.wait_and_throw();
-    bool valid = check_equal_vector(y_host, y_ref_host);
+    bool valid = check_equal_vector(y_host, y_ref_host, abs_error_factor, rel_error_factor);
 
     return static_cast<int>(valid);
 }