From 47fd3518f30b9f41bb7f039f61d7d7ac4ccb8dad Mon Sep 17 00:00:00 2001 From: Ouadie EL FAROUKI Date: Mon, 9 Oct 2023 14:00:34 +0100 Subject: [PATCH 01/10] Initial formatting of dot operator & subsequent sdsdot update --- include/interface/blas1_interface.h | 49 ++++++++----- src/interface/blas1/backend/amd_gpu.hpp | 23 ++++++ src/interface/blas1/backend/default_cpu.hpp | 16 +++++ src/interface/blas1/backend/intel_gpu.hpp | 17 +++++ src/interface/blas1/backend/nvidia_gpu.hpp | 26 +++++++ src/interface/blas1_interface.hpp | 78 ++++++++++++++++----- test/unittest/CMakeLists.txt | 2 +- test/unittest/blas1/blas1_dot_test.cpp | 24 ++++--- test/unittest/blas1/blas1_sdsdot_test.cpp | 8 +-- 9 files changed, 194 insertions(+), 49 deletions(-) diff --git a/include/interface/blas1_interface.h b/include/interface/blas1_interface.h index eef09e7b8..b632100f1 100644 --- a/include/interface/blas1_interface.h +++ b/include/interface/blas1_interface.h @@ -225,6 +225,19 @@ typename sb_handle_t::event_t _nrm2_impl( container_1_t _rs, const index_t number_WG, const typename sb_handle_t::event_t &_dependencies); +/*! + * \brief Prototype for the internal implementation of the Dot operator. See + * documentation in the blas1_interface.hpp file for details. + */ +template +typename sb_handle_t::event_t _dot_impl( + sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx, + container_1_t _vy, increment_t _incy, container_2_t _rs, + const index_t _number_wg, + const typename sb_handle_t::event_t &_dependencies); + /** * @brief _rot constructor given plane rotation * @param sb_handle SB_Handle @@ -306,12 +319,12 @@ typename sb_handle_t::event_t _rotm( * @tparam container_3_t Buffer Iterator or USM pointer * @tparam container_4_t Buffer Iterator or USM pointer * @param sb_handle SB_Handle - * @param _d1[in,out] On entry, memory object holding the scaling factor for the - * x-coordinate. On exit, the re-scaled _d1. - * @param _d2[in,out] On entry, memory object holding the scaling factor for the - * y-coordinate. On exit, the re-scaled _d2. - * @param _x1[in,out] On entry, memory object holding the x-coordinate. On exit, - * the re-scaled _x1 + * @param _d1[in,out] On entry, memory object holding the scaling factor for + * the x-coordinate. On exit, the re-scaled _d1. + * @param _d2[in,out] On entry, memory object holding the scaling factor for + * the y-coordinate. On exit, the re-scaled _d2. + * @param _x1[in,out] On entry, memory object holding the x-coordinate. On + * exit, the re-scaled _x1 * @param _y1[in] Memory object holding the y-coordinate of the point. * @param _param[out] Buffer with the following layout: [flag, h11, h21, h12, * h22]. @@ -359,8 +372,10 @@ typename sb_handle_t::event_t _rotg( * @tparam sb_handle_t SB_Handle type * @tparam scalar_t Scalar type * @param sb_handle SB_Handle - * @param a[in, out] On entry, x-coordinate of the point. On exit, the scalar z. - * @param b[in, out] On entry, y-coordinate of the point. On exit, the scalar r. + * @param a[in, out] On entry, x-coordinate of the point. On exit, the scalar + * z. + * @param b[in, out] On entry, y-coordinate of the point. On exit, the scalar + * r. * @param c[out] scalar representing the output c. * @param s[out] scalar representing the output s. * @param _dependencies Vector of events @@ -754,12 +769,12 @@ typename sb_handle_t::event_t _rotm( * @tparam container_3_t Buffer Iterator or USM pointer * @tparam container_4_t Buffer Iterator or USM pointer * @param sb_handle SB_Handle - * @param _d1[in,out] On entry, memory object holding the scaling factor for the - * x-coordinate. On exit, the re-scaled _d1. - * @param _d2[in,out] On entry, memory object holding the scaling factor for the - * y-coordinate. On exit, the re-scaled _d2. - * @param _x1[in,out] On entry, memory object holding the x-coordinate. On exit, - * the re-scaled _x1 + * @param _d1[in,out] On entry, memory object holding the scaling factor for + * the x-coordinate. On exit, the re-scaled _d1. + * @param _d2[in,out] On entry, memory object holding the scaling factor for + * the y-coordinate. On exit, the re-scaled _d2. + * @param _x1[in,out] On entry, memory object holding the x-coordinate. On + * exit, the re-scaled _x1 * @param _y1[in] Memory object holding the y-coordinate of the point. * @param _param[out] Buffer with the following layout: [flag, h11, h21, h12, * h22]. @@ -811,8 +826,10 @@ typename sb_handle_t::event_t _rotg( * @tparam sb_handle_t SB_Handle type * @tparam scalar_t Scalar type * @param sb_handle SB_Handle - * @param a[in, out] On entry, x-coordinate of the point. On exit, the scalar z. - * @param b[in, out] On entry, y-coordinate of the point. On exit, the scalar r. + * @param a[in, out] On entry, x-coordinate of the point. On exit, the scalar + * z. + * @param b[in, out] On entry, y-coordinate of the point. On exit, the scalar + * r. * @param c[out] scalar representing the output c. * @param s[out] scalar representing the output s. * @param _dependencies Vector of events diff --git a/src/interface/blas1/backend/amd_gpu.hpp b/src/interface/blas1/backend/amd_gpu.hpp index 071436eea..a4d1df1ea 100644 --- a/src/interface/blas1/backend/amd_gpu.hpp +++ b/src/interface/blas1/backend/amd_gpu.hpp @@ -70,6 +70,29 @@ typename sb_handle_t::event_t _nrm2( } } // namespace backend } // namespace nrm2 + +namespace dot { +namespace backend { +template +typename sb_handle_t::event_t _dot( + sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx, + container_1_t _vy, increment_t _incy, container_2_t _rs, + const typename sb_handle_t::event_t& _dependencies) { + if (_N < (1 << 18)) { + constexpr index_t localSize = 1024; + const index_t number_WG = (_N + localSize - 1) / localSize; + return blas::internal::_dot_impl(localSize), 32>( + sb_handle, _N, _vx, _incx, _vy, _incy, _rs, number_WG, _dependencies); + } else { + constexpr int localSize = 512; + constexpr index_t number_WG = 512; + return blas::internal::_dot_impl( + sb_handle, _N, _vx, _incx, _vy, _incy, _rs, number_WG, _dependencies); + } +} +} // namespace backend +} // namespace dot } // namespace blas #endif diff --git a/src/interface/blas1/backend/default_cpu.hpp b/src/interface/blas1/backend/default_cpu.hpp index 24b3a29a5..f64e9b5b1 100644 --- a/src/interface/blas1/backend/default_cpu.hpp +++ b/src/interface/blas1/backend/default_cpu.hpp @@ -56,6 +56,22 @@ typename sb_handle_t::event_t _nrm2( } } // namespace backend } // namespace nrm2 + +namespace dot { +namespace backend { +template +typename sb_handle_t::event_t _dot( + sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx, + container_1_t _vy, increment_t _incy, container_2_t _rs, + const typename sb_handle_t::event_t& _dependencies) { + constexpr int localSize = 8; + constexpr index_t number_WG = 16; + return blas::internal::_dot_impl( + sb_handle, _N, _vx, _incx, _vy, _incy, _rs, number_WG, _dependencies); +} +} // namespace backend +} // namespace dot } // namespace blas #endif diff --git a/src/interface/blas1/backend/intel_gpu.hpp b/src/interface/blas1/backend/intel_gpu.hpp index 1115f3e6e..5cb6c106c 100644 --- a/src/interface/blas1/backend/intel_gpu.hpp +++ b/src/interface/blas1/backend/intel_gpu.hpp @@ -59,6 +59,23 @@ typename sb_handle_t::event_t _nrm2( } // namespace backend } // namespace nrm2 +namespace dot { +namespace backend { +template +typename sb_handle_t::event_t _dot( + sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx, + container_1_t _vy, increment_t _incy, container_2_t _rs, + const typename sb_handle_t::event_t& _dependencies) { + constexpr index_t localSize = 128; + const index_t number_WG = + std::min((_N + localSize - 1) / localSize, static_cast(512)); + return blas::internal::_dot_impl(localSize), 32>( + sb_handle, _N, _vx, _incx, _vy, _incy, _rs, number_WG, _dependencies); +} +} // namespace backend +} // namespace dot + } // namespace blas #endif diff --git a/src/interface/blas1/backend/nvidia_gpu.hpp b/src/interface/blas1/backend/nvidia_gpu.hpp index 15ee336bb..5a7be241a 100644 --- a/src/interface/blas1/backend/nvidia_gpu.hpp +++ b/src/interface/blas1/backend/nvidia_gpu.hpp @@ -75,6 +75,32 @@ typename sb_handle_t::event_t _nrm2( } // namespace backend } // namespace nrm2 +namespace dot { +namespace backend { +template +typename sb_handle_t::event_t _dot( + sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx, + container_1_t _vy, increment_t _incy, container_2_t _rs, + const typename sb_handle_t::event_t& _dependencies) { + if (_N < (1 << 23)) { + constexpr index_t localSize = 512; + const index_t number_WG = (_N < (1 << 18)) + ? (_N + localSize - 1) / localSize + : static_cast(256); + + return blas::internal::_dot_impl(localSize), 32>( + sb_handle, _N, _vx, _incx, _vy, _incy, _rs, number_WG, _dependencies); + } else { + constexpr int localSize = 512; + constexpr index_t number_WG = 1024; + return blas::internal::_dot_impl( + sb_handle, _N, _vx, _incx, _vy, _incy, _rs, number_WG, _dependencies); + } +} +} // namespace backend +} // namespace dot + } // namespace blas #endif diff --git a/src/interface/blas1_interface.hpp b/src/interface/blas1_interface.hpp index 899b97011..61e77a942 100644 --- a/src/interface/blas1_interface.hpp +++ b/src/interface/blas1_interface.hpp @@ -120,22 +120,8 @@ typename sb_handle_t::event_t _dot( sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx, container_1_t _vy, increment_t _incy, container_2_t _rs, const typename sb_handle_t::event_t &_dependencies) { - auto vx = make_vector_view(_vx, _incx, _N); - auto vy = make_vector_view(_vy, _incy, _N); - auto rs = make_vector_view(_rs, static_cast(1), - static_cast(1)); - // TODO: (Tanvir) avoid over-writing the input. - // Once this is fixed, we should be able to add - // const support for dot and sdsdot operators. - auto prdOp = make_op(vx, vy); - - auto localSize = sb_handle.get_work_group_size(); - auto nWG = 2 * localSize; - - auto assignOp = - make_assign_reduction(rs, prdOp, localSize, localSize * nWG); - auto ret = sb_handle.execute(assignOp, _dependencies); - return ret; + return blas::dot::backend::_dot(sb_handle, _N, _vx, _incx, _vy, _incy, _rs, + _dependencies); } /** @@ -169,8 +155,8 @@ typename sb_handle_t::event_t _sdsdot( auto rs = make_vector_view(_rs, static_cast(1), static_cast(1)); - dot_event = - internal::_dot(sb_handle, _N, _vx, _incx, _vy, _incy, _rs, _dependencies); + dot_event = blas::dot::backend::_dot(sb_handle, _N, _vx, _incx, _vy, _incy, + _rs, _dependencies); auto addOp = make_op(sb, rs); auto assignOp2 = make_op(rs, addOp); auto ret2 = sb_handle.execute(assignOp2, dot_event); @@ -329,7 +315,7 @@ typename sb_handle_t::event_t _swap( } /** - * \brief SCALAR operation on a vector + * \brief SCALAR operation on a vector * @param sb_handle_t sb_handle * @param _vx BufferIterator or USM pointer * @param _incx Increment in X axis @@ -418,6 +404,60 @@ typename sb_handle_t::event_t _nrm2_impl( return blas::concatenate_vectors(ret0, ret1); } +/** + * @brief _dot_impl Internal implementation of the dot operator. + * + * This function contains the code that sets up and executes the kernels + * required to perform the dot operation (also used in sdsdot). + * + * This function is called by blas::dot::backend::_dot which, depending on + * the TUNING_TARGET and other RT parameters (size for instance), selects + * different template parameters / configuration to ensure the adequate kernel + * is called. + * + * @tparam localSize specifies the number of threads per work group used by + * the kernel + * @tparam localMemSize specifies the size of local shared memory to use, which + * is device and implementation dependent. If 0 the + * implementation use a kernel implementation which doesn't + * require local memory. + */ +template +typename sb_handle_t::event_t _dot_impl( + sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx, + container_1_t _vy, increment_t _incy, container_2_t _rs, + const index_t _number_wg, + const typename sb_handle_t::event_t &_dependencies) { + typename sb_handle_t::event_t ret_event; + // Skip if N==0, _rs is not overwritten + if (!_N) return ret_event; + + // TODO: (Tanvir) avoid over-writing the input. + // Once this is fixed, we should be able to add + // const support for dot and sdsdot operators. + auto vx = make_vector_view(_vx, _incx, _N); + auto vy = make_vector_view(_vy, _incy, _N); + auto rs = make_vector_view(_rs, static_cast(1), + static_cast(1)); + + auto prdOp = make_op(vx, vy); + auto assignOp = make_wg_atomic_reduction(rs, prdOp); + + if constexpr (localMemSize) { + ret_event = + sb_handle.execute(assignOp, static_cast(localSize), + static_cast(_number_wg * localSize), + static_cast(localMemSize), _dependencies); + } else { + ret_event = sb_handle.execute(assignOp, static_cast(localSize), + static_cast(_number_wg * localSize), + _dependencies); + } + return ret_event; +} + /** * . * @brief _rot constructor given plane rotation diff --git a/test/unittest/CMakeLists.txt b/test/unittest/CMakeLists.txt index b4d2b0a3b..54b386df8 100644 --- a/test/unittest/CMakeLists.txt +++ b/test/unittest/CMakeLists.txt @@ -40,6 +40,7 @@ set(SYCL_UNITTEST_SRCS ${PORTBLAS_UNITTEST}/blas1/blas1_rotg_test.cpp ${PORTBLAS_UNITTEST}/blas1/blas1_sdsdot_test.cpp ${PORTBLAS_UNITTEST}/blas1/blas1_nrm2_test.cpp + ${PORTBLAS_UNITTEST}/blas1/blas1_dot_test.cpp # # Blas 2 tests ${PORTBLAS_UNITTEST}/blas2/blas2_gbmv_test.cpp ${PORTBLAS_UNITTEST}/blas2/blas2_gemv_test.cpp @@ -85,7 +86,6 @@ if(is_computecpp) set(SYCL_UNITTEST_SRCS ${SYCL_UNITTEST_SRCS} # Blas 1 tests ${PORTBLAS_UNITTEST}/blas1/blas1_swap_test.cpp - ${PORTBLAS_UNITTEST}/blas1/blas1_dot_test.cpp ${PORTBLAS_UNITTEST}/blas1/blas1_iamax_test.cpp ${PORTBLAS_UNITTEST}/blas1/blas1_iamin_test.cpp # Blas 2 tests diff --git a/test/unittest/blas1/blas1_dot_test.cpp b/test/unittest/blas1/blas1_dot_test.cpp index c9401b68e..ba9406b56 100644 --- a/test/unittest/blas1/blas1_dot_test.cpp +++ b/test/unittest/blas1/blas1_dot_test.cpp @@ -26,7 +26,8 @@ #include "blas_test.hpp" template -using combination_t = std::tuple; +using combination_t = + std::tuple; template void run_test(const combination_t combi) { @@ -35,7 +36,8 @@ void run_test(const combination_t combi) { index_t size; index_t incX; index_t incY; - std::tie(alloc, api, size, incX, incY) = combi; + scalar_t unused; /* Necessary to work around dpcpp compiler bug */ + std::tie(alloc, api, size, incX, incY, unused) = combi; // Input vectors std::vector x_v(size * incX); @@ -44,7 +46,7 @@ void run_test(const combination_t combi) { fill_random(y_v); // Output - scalar_t out_s = 0; + scalar_t out_s = 0.0; // Reference implementation auto out_cpu_s = @@ -91,7 +93,8 @@ void run_test(const combination_t combi) { index_t size; index_t incX; index_t incY; - std::tie(alloc, api, size, incX, incY) = combi; + scalar_t unused; + std::tie(alloc, api, size, incX, incY, unused) = combi; if (alloc == "usm") { // usm alloc #ifdef SB_ENABLE_USM @@ -111,9 +114,10 @@ const auto combi = ::testing::Values(api_type::async, api_type::sync), // Api ::testing::Values(11, 65, 1002, - 1002400), // size - ::testing::Values(1, 4), // incX - ::testing::Values(1, 3) // incY + 1002400), // size + ::testing::Values(1, 4), // incX + ::testing::Values(1, 3), // incY + ::testing::Values(0) // unused ); #else template @@ -123,7 +127,8 @@ const auto combi = api_type::sync), // Api ::testing::Values(11, 1002), // size ::testing::Values(1, 4), // incX - ::testing::Values(1, 3) // incY + ::testing::Values(1, 3), // incY + ::testing::Values(0) // unused ); #endif @@ -133,7 +138,8 @@ static std::string generate_name( std::string alloc; api_type api; int size, incX, incY; - BLAS_GENERATE_NAME(info.param, alloc, api, size, incX, incY); + T unused; + BLAS_GENERATE_NAME(info.param, alloc, api, size, incX, incY, unused); } BLAS_REGISTER_TEST_ALL(Dot, combination_t, combi, generate_name); diff --git a/test/unittest/blas1/blas1_sdsdot_test.cpp b/test/unittest/blas1/blas1_sdsdot_test.cpp index b4b4b6ac6..f88496d85 100644 --- a/test/unittest/blas1/blas1_sdsdot_test.cpp +++ b/test/unittest/blas1/blas1_sdsdot_test.cpp @@ -35,7 +35,7 @@ template void run_test(const combination_t combi) { std::string alloc; index_t N; - float sb; + scalar_t sb; index_t incX; index_t incY; api_type api; @@ -53,7 +53,7 @@ void run_test(const combination_t combi) { fill_random(y_v); // Output scalar - scalar_t out_s = 10.0; + scalar_t out_s = 0.0; // Reference implementation auto out_cpu_s = @@ -104,7 +104,7 @@ void run_test(const combination_t combi) { std::string alloc; index_t N; - float sb; + scalar_t sb; index_t incX; index_t incY; api_type api; @@ -149,7 +149,7 @@ static std::string generate_name( const ::testing::TestParamInfo>& info) { std::string alloc; int size, incX, incY; - float sb; + T sb; api_type api; BLAS_GENERATE_NAME(info.param, alloc, api, size, sb, incX, incY); } From 8b32b82f8564748db6b34420abe9f7879a5344d9 Mon Sep 17 00:00:00 2001 From: Ouadie EL FAROUKI Date: Mon, 9 Oct 2023 18:06:00 +0100 Subject: [PATCH 02/10] Fixed a copy2device & sync bug within _dot --- include/interface/blas1_interface.h | 2 -- src/interface/blas1/dot.cpp.in | 1 - src/interface/blas1/dot_return.cpp.in | 1 - src/interface/blas1_interface.hpp | 22 +++++++++++++--------- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/include/interface/blas1_interface.h b/include/interface/blas1_interface.h index b632100f1..a5fd15439 100644 --- a/include/interface/blas1_interface.h +++ b/include/interface/blas1_interface.h @@ -392,7 +392,6 @@ void _rotg(sb_handle_t &sb_handle, scalar_t &a, scalar_t &b, scalar_t &c, * @tparam sb_handle_t SB_Handle type * @tparam container_0_t Buffer Iterator or USM pointer * @tparam container_1_t Buffer Iterator or USM pointer - * @tparam container_2_t Buffer Iterator or USM pointer * @tparam index_t Index type * @tparam increment_t Increment type * @param sb_handle SB_Handle @@ -419,7 +418,6 @@ typename ValueType::type _dot( * @tparam sb_handle_t SB_Handle type * @tparam container_0_t Buffer Iterator or USM pointer * @tparam container_1_t Buffer Iterator or USM pointer - * @tparam container_2_t Buffer Iterator or USM pointer * @tparam index_t Index type * @tparam increment_t Increment type * @param sb_handle SB_Handle diff --git a/src/interface/blas1/dot.cpp.in b/src/interface/blas1/dot.cpp.in index fd91eba1c..ab298051f 100644 --- a/src/interface/blas1/dot.cpp.in +++ b/src/interface/blas1/dot.cpp.in @@ -39,7 +39,6 @@ namespace internal { * @tparam sb_handle_t SB_Handle type * @tparam container_0_t Buffer Iterator or USM Pointer * @tparam container_1_t Buffer Iterator or USM Pointer - * @tparam container_2_t Buffer Iterator or USM Pointer * @tparam index_t Index type * @tparam increment_t Increment type * @param sb_handle SB_Handle diff --git a/src/interface/blas1/dot_return.cpp.in b/src/interface/blas1/dot_return.cpp.in index 320728f4b..09ed29a49 100644 --- a/src/interface/blas1/dot_return.cpp.in +++ b/src/interface/blas1/dot_return.cpp.in @@ -39,7 +39,6 @@ namespace internal { * @tparam sb_handle_t SB_Handle type * @tparam container_0_t Buffer Iterator or USM Pointer * @tparam container_1_t Buffer Iterator or USM Pointer - * @tparam container_2_t Buffer Iterator or USM Pointer * @tparam index_t Index type * @tparam increment_t Increment type * @param sb_handle SB_Handle diff --git a/src/interface/blas1_interface.hpp b/src/interface/blas1_interface.hpp index 61e77a942..8fb63df4a 100644 --- a/src/interface/blas1_interface.hpp +++ b/src/interface/blas1_interface.hpp @@ -737,7 +737,6 @@ void _rotg(sb_handle_t &sb_handle, scalar_t &a, scalar_t &b, scalar_t &c, * @tparam sb_handle_t SB_Handle type * @tparam container_0_t Buffer Iterator or USM pointer * @tparam container_1_t Buffer Iterator or USM pointer - * @tparam container_2_t Buffer Iterator or USM pointer * @tparam index_t Index type * @tparam increment_t Increment type * @param sb_handle SB_Handle @@ -758,21 +757,27 @@ typename ValueType::type _dot( const typename sb_handle_t::event_t &_dependencies) { constexpr bool is_usm = std::is_pointer::value; using element_t = typename ValueType::type; - auto res = std::vector(1); + element_t res = element_t(0); auto gpu_res = helper::allocate < is_usm ? helper::AllocType::usm : helper::AllocType::buffer, element_t > (static_cast(1), sb_handle.get_queue()); - auto dot_event = internal::_dot(sb_handle, _N, _vx, _incx, _vy, _incy, - gpu_res, _dependencies); + auto copy_to_d = + blas::helper::copy_to_device(sb_handle.get_queue(), &res, gpu_res, 1); + typename sb_handle_t::event_t all_deps = concatenate_vectors( + _dependencies, typename sb_handle_t::event_t{copy_to_d}); + + auto dot_event = + internal::_dot(sb_handle, _N, _vx, _incx, _vy, _incy, gpu_res, all_deps); + sb_handle.wait(dot_event); - auto event = - helper::copy_to_host(sb_handle.get_queue(), gpu_res, res.data(), 1); - sb_handle.wait(event); + auto copy_to_h = + helper::copy_to_host(sb_handle.get_queue(), gpu_res, &res, 1); + sb_handle.wait(copy_to_h); helper::deallocate(gpu_res, sb_handle.get_queue()); - return res[0]; + return res; } /** @@ -782,7 +787,6 @@ typename ValueType::type _dot( * @tparam sb_handle_t SB_Handle type * @tparam container_0_t Buffer Iterator or USM pointer * @tparam container_1_t Buffer Iterator or USM pointer - * @tparam container_2_t Buffer Iterator or USM pointer * @tparam index_t Index type * @tparam increment_t Increment type * @param sb_handle SB_Handle From a50f237aa26a552f445b3ea08775baabe68f1d89 Mon Sep 17 00:00:00 2001 From: Ouadie EL FAROUKI Date: Tue, 10 Oct 2023 21:51:11 +0100 Subject: [PATCH 03/10] Added relevant copy to device & init of input/output result --- benchmark/portblas/blas1/dot.cpp | 11 +++--- benchmark/portblas/blas1/sdsdot.cpp | 14 +++++--- src/interface/blas1_interface.hpp | 52 +++++++++++++++++------------ 3 files changed, 47 insertions(+), 30 deletions(-) diff --git a/benchmark/portblas/blas1/dot.cpp b/benchmark/portblas/blas1/dot.cpp index 7fbbf190d..4d45dc577 100644 --- a/benchmark/portblas/blas1/dot.cpp +++ b/benchmark/portblas/blas1/dot.cpp @@ -65,8 +65,10 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, scalar_t vr_temp = 0; { auto vr_temp_gpu = blas::helper::allocate(1, q); + auto copyToD = + blas::helper::copy_to_device(q, &vr_temp, vr_temp_gpu, 1); auto dot_event = _dot(sb_handle, size, inx, static_cast(1), iny, - static_cast(1), vr_temp_gpu); + static_cast(1), vr_temp_gpu, {copyToD}); sb_handle.wait(dot_event); auto copy_output = blas::helper::copy_to_host(q, vr_temp_gpu, &vr_temp, 1); sb_handle.wait(copy_output); @@ -128,8 +130,8 @@ void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success, }; benchmark::RegisterBenchmark( - blas_benchmark::utils::get_name( - size, mem_type).c_str(), + blas_benchmark::utils::get_name(size, mem_type) + .c_str(), BM_lambda, sb_handle_ptr, size, success) ->UseRealTime(); } @@ -141,7 +143,8 @@ void register_benchmark(blas_benchmark::Args& args, auto dot_params = blas_benchmark::utils::get_blas1_params(args); register_benchmark( - sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, dot_params); + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, + dot_params); #ifdef SB_ENABLE_USM register_benchmark( sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, dot_params); diff --git a/benchmark/portblas/blas1/sdsdot.cpp b/benchmark/portblas/blas1/sdsdot.cpp index d270b486d..36e963bf6 100644 --- a/benchmark/portblas/blas1/sdsdot.cpp +++ b/benchmark/portblas/blas1/sdsdot.cpp @@ -63,9 +63,11 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, scalar_t vr_temp = 0; { auto vr_temp_gpu = blas::helper::allocate(1, q); + auto copyToD = + blas::helper::copy_to_device(q, &vr_temp, vr_temp_gpu, 1); auto sdsdot_event = _sdsdot(sb_handle, size, sb, inx, static_cast(1), iny, - static_cast(1), vr_temp_gpu); + static_cast(1), vr_temp_gpu, {copyToD}); sb_handle.wait(sdsdot_event); auto event = blas::helper::copy_to_host(q, vr_temp_gpu, &vr_temp, 1); sb_handle.wait(event); @@ -126,8 +128,8 @@ void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success, run(st, sb_handle_ptr, size, success); }; benchmark::RegisterBenchmark( - blas_benchmark::utils::get_name( - size, mem_type).c_str(), + blas_benchmark::utils::get_name(size, mem_type) + .c_str(), BM_lambda, sb_handle_ptr, size, success) ->UseRealTime(); } @@ -139,10 +141,12 @@ void register_benchmark(blas_benchmark::Args& args, auto sdsdot_params = blas_benchmark::utils::get_blas1_params(args); register_benchmark( - sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, sdsdot_params); + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, + sdsdot_params); #ifdef SB_ENABLE_USM register_benchmark( - sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, sdsdot_params); + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, + sdsdot_params); #endif } diff --git a/src/interface/blas1_interface.hpp b/src/interface/blas1_interface.hpp index 8fb63df4a..9dac8fa4c 100644 --- a/src/interface/blas1_interface.hpp +++ b/src/interface/blas1_interface.hpp @@ -151,16 +151,22 @@ typename sb_handle_t::event_t _sdsdot( sb_handle_t &sb_handle, index_t _N, float sb, container_0_t _vx, increment_t _incx, container_1_t _vy, increment_t _incy, container_2_t _rs, const typename sb_handle_t::event_t &_dependencies) { - typename sb_handle_t::event_t dot_event{}; auto rs = make_vector_view(_rs, static_cast(1), static_cast(1)); - - dot_event = blas::dot::backend::_dot(sb_handle, _N, _vx, _incx, _vy, _incy, - _rs, _dependencies); - auto addOp = make_op(sb, rs); - auto assignOp2 = make_op(rs, addOp); - auto ret2 = sb_handle.execute(assignOp2, dot_event); - return blas::concatenate_vectors(dot_event, ret2); + if (!_N) { + auto addOp = make_op(sb, rs); + auto assignOp = make_op(rs, addOp); + auto ret = sb_handle.execute(assignOp, _dependencies); + return ret; + } else { + typename sb_handle_t::event_t dotOp{}; + dotOp = blas::dot::backend::_dot(sb_handle, _N, _vx, _incx, _vy, _incy, _rs, + _dependencies); + auto addOp = make_op(sb, rs); + auto assignOp2 = make_op(rs, addOp); + auto ret = sb_handle.execute(assignOp2, dotOp); + return blas::concatenate_vectors(dotOp, ret); + } } /** @@ -761,18 +767,17 @@ typename ValueType::type _dot( auto gpu_res = helper::allocate < is_usm ? helper::AllocType::usm : helper::AllocType::buffer, element_t > (static_cast(1), sb_handle.get_queue()); - auto copy_to_d = + auto copyTodD = blas::helper::copy_to_device(sb_handle.get_queue(), &res, gpu_res, 1); typename sb_handle_t::event_t all_deps = concatenate_vectors( - _dependencies, typename sb_handle_t::event_t{copy_to_d}); + _dependencies, typename sb_handle_t::event_t{copyTodD}); - auto dot_event = + auto dotOp = internal::_dot(sb_handle, _N, _vx, _incx, _vy, _incy, gpu_res, all_deps); - sb_handle.wait(dot_event); - auto copy_to_h = - helper::copy_to_host(sb_handle.get_queue(), gpu_res, &res, 1); - sb_handle.wait(copy_to_h); + sb_handle.wait(dotOp); + auto copyToH = helper::copy_to_host(sb_handle.get_queue(), gpu_res, &res, 1); + sb_handle.wait(copyToH); helper::deallocate(gpu_res, @@ -808,16 +813,21 @@ typename ValueType::type _sdsdot( const typename sb_handle_t::event_t &_dependencies) { constexpr bool is_usm = std::is_pointer::value; using element_t = typename ValueType::type; - element_t res{}; + element_t res = element_t(0); auto gpu_res = blas::helper::allocate < is_usm ? helper::AllocType::usm : helper::AllocType::buffer, element_t > (static_cast(1), sb_handle.get_queue()); - auto event1 = blas::internal::_sdsdot(sb_handle, _N, sb, _vx, _incx, _vy, - _incy, gpu_res, _dependencies); - sb_handle.wait(event1); - auto event2 = + auto copyTodD = + blas::helper::copy_to_device(sb_handle.get_queue(), &res, gpu_res, 1); + typename sb_handle_t::event_t all_deps = concatenate_vectors( + _dependencies, typename sb_handle_t::event_t{copyTodD}); + + auto sdsdot_event = blas::internal::_sdsdot(sb_handle, _N, sb, _vx, _incx, + _vy, _incy, gpu_res, all_deps); + sb_handle.wait(sdsdot_event); + auto copyToH = blas::helper::copy_to_host(sb_handle.get_queue(), gpu_res, &res, 1); - sb_handle.wait(event2); + sb_handle.wait(copyToH); blas::helper::deallocate( From 71229e90c679e0a0235ae1b03bec44f997816e9f Mon Sep 17 00:00:00 2001 From: Ouadie EL FAROUKI Date: Fri, 13 Oct 2023 15:59:14 +0100 Subject: [PATCH 04/10] fixed async dot operation bug in rot tests causing failure --- test/unittest/blas1/blas1_rot_test.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/unittest/blas1/blas1_rot_test.cpp b/test/unittest/blas1/blas1_rot_test.cpp index 1a54672e4..1a991a3f5 100644 --- a/test/unittest/blas1/blas1_rot_test.cpp +++ b/test/unittest/blas1/blas1_rot_test.cpp @@ -61,6 +61,7 @@ void run_test(const combination_t combi) { s_d); auto out_cpu_s = reference_blas::dot(size, a_cpu_v.data(), incX, b_cpu_v.data(), incY); + scalar_t init_out_gpu = 0; // SYCL implementation auto q = make_queue(); @@ -77,8 +78,10 @@ void run_test(const combination_t combi) { auto c = static_cast(c_d); auto s = static_cast(s_d); + auto init_copy = helper::copy_to_device(q, &init_out_gpu, gpu_out_s, 1); auto rot_event = _rot(sb_handle, size, gpu_a_v, incX, gpu_b_v, incY, c, s, {copy_a, copy_b}); + sb_handle.wait(init_copy); auto dot_event = _dot(sb_handle, size, gpu_a_v, incX, gpu_b_v, incY, gpu_out_s, {rot_event}); sb_handle.wait(dot_event); From 432bf252267ca342b136fd5ef55596b4ffb98fa1 Mon Sep 17 00:00:00 2001 From: Ouadie EL FAROUKI Date: Wed, 18 Oct 2023 19:31:24 +0100 Subject: [PATCH 05/10] Addressed PR comments --- src/interface/blas1_interface.hpp | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/src/interface/blas1_interface.hpp b/src/interface/blas1_interface.hpp index 9dac8fa4c..46058dfdc 100644 --- a/src/interface/blas1_interface.hpp +++ b/src/interface/blas1_interface.hpp @@ -151,17 +151,19 @@ typename sb_handle_t::event_t _sdsdot( sb_handle_t &sb_handle, index_t _N, float sb, container_0_t _vx, increment_t _incx, container_1_t _vy, increment_t _incy, container_2_t _rs, const typename sb_handle_t::event_t &_dependencies) { - auto rs = make_vector_view(_rs, static_cast(1), - static_cast(1)); if (!_N) { - auto addOp = make_op(sb, rs); - auto assignOp = make_op(rs, addOp); - auto ret = sb_handle.execute(assignOp, _dependencies); + sb_handle.wait(_dependencies); + auto copy_sb = + blas::helper::copy_to_device(sb_handle.get_queue(), &sb, _rs, 1); + sb_handle.wait(copy_sb); + auto ret = concatenate_vectors(_dependencies, + typename sb_handle_t::event_t{copy_sb}); return ret; } else { - typename sb_handle_t::event_t dotOp{}; - dotOp = blas::dot::backend::_dot(sb_handle, _N, _vx, _incx, _vy, _incy, _rs, - _dependencies); + auto rs = make_vector_view(_rs, static_cast(1), + static_cast(1)); + auto dotOp = blas::dot::backend::_dot(sb_handle, _N, _vx, _incx, _vy, _incy, + _rs, _dependencies); auto addOp = make_op(sb, rs); auto assignOp2 = make_op(rs, addOp); auto ret = sb_handle.execute(assignOp2, dotOp); @@ -763,7 +765,7 @@ typename ValueType::type _dot( const typename sb_handle_t::event_t &_dependencies) { constexpr bool is_usm = std::is_pointer::value; using element_t = typename ValueType::type; - element_t res = element_t(0); + element_t res{0}; auto gpu_res = helper::allocate < is_usm ? helper::AllocType::usm : helper::AllocType::buffer, element_t > (static_cast(1), sb_handle.get_queue()); @@ -813,7 +815,7 @@ typename ValueType::type _sdsdot( const typename sb_handle_t::event_t &_dependencies) { constexpr bool is_usm = std::is_pointer::value; using element_t = typename ValueType::type; - element_t res = element_t(0); + element_t res{0}; auto gpu_res = blas::helper::allocate < is_usm ? helper::AllocType::usm : helper::AllocType::buffer, element_t > (static_cast(1), sb_handle.get_queue()); @@ -951,8 +953,7 @@ typename ValueType::type _nrm2( element_t > (static_cast(1), sb_handle.get_queue()); typename sb_handle_t::event_t copy_init_val = {blas::helper::copy_to_device( sb_handle.get_queue(), res.data(), gpu_res, 1)}; - const auto local_deps = - concatenate_vectors(_dependencies, copy_init_val); + const auto local_deps = concatenate_vectors(_dependencies, copy_init_val); auto nrm2_event = blas::internal::_nrm2(sb_handle, _N, _vx, _incx, gpu_res, local_deps); sb_handle.wait(nrm2_event); From 89e495bd4e717ea8f97747527882cf6372e6e709 Mon Sep 17 00:00:00 2001 From: Ouadie EL FAROUKI Date: Thu, 19 Oct 2023 00:09:21 +0100 Subject: [PATCH 06/10] Enabled const specifier for dot & sdsdot inputs --- include/operations/blas1_trees.h | 18 ++++++++++ src/interface/blas1/dot.cpp.in | 5 +++ src/interface/blas1/dot_return.cpp.in | 5 +++ src/interface/blas1/sdsdot.cpp.in | 5 +++ src/interface/blas1/sdsdot_return.cpp.in | 5 +++ src/interface/blas1_interface.hpp | 7 ++-- src/operations/blas1_trees.hpp | 44 ++++++++++++++++++++++++ 7 files changed, 84 insertions(+), 5 deletions(-) diff --git a/include/operations/blas1_trees.h b/include/operations/blas1_trees.h index 1685b4403..d017e37e5 100644 --- a/include/operations/blas1_trees.h +++ b/include/operations/blas1_trees.h @@ -142,6 +142,24 @@ struct BinaryOp { void adjust_access_displacement(); }; +/*! BinaryOpConst. + * @brief Implements a const Binary Operation (x OP z) with x and z vectors. + */ +template +struct BinaryOpConst { + using index_t = typename rhs_t::index_t; + using value_t = typename ResolveReturnType::type::value_t; + lhs_t lhs_; + rhs_t rhs_; + BinaryOpConst(lhs_t &_l, rhs_t &_r); + index_t get_size() const; + bool valid_thread(cl::sycl::nd_item<1> ndItem) const; + value_t eval(index_t i) const; + value_t eval(cl::sycl::nd_item<1> ndItem) const; + void bind(cl::sycl::handler &h); + void adjust_access_displacement(); +}; + /*! TupleOp. * @brief Implements a Tuple Operation (map (\x -> [i, x]) vector). */ diff --git a/src/interface/blas1/dot.cpp.in b/src/interface/blas1/dot.cpp.in index ab298051f..e87f83335 100644 --- a/src/interface/blas1/dot.cpp.in +++ b/src/interface/blas1/dot.cpp.in @@ -61,6 +61,11 @@ template typename SB_Handle::event_t _dot( SB_Handle& sb_handle, ${INDEX_TYPE} _N, ${DATA_TYPE} * _vx, ${INCREMENT_TYPE} _incx, ${DATA_TYPE} * _vy, ${INCREMENT_TYPE} _incy, ${DATA_TYPE} * _rs, const typename SB_Handle::event_t& dependencies); + +template typename SB_Handle::event_t _dot( + SB_Handle& sb_handle, ${INDEX_TYPE} _N, const ${DATA_TYPE} * _vx, + ${INCREMENT_TYPE} _incx, const ${DATA_TYPE} * _vy, ${INCREMENT_TYPE} _incy, + ${DATA_TYPE} * _rs, const typename SB_Handle::event_t& dependencies); #endif } // namespace internal diff --git a/src/interface/blas1/dot_return.cpp.in b/src/interface/blas1/dot_return.cpp.in index 09ed29a49..eded3f48b 100644 --- a/src/interface/blas1/dot_return.cpp.in +++ b/src/interface/blas1/dot_return.cpp.in @@ -60,6 +60,11 @@ template typename ValueType<${DATA_TYPE}>::type _dot( SB_Handle& sb_handle, ${INDEX_TYPE} _N, ${DATA_TYPE} * _vx, ${INCREMENT_TYPE} _incx, ${DATA_TYPE} * _vy, ${INCREMENT_TYPE} _incy, const typename SB_Handle::event_t& dependencies); + +template typename ValueType<${DATA_TYPE}>::type _dot( + SB_Handle& sb_handle, ${INDEX_TYPE} _N, const ${DATA_TYPE} * _vx, + ${INCREMENT_TYPE} _incx, const ${DATA_TYPE} * _vy, ${INCREMENT_TYPE} _incy, + const typename SB_Handle::event_t& dependencies); #endif } // namespace internal diff --git a/src/interface/blas1/sdsdot.cpp.in b/src/interface/blas1/sdsdot.cpp.in index bfbb298e8..0efd628b4 100644 --- a/src/interface/blas1/sdsdot.cpp.in +++ b/src/interface/blas1/sdsdot.cpp.in @@ -65,6 +65,11 @@ template typename SB_Handle::event_t _sdsdot( SB_Handle& sb_handle, ${INDEX_TYPE} _N, float sb, ${DATA_TYPE} * _vx, ${INCREMENT_TYPE} _incx, ${DATA_TYPE} * _vy, ${INCREMENT_TYPE} _incy, ${DATA_TYPE} * _rs, const typename SB_Handle::event_t& dependencies); + +template typename SB_Handle::event_t _sdsdot( + SB_Handle& sb_handle, ${INDEX_TYPE} _N, float sb, const ${DATA_TYPE} * _vx, + ${INCREMENT_TYPE} _incx, const ${DATA_TYPE} * _vy, ${INCREMENT_TYPE} _incy, + ${DATA_TYPE} * _rs, const typename SB_Handle::event_t& dependencies); #endif } // namespace internal diff --git a/src/interface/blas1/sdsdot_return.cpp.in b/src/interface/blas1/sdsdot_return.cpp.in index 719c38fb9..ae2fa4b3a 100644 --- a/src/interface/blas1/sdsdot_return.cpp.in +++ b/src/interface/blas1/sdsdot_return.cpp.in @@ -64,6 +64,11 @@ template typename ValueType<${DATA_TYPE}>::type _sdsdot( SB_Handle& sb_handle, ${INDEX_TYPE} _N, float sb, ${DATA_TYPE} * _vx, ${INCREMENT_TYPE} _incx, ${DATA_TYPE} * _vy, ${INCREMENT_TYPE} _incy, const typename SB_Handle::event_t& dependencies); + +template typename ValueType<${DATA_TYPE}>::type _sdsdot( + SB_Handle& sb_handle, ${INDEX_TYPE} _N, float sb, const ${DATA_TYPE} * _vx, + ${INCREMENT_TYPE} _incx, const ${DATA_TYPE} * _vy, ${INCREMENT_TYPE} _incy, + const typename SB_Handle::event_t& dependencies); #endif } // namespace internal diff --git a/src/interface/blas1_interface.hpp b/src/interface/blas1_interface.hpp index 46058dfdc..3387cb772 100644 --- a/src/interface/blas1_interface.hpp +++ b/src/interface/blas1_interface.hpp @@ -440,17 +440,14 @@ typename sb_handle_t::event_t _dot_impl( const typename sb_handle_t::event_t &_dependencies) { typename sb_handle_t::event_t ret_event; // Skip if N==0, _rs is not overwritten - if (!_N) return ret_event; + if (!_N) return {_dependencies}; - // TODO: (Tanvir) avoid over-writing the input. - // Once this is fixed, we should be able to add - // const support for dot and sdsdot operators. auto vx = make_vector_view(_vx, _incx, _N); auto vy = make_vector_view(_vy, _incy, _N); auto rs = make_vector_view(_rs, static_cast(1), static_cast(1)); - auto prdOp = make_op(vx, vy); + auto prdOp = make_op(vx, vy); auto assignOp = make_wg_atomic_reduction(rs, prdOp); if constexpr (localMemSize) { diff --git a/src/operations/blas1_trees.hpp b/src/operations/blas1_trees.hpp index 1b079c98b..178b8ab86 100644 --- a/src/operations/blas1_trees.hpp +++ b/src/operations/blas1_trees.hpp @@ -385,6 +385,50 @@ BinaryOp::adjust_access_displacement() { rhs_.adjust_access_displacement(); } +/*! BinaryOpConst. + * @brief Implements a const Binary Operation (x OP z) with x and z vectors. + */ +template +BinaryOpConst::BinaryOpConst(lhs_t &_l, rhs_t &_r) + : lhs_(_l), rhs_(_r){}; + +template +PORTBLAS_INLINE typename BinaryOpConst::index_t +BinaryOpConst::get_size() const { + return rhs_.get_size(); +} +template +PORTBLAS_INLINE bool BinaryOpConst::valid_thread( + cl::sycl::nd_item<1> ndItem) const { + return ((ndItem.get_global_id(0) < get_size())); +} + +template +PORTBLAS_INLINE typename BinaryOpConst::value_t +BinaryOpConst::eval( + typename BinaryOpConst::index_t i) const { + return operator_t::eval(lhs_.eval(i), rhs_.eval(i)); +} +template +PORTBLAS_INLINE typename BinaryOpConst::value_t +BinaryOpConst::eval( + cl::sycl::nd_item<1> ndItem) const { + return BinaryOpConst::eval(ndItem.get_global_id(0)); +} +template +PORTBLAS_INLINE void BinaryOpConst::bind( + cl::sycl::handler &h) { + lhs_.bind(h); + rhs_.bind(h); +} + +template +PORTBLAS_INLINE void +BinaryOpConst::adjust_access_displacement() { + lhs_.adjust_access_displacement(); + rhs_.adjust_access_displacement(); +} + /*! TupleOp. * @brief Implements a Tuple Operation (map (\x -> [i, x]) vector). */ From 3c493b6d88a752fbe979cbf05009b7efbe63d634 Mon Sep 17 00:00:00 2001 From: Ouadie EL FAROUKI Date: Thu, 19 Oct 2023 19:20:03 +0100 Subject: [PATCH 07/10] Removed unecessary sync --- src/interface/blas1_interface.hpp | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/interface/blas1_interface.hpp b/src/interface/blas1_interface.hpp index 3387cb772..a98e7b3c7 100644 --- a/src/interface/blas1_interface.hpp +++ b/src/interface/blas1_interface.hpp @@ -153,12 +153,8 @@ typename sb_handle_t::event_t _sdsdot( const typename sb_handle_t::event_t &_dependencies) { if (!_N) { sb_handle.wait(_dependencies); - auto copy_sb = - blas::helper::copy_to_device(sb_handle.get_queue(), &sb, _rs, 1); - sb_handle.wait(copy_sb); - auto ret = concatenate_vectors(_dependencies, - typename sb_handle_t::event_t{copy_sb}); - return ret; + auto ret = blas::helper::copy_to_device(sb_handle.get_queue(), &sb, _rs, 1); + return {ret}; } else { auto rs = make_vector_view(_rs, static_cast(1), static_cast(1)); From 05ed6aa0033fe8846bd858e602abc16d9ad46547 Mon Sep 17 00:00:00 2001 From: Ouadie EL FAROUKI Date: Fri, 20 Oct 2023 11:25:47 +0100 Subject: [PATCH 08/10] Added sync after copy sdsdot --- src/interface/blas1_interface.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/interface/blas1_interface.hpp b/src/interface/blas1_interface.hpp index a98e7b3c7..bb1592a2a 100644 --- a/src/interface/blas1_interface.hpp +++ b/src/interface/blas1_interface.hpp @@ -154,6 +154,7 @@ typename sb_handle_t::event_t _sdsdot( if (!_N) { sb_handle.wait(_dependencies); auto ret = blas::helper::copy_to_device(sb_handle.get_queue(), &sb, _rs, 1); + sb_handle.wait(ret); return {ret}; } else { auto rs = make_vector_view(_rs, static_cast(1), From 5fb556edaed8e7d3a1c4fbf7695890ed35de5e9b Mon Sep 17 00:00:00 2001 From: Ouadie EL FAROUKI Date: Mon, 13 Nov 2023 13:27:25 +0000 Subject: [PATCH 09/10] Fixed copy data type mismatch when using double with sdsdot --- src/interface/blas1_interface.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/interface/blas1_interface.hpp b/src/interface/blas1_interface.hpp index bb1592a2a..f0acf8496 100644 --- a/src/interface/blas1_interface.hpp +++ b/src/interface/blas1_interface.hpp @@ -152,8 +152,10 @@ typename sb_handle_t::event_t _sdsdot( increment_t _incx, container_1_t _vy, increment_t _incy, container_2_t _rs, const typename sb_handle_t::event_t &_dependencies) { if (!_N) { + using element_t = typename ValueType::type; sb_handle.wait(_dependencies); - auto ret = blas::helper::copy_to_device(sb_handle.get_queue(), &sb, _rs, 1); + auto ret = blas::helper::copy_to_device( + sb_handle.get_queue(), reinterpret_cast(&sb), _rs, 1); sb_handle.wait(ret); return {ret}; } else { From eaaf2e0709587ffc667264ed9e5e1a9becafacda Mon Sep 17 00:00:00 2001 From: Ouadie EL FAROUKI Date: Thu, 16 Nov 2023 15:59:14 +0000 Subject: [PATCH 10/10] fixed param type in sdsdot --- test/unittest/blas1/blas1_sdsdot_test.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/unittest/blas1/blas1_sdsdot_test.cpp b/test/unittest/blas1/blas1_sdsdot_test.cpp index f88496d85..b45b5c406 100644 --- a/test/unittest/blas1/blas1_sdsdot_test.cpp +++ b/test/unittest/blas1/blas1_sdsdot_test.cpp @@ -35,7 +35,7 @@ template void run_test(const combination_t combi) { std::string alloc; index_t N; - scalar_t sb; + float sb; index_t incX; index_t incY; api_type api; @@ -104,7 +104,7 @@ void run_test(const combination_t combi) { std::string alloc; index_t N; - scalar_t sb; + float sb; index_t incX; index_t incY; api_type api; @@ -127,7 +127,7 @@ const auto combi = ::testing::Combine( ::testing::Values("usm", "buf"), // allocation type ::testing::Values(api_type::async, api_type::sync), // Api ::testing::Values(11, 65, 1002, 1002400), // N - ::testing::Values(9.5f, 0.5f), // sb + ::testing::Values(9.5f, 0.5f), // sb ::testing::Values(1, 4), // incX ::testing::Values(1, 3) // incY ); @@ -137,7 +137,7 @@ const auto combi = ::testing::Combine( ::testing::Values("usm", "buf"), // allocation type ::testing::Values(api_type::async, api_type::sync), // Api ::testing::Values(11, 1002, 0), // N - ::testing::Values(9.5f, 0.5f, 0.0f), // sb + ::testing::Values(9.5f, 0.5f, 0.0f), // sb ::testing::Values(1, 4), // incX ::testing::Values(1, 3) // incY @@ -149,7 +149,7 @@ static std::string generate_name( const ::testing::TestParamInfo>& info) { std::string alloc; int size, incX, incY; - T sb; + float sb; api_type api; BLAS_GENERATE_NAME(info.param, alloc, api, size, sb, incX, incY); }