diff --git a/benchmark/portblas/blas1/dot.cpp b/benchmark/portblas/blas1/dot.cpp index 7fbbf190d..4d45dc577 100644 --- a/benchmark/portblas/blas1/dot.cpp +++ b/benchmark/portblas/blas1/dot.cpp @@ -65,8 +65,10 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, scalar_t vr_temp = 0; { auto vr_temp_gpu = blas::helper::allocate(1, q); + auto copyToD = + blas::helper::copy_to_device(q, &vr_temp, vr_temp_gpu, 1); auto dot_event = _dot(sb_handle, size, inx, static_cast(1), iny, - static_cast(1), vr_temp_gpu); + static_cast(1), vr_temp_gpu, {copyToD}); sb_handle.wait(dot_event); auto copy_output = blas::helper::copy_to_host(q, vr_temp_gpu, &vr_temp, 1); sb_handle.wait(copy_output); @@ -128,8 +130,8 @@ void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success, }; benchmark::RegisterBenchmark( - blas_benchmark::utils::get_name( - size, mem_type).c_str(), + blas_benchmark::utils::get_name(size, mem_type) + .c_str(), BM_lambda, sb_handle_ptr, size, success) ->UseRealTime(); } @@ -141,7 +143,8 @@ void register_benchmark(blas_benchmark::Args& args, auto dot_params = blas_benchmark::utils::get_blas1_params(args); register_benchmark( - sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, dot_params); + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, + dot_params); #ifdef SB_ENABLE_USM register_benchmark( sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, dot_params); diff --git a/benchmark/portblas/blas1/sdsdot.cpp b/benchmark/portblas/blas1/sdsdot.cpp index d270b486d..36e963bf6 100644 --- a/benchmark/portblas/blas1/sdsdot.cpp +++ b/benchmark/portblas/blas1/sdsdot.cpp @@ -63,9 +63,11 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, scalar_t vr_temp = 0; { auto vr_temp_gpu = blas::helper::allocate(1, q); + auto copyToD = + blas::helper::copy_to_device(q, &vr_temp, vr_temp_gpu, 1); auto sdsdot_event = _sdsdot(sb_handle, size, sb, inx, static_cast(1), iny, - static_cast(1), vr_temp_gpu); + static_cast(1), vr_temp_gpu, {copyToD}); sb_handle.wait(sdsdot_event); auto event = blas::helper::copy_to_host(q, vr_temp_gpu, &vr_temp, 1); sb_handle.wait(event); @@ -126,8 +128,8 @@ void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success, run(st, sb_handle_ptr, size, success); }; benchmark::RegisterBenchmark( - blas_benchmark::utils::get_name( - size, mem_type).c_str(), + blas_benchmark::utils::get_name(size, mem_type) + .c_str(), BM_lambda, sb_handle_ptr, size, success) ->UseRealTime(); } @@ -139,10 +141,12 @@ void register_benchmark(blas_benchmark::Args& args, auto sdsdot_params = blas_benchmark::utils::get_blas1_params(args); register_benchmark( - sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, sdsdot_params); + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, + sdsdot_params); #ifdef SB_ENABLE_USM register_benchmark( - sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, sdsdot_params); + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, + sdsdot_params); #endif } diff --git a/include/interface/blas1_interface.h b/include/interface/blas1_interface.h index eef09e7b8..a5fd15439 100644 --- a/include/interface/blas1_interface.h +++ b/include/interface/blas1_interface.h @@ -225,6 +225,19 @@ typename sb_handle_t::event_t _nrm2_impl( container_1_t _rs, const index_t number_WG, const typename sb_handle_t::event_t &_dependencies); +/*! + * \brief Prototype for the internal implementation of the Dot operator. See + * documentation in the blas1_interface.hpp file for details. + */ +template +typename sb_handle_t::event_t _dot_impl( + sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx, + container_1_t _vy, increment_t _incy, container_2_t _rs, + const index_t _number_wg, + const typename sb_handle_t::event_t &_dependencies); + /** * @brief _rot constructor given plane rotation * @param sb_handle SB_Handle @@ -306,12 +319,12 @@ typename sb_handle_t::event_t _rotm( * @tparam container_3_t Buffer Iterator or USM pointer * @tparam container_4_t Buffer Iterator or USM pointer * @param sb_handle SB_Handle - * @param _d1[in,out] On entry, memory object holding the scaling factor for the - * x-coordinate. On exit, the re-scaled _d1. - * @param _d2[in,out] On entry, memory object holding the scaling factor for the - * y-coordinate. On exit, the re-scaled _d2. - * @param _x1[in,out] On entry, memory object holding the x-coordinate. On exit, - * the re-scaled _x1 + * @param _d1[in,out] On entry, memory object holding the scaling factor for + * the x-coordinate. On exit, the re-scaled _d1. + * @param _d2[in,out] On entry, memory object holding the scaling factor for + * the y-coordinate. On exit, the re-scaled _d2. + * @param _x1[in,out] On entry, memory object holding the x-coordinate. On + * exit, the re-scaled _x1 * @param _y1[in] Memory object holding the y-coordinate of the point. * @param _param[out] Buffer with the following layout: [flag, h11, h21, h12, * h22]. @@ -359,8 +372,10 @@ typename sb_handle_t::event_t _rotg( * @tparam sb_handle_t SB_Handle type * @tparam scalar_t Scalar type * @param sb_handle SB_Handle - * @param a[in, out] On entry, x-coordinate of the point. On exit, the scalar z. - * @param b[in, out] On entry, y-coordinate of the point. On exit, the scalar r. + * @param a[in, out] On entry, x-coordinate of the point. On exit, the scalar + * z. + * @param b[in, out] On entry, y-coordinate of the point. On exit, the scalar + * r. * @param c[out] scalar representing the output c. * @param s[out] scalar representing the output s. * @param _dependencies Vector of events @@ -377,7 +392,6 @@ void _rotg(sb_handle_t &sb_handle, scalar_t &a, scalar_t &b, scalar_t &c, * @tparam sb_handle_t SB_Handle type * @tparam container_0_t Buffer Iterator or USM pointer * @tparam container_1_t Buffer Iterator or USM pointer - * @tparam container_2_t Buffer Iterator or USM pointer * @tparam index_t Index type * @tparam increment_t Increment type * @param sb_handle SB_Handle @@ -404,7 +418,6 @@ typename ValueType::type _dot( * @tparam sb_handle_t SB_Handle type * @tparam container_0_t Buffer Iterator or USM pointer * @tparam container_1_t Buffer Iterator or USM pointer - * @tparam container_2_t Buffer Iterator or USM pointer * @tparam index_t Index type * @tparam increment_t Increment type * @param sb_handle SB_Handle @@ -754,12 +767,12 @@ typename sb_handle_t::event_t _rotm( * @tparam container_3_t Buffer Iterator or USM pointer * @tparam container_4_t Buffer Iterator or USM pointer * @param sb_handle SB_Handle - * @param _d1[in,out] On entry, memory object holding the scaling factor for the - * x-coordinate. On exit, the re-scaled _d1. - * @param _d2[in,out] On entry, memory object holding the scaling factor for the - * y-coordinate. On exit, the re-scaled _d2. - * @param _x1[in,out] On entry, memory object holding the x-coordinate. On exit, - * the re-scaled _x1 + * @param _d1[in,out] On entry, memory object holding the scaling factor for + * the x-coordinate. On exit, the re-scaled _d1. + * @param _d2[in,out] On entry, memory object holding the scaling factor for + * the y-coordinate. On exit, the re-scaled _d2. + * @param _x1[in,out] On entry, memory object holding the x-coordinate. On + * exit, the re-scaled _x1 * @param _y1[in] Memory object holding the y-coordinate of the point. * @param _param[out] Buffer with the following layout: [flag, h11, h21, h12, * h22]. @@ -811,8 +824,10 @@ typename sb_handle_t::event_t _rotg( * @tparam sb_handle_t SB_Handle type * @tparam scalar_t Scalar type * @param sb_handle SB_Handle - * @param a[in, out] On entry, x-coordinate of the point. On exit, the scalar z. - * @param b[in, out] On entry, y-coordinate of the point. On exit, the scalar r. + * @param a[in, out] On entry, x-coordinate of the point. On exit, the scalar + * z. + * @param b[in, out] On entry, y-coordinate of the point. On exit, the scalar + * r. * @param c[out] scalar representing the output c. * @param s[out] scalar representing the output s. * @param _dependencies Vector of events diff --git a/include/operations/blas1_trees.h b/include/operations/blas1_trees.h index 1685b4403..d017e37e5 100644 --- a/include/operations/blas1_trees.h +++ b/include/operations/blas1_trees.h @@ -142,6 +142,24 @@ struct BinaryOp { void adjust_access_displacement(); }; +/*! BinaryOpConst. + * @brief Implements a const Binary Operation (x OP z) with x and z vectors. + */ +template +struct BinaryOpConst { + using index_t = typename rhs_t::index_t; + using value_t = typename ResolveReturnType::type::value_t; + lhs_t lhs_; + rhs_t rhs_; + BinaryOpConst(lhs_t &_l, rhs_t &_r); + index_t get_size() const; + bool valid_thread(cl::sycl::nd_item<1> ndItem) const; + value_t eval(index_t i) const; + value_t eval(cl::sycl::nd_item<1> ndItem) const; + void bind(cl::sycl::handler &h); + void adjust_access_displacement(); +}; + /*! TupleOp. * @brief Implements a Tuple Operation (map (\x -> [i, x]) vector). */ diff --git a/src/interface/blas1/backend/amd_gpu.hpp b/src/interface/blas1/backend/amd_gpu.hpp index 071436eea..a4d1df1ea 100644 --- a/src/interface/blas1/backend/amd_gpu.hpp +++ b/src/interface/blas1/backend/amd_gpu.hpp @@ -70,6 +70,29 @@ typename sb_handle_t::event_t _nrm2( } } // namespace backend } // namespace nrm2 + +namespace dot { +namespace backend { +template +typename sb_handle_t::event_t _dot( + sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx, + container_1_t _vy, increment_t _incy, container_2_t _rs, + const typename sb_handle_t::event_t& _dependencies) { + if (_N < (1 << 18)) { + constexpr index_t localSize = 1024; + const index_t number_WG = (_N + localSize - 1) / localSize; + return blas::internal::_dot_impl(localSize), 32>( + sb_handle, _N, _vx, _incx, _vy, _incy, _rs, number_WG, _dependencies); + } else { + constexpr int localSize = 512; + constexpr index_t number_WG = 512; + return blas::internal::_dot_impl( + sb_handle, _N, _vx, _incx, _vy, _incy, _rs, number_WG, _dependencies); + } +} +} // namespace backend +} // namespace dot } // namespace blas #endif diff --git a/src/interface/blas1/backend/default_cpu.hpp b/src/interface/blas1/backend/default_cpu.hpp index 24b3a29a5..f64e9b5b1 100644 --- a/src/interface/blas1/backend/default_cpu.hpp +++ b/src/interface/blas1/backend/default_cpu.hpp @@ -56,6 +56,22 @@ typename sb_handle_t::event_t _nrm2( } } // namespace backend } // namespace nrm2 + +namespace dot { +namespace backend { +template +typename sb_handle_t::event_t _dot( + sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx, + container_1_t _vy, increment_t _incy, container_2_t _rs, + const typename sb_handle_t::event_t& _dependencies) { + constexpr int localSize = 8; + constexpr index_t number_WG = 16; + return blas::internal::_dot_impl( + sb_handle, _N, _vx, _incx, _vy, _incy, _rs, number_WG, _dependencies); +} +} // namespace backend +} // namespace dot } // namespace blas #endif diff --git a/src/interface/blas1/backend/intel_gpu.hpp b/src/interface/blas1/backend/intel_gpu.hpp index 1115f3e6e..5cb6c106c 100644 --- a/src/interface/blas1/backend/intel_gpu.hpp +++ b/src/interface/blas1/backend/intel_gpu.hpp @@ -59,6 +59,23 @@ typename sb_handle_t::event_t _nrm2( } // namespace backend } // namespace nrm2 +namespace dot { +namespace backend { +template +typename sb_handle_t::event_t _dot( + sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx, + container_1_t _vy, increment_t _incy, container_2_t _rs, + const typename sb_handle_t::event_t& _dependencies) { + constexpr index_t localSize = 128; + const index_t number_WG = + std::min((_N + localSize - 1) / localSize, static_cast(512)); + return blas::internal::_dot_impl(localSize), 32>( + sb_handle, _N, _vx, _incx, _vy, _incy, _rs, number_WG, _dependencies); +} +} // namespace backend +} // namespace dot + } // namespace blas #endif diff --git a/src/interface/blas1/backend/nvidia_gpu.hpp b/src/interface/blas1/backend/nvidia_gpu.hpp index 15ee336bb..5a7be241a 100644 --- a/src/interface/blas1/backend/nvidia_gpu.hpp +++ b/src/interface/blas1/backend/nvidia_gpu.hpp @@ -75,6 +75,32 @@ typename sb_handle_t::event_t _nrm2( } // namespace backend } // namespace nrm2 +namespace dot { +namespace backend { +template +typename sb_handle_t::event_t _dot( + sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx, + container_1_t _vy, increment_t _incy, container_2_t _rs, + const typename sb_handle_t::event_t& _dependencies) { + if (_N < (1 << 23)) { + constexpr index_t localSize = 512; + const index_t number_WG = (_N < (1 << 18)) + ? (_N + localSize - 1) / localSize + : static_cast(256); + + return blas::internal::_dot_impl(localSize), 32>( + sb_handle, _N, _vx, _incx, _vy, _incy, _rs, number_WG, _dependencies); + } else { + constexpr int localSize = 512; + constexpr index_t number_WG = 1024; + return blas::internal::_dot_impl( + sb_handle, _N, _vx, _incx, _vy, _incy, _rs, number_WG, _dependencies); + } +} +} // namespace backend +} // namespace dot + } // namespace blas #endif diff --git a/src/interface/blas1/dot.cpp.in b/src/interface/blas1/dot.cpp.in index fd91eba1c..e87f83335 100644 --- a/src/interface/blas1/dot.cpp.in +++ b/src/interface/blas1/dot.cpp.in @@ -39,7 +39,6 @@ namespace internal { * @tparam sb_handle_t SB_Handle type * @tparam container_0_t Buffer Iterator or USM Pointer * @tparam container_1_t Buffer Iterator or USM Pointer - * @tparam container_2_t Buffer Iterator or USM Pointer * @tparam index_t Index type * @tparam increment_t Increment type * @param sb_handle SB_Handle @@ -62,6 +61,11 @@ template typename SB_Handle::event_t _dot( SB_Handle& sb_handle, ${INDEX_TYPE} _N, ${DATA_TYPE} * _vx, ${INCREMENT_TYPE} _incx, ${DATA_TYPE} * _vy, ${INCREMENT_TYPE} _incy, ${DATA_TYPE} * _rs, const typename SB_Handle::event_t& dependencies); + +template typename SB_Handle::event_t _dot( + SB_Handle& sb_handle, ${INDEX_TYPE} _N, const ${DATA_TYPE} * _vx, + ${INCREMENT_TYPE} _incx, const ${DATA_TYPE} * _vy, ${INCREMENT_TYPE} _incy, + ${DATA_TYPE} * _rs, const typename SB_Handle::event_t& dependencies); #endif } // namespace internal diff --git a/src/interface/blas1/dot_return.cpp.in b/src/interface/blas1/dot_return.cpp.in index 320728f4b..eded3f48b 100644 --- a/src/interface/blas1/dot_return.cpp.in +++ b/src/interface/blas1/dot_return.cpp.in @@ -39,7 +39,6 @@ namespace internal { * @tparam sb_handle_t SB_Handle type * @tparam container_0_t Buffer Iterator or USM Pointer * @tparam container_1_t Buffer Iterator or USM Pointer - * @tparam container_2_t Buffer Iterator or USM Pointer * @tparam index_t Index type * @tparam increment_t Increment type * @param sb_handle SB_Handle @@ -61,6 +60,11 @@ template typename ValueType<${DATA_TYPE}>::type _dot( SB_Handle& sb_handle, ${INDEX_TYPE} _N, ${DATA_TYPE} * _vx, ${INCREMENT_TYPE} _incx, ${DATA_TYPE} * _vy, ${INCREMENT_TYPE} _incy, const typename SB_Handle::event_t& dependencies); + +template typename ValueType<${DATA_TYPE}>::type _dot( + SB_Handle& sb_handle, ${INDEX_TYPE} _N, const ${DATA_TYPE} * _vx, + ${INCREMENT_TYPE} _incx, const ${DATA_TYPE} * _vy, ${INCREMENT_TYPE} _incy, + const typename SB_Handle::event_t& dependencies); #endif } // namespace internal diff --git a/src/interface/blas1/sdsdot.cpp.in b/src/interface/blas1/sdsdot.cpp.in index bfbb298e8..0efd628b4 100644 --- a/src/interface/blas1/sdsdot.cpp.in +++ b/src/interface/blas1/sdsdot.cpp.in @@ -65,6 +65,11 @@ template typename SB_Handle::event_t _sdsdot( SB_Handle& sb_handle, ${INDEX_TYPE} _N, float sb, ${DATA_TYPE} * _vx, ${INCREMENT_TYPE} _incx, ${DATA_TYPE} * _vy, ${INCREMENT_TYPE} _incy, ${DATA_TYPE} * _rs, const typename SB_Handle::event_t& dependencies); + +template typename SB_Handle::event_t _sdsdot( + SB_Handle& sb_handle, ${INDEX_TYPE} _N, float sb, const ${DATA_TYPE} * _vx, + ${INCREMENT_TYPE} _incx, const ${DATA_TYPE} * _vy, ${INCREMENT_TYPE} _incy, + ${DATA_TYPE} * _rs, const typename SB_Handle::event_t& dependencies); #endif } // namespace internal diff --git a/src/interface/blas1/sdsdot_return.cpp.in b/src/interface/blas1/sdsdot_return.cpp.in index 719c38fb9..ae2fa4b3a 100644 --- a/src/interface/blas1/sdsdot_return.cpp.in +++ b/src/interface/blas1/sdsdot_return.cpp.in @@ -64,6 +64,11 @@ template typename ValueType<${DATA_TYPE}>::type _sdsdot( SB_Handle& sb_handle, ${INDEX_TYPE} _N, float sb, ${DATA_TYPE} * _vx, ${INCREMENT_TYPE} _incx, ${DATA_TYPE} * _vy, ${INCREMENT_TYPE} _incy, const typename SB_Handle::event_t& dependencies); + +template typename ValueType<${DATA_TYPE}>::type _sdsdot( + SB_Handle& sb_handle, ${INDEX_TYPE} _N, float sb, const ${DATA_TYPE} * _vx, + ${INCREMENT_TYPE} _incx, const ${DATA_TYPE} * _vy, ${INCREMENT_TYPE} _incy, + const typename SB_Handle::event_t& dependencies); #endif } // namespace internal diff --git a/src/interface/blas1_interface.hpp b/src/interface/blas1_interface.hpp index 899b97011..f0acf8496 100644 --- a/src/interface/blas1_interface.hpp +++ b/src/interface/blas1_interface.hpp @@ -120,22 +120,8 @@ typename sb_handle_t::event_t _dot( sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx, container_1_t _vy, increment_t _incy, container_2_t _rs, const typename sb_handle_t::event_t &_dependencies) { - auto vx = make_vector_view(_vx, _incx, _N); - auto vy = make_vector_view(_vy, _incy, _N); - auto rs = make_vector_view(_rs, static_cast(1), - static_cast(1)); - // TODO: (Tanvir) avoid over-writing the input. - // Once this is fixed, we should be able to add - // const support for dot and sdsdot operators. - auto prdOp = make_op(vx, vy); - - auto localSize = sb_handle.get_work_group_size(); - auto nWG = 2 * localSize; - - auto assignOp = - make_assign_reduction(rs, prdOp, localSize, localSize * nWG); - auto ret = sb_handle.execute(assignOp, _dependencies); - return ret; + return blas::dot::backend::_dot(sb_handle, _N, _vx, _incx, _vy, _incy, _rs, + _dependencies); } /** @@ -165,16 +151,23 @@ typename sb_handle_t::event_t _sdsdot( sb_handle_t &sb_handle, index_t _N, float sb, container_0_t _vx, increment_t _incx, container_1_t _vy, increment_t _incy, container_2_t _rs, const typename sb_handle_t::event_t &_dependencies) { - typename sb_handle_t::event_t dot_event{}; - auto rs = make_vector_view(_rs, static_cast(1), - static_cast(1)); - - dot_event = - internal::_dot(sb_handle, _N, _vx, _incx, _vy, _incy, _rs, _dependencies); - auto addOp = make_op(sb, rs); - auto assignOp2 = make_op(rs, addOp); - auto ret2 = sb_handle.execute(assignOp2, dot_event); - return blas::concatenate_vectors(dot_event, ret2); + if (!_N) { + using element_t = typename ValueType::type; + sb_handle.wait(_dependencies); + auto ret = blas::helper::copy_to_device( + sb_handle.get_queue(), reinterpret_cast(&sb), _rs, 1); + sb_handle.wait(ret); + return {ret}; + } else { + auto rs = make_vector_view(_rs, static_cast(1), + static_cast(1)); + auto dotOp = blas::dot::backend::_dot(sb_handle, _N, _vx, _incx, _vy, _incy, + _rs, _dependencies); + auto addOp = make_op(sb, rs); + auto assignOp2 = make_op(rs, addOp); + auto ret = sb_handle.execute(assignOp2, dotOp); + return blas::concatenate_vectors(dotOp, ret); + } } /** @@ -329,7 +322,7 @@ typename sb_handle_t::event_t _swap( } /** - * \brief SCALAR operation on a vector + * \brief SCALAR operation on a vector * @param sb_handle_t sb_handle * @param _vx BufferIterator or USM pointer * @param _incx Increment in X axis @@ -418,6 +411,57 @@ typename sb_handle_t::event_t _nrm2_impl( return blas::concatenate_vectors(ret0, ret1); } +/** + * @brief _dot_impl Internal implementation of the dot operator. + * + * This function contains the code that sets up and executes the kernels + * required to perform the dot operation (also used in sdsdot). + * + * This function is called by blas::dot::backend::_dot which, depending on + * the TUNING_TARGET and other RT parameters (size for instance), selects + * different template parameters / configuration to ensure the adequate kernel + * is called. + * + * @tparam localSize specifies the number of threads per work group used by + * the kernel + * @tparam localMemSize specifies the size of local shared memory to use, which + * is device and implementation dependent. If 0 the + * implementation use a kernel implementation which doesn't + * require local memory. + */ +template +typename sb_handle_t::event_t _dot_impl( + sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx, + container_1_t _vy, increment_t _incy, container_2_t _rs, + const index_t _number_wg, + const typename sb_handle_t::event_t &_dependencies) { + typename sb_handle_t::event_t ret_event; + // Skip if N==0, _rs is not overwritten + if (!_N) return {_dependencies}; + + auto vx = make_vector_view(_vx, _incx, _N); + auto vy = make_vector_view(_vy, _incy, _N); + auto rs = make_vector_view(_rs, static_cast(1), + static_cast(1)); + + auto prdOp = make_op(vx, vy); + auto assignOp = make_wg_atomic_reduction(rs, prdOp); + + if constexpr (localMemSize) { + ret_event = + sb_handle.execute(assignOp, static_cast(localSize), + static_cast(_number_wg * localSize), + static_cast(localMemSize), _dependencies); + } else { + ret_event = sb_handle.execute(assignOp, static_cast(localSize), + static_cast(_number_wg * localSize), + _dependencies); + } + return ret_event; +} + /** * . * @brief _rot constructor given plane rotation @@ -697,7 +741,6 @@ void _rotg(sb_handle_t &sb_handle, scalar_t &a, scalar_t &b, scalar_t &c, * @tparam sb_handle_t SB_Handle type * @tparam container_0_t Buffer Iterator or USM pointer * @tparam container_1_t Buffer Iterator or USM pointer - * @tparam container_2_t Buffer Iterator or USM pointer * @tparam index_t Index type * @tparam increment_t Increment type * @param sb_handle SB_Handle @@ -718,21 +761,26 @@ typename ValueType::type _dot( const typename sb_handle_t::event_t &_dependencies) { constexpr bool is_usm = std::is_pointer::value; using element_t = typename ValueType::type; - auto res = std::vector(1); + element_t res{0}; auto gpu_res = helper::allocate < is_usm ? helper::AllocType::usm : helper::AllocType::buffer, element_t > (static_cast(1), sb_handle.get_queue()); - auto dot_event = internal::_dot(sb_handle, _N, _vx, _incx, _vy, _incy, - gpu_res, _dependencies); - sb_handle.wait(dot_event); - auto event = - helper::copy_to_host(sb_handle.get_queue(), gpu_res, res.data(), 1); - sb_handle.wait(event); + auto copyTodD = + blas::helper::copy_to_device(sb_handle.get_queue(), &res, gpu_res, 1); + typename sb_handle_t::event_t all_deps = concatenate_vectors( + _dependencies, typename sb_handle_t::event_t{copyTodD}); + + auto dotOp = + internal::_dot(sb_handle, _N, _vx, _incx, _vy, _incy, gpu_res, all_deps); + + sb_handle.wait(dotOp); + auto copyToH = helper::copy_to_host(sb_handle.get_queue(), gpu_res, &res, 1); + sb_handle.wait(copyToH); helper::deallocate(gpu_res, sb_handle.get_queue()); - return res[0]; + return res; } /** @@ -742,7 +790,6 @@ typename ValueType::type _dot( * @tparam sb_handle_t SB_Handle type * @tparam container_0_t Buffer Iterator or USM pointer * @tparam container_1_t Buffer Iterator or USM pointer - * @tparam container_2_t Buffer Iterator or USM pointer * @tparam index_t Index type * @tparam increment_t Increment type * @param sb_handle SB_Handle @@ -764,16 +811,21 @@ typename ValueType::type _sdsdot( const typename sb_handle_t::event_t &_dependencies) { constexpr bool is_usm = std::is_pointer::value; using element_t = typename ValueType::type; - element_t res{}; + element_t res{0}; auto gpu_res = blas::helper::allocate < is_usm ? helper::AllocType::usm : helper::AllocType::buffer, element_t > (static_cast(1), sb_handle.get_queue()); - auto event1 = blas::internal::_sdsdot(sb_handle, _N, sb, _vx, _incx, _vy, - _incy, gpu_res, _dependencies); - sb_handle.wait(event1); - auto event2 = + auto copyTodD = + blas::helper::copy_to_device(sb_handle.get_queue(), &res, gpu_res, 1); + typename sb_handle_t::event_t all_deps = concatenate_vectors( + _dependencies, typename sb_handle_t::event_t{copyTodD}); + + auto sdsdot_event = blas::internal::_sdsdot(sb_handle, _N, sb, _vx, _incx, + _vy, _incy, gpu_res, all_deps); + sb_handle.wait(sdsdot_event); + auto copyToH = blas::helper::copy_to_host(sb_handle.get_queue(), gpu_res, &res, 1); - sb_handle.wait(event2); + sb_handle.wait(copyToH); blas::helper::deallocate( @@ -897,8 +949,7 @@ typename ValueType::type _nrm2( element_t > (static_cast(1), sb_handle.get_queue()); typename sb_handle_t::event_t copy_init_val = {blas::helper::copy_to_device( sb_handle.get_queue(), res.data(), gpu_res, 1)}; - const auto local_deps = - concatenate_vectors(_dependencies, copy_init_val); + const auto local_deps = concatenate_vectors(_dependencies, copy_init_val); auto nrm2_event = blas::internal::_nrm2(sb_handle, _N, _vx, _incx, gpu_res, local_deps); sb_handle.wait(nrm2_event); diff --git a/src/operations/blas1_trees.hpp b/src/operations/blas1_trees.hpp index 1b079c98b..178b8ab86 100644 --- a/src/operations/blas1_trees.hpp +++ b/src/operations/blas1_trees.hpp @@ -385,6 +385,50 @@ BinaryOp::adjust_access_displacement() { rhs_.adjust_access_displacement(); } +/*! BinaryOpConst. + * @brief Implements a const Binary Operation (x OP z) with x and z vectors. + */ +template +BinaryOpConst::BinaryOpConst(lhs_t &_l, rhs_t &_r) + : lhs_(_l), rhs_(_r){}; + +template +PORTBLAS_INLINE typename BinaryOpConst::index_t +BinaryOpConst::get_size() const { + return rhs_.get_size(); +} +template +PORTBLAS_INLINE bool BinaryOpConst::valid_thread( + cl::sycl::nd_item<1> ndItem) const { + return ((ndItem.get_global_id(0) < get_size())); +} + +template +PORTBLAS_INLINE typename BinaryOpConst::value_t +BinaryOpConst::eval( + typename BinaryOpConst::index_t i) const { + return operator_t::eval(lhs_.eval(i), rhs_.eval(i)); +} +template +PORTBLAS_INLINE typename BinaryOpConst::value_t +BinaryOpConst::eval( + cl::sycl::nd_item<1> ndItem) const { + return BinaryOpConst::eval(ndItem.get_global_id(0)); +} +template +PORTBLAS_INLINE void BinaryOpConst::bind( + cl::sycl::handler &h) { + lhs_.bind(h); + rhs_.bind(h); +} + +template +PORTBLAS_INLINE void +BinaryOpConst::adjust_access_displacement() { + lhs_.adjust_access_displacement(); + rhs_.adjust_access_displacement(); +} + /*! TupleOp. * @brief Implements a Tuple Operation (map (\x -> [i, x]) vector). */ diff --git a/test/unittest/CMakeLists.txt b/test/unittest/CMakeLists.txt index b4d2b0a3b..54b386df8 100644 --- a/test/unittest/CMakeLists.txt +++ b/test/unittest/CMakeLists.txt @@ -40,6 +40,7 @@ set(SYCL_UNITTEST_SRCS ${PORTBLAS_UNITTEST}/blas1/blas1_rotg_test.cpp ${PORTBLAS_UNITTEST}/blas1/blas1_sdsdot_test.cpp ${PORTBLAS_UNITTEST}/blas1/blas1_nrm2_test.cpp + ${PORTBLAS_UNITTEST}/blas1/blas1_dot_test.cpp # # Blas 2 tests ${PORTBLAS_UNITTEST}/blas2/blas2_gbmv_test.cpp ${PORTBLAS_UNITTEST}/blas2/blas2_gemv_test.cpp @@ -85,7 +86,6 @@ if(is_computecpp) set(SYCL_UNITTEST_SRCS ${SYCL_UNITTEST_SRCS} # Blas 1 tests ${PORTBLAS_UNITTEST}/blas1/blas1_swap_test.cpp - ${PORTBLAS_UNITTEST}/blas1/blas1_dot_test.cpp ${PORTBLAS_UNITTEST}/blas1/blas1_iamax_test.cpp ${PORTBLAS_UNITTEST}/blas1/blas1_iamin_test.cpp # Blas 2 tests diff --git a/test/unittest/blas1/blas1_dot_test.cpp b/test/unittest/blas1/blas1_dot_test.cpp index c9401b68e..ba9406b56 100644 --- a/test/unittest/blas1/blas1_dot_test.cpp +++ b/test/unittest/blas1/blas1_dot_test.cpp @@ -26,7 +26,8 @@ #include "blas_test.hpp" template -using combination_t = std::tuple; +using combination_t = + std::tuple; template void run_test(const combination_t combi) { @@ -35,7 +36,8 @@ void run_test(const combination_t combi) { index_t size; index_t incX; index_t incY; - std::tie(alloc, api, size, incX, incY) = combi; + scalar_t unused; /* Necessary to work around dpcpp compiler bug */ + std::tie(alloc, api, size, incX, incY, unused) = combi; // Input vectors std::vector x_v(size * incX); @@ -44,7 +46,7 @@ void run_test(const combination_t combi) { fill_random(y_v); // Output - scalar_t out_s = 0; + scalar_t out_s = 0.0; // Reference implementation auto out_cpu_s = @@ -91,7 +93,8 @@ void run_test(const combination_t combi) { index_t size; index_t incX; index_t incY; - std::tie(alloc, api, size, incX, incY) = combi; + scalar_t unused; + std::tie(alloc, api, size, incX, incY, unused) = combi; if (alloc == "usm") { // usm alloc #ifdef SB_ENABLE_USM @@ -111,9 +114,10 @@ const auto combi = ::testing::Values(api_type::async, api_type::sync), // Api ::testing::Values(11, 65, 1002, - 1002400), // size - ::testing::Values(1, 4), // incX - ::testing::Values(1, 3) // incY + 1002400), // size + ::testing::Values(1, 4), // incX + ::testing::Values(1, 3), // incY + ::testing::Values(0) // unused ); #else template @@ -123,7 +127,8 @@ const auto combi = api_type::sync), // Api ::testing::Values(11, 1002), // size ::testing::Values(1, 4), // incX - ::testing::Values(1, 3) // incY + ::testing::Values(1, 3), // incY + ::testing::Values(0) // unused ); #endif @@ -133,7 +138,8 @@ static std::string generate_name( std::string alloc; api_type api; int size, incX, incY; - BLAS_GENERATE_NAME(info.param, alloc, api, size, incX, incY); + T unused; + BLAS_GENERATE_NAME(info.param, alloc, api, size, incX, incY, unused); } BLAS_REGISTER_TEST_ALL(Dot, combination_t, combi, generate_name); diff --git a/test/unittest/blas1/blas1_rot_test.cpp b/test/unittest/blas1/blas1_rot_test.cpp index 1a54672e4..1a991a3f5 100644 --- a/test/unittest/blas1/blas1_rot_test.cpp +++ b/test/unittest/blas1/blas1_rot_test.cpp @@ -61,6 +61,7 @@ void run_test(const combination_t combi) { s_d); auto out_cpu_s = reference_blas::dot(size, a_cpu_v.data(), incX, b_cpu_v.data(), incY); + scalar_t init_out_gpu = 0; // SYCL implementation auto q = make_queue(); @@ -77,8 +78,10 @@ void run_test(const combination_t combi) { auto c = static_cast(c_d); auto s = static_cast(s_d); + auto init_copy = helper::copy_to_device(q, &init_out_gpu, gpu_out_s, 1); auto rot_event = _rot(sb_handle, size, gpu_a_v, incX, gpu_b_v, incY, c, s, {copy_a, copy_b}); + sb_handle.wait(init_copy); auto dot_event = _dot(sb_handle, size, gpu_a_v, incX, gpu_b_v, incY, gpu_out_s, {rot_event}); sb_handle.wait(dot_event); diff --git a/test/unittest/blas1/blas1_sdsdot_test.cpp b/test/unittest/blas1/blas1_sdsdot_test.cpp index b4b4b6ac6..b45b5c406 100644 --- a/test/unittest/blas1/blas1_sdsdot_test.cpp +++ b/test/unittest/blas1/blas1_sdsdot_test.cpp @@ -53,7 +53,7 @@ void run_test(const combination_t combi) { fill_random(y_v); // Output scalar - scalar_t out_s = 10.0; + scalar_t out_s = 0.0; // Reference implementation auto out_cpu_s = @@ -127,7 +127,7 @@ const auto combi = ::testing::Combine( ::testing::Values("usm", "buf"), // allocation type ::testing::Values(api_type::async, api_type::sync), // Api ::testing::Values(11, 65, 1002, 1002400), // N - ::testing::Values(9.5f, 0.5f), // sb + ::testing::Values(9.5f, 0.5f), // sb ::testing::Values(1, 4), // incX ::testing::Values(1, 3) // incY ); @@ -137,7 +137,7 @@ const auto combi = ::testing::Combine( ::testing::Values("usm", "buf"), // allocation type ::testing::Values(api_type::async, api_type::sync), // Api ::testing::Values(11, 1002, 0), // N - ::testing::Values(9.5f, 0.5f, 0.0f), // sb + ::testing::Values(9.5f, 0.5f, 0.0f), // sb ::testing::Values(1, 4), // incX ::testing::Values(1, 3) // incY