diff --git a/include/views/view.h b/include/views/view.h index bd6ed5bda..6e3b6418c 100644 --- a/include/views/view.h +++ b/include/views/view.h @@ -56,6 +56,10 @@ struct VectorView { index_t size_; increment_t strd_; // never size_t, because it could be negative + // Start of the vector. + // If stride is negative, start at the end of the vector and move backward. + container_t ptr_; + VectorView(view_container_t data, view_increment_t strd, view_index_t size); VectorView(VectorView opV, view_increment_t strd, view_index_t size); @@ -96,13 +100,13 @@ struct VectorView { template SYCL_BLAS_INLINE typename std::enable_if::type eval( index_t i) { - return (strd_ == 1) ? *(data_ + i) : *(data_ + i * strd_); + return (strd_ == 1) ? *(ptr_ + i) : *(ptr_ + i * strd_); } template SYCL_BLAS_INLINE typename std::enable_if::type eval( index_t i) const { - return (strd_ == 1) ? *(data_ + i) : *(data_ + i * strd_); + return (strd_ == 1) ? *(ptr_ + i) : *(ptr_ + i * strd_); } SYCL_BLAS_INLINE value_t &eval(cl::sycl::nd_item<1> ndItem) { @@ -116,13 +120,13 @@ struct VectorView { template SYCL_BLAS_INLINE typename std::enable_if::type eval( index_t indx) { - return *(data_ + indx); + return *(ptr_ + indx); } template SYCL_BLAS_INLINE typename std::enable_if::type eval( index_t indx) const noexcept { - return *(data_ + indx); + return *(ptr_ + indx); } }; diff --git a/src/interface/blas1_interface.hpp b/src/interface/blas1_interface.hpp index 8f82372e3..a2bae504e 100644 --- a/src/interface/blas1_interface.hpp +++ b/src/interface/blas1_interface.hpp @@ -327,10 +327,10 @@ typename sb_handle_t::event_t _nrm2( const auto nWG = 2 * localSize; auto assignOp = make_assign_reduction(rs, prdOp, localSize, localSize * nWG); - auto ret0 = sb_handle.execute(assignOp); + auto ret0 = sb_handle.execute(assignOp, _dependencies); auto sqrtOp = make_op(rs); auto assignOpFinal = make_op(rs, sqrtOp); - auto ret1 = sb_handle.execute(assignOpFinal, _dependencies); + auto ret1 = sb_handle.execute(assignOpFinal, ret0); return blas::concatenate_vectors(ret0, ret1); } @@ -776,7 +776,8 @@ typename ValueType::type _asum( auto gpu_res = blas::helper::allocate < is_usm ? helper::AllocType::usm : helper::AllocType::buffer, element_t > (static_cast(1), sb_handle.get_queue()); - blas::internal::_asum(sb_handle, _N, _vx, _incx, gpu_res, _dependencies); + auto asum_event = blas::internal::_asum(sb_handle, _N, _vx, _incx, gpu_res, _dependencies); + sb_handle.wait(asum_event); auto event = blas::helper::copy_to_host(sb_handle.get_queue(), gpu_res, res.data(), 1); sb_handle.wait(event); diff --git a/src/interface/blas2_interface.hpp b/src/interface/blas2_interface.hpp index 2a45491b4..94cde47b5 100644 --- a/src/interface/blas2_interface.hpp +++ b/src/interface/blas2_interface.hpp @@ -662,8 +662,8 @@ typename sb_handle_t::event_t _tbmv_impl( global_size, _dependencies); auto assignOp = make_op(vx, vres); - auto ret = concatenate_vectors( - tbmvEvent, sb_handle.execute(assignOp, local_range, _dependencies)); + auto assignEvent = sb_handle.execute(assignOp, local_range, tbmvEvent); + auto ret = concatenate_vectors(tbmvEvent, assignEvent); blas::helper::enqueue_deallocate(ret, res_buffer, sb_handle.get_queue()); diff --git a/src/views/view.hpp b/src/views/view.hpp index 8dcadd51f..077bf1c71 100644 --- a/src/views/view.hpp +++ b/src/views/view.hpp @@ -45,7 +45,7 @@ SYCL_BLAS_INLINE VectorView<_container_t, _IndexType, _IncrementType>::VectorView(_container_t data, _IncrementType strd, _IndexType size) - : data_(data), size_(size), strd_(strd) {} + : data_(data), size_(size), strd_(strd), ptr_(strd > 0 ? data_ : data_ + (size_ - 1) * (-strd_)) {} /*! @brief Creates a view from an existing view. @@ -55,7 +55,7 @@ SYCL_BLAS_INLINE VectorView<_container_t, _IndexType, _IncrementType>::VectorView( VectorView<_container_t, _IndexType, _IncrementType> opV, _IncrementType strd, _IndexType size) - : data_(opV.get_data()), size_(size), strd_(strd) {} + : data_(opV.get_data()), size_(size), strd_(strd), ptr_(strd > 0 ? data_ : data_ + (size_ - 1) * (-strd_)) {} /*! * @brief Returns a reference to the container diff --git a/test/blas_test.hpp b/test/blas_test.hpp index fdb43b31d..60ca065df 100644 --- a/test/blas_test.hpp +++ b/test/blas_test.hpp @@ -307,7 +307,14 @@ template inline void generate_name_helper(std::ostream &ss, T arg, Args... args) { auto token = strtok(nullptr, ", "); ss << "__" << token << "_"; - dump_arg(ss, arg); + if constexpr (std::is_arithmetic::value) { + if (arg < 0) { + ss << "minus_"; + } + dump_arg(ss, std::abs(arg)); + } else { + dump_arg(ss, arg); + } generate_name_helper(ss, args...); } @@ -325,7 +332,14 @@ inline std::string generate_name_helper(char *str_args, T arg, Args... args) { std::stringstream ss; auto token = strtok(str_args, ", "); ss << token << "_"; - dump_arg(ss, arg); + if constexpr (std::is_arithmetic::value) { + if (arg < 0) { + ss << "minus_"; + } + dump_arg(ss, std::abs(arg)); + } else { + dump_arg(ss, arg); + } generate_name_helper(ss, args...); return ss.str(); } diff --git a/test/unittest/CMakeLists.txt b/test/unittest/CMakeLists.txt index 1f09057e3..39023d0fc 100644 --- a/test/unittest/CMakeLists.txt +++ b/test/unittest/CMakeLists.txt @@ -30,6 +30,7 @@ include_directories(${SYCLBLAS_TEST} ${BLAS_INCLUDE_DIRS}) # compiling tests set(SYCL_UNITTEST_SRCS # Blas 1 tests + ${SYCLBLAS_UNITTEST}/blas1/blas1_asum_test.cpp ${SYCLBLAS_UNITTEST}/blas1/blas1_axpy_test.cpp ${SYCLBLAS_UNITTEST}/blas1/blas1_copy_test.cpp ${SYCLBLAS_UNITTEST}/blas1/blas1_scal_test.cpp @@ -38,6 +39,7 @@ set(SYCL_UNITTEST_SRCS ${SYCLBLAS_UNITTEST}/blas1/blas1_rotmg_test.cpp ${SYCLBLAS_UNITTEST}/blas1/blas1_rotg_test.cpp ${SYCLBLAS_UNITTEST}/blas1/blas1_sdsdot_test.cpp + ${SYCLBLAS_UNITTEST}/blas1/blas1_nrm2_test.cpp # # Blas 2 tests ${SYCLBLAS_UNITTEST}/blas2/blas2_gbmv_test.cpp ${SYCLBLAS_UNITTEST}/blas2/blas2_gemv_test.cpp @@ -81,9 +83,7 @@ if(is_computecpp) set(SYCL_UNITTEST_SRCS ${SYCL_UNITTEST_SRCS} # Blas 1 tests ${SYCLBLAS_UNITTEST}/blas1/blas1_swap_test.cpp - ${SYCLBLAS_UNITTEST}/blas1/blas1_asum_test.cpp ${SYCLBLAS_UNITTEST}/blas1/blas1_dot_test.cpp - ${SYCLBLAS_UNITTEST}/blas1/blas1_nrm2_test.cpp ${SYCLBLAS_UNITTEST}/blas1/blas1_iamax_test.cpp ${SYCLBLAS_UNITTEST}/blas1/blas1_iamin_test.cpp # Blas 2 tests diff --git a/test/unittest/blas1/blas1_asum_test.cpp b/test/unittest/blas1/blas1_asum_test.cpp index 1dac65eaa..a92bdcdf9 100644 --- a/test/unittest/blas1/blas1_asum_test.cpp +++ b/test/unittest/blas1/blas1_asum_test.cpp @@ -26,7 +26,7 @@ #include "blas_test.hpp" template -using combination_t = std::tuple; +using combination_t = std::tuple; template void run_test(const combination_t combi) { @@ -34,10 +34,13 @@ void run_test(const combination_t combi) { api_type api; index_t size; index_t incX; - std::tie(alloc, api, size, incX) = combi; + scalar_t unused; + std::tie(alloc, api, size, incX, unused) = combi; + + auto vector_size = size * std::abs(incX); // Input vector - std::vector x_v(size * incX); + std::vector x_v(vector_size); fill_random(x_v); // We need to guarantee that cl::sycl::half can hold the sum @@ -47,18 +50,28 @@ void run_test(const combination_t combi) { // Output scalar scalar_t out_s = 0; + scalar_t out_cpu_s; // Reference implementation - scalar_t out_cpu_s = reference_blas::asum(size, x_v.data(), incX); + if (incX < 0) { + // Some reference implementations of BLAS do not support negative + // increments for asum. To simulate what is specified in the + // oneAPI spec, invert the vector and use a positive increment. + std::vector x_v_inv(vector_size); + std::reverse_copy(x_v.begin(), x_v.end() + (incX + 1), x_v_inv.begin()); + out_cpu_s = reference_blas::asum(size, x_v_inv.data(), -incX); + } else { + out_cpu_s = reference_blas::asum(size, x_v.data(), incX); + } // SYCL implementation auto q = make_queue(); blas::SB_Handle sb_handle(q); // Iterators - auto gpu_x_v = helper::allocate(size * incX, q); + auto gpu_x_v = helper::allocate(vector_size, q); auto copy_x = - helper::copy_to_device(q, x_v.data(), gpu_x_v, size * incX); + helper::copy_to_device(q, x_v.data(), gpu_x_v, vector_size); if (api == api_type::async) { auto gpu_out_s = helper::allocate(1, q); @@ -87,7 +100,8 @@ static void run_test(const combination_t combi) { api_type api; index_t size; index_t incX; - std::tie(alloc, api, size, incX) = combi; + scalar_t unused; + std::tie(alloc, api, size, incX, unused) = combi; if (alloc == "usm") { #ifdef SB_ENABLE_USM @@ -102,12 +116,13 @@ static void run_test(const combination_t combi) { template const auto combi = - ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type ::testing::Values(api_type::async, api_type::sync), // Api ::testing::Values(11, 65, 10000, - 1002400), // size - ::testing::Values(1, 4) // incX + 1002400), // size + ::testing::Values(1, 4, -1, -3), // incX + ::testing::Values(0) // unused ); template @@ -116,7 +131,8 @@ static std::string generate_name( std::string alloc; api_type api; int size, incX; - BLAS_GENERATE_NAME(info.param, alloc, api, size, incX); + T unused; + BLAS_GENERATE_NAME(info.param, alloc, api, size, incX, unused); } BLAS_REGISTER_TEST_ALL(Asum, combination_t, combi, generate_name); diff --git a/test/unittest/blas1/blas1_axpy_test.cpp b/test/unittest/blas1/blas1_axpy_test.cpp index cb395fb92..c76ea5ea8 100644 --- a/test/unittest/blas1/blas1_axpy_test.cpp +++ b/test/unittest/blas1/blas1_axpy_test.cpp @@ -37,13 +37,15 @@ void run_test(const combination_t combi) { index_t incY; std::tie(alloc, size, alpha, incX, incY) = combi; + auto x_size = size * std::abs(incX); + auto y_size = size * std::abs(incY); // Input vector - std::vector x_v(size * incX); + std::vector x_v(x_size); fill_random(x_v); // Output vector - std::vector y_v(size * incY, 10.0); - std::vector y_cpu_v(size * incY, 10.0); + std::vector y_v(y_size, 10.0); + std::vector y_cpu_v(y_size, 10.0); // Reference implementation reference_blas::axpy(size, alpha, x_v.data(), incX, y_cpu_v.data(), incY); @@ -53,17 +55,17 @@ void run_test(const combination_t combi) { blas::SB_Handle sb_handle(q); // Iterators - auto gpu_x_v = helper::allocate(size * incX, q); - auto gpu_y_v = helper::allocate(size * incY, q); + auto gpu_x_v = helper::allocate(x_size, q); + auto gpu_y_v = helper::allocate(y_size, q); - auto copy_x = helper::copy_to_device(q, x_v.data(), gpu_x_v, size * incX); - auto copy_y = helper::copy_to_device(q, y_v.data(), gpu_y_v, size * incY); + auto copy_x = helper::copy_to_device(q, x_v.data(), gpu_x_v, x_size); + auto copy_y = helper::copy_to_device(q, y_v.data(), gpu_y_v, y_size); auto axpy_event = _axpy(sb_handle, size, alpha, gpu_x_v, incX, gpu_y_v, incY, {copy_x, copy_y}); sb_handle.wait(axpy_event); - auto event = helper::copy_to_host(q, gpu_y_v, y_v.data(), size * incY); + auto event = helper::copy_to_host(q, gpu_y_v, y_v.data(), y_size); sb_handle.wait(event); // Validate the result @@ -108,9 +110,9 @@ template const auto combi = ::testing::Combine(::testing::Values("usm", "buf"), // allocation type ::testing::Values(11, 1002), // size - ::testing::Values(0.0, 1.5), // alpha - ::testing::Values(1, 4), // incX - ::testing::Values(1, 3) // incY + ::testing::Values(0.0, 1.0), // alpha + ::testing::Values(1, 4, -1, -3), // incX + ::testing::Values(1, 3, -2) // incY ); #endif diff --git a/test/unittest/blas1/blas1_nrm2_test.cpp b/test/unittest/blas1/blas1_nrm2_test.cpp index 4efd61725..383fab54c 100644 --- a/test/unittest/blas1/blas1_nrm2_test.cpp +++ b/test/unittest/blas1/blas1_nrm2_test.cpp @@ -26,7 +26,7 @@ #include "blas_test.hpp" template -using combination_t = std::tuple; +using combination_t = std::tuple; template void run_test(const combination_t combi) { @@ -34,27 +34,39 @@ void run_test(const combination_t combi) { api_type api; index_t size; index_t incX; - std::tie(alloc, api, size, incX) = combi; + scalar_t unused; + std::tie(alloc, api, size, incX, unused) = combi; + auto vector_size = size * std::abs(incX); // Input vectors - std::vector x_v(size * incX); + std::vector x_v(vector_size); fill_random(x_v); // Output scalar scalar_t out_s = 10.0; + scalar_t out_cpu_s = 20.0; // Reference implementation - auto out_cpu_s = reference_blas::nrm2(size, x_v.data(), incX); + if (incX < 0) { + // Some reference implementations of BLAS do not support negative + // increments for nrm2. To simulate what is specified in the + // oneAPI spec, invert the vector and use a positive increment. + std::vector x_v_inv(vector_size); + std::reverse_copy(x_v.begin(), x_v.end() + (incX + 1), x_v_inv.begin()); + out_cpu_s = reference_blas::nrm2(size, x_v_inv.data(), -incX); + } else { + out_cpu_s = reference_blas::nrm2(size, x_v.data(), incX); + } // SYCL implementation auto q = make_queue(); blas::SB_Handle sb_handle(q); // Iterators - auto gpu_x_v = blas::helper::allocate(size * incX, q); + auto gpu_x_v = blas::helper::allocate(vector_size, q); auto copy_x = - blas::helper::copy_to_device(q, x_v.data(), gpu_x_v, size * incX); + blas::helper::copy_to_device(q, x_v.data(), gpu_x_v, vector_size); if (api == api_type::async) { auto gpu_out_s = blas::helper::allocate(1, q); @@ -84,7 +96,8 @@ void run_test(const combination_t combi) { api_type api; index_t size; index_t incX; - std::tie(alloc, api, size, incX) = combi; + scalar_t unused; + std::tie(alloc, api, size, incX, unused) = combi; if (alloc == "usm") { // usm alloc #ifdef SB_ENABLE_USM @@ -98,12 +111,12 @@ void run_test(const combination_t combi) { } template const auto combi = - ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type ::testing::Values(api_type::async, api_type::sync), // Api ::testing::Values(11, 1002), // size - ::testing::Values(1, 4) // incX - ); + ::testing::Values(1, 4, -3), // incX + ::testing::Values(scalar_t{1})); template static std::string generate_name( @@ -111,7 +124,8 @@ static std::string generate_name( std::string alloc; api_type api; int size, incX; - BLAS_GENERATE_NAME(info.param, alloc, api, size, incX); + T unused; + BLAS_GENERATE_NAME(info.param, alloc, api, size, incX, unused); } BLAS_REGISTER_TEST_ALL(Nrm2, combination_t, combi, generate_name);