diff --git a/src/interface/blas1_interface.hpp b/src/interface/blas1_interface.hpp index 866588d68..d4393159e 100644 --- a/src/interface/blas1_interface.hpp +++ b/src/interface/blas1_interface.hpp @@ -332,10 +332,10 @@ typename sb_handle_t::event_t _nrm2( const auto nWG = 2 * localSize; auto assignOp = make_assign_reduction(rs, prdOp, localSize, localSize * nWG); - auto ret0 = sb_handle.execute(assignOp); + auto ret0 = sb_handle.execute(assignOp, _dependencies); auto sqrtOp = make_op(rs); auto assignOpFinal = make_op(rs, sqrtOp); - auto ret1 = sb_handle.execute(assignOpFinal, _dependencies); + auto ret1 = sb_handle.execute(assignOpFinal, ret0); return blas::concatenate_vectors(ret0, ret1); } diff --git a/src/interface/blas2_interface.hpp b/src/interface/blas2_interface.hpp index 21dae4b34..acd7fe736 100644 --- a/src/interface/blas2_interface.hpp +++ b/src/interface/blas2_interface.hpp @@ -661,8 +661,8 @@ typename sb_handle_t::event_t _tbmv_impl( global_size, _dependencies); auto assignOp = make_op(vx, vres); - auto ret = concatenate_vectors( - tbmvEvent, sb_handle.execute(assignOp, local_range, _dependencies)); + auto assignEvent = sb_handle.execute(assignOp, local_range, tbmvEvent); + auto ret = concatenate_vectors(tbmvEvent, assignEvent); blas::helper::enqueue_deallocate(ret, res_buffer, sb_handle.get_queue()); diff --git a/test/unittest/CMakeLists.txt b/test/unittest/CMakeLists.txt index f76b6218c..c4b860432 100644 --- a/test/unittest/CMakeLists.txt +++ b/test/unittest/CMakeLists.txt @@ -39,6 +39,7 @@ set(SYCL_UNITTEST_SRCS ${SYCLBLAS_UNITTEST}/blas1/blas1_rotmg_test.cpp ${SYCLBLAS_UNITTEST}/blas1/blas1_rotg_test.cpp ${SYCLBLAS_UNITTEST}/blas1/blas1_sdsdot_test.cpp + ${SYCLBLAS_UNITTEST}/blas1/blas1_nrm2_test.cpp # # Blas 2 tests ${SYCLBLAS_UNITTEST}/blas2/blas2_gbmv_test.cpp ${SYCLBLAS_UNITTEST}/blas2/blas2_gemv_test.cpp @@ -72,7 +73,6 @@ if(is_computecpp) # Blas 1 tests ${SYCLBLAS_UNITTEST}/blas1/blas1_swap_test.cpp ${SYCLBLAS_UNITTEST}/blas1/blas1_dot_test.cpp - ${SYCLBLAS_UNITTEST}/blas1/blas1_nrm2_test.cpp ${SYCLBLAS_UNITTEST}/blas1/blas1_iamax_test.cpp ${SYCLBLAS_UNITTEST}/blas1/blas1_iamin_test.cpp # Blas 2 tests diff --git a/test/unittest/blas1/blas1_nrm2_test.cpp b/test/unittest/blas1/blas1_nrm2_test.cpp index 4efd61725..383fab54c 100644 --- a/test/unittest/blas1/blas1_nrm2_test.cpp +++ b/test/unittest/blas1/blas1_nrm2_test.cpp @@ -26,7 +26,7 @@ #include "blas_test.hpp" template -using combination_t = std::tuple; +using combination_t = std::tuple; template void run_test(const combination_t combi) { @@ -34,27 +34,39 @@ void run_test(const combination_t combi) { api_type api; index_t size; index_t incX; - std::tie(alloc, api, size, incX) = combi; + scalar_t unused; + std::tie(alloc, api, size, incX, unused) = combi; + auto vector_size = size * std::abs(incX); // Input vectors - std::vector x_v(size * incX); + std::vector x_v(vector_size); fill_random(x_v); // Output scalar scalar_t out_s = 10.0; + scalar_t out_cpu_s = 20.0; // Reference implementation - auto out_cpu_s = reference_blas::nrm2(size, x_v.data(), incX); + if (incX < 0) { + // Some reference implementations of BLAS do not support negative + // increments for nrm2. To simulate what is specified in the + // oneAPI spec, invert the vector and use a positive increment. + std::vector x_v_inv(vector_size); + std::reverse_copy(x_v.begin(), x_v.end() + (incX + 1), x_v_inv.begin()); + out_cpu_s = reference_blas::nrm2(size, x_v_inv.data(), -incX); + } else { + out_cpu_s = reference_blas::nrm2(size, x_v.data(), incX); + } // SYCL implementation auto q = make_queue(); blas::SB_Handle sb_handle(q); // Iterators - auto gpu_x_v = blas::helper::allocate(size * incX, q); + auto gpu_x_v = blas::helper::allocate(vector_size, q); auto copy_x = - blas::helper::copy_to_device(q, x_v.data(), gpu_x_v, size * incX); + blas::helper::copy_to_device(q, x_v.data(), gpu_x_v, vector_size); if (api == api_type::async) { auto gpu_out_s = blas::helper::allocate(1, q); @@ -84,7 +96,8 @@ void run_test(const combination_t combi) { api_type api; index_t size; index_t incX; - std::tie(alloc, api, size, incX) = combi; + scalar_t unused; + std::tie(alloc, api, size, incX, unused) = combi; if (alloc == "usm") { // usm alloc #ifdef SB_ENABLE_USM @@ -98,12 +111,12 @@ void run_test(const combination_t combi) { } template const auto combi = - ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type ::testing::Values(api_type::async, api_type::sync), // Api ::testing::Values(11, 1002), // size - ::testing::Values(1, 4) // incX - ); + ::testing::Values(1, 4, -3), // incX + ::testing::Values(scalar_t{1})); template static std::string generate_name( @@ -111,7 +124,8 @@ static std::string generate_name( std::string alloc; api_type api; int size, incX; - BLAS_GENERATE_NAME(info.param, alloc, api, size, incX); + T unused; + BLAS_GENERATE_NAME(info.param, alloc, api, size, incX, unused); } BLAS_REGISTER_TEST_ALL(Nrm2, combination_t, combi, generate_name);