Skip to content

Commit

Permalink
Fixes issue with operators using negative increments with USM (#450)
Browse files Browse the repository at this point in the history
* Fix negative increment

* Fix build with computecpp

* Update dpcpp

* Revert dpcpp version

* Fix sync call

* Add a pointer to the first element of the vector

* Fixes issue with dependencies in nrm2 (#451)

* Fix nrm2

* Remove cout

* Fixes issue with dependencies in TBMV (#454)

This PR fixes an issue with dependencies in TBMV (similar to the fix in #451)
  • Loading branch information
aacostadiaz committed Aug 11, 2023
1 parent a7cf3d5 commit 1606dd5
Show file tree
Hide file tree
Showing 9 changed files with 99 additions and 48 deletions.
12 changes: 8 additions & 4 deletions include/views/view.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,10 @@ struct VectorView {
index_t size_;
increment_t strd_; // never size_t, because it could be negative

// Start of the vector.
// If stride is negative, start at the end of the vector and move backward.
container_t ptr_;

VectorView(view_container_t data, view_increment_t strd, view_index_t size);
VectorView(VectorView<view_container_t, view_index_t, view_increment_t> opV,
view_increment_t strd, view_index_t size);
Expand Down Expand Up @@ -96,13 +100,13 @@ struct VectorView {
template <bool use_as_ptr = false>
SYCL_BLAS_INLINE typename std::enable_if<!use_as_ptr, value_t &>::type eval(
index_t i) {
return (strd_ == 1) ? *(data_ + i) : *(data_ + i * strd_);
return (strd_ == 1) ? *(ptr_ + i) : *(ptr_ + i * strd_);
}

template <bool use_as_ptr = false>
SYCL_BLAS_INLINE typename std::enable_if<!use_as_ptr, value_t>::type eval(
index_t i) const {
return (strd_ == 1) ? *(data_ + i) : *(data_ + i * strd_);
return (strd_ == 1) ? *(ptr_ + i) : *(ptr_ + i * strd_);
}

SYCL_BLAS_INLINE value_t &eval(cl::sycl::nd_item<1> ndItem) {
Expand All @@ -116,13 +120,13 @@ struct VectorView {
template <bool use_as_ptr = false>
SYCL_BLAS_INLINE typename std::enable_if<use_as_ptr, value_t &>::type eval(
index_t indx) {
return *(data_ + indx);
return *(ptr_ + indx);
}

template <bool use_as_ptr = false>
SYCL_BLAS_INLINE typename std::enable_if<use_as_ptr, value_t>::type eval(
index_t indx) const noexcept {
return *(data_ + indx);
return *(ptr_ + indx);
}
};

Expand Down
7 changes: 4 additions & 3 deletions src/interface/blas1_interface.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -327,10 +327,10 @@ typename sb_handle_t::event_t _nrm2(
const auto nWG = 2 * localSize;
auto assignOp =
make_assign_reduction<AddOperator>(rs, prdOp, localSize, localSize * nWG);
auto ret0 = sb_handle.execute(assignOp);
auto ret0 = sb_handle.execute(assignOp, _dependencies);
auto sqrtOp = make_op<UnaryOp, SqrtOperator>(rs);
auto assignOpFinal = make_op<Assign>(rs, sqrtOp);
auto ret1 = sb_handle.execute(assignOpFinal, _dependencies);
auto ret1 = sb_handle.execute(assignOpFinal, ret0);
return blas::concatenate_vectors(ret0, ret1);
}

Expand Down Expand Up @@ -776,7 +776,8 @@ typename ValueType<container_t>::type _asum(
auto gpu_res = blas::helper::allocate < is_usm ? helper::AllocType::usm
: helper::AllocType::buffer,
element_t > (static_cast<index_t>(1), sb_handle.get_queue());
blas::internal::_asum(sb_handle, _N, _vx, _incx, gpu_res, _dependencies);
auto asum_event = blas::internal::_asum(sb_handle, _N, _vx, _incx, gpu_res, _dependencies);
sb_handle.wait(asum_event);
auto event =
blas::helper::copy_to_host(sb_handle.get_queue(), gpu_res, res.data(), 1);
sb_handle.wait(event);
Expand Down
4 changes: 2 additions & 2 deletions src/interface/blas2_interface.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -662,8 +662,8 @@ typename sb_handle_t::event_t _tbmv_impl(
global_size, _dependencies);

auto assignOp = make_op<Assign>(vx, vres);
auto ret = concatenate_vectors(
tbmvEvent, sb_handle.execute(assignOp, local_range, _dependencies));
auto assignEvent = sb_handle.execute(assignOp, local_range, tbmvEvent);
auto ret = concatenate_vectors(tbmvEvent, assignEvent);

blas::helper::enqueue_deallocate(ret, res_buffer, sb_handle.get_queue());

Expand Down
4 changes: 2 additions & 2 deletions src/views/view.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ SYCL_BLAS_INLINE VectorView<_container_t, _IndexType,
_IncrementType>::VectorView(_container_t data,
_IncrementType strd,
_IndexType size)
: data_(data), size_(size), strd_(strd) {}
: data_(data), size_(size), strd_(strd), ptr_(strd > 0 ? data_ : data_ + (size_ - 1) * (-strd_)) {}

/*!
@brief Creates a view from an existing view.
Expand All @@ -55,7 +55,7 @@ SYCL_BLAS_INLINE
VectorView<_container_t, _IndexType, _IncrementType>::VectorView(
VectorView<_container_t, _IndexType, _IncrementType> opV,
_IncrementType strd, _IndexType size)
: data_(opV.get_data()), size_(size), strd_(strd) {}
: data_(opV.get_data()), size_(size), strd_(strd), ptr_(strd > 0 ? data_ : data_ + (size_ - 1) * (-strd_)) {}

/*!
* @brief Returns a reference to the container
Expand Down
18 changes: 16 additions & 2 deletions test/blas_test.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,14 @@ template <class T, class... Args>
inline void generate_name_helper(std::ostream &ss, T arg, Args... args) {
auto token = strtok(nullptr, ", ");
ss << "__" << token << "_";
dump_arg(ss, arg);
if constexpr (std::is_arithmetic<T>::value) {
if (arg < 0) {
ss << "minus_";
}
dump_arg(ss, std::abs<T>(arg));
} else {
dump_arg(ss, arg);
}
generate_name_helper(ss, args...);
}

Expand All @@ -325,7 +332,14 @@ inline std::string generate_name_helper(char *str_args, T arg, Args... args) {
std::stringstream ss;
auto token = strtok(str_args, ", ");
ss << token << "_";
dump_arg(ss, arg);
if constexpr (std::is_arithmetic<T>::value) {
if (arg < 0) {
ss << "minus_";
}
dump_arg(ss, std::abs<T>(arg));
} else {
dump_arg(ss, arg);
}
generate_name_helper(ss, args...);
return ss.str();
}
Expand Down
4 changes: 2 additions & 2 deletions test/unittest/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ include_directories(${SYCLBLAS_TEST} ${BLAS_INCLUDE_DIRS})
# compiling tests
set(SYCL_UNITTEST_SRCS
# Blas 1 tests
${SYCLBLAS_UNITTEST}/blas1/blas1_asum_test.cpp
${SYCLBLAS_UNITTEST}/blas1/blas1_axpy_test.cpp
${SYCLBLAS_UNITTEST}/blas1/blas1_copy_test.cpp
${SYCLBLAS_UNITTEST}/blas1/blas1_scal_test.cpp
Expand All @@ -38,6 +39,7 @@ set(SYCL_UNITTEST_SRCS
${SYCLBLAS_UNITTEST}/blas1/blas1_rotmg_test.cpp
${SYCLBLAS_UNITTEST}/blas1/blas1_rotg_test.cpp
${SYCLBLAS_UNITTEST}/blas1/blas1_sdsdot_test.cpp
${SYCLBLAS_UNITTEST}/blas1/blas1_nrm2_test.cpp
# # Blas 2 tests
${SYCLBLAS_UNITTEST}/blas2/blas2_gbmv_test.cpp
${SYCLBLAS_UNITTEST}/blas2/blas2_gemv_test.cpp
Expand Down Expand Up @@ -81,9 +83,7 @@ if(is_computecpp)
set(SYCL_UNITTEST_SRCS ${SYCL_UNITTEST_SRCS}
# Blas 1 tests
${SYCLBLAS_UNITTEST}/blas1/blas1_swap_test.cpp
${SYCLBLAS_UNITTEST}/blas1/blas1_asum_test.cpp
${SYCLBLAS_UNITTEST}/blas1/blas1_dot_test.cpp
${SYCLBLAS_UNITTEST}/blas1/blas1_nrm2_test.cpp
${SYCLBLAS_UNITTEST}/blas1/blas1_iamax_test.cpp
${SYCLBLAS_UNITTEST}/blas1/blas1_iamin_test.cpp
# Blas 2 tests
Expand Down
38 changes: 27 additions & 11 deletions test/unittest/blas1/blas1_asum_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,18 +26,21 @@
#include "blas_test.hpp"

template <typename scalar_t>
using combination_t = std::tuple<std::string, api_type, int, int>;
using combination_t = std::tuple<std::string, api_type, int, int, scalar_t>;

template <typename scalar_t, helper::AllocType mem_alloc>
void run_test(const combination_t<scalar_t> combi) {
std::string alloc;
api_type api;
index_t size;
index_t incX;
std::tie(alloc, api, size, incX) = combi;
scalar_t unused;
std::tie(alloc, api, size, incX, unused) = combi;

auto vector_size = size * std::abs(incX);

// Input vector
std::vector<scalar_t> x_v(size * incX);
std::vector<scalar_t> x_v(vector_size);
fill_random<scalar_t>(x_v);

// We need to guarantee that cl::sycl::half can hold the sum
Expand All @@ -47,18 +50,28 @@ void run_test(const combination_t<scalar_t> combi) {

// Output scalar
scalar_t out_s = 0;
scalar_t out_cpu_s;

// Reference implementation
scalar_t out_cpu_s = reference_blas::asum(size, x_v.data(), incX);
if (incX < 0) {
// Some reference implementations of BLAS do not support negative
// increments for asum. To simulate what is specified in the
// oneAPI spec, invert the vector and use a positive increment.
std::vector<scalar_t> x_v_inv(vector_size);
std::reverse_copy(x_v.begin(), x_v.end() + (incX + 1), x_v_inv.begin());
out_cpu_s = reference_blas::asum(size, x_v_inv.data(), -incX);
} else {
out_cpu_s = reference_blas::asum(size, x_v.data(), incX);
}

// SYCL implementation
auto q = make_queue();
blas::SB_Handle sb_handle(q);

// Iterators
auto gpu_x_v = helper::allocate<mem_alloc, scalar_t>(size * incX, q);
auto gpu_x_v = helper::allocate<mem_alloc, scalar_t>(vector_size, q);
auto copy_x =
helper::copy_to_device<scalar_t>(q, x_v.data(), gpu_x_v, size * incX);
helper::copy_to_device<scalar_t>(q, x_v.data(), gpu_x_v, vector_size);

if (api == api_type::async) {
auto gpu_out_s = helper::allocate<mem_alloc, scalar_t>(1, q);
Expand Down Expand Up @@ -87,7 +100,8 @@ static void run_test(const combination_t<scalar_t> combi) {
api_type api;
index_t size;
index_t incX;
std::tie(alloc, api, size, incX) = combi;
scalar_t unused;
std::tie(alloc, api, size, incX, unused) = combi;

if (alloc == "usm") {
#ifdef SB_ENABLE_USM
Expand All @@ -102,12 +116,13 @@ static void run_test(const combination_t<scalar_t> combi) {

template <typename scalar_t>
const auto combi =
::testing::Combine(::testing::Values("usm", "buf"), // allocation type
::testing::Combine(::testing::Values("usm", "buf"), // allocation type
::testing::Values(api_type::async,
api_type::sync), // Api
::testing::Values(11, 65, 10000,
1002400), // size
::testing::Values(1, 4) // incX
1002400), // size
::testing::Values(1, 4, -1, -3), // incX
::testing::Values(0) // unused
);

template <class T>
Expand All @@ -116,7 +131,8 @@ static std::string generate_name(
std::string alloc;
api_type api;
int size, incX;
BLAS_GENERATE_NAME(info.param, alloc, api, size, incX);
T unused;
BLAS_GENERATE_NAME(info.param, alloc, api, size, incX, unused);
}

BLAS_REGISTER_TEST_ALL(Asum, combination_t, combi, generate_name);
24 changes: 13 additions & 11 deletions test/unittest/blas1/blas1_axpy_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,15 @@ void run_test(const combination_t<scalar_t> combi) {
index_t incY;
std::tie(alloc, size, alpha, incX, incY) = combi;

auto x_size = size * std::abs(incX);
auto y_size = size * std::abs(incY);
// Input vector
std::vector<scalar_t> x_v(size * incX);
std::vector<scalar_t> x_v(x_size);
fill_random(x_v);

// Output vector
std::vector<scalar_t> y_v(size * incY, 10.0);
std::vector<scalar_t> y_cpu_v(size * incY, 10.0);
std::vector<scalar_t> y_v(y_size, 10.0);
std::vector<scalar_t> y_cpu_v(y_size, 10.0);

// Reference implementation
reference_blas::axpy(size, alpha, x_v.data(), incX, y_cpu_v.data(), incY);
Expand All @@ -53,17 +55,17 @@ void run_test(const combination_t<scalar_t> combi) {
blas::SB_Handle sb_handle(q);

// Iterators
auto gpu_x_v = helper::allocate<mem_alloc, scalar_t>(size * incX, q);
auto gpu_y_v = helper::allocate<mem_alloc, scalar_t>(size * incY, q);
auto gpu_x_v = helper::allocate<mem_alloc, scalar_t>(x_size, q);
auto gpu_y_v = helper::allocate<mem_alloc, scalar_t>(y_size, q);

auto copy_x = helper::copy_to_device(q, x_v.data(), gpu_x_v, size * incX);
auto copy_y = helper::copy_to_device(q, y_v.data(), gpu_y_v, size * incY);
auto copy_x = helper::copy_to_device(q, x_v.data(), gpu_x_v, x_size);
auto copy_y = helper::copy_to_device(q, y_v.data(), gpu_y_v, y_size);

auto axpy_event = _axpy(sb_handle, size, alpha, gpu_x_v, incX, gpu_y_v, incY,
{copy_x, copy_y});
sb_handle.wait(axpy_event);

auto event = helper::copy_to_host(q, gpu_y_v, y_v.data(), size * incY);
auto event = helper::copy_to_host(q, gpu_y_v, y_v.data(), y_size);
sb_handle.wait(event);

// Validate the result
Expand Down Expand Up @@ -108,9 +110,9 @@ template <typename scalar_t>
const auto combi =
::testing::Combine(::testing::Values("usm", "buf"), // allocation type
::testing::Values(11, 1002), // size
::testing::Values<scalar_t>(0.0, 1.5), // alpha
::testing::Values(1, 4), // incX
::testing::Values(1, 3) // incY
::testing::Values<scalar_t>(0.0, 1.0), // alpha
::testing::Values(1, 4, -1, -3), // incX
::testing::Values(1, 3, -2) // incY
);
#endif

Expand Down
36 changes: 25 additions & 11 deletions test/unittest/blas1/blas1_nrm2_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,35 +26,47 @@
#include "blas_test.hpp"

template <typename scalar_t>
using combination_t = std::tuple<std::string, api_type, int, int>;
using combination_t = std::tuple<std::string, api_type, int, int, scalar_t>;

template <typename scalar_t, helper::AllocType mem_alloc>
void run_test(const combination_t<scalar_t> combi) {
std::string alloc;
api_type api;
index_t size;
index_t incX;
std::tie(alloc, api, size, incX) = combi;
scalar_t unused;
std::tie(alloc, api, size, incX, unused) = combi;

auto vector_size = size * std::abs(incX);
// Input vectors
std::vector<scalar_t> x_v(size * incX);
std::vector<scalar_t> x_v(vector_size);
fill_random(x_v);

// Output scalar
scalar_t out_s = 10.0;
scalar_t out_cpu_s = 20.0;

// Reference implementation
auto out_cpu_s = reference_blas::nrm2(size, x_v.data(), incX);
if (incX < 0) {
// Some reference implementations of BLAS do not support negative
// increments for nrm2. To simulate what is specified in the
// oneAPI spec, invert the vector and use a positive increment.
std::vector<scalar_t> x_v_inv(vector_size);
std::reverse_copy(x_v.begin(), x_v.end() + (incX + 1), x_v_inv.begin());
out_cpu_s = reference_blas::nrm2(size, x_v_inv.data(), -incX);
} else {
out_cpu_s = reference_blas::nrm2(size, x_v.data(), incX);
}

// SYCL implementation
auto q = make_queue();
blas::SB_Handle sb_handle(q);

// Iterators
auto gpu_x_v = blas::helper::allocate<mem_alloc, scalar_t>(size * incX, q);
auto gpu_x_v = blas::helper::allocate<mem_alloc, scalar_t>(vector_size, q);

auto copy_x =
blas::helper::copy_to_device(q, x_v.data(), gpu_x_v, size * incX);
blas::helper::copy_to_device(q, x_v.data(), gpu_x_v, vector_size);

if (api == api_type::async) {
auto gpu_out_s = blas::helper::allocate<mem_alloc, scalar_t>(1, q);
Expand Down Expand Up @@ -84,7 +96,8 @@ void run_test(const combination_t<scalar_t> combi) {
api_type api;
index_t size;
index_t incX;
std::tie(alloc, api, size, incX) = combi;
scalar_t unused;
std::tie(alloc, api, size, incX, unused) = combi;

if (alloc == "usm") { // usm alloc
#ifdef SB_ENABLE_USM
Expand All @@ -98,20 +111,21 @@ void run_test(const combination_t<scalar_t> combi) {
}
template <typename scalar_t>
const auto combi =
::testing::Combine(::testing::Values("usm", "buf"), // allocation type
::testing::Combine(::testing::Values("usm", "buf"), // allocation type
::testing::Values(api_type::async,
api_type::sync), // Api
::testing::Values(11, 1002), // size
::testing::Values(1, 4) // incX
);
::testing::Values(1, 4, -3), // incX
::testing::Values(scalar_t{1}));

template <class T>
static std::string generate_name(
const ::testing::TestParamInfo<combination_t<T>>& info) {
std::string alloc;
api_type api;
int size, incX;
BLAS_GENERATE_NAME(info.param, alloc, api, size, incX);
T unused;
BLAS_GENERATE_NAME(info.param, alloc, api, size, incX, unused);
}

BLAS_REGISTER_TEST_ALL(Nrm2, combination_t, combi, generate_name);

0 comments on commit 1606dd5

Please sign in to comment.