Skip to content

Commit

Permalink
Add USM support
Browse files Browse the repository at this point in the history
  • Loading branch information
aacostadiaz committed Aug 11, 2023
1 parent 48a5361 commit 37be9e4
Show file tree
Hide file tree
Showing 165 changed files with 6,697 additions and 3,738 deletions.
19 changes: 14 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -175,14 +175,23 @@ of multiple BLAS operations.

## API description

This section references all the supported operations and their interface.
This section references all the supported operations and their interface. The
library follows the [oneAPI MKL BLAS specification](https://spec.oneapi.io/versions/latest/elements/oneMKL/source/domains/blas/blas.html)
as a reference for its API. Both the USM and Buffer APIs are supported; however,
the group APIs for USM are not. Compiling the library with a mix of USM and
Buffer arguments is not supported; instead we stick to the aforementioned
reference specification.

All operations take as their first argument a reference to the SB_Handle, a
`blas::SB_Handle` created with a `sycl::queue`. The return value is usually an
array of SYCL events (except for some operations that can return a scalar or
`blas::SB_Handle` created with a `sycl::queue`. The last argument for all operators
is a vector of dependencies of type `cl::sycl::event` (empty by default). The return value
is usually an array of SYCL events (except for some operations that can return a scalar or
a tuple). The containers for the vectors and matrices (and scalars written by
the BLAS operations) are iterator buffers that can be created with
`make_sycl_iterator_buffer`.
the BLAS operations) can be either raw USM pointers or iterator buffers, created
with a call to `cl::sycl::malloc_device` or `make_sycl_iterator_buffer` respectively.

USM support in SYCL-BLAS is limited to device-allocated memory only; `shared` and
`host` USM allocations are not supported.

We recommend checking the [samples](samples) to get started with SYCL-BLAS. It
is better to be familiar with BLAS:
Expand Down
57 changes: 39 additions & 18 deletions benchmark/syclblas/blas1/asum.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
constexpr blas_benchmark::utils::Level1Op benchmark_op =
blas_benchmark::utils::Level1Op::asum;

template <typename scalar_t>
template <typename scalar_t, blas::helper::AllocType mem_alloc>
void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size,
bool* success) {
// initialize the state label
Expand All @@ -40,6 +40,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size,
size);

blas::SB_Handle& sb_handle = *sb_handle_ptr;
auto q = sb_handle.get_queue();

// Create data
std::vector<scalar_t> v1 = blas_benchmark::utils::random_data<scalar_t>(size);
Expand All @@ -49,19 +50,24 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size,
std::transform(std::begin(v1), std::end(v1), std::begin(v1),
[=](scalar_t x) { return x / v1.size(); });

scalar_t vr;
auto inx = blas::helper::allocate<mem_alloc, scalar_t>(size, q);
auto inr = blas::helper::allocate<mem_alloc, scalar_t>(1, q);

auto copy_x = blas::helper::copy_to_device<scalar_t>(q, v1.data(), inx, size);

auto inx = blas::make_sycl_iterator_buffer<scalar_t>(v1, size);
auto inr = blas::make_sycl_iterator_buffer<scalar_t>(&vr, 1);
sb_handle.wait({copy_x});

#ifdef BLAS_VERIFY_BENCHMARK
// Run a first time with a verification of the results
scalar_t vr_ref = reference_blas::asum(size, v1.data(), 1);
scalar_t vr_temp = 0;
{
auto vr_temp_gpu = blas::make_sycl_iterator_buffer<scalar_t>(&vr_temp, 1);
auto event = _asum(sb_handle, size, inx, 1, vr_temp_gpu);
sb_handle.wait(event);
auto vr_temp_gpu = blas::helper::allocate<mem_alloc, scalar_t>(1, q);
auto asum_event = _asum(sb_handle, size, inx, 1, vr_temp_gpu);
sb_handle.wait(asum_event);
auto copy_output = blas::helper::copy_to_host(q, vr_temp_gpu, &vr_temp, 1);
sb_handle.wait(copy_output);
blas::helper::deallocate<mem_alloc>(vr_temp_gpu, q);
}

if (!utils::almost_equal<scalar_t>(vr_temp, vr_ref)) {
Expand Down Expand Up @@ -100,30 +106,45 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size,
state.counters["bytes_processed"]);

blas_benchmark::utils::calc_avg_counters(state);
}

template <typename scalar_t>
void register_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr,
bool* success) {
auto asum_params = blas_benchmark::utils::get_blas1_params(args);
blas::helper::deallocate<mem_alloc>(inx, q);
blas::helper::deallocate<mem_alloc>(inr, q);
}

for (auto size : asum_params) {
template <typename scalar_t, blas::helper::AllocType mem_alloc>
void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success,
std::string mem_type,
std::vector<blas1_param_t> params) {
for (auto size : params) {
auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr,
index_t size, bool* success) {
run<scalar_t>(st, sb_handle_ptr, size, success);
run<scalar_t, mem_alloc>(st, sb_handle_ptr, size, success);
};

benchmark::RegisterBenchmark(
blas_benchmark::utils::get_name<benchmark_op, scalar_t>(
size, blas_benchmark::utils::MEM_TYPE_BUFFER)
.c_str(),
size, mem_type).c_str(),
BM_lambda, sb_handle_ptr, size, success)
->UseRealTime();
}
}

template <typename scalar_t>
void register_benchmark(blas_benchmark::Args& args,
                        blas::SB_Handle* sb_handle_ptr, bool* success) {
  // Read the asum benchmark sizes once and reuse them for every memory type.
  const auto params = blas_benchmark::utils::get_blas1_params(args);

  // Buffer-backed benchmarks are always available.
  register_benchmark<scalar_t, blas::helper::AllocType::buffer>(
      sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, params);
#ifdef SB_ENABLE_USM
  // USM benchmarks exist only when the library is built with USM support.
  register_benchmark<scalar_t, blas::helper::AllocType::usm>(
      sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, params);
#endif
}

namespace blas_benchmark {
void create_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr,
bool* success) {
void create_benchmark(blas_benchmark::Args& args,
blas::SB_Handle* sb_handle_ptr, bool* success) {
BLAS_REGISTER_BENCHMARK(args, sb_handle_ptr, success);
}
} // namespace blas_benchmark
62 changes: 45 additions & 17 deletions benchmark/syclblas/blas1/axpy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
constexpr blas_benchmark::utils::Level1Op benchmark_op =
blas_benchmark::utils::Level1Op::axpy;

template <typename scalar_t>
template <typename scalar_t, blas::helper::AllocType mem_alloc>
void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size,
bool* success) {
// initialize the state label
Expand All @@ -40,14 +40,20 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size,
size);

blas::SB_Handle& sb_handle = *sb_handle_ptr;
auto q = sb_handle.get_queue();

// Create data
std::vector<scalar_t> v1 = blas_benchmark::utils::random_data<scalar_t>(size);
std::vector<scalar_t> v2 = blas_benchmark::utils::random_data<scalar_t>(size);
auto alpha = blas_benchmark::utils::random_scalar<scalar_t>();

auto inx = blas::make_sycl_iterator_buffer<scalar_t>(v1, size);
auto iny = blas::make_sycl_iterator_buffer<scalar_t>(v2, size);
auto inx = blas::helper::allocate<mem_alloc, scalar_t>(size, q);
auto iny = blas::helper::allocate<mem_alloc, scalar_t>(size, q);

auto copy_x = blas::helper::copy_to_device<scalar_t>(q, v1.data(), inx, size);
auto copy_y = blas::helper::copy_to_device<scalar_t>(q, v2.data(), iny, size);

sb_handle.wait({copy_x, copy_y});

#ifdef BLAS_VERIFY_BENCHMARK
// Run a first time with a verification of the results
Expand All @@ -56,9 +62,17 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size,
y_ref.data(), 1);
std::vector<scalar_t> y_temp = v2;
{
auto y_temp_gpu = blas::make_sycl_iterator_buffer<scalar_t>(y_temp, size);
auto event = _axpy(sb_handle, size, alpha, inx, 1, y_temp_gpu, 1);
sb_handle.wait(event);
auto y_temp_gpu = blas::helper::allocate<mem_alloc, scalar_t>(size, q);
auto copy_temp = blas::helper::copy_to_device<scalar_t>(q, y_temp.data(),
y_temp_gpu, size);
sb_handle.wait(copy_temp);
auto axpy_event = _axpy(sb_handle, size, alpha, inx, 1, y_temp_gpu, 1);
sb_handle.wait(axpy_event);
auto copy_output =
blas::helper::copy_to_host(q, y_temp_gpu, y_temp.data(), size);
sb_handle.wait(copy_output);

blas::helper::deallocate<mem_alloc>(y_temp_gpu, q);
}

std::ostringstream err_stream;
Expand Down Expand Up @@ -97,30 +111,44 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size,
state.counters["bytes_processed"]);

blas_benchmark::utils::calc_avg_counters(state);
}

template <typename scalar_t>
void register_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr,
bool* success) {
auto axpy_params = blas_benchmark::utils::get_blas1_params(args);
blas::helper::deallocate<mem_alloc>(inx, q);
blas::helper::deallocate<mem_alloc>(iny, q);
}

for (auto size : axpy_params) {
template <typename scalar_t, blas::helper::AllocType mem_alloc>
void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success,
std::string mem_type,
std::vector<blas1_param_t> params) {
for (auto size : params) {
auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr,
index_t size, bool* success) {
run<scalar_t>(st, sb_handle_ptr, size, success);
run<scalar_t, mem_alloc>(st, sb_handle_ptr, size, success);
};
benchmark::RegisterBenchmark(
blas_benchmark::utils::get_name<benchmark_op, scalar_t>(
size, blas_benchmark::utils::MEM_TYPE_BUFFER)
.c_str(),
size, mem_type).c_str(),
BM_lambda, sb_handle_ptr, size, success)
->UseRealTime();
}
}

template <typename scalar_t>
void register_benchmark(blas_benchmark::Args& args,
                        blas::SB_Handle* sb_handle_ptr, bool* success) {
  // Read the axpy benchmark sizes once and reuse them for every memory type.
  const auto params = blas_benchmark::utils::get_blas1_params(args);

  // Buffer-backed benchmarks are always available.
  register_benchmark<scalar_t, blas::helper::AllocType::buffer>(
      sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, params);
#ifdef SB_ENABLE_USM
  // USM benchmarks exist only when the library is built with USM support.
  register_benchmark<scalar_t, blas::helper::AllocType::usm>(
      sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, params);
#endif
}

namespace blas_benchmark {
void create_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr,
bool* success) {
void create_benchmark(blas_benchmark::Args& args,
blas::SB_Handle* sb_handle_ptr, bool* success) {
BLAS_REGISTER_BENCHMARK(args, sb_handle_ptr, success);
}
} // namespace blas_benchmark
67 changes: 51 additions & 16 deletions benchmark/syclblas/blas1/copy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
constexpr blas_benchmark::utils::Level1Op benchmark_op =
blas_benchmark::utils::Level1Op::copy;

template <typename scalar_t>
template <typename scalar_t, blas::helper::AllocType mem_alloc>
void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size,
index_t incx, index_t incy, bool* success) {
// initialize the state label
Expand All @@ -39,6 +39,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size,
blas_benchmark::utils::Level1Op::copy, scalar_t>(state, size);

blas::SB_Handle& sb_handle = *sb_handle_ptr;
auto q = sb_handle.get_queue();

auto size_x = size * incx;
auto size_y = size * incy;
Expand All @@ -47,18 +48,36 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size,
std::vector<scalar_t> y =
blas_benchmark::utils::random_data<scalar_t>(size_y);

auto x_gpu = blas::make_sycl_iterator_buffer<scalar_t>(x, size_x);
auto y_gpu = blas::make_sycl_iterator_buffer<scalar_t>(y, size_y);
auto x_gpu = blas::helper::allocate<mem_alloc, scalar_t>(size_x, q);
auto y_gpu = blas::helper::allocate<mem_alloc, scalar_t>(size_y, q);

auto copy_x =
blas::helper::copy_to_device<scalar_t>(q, x.data(), x_gpu, size_x);
auto copy_y =
blas::helper::copy_to_device<scalar_t>(q, y.data(), y_gpu, size_y);

sb_handle.wait({copy_x, copy_y});

#ifdef BLAS_VERIFY_BENCHMARK
// Run a first time with a verification of the results
std::vector<scalar_t> y_ref = y;
reference_blas::copy(size, x.data(), incx, y_ref.data(), incy);
std::vector<scalar_t> y_temp = y;
{
auto y_temp_gpu = blas::make_sycl_iterator_buffer<scalar_t>(y_temp, size_y);
auto event = _copy(sb_handle, size, x_gpu, incx, y_temp_gpu, incy);
sb_handle.wait(event);
auto y_temp_gpu = blas::helper::allocate<mem_alloc, scalar_t>(size_y, q);
auto copy_temp = blas::helper::copy_to_device<scalar_t>(q, y_temp.data(),
y_temp_gpu, size_y);
sb_handle.wait(copy_temp);
auto copy_event = blas::_copy<blas::SB_Handle, index_t, decltype(x_gpu),
decltype(y_temp_gpu), index_t>(
sb_handle, size, x_gpu, incx, y_temp_gpu, incy);
sb_handle.wait(copy_event);

auto copy_out = blas::helper::copy_to_host<scalar_t>(q, y_temp_gpu,
y_temp.data(), size_y);
sb_handle.wait(copy_out);

blas::helper::deallocate<mem_alloc>(y_temp_gpu, q);
}

std::ostringstream err_stream;
Expand All @@ -70,7 +89,9 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size,
#endif

auto blas_method_def = [&]() -> std::vector<cl::sycl::event> {
auto event = _copy(sb_handle, size, x_gpu, incx, y_gpu, incy);
auto event =
blas::_copy<blas::SB_Handle, index_t, decltype(x_gpu), decltype(y_gpu),
index_t>(sb_handle, size, x_gpu, incx, y_gpu, incy);
sb_handle.wait(event);
return event;
};
Expand All @@ -96,14 +117,16 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size,
state.counters["bytes_processed"]);

blas_benchmark::utils::calc_avg_counters(state);
}

template <typename scalar_t>
void register_benchmark(blas_benchmark::Args& args,
blas::SB_Handle* sb_handle_ptr, bool* success) {
auto copy_params = blas_benchmark::utils::get_copy_params<scalar_t>(args);
blas::helper::deallocate<mem_alloc>(x_gpu, q);
blas::helper::deallocate<mem_alloc>(y_gpu, q);
}

for (auto p : copy_params) {
template <typename scalar_t, blas::helper::AllocType mem_alloc>
void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success,
std::string mem_type,
std::vector<copy_param_t<scalar_t>> params) {
for (auto p : params) {
index_t size, incx, incy;
scalar_t unused; // Work around a dpcpp compiler bug
// (https://github.com/intel/llvm/issues/7075)
Expand All @@ -112,17 +135,29 @@ void register_benchmark(blas_benchmark::Args& args,
auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr,
index_t size, index_t incx, index_t incy,
bool* success) {
run<scalar_t>(st, sb_handle_ptr, size, incx, incy, success);
run<scalar_t, mem_alloc>(st, sb_handle_ptr, size, incx, incy, success);
};
benchmark::RegisterBenchmark(
blas_benchmark::utils::get_name<benchmark_op, scalar_t>(
size, incx, incy, blas_benchmark::utils::MEM_TYPE_BUFFER)
.c_str(),
size, incx, incy, mem_type).c_str(),
BM_lambda, sb_handle_ptr, size, incx, incy, success)
->UseRealTime();
}
}

template <typename scalar_t>
void register_benchmark(blas_benchmark::Args& args,
                        blas::SB_Handle* sb_handle_ptr, bool* success) {
  // Read the copy benchmark parameters once and reuse them for every memory
  // type.
  const auto params = blas_benchmark::utils::get_copy_params<scalar_t>(args);

  // Buffer-backed benchmarks are always available.
  register_benchmark<scalar_t, blas::helper::AllocType::buffer>(
      sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, params);
#ifdef SB_ENABLE_USM
  // USM benchmarks exist only when the library is built with USM support.
  register_benchmark<scalar_t, blas::helper::AllocType::usm>(
      sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, params);
#endif
}

namespace blas_benchmark {
void create_benchmark(blas_benchmark::Args& args,
blas::SB_Handle* sb_handle_ptr, bool* success) {
Expand Down
Loading

0 comments on commit 37be9e4

Please sign in to comment.