diff --git a/README.md b/README.md
index 3a40d3e61..9f28c20be 100644
--- a/README.md
+++ b/README.md
@@ -175,14 +175,23 @@ of multiple BLAS operations.
 ## API description
 
-This section references all the supported operations and their interface.
+This section references all the supported operations and their interface. The
+library follows the [oneAPI MKL BLAS specification](https://spec.oneapi.io/versions/latest/elements/oneMKL/source/domains/blas/blas.html)
+as the reference for its API. Both the USM and buffer APIs are supported; however,
+the group APIs for USM are not. Mixing USM and buffer arguments in the same call
+is not supported; the library instead sticks to the aforementioned reference
+specification.
 
 All operations take as their first argument a reference to the SB_Handle, a
-`blas::SB_Handle` created with a `sycl::queue`. The return value is usually an
-array of SYCL events (except for some operations that can return a scalar or
+`blas::SB_Handle` created with a `sycl::queue`. The last argument of every operation
+is a vector of dependencies of type `cl::sycl::event` (empty by default). The return value
+is usually an array of SYCL events (except for some operations that can return a scalar or
 a tuple). The containers for the vectors and matrices (and scalars written by
-the BLAS operations) are iterator buffers that can be created with
-`make_sycl_iterator_buffer`.
+the BLAS operations) can either be raw USM pointers or iterator buffers, created
+with a call to `cl::sycl::malloc_device` or `make_sycl_iterator_buffer` respectively.
+
+USM support in SYCL-BLAS is limited to `device`-allocated memory only; `shared` and
+`host` USM allocations are not supported.
 
 We recommend checking the [samples](samples) to get started with SYCL-BLAS.
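To make the USM path described in this README hunk concrete, here is a minimal, hypothetical sketch of an `axpy` call built from the `blas::helper` routines exercised by the benchmark changes below (`allocate`, `copy_to_device`, `copy_to_host`, `deallocate`). The header name, the `AllocType::usm` enumerator and the exact template parameters of `allocate` are assumptions (the template argument lists are elided in this patch), so treat it as an illustration rather than the authoritative API; the dependency-vector last argument mentioned above is left at its empty default here.

```cpp
// Illustrative sketch only: the header name, AllocType::usm and the allocate<>
// template parameters are assumptions inferred from this patch, not verbatim
// library code.
#include <sycl_blas.h>

#include <vector>

int main() {
  cl::sycl::queue q{cl::sycl::default_selector{}};
  blas::SB_Handle sb_handle(q);

  constexpr int size = 1024;
  const float alpha = 1.5f;
  std::vector<float> x(size, 1.0f), y(size, 2.0f);

  // Device-allocated USM only; shared/host USM allocations are not supported.
  auto d_x = blas::helper::allocate<blas::helper::AllocType::usm, float>(size, q);
  auto d_y = blas::helper::allocate<blas::helper::AllocType::usm, float>(size, q);

  auto copy_x = blas::helper::copy_to_device(q, x.data(), d_x, size);
  auto copy_y = blas::helper::copy_to_device(q, y.data(), d_y, size);
  sb_handle.wait({copy_x, copy_y});

  // Operations take the SB_Handle first and return a vector of SYCL events.
  auto axpy_event = blas::_axpy(sb_handle, size, alpha, d_x, 1, d_y, 1);
  sb_handle.wait(axpy_event);

  auto copy_out = blas::helper::copy_to_host(q, d_y, y.data(), size);
  sb_handle.wait(copy_out);

  blas::helper::deallocate(d_x, q);
  blas::helper::deallocate(d_y, q);
  return 0;
}
```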
It is better to be familiar with BLAS: diff --git a/benchmark/syclblas/blas1/asum.cpp b/benchmark/syclblas/blas1/asum.cpp index f42502c2e..36a0c9097 100644 --- a/benchmark/syclblas/blas1/asum.cpp +++ b/benchmark/syclblas/blas1/asum.cpp @@ -28,7 +28,7 @@ constexpr blas_benchmark::utils::Level1Op benchmark_op = blas_benchmark::utils::Level1Op::asum; -template +template void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, bool* success) { // initialize the state label @@ -40,6 +40,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, size); blas::SB_Handle& sb_handle = *sb_handle_ptr; + auto q = sb_handle.get_queue(); // Create data std::vector v1 = blas_benchmark::utils::random_data(size); @@ -49,19 +50,24 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, std::transform(std::begin(v1), std::end(v1), std::begin(v1), [=](scalar_t x) { return x / v1.size(); }); - scalar_t vr; + auto inx = blas::helper::allocate(size, q); + auto inr = blas::helper::allocate(1, q); + + auto copy_x = blas::helper::copy_to_device(q, v1.data(), inx, size); - auto inx = blas::make_sycl_iterator_buffer(v1, size); - auto inr = blas::make_sycl_iterator_buffer(&vr, 1); + sb_handle.wait({copy_x}); #ifdef BLAS_VERIFY_BENCHMARK // Run a first time with a verification of the results scalar_t vr_ref = reference_blas::asum(size, v1.data(), 1); scalar_t vr_temp = 0; { - auto vr_temp_gpu = blas::make_sycl_iterator_buffer(&vr_temp, 1); - auto event = _asum(sb_handle, size, inx, 1, vr_temp_gpu); - sb_handle.wait(event); + auto vr_temp_gpu = blas::helper::allocate(1, q); + auto asum_event = _asum(sb_handle, size, inx, 1, vr_temp_gpu); + sb_handle.wait(asum_event); + auto copy_output = blas::helper::copy_to_host(q, vr_temp_gpu, &vr_temp, 1); + sb_handle.wait(copy_output); + blas::helper::deallocate(vr_temp_gpu, q); } if (!utils::almost_equal(vr_temp, vr_ref)) { @@ -100,30 +106,45 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, state.counters["bytes_processed"]); blas_benchmark::utils::calc_avg_counters(state); -} -template -void register_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr, - bool* success) { - auto asum_params = blas_benchmark::utils::get_blas1_params(args); + blas::helper::deallocate(inx, q); + blas::helper::deallocate(inr, q); +} - for (auto size : asum_params) { +template +void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success, + std::string mem_type, + std::vector params) { + for (auto size : params) { auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr, index_t size, bool* success) { - run(st, sb_handle_ptr, size, success); + run(st, sb_handle_ptr, size, success); }; + benchmark::RegisterBenchmark( blas_benchmark::utils::get_name( - size, blas_benchmark::utils::MEM_TYPE_BUFFER) - .c_str(), + size, mem_type).c_str(), BM_lambda, sb_handle_ptr, size, success) ->UseRealTime(); } } +template +void register_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { + auto asum_params = blas_benchmark::utils::get_blas1_params(args); + + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, asum_params); +#ifdef SB_ENABLE_USM + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, asum_params); +#endif +} + namespace blas_benchmark { -void create_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr, - bool* success) { +void 
create_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { BLAS_REGISTER_BENCHMARK(args, sb_handle_ptr, success); } } // namespace blas_benchmark diff --git a/benchmark/syclblas/blas1/axpy.cpp b/benchmark/syclblas/blas1/axpy.cpp index db29313d5..88500f188 100644 --- a/benchmark/syclblas/blas1/axpy.cpp +++ b/benchmark/syclblas/blas1/axpy.cpp @@ -28,7 +28,7 @@ constexpr blas_benchmark::utils::Level1Op benchmark_op = blas_benchmark::utils::Level1Op::axpy; -template +template void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, bool* success) { // initialize the state label @@ -40,14 +40,20 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, size); blas::SB_Handle& sb_handle = *sb_handle_ptr; + auto q = sb_handle.get_queue(); // Create data std::vector v1 = blas_benchmark::utils::random_data(size); std::vector v2 = blas_benchmark::utils::random_data(size); auto alpha = blas_benchmark::utils::random_scalar(); - auto inx = blas::make_sycl_iterator_buffer(v1, size); - auto iny = blas::make_sycl_iterator_buffer(v2, size); + auto inx = blas::helper::allocate(size, q); + auto iny = blas::helper::allocate(size, q); + + auto copy_x = blas::helper::copy_to_device(q, v1.data(), inx, size); + auto copy_y = blas::helper::copy_to_device(q, v2.data(), iny, size); + + sb_handle.wait({copy_x, copy_y}); #ifdef BLAS_VERIFY_BENCHMARK // Run a first time with a verification of the results @@ -56,9 +62,17 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, y_ref.data(), 1); std::vector y_temp = v2; { - auto y_temp_gpu = blas::make_sycl_iterator_buffer(y_temp, size); - auto event = _axpy(sb_handle, size, alpha, inx, 1, y_temp_gpu, 1); - sb_handle.wait(event); + auto y_temp_gpu = blas::helper::allocate(size, q); + auto copy_temp = blas::helper::copy_to_device(q, y_temp.data(), + y_temp_gpu, size); + sb_handle.wait(copy_temp); + auto axpy_event = _axpy(sb_handle, size, alpha, inx, 1, y_temp_gpu, 1); + sb_handle.wait(axpy_event); + auto copy_output = + blas::helper::copy_to_host(q, y_temp_gpu, y_temp.data(), size); + sb_handle.wait(copy_output); + + blas::helper::deallocate(y_temp_gpu, q); } std::ostringstream err_stream; @@ -97,30 +111,44 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, state.counters["bytes_processed"]); blas_benchmark::utils::calc_avg_counters(state); -} -template -void register_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr, - bool* success) { - auto axpy_params = blas_benchmark::utils::get_blas1_params(args); + blas::helper::deallocate(inx, q); + blas::helper::deallocate(iny, q); +} - for (auto size : axpy_params) { +template +void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success, + std::string mem_type, + std::vector params) { + for (auto size : params) { auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr, index_t size, bool* success) { - run(st, sb_handle_ptr, size, success); + run(st, sb_handle_ptr, size, success); }; benchmark::RegisterBenchmark( blas_benchmark::utils::get_name( - size, blas_benchmark::utils::MEM_TYPE_BUFFER) - .c_str(), + size, mem_type).c_str(), BM_lambda, sb_handle_ptr, size, success) ->UseRealTime(); } } +template +void register_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { + auto axpy_params = blas_benchmark::utils::get_blas1_params(args); + + register_benchmark( + sb_handle_ptr, success, 
blas_benchmark::utils::MEM_TYPE_BUFFER, axpy_params); +#ifdef SB_ENABLE_USM + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, axpy_params); +#endif +} + namespace blas_benchmark { -void create_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr, - bool* success) { +void create_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { BLAS_REGISTER_BENCHMARK(args, sb_handle_ptr, success); } } // namespace blas_benchmark diff --git a/benchmark/syclblas/blas1/copy.cpp b/benchmark/syclblas/blas1/copy.cpp index af5657066..91d5bebe6 100644 --- a/benchmark/syclblas/blas1/copy.cpp +++ b/benchmark/syclblas/blas1/copy.cpp @@ -28,7 +28,7 @@ constexpr blas_benchmark::utils::Level1Op benchmark_op = blas_benchmark::utils::Level1Op::copy; -template +template void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, index_t incx, index_t incy, bool* success) { // initialize the state label @@ -39,6 +39,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, blas_benchmark::utils::Level1Op::copy, scalar_t>(state, size); blas::SB_Handle& sb_handle = *sb_handle_ptr; + auto q = sb_handle.get_queue(); auto size_x = size * incx; auto size_y = size * incy; @@ -47,8 +48,15 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, std::vector y = blas_benchmark::utils::random_data(size_y); - auto x_gpu = blas::make_sycl_iterator_buffer(x, size_x); - auto y_gpu = blas::make_sycl_iterator_buffer(y, size_y); + auto x_gpu = blas::helper::allocate(size_x, q); + auto y_gpu = blas::helper::allocate(size_y, q); + + auto copy_x = + blas::helper::copy_to_device(q, x.data(), x_gpu, size_x); + auto copy_y = + blas::helper::copy_to_device(q, y.data(), y_gpu, size_y); + + sb_handle.wait({copy_x, copy_y}); #ifdef BLAS_VERIFY_BENCHMARK // Run a first time with a verification of the results @@ -56,9 +64,20 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, reference_blas::copy(size, x.data(), incx, y_ref.data(), incy); std::vector y_temp = y; { - auto y_temp_gpu = blas::make_sycl_iterator_buffer(y_temp, size_y); - auto event = _copy(sb_handle, size, x_gpu, incx, y_temp_gpu, incy); - sb_handle.wait(event); + auto y_temp_gpu = blas::helper::allocate(size_y, q); + auto copy_temp = blas::helper::copy_to_device(q, y_temp.data(), + y_temp_gpu, size_y); + sb_handle.wait(copy_temp); + auto copy_event = blas::_copy( + sb_handle, size, x_gpu, incx, y_temp_gpu, incy); + sb_handle.wait(copy_event); + + auto copy_out = blas::helper::copy_to_host(q, y_temp_gpu, + y_temp.data(), size_y); + sb_handle.wait(copy_out); + + blas::helper::deallocate(y_temp_gpu, q); } std::ostringstream err_stream; @@ -70,7 +89,9 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, #endif auto blas_method_def = [&]() -> std::vector { - auto event = _copy(sb_handle, size, x_gpu, incx, y_gpu, incy); + auto event = + blas::_copy(sb_handle, size, x_gpu, incx, y_gpu, incy); sb_handle.wait(event); return event; }; @@ -96,14 +117,16 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, state.counters["bytes_processed"]); blas_benchmark::utils::calc_avg_counters(state); -} -template -void register_benchmark(blas_benchmark::Args& args, - blas::SB_Handle* sb_handle_ptr, bool* success) { - auto copy_params = blas_benchmark::utils::get_copy_params(args); + blas::helper::deallocate(x_gpu, q); + blas::helper::deallocate(y_gpu, q); +} - 
for (auto p : copy_params) { +template +void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success, + std::string mem_type, + std::vector> params) { + for (auto p : params) { index_t size, incx, incy; scalar_t unused; // Work around a dpcpp compiler bug // (https://github.com/intel/llvm/issues/7075) @@ -112,17 +135,29 @@ void register_benchmark(blas_benchmark::Args& args, auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr, index_t size, index_t incx, index_t incy, bool* success) { - run(st, sb_handle_ptr, size, incx, incy, success); + run(st, sb_handle_ptr, size, incx, incy, success); }; benchmark::RegisterBenchmark( blas_benchmark::utils::get_name( - size, incx, incy, blas_benchmark::utils::MEM_TYPE_BUFFER) - .c_str(), + size, incx, incy, mem_type).c_str(), BM_lambda, sb_handle_ptr, size, incx, incy, success) ->UseRealTime(); } } +template +void register_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { + auto copy_params = blas_benchmark::utils::get_copy_params(args); + + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, copy_params); +#ifdef SB_ENABLE_USM + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, copy_params); +#endif +} + namespace blas_benchmark { void create_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr, bool* success) { diff --git a/benchmark/syclblas/blas1/dot.cpp b/benchmark/syclblas/blas1/dot.cpp index b3d516454..729d327e7 100644 --- a/benchmark/syclblas/blas1/dot.cpp +++ b/benchmark/syclblas/blas1/dot.cpp @@ -28,7 +28,7 @@ constexpr blas_benchmark::utils::Level1Op benchmark_op = blas_benchmark::utils::Level1Op::dot; -template +template void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, bool* success) { // initialize the state label @@ -40,6 +40,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, blas_benchmark::utils::Level1Op::dot, scalar_t>(state, size); blas::SB_Handle& sb_handle = *sb_handle_ptr; + auto q = sb_handle.get_queue(); // Create data std::vector v1 = blas_benchmark::utils::random_data(size); @@ -49,19 +50,28 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, std::transform(std::begin(v1), std::end(v1), std::begin(v1), [=](scalar_t x) { return x / v1.size(); }); - auto inx = blas::make_sycl_iterator_buffer(v1, size); - auto iny = blas::make_sycl_iterator_buffer(v2, size); - auto inr = blas::make_sycl_iterator_buffer(1); + auto inx = blas::helper::allocate(size, q); + auto iny = blas::helper::allocate(size, q); + auto inr = blas::helper::allocate(1, q); + + auto copy_x = blas::helper::copy_to_device(q, v1.data(), inx, size); + auto copy_y = blas::helper::copy_to_device(q, v2.data(), iny, size); + + sb_handle.wait({copy_x, copy_y}); #ifdef BLAS_VERIFY_BENCHMARK // Run a first time with a verification of the results scalar_t vr_ref = reference_blas::dot(size, v1.data(), 1, v2.data(), 1); scalar_t vr_temp = 0; { - auto vr_temp_gpu = blas::make_sycl_iterator_buffer(&vr_temp, 1); - auto event = _dot(sb_handle, size, inx, static_cast(1), iny, - static_cast(1), vr_temp_gpu); - sb_handle.wait(event); + auto vr_temp_gpu = blas::helper::allocate(1, q); + auto dot_event = _dot(sb_handle, size, inx, static_cast(1), iny, + static_cast(1), vr_temp_gpu); + sb_handle.wait(dot_event); + auto copy_output = blas::helper::copy_to_host(q, vr_temp_gpu, &vr_temp, 1); + sb_handle.wait(copy_output); + + 
blas::helper::deallocate(vr_temp_gpu, q); } if (!utils::almost_equal(vr_temp, vr_ref)) { @@ -101,30 +111,46 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, state.counters["bytes_processed"]); blas_benchmark::utils::calc_avg_counters(state); -} -template -void register_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr, - bool* success) { - auto dot_params = blas_benchmark::utils::get_blas1_params(args); + blas::helper::deallocate(inx, q); + blas::helper::deallocate(iny, q); + blas::helper::deallocate(inr, q); +} - for (auto size : dot_params) { +template +void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success, + std::string mem_type, + std::vector params) { + for (auto size : params) { auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr, index_t size, bool* success) { - run(st, sb_handle_ptr, size, success); + run(st, sb_handle_ptr, size, success); }; + benchmark::RegisterBenchmark( blas_benchmark::utils::get_name( - size, blas_benchmark::utils::MEM_TYPE_BUFFER) - .c_str(), + size, mem_type).c_str(), BM_lambda, sb_handle_ptr, size, success) ->UseRealTime(); } } +template +void register_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { + auto dot_params = blas_benchmark::utils::get_blas1_params(args); + + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, dot_params); +#ifdef SB_ENABLE_USM + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, dot_params); +#endif +} + namespace blas_benchmark { -void create_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr, - bool* success) { +void create_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { BLAS_REGISTER_BENCHMARK(args, sb_handle_ptr, success); } } // namespace blas_benchmark diff --git a/benchmark/syclblas/blas1/iamax.cpp b/benchmark/syclblas/blas1/iamax.cpp index 35e920d9f..6327337f9 100644 --- a/benchmark/syclblas/blas1/iamax.cpp +++ b/benchmark/syclblas/blas1/iamax.cpp @@ -28,7 +28,7 @@ constexpr blas_benchmark::utils::Level1Op benchmark_op = blas_benchmark::utils::Level1Op::iamax; -template +template void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, bool* success) { // initialize the state label @@ -40,6 +40,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, blas_benchmark::utils::Level1Op::iamax, scalar_t>(state, size); blas::SB_Handle& sb_handle = *sb_handle_ptr; + auto q = sb_handle.get_queue(); using tuple_scalar_t = blas::IndexValueTuple; @@ -52,8 +53,12 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, return utils::clamp_to_limits(v); }); - auto inx = blas::make_sycl_iterator_buffer(v1, size); - auto outI = blas::make_sycl_iterator_buffer(&out, 1); + auto inx = blas::helper::allocate(size, q); + auto outI = blas::helper::allocate(1, q); + + auto copy_x = blas::helper::copy_to_device(q, v1.data(), inx, size); + + sb_handle.wait({copy_x}); #ifdef BLAS_VERIFY_BENCHMARK // Run a first time with a verification of the results @@ -61,11 +66,15 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, static_cast(reference_blas::iamax(size, v1.data(), 1)); tuple_scalar_t idx_temp{-1, 0}; { - auto idx_temp_gpu = blas::make_sycl_iterator_buffer< - blas::IndexValueTuple>(&idx_temp, 1); - auto event = + auto idx_temp_gpu = blas::helper::allocate(1, q); + auto iamax_event = 
_iamax(sb_handle, size, inx, static_cast(1), idx_temp_gpu); - sb_handle.wait(event); + sb_handle.wait(iamax_event); + auto copy_output = + blas::helper::copy_to_host(q, idx_temp_gpu, &idx_temp, 1); + sb_handle.wait(copy_output); + + blas::helper::deallocate(idx_temp_gpu, q); } if (idx_temp.ind != idx_ref) { @@ -105,30 +114,45 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, state.counters["bytes_processed"]); blas_benchmark::utils::calc_avg_counters(state); -} -template -void register_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr, - bool* success) { - auto iamax_params = blas_benchmark::utils::get_blas1_params(args); + blas::helper::deallocate(inx, q); + blas::helper::deallocate(outI, q); +} - for (auto size : iamax_params) { +template +void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success, + std::string mem_type, + std::vector params) { + for (auto size : params) { auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr, index_t size, bool* success) { - run(st, sb_handle_ptr, size, success); + run(st, sb_handle_ptr, size, success); }; + benchmark::RegisterBenchmark( blas_benchmark::utils::get_name( - size, blas_benchmark::utils::MEM_TYPE_BUFFER) - .c_str(), + size, mem_type).c_str(), BM_lambda, sb_handle_ptr, size, success) ->UseRealTime(); } } +template +void register_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { + auto iamax_params = blas_benchmark::utils::get_blas1_params(args); + + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, iamax_params); +#ifdef SB_ENABLE_USM + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, iamax_params); +#endif +} + namespace blas_benchmark { -void create_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr, - bool* success) { +void create_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { BLAS_REGISTER_BENCHMARK(args, sb_handle_ptr, success); } } // namespace blas_benchmark diff --git a/benchmark/syclblas/blas1/iamin.cpp b/benchmark/syclblas/blas1/iamin.cpp index 14facae00..994ea042a 100644 --- a/benchmark/syclblas/blas1/iamin.cpp +++ b/benchmark/syclblas/blas1/iamin.cpp @@ -28,7 +28,7 @@ constexpr blas_benchmark::utils::Level1Op benchmark_op = blas_benchmark::utils::Level1Op::iamin; -template +template void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, bool* success) { // initialize the state label @@ -40,6 +40,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, blas_benchmark::utils::Level1Op::iamin, scalar_t>(state, size); blas::SB_Handle& sb_handle = *sb_handle_ptr; + auto q = sb_handle.get_queue(); using tuple_scalar_t = blas::IndexValueTuple; @@ -51,8 +52,12 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, return utils::clamp_to_limits(v); }); - auto inx = blas::make_sycl_iterator_buffer(v1, size); - auto outI = blas::make_sycl_iterator_buffer(&out, 1); + auto inx = blas::helper::allocate(size, q); + auto outI = blas::helper::allocate(1, q); + + auto copy_x = blas::helper::copy_to_device(q, v1.data(), inx, size); + + sb_handle.wait({copy_x}); #ifdef BLAS_VERIFY_BENCHMARK // Run a first time with a verification of the results @@ -60,11 +65,15 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, static_cast(reference_blas::iamin(size, v1.data(), 1)); tuple_scalar_t 
idx_temp{-1, -1}; { - auto idx_temp_gpu = blas::make_sycl_iterator_buffer< - blas::IndexValueTuple>(&idx_temp, 1); - auto event = + auto idx_temp_gpu = blas::helper::allocate(1, q); + auto iamin_event = _iamin(sb_handle, size, inx, static_cast(1), idx_temp_gpu); - sb_handle.wait(event); + sb_handle.wait(iamin_event); + auto copy_output = + blas::helper::copy_to_host(q, idx_temp_gpu, &idx_temp, 1); + sb_handle.wait(copy_output); + + blas::helper::deallocate(idx_temp_gpu, q); } if (idx_temp.ind != idx_ref) { @@ -104,30 +113,45 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, state.counters["bytes_processed"]); blas_benchmark::utils::calc_avg_counters(state); -} -template -void register_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr, - bool* success) { - auto iamin_params = blas_benchmark::utils::get_blas1_params(args); + blas::helper::deallocate(inx, q); + blas::helper::deallocate(outI, q); +} - for (auto size : iamin_params) { +template +void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success, + std::string mem_type, + std::vector params) { + for (auto size : params) { auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr, index_t size, bool* success) { - run(st, sb_handle_ptr, size, success); + run(st, sb_handle_ptr, size, success); }; + benchmark::RegisterBenchmark( blas_benchmark::utils::get_name( - size, blas_benchmark::utils::MEM_TYPE_BUFFER) - .c_str(), + size, mem_type).c_str(), BM_lambda, sb_handle_ptr, size, success) ->UseRealTime(); } } +template +void register_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { + auto iamin_params = blas_benchmark::utils::get_blas1_params(args); + + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, iamin_params); +#ifdef SB_ENABLE_USM + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, iamin_params); +#endif +} + namespace blas_benchmark { -void create_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr, - bool* success) { +void create_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { BLAS_REGISTER_BENCHMARK(args, sb_handle_ptr, success); } } // namespace blas_benchmark diff --git a/benchmark/syclblas/blas1/nrm2.cpp b/benchmark/syclblas/blas1/nrm2.cpp index 9f7d40243..343a4be47 100644 --- a/benchmark/syclblas/blas1/nrm2.cpp +++ b/benchmark/syclblas/blas1/nrm2.cpp @@ -28,7 +28,7 @@ constexpr blas_benchmark::utils::Level1Op benchmark_op = blas_benchmark::utils::Level1Op::nrm2; -template +template void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, bool* success) { // initialize the state label @@ -40,6 +40,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, blas_benchmark::utils::Level1Op::nrm2, scalar_t>(state, size); blas::SB_Handle& sb_handle = *sb_handle_ptr; + auto q = sb_handle.get_queue(); // Create data std::vector v1 = blas_benchmark::utils::random_data(size); @@ -48,18 +49,26 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, std::transform(std::begin(v1), std::end(v1), std::begin(v1), [=](scalar_t x) { return x / v1.size(); }); - auto inx = blas::make_sycl_iterator_buffer(v1, size); - auto inr = blas::make_sycl_iterator_buffer(1); + auto inx = blas::helper::allocate(size, q); + auto inr = blas::helper::allocate(1, q); + + auto copy_x = blas::helper::copy_to_device(q, v1.data(), inx, size); + + 
sb_handle.wait({copy_x}); #ifdef BLAS_VERIFY_BENCHMARK // Run a first time with a verification of the results scalar_t vr_ref = reference_blas::nrm2(size, v1.data(), 1); scalar_t vr_temp = 0; { - auto vr_temp_gpu = blas::make_sycl_iterator_buffer(&vr_temp, 1); - auto event = + auto vr_temp_gpu = blas::helper::allocate(1, q); + auto nrm2_event = _nrm2(sb_handle, size, inx, static_cast(1), vr_temp_gpu); - sb_handle.wait(event); + sb_handle.wait(nrm2_event); + auto copy_output = blas::helper::copy_to_host(q, vr_temp_gpu, &vr_temp, 1); + sb_handle.wait(copy_output); + + blas::helper::deallocate(vr_temp_gpu, q); } if (!utils::almost_equal(vr_temp, vr_ref)) { @@ -98,30 +107,44 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, state.counters["bytes_processed"]); blas_benchmark::utils::calc_avg_counters(state); -} -template -void register_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr, - bool* success) { - auto nrm2_params = blas_benchmark::utils::get_blas1_params(args); + blas::helper::deallocate(inx, q); + blas::helper::deallocate(inr, q); +} - for (auto size : nrm2_params) { +template +void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success, + std::string mem_type, + std::vector params) { + for (auto size : params) { auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr, index_t size, bool* success) { - run(st, sb_handle_ptr, size, success); + run(st, sb_handle_ptr, size, success); }; benchmark::RegisterBenchmark( blas_benchmark::utils::get_name( - size, blas_benchmark::utils::MEM_TYPE_BUFFER) - .c_str(), + size, mem_type).c_str(), BM_lambda, sb_handle_ptr, size, success) ->UseRealTime(); } } +template +void register_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { + auto nrm2_params = blas_benchmark::utils::get_blas1_params(args); + + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, nrm2_params); +#ifdef SB_ENABLE_USM + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, nrm2_params); +#endif +} + namespace blas_benchmark { -void create_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr, - bool* success) { +void create_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { BLAS_REGISTER_BENCHMARK(args, sb_handle_ptr, success); } } // namespace blas_benchmark diff --git a/benchmark/syclblas/blas1/rotg.cpp b/benchmark/syclblas/blas1/rotg.cpp index 33075ddb7..dd4f04d8c 100644 --- a/benchmark/syclblas/blas1/rotg.cpp +++ b/benchmark/syclblas/blas1/rotg.cpp @@ -28,7 +28,7 @@ constexpr blas_benchmark::utils::Level1Op benchmark_op = blas_benchmark::utils::Level1Op::rotg; -template +template void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, bool* success) { // initialize the state label @@ -42,11 +42,19 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, scalar_t s = blas_benchmark::utils::random_data(1)[0]; blas::SB_Handle& sb_handle = *sb_handle_ptr; + auto q = sb_handle.get_queue(); + + auto buf_a = blas::helper::allocate(1, q); + auto buf_b = blas::helper::allocate(1, q); + auto buf_c = blas::helper::allocate(1, q); + auto buf_s = blas::helper::allocate(1, q); + + auto copy_a = blas::helper::copy_to_device(q, &a, buf_a, 1); + auto copy_b = blas::helper::copy_to_device(q, &b, buf_b, 1); + auto copy_c = blas::helper::copy_to_device(q, &c, buf_c, 1); + auto copy_s = blas::helper::copy_to_device(q, &s, buf_s, 1); - auto buf_a 
= blas::make_sycl_iterator_buffer(&a, 1); - auto buf_b = blas::make_sycl_iterator_buffer(&b, 1); - auto buf_c = blas::make_sycl_iterator_buffer(&c, 1); - auto buf_s = blas::make_sycl_iterator_buffer(&s, 1); + sb_handle.wait({copy_a, copy_b, copy_c, copy_s}); #ifdef BLAS_VERIFY_BENCHMARK // Run a first time with a verification of the results @@ -60,22 +68,32 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, scalar_t c_verify = c; scalar_t s_verify = s; - auto buf_verify_a = blas::make_sycl_iterator_buffer(&a_verify, 1); - auto buf_verify_b = blas::make_sycl_iterator_buffer(&b_verify, 1); - auto buf_verify_c = blas::make_sycl_iterator_buffer(&c_verify, 1); - auto buf_verify_s = blas::make_sycl_iterator_buffer(&s_verify, 1); - reference_blas::rotg(&a_ref, &b_ref, &c_ref, &s_ref); - _rotg(sb_handle, buf_verify_a, buf_verify_b, buf_verify_c, buf_verify_s); - auto event1 = blas::helper::copy_to_host(sb_handle.get_queue(), buf_verify_c, - &c_verify, 1); - auto event2 = blas::helper::copy_to_host(sb_handle.get_queue(), buf_verify_s, - &s_verify, 1); - auto event3 = blas::helper::copy_to_host(sb_handle.get_queue(), buf_verify_a, - &a_verify, 1); - auto event4 = blas::helper::copy_to_host(sb_handle.get_queue(), buf_verify_b, - &b_verify, 1); + auto buf_verify_a = blas::helper::allocate(1, q); + auto buf_verify_b = blas::helper::allocate(1, q); + auto buf_verify_c = blas::helper::allocate(1, q); + auto buf_verify_s = blas::helper::allocate(1, q); + + auto copy_verify_a = + blas::helper::copy_to_device(q, &a_verify, buf_verify_a, 1); + auto copy_verify_b = + blas::helper::copy_to_device(q, &b_verify, buf_verify_b, 1); + auto copy_verify_c = + blas::helper::copy_to_device(q, &c_verify, buf_verify_c, 1); + auto copy_verify_s = + blas::helper::copy_to_device(q, &s_verify, buf_verify_s, 1); + + sb_handle.wait({copy_verify_a, copy_verify_b, copy_verify_c, copy_verify_s}); + + auto rotg_event = + _rotg(sb_handle, buf_verify_a, buf_verify_b, buf_verify_c, buf_verify_s); + sb_handle.wait(rotg_event); + + auto event1 = blas::helper::copy_to_host(q, buf_verify_c, &c_verify, 1); + auto event2 = blas::helper::copy_to_host(q, buf_verify_s, &s_verify, 1); + auto event3 = blas::helper::copy_to_host(q, buf_verify_a, &a_verify, 1); + auto event4 = blas::helper::copy_to_host(q, buf_verify_b, &b_verify, 1); sb_handle.wait({event1, event2, event3, event4}); @@ -98,6 +116,11 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, state.SkipWithError(err_str.c_str()); *success = false; }; + + blas::helper::deallocate(buf_verify_a, q); + blas::helper::deallocate(buf_verify_b, q); + blas::helper::deallocate(buf_verify_c, q); + blas::helper::deallocate(buf_verify_s, q); #endif // Create a utility lambda describing the blas method that we want to run. 
@@ -126,23 +149,38 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, state.counters["bytes_processed"]); blas_benchmark::utils::calc_avg_counters(state); + + blas::helper::deallocate(buf_a, q); + blas::helper::deallocate(buf_b, q); + blas::helper::deallocate(buf_c, q); + blas::helper::deallocate(buf_s, q); }; -template -void register_benchmark(blas_benchmark::Args& args, - blas::SB_Handle* sb_handle_ptr, bool* success) { +template +void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success, + std::string mem_type) { auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr, bool* success) { - run(st, sb_handle_ptr, success); + run(st, sb_handle_ptr, success); }; benchmark::RegisterBenchmark( blas_benchmark::utils::get_name( - blas_benchmark::utils::MEM_TYPE_BUFFER) - .c_str(), + mem_type).c_str(), BM_lambda, sb_handle_ptr, success) ->UseRealTime(); } +template +void register_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER); +#ifdef SB_ENABLE_USM + register_benchmark(sb_handle_ptr, + success, blas_benchmark::utils::MEM_TYPE_USM); +#endif +} + namespace blas_benchmark { void create_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr, bool* success) { diff --git a/benchmark/syclblas/blas1/rotm.cpp b/benchmark/syclblas/blas1/rotm.cpp index 414a2a31e..a98b19fac 100644 --- a/benchmark/syclblas/blas1/rotm.cpp +++ b/benchmark/syclblas/blas1/rotm.cpp @@ -28,7 +28,7 @@ constexpr blas_benchmark::utils::Level1Op benchmark_op = blas_benchmark::utils::Level1Op::rotm; -template +template void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, bool* success) { // initialize the state label @@ -40,6 +40,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, blas_benchmark::utils::Level1Op::rotm, scalar_t>(state, size); blas::SB_Handle& sb_handle = *sb_handle_ptr; + auto q = sb_handle.get_queue(); // Create data constexpr size_t param_size = 5; @@ -53,9 +54,18 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, std::vector y_v = blas_benchmark::utils::random_data(size); - auto gpu_x_v = blas::make_sycl_iterator_buffer(x_v, size); - auto gpu_y_v = blas::make_sycl_iterator_buffer(y_v, size); - auto gpu_param = blas::make_sycl_iterator_buffer(param, param_size); + auto gpu_x_v = blas::helper::allocate(size, q); + auto gpu_y_v = blas::helper::allocate(size, q); + auto gpu_param = blas::helper::allocate(param_size, q); + + auto copy_x = + blas::helper::copy_to_device(q, x_v.data(), gpu_x_v, size); + auto copy_y = + blas::helper::copy_to_device(q, y_v.data(), gpu_y_v, size); + auto copy_param = blas::helper::copy_to_device( + q, param.data(), gpu_param, param_size); + + sb_handle.wait({copy_x, copy_y, copy_param}); #ifdef BLAS_VERIFY_BENCHMARK // Run a first time with a verification of the results @@ -67,15 +77,31 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, reference_blas::rotm(size, x_v_ref.data(), 1, y_v_ref.data(), 1, param.data()); - - _rotm(sb_handle, size, gpu_x_v, static_cast(1), gpu_y_v, - static_cast(1), gpu_param); - auto event1 = blas::helper::copy_to_host(sb_handle.get_queue(), gpu_x_v, - x_v_verify.data(), size); - auto event2 = blas::helper::copy_to_host(sb_handle.get_queue(), gpu_y_v, - y_v_verify.data(), size); - sb_handle.wait({event1, event2}); - + { + auto gpu_x_verify = 
blas::helper::allocate(size, q); + auto gpu_y_verify = blas::helper::allocate(size, q); + + auto copy_x_verify = blas::helper::copy_to_device( + q, x_v_verify.data(), gpu_x_verify, size); + auto copy_y_verify = blas::helper::copy_to_device( + q, y_v_verify.data(), gpu_y_verify, size); + + sb_handle.wait({copy_x_verify, copy_y_verify}); + + auto rotm_event = + _rotm(sb_handle, size, gpu_x_verify, static_cast(1), + gpu_y_verify, static_cast(1), gpu_param); + sb_handle.wait(rotm_event); + + auto event1 = + blas::helper::copy_to_host(q, gpu_x_verify, x_v_verify.data(), size); + auto event2 = + blas::helper::copy_to_host(q, gpu_y_verify, y_v_verify.data(), size); + sb_handle.wait({event1, event2}); + + blas::helper::deallocate(gpu_x_verify, q); + blas::helper::deallocate(gpu_y_verify, q); + } // Verify results std::ostringstream err_stream; const bool isAlmostEqual = utils::compare_vectors( @@ -118,27 +144,43 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, state.counters["bytes_processed"]); blas_benchmark::utils::calc_avg_counters(state); -} -template -void register_benchmark(blas_benchmark::Args& args, - blas::SB_Handle* sb_handle_ptr, bool* success) { - auto rotm_params = blas_benchmark::utils::get_blas1_params(args); + blas::helper::deallocate(gpu_x_v, q); + blas::helper::deallocate(gpu_y_v, q); + blas::helper::deallocate(gpu_param, q); +} - for (auto size : rotm_params) { +template +void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success, + std::string mem_type, + std::vector params) { + for (auto size : params) { auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr, index_t size, bool* success) { - run(st, sb_handle_ptr, size, success); + run(st, sb_handle_ptr, size, success); }; + benchmark::RegisterBenchmark( blas_benchmark::utils::get_name( - size, blas_benchmark::utils::MEM_TYPE_BUFFER) - .c_str(), + size, mem_type).c_str(), BM_lambda, sb_handle_ptr, size, success) ->UseRealTime(); } } +template +void register_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { + auto rotm_params = blas_benchmark::utils::get_blas1_params(args); + + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, rotm_params); +#ifdef SB_ENABLE_USM + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, rotm_params); +#endif +} + namespace blas_benchmark { void create_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr, bool* success) { diff --git a/benchmark/syclblas/blas1/rotmg.cpp b/benchmark/syclblas/blas1/rotmg.cpp index b89072afd..6cfef99fa 100644 --- a/benchmark/syclblas/blas1/rotmg.cpp +++ b/benchmark/syclblas/blas1/rotmg.cpp @@ -28,7 +28,7 @@ constexpr blas_benchmark::utils::Level1Op benchmark_op = blas_benchmark::utils::Level1Op::rotmg; -template +template void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, bool* success) { // initialize the state label @@ -47,13 +47,22 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, scalar_t y1 = blas_benchmark::utils::random_data(1)[0]; blas::SB_Handle& sb_handle = *sb_handle_ptr; - - auto buf_d1 = blas::make_sycl_iterator_buffer(&d1, 1); - auto buf_d2 = blas::make_sycl_iterator_buffer(&d2, 1); - auto buf_x1 = blas::make_sycl_iterator_buffer(&x1, 1); - auto buf_y1 = blas::make_sycl_iterator_buffer(&y1, 1); - auto buf_param = blas::make_sycl_iterator_buffer(param, param_size); - + auto q = sb_handle.get_queue(); + + auto buf_d1 = 
blas::helper::allocate(1, q); + auto buf_d2 = blas::helper::allocate(1, q); + auto buf_x1 = blas::helper::allocate(1, q); + auto buf_y1 = blas::helper::allocate(1, q); + auto buf_param = blas::helper::allocate(param_size, q); + + auto copy_d1 = blas::helper::copy_to_device(q, &d1, buf_d1, 1); + auto copy_d2 = blas::helper::copy_to_device(q, &d2, buf_d2, 1); + auto copy_x1 = blas::helper::copy_to_device(q, &x1, buf_x1, 1); + auto copy_y1 = blas::helper::copy_to_device(q, &y1, buf_y1, 1); + auto copy_param = blas::helper::copy_to_device( + q, param.data(), buf_param, param_size); + + sb_handle.wait({copy_d1, copy_d2, copy_x1, copy_y1, copy_param}); #ifdef BLAS_VERIFY_BENCHMARK // Run a first time with a verification of the results scalar_t d1_ref = d1; @@ -68,29 +77,48 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, scalar_t y1_verify = y1; std::vector param_verify = param; - auto buf_verify_d1 = blas::make_sycl_iterator_buffer(&d1_verify, 1); - auto buf_verify_d2 = blas::make_sycl_iterator_buffer(&d2_verify, 1); - auto buf_verify_x1 = blas::make_sycl_iterator_buffer(&x1_verify, 1); - auto buf_verify_y1 = blas::make_sycl_iterator_buffer(&y1_verify, 1); - auto device_param = - blas::make_sycl_iterator_buffer(param_verify, param_size); - reference_blas::rotmg(&d1_ref, &d2_ref, &x1_ref, &y1_ref, param_ref.data()); - _rotmg(sb_handle, buf_verify_d1, buf_verify_d2, buf_verify_x1, buf_verify_y1, - device_param); - - auto event1 = blas::helper::copy_to_host(sb_handle.get_queue(), buf_verify_d1, - &d1_verify, 1); - auto event2 = blas::helper::copy_to_host(sb_handle.get_queue(), buf_verify_d2, - &d2_verify, 1); - auto event3 = blas::helper::copy_to_host(sb_handle.get_queue(), buf_verify_x1, - &x1_verify, 1); - auto event4 = blas::helper::copy_to_host(sb_handle.get_queue(), buf_verify_y1, - &y1_verify, 1); - auto event5 = blas::helper::copy_to_host(sb_handle.get_queue(), device_param, - param_verify.data(), param_size); - - sb_handle.wait({event1, event2, event3, event4, event5}); + { + auto buf_verify_d1 = blas::helper::allocate(1, q); + auto buf_verify_d2 = blas::helper::allocate(1, q); + auto buf_verify_x1 = blas::helper::allocate(1, q); + auto buf_verify_y1 = blas::helper::allocate(1, q); + auto buf_verify_param = + blas::helper::allocate(param_size, q); + + auto copy_verify_d1 = + blas::helper::copy_to_device(q, &d1_verify, buf_verify_d1, 1); + auto copy_verify_d2 = + blas::helper::copy_to_device(q, &d2_verify, buf_verify_d2, 1); + auto copy_verify_x1 = + blas::helper::copy_to_device(q, &x1_verify, buf_verify_x1, 1); + auto copy_verify_y1 = + blas::helper::copy_to_device(q, &y1_verify, buf_verify_y1, 1); + auto copy_verify_param = blas::helper::copy_to_device( + q, param_verify.data(), buf_verify_param, param_size); + + sb_handle.wait({copy_verify_d1, copy_verify_d2, copy_verify_x1, + copy_verify_y1, copy_verify_param}); + + auto rotmg_event = _rotmg(sb_handle, buf_verify_d1, buf_verify_d2, + buf_verify_x1, buf_verify_y1, buf_verify_param); + sb_handle.wait(rotmg_event); + + auto event1 = blas::helper::copy_to_host(q, buf_verify_d1, &d1_verify, 1); + auto event2 = blas::helper::copy_to_host(q, buf_verify_d2, &d2_verify, 1); + auto event3 = blas::helper::copy_to_host(q, buf_verify_x1, &x1_verify, 1); + auto event4 = blas::helper::copy_to_host(q, buf_verify_y1, &y1_verify, 1); + auto event5 = blas::helper::copy_to_host(q, buf_verify_param, + param_verify.data(), param_size); + + sb_handle.wait({event1, event2, event3, event4, event5}); + + 
blas::helper::deallocate(buf_verify_d1, q); + blas::helper::deallocate(buf_verify_d2, q); + blas::helper::deallocate(buf_verify_x1, q); + blas::helper::deallocate(buf_verify_y1, q); + blas::helper::deallocate(buf_verify_param, q); + } const bool isAlmostEqual = utils::almost_equal(d1_verify, d1_ref) && utils::almost_equal(d2_verify, d2_ref) && @@ -132,23 +160,39 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, state.counters["bytes_processed"]); blas_benchmark::utils::calc_avg_counters(state); + + blas::helper::deallocate(buf_d1, q); + blas::helper::deallocate(buf_d2, q); + blas::helper::deallocate(buf_x1, q); + blas::helper::deallocate(buf_y1, q); + blas::helper::deallocate(buf_param, q); }; -template -void register_benchmark(blas_benchmark::Args& args, - blas::SB_Handle* sb_handle_ptr, bool* success) { +template +void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success, + std::string mem_type) { auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr, bool* success) { - run(st, sb_handle_ptr, success); + run(st, sb_handle_ptr, success); }; benchmark::RegisterBenchmark( blas_benchmark::utils::get_name( - blas_benchmark::utils::MEM_TYPE_BUFFER) - .c_str(), + mem_type).c_str(), BM_lambda, sb_handle_ptr, success) ->UseRealTime(); } +template +void register_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER); +#ifdef SB_ENABLE_USM + register_benchmark(sb_handle_ptr, + success, blas_benchmark::utils::MEM_TYPE_USM); +#endif +} + namespace blas_benchmark { void create_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr, bool* success) { diff --git a/benchmark/syclblas/blas1/scal.cpp b/benchmark/syclblas/blas1/scal.cpp index 3fcf6dbf9..bfc7d7b8a 100644 --- a/benchmark/syclblas/blas1/scal.cpp +++ b/benchmark/syclblas/blas1/scal.cpp @@ -28,7 +28,7 @@ constexpr blas_benchmark::utils::Level1Op benchmark_op = blas_benchmark::utils::Level1Op::scal; -template +template void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, bool* success) { // initialize the state label @@ -40,12 +40,16 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, blas_benchmark::utils::Level1Op::scal, scalar_t>(state, size); blas::SB_Handle& sb_handle = *sb_handle_ptr; + auto q = sb_handle.get_queue(); // Create data std::vector v1 = blas_benchmark::utils::random_data(size); auto alpha = blas_benchmark::utils::random_scalar(); - auto in = blas::make_sycl_iterator_buffer(v1, size); + auto in = blas::helper::allocate(size, q); + auto copy_in = blas::helper::copy_to_device(q, v1.data(), in, size); + + sb_handle.wait({copy_in}); #ifdef BLAS_VERIFY_BENCHMARK // Run a first time with a verification of the results @@ -53,10 +57,18 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, reference_blas::scal(size, alpha, v1_ref.data(), 1); std::vector v1_temp = v1; { - auto v1_temp_gpu = blas::make_sycl_iterator_buffer(v1_temp, size); - auto event = + auto v1_temp_gpu = blas::helper::allocate(size, q); + auto copy_temp = blas::helper::copy_to_device(q, v1_temp.data(), + v1_temp_gpu, size); + sb_handle.wait({copy_temp}); + auto scal_event = _scal(sb_handle, size, alpha, v1_temp_gpu, static_cast(1)); - sb_handle.wait(event); + sb_handle.wait(scal_event); + auto copy_output = + blas::helper::copy_to_host(q, v1_temp_gpu, v1_temp.data(), size); + 
sb_handle.wait(copy_output); + + blas::helper::deallocate(v1_temp_gpu, q); } std::ostringstream err_stream; @@ -94,30 +106,43 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, state.counters["bytes_processed"]); blas_benchmark::utils::calc_avg_counters(state); -} -template -void register_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr, - bool* success) { - auto scal_params = blas_benchmark::utils::get_blas1_params(args); + blas::helper::deallocate(in, q); +} - for (auto size : scal_params) { +template +void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success, + std::string mem_type, + std::vector params) { + for (auto size : params) { auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr, index_t size, bool* success) { - run(st, sb_handle_ptr, size, success); + run(st, sb_handle_ptr, size, success); }; benchmark::RegisterBenchmark( blas_benchmark::utils::get_name( - size, blas_benchmark::utils::MEM_TYPE_BUFFER) - .c_str(), + size, mem_type).c_str(), BM_lambda, sb_handle_ptr, size, success) ->UseRealTime(); } } +template +void register_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { + auto scal_params = blas_benchmark::utils::get_blas1_params(args); + + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, scal_params); +#ifdef SB_ENABLE_USM + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, scal_params); +#endif +} + namespace blas_benchmark { -void create_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr, - bool* success) { +void create_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { BLAS_REGISTER_BENCHMARK(args, sb_handle_ptr, success); } } // namespace blas_benchmark diff --git a/benchmark/syclblas/blas1/sdsdot.cpp b/benchmark/syclblas/blas1/sdsdot.cpp index 649f422ca..a70b7f40a 100644 --- a/benchmark/syclblas/blas1/sdsdot.cpp +++ b/benchmark/syclblas/blas1/sdsdot.cpp @@ -28,7 +28,7 @@ constexpr blas_benchmark::utils::Level1Op benchmark_op = blas_benchmark::utils::Level1Op::sdsdot; -template +template void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, bool* success) { // initialize the state label @@ -40,15 +40,21 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, blas_benchmark::utils::Level1Op::sdsdot, scalar_t>(state, size); blas::SB_Handle& sb_handle = *sb_handle_ptr; + auto q = sb_handle.get_queue(); // Create data const float sb = blas_benchmark::utils::random_data(1)[0]; std::vector v1 = blas_benchmark::utils::random_data(size); std::vector v2 = blas_benchmark::utils::random_data(size); - auto inx = blas::make_sycl_iterator_buffer(v1, size); - auto iny = blas::make_sycl_iterator_buffer(v2, size); - auto inr = blas::make_sycl_iterator_buffer(1); + auto inx = blas::helper::allocate(size, q); + auto iny = blas::helper::allocate(size, q); + auto inr = blas::helper::allocate(1, q); + + auto copy_x = blas::helper::copy_to_device(q, v1.data(), inx, size); + auto copy_y = blas::helper::copy_to_device(q, v2.data(), iny, size); + + sb_handle.wait({copy_x, copy_y}); #ifdef BLAS_VERIFY_BENCHMARK // Run a first time with a verification of the results @@ -56,12 +62,15 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, reference_blas::sdsdot(size, sb, v1.data(), 1, v2.data(), 1); scalar_t vr_temp = 0; { - auto vr_temp_gpu = 
blas::make_sycl_iterator_buffer(&vr_temp, 1); - _sdsdot(sb_handle, size, sb, inx, static_cast(1), iny, - static_cast(1), vr_temp_gpu); - auto event = blas::helper::copy_to_host(sb_handle.get_queue(), vr_temp_gpu, - &vr_temp, 1); + auto vr_temp_gpu = blas::helper::allocate(1, q); + auto sdsdot_event = + _sdsdot(sb_handle, size, sb, inx, static_cast(1), iny, + static_cast(1), vr_temp_gpu); + sb_handle.wait(sdsdot_event); + auto event = blas::helper::copy_to_host(q, vr_temp_gpu, &vr_temp, 1); sb_handle.wait(event); + + blas::helper::deallocate(vr_temp_gpu, q); } if (!utils::almost_equal(vr_temp, vr_ref)) { @@ -101,30 +110,45 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, state.counters["bytes_processed"]); blas_benchmark::utils::calc_avg_counters(state); -} -template -void register_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr, - bool* success) { - auto sdsdot_params = blas_benchmark::utils::get_blas1_params(args); + blas::helper::deallocate(inx, q); + blas::helper::deallocate(iny, q); + blas::helper::deallocate(inr, q); +} - for (auto size : sdsdot_params) { +template +void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success, + std::string mem_type, + std::vector params) { + for (auto size : params) { auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr, index_t size, bool* success) { - run(st, sb_handle_ptr, size, success); + run(st, sb_handle_ptr, size, success); }; benchmark::RegisterBenchmark( blas_benchmark::utils::get_name( - size, blas_benchmark::utils::MEM_TYPE_BUFFER) - .c_str(), + size, mem_type).c_str(), BM_lambda, sb_handle_ptr, size, success) ->UseRealTime(); } } +template +void register_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { + auto sdsdot_params = blas_benchmark::utils::get_blas1_params(args); + + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, sdsdot_params); +#ifdef SB_ENABLE_USM + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, sdsdot_params); +#endif +} + namespace blas_benchmark { -void create_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr, - bool* success) { +void create_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { BLAS_REGISTER_BENCHMARK_FLOAT(args, sb_handle_ptr, success); } } // namespace blas_benchmark diff --git a/benchmark/syclblas/blas2/gbmv.cpp b/benchmark/syclblas/blas2/gbmv.cpp index 1c4fc0cc7..8485c6e85 100644 --- a/benchmark/syclblas/blas2/gbmv.cpp +++ b/benchmark/syclblas/blas2/gbmv.cpp @@ -28,7 +28,7 @@ constexpr blas_benchmark::utils::Level2Op benchmark_op = blas_benchmark::utils::Level2Op::gbmv; -template +template void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti, index_t m, index_t n, index_t kl, index_t ku, scalar_t alpha, scalar_t beta, bool* success) { @@ -53,6 +53,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti, 0, ku, kl); blas::SB_Handle& sb_handle = *sb_handle_ptr; + auto q = sb_handle.get_queue(); // Input matrix/vector, output vector. 
std::vector m_a = @@ -62,10 +63,18 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti, std::vector v_y = blas_benchmark::utils::random_data(ylen); - auto m_a_gpu = blas::make_sycl_iterator_buffer(m_a, lda * n); - auto v_x_gpu = blas::make_sycl_iterator_buffer(v_x, xlen); - auto v_y_gpu = blas::make_sycl_iterator_buffer(v_y, ylen); + auto m_a_gpu = blas::helper::allocate(lda * n, q); + auto v_x_gpu = blas::helper::allocate(xlen, q); + auto v_y_gpu = blas::helper::allocate(ylen, q); + + auto copy_a = + blas::helper::copy_to_device(q, m_a.data(), m_a_gpu, lda * n); + auto copy_x = + blas::helper::copy_to_device(q, v_x.data(), v_x_gpu, xlen); + auto copy_y = + blas::helper::copy_to_device(q, v_y.data(), v_y_gpu, ylen); + sb_handle.wait({copy_a, copy_x, copy_y}); #ifdef BLAS_VERIFY_BENCHMARK // Run a first time with a verification of the results std::vector v_y_ref = v_y; @@ -73,11 +82,18 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti, incX, beta, v_y_ref.data(), incY); std::vector v_y_temp = v_y; { - auto v_y_temp_gpu = - blas::make_sycl_iterator_buffer(v_y_temp, ylen); - auto event = _gbmv(sb_handle, *t_str, m, n, kl, ku, alpha, m_a_gpu, lda, - v_x_gpu, incX, beta, v_y_temp_gpu, incY); - sb_handle.wait(); + auto v_y_temp_gpu = blas::helper::allocate(ylen, q); + auto copy_temp = blas::helper::copy_to_device(q, v_y_temp.data(), + v_y_temp_gpu, ylen); + sb_handle.wait({copy_temp}); + auto gbmv_event = _gbmv(sb_handle, *t_str, m, n, kl, ku, alpha, m_a_gpu, + lda, v_x_gpu, incX, beta, v_y_temp_gpu, incY); + sb_handle.wait({gbmv_event}); + auto copy_out = blas::helper::copy_to_host(q, v_y_temp_gpu, + v_y_temp.data(), ylen); + sb_handle.wait({copy_out}); + + blas::helper::deallocate(v_y_temp_gpu, q); } std::ostringstream err_stream; @@ -116,14 +132,17 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti, state.counters["bytes_processed"]); blas_benchmark::utils::calc_avg_counters(state); -} -template -void register_benchmark(blas_benchmark::Args& args, - blas::SB_Handle* sb_handle_ptr, bool* success) { - auto gbmv_params = blas_benchmark::utils::get_gbmv_params(args); + blas::helper::deallocate(m_a_gpu, q); + blas::helper::deallocate(v_x_gpu, q); + blas::helper::deallocate(v_y_gpu, q); +} - for (auto p : gbmv_params) { +template +void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success, + std::string mem_type, + std::vector> params) { + for (auto p : params) { std::string ts; index_t m, n, kl, ku; scalar_t alpha, beta; @@ -133,17 +152,30 @@ void register_benchmark(blas_benchmark::Args& args, auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr, int t, index_t m, index_t n, index_t kl, index_t ku, scalar_t alpha, scalar_t beta, bool* success) { - run(st, sb_handle_ptr, t, m, n, kl, ku, alpha, beta, success); + run(st, sb_handle_ptr, t, m, n, kl, ku, alpha, beta, + success); }; benchmark::RegisterBenchmark( blas_benchmark::utils::get_name( - ts, m, n, kl, ku, blas_benchmark::utils::MEM_TYPE_BUFFER) - .c_str(), + ts, m, n, kl, ku, mem_type).c_str(), BM_lambda, sb_handle_ptr, t, m, n, kl, ku, alpha, beta, success) ->UseRealTime(); } } +template +void register_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { + auto gbmv_params = blas_benchmark::utils::get_gbmv_params(args); + + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, gbmv_params); +#ifdef SB_ENABLE_USM + register_benchmark( + sb_handle_ptr, success, 
blas_benchmark::utils::MEM_TYPE_USM, gbmv_params); +#endif +} + namespace blas_benchmark { void create_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr, bool* success) { diff --git a/benchmark/syclblas/blas2/gemv.cpp b/benchmark/syclblas/blas2/gemv.cpp index 2d2f5e822..4692b6c08 100644 --- a/benchmark/syclblas/blas2/gemv.cpp +++ b/benchmark/syclblas/blas2/gemv.cpp @@ -28,7 +28,7 @@ constexpr blas_benchmark::utils::Level2Op benchmark_op = blas_benchmark::utils::Level2Op::gemv; -template +template void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti, index_t m, index_t n, scalar_t alpha, scalar_t beta, bool* success) { // initialize the state label @@ -52,6 +52,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti, n); blas::SB_Handle& sb_handle = *sb_handle_ptr; + auto q = sb_handle.get_queue(); // Input matrix/vector, output vector. std::vector m_a = @@ -61,9 +62,18 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti, std::vector v_y = blas_benchmark::utils::random_data(ylen); - auto m_a_gpu = blas::make_sycl_iterator_buffer(m_a, m * n); - auto v_x_gpu = blas::make_sycl_iterator_buffer(v_x, xlen); - auto v_y_gpu = blas::make_sycl_iterator_buffer(v_y, ylen); + auto m_a_gpu = blas::helper::allocate(lda * n, q); + auto v_x_gpu = blas::helper::allocate(xlen, q); + auto v_y_gpu = blas::helper::allocate(ylen, q); + + auto copy_a = + blas::helper::copy_to_device(q, m_a.data(), m_a_gpu, lda * n); + auto copy_x = + blas::helper::copy_to_device(q, v_x.data(), v_x_gpu, xlen); + auto copy_y = + blas::helper::copy_to_device(q, v_y.data(), v_y_gpu, ylen); + + sb_handle.wait({copy_a, copy_x, copy_y}); #ifdef BLAS_VERIFY_BENCHMARK // Run a first time with a verification of the results @@ -72,11 +82,18 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti, beta, v_y_ref.data(), incY); std::vector v_y_temp = v_y; { - auto v_y_temp_gpu = - blas::make_sycl_iterator_buffer(v_y_temp, ylen); - auto event = _gemv(sb_handle, *t_str, m, n, alpha, m_a_gpu, m, v_x_gpu, - incX, beta, v_y_temp_gpu, incY); - sb_handle.wait(); + auto v_y_temp_gpu = blas::helper::allocate(ylen, q); + auto copy_temp = blas::helper::copy_to_device(q, v_y_temp.data(), + v_y_temp_gpu, ylen); + sb_handle.wait({copy_temp}); + auto gemv_event = _gemv(sb_handle, *t_str, m, n, alpha, m_a_gpu, m, v_x_gpu, + incX, beta, v_y_temp_gpu, incY); + sb_handle.wait({gemv_event}); + auto copy_out = blas::helper::copy_to_host(q, v_y_temp_gpu, + v_y_temp.data(), ylen); + sb_handle.wait({copy_out}); + + blas::helper::deallocate(v_y_temp_gpu, q); } std::ostringstream err_stream; @@ -115,34 +132,50 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti, state.counters["bytes_processed"]); blas_benchmark::utils::calc_avg_counters(state); -} -template -void register_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr, - bool* success) { - auto gemv_params = blas_benchmark::utils::get_blas2_params(args); + blas::helper::deallocate(m_a_gpu, q); + blas::helper::deallocate(v_x_gpu, q); + blas::helper::deallocate(v_y_gpu, q); +} - for (auto p : gemv_params) { +template +void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success, + std::string mem_type, + std::vector> params) { + for (auto p : params) { std::string ts; index_t m, n; scalar_t alpha, beta; std::tie(ts, m, n, alpha, beta) = p; int t = static_cast(blas_benchmark::utils::to_transpose_enum(ts)); - auto BM_lambda = [&](benchmark::State& st, 
blas::SB_Handle* sb_handle_ptr, int t, - index_t m, index_t n, scalar_t alpha, scalar_t beta, - bool* success) { - run(st, sb_handle_ptr, t, m, n, alpha, beta, success); + auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr, + int t, index_t m, index_t n, scalar_t alpha, + scalar_t beta, bool* success) { + run(st, sb_handle_ptr, t, m, n, alpha, beta, + success); }; benchmark::RegisterBenchmark( blas_benchmark::utils::get_name( - ts, m, n, blas_benchmark::utils::MEM_TYPE_BUFFER) - .c_str(), + ts, m, n, mem_type).c_str(), BM_lambda, sb_handle_ptr, t, m, n, alpha, beta, success) ->UseRealTime(); } } +template +void register_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { + auto gemv_params = blas_benchmark::utils::get_blas2_params(args); + + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, gemv_params); +#ifdef SB_ENABLE_USM + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, gemv_params); +#endif +} + namespace blas_benchmark { void create_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr, bool* success) { diff --git a/benchmark/syclblas/blas2/ger.cpp b/benchmark/syclblas/blas2/ger.cpp index 1968f1b55..c7965b515 100644 --- a/benchmark/syclblas/blas2/ger.cpp +++ b/benchmark/syclblas/blas2/ger.cpp @@ -28,7 +28,7 @@ constexpr blas_benchmark::utils::Level2Op benchmark_op = blas_benchmark::utils::Level2Op::ger; -template +template void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t m, index_t n, scalar_t alpha, bool* success) { // initialize the state label @@ -48,6 +48,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t m, blas_benchmark::utils::Level2Op::ger, scalar_t>(state, "n", 0, m, n); blas::SB_Handle& sb_handle = *sb_handle_ptr; + auto q = sb_handle.get_queue(); // Input matrix/vector, output vector. 
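The gbmv and gemv hunks above, and every level-2 benchmark that follows, replace the old `make_sycl_iterator_buffer` setup with the same three steps: allocate a device container per operand, enqueue one `copy_to_device` per operand, and wait on all of the copy events before anything is timed; matching `deallocate` calls are appended once the counters have been computed. A condensed sketch of that setup is below. The non-type parameter `mem_alloc` and the template arguments to `allocate` are assumptions, since the angle brackets are missing from this copy of the patch.

```cpp
#include <vector>
#include "sycl_blas.h"  // assumed umbrella header

// Sketch of the operand upload/teardown now shared by the level-2 benchmarks.
// mem_alloc is assumed to be a blas::helper::AllocType non-type parameter.
template <typename scalar_t, blas::helper::AllocType mem_alloc,
          typename index_t>
void upload_run_teardown(blas::SB_Handle& sb_handle, index_t lda, index_t n,
                         index_t xlen, index_t ylen,
                         const std::vector<scalar_t>& m_a,
                         const std::vector<scalar_t>& v_x,
                         const std::vector<scalar_t>& v_y) {
  auto q = sb_handle.get_queue();
  auto m_a_gpu = blas::helper::allocate<mem_alloc, scalar_t>(lda * n, q);
  auto v_x_gpu = blas::helper::allocate<mem_alloc, scalar_t>(xlen, q);
  auto v_y_gpu = blas::helper::allocate<mem_alloc, scalar_t>(ylen, q);

  auto copy_a = blas::helper::copy_to_device(q, m_a.data(), m_a_gpu, lda * n);
  auto copy_x = blas::helper::copy_to_device(q, v_x.data(), v_x_gpu, xlen);
  auto copy_y = blas::helper::copy_to_device(q, v_y.data(), v_y_gpu, ylen);
  // Block on all three uploads before the timed section starts.
  sb_handle.wait({copy_a, copy_x, copy_y});

  // ... run and time the operator here ...

  // Freed only after the benchmark counters have been computed.
  blas::helper::deallocate(m_a_gpu, q);
  blas::helper::deallocate(v_x_gpu, q);
  blas::helper::deallocate(v_y_gpu, q);
}
```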
std::vector m_a = @@ -57,9 +58,18 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t m, std::vector v_y = blas_benchmark::utils::random_data(ylen); - auto m_a_gpu = blas::make_sycl_iterator_buffer(m_a, m * n); - auto v_x_gpu = blas::make_sycl_iterator_buffer(v_x, xlen); - auto v_y_gpu = blas::make_sycl_iterator_buffer(v_y, ylen); + auto m_a_gpu = blas::helper::allocate(m * n, q); + auto v_x_gpu = blas::helper::allocate(xlen, q); + auto v_y_gpu = blas::helper::allocate(ylen, q); + + auto copy_m = + blas::helper::copy_to_device(q, m_a.data(), m_a_gpu, m * n); + auto copy_x = + blas::helper::copy_to_device(q, v_x.data(), v_x_gpu, xlen); + auto copy_y = + blas::helper::copy_to_device(q, v_y.data(), v_y_gpu, ylen); + + sb_handle.wait({copy_m, copy_x, copy_y}); #ifdef BLAS_VERIFY_BENCHMARK // Run a first time with a verification of the results @@ -69,11 +79,18 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t m, std::vector m_a_temp(m_a); { - auto m_a_temp_gpu = - blas::make_sycl_iterator_buffer(m_a_temp, m * n); - auto event = _ger(sb_handle, m, n, alpha, v_x_gpu, incX, v_y_gpu, incY, - m_a_temp_gpu, lda); - sb_handle.wait(); + auto m_a_temp_gpu = blas::helper::allocate(m * n, q); + auto copy_temp = blas::helper::copy_to_device( + q, m_a_temp.data(), m_a_temp_gpu, m * n); + sb_handle.wait(copy_temp); + auto ger_event = _ger(sb_handle, m, n, alpha, v_x_gpu, incX, v_y_gpu, incY, + m_a_temp_gpu, lda); + sb_handle.wait(ger_event); + auto copy_out = blas::helper::copy_to_host( + q, m_a_temp_gpu, m_a_temp.data(), m * n); + sb_handle.wait(copy_out); + + blas::helper::deallocate(m_a_temp_gpu, q); } std::ostringstream err_stream; @@ -113,31 +130,45 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t m, state.counters["bytes_processed"]); blas_benchmark::utils::calc_avg_counters(state); -} -template -void register_benchmark(blas_benchmark::Args& args, - blas::SB_Handle* sb_handle_ptr, bool* success) { - auto ger_params = blas_benchmark::utils::get_ger_params(args); + blas::helper::deallocate(m_a_gpu, q); + blas::helper::deallocate(v_x_gpu, q); + blas::helper::deallocate(v_y_gpu, q); +} - for (auto p : ger_params) { +template +void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success, + std::string mem_type, + std::vector> params) { + for (auto p : params) { index_t m, n; scalar_t alpha; std::tie(m, n, alpha) = p; auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr, index_t m, index_t n, scalar_t alpha, bool* success) { - run(st, sb_handle_ptr, m, n, alpha, success); + run(st, sb_handle_ptr, m, n, alpha, success); }; benchmark::RegisterBenchmark( blas_benchmark::utils::get_name( - m, n, blas_benchmark::utils::MEM_TYPE_BUFFER) - .c_str(), + m, n, mem_type).c_str(), BM_lambda, sb_handle_ptr, m, n, alpha, success) ->UseRealTime(); } } +template +void register_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { + auto ger_params = blas_benchmark::utils::get_ger_params(args); + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, ger_params); +#ifdef SB_ENABLE_USM + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, ger_params); +#endif +} + namespace blas_benchmark { void create_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr, bool* success) { diff --git a/benchmark/syclblas/blas2/sbmv.cpp b/benchmark/syclblas/blas2/sbmv.cpp index 7ae2ff4e5..be648326e 100644 --- 
a/benchmark/syclblas/blas2/sbmv.cpp +++ b/benchmark/syclblas/blas2/sbmv.cpp @@ -28,7 +28,7 @@ constexpr blas_benchmark::utils::Level2Op benchmark_op = blas_benchmark::utils::Level2Op::sbmv; -template +template void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, std::string uplo, index_t n, index_t k, scalar_t alpha, scalar_t beta, bool* success) { @@ -51,6 +51,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, k); blas::SB_Handle& sb_handle = *sb_handle_ptr; + auto q = sb_handle.get_queue(); // Input matrix/vector, output vector. std::vector m_a = @@ -60,9 +61,18 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, std::vector v_y = blas_benchmark::utils::random_data(ylen); - auto m_a_gpu = blas::make_sycl_iterator_buffer(m_a, lda * n); - auto v_x_gpu = blas::make_sycl_iterator_buffer(v_x, xlen); - auto v_y_gpu = blas::make_sycl_iterator_buffer(v_y, ylen); + auto m_a_gpu = blas::helper::allocate(lda * n, q); + auto v_x_gpu = blas::helper::allocate(xlen, q); + auto v_y_gpu = blas::helper::allocate(ylen, q); + + auto copy_a = + blas::helper::copy_to_device(q, m_a.data(), m_a_gpu, lda * n); + auto copy_x = + blas::helper::copy_to_device(q, v_x.data(), v_x_gpu, xlen); + auto copy_y = + blas::helper::copy_to_device(q, v_y.data(), v_y_gpu, ylen); + + sb_handle.wait({copy_a, copy_x, copy_y}); #ifdef BLAS_VERIFY_BENCHMARK // Run a first time with a verification of the results @@ -71,11 +81,18 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, beta, v_y_ref.data(), incY); std::vector v_y_temp = v_y; { - auto v_y_temp_gpu = - blas::make_sycl_iterator_buffer(v_y_temp, ylen); - auto event = _sbmv(sb_handle, *uplo_str, n, k, alpha, m_a_gpu, lda, v_x_gpu, - incX, beta, v_y_temp_gpu, incY); - sb_handle.wait(); + auto v_y_temp_gpu = blas::helper::allocate(ylen, q); + auto copy_temp = blas::helper::copy_to_device(q, v_y_temp.data(), + v_y_temp_gpu, ylen); + sb_handle.wait({copy_temp}); + auto sbmv_event = _sbmv(sb_handle, *uplo_str, n, k, alpha, m_a_gpu, lda, + v_x_gpu, incX, beta, v_y_temp_gpu, incY); + sb_handle.wait({sbmv_event}); + auto copy_out = blas::helper::copy_to_host(q, v_y_temp_gpu, + v_y_temp.data(), ylen); + sb_handle.wait({copy_out}); + + blas::helper::deallocate(v_y_temp_gpu, q); } std::ostringstream err_stream; @@ -114,14 +131,17 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, state.counters["bytes_processed"]); blas_benchmark::utils::calc_avg_counters(state); -} -template -void register_benchmark(blas_benchmark::Args& args, - blas::SB_Handle* sb_handle_ptr, bool* success) { - auto sbmv_params = blas_benchmark::utils::get_sbmv_params(args); + blas::helper::deallocate(m_a_gpu, q); + blas::helper::deallocate(v_x_gpu, q); + blas::helper::deallocate(v_y_gpu, q); +} - for (auto p : sbmv_params) { +template +void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success, + std::string mem_type, + std::vector> params) { + for (auto p : params) { std::string uplos; index_t n, k; scalar_t alpha, beta; @@ -130,17 +150,30 @@ void register_benchmark(blas_benchmark::Args& args, auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr, std::string uplos, index_t n, index_t k, scalar_t alpha, scalar_t beta, bool* success) { - run(st, sb_handle_ptr, uplos, n, k, alpha, beta, success); + run(st, sb_handle_ptr, uplos, n, k, alpha, beta, + success); }; benchmark::RegisterBenchmark( blas_benchmark::utils::get_name( - uplos, n, k, blas_benchmark::utils::MEM_TYPE_BUFFER) - .c_str(), + uplos, n, k, 
mem_type).c_str(), BM_lambda, sb_handle_ptr, uplos, n, k, alpha, beta, success) ->UseRealTime(); } } +template +void register_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { + auto sbmv_params = blas_benchmark::utils::get_sbmv_params(args); + + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, sbmv_params); +#ifdef SB_ENABLE_USM + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, sbmv_params); +#endif +} + namespace blas_benchmark { void create_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr, bool* success) { diff --git a/benchmark/syclblas/blas2/spmv.cpp b/benchmark/syclblas/blas2/spmv.cpp index b003acce7..a07d52c59 100644 --- a/benchmark/syclblas/blas2/spmv.cpp +++ b/benchmark/syclblas/blas2/spmv.cpp @@ -28,7 +28,7 @@ constexpr blas_benchmark::utils::Level2Op benchmark_op = blas_benchmark::utils::Level2Op::spmv; -template +template void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, std::string uplo, index_t n, scalar_t alpha, scalar_t beta, bool* success) { @@ -48,6 +48,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, blas_benchmark::utils::Level2Op::spmv, scalar_t>(state, "n", beta, 0, n); blas::SB_Handle& sb_handle = *sb_handle_ptr; + auto q = sb_handle.get_queue(); // Input matrix/vector, output vector. std::vector m_a = @@ -57,9 +58,18 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, std::vector v_y = blas_benchmark::utils::random_data(ylen); - auto m_a_gpu = blas::make_sycl_iterator_buffer(m_a, m_a.size()); - auto v_x_gpu = blas::make_sycl_iterator_buffer(v_x, xlen); - auto v_y_gpu = blas::make_sycl_iterator_buffer(v_y, ylen); + auto m_a_gpu = blas::helper::allocate(m_a.size(), q); + auto v_x_gpu = blas::helper::allocate(xlen, q); + auto v_y_gpu = blas::helper::allocate(ylen, q); + + auto copy_m = blas::helper::copy_to_device(q, m_a.data(), m_a_gpu, + m_a.size()); + auto copy_x = + blas::helper::copy_to_device(q, v_x.data(), v_x_gpu, xlen); + auto copy_y = + blas::helper::copy_to_device(q, v_y.data(), v_y_gpu, ylen); + + sb_handle.wait({copy_m, copy_x, copy_y}); #ifdef BLAS_VERIFY_BENCHMARK // Run a first time with a verification of the results @@ -68,11 +78,19 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, v_y_ref.data(), incY); std::vector v_y_temp = v_y; { - auto v_y_temp_gpu = - blas::make_sycl_iterator_buffer(v_y_temp, ylen); - auto event = _spmv(sb_handle, *uplo_str, n, alpha, m_a_gpu, v_x_gpu, incX, - beta, v_y_temp_gpu, incY); - sb_handle.wait(); + auto v_y_temp_gpu = blas::helper::allocate(ylen, q); + auto copy_temp = blas::helper::copy_to_device(q, v_y_temp.data(), + v_y_temp_gpu, ylen); + sb_handle.wait(copy_temp); + + auto spmv_event = _spmv(sb_handle, *uplo_str, n, alpha, m_a_gpu, v_x_gpu, + incX, beta, v_y_temp_gpu, incY); + sb_handle.wait(spmv_event); + auto copy_out = blas::helper::copy_to_host(q, v_y_temp_gpu, + v_y_temp.data(), ylen); + sb_handle.wait(copy_out); + + blas::helper::deallocate(v_y_temp_gpu, q); } std::ostringstream err_stream; @@ -111,15 +129,17 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, state.counters["bytes_processed"]); blas_benchmark::utils::calc_avg_counters(state); -} -template -void register_benchmark(blas_benchmark::Args& args, - blas::SB_Handle* sb_handle_ptr, bool* success) { - // spmv and symv use the same set of params, so reuse the symv function - auto spmv_params = 
blas_benchmark::utils::get_symv_params(args); + blas::helper::deallocate(m_a_gpu, q); + blas::helper::deallocate(v_x_gpu, q); + blas::helper::deallocate(v_y_gpu, q); +} - for (auto p : spmv_params) { +template +void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success, + std::string mem_type, + std::vector> params) { + for (auto p : params) { std::string uplos; index_t n; scalar_t alpha, beta; @@ -128,17 +148,31 @@ void register_benchmark(blas_benchmark::Args& args, auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr, std::string uplos, index_t n, scalar_t alpha, scalar_t beta, bool* success) { - run(st, sb_handle_ptr, uplos, n, alpha, beta, success); + run(st, sb_handle_ptr, uplos, n, alpha, beta, + success); }; benchmark::RegisterBenchmark( blas_benchmark::utils::get_name( - uplos, n, alpha, beta, blas_benchmark::utils::MEM_TYPE_BUFFER) - .c_str(), + uplos, n, alpha, beta, mem_type).c_str(), BM_lambda, sb_handle_ptr, uplos, n, alpha, beta, success) ->UseRealTime(); } } +template +void register_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { + // spmv and symv use the same set of params, so reuse the symv function + auto spmv_params = blas_benchmark::utils::get_symv_params(args); + + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, spmv_params); +#ifdef SB_ENABLE_USM + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, spmv_params); +#endif +} + namespace blas_benchmark { void create_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr, bool* success) { diff --git a/benchmark/syclblas/blas2/spr.cpp b/benchmark/syclblas/blas2/spr.cpp index ee7f2ebd0..92ed03f94 100644 --- a/benchmark/syclblas/blas2/spr.cpp +++ b/benchmark/syclblas/blas2/spr.cpp @@ -28,7 +28,7 @@ constexpr blas_benchmark::utils::Level2Op benchmark_op = blas_benchmark::utils::Level2Op::spr; -template +template void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, char uplo, int size, scalar_t alpha, int incX, bool* success) { // initialize the state label @@ -39,6 +39,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, char uplo, blas_benchmark::utils::Level2Op::spr, scalar_t>(state, "n", 0, 0, size); blas::SB_Handle& sb_handle = *sb_handle_ptr; + auto q = sb_handle.get_queue(); const int m_size = size * size; const int v_size = 1 + (size - 1) * std::abs(incX); @@ -49,9 +50,15 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, char uplo, std::vector v_x = blas_benchmark::utils::random_data(v_size); - auto m_a_gpu = blas::make_sycl_iterator_buffer(m_a, m_size); - auto v_x_gpu = blas::make_sycl_iterator_buffer(v_x, v_size); + auto m_a_gpu = blas::helper::allocate(m_size, q); + auto v_x_gpu = blas::helper::allocate(v_size, q); + + auto copy_a = + blas::helper::copy_to_device(q, m_a.data(), m_a_gpu, m_size); + auto copy_x = + blas::helper::copy_to_device(q, v_x.data(), v_x_gpu, v_size); + sb_handle.wait({copy_a, copy_x}); #ifdef BLAS_VERIFY_BENCHMARK // Run a first time with a verification of the results std::vector x_ref = v_x; @@ -61,13 +68,20 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, char uplo, std::vector m_a_temp = m_a; { - auto m_a_temp_gpu = - blas::make_sycl_iterator_buffer(m_a_temp, m_size); - - blas::_spr(sb_handle, uplo, size, alpha, v_x_gpu, incX, - m_a_temp_gpu); - sb_handle.wait(); + auto m_a_temp_gpu = blas::helper::allocate(m_size, q); + auto copy_temp = 
blas::helper::copy_to_device( + q, m_a_temp.data(), m_a_temp_gpu, m_size); + sb_handle.wait({copy_temp}); + + auto spr_event = blas::_spr( + sb_handle, uplo, size, alpha, v_x_gpu, incX, m_a_temp_gpu); + sb_handle.wait({spr_event}); + auto copy_out = blas::helper::copy_to_host( + q, m_a_temp_gpu, m_a_temp.data(), m_size); + sb_handle.wait({copy_out}); + + blas::helper::deallocate(m_a_temp_gpu, q); } std::ostringstream err_stream; @@ -107,14 +121,16 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, char uplo, state.counters["bytes_processed"]); blas_benchmark::utils::calc_avg_counters(state); -} -template -void register_benchmark(blas_benchmark::Args& args, - blas::SB_Handle* sb_handle_ptr, bool* success) { - auto spr_params = blas_benchmark::utils::get_spr_params(args); + blas::helper::deallocate(m_a_gpu, q); + blas::helper::deallocate(v_x_gpu, q); +} - for (auto p : spr_params) { +template +void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success, + std::string mem_type, + std::vector> params) { + for (auto p : params) { index_t n, incX; std::string uplo; scalar_t alpha; @@ -125,17 +141,30 @@ void register_benchmark(blas_benchmark::Args& args, auto BM_lambda_col = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr, char uplo, int size, scalar_t alpha, int incX, bool* success) { - run(st, sb_handle_ptr, uplo, size, alpha, incX, success); + run(st, sb_handle_ptr, uplo, size, alpha, incX, + success); }; benchmark::RegisterBenchmark( blas_benchmark::utils::get_name( - uplo, n, alpha, incX, blas_benchmark::utils::MEM_TYPE_BUFFER) - .c_str(), + uplo, n, alpha, incX, mem_type).c_str(), BM_lambda_col, sb_handle_ptr, uplo_c, n, alpha, incX, success) ->UseRealTime(); } } +template +void register_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { + auto spr_params = blas_benchmark::utils::get_spr_params(args); + + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, spr_params); +#ifdef SB_ENABLE_USM + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, spr_params); +#endif +} + namespace blas_benchmark { void create_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr, bool* success) { diff --git a/benchmark/syclblas/blas2/spr2.cpp b/benchmark/syclblas/blas2/spr2.cpp index 8a1b3f324..da86bf68e 100644 --- a/benchmark/syclblas/blas2/spr2.cpp +++ b/benchmark/syclblas/blas2/spr2.cpp @@ -28,9 +28,9 @@ constexpr blas_benchmark::utils::Level2Op benchmark_op = blas_benchmark::utils::Level2Op::spr2; -template +template void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, char uplo, - int n, scalar_t alpha, int incX, int incY, bool* success) { + index_t n, scalar_t alpha, index_t incX, index_t incY, bool* success) { // initialize the state label blas_benchmark::utils::set_benchmark_label( state, sb_handle_ptr->get_queue()); @@ -39,10 +39,11 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, char uplo, blas_benchmark::utils::Level2Op::spr2, scalar_t>(state, "n", 0, 0, n); blas::SB_Handle& sb_handle = *sb_handle_ptr; + auto q = sb_handle.get_queue(); - const int m_size = n * n; - const int vx_size = 1 + (n - 1) * std::abs(incX); - const int vy_size = 1 + (n - 1) * std::abs(incY); + const index_t m_size = n * n; + const index_t vx_size = 1 + (n - 1) * std::abs(incX); + const index_t vy_size = 1 + (n - 1) * std::abs(incY); // Input matrix/vector, output vector. 
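One incidental cleanup in the spr2 hunk above: the benchmark previously took its size and increments as plain `int`, and the patch switches them, along with the derived sizes, to `index_t` so they match the other level-2 benchmarks and the library's index type, avoiding narrowing when a build uses an index type wider than `int`. The derived lengths follow the usual BLAS storage convention, repeated here with a comment:

```cpp
// From the spr2 hunk above: sizes and increments are now index_t, and vector
// lengths use the standard BLAS formula 1 + (n - 1) * |inc|.
const index_t m_size = n * n;
const index_t vx_size = 1 + (n - 1) * std::abs(incX);
const index_t vy_size = 1 + (n - 1) * std::abs(incY);
```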
std::vector m_a = @@ -52,9 +53,18 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, char uplo, std::vector v_y = blas_benchmark::utils::random_data(vy_size); - auto m_a_gpu = blas::make_sycl_iterator_buffer(m_a, m_size); - auto v_x_gpu = blas::make_sycl_iterator_buffer(v_x, vx_size); - auto v_y_gpu = blas::make_sycl_iterator_buffer(v_y, vy_size); + auto m_a_gpu = blas::helper::allocate(m_size, q); + auto v_x_gpu = blas::helper::allocate(vx_size, q); + auto v_y_gpu = blas::helper::allocate(vy_size, q); + + auto copy_a = + blas::helper::copy_to_device(q, m_a.data(), m_a_gpu, m_size); + auto copy_x = + blas::helper::copy_to_device(q, v_x.data(), v_x_gpu, vx_size); + auto copy_y = + blas::helper::copy_to_device(q, v_y.data(), v_y_gpu, vy_size); + + sb_handle.wait({copy_a, copy_x, copy_y}); #ifdef BLAS_VERIFY_BENCHMARK // Run a first time with a verification of the results @@ -66,12 +76,19 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, char uplo, std::vector m_a_temp = m_a; { - auto m_a_temp_gpu = - blas::make_sycl_iterator_buffer(m_a_temp, m_size); - - blas::_spr2(sb_handle, uplo, n, alpha, v_x_gpu, incX, v_y_gpu, incY, - m_a_temp_gpu); - sb_handle.wait(); + auto m_a_temp_gpu = blas::helper::allocate(m_size, q); + auto copy_temp = blas::helper::copy_to_device( + q, m_a_temp.data(), m_a_temp_gpu, m_size); + sb_handle.wait({copy_temp}); + + auto spr2_event = blas::_spr2(sb_handle, uplo, n, alpha, v_x_gpu, incX, + v_y_gpu, incY, m_a_temp_gpu); + sb_handle.wait(spr2_event); + auto copy_out = blas::helper::copy_to_host( + q, m_a_temp_gpu, m_a_temp.data(), m_size); + sb_handle.wait({copy_out}); + + blas::helper::deallocate(m_a_temp_gpu, q); } std::ostringstream err_stream; @@ -110,14 +127,17 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, char uplo, state.counters["bytes_processed"]); blas_benchmark::utils::calc_avg_counters(state); -} -template -void register_benchmark(blas_benchmark::Args& args, - blas::SB_Handle* sb_handle_ptr, bool* success) { - auto spr2_params = blas_benchmark::utils::get_spr2_params(args); + blas::helper::deallocate(m_a_gpu, q); + blas::helper::deallocate(v_x_gpu, q); + blas::helper::deallocate(v_y_gpu, q); +} - for (auto p : spr2_params) { +template +void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success, + std::string mem_type, + std::vector> params) { + for (auto p : params) { index_t n, incX, incY; std::string uplo; scalar_t alpha; @@ -127,18 +147,32 @@ void register_benchmark(blas_benchmark::Args& args, auto BM_lambda_col = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr, char uplo, - int n, scalar_t alpha, int incX, int incY, bool* success) { - run(st, sb_handle_ptr, uplo, n, alpha, incX, incY, success); + index_t n, scalar_t alpha, index_t incX, index_t incY, + bool* success) { + run(st, sb_handle_ptr, uplo, n, alpha, incX, + incY, success); }; benchmark::RegisterBenchmark( blas_benchmark::utils::get_name( - uplo, n, alpha, incX, incY, blas_benchmark::utils::MEM_TYPE_BUFFER) - .c_str(), + uplo, n, alpha, incX, incY, mem_type).c_str(), BM_lambda_col, sb_handle_ptr, uplo_c, n, alpha, incX, incY, success) ->UseRealTime(); } } +template +void register_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { + auto spr2_params = blas_benchmark::utils::get_spr2_params(args); + + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, spr2_params); +#ifdef SB_ENABLE_USM + register_benchmark( + sb_handle_ptr, success, 
blas_benchmark::utils::MEM_TYPE_USM, spr2_params); +#endif +} + namespace blas_benchmark { void create_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr, bool* success) { diff --git a/benchmark/syclblas/blas2/symv.cpp b/benchmark/syclblas/blas2/symv.cpp index fc4eedfb3..fb9f06a53 100644 --- a/benchmark/syclblas/blas2/symv.cpp +++ b/benchmark/syclblas/blas2/symv.cpp @@ -28,7 +28,7 @@ constexpr blas_benchmark::utils::Level2Op benchmark_op = blas_benchmark::utils::Level2Op::symv; -template +template void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, std::string uplo, index_t n, scalar_t alpha, scalar_t beta, bool* success) { @@ -50,6 +50,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, blas_benchmark::utils::Level2Op::symv, scalar_t>(state, "n", beta, 0, n); blas::SB_Handle& sb_handle = *sb_handle_ptr; + auto q = sb_handle.get_queue(); // Input matrix/vector, output vector. std::vector m_a = @@ -59,9 +60,18 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, std::vector v_y = blas_benchmark::utils::random_data(ylen); - auto m_a_gpu = blas::make_sycl_iterator_buffer(m_a, n * n); - auto v_x_gpu = blas::make_sycl_iterator_buffer(v_x, xlen); - auto v_y_gpu = blas::make_sycl_iterator_buffer(v_y, ylen); + auto m_a_gpu = blas::helper::allocate(n * n, q); + auto v_x_gpu = blas::helper::allocate(xlen, q); + auto v_y_gpu = blas::helper::allocate(ylen, q); + + auto copy_m = + blas::helper::copy_to_device(q, m_a.data(), m_a_gpu, n * n); + auto copy_x = + blas::helper::copy_to_device(q, v_x.data(), v_x_gpu, xlen); + auto copy_y = + blas::helper::copy_to_device(q, v_y.data(), v_y_gpu, ylen); + + sb_handle.wait({copy_m, copy_x, copy_y}); #ifdef BLAS_VERIFY_BENCHMARK // Run a first time with a verification of the results @@ -70,11 +80,18 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, beta, v_y_ref.data(), incY); std::vector v_y_temp(v_y); { - auto v_y_temp_gpu = - blas::make_sycl_iterator_buffer(v_y_temp, ylen); - auto event = _symv(sb_handle, *uplo_str, n, alpha, m_a_gpu, lda, v_x_gpu, - incX, beta, v_y_temp_gpu, incY); - sb_handle.wait(); + auto v_y_temp_gpu = blas::helper::allocate(ylen, q); + auto copy_temp = blas::helper::copy_to_device(q, v_y_temp.data(), + v_y_temp_gpu, ylen); + sb_handle.wait(copy_temp); + auto symv_event = _symv(sb_handle, *uplo_str, n, alpha, m_a_gpu, lda, + v_x_gpu, incX, beta, v_y_temp_gpu, incY); + sb_handle.wait(symv_event); + auto copy_out = blas::helper::copy_to_host(q, v_y_temp_gpu, + v_y_temp.data(), ylen); + sb_handle.wait(copy_out); + + blas::helper::deallocate(v_y_temp_gpu, q); } std::ostringstream err_stream; @@ -113,14 +130,17 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, state.counters["bytes_processed"]); blas_benchmark::utils::calc_avg_counters(state); -} -template -void register_benchmark(blas_benchmark::Args& args, - blas::SB_Handle* sb_handle_ptr, bool* success) { - auto symv_params = blas_benchmark::utils::get_symv_params(args); + blas::helper::deallocate(m_a_gpu, q); + blas::helper::deallocate(v_x_gpu, q); + blas::helper::deallocate(v_y_gpu, q); +} - for (auto p : symv_params) { +template +void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success, + std::string mem_type, + std::vector> params) { + for (auto p : params) { std::string uplos; index_t n; scalar_t alpha, beta; @@ -129,17 +149,29 @@ void register_benchmark(blas_benchmark::Args& args, auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr, std::string 
uplos, index_t n, scalar_t alpha, scalar_t beta, bool* success) { - run(st, sb_handle_ptr, uplos, n, alpha, beta, success); + run(st, sb_handle_ptr, uplos, n, alpha, beta, + success); }; benchmark::RegisterBenchmark( blas_benchmark::utils::get_name( - uplos, n, alpha, beta, blas_benchmark::utils::MEM_TYPE_BUFFER) - .c_str(), + uplos, n, alpha, beta, mem_type).c_str(), BM_lambda, sb_handle_ptr, uplos, n, alpha, beta, success) ->UseRealTime(); } } +template +void register_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { + auto symv_params = blas_benchmark::utils::get_symv_params(args); + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, symv_params); +#ifdef SB_ENABLE_USM + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, symv_params); +#endif +} + namespace blas_benchmark { void create_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr, bool* success) { diff --git a/benchmark/syclblas/blas2/syr.cpp b/benchmark/syclblas/blas2/syr.cpp index d7d3ec30a..b8256bae7 100644 --- a/benchmark/syclblas/blas2/syr.cpp +++ b/benchmark/syclblas/blas2/syr.cpp @@ -28,7 +28,7 @@ constexpr blas_benchmark::utils::Level2Op benchmark_op = blas_benchmark::utils::Level2Op::syr; -template +template void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, std::string uplo, index_t n, scalar_t alpha, bool* success) { // initialize the state label @@ -45,14 +45,22 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, blas_benchmark::utils::Level2Op::syr, scalar_t>(state, "n", 0, 0, n); blas::SB_Handle& sb_handle = *sb_handle_ptr; + auto q = sb_handle.get_queue(); // Input matrix/vector, output vector. std::vector m_a = blas_benchmark::utils::random_data(n * n); std::vector v_x = blas_benchmark::utils::random_data(n); - auto m_a_gpu = blas::make_sycl_iterator_buffer(m_a, n * n); - auto v_x_gpu = blas::make_sycl_iterator_buffer(v_x, n); + auto m_a_gpu = blas::helper::allocate(n * n, q); + auto v_x_gpu = blas::helper::allocate(n, q); + + auto copy_m = + blas::helper::copy_to_device(q, m_a.data(), m_a_gpu, n * n); + auto copy_x = + blas::helper::copy_to_device(q, v_x.data(), v_x_gpu, n); + + sb_handle.wait({copy_m, copy_x}); #ifdef BLAS_VERIFY_BENCHMARK // Run a first time with a verification of the results @@ -62,11 +70,18 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, std::vector m_a_temp(m_a); { - auto m_a_temp_gpu = - blas::make_sycl_iterator_buffer(m_a_temp, n * n); - auto event = + auto m_a_temp_gpu = blas::helper::allocate(n * n, q); + auto copy_temp = blas::helper::copy_to_device( + q, m_a_temp.data(), m_a_temp_gpu, n * n); + sb_handle.wait(copy_temp); + auto syr_event = _syr(sb_handle, *uplo_str, n, alpha, v_x_gpu, incX, m_a_temp_gpu, lda); - sb_handle.wait(); + sb_handle.wait(syr_event); + auto copy_out = blas::helper::copy_to_host( + q, m_a_temp_gpu, m_a_temp.data(), n * n); + sb_handle.wait(copy_out); + + blas::helper::deallocate(m_a_temp_gpu, q); } std::ostringstream err_stream; @@ -106,14 +121,16 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, state.counters["bytes_processed"]); blas_benchmark::utils::calc_avg_counters(state); -} -template -void register_benchmark(blas_benchmark::Args& args, - blas::SB_Handle* sb_handle_ptr, bool* success) { - auto syr_params = blas_benchmark::utils::get_syr_params(args); + blas::helper::deallocate(m_a_gpu, q); + blas::helper::deallocate(v_x_gpu, q); +} - for (auto p : syr_params) { 
+template +void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success, + std::string mem_type, + std::vector> params) { + for (auto p : params) { std::string uplo; index_t n; scalar_t alpha; @@ -122,17 +139,28 @@ void register_benchmark(blas_benchmark::Args& args, auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr, std::string uplo, index_t n, scalar_t alpha, bool* success) { - run(st, sb_handle_ptr, uplo, n, alpha, success); + run(st, sb_handle_ptr, uplo, n, alpha, success); }; benchmark::RegisterBenchmark( blas_benchmark::utils::get_name( - uplo, n, alpha, blas_benchmark::utils::MEM_TYPE_BUFFER) - .c_str(), + uplo, n, alpha, mem_type).c_str(), BM_lambda, sb_handle_ptr, uplo, n, alpha, success) ->UseRealTime(); } } +template +void register_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { + auto syr_params = blas_benchmark::utils::get_syr_params(args); + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, syr_params); +#ifdef SB_ENABLE_USM + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, syr_params); +#endif +} + namespace blas_benchmark { void create_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr, bool* success) { diff --git a/benchmark/syclblas/blas2/syr2.cpp b/benchmark/syclblas/blas2/syr2.cpp index cad0f537b..b5d6d651c 100644 --- a/benchmark/syclblas/blas2/syr2.cpp +++ b/benchmark/syclblas/blas2/syr2.cpp @@ -28,7 +28,7 @@ constexpr blas_benchmark::utils::Level2Op benchmark_op = blas_benchmark::utils::Level2Op::syr2; -template +template void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, std::string uplo, index_t n, scalar_t alpha, bool* success) { // initialize the state label @@ -46,6 +46,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, blas_benchmark::utils::Level2Op::syr2, scalar_t>(state, "n", 0, 0, n); blas::SB_Handle& sb_handle = *sb_handle_ptr; + auto q = sb_handle.get_queue(); const int v_x_size = 1 + (n - 1) * std::abs(incX); const int v_y_size = 1 + (n - 1) * std::abs(incY); @@ -58,9 +59,18 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, std::vector v_y = blas_benchmark::utils::random_data(v_y_size); - auto m_a_gpu = blas::make_sycl_iterator_buffer(m_a, n * n); - auto v_x_gpu = blas::make_sycl_iterator_buffer(v_x, v_x_size); - auto v_y_gpu = blas::make_sycl_iterator_buffer(v_y, v_y_size); + auto m_a_gpu = blas::helper::allocate(n * n, q); + auto v_x_gpu = blas::helper::allocate(v_x_size, q); + auto v_y_gpu = blas::helper::allocate(v_y_size, q); + + auto copy_m = + blas::helper::copy_to_device(q, m_a.data(), m_a_gpu, n * n); + auto copy_x = + blas::helper::copy_to_device(q, v_x.data(), v_x_gpu, v_x_size); + auto copy_y = + blas::helper::copy_to_device(q, v_y.data(), v_y_gpu, v_y_size); + + sb_handle.wait({copy_m, copy_x, copy_y}); #ifdef BLAS_VERIFY_BENCHMARK // Run a first time with a verification of the results @@ -70,11 +80,18 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, std::vector m_a_temp(m_a); { - auto m_a_temp_gpu = - blas::make_sycl_iterator_buffer(m_a_temp, n * n); - auto event = _syr2(sb_handle, *uplo_str, n, alpha, v_x_gpu, incX, v_y_gpu, - incY, m_a_temp_gpu, lda); - sb_handle.wait(); + auto m_a_temp_gpu = blas::helper::allocate(n * n, q); + auto copy_temp = blas::helper::copy_to_device( + q, m_a_temp.data(), m_a_temp_gpu, n * n); + sb_handle.wait(copy_temp); + auto syr2_event = _syr2(sb_handle, *uplo_str, n, alpha, 
v_x_gpu, incX, + v_y_gpu, incY, m_a_temp_gpu, lda); + sb_handle.wait(syr2_event); + auto copy_out = blas::helper::copy_to_host( + q, m_a_temp_gpu, m_a_temp.data(), n * n); + sb_handle.wait(copy_out); + + blas::helper::deallocate(m_a_temp_gpu, q); } std::ostringstream err_stream; @@ -114,15 +131,17 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, state.counters["bytes_processed"]); blas_benchmark::utils::calc_avg_counters(state); -} -template -void register_benchmark(blas_benchmark::Args& args, - blas::SB_Handle* sb_handle_ptr, bool* success) { - // syr2 use the same parameters so reuse syr function - auto syr2_params = blas_benchmark::utils::get_syr_params(args); + blas::helper::deallocate(m_a_gpu, q); + blas::helper::deallocate(v_x_gpu, q); + blas::helper::deallocate(v_y_gpu, q); +} - for (auto p : syr2_params) { +template +void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success, + std::string mem_type, + std::vector> params) { + for (auto p : params) { std::string uplo; index_t n; scalar_t alpha; @@ -131,17 +150,29 @@ void register_benchmark(blas_benchmark::Args& args, auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr, std::string uplo, index_t n, scalar_t alpha, bool* success) { - run(st, sb_handle_ptr, uplo, n, alpha, success); + run(st, sb_handle_ptr, uplo, n, alpha, success); }; benchmark::RegisterBenchmark( blas_benchmark::utils::get_name( - uplo, n, alpha, blas_benchmark::utils::MEM_TYPE_BUFFER) - .c_str(), + uplo, n, alpha, mem_type).c_str(), BM_lambda, sb_handle_ptr, uplo, n, alpha, success) ->UseRealTime(); } } +template +void register_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { + // syr2 use the same parameters so reuse syr function + auto syr2_params = blas_benchmark::utils::get_syr_params(args); + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, syr2_params); +#ifdef SB_ENABLE_USM + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, syr2_params); +#endif +} + namespace blas_benchmark { void create_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr, bool* success) { diff --git a/benchmark/syclblas/blas2/tbmv.cpp b/benchmark/syclblas/blas2/tbmv.cpp index bf4880414..b046c48c3 100644 --- a/benchmark/syclblas/blas2/tbmv.cpp +++ b/benchmark/syclblas/blas2/tbmv.cpp @@ -28,7 +28,7 @@ constexpr blas_benchmark::utils::Level2Op benchmark_op = blas_benchmark::utils::Level2Op::tbmv; -template +template void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, std::string uplo, std::string t, std::string diag, index_t n, index_t k, bool* success) { @@ -49,6 +49,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, blas_benchmark::utils::Level2Op::tbmv, scalar_t>(state, "n", 0, 0, n, k); blas::SB_Handle& sb_handle = *sb_handle_ptr; + auto q = sb_handle.get_queue(); // Input matrix/vector, output vector. 
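Registration follows the same two-level shape in every file touched by this patch: an inner `register_benchmark` that loops over one parameter set for one memory type, and an outer overload that keeps the original `(args, sb_handle_ptr, success)` signature, always registers the buffer variant, and additionally registers the USM variant when the library is built with `SB_ENABLE_USM`. A sketch of the outer dispatcher is below; the explicit template arguments are assumptions, reconstructed from the `MEM_TYPE_BUFFER` / `MEM_TYPE_USM` strings and the `SB_ENABLE_USM` guard, because the angle-bracket contents are missing from this copy of the patch.

```cpp
// Sketch of the outer registration dispatcher used by each benchmark file.
// The AllocType enumerators and explicit template arguments are assumptions.
template <typename scalar_t>
void register_benchmark(blas_benchmark::Args& args,
                        blas::SB_Handle* sb_handle_ptr, bool* success) {
  auto params = blas_benchmark::utils::get_blas1_params(args);

  // The buffer variant is always available.
  register_benchmark<scalar_t, blas::helper::AllocType::buffer>(
      sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, params);
#ifdef SB_ENABLE_USM
  // The USM variant only exists when the library was built with USM support,
  // so a buffer-only build still compiles and runs.
  register_benchmark<scalar_t, blas::helper::AllocType::usm>(
      sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, params);
#endif
}
```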
std::vector m_a = @@ -56,9 +57,15 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, std::vector v_x = blas_benchmark::utils::random_data(xlen); - auto m_a_gpu = blas::make_sycl_iterator_buffer(m_a, lda * n); - auto v_x_gpu = blas::make_sycl_iterator_buffer(v_x, xlen); + auto m_a_gpu = blas::helper::allocate(lda * n, q); + auto v_x_gpu = blas::helper::allocate(xlen, q); + + auto copy_a = + blas::helper::copy_to_device(q, m_a.data(), m_a_gpu, lda * n); + auto copy_x = + blas::helper::copy_to_device(q, v_x.data(), v_x_gpu, xlen); + sb_handle.wait({copy_a, copy_x}); #ifdef BLAS_VERIFY_BENCHMARK // Run a first time with a verification of the results std::vector v_x_ref = v_x; @@ -66,11 +73,18 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, v_x_ref.data(), incX); std::vector v_x_temp = v_x; { - auto v_x_temp_gpu = - blas::make_sycl_iterator_buffer(v_x_temp, xlen); - auto event = _tbmv(sb_handle, *uplo_str, *t_str, *diag_str, n, k, m_a_gpu, - lda, v_x_temp_gpu, incX); - sb_handle.wait(); + auto v_x_temp_gpu = blas::helper::allocate(xlen, q); + auto copy_temp = blas::helper::copy_to_device(q, v_x_temp.data(), + v_x_temp_gpu, xlen); + sb_handle.wait({copy_temp}); + auto tbmv_event = _tbmv(sb_handle, *uplo_str, *t_str, *diag_str, n, k, + m_a_gpu, lda, v_x_temp_gpu, incX); + sb_handle.wait({tbmv_event}); + auto copy_out = blas::helper::copy_to_host(q, v_x_temp_gpu, + v_x_temp.data(), xlen); + sb_handle.wait({copy_out}); + + blas::helper::deallocate(v_x_temp_gpu, q); } std::ostringstream err_stream; @@ -109,14 +123,16 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, state.counters["bytes_processed"]); blas_benchmark::utils::calc_avg_counters(state); -} -template -void register_benchmark(blas_benchmark::Args& args, - blas::SB_Handle* sb_handle_ptr, bool* success) { - auto tbmv_params = blas_benchmark::utils::get_tbmv_params(args); + blas::helper::deallocate(m_a_gpu, q); + blas::helper::deallocate(v_x_gpu, q); +} - for (auto p : tbmv_params) { +template +void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success, + std::string mem_type, + std::vector params) { + for (auto p : params) { std::string uplos; std::string ts; std::string diags; @@ -127,17 +143,30 @@ void register_benchmark(blas_benchmark::Args& args, auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr, std::string uplos, std::string ts, std::string diags, index_t n, index_t k, bool* success) { - run(st, sb_handle_ptr, uplos, ts, diags, n, k, success); + run(st, sb_handle_ptr, uplos, ts, diags, n, k, + success); }; benchmark::RegisterBenchmark( blas_benchmark::utils::get_name( - uplos, ts, diags, n, k, blas_benchmark::utils::MEM_TYPE_BUFFER) - .c_str(), + uplos, ts, diags, n, k, mem_type).c_str(), BM_lambda, sb_handle_ptr, uplos, ts, diags, n, k, success) ->UseRealTime(); } } +template +void register_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { + auto tbmv_params = blas_benchmark::utils::get_tbmv_params(args); + + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, tbmv_params); +#ifdef SB_ENABLE_USM + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, tbmv_params); +#endif +} + namespace blas_benchmark { void create_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr, bool* success) { diff --git a/benchmark/syclblas/blas2/tbsv.cpp b/benchmark/syclblas/blas2/tbsv.cpp index 7b8cdeffc..7cfa170c1 100644 --- 
a/benchmark/syclblas/blas2/tbsv.cpp +++ b/benchmark/syclblas/blas2/tbsv.cpp @@ -28,7 +28,7 @@ constexpr blas_benchmark::utils::Level2Op benchmark_op = blas_benchmark::utils::Level2Op::tbsv; -template +template void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, std::string uplo, std::string t, std::string diag, index_t n, index_t k, bool* success) { @@ -49,6 +49,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, blas_benchmark::utils::Level2Op::tbsv, scalar_t>(state, "n", 0, 0, n, k); blas::SB_Handle& sb_handle = *sb_handle_ptr; + auto q = sb_handle.get_queue(); // Input matrix/vector, output vector. std::vector m_a(lda * n); @@ -66,8 +67,15 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, scalar_t{10}) / scalar_t(n)); - auto m_a_gpu = blas::make_sycl_iterator_buffer(m_a, lda * n); - auto v_x_gpu = blas::make_sycl_iterator_buffer(v_x, xlen); + auto m_a_gpu = blas::helper::allocate(lda * n, q); + auto v_x_gpu = blas::helper::allocate(xlen, q); + + auto copy_m = + blas::helper::copy_to_device(q, m_a.data(), m_a_gpu, lda * n); + auto copy_v = + blas::helper::copy_to_device(q, v_x.data(), v_x_gpu, xlen); + + sb_handle.wait({copy_m, copy_v}); #ifdef BLAS_VERIFY_BENCHMARK // Run a first time with a verification of the results @@ -76,11 +84,18 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, v_x_ref.data(), incX); std::vector v_x_temp = v_x; { - auto v_x_temp_gpu = - blas::make_sycl_iterator_buffer(v_x_temp, xlen); - auto event = _tbsv(sb_handle, *uplo_str, *t_str, *diag_str, n, k, m_a_gpu, - lda, v_x_temp_gpu, incX); - sb_handle.wait(); + auto v_x_temp_gpu = blas::helper::allocate(xlen, q); + auto copy_temp = blas::helper::copy_to_device(q, v_x_temp.data(), + v_x_temp_gpu, xlen); + sb_handle.wait(copy_temp); + auto tbsv_event = _tbsv(sb_handle, *uplo_str, *t_str, *diag_str, n, k, + m_a_gpu, lda, v_x_temp_gpu, incX); + sb_handle.wait(tbsv_event); + auto copy_out = blas::helper::copy_to_host(q, v_x_temp_gpu, + v_x_temp.data(), xlen); + sb_handle.wait(copy_out); + + blas::helper::deallocate(v_x_temp_gpu, q); } std::ostringstream err_stream; @@ -119,14 +134,16 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, state.counters["bytes_processed"]); blas_benchmark::utils::calc_avg_counters(state); -} -template -void register_benchmark(blas_benchmark::Args& args, - blas::SB_Handle* sb_handle_ptr, bool* success) { - auto tbsv_params = blas_benchmark::utils::get_tbmv_params(args); + blas::helper::deallocate(m_a_gpu, q); + blas::helper::deallocate(v_x_gpu, q); +} - for (auto p : tbsv_params) { +template +void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success, + std::string mem_type, + std::vector params) { + for (auto p : params) { std::string uplos; std::string ts; std::string diags; @@ -137,17 +154,30 @@ void register_benchmark(blas_benchmark::Args& args, auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr, std::string uplos, std::string ts, std::string diags, index_t n, index_t k, bool* success) { - run(st, sb_handle_ptr, uplos, ts, diags, n, k, success); + run(st, sb_handle_ptr, uplos, ts, diags, n, k, + success); }; + benchmark::RegisterBenchmark( blas_benchmark::utils::get_name( - uplos, ts, diags, n, k, blas_benchmark::utils::MEM_TYPE_BUFFER) - .c_str(), + uplos, ts, diags, n, k, mem_type).c_str(), BM_lambda, sb_handle_ptr, uplos, ts, diags, n, k, success) ->UseRealTime(); } } +template +void register_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, 
bool* success) { + auto tbsv_params = blas_benchmark::utils::get_tbmv_params(args); + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, tbsv_params); +#ifdef SB_ENABLE_USM + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, tbsv_params); +#endif +} + namespace blas_benchmark { void create_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr, bool* success) { diff --git a/benchmark/syclblas/blas2/tpmv.cpp b/benchmark/syclblas/blas2/tpmv.cpp index fab71fac4..88133ce14 100644 --- a/benchmark/syclblas/blas2/tpmv.cpp +++ b/benchmark/syclblas/blas2/tpmv.cpp @@ -28,7 +28,7 @@ constexpr blas_benchmark::utils::Level2Op benchmark_op = blas_benchmark::utils::Level2Op::tpmv; -template +template void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, std::string uplo, std::string t, std::string diag, index_t n, bool* success) { @@ -37,13 +37,14 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, const char* t_str = t.c_str(); const char* diag_str = diag.c_str(); - index_t xlen = n; index_t incX = 1; + index_t xlen = 1 + (n - 1) * std::abs(incX); blas_benchmark::utils::init_level_2_counters< blas_benchmark::utils::Level2Op::tpmv, scalar_t>(state, "n", 0, 0, n); blas::SB_Handle& sb_handle = *sb_handle_ptr; + auto q = sb_handle.get_queue(); // Input matrix/vector, output vector. std::vector m_a = @@ -51,8 +52,15 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, std::vector v_x = blas_benchmark::utils::random_data(xlen); - auto m_a_gpu = blas::make_sycl_iterator_buffer(m_a, m_a.size()); - auto v_x_gpu = blas::make_sycl_iterator_buffer(v_x, xlen); + auto m_a_gpu = blas::helper::allocate(m_a.size(), q); + auto v_x_gpu = blas::helper::allocate(xlen, q); + + auto copy_m = blas::helper::copy_to_device(q, m_a.data(), m_a_gpu, + m_a.size()); + auto copy_v = + blas::helper::copy_to_device(q, v_x.data(), v_x_gpu, xlen); + + sb_handle.wait({copy_m, copy_v}); #ifdef BLAS_VERIFY_BENCHMARK // Run a first time with a verification of the results @@ -61,11 +69,19 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, incX); std::vector v_x_temp = v_x; { - auto v_x_temp_gpu = - blas::make_sycl_iterator_buffer(v_x_temp, xlen); - auto event = _tpmv(sb_handle, *uplo_str, *t_str, *diag_str, n, m_a_gpu, - v_x_temp_gpu, incX); - sb_handle.wait(); + auto v_x_temp_gpu = blas::helper::allocate(xlen, q); + auto copy_temp = blas::helper::copy_to_device(q, v_x_temp.data(), + v_x_temp_gpu, xlen); + sb_handle.wait(copy_temp); + auto tpmv_event = _tpmv(sb_handle, *uplo_str, *t_str, *diag_str, n, m_a_gpu, + v_x_temp_gpu, incX); + sb_handle.wait(tpmv_event); + + auto copy_out = blas::helper::copy_to_host(q, v_x_temp_gpu, + v_x_temp.data(), xlen); + sb_handle.wait(copy_out); + + blas::helper::deallocate(v_x_temp_gpu, q); } std::ostringstream err_stream; @@ -104,15 +120,16 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, state.counters["bytes_processed"]); blas_benchmark::utils::calc_avg_counters(state); -} -template -void register_benchmark(blas_benchmark::Args& args, - blas::SB_Handle* sb_handle_ptr, bool* success) { - // tpmv uses the same parameters as trsv - auto tpmv_params = blas_benchmark::utils::get_trsv_params(args); + blas::helper::deallocate(m_a_gpu, q); + blas::helper::deallocate(v_x_gpu, q); +} - for (auto p : tpmv_params) { +template +void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success, + std::string mem_type, + std::vector params) { + for (auto p 
: params) { std::string uplos; std::string ts; std::string diags; @@ -122,7 +139,7 @@ void register_benchmark(blas_benchmark::Args& args, auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr, std::string uplos, std::string ts, std::string diags, index_t n, bool* success) { - run(st, sb_handle_ptr, uplos, ts, diags, n, success); + run(st, sb_handle_ptr, uplos, ts, diags, n, success); }; benchmark::RegisterBenchmark( blas_benchmark::utils::get_name( @@ -133,6 +150,20 @@ void register_benchmark(blas_benchmark::Args& args, } } +template +void register_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { + // tpmv uses the same parameters as trsv + auto tpmv_params = blas_benchmark::utils::get_trsv_params(args); + + register_benchmark( + sb_handle_ptr, success, "buffer", tpmv_params); +#ifdef SB_ENABLE_USM + register_benchmark( + sb_handle_ptr, success, "usm", tpmv_params); +#endif +} + namespace blas_benchmark { void create_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr, bool* success) { diff --git a/benchmark/syclblas/blas2/trmv.cpp b/benchmark/syclblas/blas2/trmv.cpp index c4a95d975..82f4033d1 100644 --- a/benchmark/syclblas/blas2/trmv.cpp +++ b/benchmark/syclblas/blas2/trmv.cpp @@ -28,7 +28,7 @@ constexpr blas_benchmark::utils::Level2Op benchmark_op = blas_benchmark::utils::Level2Op::trmv; -template +template void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, std::string uplo, std::string t, std::string diag, index_t n, bool* success) { @@ -49,6 +49,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, blas_benchmark::utils::Level2Op::trmv, scalar_t>(state, t_str, 0, 0, n); blas::SB_Handle& sb_handle = *sb_handle_ptr; + auto q = sb_handle.get_queue(); // Input matrix/vector, output vector. 
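The tpmv hunk above also fixes how the operand length is computed: `xlen` was previously hard-coded to `n`, and it is now derived from the increment, which is the length used by the allocation, the copies, and the verification. With `incX == 1` the two definitions coincide, so existing results are unchanged; the formula only matters if the benchmark is ever run with a non-unit increment.

```cpp
// From the tpmv hunk above: the generalized length reduces to n when the
// increment is one, i.e. 1 + (n - 1) * |incX| == n for incX == 1.
index_t incX = 1;
index_t xlen = 1 + (n - 1) * std::abs(incX);
```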
std::vector m_a = @@ -56,8 +57,15 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, std::vector v_x = blas_benchmark::utils::random_data(xlen); - auto m_a_gpu = blas::make_sycl_iterator_buffer(m_a, lda * n); - auto v_x_gpu = blas::make_sycl_iterator_buffer(v_x, xlen); + auto m_a_gpu = blas::helper::allocate(lda * n, q); + auto v_x_gpu = blas::helper::allocate(xlen, q); + + auto copy_m = + blas::helper::copy_to_device(q, m_a.data(), m_a_gpu, lda * n); + auto copy_x = + blas::helper::copy_to_device(q, v_x.data(), v_x_gpu, xlen); + + sb_handle.wait({copy_m, copy_x}); #ifdef BLAS_VERIFY_BENCHMARK // Run a first time with a verification of the results @@ -66,11 +74,18 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, v_x_ref.data(), incX); std::vector v_x_temp = v_x; { - auto v_x_temp_gpu = - blas::make_sycl_iterator_buffer(v_x_temp, xlen); - auto event = _trmv(sb_handle, *uplo_str, *t_str, *diag_str, n, m_a_gpu, lda, - v_x_temp_gpu, incX); - sb_handle.wait(); + auto v_x_temp_gpu = blas::helper::allocate(xlen, q); + auto copy_temp = blas::helper::copy_to_device(q, v_x_temp.data(), + v_x_temp_gpu, xlen); + sb_handle.wait(copy_temp); + auto trmv_event = _trmv(sb_handle, *uplo_str, *t_str, *diag_str, n, m_a_gpu, + lda, v_x_temp_gpu, incX); + sb_handle.wait(trmv_event); + auto copy_out = blas::helper::copy_to_host(q, v_x_temp_gpu, + v_x_temp.data(), xlen); + sb_handle.wait(copy_out); + + blas::helper::deallocate(v_x_temp_gpu, q); } std::ostringstream err_stream; @@ -109,15 +124,16 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, state.counters["bytes_processed"]); blas_benchmark::utils::calc_avg_counters(state); -} -template -void register_benchmark(blas_benchmark::Args& args, - blas::SB_Handle* sb_handle_ptr, bool* success) { - // trmv uses the same parameters as trsv - auto trmv_params = blas_benchmark::utils::get_trsv_params(args); + blas::helper::deallocate(m_a_gpu, q); + blas::helper::deallocate(v_x_gpu, q); +} - for (auto p : trmv_params) { +template +void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success, + std::string mem_type, + std::vector params) { + for (auto p : params) { std::string uplos; std::string ts; std::string diags; @@ -127,16 +143,27 @@ void register_benchmark(blas_benchmark::Args& args, auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr, std::string uplos, std::string ts, std::string diags, index_t n, bool* success) { - run(st, sb_handle_ptr, uplos, ts, diags, n, success); + run(st, sb_handle_ptr, uplos, ts, diags, n, success); }; benchmark::RegisterBenchmark( blas_benchmark::utils::get_name( - uplos, ts, diags, n, blas_benchmark::utils::MEM_TYPE_BUFFER) - .c_str(), + uplos, ts, diags, n, mem_type).c_str(), BM_lambda, sb_handle_ptr, uplos, ts, diags, n, success) ->UseRealTime(); } } +template +void register_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { + // trmv uses the same parameters as trsv + auto trmv_params = blas_benchmark::utils::get_trsv_params(args); + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, trmv_params); +#ifdef SB_ENABLE_USM + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, trmv_params); +#endif +} namespace blas_benchmark { void create_benchmark(blas_benchmark::Args& args, diff --git a/benchmark/syclblas/blas2/trsv.cpp b/benchmark/syclblas/blas2/trsv.cpp index a5e9f143b..2730b94c4 100644 --- a/benchmark/syclblas/blas2/trsv.cpp +++ 
b/benchmark/syclblas/blas2/trsv.cpp @@ -28,7 +28,7 @@ constexpr blas_benchmark::utils::Level2Op benchmark_op = blas_benchmark::utils::Level2Op::trsv; -template +template void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, std::string uplo, std::string t, std::string diag, index_t n, bool* success) { @@ -49,6 +49,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, blas_benchmark::utils::Level2Op::trsv, scalar_t>(state, "n", 0, 0, n); blas::SB_Handle& sb_handle = *sb_handle_ptr; + auto q = sb_handle.get_queue(); // Input matrix/vector, output vector. std::vector m_a(lda * n); @@ -64,8 +65,15 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, scalar_t{-10}, scalar_t{10}) / scalar_t(n); - auto m_a_gpu = blas::make_sycl_iterator_buffer(m_a, lda * n); - auto v_x_gpu = blas::make_sycl_iterator_buffer(v_x, xlen); + auto m_a_gpu = blas::helper::allocate(lda * n, q); + auto v_x_gpu = blas::helper::allocate(xlen, q); + + auto copy_m = + blas::helper::copy_to_device(q, m_a.data(), m_a_gpu, lda * n); + auto copy_v = + blas::helper::copy_to_device(q, v_x.data(), v_x_gpu, xlen); + + sb_handle.wait({copy_m, copy_v}); #ifdef BLAS_VERIFY_BENCHMARK // Run a first time with a verification of the results @@ -74,11 +82,18 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, v_x_ref.data(), incX); std::vector v_x_temp = v_x; { - auto v_x_temp_gpu = - blas::make_sycl_iterator_buffer(v_x_temp, xlen); - auto event = _trsv(sb_handle, *uplo_str, *t_str, *diag_str, n, m_a_gpu, lda, - v_x_temp_gpu, incX); - sb_handle.wait(); + auto v_x_temp_gpu = blas::helper::allocate(xlen, q); + auto copy_temp = blas::helper::copy_to_device(q, v_x_temp.data(), + v_x_temp_gpu, xlen); + sb_handle.wait(copy_temp); + auto trsv_event = _trsv(sb_handle, *uplo_str, *t_str, *diag_str, n, m_a_gpu, + lda, v_x_temp_gpu, incX); + sb_handle.wait(trsv_event); + auto copy_out = blas::helper::copy_to_host(q, v_x_temp_gpu, + v_x_temp.data(), xlen); + sb_handle.wait(copy_out); + + blas::helper::deallocate(v_x_temp_gpu, q); } std::ostringstream err_stream; @@ -117,14 +132,16 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, state.counters["bytes_processed"]); blas_benchmark::utils::calc_avg_counters(state); -} -template -void register_benchmark(blas_benchmark::Args& args, - blas::SB_Handle* sb_handle_ptr, bool* success) { - auto trsv_params = blas_benchmark::utils::get_trsv_params(args); + blas::helper::deallocate(m_a_gpu, q); + blas::helper::deallocate(v_x_gpu, q); +} - for (auto p : trsv_params) { +template +void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success, + std::string mem_type, + std::vector params) { + for (auto p : params) { std::string uplos; std::string ts; std::string diags; @@ -134,17 +151,29 @@ void register_benchmark(blas_benchmark::Args& args, auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr, std::string uplos, std::string ts, std::string diags, index_t n, bool* success) { - run(st, sb_handle_ptr, uplos, ts, diags, n, success); + run(st, sb_handle_ptr, uplos, ts, diags, n, success); }; benchmark::RegisterBenchmark( blas_benchmark::utils::get_name( - uplos, ts, diags, n, blas_benchmark::utils::MEM_TYPE_BUFFER) - .c_str(), + uplos, ts, diags, n, mem_type).c_str(), BM_lambda, sb_handle_ptr, uplos, ts, diags, n, success) ->UseRealTime(); } } +template +void register_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { + auto trsv_params = 
blas_benchmark::utils::get_trsv_params(args); + + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, trsv_params); +#ifdef SB_ENABLE_USM + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, trsv_params); +#endif +} + namespace blas_benchmark { void create_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr, bool* success) { diff --git a/benchmark/syclblas/blas3/gemm.cpp b/benchmark/syclblas/blas3/gemm.cpp index a87c10559..6ae9ebce5 100644 --- a/benchmark/syclblas/blas3/gemm.cpp +++ b/benchmark/syclblas/blas3/gemm.cpp @@ -28,9 +28,9 @@ constexpr blas_benchmark::utils::Level3Op benchmark_op = blas_benchmark::utils::Level3Op::gemm; -template -void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int t1, int t2, - index_t m, index_t k, index_t n, scalar_t alpha, scalar_t beta, +template +void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int t1, + int t2, index_t m, index_t k, index_t n, scalar_t alpha, scalar_t beta, bool* success) { // initialize the state label blas_benchmark::utils::set_benchmark_label( @@ -52,6 +52,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int t1, int t2 blas_benchmark::utils::Level3Op::gemm, scalar_t>(state, beta, m, n, k, 1); blas::SB_Handle& sb_handle = *sb_handle_ptr; + auto q = sb_handle.get_queue(); // Matrices std::vector a = blas_benchmark::utils::random_data(m * k); @@ -59,9 +60,18 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int t1, int t2 std::vector c = blas_benchmark::utils::const_data(m * n, 0); - auto a_gpu = blas::make_sycl_iterator_buffer(a, m * k); - auto b_gpu = blas::make_sycl_iterator_buffer(b, k * n); - auto c_gpu = blas::make_sycl_iterator_buffer(c, m * n); + auto a_gpu = blas::helper::allocate(m * k, q); + auto b_gpu = blas::helper::allocate(k * n, q); + auto c_gpu = blas::helper::allocate(m * n, q); + + auto copy_a = + blas::helper::copy_to_device(q, a.data(), a_gpu, m * k); + auto copy_b = + blas::helper::copy_to_device(q, b.data(), b_gpu, n * k); + auto copy_c = + blas::helper::copy_to_device(q, c.data(), c_gpu, m * n); + + sb_handle.wait({copy_a, copy_b, copy_c}); #ifdef BLAS_VERIFY_BENCHMARK // Run a first time with a verification of the results @@ -70,10 +80,18 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int t1, int t2 beta, c_ref.data(), ldc); std::vector c_temp = c; { - auto c_temp_gpu = blas::make_sycl_iterator_buffer(c_temp, m * n); - auto event = _gemm(sb_handle, *t_a, *t_b, m, n, k, alpha, a_gpu, lda, b_gpu, - ldb, beta, c_temp_gpu, ldc); - sb_handle.wait(event); + auto c_temp_gpu = blas::helper::allocate(m * n, q); + auto copy_temp = blas::helper::copy_to_device(q, c_temp.data(), + c_temp_gpu, m * n); + sb_handle.wait(copy_temp); + auto gemm_event = _gemm(sb_handle, *t_a, *t_b, m, n, k, alpha, a_gpu, lda, + b_gpu, ldb, beta, c_temp_gpu, ldc); + sb_handle.wait(gemm_event); + auto copy_out = blas::helper::copy_to_host(q, c_temp_gpu, + c_temp.data(), m * n); + sb_handle.wait(copy_out); + + blas::helper::deallocate(c_temp_gpu, q); } std::ostringstream err_stream; @@ -112,14 +130,17 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int t1, int t2 state.counters["bytes_processed"]); blas_benchmark::utils::calc_avg_counters(state); -}; -template -void register_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr, - bool* success) { - auto gemm_params = blas_benchmark::utils::get_blas3_params(args); + 
blas::helper::deallocate(a_gpu, q); + blas::helper::deallocate(b_gpu, q); + blas::helper::deallocate(c_gpu, q); +}; - for (auto p : gemm_params) { +template +void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success, + std::string mem_type, + std::vector> params) { + for (auto p : params) { std::string t1s, t2s; index_t m, n, k; scalar_t alpha, beta; @@ -127,20 +148,32 @@ void register_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_p int t1 = static_cast(blas_benchmark::utils::to_transpose_enum(t1s)); int t2 = static_cast(blas_benchmark::utils::to_transpose_enum(t2s)); - auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr, int t1, - int t2, index_t m, index_t k, index_t n, + auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr, + int t1, int t2, index_t m, index_t k, index_t n, scalar_t alpha, scalar_t beta, bool* success) { - run(st, sb_handle_ptr, t1, t2, m, k, n, alpha, beta, success); + run(st, sb_handle_ptr, t1, t2, m, k, n, alpha, beta, + success); }; benchmark::RegisterBenchmark( blas_benchmark::utils::get_name( - t1s, t2s, m, k, n, blas_benchmark::utils::MEM_TYPE_BUFFER) - .c_str(), + t1s, t2s, m, k, n, mem_type).c_str(), BM_lambda, sb_handle_ptr, t1, t2, m, k, n, alpha, beta, success) ->UseRealTime(); } } +template +void register_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { + auto gemm_params = blas_benchmark::utils::get_blas3_params(args); + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, gemm_params); +#ifdef SB_ENABLE_USM + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, gemm_params); +#endif +} + namespace blas_benchmark { void create_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr, bool* success) { diff --git a/benchmark/syclblas/blas3/gemm_batched.cpp b/benchmark/syclblas/blas3/gemm_batched.cpp index 64c928d3b..4157ab8d3 100644 --- a/benchmark/syclblas/blas3/gemm_batched.cpp +++ b/benchmark/syclblas/blas3/gemm_batched.cpp @@ -62,7 +62,7 @@ std::vector interleaved_to_strided(const std::vector& input, return output; } -template +template void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int t1, int t2, index_t m, index_t k, index_t n, scalar_t alpha, scalar_t beta, index_t batch_size, int batch_type_i, bool* success) { @@ -88,6 +88,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int t1, state, beta, m, n, k, batch_size); blas::SB_Handle& sb_handle = *sb_handle_ptr; + auto q = sb_handle.get_queue(); // Matrices std::vector a = @@ -120,19 +121,39 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int t1, } #endif - auto a_gpu = blas::make_sycl_iterator_buffer(a, m * k * batch_size); - auto b_gpu = blas::make_sycl_iterator_buffer(b, k * n * batch_size); - auto c_gpu = blas::make_sycl_iterator_buffer(c, m * n * batch_size); + auto a_gpu = + blas::helper::allocate(m * k * batch_size, q); + auto b_gpu = + blas::helper::allocate(k * n * batch_size, q); + auto c_gpu = + blas::helper::allocate(m * n * batch_size, q); + + auto copy_a = blas::helper::copy_to_device(q, a.data(), a_gpu, + m * k * batch_size); + auto copy_b = blas::helper::copy_to_device(q, b.data(), b_gpu, + n * k * batch_size); + auto copy_c = blas::helper::copy_to_device(q, c.data(), c_gpu, + m * n * batch_size); + + sb_handle.wait({copy_a, copy_b, copy_c}); #ifdef BLAS_VERIFY_BENCHMARK std::vector c_temp = c; { auto c_temp_gpu = - 
blas::make_sycl_iterator_buffer(c_temp, m * n * batch_size); - auto event = + blas::helper::allocate(m * n * batch_size, q); + auto copy_temp = blas::helper::copy_to_device( + q, c_temp.data(), c_temp_gpu, m * n * batch_size); + sb_handle.wait(copy_temp); + auto gemm_batched_event = _gemm_batched(sb_handle, *t_a, *t_b, m, n, k, alpha, a_gpu, lda, b_gpu, ldb, beta, c_temp_gpu, ldc, batch_size, batch_type); - sb_handle.wait(event); + sb_handle.wait(gemm_batched_event); + auto copy_out = blas::helper::copy_to_host( + q, c_temp_gpu, c_temp.data(), m * n * batch_size); + sb_handle.wait(copy_out); + + blas::helper::deallocate(c_temp_gpu, q); } if (batch_type == blas::gemm_batch_type_t::interleaved) { constexpr int offset = 0; @@ -176,15 +197,17 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int t1, state.counters["bytes_processed"]); blas_benchmark::utils::calc_avg_counters(state); -}; -template -void register_benchmark(blas_benchmark::Args& args, - blas::SB_Handle* sb_handle_ptr, bool* success) { - auto gemm_params = - blas_benchmark::utils::get_gemm_batched_params(args); + blas::helper::deallocate(a_gpu, q); + blas::helper::deallocate(b_gpu, q); + blas::helper::deallocate(c_gpu, q); +}; - for (auto p : gemm_params) { +template +void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success, + std::string mem_type, + std::vector> params) { + for (auto p : params) { std::string t1s, t2s; index_t m, n, k, batch_size; scalar_t alpha, beta; @@ -197,20 +220,32 @@ void register_benchmark(blas_benchmark::Args& args, int t1, int t2, index_t m, index_t k, index_t n, scalar_t alpha, scalar_t beta, index_t batch_size, int batch_type, bool* success) { - run(st, sb_handle_ptr, t1, t2, m, k, n, alpha, beta, batch_size, - batch_type, success); + run(st, sb_handle_ptr, t1, t2, m, k, n, alpha, beta, + batch_size, batch_type, success); }; benchmark::RegisterBenchmark( blas_benchmark::utils::get_name( t1s, t2s, m, k, n, batch_size, batch_type, - blas_benchmark::utils::MEM_TYPE_BUFFER) - .c_str(), + mem_type).c_str(), BM_lambda, sb_handle_ptr, t1, t2, m, k, n, alpha, beta, batch_size, batch_type, success) ->UseRealTime(); } } +template +void register_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { + auto gemm_batched_params = + blas_benchmark::utils::get_gemm_batched_params(args); + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, gemm_batched_params); +#ifdef SB_ENABLE_USM + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, gemm_batched_params); +#endif +} + namespace blas_benchmark { void create_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr, bool* success) { diff --git a/benchmark/syclblas/blas3/gemm_batched_strided.cpp b/benchmark/syclblas/blas3/gemm_batched_strided.cpp index c47ab55ad..de68bd59c 100644 --- a/benchmark/syclblas/blas3/gemm_batched_strided.cpp +++ b/benchmark/syclblas/blas3/gemm_batched_strided.cpp @@ -28,7 +28,7 @@ constexpr blas_benchmark::utils::Level3Op benchmark_op = blas_benchmark::utils::Level3Op::gemm_batched_strided; -template +template void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int t1, int t2, index_t m, index_t k, index_t n, scalar_t alpha, scalar_t beta, index_t batch_size, index_t stride_a_mul, index_t stride_b_mul, @@ -94,19 +94,36 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int t1, #endif - auto a_gpu = blas::make_sycl_iterator_buffer(a, size_a_batch); - auto b_gpu = 
blas::make_sycl_iterator_buffer(b, size_b_batch); - auto c_gpu = blas::make_sycl_iterator_buffer(c, size_c_batch); + auto a_gpu = blas::helper::allocate(size_a_batch, q); + auto b_gpu = blas::helper::allocate(size_b_batch, q); + auto c_gpu = blas::helper::allocate(size_c_batch, q); + + auto copy_a = + blas::helper::copy_to_device(q, a.data(), a_gpu, size_a_batch); + auto copy_b = + blas::helper::copy_to_device(q, b.data(), b_gpu, size_b_batch); + auto copy_c = + blas::helper::copy_to_device(q, c.data(), c_gpu, size_c_batch); + + sb_handle.wait({copy_a, copy_b, copy_c}); #ifdef BLAS_VERIFY_BENCHMARK std::vector c_temp = c; { auto c_temp_gpu = - blas::make_sycl_iterator_buffer(c_temp, size_c_batch); - auto event = _gemm_strided_batched( + blas::helper::allocate(size_c_batch, q); + auto copy_temp = blas::helper::copy_to_device( + q, c_temp.data(), c_temp_gpu, size_c_batch); + sb_handle.wait(copy_temp); + auto gemm_batched_strided_event = _gemm_strided_batched( sb_handle, *t_a, *t_b, m, n, k, alpha, a_gpu, lda, stride_a, b_gpu, ldb, stride_b, beta, c_temp_gpu, ldc, stride_c, batch_size); - sb_handle.wait(event); + sb_handle.wait(gemm_batched_strided_event); + auto copy_out = blas::helper::copy_to_host( + q, c_temp_gpu, c_temp.data(), size_c_batch); + sb_handle.wait(copy_out); + + blas::helper::deallocate(c_temp_gpu, q); } std::ostringstream err_stream; @@ -147,15 +164,17 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int t1, state.counters["bytes_processed"]); blas_benchmark::utils::calc_avg_counters(state); -}; -template -void register_benchmark(blas_benchmark::Args& args, - blas::SB_Handle* sb_handle_ptr, bool* success) { - auto gemm_batched_strided_params = - blas_benchmark::utils::get_gemm_batched_strided_params(args); + blas::helper::deallocate(a_gpu, q); + blas::helper::deallocate(b_gpu, q); + blas::helper::deallocate(c_gpu, q); +}; - for (auto p : gemm_batched_strided_params) { +template +void register_benchmark( + blas::SB_Handle* sb_handle_ptr, bool* success, std::string mem_type, + std::vector> params) { + for (auto p : params) { std::string t1s, t2s; index_t m, n, k, batch_size, stride_a_mul, stride_b_mul, stride_c_mul; scalar_t alpha, beta; @@ -167,22 +186,35 @@ void register_benchmark(blas_benchmark::Args& args, auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr, int t1, int t2, index_t m, index_t k, index_t n, scalar_t alpha, scalar_t beta, index_t batch_size, - index_t strd_a_mul, index_t strd_b_mul, - index_t strd_c_mul, bool* success) { - run(st, sb_handle_ptr, t1, t2, m, k, n, alpha, beta, batch_size, - strd_a_mul, strd_b_mul, strd_c_mul, success); + index_t stride_a_mul, index_t stride_b_mul, + index_t stride_c_mul, bool* success) { + run(st, sb_handle_ptr, t1, t2, m, k, n, alpha, beta, + batch_size, stride_a_mul, stride_b_mul, + stride_c_mul, success); }; benchmark::RegisterBenchmark( blas_benchmark::utils::get_name( t1s, t2s, m, k, n, batch_size, stride_a_mul, stride_b_mul, - stride_c_mul, blas_benchmark::utils::MEM_TYPE_BUFFER) - .c_str(), + stride_c_mul, mem_type).c_str(), BM_lambda, sb_handle_ptr, t1, t2, m, k, n, alpha, beta, batch_size, stride_a_mul, stride_b_mul, stride_c_mul, success) ->UseRealTime(); } } +template +void register_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { + auto gemm_batched_strided_params = + blas_benchmark::utils::get_gemm_batched_strided_params(args); + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, 
gemm_batched_strided_params); +#ifdef SB_ENABLE_USM + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, gemm_batched_strided_params); +#endif +} + namespace blas_benchmark { void create_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr, bool* success) { diff --git a/benchmark/syclblas/blas3/symm.cpp b/benchmark/syclblas/blas3/symm.cpp index 1448ab7d4..f3d401b9c 100644 --- a/benchmark/syclblas/blas3/symm.cpp +++ b/benchmark/syclblas/blas3/symm.cpp @@ -28,7 +28,7 @@ constexpr blas_benchmark::utils::Level3Op benchmark_op = blas_benchmark::utils::Level3Op::symm; -template +template void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, char side, char uplo, index_t m, index_t n, scalar_t alpha, scalar_t beta, bool* success) { @@ -47,6 +47,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, char side, side); blas::SB_Handle& sb_handle = *sb_handle_ptr; + auto q = sb_handle.get_queue(); // Matrices std::vector a = blas_benchmark::utils::random_data(k * k); @@ -54,9 +55,18 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, char side, std::vector c = blas_benchmark::utils::const_data(m * n, 0); - auto a_gpu = blas::make_sycl_iterator_buffer(a, k * k); - auto b_gpu = blas::make_sycl_iterator_buffer(b, m * n); - auto c_gpu = blas::make_sycl_iterator_buffer(c, m * n); + auto a_gpu = blas::helper::allocate(k * k, q); + auto b_gpu = blas::helper::allocate(m * n, q); + auto c_gpu = blas::helper::allocate(m * n, q); + + auto copy_a = + blas::helper::copy_to_device(q, a.data(), a_gpu, k * k); + auto copy_b = + blas::helper::copy_to_device(q, b.data(), b_gpu, n * m); + auto copy_c = + blas::helper::copy_to_device(q, c.data(), c_gpu, m * n); + + sb_handle.wait({copy_a, copy_b, copy_c}); #ifdef BLAS_VERIFY_BENCHMARK // Run a first time with a verification of the results @@ -67,10 +77,18 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, char side, ldb, beta, c_ref.data(), ldc); std::vector c_temp = c; { - auto c_temp_gpu = blas::make_sycl_iterator_buffer(c_temp, m * n); - auto event = _symm(sb_handle, side, uplo, m, n, alpha, a_gpu, lda, b_gpu, - ldb, beta, c_temp_gpu, ldc); - sb_handle.wait(event); + auto c_temp_gpu = blas::helper::allocate(m * n, q); + auto copy_temp = blas::helper::copy_to_device(q, c_temp.data(), + c_temp_gpu, m * n); + sb_handle.wait(copy_temp); + auto symm_event = _symm(sb_handle, side, uplo, m, n, alpha, a_gpu, lda, + b_gpu, ldb, beta, c_temp_gpu, ldc); + sb_handle.wait(symm_event); + auto copy_out = blas::helper::copy_to_host(q, c_temp_gpu, + c_temp.data(), m * n); + sb_handle.wait(copy_out); + + blas::helper::deallocate(c_temp_gpu, q); } std::ostringstream err_stream; @@ -109,15 +127,18 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, char side, state.counters["bytes_processed"]); blas_benchmark::utils::calc_avg_counters(state); -}; -template -void register_benchmark(blas_benchmark::Args& args, - blas::SB_Handle* sb_handle_ptr, bool* success) { - auto symm_params = blas_benchmark::utils::get_symm_params(args); + blas::helper::deallocate(a_gpu, q); + blas::helper::deallocate(b_gpu, q); + blas::helper::deallocate(c_gpu, q); +}; - for (auto p : symm_params) { - std::string side, uplo; +template +void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success, + std::string mem_type, + std::vector> params) { + for (auto p : params) { + std::string side, uplo; index_t m, n; scalar_t alpha, beta; std::tie(side, uplo, m, n, alpha, beta) = p; @@ 
-128,19 +149,31 @@ void register_benchmark(blas_benchmark::Args& args, auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr, char side, char uplo, index_t m, index_t n, scalar_t alpha, scalar_t beta, bool* success) { - run(st, sb_handle_ptr, side_c, uplo_c, m, n, alpha, beta, + run(st, sb_handle_ptr, side_c, uplo_c, m, n, alpha, beta, success); }; benchmark::RegisterBenchmark( blas_benchmark::utils::get_name( side, uplo, m, n, alpha, beta, - blas_benchmark::utils::MEM_TYPE_BUFFER) - .c_str(), + mem_type).c_str(), BM_lambda, sb_handle_ptr, side_c, uplo_c, m, n, alpha, beta, success) ->UseRealTime(); } } +template +void register_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { + auto symm_params = blas_benchmark::utils::get_symm_params(args); + + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, symm_params); +#ifdef SB_ENABLE_USM + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, symm_params); +#endif +} + namespace blas_benchmark { void create_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr, bool* success) { diff --git a/benchmark/syclblas/blas3/trsm.cpp b/benchmark/syclblas/blas3/trsm.cpp index bdafd4558..ce115b669 100644 --- a/benchmark/syclblas/blas3/trsm.cpp +++ b/benchmark/syclblas/blas3/trsm.cpp @@ -28,7 +28,7 @@ constexpr blas_benchmark::utils::Level3Op benchmark_op = blas_benchmark::utils::Level3Op::trsm; -template +template void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, char side, char uplo, char trans, char diag, index_t m, index_t n, scalar_t alpha, bool* success) { @@ -44,8 +44,8 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, char side, blas_benchmark::utils::init_level_3_counters< blas_benchmark::utils::Level3Op::trsm, scalar_t>(state, 0, m, n, 0, 1, side); - blas::SB_Handle& sb_handle = *sb_handle_ptr; + auto q = sb_handle.get_queue(); const int sizeA = k * lda; const int sizeB = n * ldb; @@ -62,11 +62,15 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, char side, blas_benchmark::utils::fill_trsm_matrix(a, k, lda, uplo, diagValue, scalar_t{0}); - auto a_gpu = blas::make_sycl_iterator_buffer(a, sizeA); - auto b_gpu = blas::make_sycl_iterator_buffer(b, sizeB); + auto a_gpu = blas::helper::allocate(sizeA, q); + auto b_gpu = blas::helper::allocate(sizeB, q); - a_gpu.get_buffer().set_final_data(nullptr); - b_gpu.get_buffer().set_final_data(nullptr); + auto copy_a = + blas::helper::copy_to_device(q, a.data(), a_gpu, sizeA); + auto copy_b = + blas::helper::copy_to_device(q, b.data(), b_gpu, sizeB); + + sb_handle.wait({copy_a, copy_b}); #ifdef BLAS_VERIFY_BENCHMARK // Run once verifying the results against the reference blas implementation. 
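As the symm registration hunk just above (and every other benchmark in this patch) shows, registration is now split in two: a memory-type-agnostic `register_benchmark(sb_handle_ptr, success, mem_type, params)` overload that loops over the parameter set, and a thin dispatcher that registers the buffer variant unconditionally and the USM variant only when `SB_ENABLE_USM` is defined. A condensed sketch of that dispatch follows; the `AllocType` spelling and the extra `params_t` template parameter are assumptions used to keep the fragment self-contained, while the `MEM_TYPE_*` labels, the guard macro and the function names come from the hunks themselves.

// Condensed sketch of the two-level registration used throughout this patch.
// Per-memory-type overload: registers one google-benchmark instance per
// parameter combination, labelled with mem_type (defined earlier in each file).
template <typename scalar_t, blas::helper::AllocType mem_alloc,
          typename params_t>
void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success,
                        std::string mem_type, params_t params);

// Dispatcher: buffer benchmarks are always registered, USM benchmarks only
// when the library is built with USM support.
template <typename scalar_t>
void register_benchmark(blas_benchmark::Args& args,
                        blas::SB_Handle* sb_handle_ptr, bool* success) {
  auto symm_params = blas_benchmark::utils::get_symm_params<scalar_t>(args);

  register_benchmark<scalar_t, blas::helper::AllocType::buffer>(
      sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER,
      symm_params);
#ifdef SB_ENABLE_USM
  register_benchmark<scalar_t, blas::helper::AllocType::usm>(
      sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM,
      symm_params);
#endif
}

Because the dispatcher parses the command-line parameters once and hands the same set to both calls, the buffer and USM runs of a given operator are registered over identical problem sizes and are directly comparable.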
@@ -78,12 +82,18 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, char side, x_ref.data(), ldb); { - auto b_temp_gpu = blas::make_sycl_iterator_buffer(b_temp, sizeB); - _trsm(sb_handle, side, uplo, trans, diag, m, n, alpha, a_gpu, lda, - b_temp_gpu, ldb); + auto b_temp_gpu = blas::helper::allocate(sizeB, q); + auto copy_temp = blas::helper::copy_to_device(q, b_temp.data(), + b_temp_gpu, sizeB); + sb_handle.wait({copy_temp}); + auto trsm_event = _trsm(sb_handle, side, uplo, trans, diag, m, n, alpha, + a_gpu, lda, b_temp_gpu, ldb); + sb_handle.wait(trsm_event); auto event = blas::helper::copy_to_host(sb_handle.get_queue(), b_temp_gpu, b_temp.data(), sizeB); sb_handle.wait(event); + + blas::helper::deallocate(b_temp_gpu, q); } std::ostringstream err_stream; @@ -148,14 +158,16 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, char side, state.counters["bytes_processed"]); blas_benchmark::utils::calc_avg_counters(state); -}; -template -void register_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr, - bool* success) { - auto trsm_params = blas_benchmark::utils::get_trsm_params(args); + blas::helper::deallocate(a_gpu, q); + blas::helper::deallocate(b_gpu, q); +}; - for (auto p : trsm_params) { +template +void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success, + std::string mem_type, + std::vector> params) { + for (auto p : params) { char side, uplo, trans, diag; index_t m, n; scalar_t alpha; @@ -164,18 +176,30 @@ void register_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_p auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr, char side, char uplo, char trans, char diag, index_t m, index_t n, scalar_t alpha, bool* success) { - run(st, sb_handle_ptr, side, uplo, trans, diag, m, n, alpha, success); + run(st, sb_handle_ptr, side, uplo, trans, diag, m, n, + alpha, success); }; benchmark::RegisterBenchmark( blas_benchmark::utils::get_name( side, uplo, trans, diag, m, n, - blas_benchmark::utils::MEM_TYPE_BUFFER) - .c_str(), + mem_type).c_str(), BM_lambda, sb_handle_ptr, side, uplo, trans, diag, m, n, alpha, success) ->UseRealTime(); } } +template +void register_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { + auto trsm_params = blas_benchmark::utils::get_trsm_params(args); + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, trsm_params); +#ifdef SB_ENABLE_USM + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, trsm_params); +#endif +} + namespace blas_benchmark { void create_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr, bool* success) { diff --git a/benchmark/syclblas/extension/reduction.cpp b/benchmark/syclblas/extension/reduction.cpp index 78bba15b1..9fd98f9c5 100644 --- a/benchmark/syclblas/extension/reduction.cpp +++ b/benchmark/syclblas/extension/reduction.cpp @@ -30,7 +30,7 @@ using namespace blas; constexpr blas_benchmark::utils::ExtensionOp benchmark_op = blas_benchmark::utils::ExtensionOp::reduction; -template +template void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t rows, index_t cols, reduction_dim_t dim, bool* success) { // initialize the state label @@ -49,16 +49,24 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t rows, state.counters["bytes_processed"] = (rows_d * cols_d) * sizeof(scalar_t); blas::SB_Handle& sb_handle = *sb_handle_ptr; + auto q = sb_handle.get_queue(); // Matrix std::vector 
mat = blas_benchmark::utils::random_data(rows * cols); - auto mat_buffer = blas::make_sycl_iterator_buffer(mat, rows * cols); - // Output vector std::vector vec = blas_benchmark::utils::random_data( (dim == reduction_dim_t::outer) ? rows : cols); - auto vec_buffer = blas::make_sycl_iterator_buffer(vec, vec.size()); + + auto mat_buffer = blas::helper::allocate(rows * cols, q); + auto vec_buffer = blas::helper::allocate(vec.size(), q); + + auto copy_mat = blas::helper::copy_to_device( + q, mat.data(), mat_buffer, rows * cols); + auto copy_vec = blas::helper::copy_to_device( + q, vec.data(), vec_buffer, vec.size()); + + sb_handle.wait({copy_mat, copy_vec}); /* If enabled, run a first time with a verification of the results */ #ifdef BLAS_VERIFY_BENCHMARK @@ -81,15 +89,20 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t rows, } std::vector vec_temp = vec; { - auto vec_temp_buffer = blas::make_sycl_iterator_buffer( - vec_temp.data(), vec_temp.size()); - - extension::_reduction( + auto vec_temp_buffer = + blas::helper::allocate(vec_temp.size(), q); + auto copy_temp = blas::helper::copy_to_device( + q, vec_temp.data(), vec_temp_buffer, vec_temp.size()); + sb_handle.wait({copy_temp}); + auto reduction_event = extension::_reduction( sb_handle, mat_buffer, rows, vec_temp_buffer, rows, cols, dim); - auto event = - blas::helper::copy_to_host(sb_handle.get_queue(), vec_temp_buffer, - vec_temp.data(), vec_temp.size()); + sb_handle.wait(reduction_event); + + auto event = blas::helper::copy_to_host(q, vec_temp_buffer, vec_temp.data(), + vec_temp.size()); sb_handle.wait(event); + + blas::helper::deallocate(vec_temp_buffer, q); } std::ostringstream err_stream; @@ -124,38 +137,53 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t rows, } blas_benchmark::utils::calc_avg_counters(state); -}; -template -void register_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr, - bool* success) { - auto red_params = blas_benchmark::utils::get_reduction_params(args); + blas::helper::deallocate(mat_buffer, q); + blas::helper::deallocate(vec_buffer, q); +}; - for (auto p : red_params) { +template +void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success, + reduction_dim_t dimension, std::string mem_type, + std::vector params) { + for (auto p : params) { index_t rows, cols; std::tie(rows, cols) = p; - auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr, index_t rows, index_t cols, reduction_dim_t dim, bool* success) { - run(st, sb_handle_ptr, rows, cols, dim, success); + run(st, sb_handle_ptr, rows, cols, dim, success); }; benchmark::RegisterBenchmark( blas_benchmark::utils::get_name( - rows, cols, "inner", blas_benchmark::utils::MEM_TYPE_BUFFER) - .c_str(), - BM_lambda, sb_handle_ptr, rows, cols, reduction_dim_t::inner, success); - benchmark::RegisterBenchmark( - blas_benchmark::utils::get_name( - rows, cols, "outer", blas_benchmark::utils::MEM_TYPE_BUFFER) - .c_str(), - BM_lambda, sb_handle_ptr, rows, cols, reduction_dim_t::outer, success); + rows, cols, (dimension == reduction_dim_t::inner ? 
"inner" : "outer"), mem_type).c_str(), + BM_lambda, sb_handle_ptr, rows, cols, reduction_dim_t::outer, success) + ->UseRealTime(); } } +template +void register_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { + auto reduction_params = + blas_benchmark::utils::get_reduction_params(args); + register_benchmark( + sb_handle_ptr, success, reduction_dim_t::inner, blas_benchmark::utils::MEM_TYPE_BUFFER, + reduction_params); + register_benchmark( + sb_handle_ptr, success, reduction_dim_t::outer, blas_benchmark::utils::MEM_TYPE_BUFFER, + reduction_params); +#ifdef SB_ENABLE_USM + register_benchmark( + sb_handle_ptr, success, reduction_dim_t::inner, blas_benchmark::utils::MEM_TYPE_USM, reduction_params); + register_benchmark( + sb_handle_ptr, success, reduction_dim_t::outer, blas_benchmark::utils::MEM_TYPE_USM, reduction_params); +#endif +} + namespace blas_benchmark { -void create_benchmark(blas_benchmark::Args& args, blas::SB_Handle* sb_handle_ptr, - bool* success) { +void create_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { BLAS_REGISTER_BENCHMARK(args, sb_handle_ptr, success); } } // namespace blas_benchmark diff --git a/cmake/CmakeFunctionHelper.cmake b/cmake/CmakeFunctionHelper.cmake index daacdb9d5..901c642ee 100644 --- a/cmake/CmakeFunctionHelper.cmake +++ b/cmake/CmakeFunctionHelper.cmake @@ -93,6 +93,9 @@ set(LOCATION "${SYCLBLAS_GENERATED_SRC}/${blas_level}/${func}/") foreach(data ${data_list}) cpp_type(cpp_data ${data}) set(container_list "BufferIterator<${cpp_data}>") + if(${SB_ENABLE_USM}) + list(APPEND container_list "${cpp_data}*") + endif() foreach(index ${index_list}) foreach(container0 ${container_list}) foreach(increment ${index_list}) @@ -135,34 +138,38 @@ set(LOCATION "${SYCLBLAS_GENERATED_SRC}/${blas_level}/${func}/") foreach(data ${data_list}) cpp_type(cpp_data ${data}) set(container_list "BufferIterator<${cpp_data}>") + if(${SB_ENABLE_USM}) + list(APPEND container_list "${cpp_data}*") + endif() foreach(index ${index_list}) + set(idx 0) foreach(container0 ${container_list}) - foreach(container1 ${container_list}) - set(container_names "${container0}_${container1}") - foreach(increment ${index_list}) - sanitize_file_name(file_name - "${func}_${data}_${index}_${container_names}_${increment}.cpp") - add_custom_command(OUTPUT "${LOCATION}/${file_name}" - COMMAND ${PYTHON_EXECUTABLE} ${SYCLBLAS_SRC_GENERATOR}/py_gen_blas_binary.py - ${PROJECT_SOURCE_DIR}/external/ - ${SYCLBLAS_SRC_GENERATOR}/gen - ${blas_level} - ${func} - ${SYCLBLAS_SRC}/interface/${blas_level}/${func}.cpp.in - ${cpp_data} - ${index} - ${increment} - ${container0} - ${container1} - ${file_name} - MAIN_DEPENDENCY ${SYCLBLAS_SRC}/interface/${blas_level}/${func}.cpp.in - DEPENDS ${SYCLBLAS_SRC_GENERATOR}/py_gen_blas_binary.py - WORKING_DIRECTORY ${PROJECT_BINARY_DIR} - VERBATIM - ) - list(APPEND FUNC_SRC "${LOCATION}/${file_name}") - endforeach(increment) - endforeach(container1) + list(GET container_list ${idx} container1) + MATH(EXPR idx "${idx}+1") + set(container_names "${container0}_${container1}") + foreach(increment ${index_list}) + sanitize_file_name(file_name + "${func}_${data}_${index}_${container_names}_${increment}.cpp") + add_custom_command(OUTPUT "${LOCATION}/${file_name}" + COMMAND ${PYTHON_EXECUTABLE} ${SYCLBLAS_SRC_GENERATOR}/py_gen_blas_binary.py + ${PROJECT_SOURCE_DIR}/external/ + ${SYCLBLAS_SRC_GENERATOR}/gen + ${blas_level} + ${func} + ${SYCLBLAS_SRC}/interface/${blas_level}/${func}.cpp.in + ${cpp_data} + ${index} 
+ ${increment} + ${container0} + ${container1} + ${file_name} + MAIN_DEPENDENCY ${SYCLBLAS_SRC}/interface/${blas_level}/${func}.cpp.in + DEPENDS ${SYCLBLAS_SRC_GENERATOR}/py_gen_blas_binary.py + WORKING_DIRECTORY ${PROJECT_BINARY_DIR} + VERBATIM + ) + list(APPEND FUNC_SRC "${LOCATION}/${file_name}") + endforeach(increment) endforeach(container0) endforeach(index) endforeach(data) @@ -188,41 +195,50 @@ foreach(data ${data_list}) set(container_list_in) if(pos EQUAL -1) list(APPEND container_list_in "BufferIterator<${cpp_data}>") + if(${SB_ENABLE_USM}) + list(APPEND container_list_in "${cpp_data}*") + endif() else() - list(APPEND container_list_in "BufferIterator<${cpp_data} const>") + list(APPEND container_list_in "BufferIterator<${cpp_data}> const") + if(${SB_ENABLE_USM}) + list(APPEND container_list_in "${cpp_data}* const") + endif() endif() set(container_list_out "BufferIterator<${cpp_data}>") + if(${SB_ENABLE_USM}) + list(APPEND container_list_out "${cpp_data}*") + endif() foreach(index ${index_list}) - set(container_list "BufferIterator<${cpp_data}>") foreach(operator ${operator_list}) + set(idx 0) foreach(container0 ${container_list_in}) - foreach(container1 ${container_list_out}) - set(container_names "${container0}_${container1}") - foreach(increment ${index_list}) - sanitize_file_name(file_name - "${func}_${operator}_${data}_${index}_${container0}_${increment}.cpp") - add_custom_command(OUTPUT "${LOCATION}/${file_name}" - COMMAND ${PYTHON_EXECUTABLE} ${SYCLBLAS_SRC_GENERATOR}/py_gen_blas_reduction.py - ${PROJECT_SOURCE_DIR}/external/ - ${SYCLBLAS_SRC_GENERATOR}/gen - ${blas_level} - ${func} - ${SYCLBLAS_SRC}/interface/${blas_level}/${actualfunc}.cpp.in - ${cpp_data} - ${index} - ${increment} - ${container0} - ${container1} - ${operator} - ${file_name} - MAIN_DEPENDENCY ${SYCLBLAS_SRC}/interface/${blas_level}/${actualfunc}.cpp.in - DEPENDS ${SYCLBLAS_SRC_GENERATOR}/py_gen_blas_reduction.py - WORKING_DIRECTORY ${PROJECT_BINARY_DIR} - VERBATIM - ) - list(APPEND FUNC_SRC "${LOCATION}/${file_name}") - endforeach(increment) - endforeach(container1) + list(GET container_list_in ${idx} container1) + MATH(EXPR idx "${idx}+1") + set(container_names "${container0}_${container1}") + foreach(increment ${index_list}) + sanitize_file_name(file_name + "${func}_${operator}_${data}_${index}_${container0}_${container1}_${increment}.cpp") + add_custom_command(OUTPUT "${LOCATION}/${file_name}" + COMMAND ${PYTHON_EXECUTABLE} ${SYCLBLAS_SRC_GENERATOR}/py_gen_blas_reduction.py + ${PROJECT_SOURCE_DIR}/external/ + ${SYCLBLAS_SRC_GENERATOR}/gen + ${blas_level} + ${func} + ${SYCLBLAS_SRC}/interface/${blas_level}/${actualfunc}.cpp.in + ${cpp_data} + ${index} + ${increment} + ${container0} + ${container1} + ${operator} + ${file_name} + MAIN_DEPENDENCY ${SYCLBLAS_SRC}/interface/${blas_level}/${actualfunc}.cpp.in + DEPENDS ${SYCLBLAS_SRC_GENERATOR}/py_gen_blas_reduction.py + WORKING_DIRECTORY ${PROJECT_BINARY_DIR} + VERBATIM + ) + list(APPEND FUNC_SRC "${LOCATION}/${file_name}") + endforeach(increment) endforeach(container0) endforeach(operator) endforeach(index) @@ -242,36 +258,44 @@ set(LOCATION "${SYCLBLAS_GENERATED_SRC}/${blas_level}/${func}/") foreach(data ${data_list}) cpp_type(cpp_data ${data}) set(container_list_in "BufferIterator<${cpp_data}>") + if(${SB_ENABLE_USM}) + list(APPEND container_list_in "${cpp_data}*") + endif() foreach(index ${index_list}) set(container_list_out - "BufferIterator>") + "BufferIterator>") + if(${SB_ENABLE_USM}) + list(APPEND container_list_out + 
"IndexValueTuple<${index},${cpp_data}>*") + endif() + set(idx 0) foreach(container0 ${container_list_in}) - foreach(container1 ${container_list_out}) - set(container_names "${container0}_${container1}") - foreach(increment ${index_list}) - sanitize_file_name(file_name - "${func}_${data}_${index}_${container_names}_${increment}.cpp") - add_custom_command(OUTPUT "${LOCATION}/${file_name}" - COMMAND ${PYTHON_EXECUTABLE} ${SYCLBLAS_SRC_GENERATOR}/py_gen_blas_binary_special.py - ${PROJECT_SOURCE_DIR}/external/ - ${SYCLBLAS_SRC_GENERATOR}/gen - ${blas_level} - ${func} - ${SYCLBLAS_SRC}/interface/${blas_level}/${func}.cpp.in - ${cpp_data} - ${index} - ${increment} - ${container0} - ${container1} - ${file_name} - MAIN_DEPENDENCY ${SYCLBLAS_SRC}/interface/${blas_level}/${func}.cpp.in - DEPENDS ${SYCLBLAS_SRC_GENERATOR}/py_gen_blas_binary_special.py - WORKING_DIRECTORY ${PROJECT_BINARY_DIR} - VERBATIM - ) - list(APPEND FUNC_SRC "${LOCATION}/${file_name}") - endforeach(increment) - endforeach(container1) + list(GET container_list_out ${idx} container1) + MATH(EXPR idx "${idx}+1") + set(container_names "${container0}_${container1}") + foreach(increment ${index_list}) + sanitize_file_name(file_name + "${func}_${data}_${index}_${container_names}_${increment}.cpp") + add_custom_command(OUTPUT "${LOCATION}/${file_name}" + COMMAND ${PYTHON_EXECUTABLE} ${SYCLBLAS_SRC_GENERATOR}/py_gen_blas_binary_special.py + ${PROJECT_SOURCE_DIR}/external/ + ${SYCLBLAS_SRC_GENERATOR}/gen + ${blas_level} + ${func} + ${SYCLBLAS_SRC}/interface/${blas_level}/${func}.cpp.in + ${cpp_data} + ${index} + ${increment} + ${container0} + ${container1} + ${file_name} + MAIN_DEPENDENCY ${SYCLBLAS_SRC}/interface/${blas_level}/${func}.cpp.in + DEPENDS ${SYCLBLAS_SRC_GENERATOR}/py_gen_blas_binary_special.py + WORKING_DIRECTORY ${PROJECT_BINARY_DIR} + VERBATIM + ) + list(APPEND FUNC_SRC "${LOCATION}/${file_name}") + endforeach(increment) endforeach(container0) endforeach(index) endforeach(data) @@ -297,42 +321,51 @@ foreach(data ${data_list}) set(container_list_in) if(const_pos EQUAL -1) list(APPEND container_list_in "BufferIterator<${cpp_data}>") + if(${SB_ENABLE_USM}) + list(APPEND container_list_in "${cpp_data}*") + endif() else() - list(APPEND container_list_in "BufferIterator<${cpp_data} const>") + list(APPEND container_list_in "BufferIterator<${cpp_data}> const") + if(${SB_ENABLE_USM}) + list(APPEND container_list_in "${cpp_data}* const") + endif() endif() set(container_list_out "BufferIterator<${cpp_data}>") + if(${SB_ENABLE_USM}) + list(APPEND container_list_out "${cpp_data}*") + endif() foreach(index ${index_list}) + set(idx 0) foreach(container0 ${container_list_in}) - foreach(container1 ${container_list_in}) - foreach(container2 ${container_list_out}) - set(container_names - "${container0}_${container1}_${container2}") - foreach(increment ${index_list}) - sanitize_file_name(file_name - "${func}_${data}_${index}_${container_names}_${increment}.cpp") - add_custom_command(OUTPUT "${LOCATION}/${file_name}" - COMMAND ${PYTHON_EXECUTABLE} ${SYCLBLAS_SRC_GENERATOR}/py_gen_blas_ternary.py - ${PROJECT_SOURCE_DIR}/external/ - ${SYCLBLAS_SRC_GENERATOR}/gen - ${blas_level} - ${func} - ${SYCLBLAS_SRC}/interface/${blas_level}/${actualfunc}.cpp.in - ${cpp_data} - ${index} - ${increment} - ${container0} - ${container1} - ${container2} - ${file_name} - MAIN_DEPENDENCY ${SYCLBLAS_SRC}/interface/${blas_level}/${actualfunc}.cpp.in - DEPENDS ${SYCLBLAS_SRC_GENERATOR}/py_gen_blas_ternary.py - WORKING_DIRECTORY ${PROJECT_BINARY_DIR} - VERBATIM - ) - 
list(APPEND FUNC_SRC "${LOCATION}/${file_name}") - endforeach(increment) - endforeach(container2) - endforeach(container1) + list(GET container_list_in ${idx} container1) + list(GET container_list_out ${idx} container2) + MATH(EXPR idx "${idx}+1") + set(container_names + "${container0}_${container1}_${container2}") + foreach(increment ${index_list}) + sanitize_file_name(file_name + "${func}_${data}_${index}_${container_names}_${increment}.cpp") + add_custom_command(OUTPUT "${LOCATION}/${file_name}" + COMMAND ${PYTHON_EXECUTABLE} ${SYCLBLAS_SRC_GENERATOR}/py_gen_blas_ternary.py + ${PROJECT_SOURCE_DIR}/external/ + ${SYCLBLAS_SRC_GENERATOR}/gen + ${blas_level} + ${func} + ${SYCLBLAS_SRC}/interface/${blas_level}/${actualfunc}.cpp.in + ${cpp_data} + ${index} + ${increment} + ${container0} + ${container1} + ${container2} + ${file_name} + MAIN_DEPENDENCY ${SYCLBLAS_SRC}/interface/${blas_level}/${actualfunc}.cpp.in + DEPENDS ${SYCLBLAS_SRC_GENERATOR}/py_gen_blas_ternary.py + WORKING_DIRECTORY ${PROJECT_BINARY_DIR} + VERBATIM + ) + list(APPEND FUNC_SRC "${LOCATION}/${file_name}") + endforeach(increment) endforeach(container0) endforeach(index) endforeach(data) @@ -351,35 +384,37 @@ function(generate_blas_rotg_objects blas_level func) foreach (data ${data_list}) cpp_type(cpp_data ${data}) set(container_list_in_out "BufferIterator<${cpp_data}>") + if(${SB_ENABLE_USM}) + list(APPEND container_list_in_out "${cpp_data}*") + endif() + set(idx 0) foreach (container0 ${container_list_in_out}) - foreach (container1 ${container_list_in_out}) - foreach (container2 ${container_list_in_out}) - foreach (container3 ${container_list_in_out}) - set(container_names "${container0}_${container1}_${container2}_${container3}") - sanitize_file_name(file_name - "${func}_${data}_${index}_${container_names}_${increment}.cpp") - add_custom_command(OUTPUT "${LOCATION}/${file_name}" - COMMAND ${PYTHON_EXECUTABLE} ${SYCLBLAS_SRC_GENERATOR}/py_gen_blas_rotg.py - ${PROJECT_SOURCE_DIR}/external/ - ${SYCLBLAS_SRC_GENERATOR}/gen - ${blas_level} - ${func} - ${SYCLBLAS_SRC}/interface/${blas_level}/${func}.cpp.in - ${cpp_data} - ${container0} - ${container1} - ${container2} - ${container3} - ${file_name} - MAIN_DEPENDENCY ${SYCLBLAS_SRC}/interface/${blas_level}/${func}.cpp.in - DEPENDS ${SYCLBLAS_SRC_GENERATOR}/py_gen_blas_rotg.py - WORKING_DIRECTORY ${PROJECT_BINARY_DIR} - VERBATIM - ) - list(APPEND FUNC_SRC "${LOCATION}/${file_name}") - endforeach (container3) - endforeach (container2) - endforeach (container1) + list(GET container_list_in_out ${idx} container1) + list(GET container_list_in_out ${idx} container2) + list(GET container_list_in_out ${idx} container3) + MATH(EXPR idx "${idx}+1") + set(container_names "${container0}_${container1}_${container2}_${container3}") + sanitize_file_name(file_name + "${func}_${data}_${index}_${container_names}_${increment}.cpp") + add_custom_command(OUTPUT "${LOCATION}/${file_name}" + COMMAND ${PYTHON_EXECUTABLE} ${SYCLBLAS_SRC_GENERATOR}/py_gen_blas_rotg.py + ${PROJECT_SOURCE_DIR}/external/ + ${SYCLBLAS_SRC_GENERATOR}/gen + ${blas_level} + ${func} + ${SYCLBLAS_SRC}/interface/${blas_level}/${func}.cpp.in + ${cpp_data} + ${container0} + ${container1} + ${container2} + ${container3} + ${file_name} + MAIN_DEPENDENCY ${SYCLBLAS_SRC}/interface/${blas_level}/${func}.cpp.in + DEPENDS ${SYCLBLAS_SRC_GENERATOR}/py_gen_blas_rotg.py + WORKING_DIRECTORY ${PROJECT_BINARY_DIR} + VERBATIM + ) + list(APPEND FUNC_SRC "${LOCATION}/${file_name}") endforeach (container0) endforeach (data) add_library(${func} OBJECT 
${FUNC_SRC}) @@ -396,7 +431,7 @@ function(generate_blas_rotg_return_objects blas_level func) set(LOCATION "${SYCLBLAS_GENERATED_SRC}/${blas_level}/${func}/") foreach (data ${data_list}) cpp_type(cpp_data ${data}) - set(container_list "BufferIterator<${cpp_data}>") + set(container_list "${cpp_data}") sanitize_file_name(file_name "${func}_${data}_${index}_${container0}_${increment}.cpp") add_custom_command(OUTPUT "${LOCATION}/${file_name}" @@ -429,37 +464,38 @@ function(generate_blas_rotmg_objects blas_level func) foreach (data ${data_list}) cpp_type(cpp_data ${data}) set(container_list_in_out "BufferIterator<${cpp_data}>") + if(${SB_ENABLE_USM}) + list(APPEND container_list_in_out "${cpp_data}*") + endif() + set(idx 0) foreach (container0 ${container_list_in_out}) - foreach (container1 ${container_list_in_out}) - foreach (container2 ${container_list_in_out}) - foreach (container3 ${container_list_in_out}) - foreach (container4 ${container_list_in_out}) - set(container_names "${container0}_${container1}_${container2}_${container3}") - sanitize_file_name(file_name "${func}_${data}_${container_names}.cpp") - add_custom_command(OUTPUT "${LOCATION}/${file_name}" - COMMAND ${PYTHON_EXECUTABLE} ${SYCLBLAS_SRC_GENERATOR}/py_gen_blas_rotmg.py - ${PROJECT_SOURCE_DIR}/external/ - ${SYCLBLAS_SRC_GENERATOR}/gen - ${blas_level} - ${func} - ${SYCLBLAS_SRC}/interface/${blas_level}/${func}.cpp.in - ${cpp_data} - ${container0} - ${container1} - ${container2} - ${container3} - ${container4} - ${file_name} - MAIN_DEPENDENCY ${SYCLBLAS_SRC}/interface/${blas_level}/${func}.cpp.in - DEPENDS ${SYCLBLAS_SRC_GENERATOR}/py_gen_blas_rotmg.py - WORKING_DIRECTORY ${PROJECT_BINARY_DIR} - VERBATIM - ) - list(APPEND FUNC_SRC "${LOCATION}/${file_name}") - endforeach (container4) - endforeach (container3) - endforeach (container2) - endforeach (container1) + list(GET container_list_in_out ${idx} container1) + list(GET container_list_in_out ${idx} container2) + list(GET container_list_in_out ${idx} container3) + list(GET container_list_in_out ${idx} container4) + MATH(EXPR idx "${idx}+1") + set(container_names "${container0}_${container1}_${container2}_${container3}_${container4}") + sanitize_file_name(file_name "${func}_${data}_${container_names}.cpp") + add_custom_command(OUTPUT "${LOCATION}/${file_name}" + COMMAND ${PYTHON_EXECUTABLE} ${SYCLBLAS_SRC_GENERATOR}/py_gen_blas_rotmg.py + ${PROJECT_SOURCE_DIR}/external/ + ${SYCLBLAS_SRC_GENERATOR}/gen + ${blas_level} + ${func} + ${SYCLBLAS_SRC}/interface/${blas_level}/${func}.cpp.in + ${cpp_data} + ${container0} + ${container1} + ${container2} + ${container3} + ${container4} + ${file_name} + MAIN_DEPENDENCY ${SYCLBLAS_SRC}/interface/${blas_level}/${func}.cpp.in + DEPENDS ${SYCLBLAS_SRC_GENERATOR}/py_gen_blas_rotmg.py + WORKING_DIRECTORY ${PROJECT_BINARY_DIR} + VERBATIM + ) + list(APPEND FUNC_SRC "${LOCATION}/${file_name}") endforeach (container0) endforeach (data) add_library(${func} OBJECT ${FUNC_SRC}) @@ -513,12 +549,37 @@ function(add_gemm_configuration # Tall/skinny configurations not enabled, skip return() endif() + string(FIND ${func} "_const" const_pos) + if(const_pos) + string(REPLACE "_const" "" actualfunc ${func}) + endif() cpp_type(cpp_data ${data}) - foreach(symm_a ${boolean_list}) - foreach(symm_b ${boolean_list}) - foreach(trans_a ${boolean_list}) - foreach(trans_b ${boolean_list}) - foreach(is_beta_zero ${boolean_list}) + set(container_list_in) + if(const_pos EQUAL -1) + list(APPEND container_list_in "BufferIterator<${cpp_data}>") + if(${SB_ENABLE_USM}) + 
list(APPEND container_list_in "${cpp_data}*") + endif() + else() + list(APPEND container_list_in "BufferIterator<${cpp_data}> const") + if(${SB_ENABLE_USM}) + list(APPEND container_list_in "${cpp_data}* const") + endif() + endif() + set(container_list_out "BufferIterator<${cpp_data}>") + if(${SB_ENABLE_USM}) + list(APPEND container_list_out "${cpp_data}*") + endif() + set(idx 0) + foreach (container0 ${container_list_in}) + list(GET container_list_in ${idx} container1) + list(GET container_list_out ${idx} container2) + MATH(EXPR idx "${idx}+1") + foreach(symm_a ${boolean_list}) + foreach(symm_b ${boolean_list}) + foreach(trans_a ${boolean_list}) + foreach(trans_b ${boolean_list}) + foreach(is_beta_zero ${boolean_list}) foreach(index ${index_list}) set(file_name "${func}_${double_buffer}_${conflict_a}_" "${conflict_b}_${trans_a}_${trans_b}_" @@ -529,7 +590,7 @@ function(add_gemm_configuration "${twc}_${tsr}_${tsc}_${tlr}_${tlc}_" "${item_batch}_${wg_batch}_${symm_a}_${symm_b}_" "${jm_m}_${jm_n}_${jm_k}_${jm_in_type}_${jm_out_type}_" - "${wg_size}_${cache_line_size}.cpp") + "${wg_size}_${cache_line_size}_${container0}.cpp") sanitize_file_name(file_name "${file_name}") add_custom_command(OUTPUT "${LOCATION}/${file_name}" COMMAND ${PYTHON_EXECUTABLE} ${SYCLBLAS_SRC_GENERATOR}/py_gen_blas_gemm_launcher.py @@ -572,6 +633,9 @@ function(add_gemm_configuration ${use_joint_matrix} ${symm_a} ${symm_b} + ${container0} + ${container1} + ${container2} MAIN_DEPENDENCY ${SYCLBLAS_SRC}/interface/${blas_level}/${func}.cpp.in DEPENDS ${SYCLBLAS_SRC_GENERATOR}/py_gen_blas_gemm_launcher.py WORKING_DIRECTORY ${PROJECT_BINARY_DIR} @@ -580,11 +644,12 @@ function(add_gemm_configuration list(APPEND gemm_sources "${LOCATION}/${file_name}") set(gemm_sources "${gemm_sources}" PARENT_SCOPE) endforeach(index) - endforeach(is_beta_zero) - endforeach(trans_b) - endforeach(trans_a) - endforeach(symm_b) - endforeach(symm_a) + endforeach(is_beta_zero) + endforeach(trans_b) + endforeach(trans_a) + endforeach(symm_b) + endforeach(symm_a) + endforeach(container0) endfunction() if(${TUNING_TARGET} STREQUAL "INTEL_GPU") set(supported_types diff --git a/cmake/Modules/FindDPCPP.cmake b/cmake/Modules/FindDPCPP.cmake index e55837105..e9472db9e 100644 --- a/cmake/Modules/FindDPCPP.cmake +++ b/cmake/Modules/FindDPCPP.cmake @@ -81,6 +81,11 @@ if (${DPCPP_SYCL_TARGET} STREQUAL "nvptx64-nvidia-cuda") endif() endif() +# add compiler directive to enable USM code +add_definitions(-DSB_ENABLE_USM=1) +set(SB_ENABLE_USM 1) +list(APPEND DPCPP_FLAGS "-DSB_ENABLE_USM=1") + function(add_sycl_to_target) set(options) set(one_value_args TARGET) diff --git a/include/blas_meta.h b/include/blas_meta.h index 864eb8c9a..93c98c6e1 100644 --- a/include/blas_meta.h +++ b/include/blas_meta.h @@ -97,8 +97,8 @@ struct Choose { /// \tparam element_t : the type we are interested in template struct RemoveAll { - using Type = typename std::remove_reference< - typename std::remove_cv::type>::type; + using Type = typename std::remove_reference::type>::type>::type; }; template @@ -184,6 +184,12 @@ struct is_sycl_scalar template <> struct is_sycl_scalar : std::true_type {}; +template <> +struct is_sycl_scalar : std::false_type {}; + +template <> +struct is_sycl_scalar : std::false_type {}; + } // namespace blas #endif // BLAS_META_H diff --git a/include/interface/blas1_interface.h b/include/interface/blas1_interface.h index ead0e92f8..8bc20efb3 100644 --- a/include/interface/blas1_interface.h +++ b/include/interface/blas1_interface.h @@ -34,173 +34,190 @@ namespace 
internal { * Implements AXPY \f$y = ax + y\f$ * * @param sb_handle SB_Handle - * @param _vx BufferIterator + * @param _vx BufferIterator or USM pointer * @param _incx Increment for the vector X - * @param _vy BufferIterator + * @param _vy BufferIterator or USM pointer * @param _incy Increment for the vector Y + * @param _dependencies Vector of events */ template -typename sb_handle_t::event_t _axpy(sb_handle_t &sb_handle, index_t _N, - element_t _alpha, container_0_t _vx, - increment_t _incx, container_1_t _vy, - increment_t _incy); +typename sb_handle_t::event_t _axpy( + sb_handle_t &sb_handle, index_t _N, element_t _alpha, container_0_t _vx, + increment_t _incx, container_1_t _vy, increment_t _incy, + const typename sb_handle_t::event_t &_dependencies); /** * \brief COPY copies a vector, x, to a vector, y. * * @param sb_handle SB_Handle - * @param _vx BufferIterator + * @param _vx BufferIterator or USM pointer * @param _incx Increment for the vector X - * @param _vy BufferIterator + * @param _vy BufferIterator or USM pointer * @param _incy Increment for the vector Y + * @param _dependencies Vector of events */ template -typename sb_handle_t::event_t _copy(sb_handle_t &sb_handle, index_t _N, - container_0_t _vx, increment_t _incx, - container_1_t _vy, increment_t _incy); +typename sb_handle_t::event_t _copy( + sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx, + container_1_t _vy, increment_t _incy, + const typename sb_handle_t::event_t &_dependencies); /** * \brief Computes the inner product of two vectors with double precision * accumulation (Asynchronous version that returns an event) * @tparam sb_handle_t SB_Handle type - * @tparam container_0_t Buffer Iterator - * @tparam container_1_t Buffer Iterator - * @tparam container_2_t Buffer Iterator + * @tparam container_0_t Buffer Iterator or USM pointer + * @tparam container_1_t Buffer Iterator or USM pointer + * @tparam container_2_t Buffer Iterator or USM pointer * @tparam index_t Index type * @tparam increment_t Increment type * @param sb_handle SB_Handle * @param _N Input buffer sizes. - * @param _vx Buffer holding input vector x + * @param _vx Memory object holding input vector x * @param _incx Stride of vector x (i.e. measured in elements of _vx) - * @param _vy Buffer holding input vector y + * @param _vy Memory object holding input vector y * @param _incy Stride of vector y (i.e. measured in elements of _vy) - * @param _rs Output buffer + * @param _rs Output memory object + * @param _dependencies Vector of events * @return Vector of events to wait for. 
*/ template -typename sb_handle_t::event_t _dot(sb_handle_t &sb_handle, index_t _N, - container_0_t _vx, increment_t _incx, - container_1_t _vy, increment_t _incy, - container_2_t _rs); +typename sb_handle_t::event_t _dot( + sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx, + container_1_t _vy, increment_t _incy, container_2_t _rs, + const typename sb_handle_t::event_t &_dependencies); /** * \brief Computes the inner product of two vectors with double precision * accumulation and adds a scalar to the result (Asynchronous version that * returns an event) * @tparam sb_handle_t SB_Handle type - * @tparam container_0_t Buffer Iterator - * @tparam container_1_t Buffer Iterator - * @tparam container_2_t Buffer Iterator + * @tparam container_0_t Buffer Iterator or USM pointer + * @tparam container_1_t Buffer Iterator or USM pointer + * @tparam container_2_t Buffer Iterator or USM pointer * @tparam index_t Index type * @tparam increment_t Increment type * @param sb_handle SB_Handle * @param _N Input buffer sizes. If size 0, the result will be sb. * @param sb Scalar to add to the results of the inner product. - * @param _vx Buffer holding input vector x + * @param _vx Memory object holding input vector x * @param _incx Stride of vector x (i.e. measured in elements of _vx) - * @param _vy Buffer holding input vector y + * @param _vy Memory object holding input vector y * @param _incy Stride of vector y (i.e. measured in elements of _vy) - * @param _rs Output buffer + * @param _rs Output memory object + * @param _dependencies Vector of events * @return Vector of events to wait for. */ template -typename sb_handle_t::event_t _sdsdot(sb_handle_t &sb_handle, index_t _N, - float sb, container_0_t _vx, - increment_t _incx, container_1_t _vy, - increment_t _incy, container_2_t _rs); +typename sb_handle_t::event_t _sdsdot( + sb_handle_t &sb_handle, index_t _N, float sb, container_0_t _vx, + increment_t _incx, container_1_t _vy, increment_t _incy, container_2_t _rs, + const typename sb_handle_t::event_t &_dependencies); /** * \brief ASUM Takes the sum of the absolute values * @param sb_handle SB_Handle - * @param _vx BufferIterator + * @param _vx BufferIterator or USM pointer * @param _incx Increment for the vector X + * @param _rs BufferIterator or USM pointer + * @param _dependencies Vector of events */ template -typename sb_handle_t::event_t _asum(sb_handle_t &sb_handle, index_t _N, - container_0_t _vx, increment_t _incx, - container_1_t _rs); +typename sb_handle_t::event_t _asum( + sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx, + container_1_t _rs, const typename sb_handle_t::event_t &_dependencies); /** * \brief IAMAX finds the index of the first element having maximum - * @param _vx BufferIterator + * @param _vx BufferIterator or USM pointer * @param _incx Increment for the vector X + * @param _rs BufferIterator or USM pointer + * @param _dependencies Vector of events */ template -typename sb_handle_t::event_t _iamax(sb_handle_t &sb_handle, index_t _N, - container_t _vx, increment_t _incx, - ContainerI _rs); +typename sb_handle_t::event_t _iamax( + sb_handle_t &sb_handle, index_t _N, container_t _vx, increment_t _incx, + ContainerI _rs, const typename sb_handle_t::event_t &_dependencies); /** * \brief IAMIN finds the index of the first element having minimum - * @param _vx BufferIterator + * @param _vx BufferIterator or USM pointer * @param _incx Increment for the vector X + * @param _rs BufferIterator or USM pointer + * @param _dependencies Vector of 
events */ template -typename sb_handle_t::event_t _iamin(sb_handle_t &sb_handle, index_t _N, - container_t _vx, increment_t _incx, - ContainerI _rs); +typename sb_handle_t::event_t _iamin( + sb_handle_t &sb_handle, index_t _N, container_t _vx, increment_t _incx, + ContainerI _rs, const typename sb_handle_t::event_t &_dependencies); /** * \brief SWAP interchanges two vectors * * @param sb_handle_t sb_handle - * @param _vx BufferIterator + * @param _vx BufferIterator or USM pointer * @param _incx Increment for the vector X - * @param _vy BufferIterator + * @param _vy BufferIterator or USM pointer * @param _incy Increment for the vector Y + * @param _dependencies Vector of events */ template -typename sb_handle_t::event_t _swap(sb_handle_t &sb_handle, index_t _N, - container_0_t _vx, increment_t _incx, - container_1_t _vy, increment_t _incy); +typename sb_handle_t::event_t _swap( + sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx, + container_1_t _vy, increment_t _incy, + const typename sb_handle_t::event_t &_dependencies); /** * \brief SCALAR operation on a vector * @param sb_handle_t sb_handle - * @param _vx BufferIterator + * @param _vx BufferIterator or USM pointer * @param _incx Increment for the vector X + * @param _dependencies Vector of events */ template -typename sb_handle_t::event_t _scal(sb_handle_t &sb_handle, index_t _N, - element_t _alpha, container_0_t _vx, - increment_t _incx); +typename sb_handle_t::event_t _scal( + sb_handle_t &sb_handle, index_t _N, element_t _alpha, container_0_t _vx, + increment_t _incx, const typename sb_handle_t::event_t &_dependencies); /** * \brief NRM2 Returns the euclidian norm of a vector * @param sb_handle SB_Handle - * @param _vx BufferIterator + * @param _vx BufferIterator or USM pointer * @param _incx Increment for the vector X + * @param _rs BufferIterator or USM pointer + * @param _dependencies Vector of events */ template -typename sb_handle_t::event_t _nrm2(sb_handle_t &sb_handle, index_t _N, - container_0_t _vx, increment_t _incx, - container_1_t _rs); +typename sb_handle_t::event_t _nrm2( + sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx, + container_1_t _rs, const typename sb_handle_t::event_t &_dependencies); /** * @brief _rot constructor given plane rotation * @param sb_handle SB_Handle - * @param _vx BufferIterator + * @param _vx BufferIterator or USM pointer * @param _incx Increment for the vector X - * @param _vx BufferIterator + * @param _vx BufferIterator or USM pointer * @param _incy Increment for the vector Y * @param _sin sine * @param _cos cosine * @param _N data size + * @param _dependencies Vector of events */ template -typename sb_handle_t::event_t _rot(sb_handle_t &sb_handle, index_t _N, - container_0_t _vx, increment_t _incx, - container_1_t _vy, increment_t _incy, - element_t _cos, element_t _sin); +typename sb_handle_t::event_t _rot( + sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx, + container_1_t _vy, increment_t _incy, element_t _cos, element_t _sin, + const typename sb_handle_t::event_t &_dependencies); /** * @brief Performs a modified Givens rotation of points. 
@@ -219,27 +236,28 @@ typename sb_handle_t::event_t _rot(sb_handle_t &sb_handle, index_t _N, * [h21 h22] [h21 1.0] [-1.0 h22] [0.0 1.0] * * @tparam sb_handle_t SB_Handle type - * @tparam container_0_t Buffer Iterator - * @tparam container_1_t Buffer Iterator - * @tparam container_2_t Buffer Iterator + * @tparam container_0_t Buffer Iterator or USM pointer + * @tparam container_1_t Buffer Iterator or USM pointer + * @tparam container_2_t Buffer Iterator or USM pointer * @tparam index_t Index type * @tparam increment_t Increment type * @param sb_handle SB_Handle * @param _N Input buffer sizes (for vx and vy). - * @param[in, out] _vx Buffer holding input vector x + * @param[in, out] _vx Memory object holding input vector x * @param _incx Stride of vector x (i.e. measured in elements of _vx) - * @param[in, out] _vy Buffer holding input vector y + * @param[in, out] _vy Memory object holding input vector y * @param _incy Stride of vector y (i.e. measured in elements of _vy) * @param[in] _param Buffer with the following layout: [flag, h11, h21, h12, * h22]. + * @param _dependencies Vector of events * @return Vector of events to wait for. */ template -typename sb_handle_t::event_t _rotm(sb_handle_t &sb_handle, index_t _N, - container_0_t _vx, increment_t _incx, - container_1_t _vy, increment_t _incy, - container_2_t _param); +typename sb_handle_t::event_t _rotm( + sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx, + container_1_t _vy, increment_t _incy, container_2_t _param, + const typename sb_handle_t::event_t &_dependencies); /** * Given the Cartesian coordinates (x1, y1) of a point, the rotmg routines @@ -257,54 +275,57 @@ typename sb_handle_t::event_t _rotm(sb_handle_t &sb_handle, index_t _N, * Rotmg may apply scaling operations to d1, d2 and x1 to avoid overflows. * * @tparam sb_handle_t SB_Handle type - * @tparam container_0_t Buffer Iterator - * @tparam container_1_t Buffer Iterator - * @tparam container_2_t Buffer Iterator - * @tparam container_3_t Buffer Iterator - * @tparam container_4_t Buffer Iterator + * @tparam container_0_t Buffer Iterator or USM pointer + * @tparam container_1_t Buffer Iterator or USM pointer + * @tparam container_2_t Buffer Iterator or USM pointer + * @tparam container_3_t Buffer Iterator or USM pointer + * @tparam container_4_t Buffer Iterator or USM pointer * @param sb_handle SB_Handle - * @param _d1[in,out] On entry, buffer holding the scaling factor for the + * @param _d1[in,out] On entry, memory object holding the scaling factor for the * x-coordinate. On exit, the re-scaled _d1. - * @param _d2[in,out] On entry, buffer holding the scaling factor for the + * @param _d2[in,out] On entry, memory object holding the scaling factor for the * y-coordinate. On exit, the re-scaled _d2. - * @param _x1[in,out] On entry, buffer holding the x-coordinate. On exit, the - * re-scaled _x1 - * @param _y1[in] Buffer holding the y-coordinate of the point. + * @param _x1[in,out] On entry, memory object holding the x-coordinate. On exit, + * the re-scaled _x1 + * @param _y1[in] Memory object holding the y-coordinate of the point. * @param _param[out] Buffer with the following layout: [flag, h11, h21, h12, * h22]. + * @param _dependencies Vector of events * @return Vector of events to wait for. 
*/ template -typename sb_handle_t::event_t _rotmg(sb_handle_t &sb_handle, container_0_t _d1, - container_1_t _d2, container_2_t _x1, - container_3_t _y1, container_4_t _param); +typename sb_handle_t::event_t _rotmg( + sb_handle_t &sb_handle, container_0_t _d1, container_1_t _d2, + container_2_t _x1, container_3_t _y1, container_4_t _param, + const typename sb_handle_t::event_t &_dependencies); /** * \brief Given the Cartesian coordinates (a, b) of a point, the rotg routines * return the parameters c, s, r, and z associated with the Givens rotation. * @tparam sb_handle_t SB_Handle type - * @tparam container_0_t Buffer Iterator - * @tparam container_1_t Buffer Iterator - * @tparam container_2_t Buffer Iterator - * @tparam container_3_t Buffer Iterator + * @tparam container_0_t Buffer Iterator or USM pointer + * @tparam container_1_t Buffer Iterator or USM pointer + * @tparam container_2_t Buffer Iterator or USM pointer + * @tparam container_3_t Buffer Iterator or USM pointer * @param sb_handle SB_Handle - * @param a[in, out] On entry, buffer holding the x-coordinate of the point. On - * exit, the scalar z. - * @param b[in, out] On entry, buffer holding the y-coordinate of the point. On - * exit, the scalar r. - * @param c[out] Buffer holding the parameter c. - * @param s[out] Buffer holding the parameter s. + * @param a[in, out] On entry, memory object holding the x-coordinate of the + * point. On exit, the scalar z. + * @param b[in, out] On entry, memory object holding the y-coordinate of the + * point. On exit, the scalar r. + * @param c[out] Memory object holding the parameter c. + * @param s[out] Memory object holding the parameter s. + * @param _dependencies Vector of events * @return Vector of events to wait for. */ template ::value, bool>::type = true> -typename sb_handle_t::event_t _rotg(sb_handle_t &sb_handle, container_0_t a, - container_1_t b, container_2_t c, - container_3_t s); +typename sb_handle_t::event_t _rotg( + sb_handle_t &sb_handle, container_0_t a, container_1_t b, container_2_t c, + container_3_t s, const typename sb_handle_t::event_t &_dependencies); /** * \brief Synchronous version of rotg. @@ -317,160 +338,176 @@ typename sb_handle_t::event_t _rotg(sb_handle_t &sb_handle, container_0_t a, * @param b[in, out] On entry, y-coordinate of the point. On exit, the scalar r. * @param c[out] scalar representing the output c. * @param s[out] scalar representing the output s. + * @param _dependencies Vector of events */ template < typename sb_handle_t, typename scalar_t, typename std::enable_if::value, bool>::type = true> void _rotg(sb_handle_t &sb_handle, scalar_t &a, scalar_t &b, scalar_t &c, - scalar_t &s); + scalar_t &s, const typename sb_handle_t::event_t &_dependencies); /** * \brief Computes the inner product of two vectors with double precision * accumulation (synchronous version that returns the result directly) * @tparam sb_handle_t SB_Handle type - * @tparam container_0_t Buffer Iterator - * @tparam container_1_t Buffer Iterator - * @tparam container_2_t Buffer Iterator + * @tparam container_0_t Buffer Iterator or USM pointer + * @tparam container_1_t Buffer Iterator or USM pointer + * @tparam container_2_t Buffer Iterator or USM pointer * @tparam index_t Index type * @tparam increment_t Increment type * @param sb_handle SB_Handle * @param _N Input buffer sizes. - * @param _vx Buffer holding input vector x + * @param _vx Memory object holding input vector x * @param _incx Stride of vector x (i.e. 
measured in elements of _vx) - * @param _vy Buffer holding input vector y + * @param _vy Memory object holding input vector y * @param _incy Stride of vector y (i.e. measured in elements of _vy) - * @param _rs Output buffer + * @param _rs Output memory object + * @param _dependencies Vector of events * @return Vector of events to wait for. */ template -typename ValueType::type _dot(sb_handle_t &sb_handle, index_t _N, - container_0_t _vx, - increment_t _incx, - container_1_t _vy, - increment_t _incy); +typename ValueType::type _dot( + sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx, + container_1_t _vy, increment_t _incy, + const typename sb_handle_t::event_t &_dependencies); /** * \brief Computes the inner product of two vectors with double precision * accumulation and adds a scalar to the result (synchronous version that * returns the result directly) * @tparam sb_handle_t SB_Handle type - * @tparam container_0_t Buffer Iterator - * @tparam container_1_t Buffer Iterator - * @tparam container_2_t Buffer Iterator + * @tparam container_0_t Buffer Iterator or USM pointer + * @tparam container_1_t Buffer Iterator or USM pointer + * @tparam container_2_t Buffer Iterator or USM pointer * @tparam index_t Index type * @tparam increment_t Increment type * @param sb_handle SB_Handle * @param _N Input buffer sizes. If size 0, the result will be sb. * @param sb Scalar to add to the results of the inner product. - * @param _vx Buffer holding input vector x + * @param _vx Memory object holding input vector x * @param _incx Stride of vector x (i.e. measured in elements of _vx) - * @param _vy Buffer holding input vector y + * @param _vy Memory object holding input vector y * @param _incy Stride of vector y (i.e. measured in elements of _vy) - * @param _rs Output buffer + * @param _rs Output memory object + * @param _dependencies Vector of events * @return Vector of events to wait for. 
*/ template typename ValueType::type _sdsdot( sb_handle_t &sb_handle, index_t _N, float sb, container_0_t _vx, - increment_t _incx, container_1_t _vy, increment_t _incy); + increment_t _incx, container_1_t _vy, increment_t _incy, + const typename sb_handle_t::event_t &_dependencies); /** * \brief ICAMAX finds the index of the first element having maximum - * @param _vx BufferIterator + * @param _vx BufferIterator or USM pointer * @param _incx Increment for the vector X + * @param _dependencies Vector of events */ template index_t _iamax(sb_handle_t &sb_handle, index_t _N, container_t _vx, - increment_t _incx); + increment_t _incx, + const typename sb_handle_t::event_t &_dependencies); /** * \brief ICAMIN finds the index of the first element having minimum - * @param _vx BufferIterator + * @param _vx BufferIterator or USM pointer * @param _incx Increment for the vector X + * @param _dependencies Vector of events */ template index_t _iamin(sb_handle_t &sb_handle, index_t _N, container_t _vx, - increment_t _incx); + increment_t _incx, + const typename sb_handle_t::event_t &_dependencies); /** * \brief ASUM Takes the sum of the absolute values * * @param sb_handle SB_Handle - * @param _vx BufferIterator + * @param _vx BufferIterator or USM pointer * @param _incx Increment for the vector X + * @param _dependencies Vector of events */ template -typename ValueType::type _asum(sb_handle_t &sb_handle, index_t _N, - container_t _vx, increment_t _incx); +typename ValueType::type _asum( + sb_handle_t &sb_handle, index_t _N, container_t _vx, increment_t _incx, + const typename sb_handle_t::event_t &_dependencies); /** * \brief NRM2 Returns the euclidian norm of a vector * * @param sb_handle SB_Handle - * @param _vx BufferIterator + * @param _vx BufferIterator or USM pointer * @param _incx Increment for the vector X + * @param _dependencies Vector of events */ template -typename ValueType::type _nrm2(sb_handle_t &sb_handle, index_t _N, - container_t _vx, increment_t _incx); +typename ValueType::type _nrm2( + sb_handle_t &sb_handle, index_t _N, container_t _vx, increment_t _incx, + const typename sb_handle_t::event_t &_dependencies); } // namespace internal template -typename sb_handle_t::event_t _axpy(sb_handle_t &sb_handle, index_t _N, - element_t _alpha, container_0_t _vx, - increment_t _incx, container_1_t _vy, - increment_t _incy) { - return internal::_axpy(sb_handle, _N, _alpha, _vx, _incx, _vy, _incy); +typename sb_handle_t::event_t _axpy( + sb_handle_t &sb_handle, index_t _N, element_t _alpha, container_0_t _vx, + increment_t _incx, container_1_t _vy, increment_t _incy, + const typename sb_handle_t::event_t &_dependencies = {}) { + return internal::_axpy(sb_handle, _N, _alpha, _vx, _incx, _vy, _incy, + _dependencies); } /** * \brief COPY copies a vector, x, to a vector, y. 
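For the buffer path the dependency list can usually be left at its default `{}`, since the SYCL runtime tracks buffer accesses itself. A small sketch of the `_axpy` wrapper defined just above, using the library's `make_sycl_iterator_buffer` helper (the surrounding handle is assumed to be set up as in the previous sketch):

```cpp
#include <vector>

// y = alpha * x + y with buffer iterators; the trailing dependency argument
// is omitted and defaults to an empty event vector.
void axpy_buffers(blas::SB_Handle &sb_handle, int n) {
  std::vector<float> x(n, 1.0f), y(n, 2.0f);
  auto bx = blas::make_sycl_iterator_buffer(x, n);
  auto by = blas::make_sycl_iterator_buffer(y, n);

  auto axpy_ev = blas::_axpy(sb_handle, n, 3.0f, bx, 1, by, 1);
  sb_handle.wait(axpy_ev);  // 3 * x + y has been computed into the y buffer
}
```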
* * @param sb_handle SB_Handle - * @param _vx BufferIterator + * @param _vx BufferIterator or USM pointer * @param _incx Increment for the vector X - * @param _vy BufferIterator + * @param _vy BufferIterator or USM pointer * @param _incy Increment for the vector Y + * @param _dependencies Vector of events */ template -typename sb_handle_t::event_t _copy(sb_handle_t &sb_handle, index_t _N, - container_0_t _vx, increment_t _incx, - container_1_t _vy, increment_t _incy) { - return internal::_copy(sb_handle, _N, _vx, _incx, _vy, _incy); +typename sb_handle_t::event_t _copy( + sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx, + container_1_t _vy, increment_t _incy, + const typename sb_handle_t::event_t &_dependencies = {}) { + return internal::_copy(sb_handle, _N, _vx, _incx, _vy, _incy, _dependencies); } /** * \brief Computes the inner product of two vectors with double precision * accumulation (Asynchronous version that returns an event) * @tparam sb_handle_t SB_Handle type - * @tparam container_0_t Buffer Iterator - * @tparam container_1_t Buffer Iterator - * @tparam container_2_t Buffer Iterator + * @tparam container_0_t Buffer Iterator or USM pointer + * @tparam container_1_t Buffer Iterator or USM pointer + * @tparam container_2_t Buffer Iterator or USM pointer * @tparam index_t Index type * @tparam increment_t Increment type * @param sb_handle SB_Handle * @param _N Input buffer sizes. - * @param _vx Buffer holding input vector x + * @param _vx Memory object holding input vector x * @param _incx Stride of vector x (i.e. measured in elements of _vx) - * @param _vy Buffer holding input vector y + * @param _vy Memory object holding input vector y * @param _incy Stride of vector y (i.e. measured in elements of _vy) - * @param _rs Output buffer + * @param _rs Output memory object + * @param _dependencies Vector of events * @return Vector of events to wait for. */ template -typename sb_handle_t::event_t _dot(sb_handle_t &sb_handle, index_t _N, - container_0_t _vx, increment_t _incx, - container_1_t _vy, increment_t _incy, - container_2_t _rs) { - return internal::_dot(sb_handle, _N, _vx, _incx, _vy, _incy, _rs); +typename sb_handle_t::event_t _dot( + sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx, + container_1_t _vy, increment_t _incy, container_2_t _rs, + const typename sb_handle_t::event_t &_dependencies = {}) { + return internal::_dot(sb_handle, _N, _vx, _incx, _vy, _incy, _rs, + _dependencies); } /** @@ -478,113 +515,128 @@ typename sb_handle_t::event_t _dot(sb_handle_t &sb_handle, index_t _N, * accumulation and adds a scalar to the result (Asynchronous version that * returns an event) * @tparam sb_handle_t SB_Handle type - * @tparam container_0_t Buffer Iterator - * @tparam container_1_t Buffer Iterator - * @tparam container_2_t Buffer Iterator + * @tparam container_0_t Buffer Iterator or USM pointer + * @tparam container_1_t Buffer Iterator or USM pointer + * @tparam container_2_t Buffer Iterator or USM pointer * @tparam index_t Index type * @tparam increment_t Increment type * @param sb_handle SB_Handle * @param _N Input buffer sizes. If size 0, the result will be sb. * @param sb Scalar to add to the results of the inner product. - * @param _vx Buffer holding input vector x + * @param _vx Memory object holding input vector x * @param _incx Stride of vector x (i.e. measured in elements of _vx) - * @param _vy Buffer holding input vector y + * @param _vy Memory object holding input vector y * @param _incy Stride of vector y (i.e. 
measured in elements of _vy) - * @param _rs Output buffer + * @param _rs Output memory object + * @param _dependencies Vector of events * @return Vector of events to wait for. */ template -typename sb_handle_t::event_t _sdsdot(sb_handle_t &sb_handle, index_t _N, - float sb, container_0_t _vx, - increment_t _incx, container_1_t _vy, - increment_t _incy, container_2_t _rs) { - return internal::_sdsdot(sb_handle, _N, sb, _vx, _incx, _vy, _incy, _rs); +typename sb_handle_t::event_t _sdsdot( + sb_handle_t &sb_handle, index_t _N, float sb, container_0_t _vx, + increment_t _incx, container_1_t _vy, increment_t _incy, container_2_t _rs, + const typename sb_handle_t::event_t &_dependencies = {}) { + return internal::_sdsdot(sb_handle, _N, sb, _vx, _incx, _vy, _incy, _rs, + _dependencies); } /** * \brief ASUM Takes the sum of the absolute values * @param sb_handle SB_Handle - * @param _vx BufferIterator + * @param _vx BufferIterator or USM pointer * @param _incx Increment for the vector X + * @param _rs BufferIterator or USM pointer + * @param _dependencies Vector of events */ template -typename sb_handle_t::event_t _asum(sb_handle_t &sb_handle, index_t _N, - container_0_t _vx, increment_t _incx, - container_1_t _rs) { - return internal::_asum(sb_handle, _N, _vx, _incx, _rs); +typename sb_handle_t::event_t _asum( + sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx, + container_1_t _rs, + const typename sb_handle_t::event_t &_dependencies = {}) { + return internal::_asum(sb_handle, _N, _vx, _incx, _rs, _dependencies); } /** * \brief IAMAX finds the index of the first element having maximum - * @param _vx BufferIterator + * @param _vx BufferIterator or USM pointer * @param _incx Increment for the vector X + * @param _dependencies Vector of events */ template -typename sb_handle_t::event_t _iamax(sb_handle_t &sb_handle, index_t _N, - container_t _vx, increment_t _incx, - ContainerI _rs) { - return internal::_iamax(sb_handle, _N, _vx, _incx, _rs); +typename sb_handle_t::event_t _iamax( + sb_handle_t &sb_handle, index_t _N, container_t _vx, increment_t _incx, + ContainerI _rs, const typename sb_handle_t::event_t &_dependencies = {}) { + return internal::_iamax(sb_handle, _N, _vx, _incx, _rs, _dependencies); } /** * \brief IAMIN finds the index of the first element having minimum - * @param _vx BufferIterator + * @param _vx BufferIterator or USM pointer * @param _incx Increment for the vector X + * @param _rs BufferIterator or USM pointer + * @param _dependencies Vector of events */ template -typename sb_handle_t::event_t _iamin(sb_handle_t &sb_handle, index_t _N, - container_t _vx, increment_t _incx, - ContainerI _rs) { - return internal::_iamin(sb_handle, _N, _vx, _incx, _rs); +typename sb_handle_t::event_t _iamin( + sb_handle_t &sb_handle, index_t _N, container_t _vx, increment_t _incx, + ContainerI _rs, const typename sb_handle_t::event_t &_dependencies = {}) { + return internal::_iamin(sb_handle, _N, _vx, _incx, _rs, _dependencies); } /** * \brief SWAP interchanges two vectors * * @param sb_handle_t sb_handle - * @param _vx BufferIterator + * @param _vx BufferIterator or USM pointer * @param _incx Increment for the vector X - * @param _vy BufferIterator + * @param _vy BufferIterator or USM pointer * @param _incy Increment for the vector Y + * @param _dependencies Vector of events */ template -typename sb_handle_t::event_t _swap(sb_handle_t &sb_handle, index_t _N, - container_0_t _vx, increment_t _incx, - container_1_t _vy, increment_t _incy) { - return 
internal::_swap(sb_handle, _N, _vx, _incx, _vy, _incy); +typename sb_handle_t::event_t _swap( + sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx, + container_1_t _vy, increment_t _incy, + const typename sb_handle_t::event_t &_dependencies = {}) { + return internal::_swap(sb_handle, _N, _vx, _incx, _vy, _incy, _dependencies); } /** * \brief SCALAR operation on a vector * @param sb_handle_t sb_handle - * @param _vx BufferIterator + * @param _vx BufferIterator or USM pointer * @param _incx Increment for the vector X + * @param _dependencies Vector of events */ template -typename sb_handle_t::event_t _scal(sb_handle_t &sb_handle, index_t _N, - element_t _alpha, container_0_t _vx, - increment_t _incx) { - return internal::_scal(sb_handle, _N, _alpha, _vx, _incx); +typename sb_handle_t::event_t _scal( + sb_handle_t &sb_handle, index_t _N, element_t _alpha, container_0_t _vx, + increment_t _incx, + const typename sb_handle_t::event_t &_dependencies = {}) { + return internal::_scal(sb_handle, _N, _alpha, _vx, _incx, _dependencies); } /** * \brief NRM2 Returns the euclidian norm of a vector * @param sb_handle SB_Handle - * @param _vx BufferIterator + * @param _vx BufferIterator or USM pointer * @param _incx Increment for the vector X + * @param _rs BufferIterator or USM pointer + * @param _dependencies Vector of events */ template -typename sb_handle_t::event_t _nrm2(sb_handle_t &sb_handle, index_t _N, - container_0_t _vx, increment_t _incx, - container_1_t _rs) { - return internal::_nrm2(sb_handle, _N, _vx, _incx, _rs); +typename sb_handle_t::event_t _nrm2( + sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx, + container_1_t _rs, + const typename sb_handle_t::event_t &_dependencies = {}) { + return internal::_nrm2(sb_handle, _N, _vx, _incx, _rs, _dependencies); } /** @@ -592,22 +644,23 @@ typename sb_handle_t::event_t _nrm2(sb_handle_t &sb_handle, index_t _N, * @brief _rot constructor given plane rotation * * * @param sb_handle SB_Handle - * @param _vx BufferIterator + * @param _vx BufferIterator or USM pointer * @param _incx Increment for the vector X - * @param _vx BufferIterator + * @param _vx BufferIterator or USM pointer * @param _incy Increment for the vector Y * @param _sin sine * @param _cos cosine * @param _N data size - * + * @param _dependencies Vector of events */ template -typename sb_handle_t::event_t _rot(sb_handle_t &sb_handle, index_t _N, - container_0_t _vx, increment_t _incx, - container_1_t _vy, increment_t _incy, - element_t _cos, element_t _sin) { - return internal::_rot(sb_handle, _N, _vx, _incx, _vy, _incy, _cos, _sin); +typename sb_handle_t::event_t _rot( + sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx, + container_1_t _vy, increment_t _incy, element_t _cos, element_t _sin, + const typename sb_handle_t::event_t &_dependencies = {}) { + return internal::_rot(sb_handle, _N, _vx, _incx, _vy, _incy, _cos, _sin, + _dependencies); } /** @@ -627,28 +680,30 @@ typename sb_handle_t::event_t _rot(sb_handle_t &sb_handle, index_t _N, * [h21 h22] [h21 1.0] [-1.0 h22] [0.0 1.0] * * @tparam sb_handle_t SB_Handle type - * @tparam container_0_t Buffer Iterator - * @tparam container_1_t Buffer Iterator - * @tparam container_2_t Buffer Iterator + * @tparam container_0_t Buffer Iterator or USM pointer + * @tparam container_1_t Buffer Iterator or USM pointer + * @tparam container_2_t Buffer Iterator or USM pointer * @tparam index_t Index type * @tparam increment_t Increment type * @param sb_handle SB_Handle * @param _N 
Input buffer sizes (for vx and vy). - * @param[in, out] _vx Buffer holding input vector x + * @param[in, out] _vx Memory object holding input vector x * @param _incx Stride of vector x (i.e. measured in elements of _vx) - * @param[in, out] _vy Buffer holding input vector y + * @param[in, out] _vy Memory object holding input vector y * @param _incy Stride of vector y (i.e. measured in elements of _vy) * @param[in] _param Buffer with the following layout: [flag, h11, h21, h12, * h22]. + * @param _dependencies Vector of events * @return Vector of events to wait for. */ template -typename sb_handle_t::event_t _rotm(sb_handle_t &sb_handle, index_t _N, - container_0_t _vx, increment_t _incx, - container_1_t _vy, increment_t _incy, - container_2_t _param) { - return internal::_rotm(sb_handle, _N, _vx, _incx, _vy, _incy, _param); +typename sb_handle_t::event_t _rotm( + sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx, + container_1_t _vy, increment_t _incy, container_2_t _param, + const typename sb_handle_t::event_t &_dependencies = {}) { + return internal::_rotm(sb_handle, _N, _vx, _incx, _vy, _incy, _param, + _dependencies); } /** @@ -667,57 +722,60 @@ typename sb_handle_t::event_t _rotm(sb_handle_t &sb_handle, index_t _N, * Rotmg may apply scaling operations to d1, d2 and x1 to avoid overflows. * * @tparam sb_handle_t SB_Handle type - * @tparam container_0_t Buffer Iterator - * @tparam container_1_t Buffer Iterator - * @tparam container_2_t Buffer Iterator - * @tparam container_3_t Buffer Iterator - * @tparam container_4_t Buffer Iterator + * @tparam container_0_t Buffer Iterator or USM pointer + * @tparam container_1_t Buffer Iterator or USM pointer + * @tparam container_2_t Buffer Iterator or USM pointer + * @tparam container_3_t Buffer Iterator or USM pointer + * @tparam container_4_t Buffer Iterator or USM pointer * @param sb_handle SB_Handle - * @param _d1[in,out] On entry, buffer holding the scaling factor for the + * @param _d1[in,out] On entry, memory object holding the scaling factor for the * x-coordinate. On exit, the re-scaled _d1. - * @param _d2[in,out] On entry, buffer holding the scaling factor for the + * @param _d2[in,out] On entry, memory object holding the scaling factor for the * y-coordinate. On exit, the re-scaled _d2. - * @param _x1[in,out] On entry, buffer holding the x-coordinate. On exit, the - * re-scaled _x1 - * @param _y1[in] Buffer holding the y-coordinate of the point. + * @param _x1[in,out] On entry, memory object holding the x-coordinate. On exit, + * the re-scaled _x1 + * @param _y1[in] Memory object holding the y-coordinate of the point. * @param _param[out] Buffer with the following layout: [flag, h11, h21, h12, * h22]. + * @param _dependencies Vector of events * @return Vector of events to wait for. */ template -typename sb_handle_t::event_t _rotmg(sb_handle_t &sb_handle, container_0_t _d1, - container_1_t _d2, container_2_t _x1, - container_3_t _y1, container_4_t _param) { - return internal::_rotmg(sb_handle, _d1, _d2, _x1, _y1, _param); +typename sb_handle_t::event_t _rotmg( + sb_handle_t &sb_handle, container_0_t _d1, container_1_t _d2, + container_2_t _x1, container_3_t _y1, container_4_t _param, + const typename sb_handle_t::event_t &_dependencies = {}) { + return internal::_rotmg(sb_handle, _d1, _d2, _x1, _y1, _param, _dependencies); } /** * \brief Given the Cartesian coordinates (a, b) of a point, the rotg routines * return the parameters c, s, r, and z associated with the Givens rotation. 
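The two forms of `_rotg` in this interface allow a small end-to-end sketch: the synchronous scalar overload computes the Givens parameters on the host, which are then passed as the `_cos`/`_sin` scalars of `_rot`. This assumes `x` and `y` are device memory objects of length `n`, set up as in the earlier sketches:

```cpp
void apply_givens(blas::SB_Handle &sb_handle, float *x, float *y, int n) {
  // Synchronous, scalar overload: a, b, c, s are updated in place on the host.
  float a = 3.0f, b = 4.0f, c = 0.0f, s = 0.0f;
  blas::_rotg(sb_handle, a, b, c, s);

  // Apply the rotation to the device vectors (asynchronous).
  auto rot_ev = blas::_rot(sb_handle, n, x, 1, y, 1, c, s);
  sb_handle.wait(rot_ev);
}
```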
* @tparam sb_handle_t SB_Handle type - * @tparam container_0_t Buffer Iterator - * @tparam container_1_t Buffer Iterator - * @tparam container_2_t Buffer Iterator - * @tparam container_3_t Buffer Iterator + * @tparam container_0_t Buffer Iterator or USM pointer + * @tparam container_1_t Buffer Iterator or USM pointer + * @tparam container_2_t Buffer Iterator or USM pointer + * @tparam container_3_t Buffer Iterator or USM pointer * @param sb_handle SB_Handle - * @param a[in, out] On entry, buffer holding the x-coordinate of the point. On - * exit, the scalar z. - * @param b[in, out] On entry, buffer holding the y-coordinate of the point. On - * exit, the scalar r. - * @param c[out] Buffer holding the parameter c. - * @param s[out] Buffer holding the parameter s. + * @param a[in, out] On entry, memory object holding the x-coordinate of the + * point. On exit, the scalar z. + * @param b[in, out] On entry, memory object holding the y-coordinate of the + * point. On exit, the scalar r. + * @param c[out] Memory object holding the parameter c. + * @param s[out] Memory object holding the parameter s. + * @param _dependencies Vector of events * @return Vector of events to wait for. */ template ::value, bool>::type = true> -typename sb_handle_t::event_t _rotg(sb_handle_t &sb_handle, container_0_t a, - container_1_t b, container_2_t c, - container_3_t s) { - return internal::_rotg(sb_handle, a, b, c, s); +typename sb_handle_t::event_t _rotg( + sb_handle_t &sb_handle, container_0_t a, container_1_t b, container_2_t c, + container_3_t s, const typename sb_handle_t::event_t &_dependencies = {}) { + return internal::_rotg(sb_handle, a, b, c, s, _dependencies); } /** @@ -731,41 +789,43 @@ typename sb_handle_t::event_t _rotg(sb_handle_t &sb_handle, container_0_t a, * @param b[in, out] On entry, y-coordinate of the point. On exit, the scalar r. * @param c[out] scalar representing the output c. * @param s[out] scalar representing the output s. + * @param _dependencies Vector of events */ template < typename sb_handle_t, typename scalar_t, typename std::enable_if::value, bool>::type = true> void _rotg(sb_handle_t &sb_handle, scalar_t &a, scalar_t &b, scalar_t &c, - scalar_t &s) { - internal::_rotg(sb_handle, a, b, c, s); + scalar_t &s, + const typename sb_handle_t::event_t &_dependencies = {}) { + internal::_rotg(sb_handle, a, b, c, s, _dependencies); } /** * \brief Computes the inner product of two vectors with double precision * accumulation (synchronous version that returns the result directly) * @tparam sb_handle_t SB_Handle type - * @tparam container_0_t Buffer Iterator - * @tparam container_1_t Buffer Iterator - * @tparam container_2_t Buffer Iterator + * @tparam container_0_t Buffer Iterator or USM pointer + * @tparam container_1_t Buffer Iterator or USM pointer + * @tparam container_2_t Buffer Iterator or USM pointer * @tparam index_t Index type * @tparam increment_t Increment type * @param sb_handle SB_Handle * @param _N Input buffer sizes. - * @param _vx Buffer holding input vector x + * @param _vx Memory object holding input vector x * @param _incx Stride of vector x (i.e. measured in elements of _vx) - * @param _vy Buffer holding input vector y + * @param _vy Memory object holding input vector y * @param _incy Stride of vector y (i.e. measured in elements of _vy) - * @param _rs Output buffer + * @param _rs Output memory object + * @param _dependencies Vector of events * @return Vector of events to wait for. 
*/ template -typename ValueType::type _dot(sb_handle_t &sb_handle, index_t _N, - container_0_t _vx, - increment_t _incx, - container_1_t _vy, - increment_t _incy) { - return internal::_dot(sb_handle, _N, _vx, _incx, _vy, _incy); +typename ValueType::type _dot( + sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx, + container_1_t _vy, increment_t _incy, + const typename sb_handle_t::event_t &_dependencies = {}) { + return internal::_dot(sb_handle, _N, _vx, _incx, _vy, _incy, _dependencies); } /** @@ -773,81 +833,90 @@ typename ValueType::type _dot(sb_handle_t &sb_handle, index_t _N, * accumulation and adds a scalar to the result (synchronous version that * returns the result directly) * @tparam sb_handle_t SB_Handle type - * @tparam container_0_t Buffer Iterator - * @tparam container_1_t Buffer Iterator - * @tparam container_2_t Buffer Iterator + * @tparam container_0_t Buffer Iterator or USM pointer + * @tparam container_1_t Buffer Iterator or USM pointer + * @tparam container_2_t Buffer Iterator or USM pointer * @tparam index_t Index type * @tparam increment_t Increment type * @param sb_handle SB_Handle * @param _N Input buffer sizes. If size 0, the result will be sb. * @param sb Scalar to add to the results of the inner product. - * @param _vx Buffer holding input vector x + * @param _vx Memory object holding input vector x * @param _incx Stride of vector x (i.e. measured in elements of _vx) - * @param _vy Buffer holding input vector y + * @param _vy Memory object holding input vector y * @param _incy Stride of vector y (i.e. measured in elements of _vy) - * @param _rs Output buffer + * @param _rs Output memory object + * @param _dependencies Vector of events * @return Vector of events to wait for. */ template typename ValueType::type _sdsdot( sb_handle_t &sb_handle, index_t _N, float sb, container_0_t _vx, - increment_t _incx, container_1_t _vy, increment_t _incy) { - return internal::_sdsdot(sb_handle, _N, sb, _vx, _incx, _vy, _incy); + increment_t _incx, container_1_t _vy, increment_t _incy, + const typename sb_handle_t::event_t &_dependencies = {}) { + return internal::_sdsdot(sb_handle, _N, sb, _vx, _incx, _vy, _incy, + _dependencies); } /** * \brief ICAMAX finds the index of the first element having maximum - * @param _vx BufferIterator + * @param _vx BufferIterator or USM pointer * @param _incx Increment for the vector X + * @param _dependencies Vector of events */ template index_t _iamax(sb_handle_t &sb_handle, index_t _N, container_t _vx, - increment_t _incx) { - return internal::_iamax(sb_handle, _N, _vx, _incx); + increment_t _incx, + const typename sb_handle_t::event_t &_dependencies = {}) { + return internal::_iamax(sb_handle, _N, _vx, _incx, _dependencies); } /** * \brief ICAMIN finds the index of the first element having minimum - * @param _vx BufferIterator + * @param _vx BufferIterator or USM pointer * @param _incx Increment for the vector X + * @param _dependencies Vector of events */ template index_t _iamin(sb_handle_t &sb_handle, index_t _N, container_t _vx, - increment_t _incx) { - return internal::_iamin(sb_handle, _N, _vx, _incx); + increment_t _incx, + const typename sb_handle_t::event_t &_dependencies = {}) { + return internal::_iamin(sb_handle, _N, _vx, _incx, _dependencies); } /** * \brief ASUM Takes the sum of the absolute values * * @param sb_handle SB_Handle - * @param _vx BufferIterator + * @param _vx BufferIterator or USM pointer * @param _incx Increment for the vector X + * @param _dependencies Vector of events */ template -typename 
ValueType::type _asum(sb_handle_t &sb_handle, index_t _N, - container_t _vx, - increment_t _incx) { - return internal::_asum(sb_handle, _N, _vx, _incx); +typename ValueType::type _asum( + sb_handle_t &sb_handle, index_t _N, container_t _vx, increment_t _incx, + const typename sb_handle_t::event_t &_dependencies = {}) { + return internal::_asum(sb_handle, _N, _vx, _incx, _dependencies); } /** * \brief NRM2 Returns the euclidian norm of a vector * * @param sb_handle SB_Handle - * @param _vx BufferIterator + * @param _vx BufferIterator or USM pointer * @param _incx Increment for the vector X + * @param _dependencies Vector of events */ template -typename ValueType::type _nrm2(sb_handle_t &sb_handle, index_t _N, - container_t _vx, - increment_t _incx) { - return internal::_nrm2(sb_handle, _N, _vx, _incx); +typename ValueType::type _nrm2( + sb_handle_t &sb_handle, index_t _N, container_t _vx, increment_t _incx, + const typename sb_handle_t::event_t &_dependencies = {}) { + return internal::_nrm2(sb_handle, _N, _vx, _incx, _dependencies); } } // end namespace blas diff --git a/include/interface/blas2_interface.h b/include/interface/blas2_interface.h index 6ca9b3a2f..af5ec2b1f 100644 --- a/include/interface/blas2_interface.h +++ b/include/interface/blas2_interface.h @@ -62,7 +62,8 @@ typename sb_handle_t::event_t _gemv( // when trans = "n" and (1+(n-1)*abs(incy) otherwise, // containing the vector "y" (if beta is nonzero). When // finished, y is overwritten with the updated vector. - increment_t _incy // The increment for elements in y (nonzero). + increment_t _incy, // The increment for elements in y (nonzero). + const typename sb_handle_t::event_t& _dependencies // Vector of events ); /*! @@ -70,15 +71,14 @@ typename sb_handle_t::event_t _gemv( * documentation in the blas2_interface.hpp file for details. */ template -typename SB_Handle::event_t _gemv_impl(SB_Handle& sb_handle, index_t _M, - index_t _N, element_t _alpha, - container_t0 _mA, index_t _lda, - container_t1 _vx, increment_t _incx, - element_t _beta, container_t2 _vy, - increment_t _incy); +typename sb_handle_t::event_t _gemv_impl( + sb_handle_t& sb_handle, index_t _M, index_t _N, element_t _alpha, + container_t0 _mA, index_t _lda, container_t1 _vx, increment_t _incx, + element_t _beta, container_t2 _vy, increment_t _incy, + const typename sb_handle_t::event_t& _dependencies); /*! @brief Generalised matrix vector product with a triangular symmetric matrix. 
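The value-returning overloads of `_asum`, `_nrm2`, `_dot`, `_iamax` and `_iamin` just above take no result memory object and hand the result back on the host, which keeps simple reductions to a single call. A sketch, with `x` and the handle set up as in the earlier examples:

```cpp
float host_side_norm(blas::SB_Handle &sb_handle, float *x, int n) {
  // These overloads block internally and return the reduction value directly.
  float sum = blas::_asum(sb_handle, n, x, 1);   // sum of absolute values
  float nrm = blas::_nrm2(sb_handle, n, x, 1);   // Euclidean norm
  return nrm / (sum > 0.0f ? sum : 1.0f);        // any further host-side use
}
```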
@@ -103,7 +103,8 @@ typename sb_handle_t::event_t _trmv( container_0_t _mA, // (_lda, _N) The input matrix index_t _lda, // >max(1, _N) The first dimension of _mA container_1_t _vx, // (1 + (_N-1)*abs(_incx)), output vector X - increment_t _incx // !=0 The increment for the elements of X + increment_t _incx, // !=0 The increment for the elements of X + const typename sb_handle_t::event_t& _dependencies // Vector of events ); /** @@ -125,21 +126,23 @@ typename sb_handle_t::event_t _trmv( * @param _lda Leading dimension _mA at least _N * @param _vx Buffer containing x of at least (1+(_N-1)*abs(_incx)) elements * @param _incx Increment for _vx (nonzero) + * @param _dependencies Vector of events */ template -typename sb_handle_t::event_t _trsv(sb_handle_t& sb_handle, char _Uplo, - char _trans, char _Diag, index_t _N, - container_0_t _mA, index_t _lda, - container_1_t _vx, increment_t _incx); +typename sb_handle_t::event_t _trsv( + sb_handle_t& sb_handle, char _Uplo, char _trans, char _Diag, index_t _N, + container_0_t _mA, index_t _lda, container_1_t _vx, increment_t _incx, + const typename sb_handle_t::event_t& _dependencies = {}); template -typename sb_handle_t::event_t _trsv_impl(sb_handle_t& sb_handle, index_t _N, - container_t0 _mA, index_t _lda, - container_t1 _vx, increment_t _incx); +typename sb_handle_t::event_t _trsv_impl( + sb_handle_t& sb_handle, index_t _N, container_t0 _mA, index_t _lda, + container_t1 _vx, increment_t _incx, + const typename sb_handle_t::event_t& _dependencies); /*! @brief Generalised matrix vector product with a square symmetric matrix, @@ -168,7 +171,8 @@ typename sb_handle_t::event_t _symv( increment_t _incx, // !=0 The increment for the elements of X element_t _beta, // Scalar parameter beta container_2_t _vy, // (1 + (_N-1)*abs(_incy)), output vector Y - increment_t _incy // !=0 The increment for the elements of Y + increment_t _incy, // !=0 The increment for the elements of Y + const typename sb_handle_t::event_t& _dependencies // Vector of events ); /*! @@ -197,7 +201,8 @@ typename sb_handle_t::event_t _ger( container_1_t _vy, // >(1 + (_N-1)*abs(_incy)), input vector Y increment_t _incy, // Increment for vector Y container_2_t _mA, // (_lda, n) array containing A, the output - index_t _lda // >max(1, m), Leading dimension of A + index_t _lda, // >max(1, m), Leading dimension of A + const typename sb_handle_t::event_t& _dependencies // Vector of events ); /*! 
@@ -222,7 +227,8 @@ typename sb_handle_t::event_t _syr( container_0_t _vx, // (1 + (_N-1)*abs(_incx)), input vector X increment_t _incx, // !=0 The increment for the elements of X container_1_t _mA, // (_lda, _N) The output matrix - index_t _lda // >max(1, _N) The first dimension of _mA + index_t _lda, // >max(1, _N) The first dimension of _mA + const typename sb_handle_t::event_t& _dependencies // Vector of events ); /** @@ -245,13 +251,14 @@ typename sb_handle_t::event_t _syr( * @param _vx (1 + (_N-1)*abs(_incx)), input vector X * @param _incx !=0 The increment for the elements of X * @param _mPA (_lda, _N) The output matrix in packed format + * @param _dependencies Vector of events */ template -typename sb_handle_t::event_t _spr(sb_handle_t& sb_handle, char _Uplo, - index_t _N, element_t _alpha, - container_0_t _vx, increment_t _incx, - container_1_t _mPA); +typename sb_handle_t::event_t _spr( + sb_handle_t& sb_handle, char _Uplo, index_t _N, element_t _alpha, + container_0_t _vx, increment_t _incx, container_1_t _mPA, + const typename sb_handle_t::event_t& _dependencies); /** * @brief Generalised two vectors squaring followed by a sum with a packed @@ -275,15 +282,15 @@ typename sb_handle_t::event_t _spr(sb_handle_t& sb_handle, char _Uplo, * @param _vy (1 + (_N-1)*abs(_incy)), input vector Y * @param _incy !=0 The increment for the elements of Y * @param _mPA (_lda, _N) The output matrix in packed format + * @param _dependencies Vector of events */ template -typename sb_handle_t::event_t _spr2(sb_handle_t& sb_handle, char _Uplo, - index_t _N, element_t _alpha, - container_t0 _vx, increment_t _incx, - container_t1 _vy, increment_t _incy, - container_t2 _mPA); +typename sb_handle_t::event_t _spr2( + sb_handle_t& sb_handle, char _Uplo, index_t _N, element_t _alpha, + container_t0 _vx, increment_t _incx, container_t1 _vy, increment_t _incy, + container_t2 _mPA, const typename sb_handle_t::event_t& _dependencies); /*! @brief Generalised vector products followed by a sum with a symmetric matrix. 
@@ -310,7 +317,8 @@ typename sb_handle_t::event_t _syr2( container_1_t _vy, // (1 + (_N-1)*abs(_incx)), input vector Y increment_t _incy, // !=0 The increment for the elements of Y container_2_t _mA, // (_lda, _N) The output matrix - index_t _lda // >max(1, _N) The first dimension of _mA + index_t _lda, // >max(1, _N) The first dimension of _mA + const typename sb_handle_t::event_t& _dependencies // Vector of events ); /** @@ -341,27 +349,25 @@ typename sb_handle_t::event_t _syr2( * @param _vy Buffer containing y of at least (1+(_M-1)*abs(_incy)) elements * when trans = 'n' and (1+(_N-1)*abs(_incy) otherwise * @param _incy Increment for _vy + * @param _dependencies Vector of events */ template -typename sb_handle_t::event_t _gbmv(sb_handle_t& sb_handle, char _trans, - index_t _M, index_t _N, index_t _KL, - index_t _KU, element_t _alpha, - container_0_t _mA, index_t _lda, - container_1_t _vx, increment_t _incx, - element_t _beta, container_2_t _vy, - increment_t _incy); +typename sb_handle_t::event_t _gbmv( + sb_handle_t& sb_handle, char _trans, index_t _M, index_t _N, index_t _KL, + index_t _KU, element_t _alpha, container_0_t _mA, index_t _lda, + container_1_t _vx, increment_t _incx, element_t _beta, container_2_t _vy, + increment_t _incy, const typename sb_handle_t::event_t& _dependencies); template -typename sb_handle_t::event_t _gbmv_impl(sb_handle_t& sb_handle, index_t _M, - index_t _N, index_t _KL, index_t _KU, - element_t _alpha, container_t0 _mA, - index_t _lda, container_t1 _vx, - increment_t _incx, element_t _beta, - container_t2 _vy, increment_t _incy); +typename sb_handle_t::event_t _gbmv_impl( + sb_handle_t& sb_handle, index_t _M, index_t _N, index_t _KL, index_t _KU, + element_t _alpha, container_t0 _mA, index_t _lda, container_t1 _vx, + increment_t _incx, element_t _beta, container_t2 _vy, increment_t _incy, + const typename sb_handle_t::event_t& _dependencies); /** * @brief Matrix vector product with symmetric band matrices. @@ -387,26 +393,25 @@ typename sb_handle_t::event_t _gbmv_impl(sb_handle_t& sb_handle, index_t _M, * @param _beta Scalar parameter beta * @param _vy Buffer containing y of at least (1+(_N-1)*abs(_incy)) elements * @param _incy Increment for _vy + * @param _dependencies Vector of events */ template -typename sb_handle_t::event_t _sbmv(sb_handle_t& sb_handle, char _Uplo, - index_t _N, index_t _K, element_t _alpha, - container_0_t _mA, index_t _lda, - container_1_t _vx, increment_t _incx, - element_t _beta, container_2_t _vy, - increment_t _incy); +typename sb_handle_t::event_t _sbmv( + sb_handle_t& sb_handle, char _Uplo, index_t _N, index_t _K, + element_t _alpha, container_0_t _mA, index_t _lda, container_1_t _vx, + increment_t _incx, element_t _beta, container_2_t _vy, increment_t _incy, + const typename sb_handle_t::event_t& _dependencies); template -typename sb_handle_t::event_t _sbmv_impl(sb_handle_t& sb_handle, index_t _N, - index_t _K, element_t _alpha, - container_t0 _mA, index_t _lda, - container_t1 _vx, increment_t _incx, - element_t _beta, container_t2 _vy, - increment_t _incy); +typename sb_handle_t::event_t _sbmv_impl( + sb_handle_t& sb_handle, index_t _N, index_t _K, element_t _alpha, + container_t0 _mA, index_t _lda, container_t1 _vx, increment_t _incx, + element_t _beta, container_t2 _vy, increment_t _incy, + const typename sb_handle_t::event_t& _dependencies); /** * @brief Matrix vector product with symmetric packed matrices. 
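The Level 2 routines follow the same dependency convention as Level 1. As a sketch, using the general matrix-vector product `_gemv` from earlier in this header, with an `m x n` matrix `A`, input `x` and output `y` assumed to be USM device allocations:

```cpp
void gemv_after_clear(blas::SB_Handle &sb_handle, float *A, int lda,
                      float *x, float *y, int m, int n) {
  // Clear y first, then accumulate alpha * A * x into it; the event vector
  // returned by _scal is forwarded as the dependency list of _gemv.
  auto clear_ev = blas::_scal(sb_handle, m, 0.0f, y, 1);
  auto gemv_ev = blas::_gemv(sb_handle, 'n', m, n, /*alpha=*/1.0f, A, lda,
                             x, 1, /*beta=*/1.0f, y, 1, clear_ev);
  sb_handle.wait(gemv_ev);
}
```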
@@ -434,21 +439,20 @@ typename sb_handle_t::event_t _sbmv_impl(sb_handle_t& sb_handle, index_t _N, template -typename sb_handle_t::event_t _spmv(sb_handle_t& sb_handle, char _Uplo, - index_t _N, element_t _alpha, - container_0_t _mA, container_1_t _vx, - increment_t _incx, element_t _beta, - container_2_t _vy, increment_t _incy); +typename sb_handle_t::event_t _spmv( + sb_handle_t& sb_handle, char _Uplo, index_t _N, element_t _alpha, + container_0_t _mA, container_1_t _vx, increment_t _incx, element_t _beta, + container_2_t _vy, increment_t _incy, + const typename sb_handle_t::event_t& _dependencies); template -typename sb_handle_t::event_t _spmv_impl(sb_handle_t& sb_handle, index_t _N, - element_t _alpha, container_t0 _mA, - container_t1 _vx, increment_t _incx, - element_t _beta, container_t2 _vy, - increment_t _incy); +typename sb_handle_t::event_t _spmv_impl( + sb_handle_t& sb_handle, index_t _N, element_t _alpha, container_t0 _mA, + container_t1 _vx, increment_t _incx, element_t _beta, container_t2 _vy, + increment_t _incy, const typename sb_handle_t::event_t& _dependencies); /** * @brief Matrix vector product with triangular band matrices. @@ -472,21 +476,22 @@ typename sb_handle_t::event_t _spmv_impl(sb_handle_t& sb_handle, index_t _N, * @param _lda Leading dimension _mA at least (_K + 1) * @param _vx Buffer containing x of at least (1+(_N-1)*abs(_incx)) elements * @param _incx Increment for _vx (nonzero) + * @param _dependencies Vector of events */ template -typename sb_handle_t::event_t _tbmv(sb_handle_t& sb_handle, char _Uplo, - char _trans, char _Diag, index_t _N, - index_t _K, container_0_t _mA, index_t _lda, - container_1_t _vx, increment_t _incx); +typename sb_handle_t::event_t _tbmv( + sb_handle_t& sb_handle, char _Uplo, char _trans, char _Diag, index_t _N, + index_t _K, container_0_t _mA, index_t _lda, container_1_t _vx, + increment_t _incx, const typename sb_handle_t::event_t& _dependencies); template -typename sb_handle_t::event_t _tbmv_impl(sb_handle_t& sb_handle, index_t _N, - index_t _K, container_t0 _mA, - index_t _lda, container_t1 _vx, - increment_t _incx); +typename sb_handle_t::event_t _tbmv_impl( + sb_handle_t& sb_handle, index_t _N, index_t _K, container_t0 _mA, + index_t _lda, container_t1 _vx, increment_t _incx, + const typename sb_handle_t::event_t& _dependencies); /** * @brief Matrix vector product with triangular packed matrices. @@ -508,21 +513,22 @@ typename sb_handle_t::event_t _tbmv_impl(sb_handle_t& sb_handle, index_t _N, * matrix format * @param _vx Buffer containing x of at least (1+(_N-1)*abs(_incx)) elements * @param _incx Increment for _vx (nonzero) + * @param _dependencies Vector of events */ template -typename sb_handle_t::event_t _tpmv(sb_handle_t& sb_handle, char _Uplo, - char _trans, char _Diag, index_t _N, - container_0_t _mA, container_1_t _vx, - increment_t _incx); +typename sb_handle_t::event_t _tpmv( + sb_handle_t& sb_handle, char _Uplo, char _trans, char _Diag, index_t _N, + container_0_t _mA, container_1_t _vx, increment_t _incx, + const typename sb_handle_t::event_t& _dependencies); template -typename sb_handle_t::event_t _tpmv_impl(sb_handle_t& sb_handle, index_t _N, - container_t0 _mA, container_t1 _vx, - increment_t _incx); +typename sb_handle_t::event_t _tpmv_impl( + sb_handle_t& sb_handle, index_t _N, container_t0 _mA, container_t1 _vx, + increment_t _incx, const typename sb_handle_t::event_t& _dependencies); /** * @brief Linear system solver for triangular band matrices. 
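For the triangular solvers the character arguments select the triangle, transposition and diagonal handling, as in reference BLAS. A sketch of `_trsv` (declared earlier in this header) solving an upper-triangular system in place, with the right-hand side initially stored in `x` and any prior events forwarded through `deps`:

```cpp
void upper_triangular_solve(blas::SB_Handle &sb_handle, float *A, int lda,
                            float *x, int n,
                            const blas::SB_Handle::event_t &deps = {}) {
  // 'u': upper triangle of A, 'n': no transpose, 'n': non-unit diagonal.
  auto trsv_ev = blas::_trsv(sb_handle, 'u', 'n', 'n', n, A, lda, x, 1, deps);
  sb_handle.wait(trsv_ev);  // x now holds the solution of A * x = b
}
```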
@@ -545,22 +551,23 @@ typename sb_handle_t::event_t _tpmv_impl(sb_handle_t& sb_handle, index_t _N, * @param _lda Leading dimension _mA at least (_K + 1) * @param _vx Buffer containing x of at least (1+(_N-1)*abs(_incx)) elements * @param _incx Increment for _vx (nonzero) + * @param _dependencies Vector of events */ template -typename sb_handle_t::event_t _tbsv(sb_handle_t& sb_handle, char _Uplo, - char _trans, char _Diag, index_t _N, - index_t _K, container_0_t _mA, index_t _lda, - container_1_t _vx, increment_t _incx); +typename sb_handle_t::event_t _tbsv( + sb_handle_t& sb_handle, char _Uplo, char _trans, char _Diag, index_t _N, + index_t _K, container_0_t _mA, index_t _lda, container_1_t _vx, + increment_t _incx, const typename sb_handle_t::event_t& _dependencies); template -typename sb_handle_t::event_t _tbsv_impl(sb_handle_t& sb_handle, index_t _N, - index_t _K, container_t0 _mA, - index_t _lda, container_t1 _vx, - increment_t _incx); +typename sb_handle_t::event_t _tbsv_impl( + sb_handle_t& sb_handle, index_t _N, index_t _K, container_t0 _mA, + index_t _lda, container_t1 _vx, increment_t _incx, + const typename sb_handle_t::event_t& _dependencies); /** * @brief Linear system solver for triangular packed matrices. @@ -581,13 +588,15 @@ typename sb_handle_t::event_t _tbsv_impl(sb_handle_t& sb_handle, index_t _N, * Matrix format * @param _vx Buffer containing x of at least (1+(_N-1)*abs(_incx)) elements * @param _incx Increment for _vx (nonzero) +* @param _dependencies Vector of events */ template typename sb_handle_t::event_t _tpsv(sb_handle_t& sb_handle, char _Uplo, char _trans, char _Diag, index_t _N, container_0_t _mA, container_1_t _vx, - increment_t _incx); + increment_t _incx, + const typename sb_handle_t::event_t& _dependencies); template typename sb_handle_t::event_t _tpsv_impl(sb_handle_t& sb_handle, index_t _N, container_t0 _mA, container_t1 _vx, - increment_t _incx); + increment_t _incx, + const typename sb_handle_t::event_t& _dependencies); } // namespace internal @@ -632,10 +642,11 @@ typename sb_handle_t::event_t inline _gemv( // when trans = "n" and (1+(n-1)*abs(incy) otherwise, // containing the vector "y" (if beta is nonzero). When // finished, y is overwritten with the updated vector. - increment_t _incy // The increment for elements in y (nonzero). + increment_t _incy, // The increment for elements in y (nonzero). + const typename sb_handle_t::event_t& _dependencies = {} // Vector of events ) { return internal::_gemv(sb_handle, _trans, _M, _N, _alpha, _mA, _lda, _vx, - _incx, _beta, _vy, _incy); + _incx, _beta, _vy, _incy, _dependencies); } /*! 
@@ -661,10 +672,11 @@ typename sb_handle_t::event_t inline _trmv( container_0_t _mA, // (_lda, _N) The input matrix index_t _lda, // >max(1, _N) The first dimension of _mA container_1_t _vx, // (1 + (_N-1)*abs(_incx)), output vector X - increment_t _incx // !=0 The increment for the elements of X + increment_t _incx, // !=0 The increment for the elements of X + const typename sb_handle_t::event_t& _dependencies = {} // Vector of events ) { return internal::_trmv(sb_handle, _Uplo, _trans, _Diag, _N, _mA, _lda, _vx, - _incx); + _incx, _dependencies); } /** @@ -686,16 +698,16 @@ typename sb_handle_t::event_t inline _trmv( * @param _lda Leading dimension _mA at least _N * @param _vx Buffer containing x of at least (1+(_N-1)*abs(_incx)) elements * @param _incx Increment for _vx (nonzero) + * @param _dependencies Vector of events */ template -typename sb_handle_t::event_t inline _trsv(sb_handle_t& sb_handle, char _Uplo, - char _trans, char _Diag, index_t _N, - container_0_t _mA, index_t _lda, - container_1_t _vx, - increment_t _incx) { +typename sb_handle_t::event_t inline _trsv( + sb_handle_t& sb_handle, char _Uplo, char _trans, char _Diag, index_t _N, + container_0_t _mA, index_t _lda, container_1_t _vx, increment_t _incx, + const typename sb_handle_t::event_t& _dependencies = {}) { return internal::_trsv(sb_handle, _Uplo, _trans, _Diag, _N, _mA, _lda, _vx, - _incx); + _incx, _dependencies); } /*! @@ -726,10 +738,11 @@ typename sb_handle_t::event_t inline _symv( increment_t _incx, // !=0 The increment for the elements of X element_t _beta, // Scalar parameter beta container_2_t _vy, // (1 + (_N-1)*abs(_incy)), output vector Y - increment_t _incy // !=0 The increment for the elements of Y + increment_t _incy, // !=0 The increment for the elements of Y + const typename sb_handle_t::event_t& _dependencies = {} // Vector of events ) { return internal::_symv(sb_handle, _Uplo, _N, _alpha, _mA, _lda, _vx, _incx, - _beta, _vy, _incy); + _beta, _vy, _incy, _dependencies); } /*! @@ -759,10 +772,11 @@ typename sb_handle_t::event_t inline _ger( container_1_t _vy, // >(1 + (_N-1)*abs(_incy)), input vector Y increment_t _incy, // Increment for vector Y container_2_t _mA, // (_lda, n) array containing A, the output - index_t _lda // >max(1, m), Leading dimension of A + index_t _lda, // >max(1, m), Leading dimension of A + const typename sb_handle_t::event_t& _dependencies = {} // Vector of events ) { return internal::_ger(sb_handle, _M, _N, _alpha, _vx, _incx, _vy, _incy, _mA, - _lda); + _lda, _dependencies); } /*! 
@@ -788,9 +802,11 @@ typename sb_handle_t::event_t inline _syr( container_0_t _vx, // (1 + (_N-1)*abs(_incx)), input vector X increment_t _incx, // !=0 The increment for the elements of X container_1_t _mA, // (_lda, _N) The output matrix - index_t _lda // >max(1, _N) The first dimension of _mA + index_t _lda, // >max(1, _N) The first dimension of _mA + const typename sb_handle_t::event_t& _dependencies = {} // Vector of events ) { - return internal::_syr(sb_handle, _Uplo, _N, _alpha, _vx, _incx, _mA, _lda); + return internal::_syr(sb_handle, _Uplo, _N, _alpha, _vx, _incx, _mA, _lda, + _dependencies); } /** @@ -813,16 +829,16 @@ typename sb_handle_t::event_t inline _syr( * @param _vx (1 + (_N-1)*abs(_incx)), input vector X * @param _incx !=0 The increment for the elements of X * @param _mPA (_lda, _N) The output matrix in packed format + * @param _dependencies Vector of events */ template -typename sb_handle_t::event_t inline _spr(sb_handle_t& sb_handle, char _Uplo, - index_t _N, element_t _alpha, - container_0_t _vx, increment_t _incx, - container_1_t _mPA) { - return internal::_spr(sb_handle, _Uplo, _N, - _alpha, _vx, _incx, _mPA); +typename sb_handle_t::event_t inline _spr( + sb_handle_t& sb_handle, char _Uplo, index_t _N, element_t _alpha, + container_0_t _vx, increment_t _incx, container_1_t _mPA, + const typename sb_handle_t::event_t& _dependencies = {}) { + return internal::_spr(sb_handle, _Uplo, _N, _alpha, _vx, _incx, _mPA, + _dependencies); } /** @@ -847,17 +863,18 @@ typename sb_handle_t::event_t inline _spr(sb_handle_t& sb_handle, char _Uplo, * @param _vy (1 + (_N-1)*abs(_incy)), input vector Y * @param _incy !=0 The increment for the elements of Y * @param _mPA (_lda, _N) The output matrix in packed format + * @param _dependencies Vector of events */ template -typename sb_handle_t::event_t inline _spr2(sb_handle_t& sb_handle, char _Uplo, - index_t _N, element_t _alpha, - container_t0 _vx, increment_t _incx, - container_t1 _vy, increment_t _incy, - container_t2 _mPA) { +typename sb_handle_t::event_t inline _spr2( + sb_handle_t& sb_handle, char _Uplo, index_t _N, element_t _alpha, + container_t0 _vx, increment_t _incx, container_t1 _vy, increment_t _incy, + container_t2 _mPA, + const typename sb_handle_t::event_t& _dependencies = {}) { return internal::_spr2(sb_handle, _Uplo, _N, _alpha, _vx, _incx, _vy, _incy, - _mPA); + _mPA, _dependencies); } /*! 
@@ -887,10 +904,11 @@ typename sb_handle_t::event_t inline _syr2( container_1_t _vy, // (1 + (_N-1)*abs(_incx)), input vector Y increment_t _incy, // !=0 The increment for the elements of Y container_2_t _mA, // (_lda, _N) The output matrix - index_t _lda // >max(1, _N) The first dimension of _mA + index_t _lda, // >max(1, _N) The first dimension of _mA + const typename sb_handle_t::event_t& _dependencies = {} // Vector of events ) { return internal::_syr2(sb_handle, _Uplo, _N, _alpha, _vx, _incx, _vy, _incy, - _mA, _lda); + _mA, _lda, _dependencies); } /** @@ -921,19 +939,19 @@ typename sb_handle_t::event_t inline _syr2( * @param _vy Buffer containing y of at least (1+(_M-1)*abs(_incy)) elements * when trans = 'n' and (1+(_N-1)*abs(_incy) otherwise * @param _incy Increment for _vy + * @param _dependencies Vector of events */ template -typename sb_handle_t::event_t inline _gbmv(sb_handle_t& sb_handle, char _trans, - index_t _M, index_t _N, index_t _KL, - index_t _KU, element_t _alpha, - container_0_t _mA, index_t _lda, - container_1_t _vx, increment_t _incx, - element_t _beta, container_2_t _vy, - increment_t _incy) { +typename sb_handle_t::event_t inline _gbmv( + sb_handle_t& sb_handle, char _trans, index_t _M, index_t _N, index_t _KL, + index_t _KU, element_t _alpha, container_0_t _mA, index_t _lda, + container_1_t _vx, increment_t _incx, element_t _beta, container_2_t _vy, + increment_t _incy, + const typename sb_handle_t::event_t& _dependencies = {}) { return internal::_gbmv(sb_handle, _trans, _M, _N, _KL, _KU, _alpha, _mA, _lda, - _vx, _incx, _beta, _vy, _incy); + _vx, _incx, _beta, _vy, _incy, _dependencies); } /** @@ -960,18 +978,18 @@ typename sb_handle_t::event_t inline _gbmv(sb_handle_t& sb_handle, char _trans, * @param _beta Scalar parameter beta * @param _vy Buffer containing y of at least (1+(_N-1)*abs(_incy)) elements * @param _incy Increment for _vy + * @param _dependencies Vector of events */ template -typename sb_handle_t::event_t _sbmv(sb_handle_t& sb_handle, char _Uplo, - index_t _N, index_t _K, element_t _alpha, - container_0_t _mA, index_t _lda, - container_1_t _vx, increment_t _incx, - element_t _beta, container_2_t _vy, - increment_t _incy) { +typename sb_handle_t::event_t _sbmv( + sb_handle_t& sb_handle, char _Uplo, index_t _N, index_t _K, + element_t _alpha, container_0_t _mA, index_t _lda, container_1_t _vx, + increment_t _incx, element_t _beta, container_2_t _vy, increment_t _incy, + const typename sb_handle_t::event_t& _dependencies = {}) { return internal::_sbmv(sb_handle, _Uplo, _N, _K, _alpha, _mA, _lda, _vx, - _incx, _beta, _vy, _incy); + _incx, _beta, _vy, _incy, _dependencies); } /** @@ -1002,13 +1020,13 @@ typename sb_handle_t::event_t _sbmv(sb_handle_t& sb_handle, char _Uplo, template -typename sb_handle_t::event_t _spmv(sb_handle_t& sb_handle, char _Uplo, - index_t _N, element_t _alpha, - container_0_t _mA, container_1_t _vx, - increment_t _incx, element_t _beta, - container_2_t _vy, increment_t _incy) { +typename sb_handle_t::event_t _spmv( + sb_handle_t& sb_handle, char _Uplo, index_t _N, element_t _alpha, + container_0_t _mA, container_1_t _vx, increment_t _incx, element_t _beta, + container_2_t _vy, increment_t _incy, + const typename sb_handle_t::event_t& _dependencies = {}) { return internal::_spmv(sb_handle, _Uplo, _N, _alpha, _mA, _vx, _incx, _beta, - _vy, _incy); + _vy, _incy, _dependencies); } /** @@ -1033,15 +1051,17 @@ typename sb_handle_t::event_t _spmv(sb_handle_t& sb_handle, char _Uplo, * @param _lda Leading dimension _mA at least (_K + 
1) * @param _vx Buffer containing x of at least (1+(_N-1)*abs(_incx)) elements * @param _incx Increment for _vx (nonzero) + * @param _dependencies Vector of events */ template -typename sb_handle_t::event_t _tbmv(sb_handle_t& sb_handle, char _Uplo, - char _trans, char _Diag, index_t _N, - index_t _K, container_0_t _mA, index_t _lda, - container_1_t _vx, increment_t _incx) { +typename sb_handle_t::event_t _tbmv( + sb_handle_t& sb_handle, char _Uplo, char _trans, char _Diag, index_t _N, + index_t _K, container_0_t _mA, index_t _lda, container_1_t _vx, + increment_t _incx, + const typename sb_handle_t::event_t& _dependencies = {}) { return internal::_tbmv(sb_handle, _Uplo, _trans, _Diag, _N, _K, _mA, _lda, - _vx, _incx); + _vx, _incx, _dependencies); } /** @@ -1064,14 +1084,16 @@ typename sb_handle_t::event_t _tbmv(sb_handle_t& sb_handle, char _Uplo, * matrix format * @param _vx Buffer containing x of at least (1+(_N-1)*abs(_incx)) elements * @param _incx Increment for _vx (nonzero) + * @param _dependencies Vector of events */ template -typename sb_handle_t::event_t _tpmv(sb_handle_t& sb_handle, char _Uplo, - char _trans, char _Diag, index_t _N, - container_0_t _mA, container_1_t _vx, - increment_t _incx) { - return internal::_tpmv(sb_handle, _Uplo, _trans, _Diag, _N, _mA, _vx, _incx); +typename sb_handle_t::event_t _tpmv( + sb_handle_t& sb_handle, char _Uplo, char _trans, char _Diag, index_t _N, + container_0_t _mA, container_1_t _vx, increment_t _incx, + const typename sb_handle_t::event_t& _dependencies = {}) { + return internal::_tpmv(sb_handle, _Uplo, _trans, _Diag, _N, _mA, _vx, _incx, + _dependencies); } /** @@ -1095,18 +1117,20 @@ typename sb_handle_t::event_t _tpmv(sb_handle_t& sb_handle, char _Uplo, * @param _lda Leading dimension _mA at least (_K + 1) * @param _vx Buffer containing x of at least (1+(_N-1)*abs(_incx)) elements * @param _incx Increment for _vx (nonzero) + * @param _dependencies Vector of events */ template -typename sb_handle_t::event_t _tbsv(sb_handle_t& sb_handle, char _Uplo, - char _trans, char _Diag, index_t _N, - index_t _K, container_0_t _mA, index_t _lda, - container_1_t _vx, increment_t _incx) { +typename sb_handle_t::event_t _tbsv( + sb_handle_t& sb_handle, char _Uplo, char _trans, char _Diag, index_t _N, + index_t _K, container_0_t _mA, index_t _lda, container_1_t _vx, + increment_t _incx, + const typename sb_handle_t::event_t& _dependencies = {}) { return internal::_tbsv(sb_handle, _Uplo, _trans, _Diag, _N, _K, _mA, _lda, - _vx, _incx); + _vx, _incx, _dependencies); } -/** +/** * @brief Linear system solver for triangular packed matrices. * * Linear system solver for triangular packed matrices, i.e., computing x s.t.
@@ -1131,8 +1155,9 @@ template -typename sb_handle_t::event_t _gemm(sb_handle_t& sb_handle, char _TransA, - char _TransB, index_t _M, index_t _N, - index_t _K, element_t _alpha, - container_0_t a_, index_t _lda, - container_1_t b_, index_t _ldb, - element_t _beta, container_2_t _C, - index_t _ldc); +typename sb_handle_t::event_t _gemm( + sb_handle_t& sb_handle, char _TransA, char _TransB, index_t _M, index_t _N, + index_t _K, element_t _alpha, container_0_t a_, index_t _lda, + container_1_t b_, index_t _ldb, element_t _beta, container_2_t _C, + index_t _ldc, const typename sb_handle_t::event_t& _dependencies); template @@ -55,7 +53,8 @@ typename sb_handle_t::event_t _gemm_batched( index_t _K, element_t _alpha, container_0_t a_, index_t _lda, container_1_t b_, index_t _ldb, element_t _beta, container_2_t _C, index_t _ldc, index_t batch_size, - gemm_batch_type_t batch_type = gemm_batch_type_t::strided); + gemm_batch_type_t batch_type = gemm_batch_type_t::strided, + const typename sb_handle_t::event_t& _dependencies = {}); template @@ -64,37 +63,35 @@ typename sb_handle_t::event_t _gemm_strided_batched( index_t _K, element_t _alpha, container_0_t a_, index_t _lda, index_t _stridea, container_1_t b_, index_t _ldb, index_t _strideb, element_t _beta, container_2_t _C, index_t _ldc, index_t _stridec, - index_t batch_size); + index_t batch_size, const typename sb_handle_t::event_t& _dependencies); template -typename sb_handle_t::event_t _trsm(sb_handle_t& sb_handle, char side, - char uplo, char trans, char diag, index_t M, - index_t N, element_t alpha, container_0_t A, - index_t lda, container_1_t B, index_t ldb); +typename sb_handle_t::event_t _trsm( + sb_handle_t& sb_handle, char side, char uplo, char trans, char diag, + index_t M, index_t N, element_t alpha, container_0_t A, index_t lda, + container_1_t B, index_t ldb, + const typename sb_handle_t::event_t& _dependencies); template -typename sb_handle_t::event_t _symm(sb_handle_t& sb_handle, char _side, - char _uplo, index_t _M, index_t _N, - element_t _alpha, container_0_t a_, - index_t _lda, container_1_t b_, - index_t _ldb, element_t _beta, - container_2_t _C, index_t _ldc); +typename sb_handle_t::event_t _symm( + sb_handle_t& sb_handle, char _side, char _uplo, index_t _M, index_t _N, + element_t _alpha, container_0_t a_, index_t _lda, container_1_t b_, + index_t _ldb, element_t _beta, container_2_t _C, index_t _ldc, + const typename sb_handle_t::event_t& _dependencies); } // namespace internal template -typename sb_handle_t::event_t _gemm(sb_handle_t& sb_handle, char _TransA, - char _TransB, index_t _M, index_t _N, - index_t _K, element_t _alpha, - container_0_t a_, index_t _lda, - container_1_t b_, index_t _ldb, - element_t _beta, container_2_t _C, - index_t _ldc) { +typename sb_handle_t::event_t _gemm( + sb_handle_t& sb_handle, char _TransA, char _TransB, index_t _M, index_t _N, + index_t _K, element_t _alpha, container_0_t a_, index_t _lda, + container_1_t b_, index_t _ldb, element_t _beta, container_2_t _C, + index_t _ldc, const typename sb_handle_t::event_t& _dependencies = {}) { return internal::_gemm(sb_handle, _TransA, _TransB, _M, _N, _K, _alpha, a_, - _lda, b_, _ldb, _beta, _C, _ldc); + _lda, b_, _ldb, _beta, _C, _ldc, _dependencies); } template -typename sb_handle_t::event_t inline _trsm(sb_handle_t& sb_handle, char side, - char uplo, char trans, char diag, - index_t M, index_t N, - element_t alpha, container_0_t A, - index_t lda, container_1_t B, - index_t ldb) { +typename sb_handle_t::event_t inline _trsm( + sb_handle_t& 
sb_handle, char side, char uplo, char trans, char diag, + index_t M, index_t N, element_t alpha, container_0_t A, index_t lda, + container_1_t B, index_t ldb, + const typename sb_handle_t::event_t& _dependencies = {}) { return internal::_trsm(sb_handle, side, uplo, trans, diag, M, N, alpha, A, - lda, B, ldb); + lda, B, ldb, _dependencies); } template -typename sb_handle_t::event_t _symm(sb_handle_t& sb_handle, char _side, - char _uplo, index_t _M, index_t _N, - element_t _alpha, container_0_t a_, - index_t _lda, container_1_t b_, - index_t _ldb, element_t _beta, - container_2_t _C, index_t _ldc) { +typename sb_handle_t::event_t _symm( + sb_handle_t& sb_handle, char _side, char _uplo, index_t _M, index_t _N, + element_t _alpha, container_0_t a_, index_t _lda, container_1_t b_, + index_t _ldb, element_t _beta, container_2_t _C, index_t _ldc, + const typename sb_handle_t::event_t& _dependencies = {}) { return internal::_symm(sb_handle, _side, _uplo, _M, _N, _alpha, a_, _lda, b_, - _ldb, _beta, _C, _ldc); + _ldb, _beta, _C, _ldc, _dependencies); } } // namespace blas diff --git a/include/interface/extension_interface.h b/include/interface/extension_interface.h index 32fc7930f..44b3ff5b1 100644 --- a/include/interface/extension_interface.h +++ b/include/interface/extension_interface.h @@ -65,7 +65,8 @@ typename sb_handle_t::event_t _matcopy(sb_handle_t& sb_handle, char trans, index_t m, index_t n, element_t alpha, in_t in_memory, index_t ld_in, index_t inc_in, out_t out_memory, - index_t ld_out, index_t inc_out); + index_t ld_out, index_t inc_out, + const typename sb_handle_t::event_t& _dependencies); template @@ -74,13 +75,15 @@ typename sb_handle_t::event_t _omatadd(sb_handle_t& sb_handle, char trans_a, element_t alpha, container_t a, index_t lda, element_t beta, container_t b, index_t ldb, - container_t c, index_t ldc); + container_t c, index_t ldc, + const typename sb_handle_t::event_t& _dependencies); template typename sb_handle_t::event_t _transpose(sb_handle_t& sb_handle, index_t m, index_t n, in_t A, index_t ld_a, - out_t B, index_t ld_b); + out_t B, index_t ld_b, + const typename sb_handle_t::event_t& _dependencies); template @@ -88,7 +91,8 @@ typename sb_handle_t::event_t _reduction(sb_handle_t& sb_handle, input_t buffer_in, index_t ld, output_t buffer_out, index_t rows, index_t cols, - reduction_dim_t reduction_dim); + reduction_dim_t reduction_dim, + const typename sb_handle_t::event_t& _dependencies); template (sb_handle, trans, m, n, alpha, in_memory, ld_in, static_cast(1), out_memory, - ld_out, static_cast(1)); + ld_out, static_cast(1), + _dependencies); } /** @@ -169,9 +175,11 @@ typename sb_handle_t::event_t _omatcopy2(sb_handle_t& sb_handle, char trans, index_t m, index_t n, element_t alpha, in_t in_memory, index_t ld_in, index_t inc_in, out_t out_memory, - index_t ld_out, index_t inc_out) { + index_t ld_out, index_t inc_out, + const typename sb_handle_t::event_t& _dependencies = {}) { return internal::_matcopy(sb_handle, trans, m, n, alpha, in_memory, - ld_in, inc_in, out_memory, ld_out, inc_out); + ld_in, inc_in, out_memory, ld_out, inc_out, + _dependencies); } /** @@ -202,9 +210,10 @@ typename sb_handle_t::event_t _omatadd(sb_handle_t& sb_handle, char trans_a, element_t alpha, container_t A, index_t lda, element_t beta, container_t B, index_t ldb, - container_t C, index_t ldc) { + container_t C, index_t ldc, + const typename sb_handle_t::event_t& _dependencies = {}) { return internal::_omatadd(sb_handle, trans_a, trans_b, m, n, alpha, A, lda, - beta, B, ldb, C, ldc); + 
beta, B, ldb, C, ldc, _dependencies); } namespace extension { @@ -230,9 +239,10 @@ template typename sb_handle_t::event_t _transpose(sb_handle_t& sb_handle, index_t m, index_t n, in_t A, index_t ld_in, - index_t ld_out) { + index_t ld_out, + const typename sb_handle_t::event_t& _dependencies = {}) { return blas::internal::_transpose(sb_handle, m, n, A, ld_in, - A, ld_out); + A, ld_out, _dependencies); } /** @@ -258,9 +268,10 @@ template typename sb_handle_t::event_t _transpose(sb_handle_t& sb_handle, index_t m, index_t n, in_t A, index_t ld_a, - out_t B, index_t ld_b) { + out_t B, index_t ld_b, + const typename sb_handle_t::event_t& _dependencies = {}) { return blas::internal::_transpose(sb_handle, m, n, A, ld_a, - B, ld_b); + B, ld_b, _dependencies); } template ( - sb_handle, buffer_in, ld, buffer_out, rows, cols, reduction_dim); + sb_handle, buffer_in, ld, buffer_out, rows, cols, reduction_dim, _dependencies); } } // namespace extension diff --git a/include/interface/gemm_launcher.h b/include/interface/gemm_launcher.h index 01dedcde5..42b5d77a5 100644 --- a/include/interface/gemm_launcher.h +++ b/include/interface/gemm_launcher.h @@ -34,21 +34,21 @@ namespace blas { /*! * @brief Wrapper around Gemm. Creates the views, then makes and launches Gemm */ -template (gemm_batch_type_t::strided), bool UseJointMatrix = false> struct Gemm_Launcher { - template + template static typename sb_handle_t::event_t _select_gemm( sb_handle_t& sb_handle, index_t _M, index_t _N, index_t _K, element_t _alpha, container_0_t a_, index_t _lda, index_t _stridea, container_1_t b_, index_t _ldb, index_t _strideb, element_t _beta, - container_2_t _C, index_t _ldc, index_t _stridec, index_t batch_size); + container_2_t _C, index_t _ldc, index_t _stridec, index_t batch_size, + const typename sb_handle_t::event_t& _dependencies = {}); }; } // namespace blas diff --git a/include/sb_handle/kernel_constructor.h b/include/sb_handle/kernel_constructor.h index 2bdb8c040..73e9fa414 100644 --- a/include/sb_handle/kernel_constructor.h +++ b/include/sb_handle/kernel_constructor.h @@ -118,9 +118,9 @@ struct ExpressionTreeFunctor; using_local_memory == false). 
*/ template -static cl::sycl::event execute_tree(queue_t q, expression_tree_t t, - size_t _localSize, size_t _globalSize, - size_t _shMem); +static cl::sycl::event execute_tree( + queue_t q, expression_tree_t t, size_t _localSize, size_t _globalSize, + size_t _shMem, std::vector dependencies = {}); } // namespace blas diff --git a/include/sb_handle/sycl_blas_handle.h b/include/sb_handle/sycl_blas_handle.h index a8c1e388e..a3d86ecb9 100644 --- a/include/sb_handle/sycl_blas_handle.h +++ b/include/sb_handle/sycl_blas_handle.h @@ -53,25 +53,27 @@ class SB_Handle { computeUnits_(helper::get_num_compute_units(q)) {} template - event_t execute(expression_tree_t tree); + event_t execute(expression_tree_t tree, const event_t& dependencies = {}); template - event_t execute(expression_tree_t tree, index_t localSize); + event_t execute(expression_tree_t tree, index_t localSize, + const event_t& dependencies = {}); template - event_t execute(expression_tree_t tree, index_t localSize, - index_t globalSize); + event_t execute(expression_tree_t tree, index_t localSize, index_t globalSize, + const event_t& dependencies = {}); template event_t execute(expression_tree_t tree, index_t localSize, index_t globalSize, - index_t local_memory_size); + index_t local_memory_size, const event_t& dependencies = {}); template - event_t execute(AssignReduction); + event_t execute(AssignReduction, + const event_t& dependencies = {}); template event_t execute(AssignReduction t, - local_memory_t scr); + local_memory_t scr, const event_t& dependencies = {}); template - gemm_tree); + gemm_tree, + const event_t& dependencies = {}); // Tall and skinny Gemm specialization template (gemm_algorithm_t::tall_skinny), GemmVectorization, VectorSize, BatchType> - gemm_wrapper); + gemm_wrapper, + const event_t& dependencies = {}); // GemmPartial specialization template - gemm_partial); + gemm_partial, + const event_t& dependencies = {}); // Reduction specialization (inner or outer dimension) template event_t execute( - Reduction reduction_wrapper); + Reduction reduction_wrapper, + const event_t& dependencies = {}); inline bool has_local_memory() const { return localMemorySupport_; } inline queue_t get_queue() const { return q_; } @@ -134,8 +140,10 @@ class SB_Handle { // this must be in header as the number of event is controlled by user and we // dont know howmany permutation can be used by a user template - void inline wait(first_event_t first_event, next_event_t... next_events) { - cl::sycl::event::wait(concatenate_vectors(first_event, next_events...)); + void inline wait(first_event_t first_event, + next_event_t... 
next_dependencies) { + cl::sycl::event::wait( + concatenate_vectors(first_event, next_dependencies...)); } private: diff --git a/include/sycl_blas_helper.h b/include/sycl_blas_helper.h index 043ada44b..4cfd84b5c 100644 --- a/include/sycl_blas_helper.h +++ b/include/sycl_blas_helper.h @@ -32,6 +32,76 @@ namespace blas { namespace helper { +/** + * Allocation type for tests and benchmarks + */ +enum class AllocType : int { usm = 0, buffer = 1 }; + +template +struct AllocHelper; + +template +struct AllocHelper { + using type = value_t *; +}; + +template +struct AllocHelper { + using type = blas::BufferIterator; +}; + +#ifdef SB_ENABLE_USM +template +typename std::enable_if::type>::type +allocate(int size, cl::sycl::queue q) { + return cl::sycl::malloc_device(size, q); +} +#endif + +template +typename std::enable_if::type>::type +allocate(int size, cl::sycl::queue q) { + return make_sycl_iterator_buffer(size); +} + +#ifdef SB_ENABLE_USM +template +typename std::enable_if::type deallocate( + container_t mem, cl::sycl::queue q) { + if (mem != NULL) { + cl::sycl::free(reinterpret_cast(mem), q); + } +} +#endif + +template +typename std::enable_if::type deallocate( + container_t mem, cl::sycl::queue q) {} + +template +typename std::enable_if::type, + AllocType::usm>::type>::value>::type +enqueue_deallocate(std::vector dependencies, container_t mem, + cl::sycl::queue q) { +#ifdef SB_ENABLE_USM + auto event = q.submit([&](cl::sycl::handler &cgh) { + cgh.depends_on(dependencies); + cgh.host_task([=]() { cl::sycl::free(mem, q); }); + }); +#endif + return; +} + +template +typename std::enable_if::type, + AllocType::buffer>::type>::value>::type +enqueue_deallocate(std::vector, container_t mem, + cl::sycl::queue q) {} + inline bool has_local_memory(cl::sycl::queue &q) { return (q.get_device() .template get_info() == @@ -69,6 +139,15 @@ inline cl::sycl::event copy_to_device(cl::sycl::queue q, const element_t *src, return event; } +#ifdef SB_ENABLE_USM +template +inline cl::sycl::event copy_to_device(cl::sycl::queue q, const element_t *src, + element_t *dst, size_t size) { + auto event = q.memcpy(dst, src, size * sizeof(element_t)); + return event; +} +#endif + /* @brief Copying the data back to device @tparam element_t is the type of the data @param src is the BufferIterator we want to copy from. 
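The sycl_blas_helper.h hunk above introduces `allocate`/`deallocate`, USM overloads of `copy_to_device`/`copy_to_host`, and a `fill` overload that accepts a dependency vector. The sketch below shows how these pieces are expected to fit together; it is not taken from the patch. The `<AllocType, element type>` ordering of `allocate`/`deallocate` is an assumption, since the flattened diff drops the template parameter lists, and all names in the function are hypothetical.

```c++
#include <vector>
#include "sycl_blas.hpp"  // assumed umbrella header for this library

using blas::helper::AllocType;

// Sketch: allocate USM device memory, stage data, run an operation with
// explicit event dependencies, copy back and deallocate.
void scaled_add(blas::SB_Handle &sb_handle, const std::vector<float> &h_x,
                std::vector<float> &h_y /* pre-sized to h_x.size() */) {
  auto q = sb_handle.get_queue();
  const int n = static_cast<int>(h_x.size());

  // Template-argument order assumed: allocation type first, element type second
  auto d_x = blas::helper::allocate<AllocType::usm, float>(n, q);
  auto d_y = blas::helper::allocate<AllocType::usm, float>(n, q);

  auto cp_x = blas::helper::copy_to_device(q, h_x.data(), d_x, n);
  auto fill_y = blas::helper::fill(q, d_y, 0.0f, n, {});  // USM fill overload

  // y = 2*x + y, ordered after both the copy and the fill
  auto axpy_ev = blas::_axpy(sb_handle, n, 2.0f, d_x, 1, d_y, 1, {cp_x, fill_y});
  sb_handle.wait(axpy_ev);  // the USM copy_to_host below takes no dependency list

  auto cp_y = blas::helper::copy_to_host(q, d_y, h_y.data(), n);
  cp_y.wait();

  blas::helper::deallocate<AllocType::usm>(d_x, q);
  blas::helper::deallocate<AllocType::usm>(d_y, q);
}
```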
@@ -87,9 +166,19 @@ inline cl::sycl::event copy_to_host(cl::sycl::queue q, return event; } +#ifdef SB_ENABLE_USM +template +inline cl::sycl::event copy_to_host(cl::sycl::queue q, element_t *src, + element_t *dst, size_t size) { + auto event = q.memcpy(dst, src, size * sizeof(element_t)); + return event; +} +#endif + template inline cl::sycl::event fill(cl::sycl::queue q, BufferIterator buff, - element_t value, size_t size) { + element_t value, size_t size, + const std::vector &) { auto event = q.submit([&](cl::sycl::handler &cgh) { auto acc = buff.template get_range_accessor( cgh, size); @@ -97,6 +186,20 @@ inline cl::sycl::event fill(cl::sycl::queue q, BufferIterator buff, }); return event; } + +#ifdef SB_ENABLE_USM +template +inline cl::sycl::event fill(cl::sycl::queue q, element_t *buff, element_t value, + size_t size, + const std::vector &dependencies) { + auto event = q.submit([&](cl::sycl::handler &cgh) { + cgh.depends_on(dependencies); + cgh.fill(buff, value, size); + }); + return event; +} +#endif + } // end namespace helper } // end namespace blas #endif // SYCL_BLAS_HELPER_H diff --git a/include/views/view.h b/include/views/view.h index 129fe7800..2c8353f13 100644 --- a/include/views/view.h +++ b/include/views/view.h @@ -52,67 +52,83 @@ struct VectorView { using index_t = view_index_t; using increment_t = view_increment_t; using self_t = VectorView; - container_t &data_; - index_t size_data_; + container_t data_; index_t size_; - index_t disp_; increment_t strd_; // never size_t, because it could be negative - VectorView(view_container_t &data, view_index_t disp = 0, - view_increment_t strd = 1); - VectorView(view_container_t &data, view_index_t disp, view_increment_t strd, - view_index_t size); + // global pointer access inside the kernel + cl::sycl::global_ptr ptr_; + + VectorView(view_container_t data, view_increment_t strd, view_index_t size); VectorView( VectorView opV, - view_index_t disp, view_increment_t strd, view_index_t size); - - /*! - @brief Initializes the view using the indexing values. - @param originalSize The original size of the container - */ - inline void initialize(index_t originalSize); + view_increment_t strd, view_index_t size); /*! * @brief Returns a reference to the container */ - container_t &get_data(); + SYCL_BLAS_INLINE container_t get_data(); /*! * @brief Returns a pointer containing the raw data of the container */ - value_t *get_pointer(); - - /*! get_access_displacement. - * @brief get displacement from the origin. - */ - index_t get_access_displacement(); + SYCL_BLAS_INLINE value_t *get_pointer() const; /*! adjust_access_displacement. * @brief this method adjust the position of the data access to point to the * data_ + offset_ on the device side. This function will be called at the * begining of an expression so that the kernel wont repeat this operation at - * every eval call + * every eval call. + * For USM case, this method is not going to do anything as the library + * doesn't allow pointer manipulation. */ - void adjust_access_displacement(); - - /*! - * @brief Returns the size of the underlying container. - */ - index_t get_data_size(); + SYCL_BLAS_INLINE void adjust_access_displacement(); /*! @brief Returns the size of the view */ - inline index_t get_size() const; + SYCL_BLAS_INLINE index_t get_size() const; /*! @brief Returns the stride of the view. 
*/ - increment_t get_stride(); + SYCL_BLAS_INLINE increment_t get_stride(); + + SYCL_BLAS_INLINE void bind(cl::sycl::handler &h) {} /**** EVALUATING ****/ - value_t &eval(index_t i); + template + SYCL_BLAS_INLINE typename std::enable_if::type eval( + index_t i) { + return (strd_ == 1) ? *(ptr_ + i) : *(ptr_ + i * strd_); + } + + template + SYCL_BLAS_INLINE typename std::enable_if::type eval( + index_t i) const { + return (strd_ == 1) ? *(ptr_ + i) : *(ptr_ + i * strd_); + } + + SYCL_BLAS_INLINE value_t &eval(cl::sycl::nd_item<1> ndItem) { + return eval(ndItem.get_global_id(0)); + } + + SYCL_BLAS_INLINE const value_t eval(cl::sycl::nd_item<1> ndItem) const { + return eval(ndItem.get_global_id(0)); + } + + template + SYCL_BLAS_INLINE typename std::enable_if::type eval( + index_t indx) { + return *(ptr_ + indx); + } + + template + SYCL_BLAS_INLINE typename std::enable_if::type eval( + index_t indx) const noexcept { + return *(ptr_ + indx); + } }; /*! MatrixView @@ -130,79 +146,116 @@ struct MatrixView { using container_t = view_container_t; using index_t = view_index_t; using self_t = MatrixView; - container_t &data_; - index_t size_data_; // real size of the data + container_t data_; index_t sizeR_; // number of rows index_t sizeC_; // number of columns index_t sizeL_; // size of the leading dimension - index_t disp_; // displacementt od the first element + + // global pointer access inside the kernel + cl::sycl::global_ptr ptr_; + // UPLO, BAND(KU,KL), PACKED, SIDE ARE ONLY REQUIRED - MatrixView(view_container_t &data, view_index_t sizeR, view_index_t sizeC); - MatrixView(view_container_t &data, view_index_t sizeR, view_index_t sizeC, - view_index_t sizeL, view_index_t disp); + MatrixView(view_container_t data, view_index_t sizeR, view_index_t sizeC); + MatrixView(view_container_t data, view_index_t sizeR, view_index_t sizeC, + view_index_t sizeL); MatrixView( - MatrixView opM, - view_index_t sizeR, view_index_t sizeC, view_index_t sizeL, - view_index_t disp); + MatrixView opM, + view_index_t sizeR, view_index_t sizeC, view_index_t sizeL); /*! * @brief Returns the container */ - container_t &get_data(); + SYCL_BLAS_INLINE container_t get_data(); /*! * @brief Returns the container */ - value_t *get_pointer(); + SYCL_BLAS_INLINE value_t *get_pointer() const; /*! - * @brief Returns the data size + * @brief Returns the size of the view. */ - index_t get_data_size() const; + SYCL_BLAS_INLINE index_t get_size() const; /*! - * @brief Returns the size of the view. + * @brief Returns the leading dimension. */ - inline index_t get_size() const; + SYCL_BLAS_INLINE const index_t getSizeL() const; /*! get_size_row. * @brief Return the number of columns. * @bug This value should change depending on the access mode, but * is currently set to Rows. */ - inline index_t get_size_row() const; + SYCL_BLAS_INLINE index_t get_size_row() const; /*! get_size_col. * @brief Return the number of columns. * @bug This value should change depending on the access mode, but * is currently set to Rows. */ - index_t get_size_col() const; - - /*! get_access_device. - * @brief Access on the Device (e.g CPU: Row, GPU: Column). - */ - int get_access_device() const; + SYCL_BLAS_INLINE index_t get_size_col() const; /*! adjust_access_displacement. * @brief set displacement from the origin. + * This method allows to have a pointer arithmetic semantics for buffers + * in the host code. The end result of the pointer arithmetic is passed + * as an access displacement for the buffer. 
+ * In the case of USM, this method does nothing since the pointer + * arithmetic is performed implicitly. */ - void adjust_access_displacement(); - - /*! get_access_displacement. - * @brief get displacement from the origin. - */ - index_t get_access_displacement() const; + SYCL_BLAS_INLINE void adjust_access_displacement(); - /*! eval. - * @brief Evaluation for the given linear value. - */ - value_t &eval(index_t k); + SYCL_BLAS_INLINE void bind(cl::sycl::handler &h) {} /*! eval. * @brief Evaluation for the pair of row/col. */ - value_t &eval(index_t i, index_t j); + SYCL_BLAS_INLINE value_t &eval(index_t i, index_t j) { + return ((layout::is_col_major()) ? *(ptr_ + i + sizeL_ * j) + : *(ptr_ + j + sizeL_ * i)); + } + + SYCL_BLAS_INLINE value_t eval(index_t i, index_t j) const noexcept { + return ((layout::is_col_major()) ? *(ptr_ + i + sizeL_ * j) + : *(ptr_ + j + sizeL_ * i)); + } + + template + SYCL_BLAS_INLINE typename std::enable_if::type eval( + index_t indx) { + const index_t j = indx / sizeR_; + const index_t i = indx - sizeR_ * j; + return eval(i, j); + } + + template + SYCL_BLAS_INLINE typename std::enable_if::type eval( + index_t indx) const noexcept { + const index_t j = indx / sizeR_; + const index_t i = indx - sizeR_ * j; + return eval(i, j); + } + + SYCL_BLAS_INLINE value_t &eval(cl::sycl::nd_item<1> ndItem) { + return eval(ndItem.get_global_id(0)); + } + + SYCL_BLAS_INLINE value_t eval(cl::sycl::nd_item<1> ndItem) const noexcept { + return eval(ndItem.get_global_id(0)); + } + + template + SYCL_BLAS_INLINE typename std::enable_if::type eval( + index_t indx) { + return *(ptr_ + indx); + } + + template + SYCL_BLAS_INLINE typename std::enable_if::type eval( + index_t indx) const noexcept { + return *(ptr_ + indx); + } }; template -static inline auto make_vector_view(BufferIterator buff, - increment_t inc, index_t sz) { +static SYCL_BLAS_INLINE auto make_vector_view(BufferIterator buff, + increment_t inc, index_t sz) { static constexpr cl::sycl::access::mode access_mode_t = Choose::value, cl::sycl::access::mode, cl::sycl::access::mode::read, @@ -235,8 +288,9 @@ static inline auto make_vector_view(BufferIterator buff, } template -static inline auto make_matrix_view(BufferIterator buff, index_t m, - index_t n, index_t lda) { +static SYCL_BLAS_INLINE auto make_matrix_view(BufferIterator buff, + index_t m, index_t n, + index_t lda) { static constexpr cl::sycl::access::mode access_mode_t = Choose::value, cl::sycl::access::mode, cl::sycl::access::mode::read, @@ -269,6 +323,29 @@ static inline auto make_matrix_view(BufferIterator buff, index_t m, inc, (index_t)buff.get_offset()}; } +template +static SYCL_BLAS_INLINE auto make_vector_view(scalar_t *usm_ptr, + increment_t inc, index_t sz) { + using leaf_node_t = VectorView; + return leaf_node_t{usm_ptr, inc, sz}; +} + +template +static SYCL_BLAS_INLINE auto make_matrix_view(scalar_t *usm_ptr, index_t m, + index_t n, index_t lda) { + using leaf_node_t = + MatrixView; + return leaf_node_t{usm_ptr, m, n, lda}; +} + +template +static SYCL_BLAS_INLINE auto make_matrix_view(scalar_t *usm_ptr, index_t m, + index_t n, index_t lda, index_t inc) { + using leaf_node_t = + MatrixView; + return leaf_node_t{usm_ptr, m, n, lda}; +} + } // namespace blas #endif // VIEW_H diff --git a/python_generator/py_gen_blas_gemm_launcher.py b/python_generator/py_gen_blas_gemm_launcher.py index 9bac81c4d..b77a6e997 100644 --- a/python_generator/py_gen_blas_gemm_launcher.py +++ b/python_generator/py_gen_blas_gemm_launcher.py @@ -72,6 +72,9 @@ use_joint_matrix = 
sys.argv[37] symm_a = sys.argv[38] symm_b = sys.argv[39] + container0 = sys.argv[40] + container1 = sys.argv[41] + container2 = sys.argv[42] source = 'generated_src/' + blas_level_name + '/' + blas_function_name + '/' try: os.makedirs(source) @@ -246,6 +249,21 @@ key='SYMM_B', vals=[symm_b], itermode=Itermode.combinations, + iter_modifier=1), + Iterable( + key='container_t0', + vals=[container0], + itermode=Itermode.combinations, + iter_modifier=1), + Iterable( + key='container_t1', + vals=[container1], + itermode=Itermode.combinations, + iter_modifier=1), + Iterable( + key='container_t2', + vals=[container2], + itermode=Itermode.combinations, iter_modifier=1) ] iter_groups = [IterGroup('@ip1@', template, iterables, combine_iters=True)] diff --git a/src/interface/blas1/asum.cpp.in b/src/interface/blas1/asum.cpp.in index 304ac3f09..5a4e9ec30 100644 --- a/src/interface/blas1/asum.cpp.in +++ b/src/interface/blas1/asum.cpp.in @@ -23,11 +23,11 @@ * **************************************************************************/ #include "container/sycl_iterator.hpp" -#include "sb_handle/sycl_blas_handle.hpp" -#include "sb_handle/kernel_constructor.hpp" #include "interface/blas1_interface.hpp" #include "operations/blas1_trees.hpp" #include "operations/blas_constants.hpp" +#include "sb_handle/kernel_constructor.hpp" +#include "sb_handle/sycl_blas_handle.hpp" #include "views/view_sycl.hpp" namespace blas { @@ -38,9 +38,12 @@ namespace internal { * @param _vx VectorView * @param _incx Increment in X axis */ -template typename SB_Handle::event_t _asum( - SB_Handle &sb_handle, ${INDEX_TYPE} _N, ${container_t0} _vx, - ${INCREMENT_TYPE} _incx, ${container_t1} _rs); +template typename SB_Handle::event_t _asum(SB_Handle &sb_handle, + ${INDEX_TYPE} _N, + ${container_t0} _vx, + ${INCREMENT_TYPE} _incx, + ${container_t1} _rs, + const typename SB_Handle::event_t& dependencies); } // namespace internal } // namespace blas diff --git a/src/interface/blas1/asum_return.cpp.in b/src/interface/blas1/asum_return.cpp.in index d1f182d7e..1bdad8d08 100644 --- a/src/interface/blas1/asum_return.cpp.in +++ b/src/interface/blas1/asum_return.cpp.in @@ -23,11 +23,11 @@ * **************************************************************************/ #include "container/sycl_iterator.hpp" -#include "sb_handle/sycl_blas_handle.hpp" -#include "sb_handle/kernel_constructor.hpp" #include "interface/blas1_interface.hpp" #include "operations/blas1_trees.hpp" #include "operations/blas_constants.hpp" +#include "sb_handle/kernel_constructor.hpp" +#include "sb_handle/sycl_blas_handle.hpp" #include "views/view_sycl.hpp" namespace blas { @@ -39,11 +39,9 @@ namespace internal { * @param _vx VectorView * @param _incx Increment in X axis */ -template - typename ValueType<${container_t0}>::type _asum(SB_Handle &sb_handle, - ${INDEX_TYPE} _N, - ${container_t0} _vx, - ${INCREMENT_TYPE} _incx); +template typename ValueType<${container_t0}>::type _asum( + SB_Handle &sb_handle, ${INDEX_TYPE} _N, ${container_t0} _vx, + ${INCREMENT_TYPE} _incx, const typename SB_Handle::event_t& dependencies); } // namespace internal } // namespace blas diff --git a/src/interface/blas1/axpy.cpp.in b/src/interface/blas1/axpy.cpp.in index 9d2b64a9f..258404229 100644 --- a/src/interface/blas1/axpy.cpp.in +++ b/src/interface/blas1/axpy.cpp.in @@ -23,11 +23,11 @@ * **************************************************************************/ #include "container/sycl_iterator.hpp" -#include "sb_handle/sycl_blas_handle.hpp" -#include "sb_handle/kernel_constructor.hpp" #include 
"interface/blas1_interface.hpp" #include "operations/blas1_trees.hpp" #include "operations/blas_constants.hpp" +#include "sb_handle/kernel_constructor.hpp" +#include "sb_handle/sycl_blas_handle.hpp" #include "views/view_sycl.hpp" namespace blas { @@ -48,6 +48,6 @@ namespace internal { template typename SB_Handle::event_t _axpy( SB_Handle &sb_handle, ${INDEX_TYPE} _N, ${DATA_TYPE} _alpha, ${container_t0} _vx, ${INCREMENT_TYPE} _incx, ${container_t1} _vy, - ${INCREMENT_TYPE} _incy); + ${INCREMENT_TYPE} _incy, const typename SB_Handle::event_t& dependencies); } // namespace internal } // end namespace blas diff --git a/src/interface/blas1/copy.cpp.in b/src/interface/blas1/copy.cpp.in index bff01e630..cf9b20ff9 100644 --- a/src/interface/blas1/copy.cpp.in +++ b/src/interface/blas1/copy.cpp.in @@ -23,11 +23,11 @@ * **************************************************************************/ #include "container/sycl_iterator.hpp" -#include "sb_handle/sycl_blas_handle.hpp" -#include "sb_handle/kernel_constructor.hpp" #include "interface/blas1_interface.hpp" #include "operations/blas1_trees.hpp" #include "operations/blas_constants.hpp" +#include "sb_handle/kernel_constructor.hpp" +#include "sb_handle/sycl_blas_handle.hpp" #include "views/view_sycl.hpp" namespace blas { @@ -44,6 +44,7 @@ namespace internal { */ template typename SB_Handle::event_t _copy( SB_Handle &sb_handle, ${INDEX_TYPE} _N, ${container_t0} _vx, - ${INCREMENT_TYPE} _incx, ${container_t1} _vy, ${INCREMENT_TYPE} _incy); + ${INCREMENT_TYPE} _incx, ${container_t1} _vy, ${INCREMENT_TYPE} _incy, + const typename SB_Handle::event_t& dependencies); } // namespace internal } // end namespace blas diff --git a/src/interface/blas1/dot.cpp.in b/src/interface/blas1/dot.cpp.in index f3a4539b6..4cd19a316 100644 --- a/src/interface/blas1/dot.cpp.in +++ b/src/interface/blas1/dot.cpp.in @@ -23,11 +23,11 @@ * **************************************************************************/ #include "container/sycl_iterator.hpp" -#include "sb_handle/sycl_blas_handle.hpp" -#include "sb_handle/kernel_constructor.hpp" #include "interface/blas1_interface.hpp" #include "operations/blas1_trees.hpp" #include "operations/blas_constants.hpp" +#include "sb_handle/kernel_constructor.hpp" +#include "sb_handle/sycl_blas_handle.hpp" #include "views/view_sycl.hpp" namespace blas { @@ -51,9 +51,12 @@ namespace internal { * @param _rs Output buffer * @return Vector of events to wait for. 
*/ -template typename SB_Handle::event_t _dot( - SB_Handle &sb_handle, ${INDEX_TYPE} _N, ${container_t0} _vx, - ${INCREMENT_TYPE} _incx, ${container_t1} _vy, ${INCREMENT_TYPE} _incy, - ${container_t2} _rs); +template typename SB_Handle::event_t _dot(SB_Handle &sb_handle, + ${INDEX_TYPE} _N, ${container_t0} _vx, + ${INCREMENT_TYPE} _incx, + ${container_t1} _vy, + ${INCREMENT_TYPE} _incy, + ${container_t2} _rs, + const typename SB_Handle::event_t& dependencies); } // namespace internal } // namespace blas diff --git a/src/interface/blas1/dot_return.cpp.in b/src/interface/blas1/dot_return.cpp.in index 6f57aae41..37e4a3d9c 100644 --- a/src/interface/blas1/dot_return.cpp.in +++ b/src/interface/blas1/dot_return.cpp.in @@ -23,11 +23,11 @@ * **************************************************************************/ #include "container/sycl_iterator.hpp" -#include "sb_handle/sycl_blas_handle.hpp" -#include "sb_handle/kernel_constructor.hpp" #include "interface/blas1_interface.hpp" #include "operations/blas1_trees.hpp" #include "operations/blas_constants.hpp" +#include "sb_handle/kernel_constructor.hpp" +#include "sb_handle/sycl_blas_handle.hpp" #include "views/view_sycl.hpp" namespace blas { @@ -53,6 +53,7 @@ namespace internal { */ template typename ValueType<${container_t0}>::type _dot( SB_Handle &sb_handle, ${INDEX_TYPE} _N, ${container_t0} _vx, - ${INCREMENT_TYPE} _incx, ${container_t1} _vy, ${INCREMENT_TYPE} _incy); + ${INCREMENT_TYPE} _incx, ${container_t1} _vy, ${INCREMENT_TYPE} _incy, + const typename SB_Handle::event_t& dependencies); } // namespace internal } // namespace blas diff --git a/src/interface/blas1/iamax.cpp.in b/src/interface/blas1/iamax.cpp.in index b8ab27b83..45f519b72 100644 --- a/src/interface/blas1/iamax.cpp.in +++ b/src/interface/blas1/iamax.cpp.in @@ -23,11 +23,11 @@ * **************************************************************************/ #include "container/sycl_iterator.hpp" -#include "sb_handle/sycl_blas_handle.hpp" -#include "sb_handle/kernel_constructor.hpp" #include "interface/blas1_interface.hpp" #include "operations/blas1_trees.hpp" #include "operations/blas_constants.hpp" +#include "sb_handle/kernel_constructor.hpp" +#include "sb_handle/sycl_blas_handle.hpp" #include "views/view_sycl.hpp" namespace blas { @@ -39,9 +39,12 @@ namespace internal { * @param _vx VectorView * @param _incx Increment in X axis */ -template typename SB_Handle::event_t _iamax( - SB_Handle &sb_handle, ${INDEX_TYPE} _N, ${container_t0} _vx, - ${INCREMENT_TYPE} _incx, ${container_t1} _rs); +template typename SB_Handle::event_t _iamax(SB_Handle &sb_handle, + ${INDEX_TYPE} _N, + ${container_t0} _vx, + ${INCREMENT_TYPE} _incx, + ${container_t1} _rs, + const typename SB_Handle::event_t& dependencies); } // namespace internal } // namespace blas diff --git a/src/interface/blas1/iamax_return.cpp.in b/src/interface/blas1/iamax_return.cpp.in index 9a6e445e6..6cbcecc70 100644 --- a/src/interface/blas1/iamax_return.cpp.in +++ b/src/interface/blas1/iamax_return.cpp.in @@ -23,11 +23,11 @@ * **************************************************************************/ #include "container/sycl_iterator.hpp" -#include "sb_handle/sycl_blas_handle.hpp" -#include "sb_handle/kernel_constructor.hpp" #include "interface/blas1_interface.hpp" #include "operations/blas1_trees.hpp" #include "operations/blas_constants.hpp" +#include "sb_handle/kernel_constructor.hpp" +#include "sb_handle/sycl_blas_handle.hpp" #include "views/view_sycl.hpp" namespace blas { @@ -40,7 +40,8 @@ namespace internal { * @param 
SB_Handle sb_handle */ template ${INDEX_TYPE} _iamax(SB_Handle &sb_handle, ${INDEX_TYPE} _N, - ${container_t0} _vx, ${INCREMENT_TYPE} _incx); + ${container_t0} _vx, ${INCREMENT_TYPE} _incx, + const typename SB_Handle::event_t& dependencies); } // namespace internal } // namespace blas diff --git a/src/interface/blas1/iamin.cpp.in b/src/interface/blas1/iamin.cpp.in index e93043958..cfaff1864 100644 --- a/src/interface/blas1/iamin.cpp.in +++ b/src/interface/blas1/iamin.cpp.in @@ -23,11 +23,11 @@ * **************************************************************************/ #include "container/sycl_iterator.hpp" -#include "sb_handle/sycl_blas_handle.hpp" -#include "sb_handle/kernel_constructor.hpp" #include "interface/blas1_interface.hpp" #include "operations/blas1_trees.hpp" #include "operations/blas_constants.hpp" +#include "sb_handle/kernel_constructor.hpp" +#include "sb_handle/sycl_blas_handle.hpp" #include "views/view_sycl.hpp" namespace blas { @@ -39,9 +39,12 @@ namespace internal { * @param _vx VectorView * @param _incx Increment in X axis */ -template typename SB_Handle::event_t _iamin( - SB_Handle &sb_handle, ${INDEX_TYPE} _N, ${container_t0} _vx, - ${INCREMENT_TYPE} _incx, ${container_t1} _rs); +template typename SB_Handle::event_t _iamin(SB_Handle &sb_handle, + ${INDEX_TYPE} _N, + ${container_t0} _vx, + ${INCREMENT_TYPE} _incx, + ${container_t1} _rs, + const typename SB_Handle::event_t& dependencies); } // namespace internal } // namespace blas diff --git a/src/interface/blas1/iamin_return.cpp.in b/src/interface/blas1/iamin_return.cpp.in index c442baed4..09db47ef0 100644 --- a/src/interface/blas1/iamin_return.cpp.in +++ b/src/interface/blas1/iamin_return.cpp.in @@ -23,11 +23,11 @@ * **************************************************************************/ #include "container/sycl_iterator.hpp" -#include "sb_handle/sycl_blas_handle.hpp" -#include "sb_handle/kernel_constructor.hpp" #include "interface/blas1_interface.hpp" #include "operations/blas1_trees.hpp" #include "operations/blas_constants.hpp" +#include "sb_handle/kernel_constructor.hpp" +#include "sb_handle/sycl_blas_handle.hpp" #include "views/view_sycl.hpp" namespace blas { @@ -40,7 +40,8 @@ namespace internal { * @param _incx Increment in X axis */ template ${INDEX_TYPE} _iamin(SB_Handle &sb_handle, ${INDEX_TYPE} _N, - ${container_t0} _vx, ${INCREMENT_TYPE} _incx); + ${container_t0} _vx, ${INCREMENT_TYPE} _incx, + const typename SB_Handle::event_t& dependencies); } // namespace internal } // namespace blas diff --git a/src/interface/blas1/nrm2.cpp.in b/src/interface/blas1/nrm2.cpp.in index 654e78e39..be298f4f5 100644 --- a/src/interface/blas1/nrm2.cpp.in +++ b/src/interface/blas1/nrm2.cpp.in @@ -23,11 +23,11 @@ * **************************************************************************/ #include "container/sycl_iterator.hpp" -#include "sb_handle/sycl_blas_handle.hpp" -#include "sb_handle/kernel_constructor.hpp" #include "interface/blas1_interface.hpp" #include "operations/blas1_trees.hpp" #include "operations/blas_constants.hpp" +#include "sb_handle/kernel_constructor.hpp" +#include "sb_handle/sycl_blas_handle.hpp" #include "views/view_sycl.hpp" namespace blas { @@ -39,8 +39,11 @@ namespace internal { * @param _vx VectorView * @param _incx Increment in X axis */ -template typename SB_Handle::event_t _nrm2( - SB_Handle &sb_handle, ${INDEX_TYPE} _N, ${container_t0} _vx, - ${INCREMENT_TYPE} _incx, ${container_t1} _rs); +template typename SB_Handle::event_t _nrm2(SB_Handle &sb_handle, + ${INDEX_TYPE} _N, + ${container_t0} 
_vx, + ${INCREMENT_TYPE} _incx, + ${container_t1} _rs, + const typename SB_Handle::event_t& dependencies); } // namespace internal } // namespace blas diff --git a/src/interface/blas1/nrm2_return.cpp.in b/src/interface/blas1/nrm2_return.cpp.in index ea5f274ca..f82235677 100644 --- a/src/interface/blas1/nrm2_return.cpp.in +++ b/src/interface/blas1/nrm2_return.cpp.in @@ -23,11 +23,11 @@ * **************************************************************************/ #include "container/sycl_iterator.hpp" -#include "sb_handle/sycl_blas_handle.hpp" -#include "sb_handle/kernel_constructor.hpp" #include "interface/blas1_interface.hpp" #include "operations/blas1_trees.hpp" #include "operations/blas_constants.hpp" +#include "sb_handle/kernel_constructor.hpp" +#include "sb_handle/sycl_blas_handle.hpp" #include "views/view_sycl.hpp" namespace blas { @@ -40,10 +40,8 @@ namespace internal { * @param _vx VectorView * @param _incx Increment in X axis */ -template - typename ValueType<${container_t0}>::type _nrm2(SB_Handle &sb_handle, - ${INDEX_TYPE} _N, - ${container_t0} _vx, - ${INCREMENT_TYPE} _incx); +template typename ValueType<${container_t0}>::type _nrm2( + SB_Handle &sb_handle, ${INDEX_TYPE} _N, ${container_t0} _vx, + ${INCREMENT_TYPE} _incx, const typename SB_Handle::event_t& dependencies); } // namespace internal } // namespace blas diff --git a/src/interface/blas1/rot.cpp.in b/src/interface/blas1/rot.cpp.in index ee2a32701..e36db770b 100644 --- a/src/interface/blas1/rot.cpp.in +++ b/src/interface/blas1/rot.cpp.in @@ -23,11 +23,11 @@ * **************************************************************************/ #include "container/sycl_iterator.hpp" -#include "sb_handle/sycl_blas_handle.hpp" -#include "sb_handle/kernel_constructor.hpp" #include "interface/blas1_interface.hpp" #include "operations/blas1_trees.hpp" #include "operations/blas_constants.hpp" +#include "sb_handle/kernel_constructor.hpp" +#include "sb_handle/sycl_blas_handle.hpp" #include "views/view_sycl.hpp" namespace blas { @@ -44,9 +44,12 @@ namespace internal { * @param _cos cosine * @param _N data size */ -template typename SB_Handle::event_t _rot( - SB_Handle &sb_handle, ${INDEX_TYPE} _N, ${container_t0} _vx, - ${INCREMENT_TYPE} _incx, ${container_t1} _vy, ${INCREMENT_TYPE} _incy, - ${DATA_TYPE} _cos, ${DATA_TYPE} _sin); +template typename SB_Handle::event_t _rot(SB_Handle &sb_handle, + ${INDEX_TYPE} _N, ${container_t0} _vx, + ${INCREMENT_TYPE} _incx, + ${container_t1} _vy, + ${INCREMENT_TYPE} _incy, + ${DATA_TYPE} _cos, ${DATA_TYPE} _sin, + const typename SB_Handle::event_t& dependencies); } // namespace internal } // namespace blas diff --git a/src/interface/blas1/rotg.cpp.in b/src/interface/blas1/rotg.cpp.in index 20789789c..eae48e81c 100644 --- a/src/interface/blas1/rotg.cpp.in +++ b/src/interface/blas1/rotg.cpp.in @@ -23,11 +23,11 @@ * **************************************************************************/ #include "container/sycl_iterator.hpp" -#include "sb_handle/sycl_blas_handle.hpp" -#include "sb_handle/kernel_constructor.hpp" #include "interface/blas1_interface.hpp" #include "operations/blas1_trees.hpp" #include "operations/blas_constants.hpp" +#include "sb_handle/kernel_constructor.hpp" +#include "sb_handle/sycl_blas_handle.hpp" #include "views/view_sycl.hpp" namespace blas { @@ -50,8 +50,10 @@ namespace internal { * @param s[out] Buffer holding the parameter s. * @return Vector of events to wait for. 
*/ -template typename SB_Handle::event_t _rotg( - SB_Handle &sb_handle, ${container_t0} a, ${container_t1} b, - ${container_t2} c, ${container_t3} s); +template typename SB_Handle::event_t _rotg(SB_Handle &sb_handle, + ${container_t0} a, ${container_t1} b, + ${container_t2} c, + ${container_t3} s, + const typename SB_Handle::event_t& dependencies); } // namespace internal } // namespace blas diff --git a/src/interface/blas1/rotg_return.cpp.in b/src/interface/blas1/rotg_return.cpp.in index 9e42fa73d..4daf106d0 100644 --- a/src/interface/blas1/rotg_return.cpp.in +++ b/src/interface/blas1/rotg_return.cpp.in @@ -23,11 +23,11 @@ * **************************************************************************/ #include "container/sycl_iterator.hpp" -#include "sb_handle/sycl_blas_handle.hpp" -#include "sb_handle/kernel_constructor.hpp" #include "interface/blas1_interface.hpp" #include "operations/blas1_trees.hpp" #include "operations/blas_constants.hpp" +#include "sb_handle/kernel_constructor.hpp" +#include "sb_handle/sycl_blas_handle.hpp" #include "views/view_sycl.hpp" namespace blas { @@ -45,7 +45,7 @@ namespace internal { * @param c[out] Scalar representing the output c. * @param s[out] Scalar representing the output s. */ -template void _rotg(SB_Handle &sb_handle, ${DATA_TYPE}& a, - ${DATA_TYPE}& b, ${DATA_TYPE}& c, ${DATA_TYPE}& s); +template void _rotg(SB_Handle &sb_handle, ${DATA_TYPE} & a, ${DATA_TYPE} & b, + ${DATA_TYPE} & c, ${DATA_TYPE} & s, const typename SB_Handle::event_t& dependencies); } // namespace internal } // namespace blas diff --git a/src/interface/blas1/rotm.cpp.in b/src/interface/blas1/rotm.cpp.in index cde9b39c5..556b692f1 100644 --- a/src/interface/blas1/rotm.cpp.in +++ b/src/interface/blas1/rotm.cpp.in @@ -23,11 +23,11 @@ * **************************************************************************/ #include "container/sycl_iterator.hpp" -#include "sb_handle/sycl_blas_handle.hpp" -#include "sb_handle/kernel_constructor.hpp" #include "interface/blas1_interface.hpp" #include "operations/blas1_trees.hpp" #include "operations/blas_constants.hpp" +#include "sb_handle/kernel_constructor.hpp" +#include "sb_handle/sycl_blas_handle.hpp" #include "views/view_sycl.hpp" namespace blas { @@ -41,7 +41,8 @@ namespace internal { * [xi] = [h11 h12] * [xi] * [yi] [h21 h22] [yi] * - * where h11, h12, h21 and h22 represent the modified Givens transformation matrix. + * where h11, h12, h21 and h22 represent the modified Givens transformation + * matrix. * * The value of the flag parameter can be used to modify the matrix as follows: * @@ -60,12 +61,13 @@ namespace internal { * @param _incx Stride of vector x (i.e. measured in elements of _vx) * @param[in, out] _vy Buffer holding input vector y * @param _incy Stride of vector y (i.e. measured in elements of _vy) - * @param[in] _param Buffer with the following layout: [flag, h11, h21, h12, h22]. + * @param[in] _param Buffer with the following layout: [flag, h11, h21, h12, + * h22]. * @return Vector of events to wait for. 
*/ template typename SB_Handle::event_t _rotm( SB_Handle &sb_handle, ${INDEX_TYPE} _N, ${container_t0} _vx, ${INCREMENT_TYPE} _incx, ${container_t1} _vy, ${INCREMENT_TYPE} _incy, - ${container_t2} _param); + ${container_t2} _param, const typename SB_Handle::event_t& dependencies); } // namespace internal } // namespace blas diff --git a/src/interface/blas1/rotmg.cpp.in b/src/interface/blas1/rotmg.cpp.in index 24a2e0e81..0fcbd7809 100644 --- a/src/interface/blas1/rotmg.cpp.in +++ b/src/interface/blas1/rotmg.cpp.in @@ -1,33 +1,33 @@ /*************************************************************************** -* -* @license -* Copyright (C) Codeplay Software Limited -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* For your convenience, a copy of the License has been included in this -* repository. -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -* -* SYCL-BLAS: BLAS implementation using SYCL -* -* @filename rotmg.cpp.in -* -**************************************************************************/ + * + * @license + * Copyright (C) Codeplay Software Limited + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * For your convenience, a copy of the License has been included in this + * repository. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SYCL-BLAS: BLAS implementation using SYCL + * + * @filename rotmg.cpp.in + * + **************************************************************************/ #include "container/sycl_iterator.hpp" -#include "sb_handle/sycl_blas_handle.hpp" -#include "sb_handle/kernel_constructor.hpp" #include "interface/blas1_interface.hpp" #include "operations/blas1_trees.hpp" #include "operations/blas_constants.hpp" +#include "sb_handle/kernel_constructor.hpp" +#include "sb_handle/sycl_blas_handle.hpp" #include "views/view_sycl.hpp" namespace blas { @@ -67,7 +67,8 @@ namespace internal { * @return Vector of events to wait for. 
*/ template typename SB_Handle::event_t _rotmg( - SB_Handle &sb_handle, ${container_t0} _d1, ${container_t1} _d2, - ${container_t2} _x1, ${container_t3} _y1, ${container_t4} _param); + SB_Handle &sb_handle, ${container_t0} _d1, ${container_t1} _d2, + ${container_t2} _x1, ${container_t3} _y1, ${container_t4} _param, + const typename SB_Handle::event_t& dependencies); } // namespace internal } // namespace blas diff --git a/src/interface/blas1/scal.cpp.in b/src/interface/blas1/scal.cpp.in index fe1e80958..9221d9d5d 100644 --- a/src/interface/blas1/scal.cpp.in +++ b/src/interface/blas1/scal.cpp.in @@ -23,11 +23,11 @@ * **************************************************************************/ #include "container/sycl_iterator.hpp" -#include "sb_handle/sycl_blas_handle.hpp" -#include "sb_handle/kernel_constructor.hpp" #include "interface/blas1_interface.hpp" #include "operations/blas1_trees.hpp" #include "operations/blas_constants.hpp" +#include "sb_handle/kernel_constructor.hpp" +#include "sb_handle/sycl_blas_handle.hpp" #include "views/view_sycl.hpp" namespace blas { @@ -39,8 +39,11 @@ namespace internal { * @param _vx VectorView * @param _incx Increment in X axis */ -template typename SB_Handle::event_t _scal( - SB_Handle &sb_handle, ${INDEX_TYPE} _N, ${DATA_TYPE} _alpha, - ${container_t0} _vx, ${INCREMENT_TYPE} _incx); +template typename SB_Handle::event_t _scal(SB_Handle &sb_handle, + ${INDEX_TYPE} _N, + ${DATA_TYPE} _alpha, + ${container_t0} _vx, + ${INCREMENT_TYPE} _incx, + const typename SB_Handle::event_t& dependencies); } // namespace internal } // namespace blas diff --git a/src/interface/blas1/sdsdot.cpp.in b/src/interface/blas1/sdsdot.cpp.in index deadbb74f..760f117c9 100644 --- a/src/interface/blas1/sdsdot.cpp.in +++ b/src/interface/blas1/sdsdot.cpp.in @@ -23,11 +23,11 @@ * **************************************************************************/ #include "container/sycl_iterator.hpp" -#include "sb_handle/sycl_blas_handle.hpp" -#include "sb_handle/kernel_constructor.hpp" #include "interface/blas1_interface.hpp" #include "operations/blas1_trees.hpp" #include "operations/blas_constants.hpp" +#include "sb_handle/kernel_constructor.hpp" +#include "sb_handle/sycl_blas_handle.hpp" #include "views/view_sycl.hpp" namespace blas { @@ -56,6 +56,6 @@ namespace internal { template typename SB_Handle::event_t _sdsdot( SB_Handle &sb_handle, ${INDEX_TYPE} _N, float sb, ${container_t0} _vx, ${INCREMENT_TYPE} _incx, ${container_t1} _vy, ${INCREMENT_TYPE} _incy, - ${container_t2} _rs); + ${container_t2} _rs, const typename SB_Handle::event_t& dependencies); } // namespace internal } // namespace blas diff --git a/src/interface/blas1/sdsdot_return.cpp.in b/src/interface/blas1/sdsdot_return.cpp.in index bdfd60feb..5127c7744 100644 --- a/src/interface/blas1/sdsdot_return.cpp.in +++ b/src/interface/blas1/sdsdot_return.cpp.in @@ -23,11 +23,11 @@ * **************************************************************************/ #include "container/sycl_iterator.hpp" -#include "sb_handle/sycl_blas_handle.hpp" -#include "sb_handle/kernel_constructor.hpp" #include "interface/blas1_interface.hpp" #include "operations/blas1_trees.hpp" #include "operations/blas_constants.hpp" +#include "sb_handle/kernel_constructor.hpp" +#include "sb_handle/sycl_blas_handle.hpp" #include "views/view_sycl.hpp" namespace blas { @@ -55,6 +55,7 @@ namespace internal { */ template typename ValueType<${container_t0}>::type _sdsdot( SB_Handle &sb_handle, ${INDEX_TYPE} _N, float sb, ${container_t0} _vx, - ${INCREMENT_TYPE} _incx, 
${container_t1} _vy, ${INCREMENT_TYPE} _incy); + ${INCREMENT_TYPE} _incx, ${container_t1} _vy, ${INCREMENT_TYPE} _incy, + const typename SB_Handle::event_t& dependencies); } // namespace internal } // namespace blas diff --git a/src/interface/blas1/swap.cpp.in b/src/interface/blas1/swap.cpp.in index fff79479a..61aa5dd2d 100644 --- a/src/interface/blas1/swap.cpp.in +++ b/src/interface/blas1/swap.cpp.in @@ -23,11 +23,11 @@ * **************************************************************************/ #include "container/sycl_iterator.hpp" -#include "sb_handle/sycl_blas_handle.hpp" -#include "sb_handle/kernel_constructor.hpp" #include "interface/blas1_interface.hpp" #include "operations/blas1_trees.hpp" #include "operations/blas_constants.hpp" +#include "sb_handle/kernel_constructor.hpp" +#include "sb_handle/sycl_blas_handle.hpp" #include "views/view_sycl.hpp" namespace blas { @@ -43,7 +43,8 @@ namespace internal { */ template typename SB_Handle::event_t _swap( SB_Handle &sb_handle, ${INDEX_TYPE} _N, ${container_t0} _vx, - ${INCREMENT_TYPE} _incx, ${container_t1} _vy, ${INCREMENT_TYPE} _incy); + ${INCREMENT_TYPE} _incx, ${container_t1} _vy, ${INCREMENT_TYPE} _incy, + const typename SB_Handle::event_t& dependencies); } // namespace internal } // namespace blas diff --git a/src/interface/blas1_interface.hpp b/src/interface/blas1_interface.hpp index 309969e06..620316272 100644 --- a/src/interface/blas1_interface.hpp +++ b/src/interface/blas1_interface.hpp @@ -38,6 +38,7 @@ #include "operations/blas_constants.h" #include "operations/blas_operators.hpp" #include "sb_handle/sycl_blas_handle.h" +#include "views/view.h" namespace blas { namespace internal { @@ -47,24 +48,25 @@ namespace internal { * Implements AXPY \f$y = ax + y\f$ * * @param sb_handle_t sb_handle - * @param _vx BufferIterator + * @param _vx BufferIterator or USM pointer * @param _incx Increment in X axis - * @param _vy BufferIterator + * @param _vy BufferIterator or USM pointer * @param _incy Increment in Y axis + * @param _dependencies Vector of events */ template -typename sb_handle_t::event_t _axpy(sb_handle_t &sb_handle, index_t _N, - element_t _alpha, container_0_t _vx, - increment_t _incx, container_1_t _vy, - increment_t _incy) { +typename sb_handle_t::event_t _axpy( + sb_handle_t &sb_handle, index_t _N, element_t _alpha, container_0_t _vx, + increment_t _incx, container_1_t _vy, increment_t _incy, + const typename sb_handle_t::event_t &_dependencies) { auto vx = make_vector_view(_vx, _incx, _N); auto vy = make_vector_view(_vy, _incy, _N); auto scalOp = make_op(_alpha, vx); auto addOp = make_op(vy, scalOp); auto assignOp = make_op(vy, addOp); - auto ret = sb_handle.execute(assignOp); + auto ret = sb_handle.execute(assignOp, _dependencies); return ret; } @@ -72,20 +74,22 @@ typename sb_handle_t::event_t _axpy(sb_handle_t &sb_handle, index_t _N, * \brief COPY copies a vector, x, to a vector, y. 
* * @param sb_handle_t sb_handle - * @param _vx BufferIterator + * @param _vx BufferIterator or USM pointer * @param _incx Increment in X axis - * @param _vy BufferIterator + * @param _vy BufferIterator or USM pointer * @param _incy Increment in Y axis + * @param _dependencies Vector of events */ template -typename sb_handle_t::event_t _copy(sb_handle_t &sb_handle, index_t _N, - container_0_t _vx, increment_t _incx, - container_1_t _vy, increment_t _incy) { +typename sb_handle_t::event_t _copy( + sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx, + container_1_t _vy, increment_t _incy, + const typename sb_handle_t::event_t &_dependencies) { auto vx = make_vector_view(_vx, _incx, _N); auto vy = make_vector_view(_vy, _incy, _N); auto assignOp2 = make_op(vy, vx); - auto ret = sb_handle.execute(assignOp2); + auto ret = sb_handle.execute(assignOp2, _dependencies); return ret; } @@ -93,30 +97,32 @@ typename sb_handle_t::event_t _copy(sb_handle_t &sb_handle, index_t _N, * \brief Computes the inner product of two vectors with double precision * accumulation (Asynchronous version that returns an event) * @tparam sb_handle_t SB_Handle type - * @tparam container_0_t Buffer Iterator - * @tparam container_1_t Buffer Iterator - * @tparam container_2_t Buffer Iterator + * @tparam container_0_t Buffer Iterator or USM pointer + * @tparam container_1_t Buffer Iterator or USM pointer + * @tparam container_2_t Buffer Iterator or USM pointer * @tparam index_t Index type * @tparam increment_t Increment type * @param sb_handle SB_Handle * @param _N Input buffer sizes. - * @param _vx Buffer holding input vector x + * @param _vx Memory object holding input vector x * @param _incx Stride of vector x (i.e. measured in elements of _vx) - * @param _vy Buffer holding input vector y + * @param _vy Memory object holding input vector y * @param _incy Stride of vector y (i.e. measured in elements of _vy) - * @param _rs Output buffer + * @param _rs Output memory object + * @param _dependencies Vector of events * @return Vector of events to wait for. 
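Because each operation now both returns an event vector and accepts one, the output of one call can be fed directly into the `_dependencies` argument of the next. A hedged sketch of that chaining follows (same assumed header and hypothetical names as the earlier sketches; `d_x`, `d_y` are device USM vectors of length `n` and `d_r` a device USM scalar):

```c++
// Sketch: copy x into y, then run dot(x, y) ordered after the copy.
float copy_then_dot(blas::SB_Handle &sb_handle, float *d_x, float *d_y,
                    float *d_r, int n) {
  auto q = sb_handle.get_queue();
  auto copy_ev = blas::_copy(sb_handle, n, d_x, 1, d_y, 1);
  // The event vector returned by _copy is passed as the dot's dependencies
  auto dot_ev = blas::_dot(sb_handle, n, d_x, 1, d_y, 1, d_r, copy_ev);
  sb_handle.wait(dot_ev);

  float result = 0.0f;
  blas::helper::copy_to_host(q, d_r, &result, 1).wait();
  return result;
}
```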
*/ template -typename sb_handle_t::event_t _dot(sb_handle_t &sb_handle, index_t _N, - container_0_t _vx, increment_t _incx, - container_1_t _vy, increment_t _incy, - container_2_t _rs) { - auto vx = make_vector_view(_vx, _incx, _N); - auto vy = make_vector_view(_vy, _incy, _N); - auto rs = make_vector_view(_rs, static_cast(1), - static_cast(1)); +typename sb_handle_t::event_t _dot( + sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx, + container_1_t _vy, increment_t _incy, container_2_t _rs, + const typename sb_handle_t::event_t &_dependencies) { + using element_t = typename blas::ValueType::type; + auto vx = make_vector_view(_vx, _incx, _N); + auto vy = make_vector_view(_vy, _incy, _N); + auto rs = make_vector_view(_rs, static_cast(1), + static_cast(1)); auto prdOp = make_op(vx, vy); auto localSize = sb_handle.get_work_group_size(); @@ -124,7 +130,7 @@ typename sb_handle_t::event_t _dot(sb_handle_t &sb_handle, index_t _N, auto assignOp = make_assign_reduction(rs, prdOp, localSize, localSize * nWG); - auto ret = sb_handle.execute(assignOp); + auto ret = sb_handle.execute(assignOp, _dependencies); return ret; } @@ -133,50 +139,55 @@ typename sb_handle_t::event_t _dot(sb_handle_t &sb_handle, index_t _N, * accumulation and adds a scalar to the result (Asynchronous version that * returns an event) * @tparam sb_handle_t SB_Handle type - * @tparam container_0_t Buffer Iterator - * @tparam container_1_t Buffer Iterator - * @tparam container_2_t Buffer Iterator + * @tparam container_0_t Buffer Iterator or USM pointer + * @tparam container_1_t Buffer Iterator or USM pointer + * @tparam container_2_t Buffer Iterator or USM pointer * @tparam index_t Index type * @tparam increment_t Increment type * @param sb_handle SB_Handle * @param _N Input buffer sizes. If size 0, the result will be sb. * @param sb Scalar to add to the results of the inner product. - * @param _vx Buffer holding input vector x + * @param _vx Memory object holding input vector x * @param _incx Stride of vector x (i.e. measured in elements of _vx) - * @param _vy Buffer holding input vector y + * @param _vy Memory object holding input vector y * @param _incy Stride of vector y (i.e. measured in elements of _vy) - * @param _rs Output buffer + * @param _rs Output memory object + * @param _dependencies Vector of events * @return Vector of events to wait for. 
*/ template -typename sb_handle_t::event_t _sdsdot(sb_handle_t &sb_handle, index_t _N, - float sb, container_0_t _vx, - increment_t _incx, container_1_t _vy, - increment_t _incy, container_2_t _rs) { +typename sb_handle_t::event_t _sdsdot( + sb_handle_t &sb_handle, index_t _N, float sb, container_0_t _vx, + increment_t _incx, container_1_t _vy, increment_t _incy, container_2_t _rs, + const typename sb_handle_t::event_t &_dependencies) { typename sb_handle_t::event_t dot_event{}; - auto rs = make_vector_view(_rs, static_cast(1), - static_cast(1)); + using element_t = typename blas::ValueType::type; + auto rs = make_vector_view(_rs, static_cast(1), + static_cast(1)); - dot_event = internal::_dot(sb_handle, _N, _vx, _incx, _vy, _incy, _rs); + dot_event = + internal::_dot(sb_handle, _N, _vx, _incx, _vy, _incy, _rs, _dependencies); auto addOp = make_op(sb, rs); auto assignOp2 = make_op(rs, addOp); - auto ret2 = sb_handle.execute(assignOp2); + auto ret2 = sb_handle.execute(assignOp2, dot_event); return blas::concatenate_vectors(dot_event, ret2); } /** * \brief ASUM Takes the sum of the absolute values * @param sb_handle_t sb_handle - * @param _vx BufferIterator + * @param _vx BufferIterator or USM pointer * @param _incx Increment in X axis + * @param _rs BufferIterator or USM pointer + * @param _dependencies Vector of events */ template -typename sb_handle_t::event_t _asum(sb_handle_t &sb_handle, index_t _N, - container_0_t _vx, increment_t _incx, - container_1_t _rs) { +typename sb_handle_t::event_t _asum( + sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx, + container_1_t _rs, const typename sb_handle_t::event_t &_dependencies) { auto vx = make_vector_view(_vx, _incx, _N); auto rs = make_vector_view(_rs, static_cast(1), static_cast(1)); @@ -185,20 +196,22 @@ typename sb_handle_t::event_t _asum(sb_handle_t &sb_handle, index_t _N, const auto nWG = 2 * localSize; auto assignOp = make_assign_reduction(rs, vx, localSize, localSize * nWG); - auto ret = sb_handle.execute(assignOp); + auto ret = sb_handle.execute(assignOp, _dependencies); return ret; } /** * \brief IAMAX finds the index of the first element having maximum - * @param _vx BufferIterator + * @param _vx BufferIterator or USM pointer * @param _incx Increment in X axis + * @param _rs BufferIterator or USM pointer + * @param _dependencies Vector of events */ template -typename sb_handle_t::event_t _iamax(sb_handle_t &sb_handle, index_t _N, - container_t _vx, increment_t _incx, - ContainerI _rs) { +typename sb_handle_t::event_t _iamax( + sb_handle_t &sb_handle, index_t _N, container_t _vx, increment_t _incx, + ContainerI _rs, const typename sb_handle_t::event_t &_dependencies) { auto vx = make_vector_view(_vx, _incx, _N); auto rs = make_vector_view(_rs, static_cast(1), static_cast(1)); @@ -207,20 +220,22 @@ typename sb_handle_t::event_t _iamax(sb_handle_t &sb_handle, index_t _N, auto tupOp = make_tuple_op(vx); auto assignOp = make_assign_reduction(rs, tupOp, localSize, localSize * nWG); - auto ret = sb_handle.execute(assignOp); + auto ret = sb_handle.execute(assignOp, _dependencies); return ret; } /** * \brief IAMIN finds the index of the first element having minimum - * @param _vx BufferIterator + * @param _vx BufferIterator or USM pointer * @param _incx Increment in X axis + * @param _rs BufferIterator or USM pointer + * @param _dependencies Vector of events */ template -typename sb_handle_t::event_t _iamin(sb_handle_t &sb_handle, index_t _N, - container_t _vx, increment_t _incx, - ContainerI _rs) { +typename 
sb_handle_t::event_t _iamin( + sb_handle_t &sb_handle, index_t _N, container_t _vx, increment_t _incx, + ContainerI _rs, const typename sb_handle_t::event_t &_dependencies) { auto vx = make_vector_view(_vx, _incx, _N); auto rs = make_vector_view(_rs, static_cast(1), static_cast(1)); @@ -230,7 +245,7 @@ typename sb_handle_t::event_t _iamin(sb_handle_t &sb_handle, index_t _N, auto tupOp = make_tuple_op(vx); auto assignOp = make_assign_reduction(rs, tupOp, localSize, localSize * nWG); - auto ret = sb_handle.execute(assignOp); + auto ret = sb_handle.execute(assignOp, _dependencies); return ret; } @@ -238,20 +253,22 @@ typename sb_handle_t::event_t _iamin(sb_handle_t &sb_handle, index_t _N, * \brief SWAP interchanges two vectors * * @param sb_handle_t sb_handle - * @param _vx BufferIterator + * @param _vx BufferIterator or USM pointer * @param _incx Increment in X axis - * @param _vy BufferIterator + * @param _vy BufferIterator or USM pointer * @param _incy Increment in Y axis + * @param _dependencies Vector of events */ template -typename sb_handle_t::event_t _swap(sb_handle_t &sb_handle, index_t _N, - container_0_t _vx, increment_t _incx, - container_1_t _vy, increment_t _incy) { +typename sb_handle_t::event_t _swap( + sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx, + container_1_t _vy, increment_t _incy, + const typename sb_handle_t::event_t &_dependencies) { auto vx = make_vector_view(_vx, _incx, _N); auto vy = make_vector_view(_vy, _incy, _N); auto swapOp = make_op(vy, vx, vx, vy); - auto ret = sb_handle.execute(swapOp); + auto ret = sb_handle.execute(swapOp, _dependencies); return ret; } @@ -259,24 +276,25 @@ typename sb_handle_t::event_t _swap(sb_handle_t &sb_handle, index_t _N, /** * \brief SCALAR operation on a vector * @param sb_handle_t sb_handle - * @param _vx BufferIterator + * @param _vx BufferIterator or USM pointer * @param _incx Increment in X axis + * @param _dependencies Vector of events */ template -typename sb_handle_t::event_t _scal(sb_handle_t &sb_handle, index_t _N, - element_t _alpha, container_0_t _vx, - increment_t _incx) { +typename sb_handle_t::event_t _scal( + sb_handle_t &sb_handle, index_t _N, element_t _alpha, container_0_t _vx, + increment_t _incx, const typename sb_handle_t::event_t &_dependencies) { auto vx = make_vector_view(_vx, _incx, _N); if (_alpha == element_t{0}) { auto zeroOp = make_op(vx); auto assignOp = make_op(vx, zeroOp); - auto ret = sb_handle.execute(assignOp); + auto ret = sb_handle.execute(assignOp, _dependencies); return ret; } else { auto scalOp = make_op(_alpha, vx); auto assignOp = make_op(vx, scalOp); - auto ret = sb_handle.execute(assignOp); + auto ret = sb_handle.execute(assignOp, _dependencies); return ret; } } @@ -284,14 +302,16 @@ typename sb_handle_t::event_t _scal(sb_handle_t &sb_handle, index_t _N, /** * \brief NRM2 Returns the euclidian norm of a vector * @param sb_handle_t sb_handle - * @param _vx BufferIterator + * @param _vx BufferIterator or USM pointer * @param _incx Increment in X axis + * @param _rs BufferIterator or USM pointer + * @param _dependencies Vector of events */ template -typename sb_handle_t::event_t _nrm2(sb_handle_t &sb_handle, index_t _N, - container_0_t _vx, increment_t _incx, - container_1_t _rs) { +typename sb_handle_t::event_t _nrm2( + sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx, + container_1_t _rs, const typename sb_handle_t::event_t &_dependencies) { auto vx = make_vector_view(_vx, _incx, _N); auto rs = make_vector_view(_rs, static_cast(1), 
static_cast(1)); @@ -304,7 +324,7 @@ typename sb_handle_t::event_t _nrm2(sb_handle_t &sb_handle, index_t _N, auto ret0 = sb_handle.execute(assignOp); auto sqrtOp = make_op(rs); auto assignOpFinal = make_op(rs, sqrtOp); - auto ret1 = sb_handle.execute(assignOpFinal); + auto ret1 = sb_handle.execute(assignOpFinal, _dependencies); return blas::concatenate_vectors(ret0, ret1); } @@ -313,21 +333,21 @@ typename sb_handle_t::event_t _nrm2(sb_handle_t &sb_handle, index_t _N, * @brief _rot constructor given plane rotation * * * @param sb_handle_t sb_handle - * @param _vx BufferIterator + * @param _vx BufferIterator or USM pointer * @param _incx Increment in X axis - * @param _vx BufferIterator + * @param _vx BufferIterator or USM pointer * @param _incy Increment in Y axis * @param _sin sine * @param _cos cosine * @param _N data size - * + * @param _dependencies Vector of events */ template -typename sb_handle_t::event_t _rot(sb_handle_t &sb_handle, index_t _N, - container_0_t _vx, increment_t _incx, - container_1_t _vy, increment_t _incy, - element_t _cos, element_t _sin) { +typename sb_handle_t::event_t _rot( + sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx, + container_1_t _vy, increment_t _incy, element_t _cos, element_t _sin, + const typename sb_handle_t::event_t &_dependencies) { auto vx = make_vector_view(_vx, _incx, _N); auto vy = make_vector_view(_vy, _incy, _N); auto scalOp1 = make_op(_cos, vx); @@ -337,7 +357,7 @@ typename sb_handle_t::event_t _rot(sb_handle_t &sb_handle, index_t _N, auto addOp12 = make_op(scalOp1, scalOp2); auto addOp34 = make_op(scalOp3, scalOp4); auto DoubleAssignView = make_op(vx, vy, addOp12, addOp34); - auto ret = sb_handle.execute(DoubleAssignView); + auto ret = sb_handle.execute(DoubleAssignView, _dependencies); return ret; } @@ -358,27 +378,28 @@ typename sb_handle_t::event_t _rot(sb_handle_t &sb_handle, index_t _N, * [h21 h22] [h21 1.0] [-1.0 h22] [0.0 1.0] * * @tparam sb_handle_t SB_Handle type - * @tparam container_0_t Buffer Iterator - * @tparam container_1_t Buffer Iterator - * @tparam container_2_t Buffer Iterator + * @tparam container_0_t Buffer Iterator or USM pointer + * @tparam container_1_t Buffer Iterator or USM pointer + * @tparam container_2_t Buffer Iterator or USM pointer * @tparam index_t Index type * @tparam increment_t Increment type * @param sb_handle SB_Handle * @param _N Input buffer sizes (for vx and vy). - * @param[in, out] _vx Buffer holding input vector x + * @param[in, out] _vx Memory object holding input vector x * @param _incx Stride of vector x (i.e. measured in elements of _vx) - * @param[in, out] _vy Buffer holding input vector y + * @param[in, out] _vy Memory object holding input vector y * @param _incy Stride of vector y (i.e. measured in elements of _vy) * @param[in] _param Buffer with the following layout: [flag, h11, h21, h12, * h22]. + * @param _dependencies Vector of events * @return Vector of events to wait for. 
*/ template -typename sb_handle_t::event_t _rotm(sb_handle_t &sb_handle, index_t _N, - container_0_t _vx, increment_t _incx, - container_1_t _vy, increment_t _incy, - container_2_t _param) { +typename sb_handle_t::event_t _rotm( + sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx, + container_1_t _vy, increment_t _incy, container_2_t _param, + const typename sb_handle_t::event_t &_dependencies) { using element_t = typename ValueType::type; auto vx = make_vector_view(_vx, _incx, _N); @@ -425,7 +446,7 @@ typename sb_handle_t::event_t _rotm(sb_handle_t &sb_handle, index_t _N, auto vxResult = make_op(h11TimesVx, h12TimesVy); auto vyResult = make_op(h21TimesVx, h22TimesVy); auto DoubleAssignView = make_op(vx, vy, vxResult, vyResult); - auto ret = sb_handle.execute(DoubleAssignView); + auto ret = sb_handle.execute(DoubleAssignView, _dependencies); return ret; } @@ -446,11 +467,11 @@ typename sb_handle_t::event_t _rotm(sb_handle_t &sb_handle, index_t _N, * Rotmg may apply scaling operations to d1, d2 and x1 to avoid overflows. * * @tparam sb_handle_t SB_Handle type - * @tparam container_0_t Buffer Iterator - * @tparam container_1_t Buffer Iterator - * @tparam container_2_t Buffer Iterator - * @tparam container_3_t Buffer Iterator - * @tparam container_4_t Buffer Iterator + * @tparam container_0_t Buffer Iterator or USM pointer + * @tparam container_1_t Buffer Iterator or USM pointer + * @tparam container_2_t Buffer Iterator or USM pointer + * @tparam container_3_t Buffer Iterator or USM pointer + * @tparam container_4_t Buffer Iterator or USM pointer * @param sb_handle SB_Handle * @param _d1[in,out] On entry, buffer holding the scaling factor for the * x-coordinate. On exit, the re-scaled _d1. @@ -458,17 +479,19 @@ typename sb_handle_t::event_t _rotm(sb_handle_t &sb_handle, index_t _N, * y-coordinate. On exit, the re-scaled _d2. * @param _x1[in,out] On entry, buffer holding the x-coordinate. On exit, the * re-scaled _x1 - * @param _y1[in] Buffer holding the y-coordinate of the point. + * @param _y1[in] Memory object holding the y-coordinate of the point. * @param _param[out] Buffer with the following layout: [flag, h11, h21, h12, * h22]. + * @param _dependencies Vector of events * @return Vector of events to wait for. */ template -typename sb_handle_t::event_t _rotmg(sb_handle_t &sb_handle, container_0_t _d1, - container_1_t _d2, container_2_t _x1, - container_3_t _y1, container_4_t _param) { +typename sb_handle_t::event_t _rotmg( + sb_handle_t &sb_handle, container_0_t _d1, container_1_t _d2, + container_2_t _x1, container_3_t _y1, container_4_t _param, + const typename sb_handle_t::event_t &_dependencies) { constexpr int inc = 1; constexpr int vector_size = 1; constexpr int param_size = 5; @@ -481,7 +504,7 @@ typename sb_handle_t::event_t _rotmg(sb_handle_t &sb_handle, container_0_t _d1, auto operation = Rotmg(d1_view, d2_view, x1_view, y1_view, param_view); - auto ret = sb_handle.execute(operation); + auto ret = sb_handle.execute(operation, _dependencies); return ret; } @@ -490,33 +513,34 @@ typename sb_handle_t::event_t _rotmg(sb_handle_t &sb_handle, container_0_t _d1, * \brief Given the Cartesian coordinates (a, b) of a point, the rotg routines * return the parameters c, s, r, and z associated with the Givens rotation. 
* @tparam sb_handle_t SB_Handle type - * @tparam container_0_t Buffer Iterator - * @tparam container_1_t Buffer Iterator - * @tparam container_2_t Buffer Iterator - * @tparam container_3_t Buffer Iterator + * @tparam container_0_t Buffer Iterator or USM pointer + * @tparam container_1_t Buffer Iterator or USM pointer + * @tparam container_2_t Buffer Iterator or USM pointer + * @tparam container_3_t Buffer Iterator or USM pointer * @param sb_handle SB_Handle * @param a[in, out] On entry, buffer holding the x-coordinate of the point. On * exit, the scalar z. * @param b[in, out] On entry, buffer holding the y-coordinate of the point. On * exit, the scalar r. - * @param c[out] Buffer holding the parameter c. - * @param s[out] Buffer holding the parameter s. + * @param c[out] Memory object holding the parameter c. + * @param s[out] Memory object holding the parameter s. + * @param _dependencies Vector of events * @return Vector of events to wait for. */ template < typename sb_handle_t, typename container_0_t, typename container_1_t, typename container_2_t, typename container_3_t, typename std::enable_if::value, bool>::type> -typename sb_handle_t::event_t _rotg(sb_handle_t &sb_handle, container_0_t a, - container_1_t b, container_2_t c, - container_3_t s) { +typename sb_handle_t::event_t _rotg( + sb_handle_t &sb_handle, container_0_t a, container_1_t b, container_2_t c, + container_3_t s, const typename sb_handle_t::event_t &_dependencies) { auto a_view = make_vector_view(a, 1, 1); auto b_view = make_vector_view(b, 1, 1); auto c_view = make_vector_view(c, 1, 1); auto s_view = make_vector_view(s, 1, 1); auto operation = Rotg(a_view, b_view, c_view, s_view); - auto ret = sb_handle.execute(operation); + auto ret = sb_handle.execute(operation, _dependencies); return ret; } @@ -532,22 +556,39 @@ typename sb_handle_t::event_t _rotg(sb_handle_t &sb_handle, container_0_t a, * @param b[in, out] On entry, y-coordinate of the point. On exit, the scalar r. * @param c[out] Scalar representing the output c. * @param s[out] Scalar representing the output s. 
+ * @param _dependencies Vector of events */ template ::value, bool>::type> void _rotg(sb_handle_t &sb_handle, scalar_t &a, scalar_t &b, scalar_t &c, - scalar_t &s) { - auto device_a = make_sycl_iterator_buffer(1); - auto device_b = make_sycl_iterator_buffer(1); - auto device_c = make_sycl_iterator_buffer(1); - auto device_s = make_sycl_iterator_buffer(1); - blas::helper::copy_to_device(sb_handle.get_queue(), &a, device_a, 1); - blas::helper::copy_to_device(sb_handle.get_queue(), &b, device_b, 1); - blas::helper::copy_to_device(sb_handle.get_queue(), &c, device_c, 1); - blas::helper::copy_to_device(sb_handle.get_queue(), &s, device_s, 1); - - auto event = - blas::internal::_rotg(sb_handle, device_a, device_b, device_c, device_s); + scalar_t &s, const typename sb_handle_t::event_t &_dependencies) { + auto device_a = + blas::helper::allocate( + 1, sb_handle.get_queue()); + auto device_b = + blas::helper::allocate( + 1, sb_handle.get_queue()); + auto device_c = + blas::helper::allocate( + 1, sb_handle.get_queue()); + auto device_s = + blas::helper::allocate( + 1, sb_handle.get_queue()); + auto copy_a = + blas::helper::copy_to_device(sb_handle.get_queue(), &a, device_a, 1); + auto copy_b = + blas::helper::copy_to_device(sb_handle.get_queue(), &b, device_b, 1); + auto copy_c = + blas::helper::copy_to_device(sb_handle.get_queue(), &c, device_c, 1); + auto copy_s = + blas::helper::copy_to_device(sb_handle.get_queue(), &s, device_s, 1); + + typename sb_handle_t::event_t ret = concatenate_vectors( + _dependencies, + typename sb_handle_t::event_t{copy_a, copy_b, copy_c, copy_s}); + + auto event = blas::internal::_rotg(sb_handle, device_a, device_b, device_c, + device_s, ret); auto event1 = blas::helper::copy_to_host(sb_handle.get_queue(), device_c, &c, 1); @@ -558,44 +599,50 @@ void _rotg(sb_handle_t &sb_handle, scalar_t &a, scalar_t &b, scalar_t &c, auto event4 = blas::helper::copy_to_host(sb_handle.get_queue(), device_b, &b, 1); - sb_handle.wait(event1); - sb_handle.wait(event2); - sb_handle.wait(event3); - sb_handle.wait(event4); + sb_handle.wait({event1, event2, event3, event4}); } /** * \brief Computes the inner product of two vectors with double precision * accumulation (synchronous version that returns the result directly) * @tparam sb_handle_t SB_Handle type - * @tparam container_0_t Buffer Iterator - * @tparam container_1_t Buffer Iterator - * @tparam container_2_t Buffer Iterator + * @tparam container_0_t Buffer Iterator or USM pointer + * @tparam container_1_t Buffer Iterator or USM pointer + * @tparam container_2_t Buffer Iterator or USM pointer * @tparam index_t Index type * @tparam increment_t Increment type * @param sb_handle SB_Handle * @param _N Input buffer sizes. - * @param _vx Buffer holding input vector x + * @param _vx Memory object holding input vector x * @param _incx Stride of vector x (i.e. measured in elements of _vx) - * @param _vy Buffer holding input vector y + * @param _vy Memory object holding input vector y * @param _incy Stride of vector y (i.e. measured in elements of _vy) - * @param _rs Output buffer + * @param _rs Output memory object + * @param _dependencies Vector of events * @return Vector of events to wait for. 
*/ template -typename ValueType::type _dot(sb_handle_t &sb_handle, index_t _N, - container_0_t _vx, - increment_t _incx, - container_1_t _vy, - increment_t _incy) { +typename ValueType::type _dot( + sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx, + container_1_t _vy, increment_t _incy, + const typename sb_handle_t::event_t &_dependencies) { + constexpr bool is_usm = std::is_pointer::value; using element_t = typename ValueType::type; auto res = std::vector(1); - auto gpu_res = make_sycl_iterator_buffer(static_cast(1)); - blas::internal::_dot(sb_handle, _N, _vx, _incx, _vy, _incy, gpu_res); + auto gpu_res = helper::allocate < is_usm ? helper::AllocType::usm + : helper::AllocType::buffer, + element_t > (static_cast(1), sb_handle.get_queue()); + auto dot_event = internal::_dot(sb_handle, _N, _vx, _incx, _vy, _incy, + gpu_res, _dependencies); + sb_handle.wait(dot_event); auto event = - blas::helper::copy_to_host(sb_handle.get_queue(), gpu_res, res.data(), 1); + helper::copy_to_host(sb_handle.get_queue(), gpu_res, res.data(), 1); sb_handle.wait(event); + + helper::deallocate(gpu_res, + sb_handle.get_queue()); return res[0]; } @@ -604,77 +651,104 @@ typename ValueType::type _dot(sb_handle_t &sb_handle, index_t _N, * accumulation and adds a scalar to the result (synchronous version that * returns the result directly) * @tparam sb_handle_t SB_Handle type - * @tparam container_0_t Buffer Iterator - * @tparam container_1_t Buffer Iterator - * @tparam container_2_t Buffer Iterator + * @tparam container_0_t Buffer Iterator or USM pointer + * @tparam container_1_t Buffer Iterator or USM pointer + * @tparam container_2_t Buffer Iterator or USM pointer * @tparam index_t Index type * @tparam increment_t Increment type * @param sb_handle SB_Handle * @param _N Input buffer sizes. If size 0, the result will be sb. * @param sb Scalar to add to the results of the inner product. - * @param _vx Buffer holding input vector x + * @param _vx Memory object holding input vector x * @param _incx Stride of vector x (i.e. measured in elements of _vx) - * @param _vy Buffer holding input vector y + * @param _vy Memory object holding input vector y * @param _incy Stride of vector y (i.e. measured in elements of _vy) - * @param _rs Output buffer + * @param _rs Output memory object + * @param _dependencies Vector of events * @return Vector of events to wait for. */ template typename ValueType::type _sdsdot( sb_handle_t &sb_handle, index_t _N, float sb, container_0_t _vx, - increment_t _incx, container_1_t _vy, increment_t _incy) { + increment_t _incx, container_1_t _vy, increment_t _incy, + const typename sb_handle_t::event_t &_dependencies) { + constexpr bool is_usm = std::is_pointer::value; using element_t = typename ValueType::type; element_t res{}; - auto gpu_res = make_sycl_iterator_buffer(static_cast(1)); + auto gpu_res = blas::helper::allocate < is_usm ? 
helper::AllocType::usm + : helper::AllocType::buffer, + element_t > (static_cast(1), sb_handle.get_queue()); auto event1 = blas::internal::_sdsdot(sb_handle, _N, sb, _vx, _incx, _vy, - _incy, gpu_res); + _incy, gpu_res, _dependencies); sb_handle.wait(event1); auto event2 = blas::helper::copy_to_host(sb_handle.get_queue(), gpu_res, &res, 1); sb_handle.wait(event2); + + blas::helper::deallocate( + gpu_res, sb_handle.get_queue()); return res; } /** * \brief ICAMAX finds the index of the first element having maximum - * @param _vx BufferIterator + * @param _vx BufferIterator or USM pointer * @param _incx Increment in X axis + * @param _dependencies Vector of events */ template index_t _iamax(sb_handle_t &sb_handle, index_t _N, container_t _vx, - increment_t _incx) { + increment_t _incx, + const typename sb_handle_t::event_t &_dependencies) { + constexpr bool is_usm = std::is_pointer::value; using element_t = typename ValueType::type; using IndValTuple = IndexValueTuple; std::vector rsT(1, IndValTuple(index_t(-1), element_t(-1))); - auto gpu_res = - make_sycl_iterator_buffer(static_cast(1)); - blas::internal::_iamax(sb_handle, _N, _vx, _incx, gpu_res); - auto event = - blas::helper::copy_to_host(sb_handle.get_queue(), gpu_res, rsT.data(), 1); + auto gpu_res = blas::helper::allocate < is_usm ? helper::AllocType::usm + : helper::AllocType::buffer, + IndValTuple > (static_cast(1), sb_handle.get_queue()); + auto iamax_event = + blas::internal::_iamax(sb_handle, _N, _vx, _incx, gpu_res, _dependencies); + sb_handle.wait(iamax_event); + auto event = blas::helper::copy_to_host(sb_handle.get_queue(), + gpu_res, rsT.data(), 1); sb_handle.wait(event); + blas::helper::deallocate( + gpu_res, sb_handle.get_queue()); return rsT[0].get_index(); } /** * \brief ICAMIN finds the index of the first element having minimum - * @param _vx BufferIterator + * @param _vx BufferIterator or USM pointer * @param _incx Increment in X axis + * @param _dependencies Vector of events */ template index_t _iamin(sb_handle_t &sb_handle, index_t _N, container_t _vx, - increment_t _incx) { + increment_t _incx, + const typename sb_handle_t::event_t &_dependencies) { + constexpr bool is_usm = std::is_pointer::value; using element_t = typename ValueType::type; using IndValTuple = IndexValueTuple; std::vector rsT(1, IndValTuple(index_t(-1), element_t(-1))); - auto gpu_res = - make_sycl_iterator_buffer(static_cast(1)); - blas::internal::_iamin(sb_handle, _N, _vx, _incx, gpu_res); + auto gpu_res = blas::helper::allocate < is_usm ? 
helper::AllocType::usm + : helper::AllocType::buffer, + IndValTuple > (static_cast(1), sb_handle.get_queue()); + auto iamin_event = + blas::internal::_iamin(sb_handle, _N, _vx, _incx, gpu_res, _dependencies); + sb_handle.wait(iamin_event); auto event = blas::helper::copy_to_host(sb_handle.get_queue(), gpu_res, rsT.data(), 1); sb_handle.wait(event); + blas::helper::deallocate( + gpu_res, sb_handle.get_queue()); return rsT[0].get_index(); } @@ -682,21 +756,28 @@ index_t _iamin(sb_handle_t &sb_handle, index_t _N, container_t _vx, * \brief ASUM Takes the sum of the absolute values * * @param sb_handle_t sb_handle - * @param _vx BufferIterator + * @param _vx BufferIterator or USM pointer * @param _incx Increment in X axis + * @param _dependencies Vector of events */ template -typename ValueType::type _asum(sb_handle_t &sb_handle, index_t _N, - container_t _vx, - increment_t _incx) { +typename ValueType::type _asum( + sb_handle_t &sb_handle, index_t _N, container_t _vx, increment_t _incx, + const typename sb_handle_t::event_t &_dependencies) { + constexpr bool is_usm = std::is_pointer::value; using element_t = typename ValueType::type; auto res = std::vector(1, element_t(0)); - auto gpu_res = make_sycl_iterator_buffer(static_cast(1)); - blas::internal::_asum(sb_handle, _N, _vx, _incx, gpu_res); + auto gpu_res = blas::helper::allocate < is_usm ? helper::AllocType::usm + : helper::AllocType::buffer, + element_t > (static_cast(1), sb_handle.get_queue()); + blas::internal::_asum(sb_handle, _N, _vx, _incx, gpu_res, _dependencies); auto event = blas::helper::copy_to_host(sb_handle.get_queue(), gpu_res, res.data(), 1); sb_handle.wait(event); + blas::helper::deallocate( + gpu_res, sb_handle.get_queue()); return res[0]; } @@ -704,21 +785,29 @@ typename ValueType::type _asum(sb_handle_t &sb_handle, index_t _N, * \brief NRM2 Returns the euclidian norm of a vector * * @param sb_handle_t sb_handle - * @param _vx BufferIterator + * @param _vx BufferIterator or USM pointer * @param _incx Increment in X axis + * @param _dependencies Vector of events */ template -typename ValueType::type _nrm2(sb_handle_t &sb_handle, index_t _N, - container_t _vx, - increment_t _incx) { +typename ValueType::type _nrm2( + sb_handle_t &sb_handle, index_t _N, container_t _vx, increment_t _incx, + const typename sb_handle_t::event_t &_dependencies) { + constexpr bool is_usm = std::is_pointer::value; using element_t = typename ValueType::type; auto res = std::vector(1, element_t(0)); - auto gpu_res = make_sycl_iterator_buffer(static_cast(1)); - blas::internal::_nrm2(sb_handle, _N, _vx, _incx, gpu_res); + auto gpu_res = blas::helper::allocate < is_usm ? 
helper::AllocType::usm + : helper::AllocType::buffer, + element_t > (static_cast(1), sb_handle.get_queue()); + auto nrm2_event = blas::internal::_nrm2(sb_handle, _N, _vx, _incx, gpu_res, _dependencies); + sb_handle.wait(nrm2_event); auto event = blas::helper::copy_to_host(sb_handle.get_queue(), gpu_res, res.data(), 1); sb_handle.wait(event); + blas::helper::deallocate( + gpu_res, sb_handle.get_queue()); return res[0]; } diff --git a/src/interface/blas2/CMakeLists.txt b/src/interface/blas2/CMakeLists.txt index 43a1a2fdb..59ac41249 100644 --- a/src/interface/blas2/CMakeLists.txt +++ b/src/interface/blas2/CMakeLists.txt @@ -41,6 +41,6 @@ generate_blas_binary_objects(blas2 tpsv) generate_blas_binary_objects(blas2 trmv) generate_blas_binary_objects(blas2 trsv) -if(BLAS_ENABLE_CONST_INPUT) - generate_blas_ternary_objects(blas2 gemv_const) -endif() + if(BLAS_ENABLE_CONST_INPUT) + generate_blas_ternary_objects(blas2 gemv_const) + endif() diff --git a/src/interface/blas2/backend/amd_gpu.hpp b/src/interface/blas2/backend/amd_gpu.hpp index 156d1a4eb..da1e26d05 100644 --- a/src/interface/blas2/backend/amd_gpu.hpp +++ b/src/interface/blas2/backend/amd_gpu.hpp @@ -32,20 +32,22 @@ namespace backend { template -typename SB_Handle::event_t _gemv(SB_Handle& sb_handle, index_t _M, index_t _N, - element_t _alpha, container_t0 _mA, - index_t _lda, container_t1 _vx, - increment_t _incx, element_t _beta, - container_t2 _vy, increment_t _incy) { +typename SB_Handle::event_t _gemv( + SB_Handle& sb_handle, index_t _M, index_t _N, element_t _alpha, + container_t0 _mA, index_t _lda, container_t1 _vx, increment_t _incx, + element_t _beta, container_t2 _vy, increment_t _incy, + const typename SB_Handle::event_t& _dependencies) { static constexpr uint32_t cache_line_size = 128; if (trn == transpose_type::Normal) { return blas::internal::_gemv_impl<256, cache_line_size, gemv_memory_t::local, trn>( - sb_handle, _M, _N, _alpha, _mA, _lda, _vx, _incx, _beta, _vy, _incy); + sb_handle, _M, _N, _alpha, _mA, _lda, _vx, _incx, _beta, _vy, _incy, + _dependencies); } else { - return blas::internal::_gemv_impl<128, cache_line_size, - gemv_memory_t::local, trn>( - sb_handle, _M, _N, _alpha, _mA, _lda, _vx, _incx, _beta, _vy, _incy); + return blas::internal::_gemv_impl<64, cache_line_size, gemv_memory_t::local, + trn>(sb_handle, _M, _N, _alpha, _mA, _lda, + _vx, _incx, _beta, _vy, _incy, + _dependencies); } } } // namespace backend @@ -56,15 +58,14 @@ namespace backend { template -typename SB_Handle::event_t inline _gbmv(SB_Handle& sb_handle, index_t _M, - index_t _N, index_t _KL, index_t _KU, - element_t _alpha, container_t0 _mA, - index_t _lda, container_t1 _vx, - increment_t _incx, element_t _beta, - container_t2 _vy, increment_t _incy) { +typename SB_Handle::event_t inline _gbmv( + SB_Handle& sb_handle, index_t _M, index_t _N, index_t _KL, index_t _KU, + element_t _alpha, container_t0 _mA, index_t _lda, container_t1 _vx, + increment_t _incx, element_t _beta, container_t2 _vy, increment_t _incy, + const typename SB_Handle::event_t& _dependencies) { return blas::internal::_gbmv_impl<64, trn>(sb_handle, _M, _N, _KL, _KU, _alpha, _mA, _lda, _vx, _incx, - _beta, _vy, _incy); + _beta, _vy, _incy, _dependencies); } } // namespace backend } // namespace gbmv @@ -74,14 +75,14 @@ namespace backend { template -typename SB_Handle::event_t inline _sbmv(SB_Handle& sb_handle, index_t _N, - index_t _K, element_t _alpha, - container_t0 _mA, index_t _lda, - container_t1 _vx, increment_t _incx, - element_t _beta, container_t2 _vy, - increment_t 
_incy) { - return blas::internal::_sbmv_impl<64, uplo>( - sb_handle, _N, _K, _alpha, _mA, _lda, _vx, _incx, _beta, _vy, _incy); +typename SB_Handle::event_t inline _sbmv( + SB_Handle& sb_handle, index_t _N, index_t _K, element_t _alpha, + container_t0 _mA, index_t _lda, container_t1 _vx, increment_t _incx, + element_t _beta, container_t2 _vy, increment_t _incy, + const typename SB_Handle::event_t& _dependencies) { + return blas::internal::_sbmv_impl<64, uplo>(sb_handle, _N, _K, _alpha, _mA, + _lda, _vx, _incx, _beta, _vy, + _incy, _dependencies); } } // namespace backend } // namespace sbmv @@ -91,13 +92,12 @@ namespace backend { template -typename SB_Handle::event_t inline _spmv(SB_Handle& sb_handle, index_t _N, - element_t _alpha, container_t0 _mA, - container_t1 _vx, increment_t _incx, - element_t _beta, container_t2 _vy, - increment_t _incy) { - return blas::internal::_spmv_impl<64, 8, uplo>(sb_handle, _N, _alpha, _mA, - _vx, _incx, _beta, _vy, _incy); +typename SB_Handle::event_t inline _spmv( + SB_Handle& sb_handle, index_t _N, element_t _alpha, container_t0 _mA, + container_t1 _vx, increment_t _incx, element_t _beta, container_t2 _vy, + increment_t _incy, const typename SB_Handle::event_t& _dependencies) { + return blas::internal::_spmv_impl<64, 8, uplo>( + sb_handle, _N, _alpha, _mA, _vx, _incx, _beta, _vy, _incy, _dependencies); } } // namespace backend } // namespace spmv @@ -107,11 +107,12 @@ namespace backend { template -typename sb_handle_t::event_t _tbmv(sb_handle_t& sb_handle, index_t _N, - index_t _K, container_t0 _mA, index_t _lda, - container_t1 _vx, increment_t _incx) { - return blas::internal::_tbmv_impl<64, uplo, trn, diag>(sb_handle, _N, _K, _mA, - _lda, _vx, _incx); +typename sb_handle_t::event_t _tbmv( + sb_handle_t& sb_handle, index_t _N, index_t _K, container_t0 _mA, + index_t _lda, container_t1 _vx, increment_t _incx, + typename sb_handle_t::event_t _dependencies) { + return blas::internal::_tbmv_impl<64, uplo, trn, diag>( + sb_handle, _N, _K, _mA, _lda, _vx, _incx, _dependencies); } } // namespace backend } // namespace tbmv @@ -121,11 +122,11 @@ namespace backend { template -typename sb_handle_t::event_t _tpmv(sb_handle_t& sb_handle, index_t _N, - container_t0 _mA, container_t1 _vx, - increment_t _incx) { - return blas::internal::_tpmv_impl<64, 8, uplo, trn, diag>(sb_handle, _N, _mA, - _vx, _incx); +typename sb_handle_t::event_t _tpmv( + sb_handle_t& sb_handle, index_t _N, container_t0 _mA, container_t1 _vx, + increment_t _incx, const typename sb_handle_t::event_t& _dependencies) { + return blas::internal::_tpmv_impl<64, 8, uplo, trn, diag>( + sb_handle, _N, _mA, _vx, _incx, _dependencies); } } // namespace backend } // namespace tpmv @@ -135,11 +136,12 @@ namespace backend { template -typename sb_handle_t::event_t _trsv(sb_handle_t& sb_handle, index_t _N, - container_t0 _mA, index_t _lda, - container_t1 _vx, increment_t _incx) { - return blas::internal::_trsv_impl<32, 4, uplo, trn, diag>(sb_handle, _N, _mA, - _lda, _vx, _incx); +typename sb_handle_t::event_t _trsv( + sb_handle_t& sb_handle, index_t _N, container_t0 _mA, index_t _lda, + container_t1 _vx, increment_t _incx, + const typename sb_handle_t::event_t& _dependencies) { + return blas::internal::_trsv_impl<32, 4, uplo, trn, diag>( + sb_handle, _N, _mA, _lda, _vx, _incx, _dependencies); } } // namespace backend } // namespace trsv @@ -149,11 +151,12 @@ namespace backend { template -typename sb_handle_t::event_t _tbsv(sb_handle_t& sb_handle, index_t _N, - index_t _K, container_t0 _mA, index_t _lda, - 
container_t1 _vx, increment_t _incx) { +typename sb_handle_t::event_t _tbsv( + sb_handle_t& sb_handle, index_t _N, index_t _K, container_t0 _mA, + index_t _lda, container_t1 _vx, increment_t _incx, + const typename sb_handle_t::event_t& _dependencies) { return blas::internal::_tbsv_impl<32, 4, uplo, trn, diag>( - sb_handle, _N, _K, _mA, _lda, _vx, _incx); + sb_handle, _N, _K, _mA, _lda, _vx, _incx, _dependencies); } } // namespace backend } // namespace tbsv @@ -165,9 +168,10 @@ template typename sb_handle_t::event_t _tpsv(sb_handle_t& sb_handle, index_t _N, container_t0 _mA, container_t1 _vx, - increment_t _incx) { + increment_t _incx, + const typename sb_handle_t::event_t& _dependencies) { return blas::internal::_tpsv_impl<32, 4, uplo, trn, diag>(sb_handle, _N, _mA, - _vx, _incx); + _vx, _incx, _dependencies); } } // namespace backend } // namespace tpsv diff --git a/src/interface/blas2/backend/default_cpu.hpp b/src/interface/blas2/backend/default_cpu.hpp index a6de9a435..4ffe41ca5 100644 --- a/src/interface/blas2/backend/default_cpu.hpp +++ b/src/interface/blas2/backend/default_cpu.hpp @@ -32,17 +32,19 @@ namespace backend { template -typename SB_Handle::event_t _gemv(SB_Handle& sb_handle, index_t _M, index_t _N, - element_t _alpha, container_t0 _mA, - index_t _lda, container_t1 _vx, - increment_t _incx, element_t _beta, - container_t2 _vy, increment_t _incy) { +typename SB_Handle::event_t _gemv( + SB_Handle& sb_handle, index_t _M, index_t _N, element_t _alpha, + container_t0 _mA, index_t _lda, container_t1 _vx, increment_t _incx, + element_t _beta, container_t2 _vy, increment_t _incy, + const typename SB_Handle::event_t& _dependencies) { if (trn == transpose_type::Normal) { return blas::internal::_gemv_impl<256, 32, gemv_memory_t::local, trn>( - sb_handle, _M, _N, _alpha, _mA, _lda, _vx, _incx, _beta, _vy, _incy); + sb_handle, _M, _N, _alpha, _mA, _lda, _vx, _incx, _beta, _vy, _incy, + _dependencies); } else { return blas::internal::_gemv_impl<128, 32, gemv_memory_t::local, trn>( - sb_handle, _M, _N, _alpha, _mA, _lda, _vx, _incx, _beta, _vy, _incy); + sb_handle, _M, _N, _alpha, _mA, _lda, _vx, _incx, _beta, _vy, _incy, + _dependencies); } } } // namespace backend @@ -53,15 +55,14 @@ namespace backend { template -typename SB_Handle::event_t inline _gbmv(SB_Handle& sb_handle, index_t _M, - index_t _N, index_t _KL, index_t _KU, - element_t _alpha, container_t0 _mA, - index_t _lda, container_t1 _vx, - increment_t _incx, element_t _beta, - container_t2 _vy, increment_t _incy) { +typename SB_Handle::event_t inline _gbmv( + SB_Handle& sb_handle, index_t _M, index_t _N, index_t _KL, index_t _KU, + element_t _alpha, container_t0 _mA, index_t _lda, container_t1 _vx, + increment_t _incx, element_t _beta, container_t2 _vy, increment_t _incy, + const typename SB_Handle::event_t& _dependencies) { return blas::internal::_gbmv_impl<256, trn>(sb_handle, _M, _N, _KL, _KU, _alpha, _mA, _lda, _vx, _incx, - _beta, _vy, _incy); + _beta, _vy, _incy, _dependencies); } } // namespace backend } // namespace gbmv @@ -71,14 +72,14 @@ namespace backend { template -typename SB_Handle::event_t inline _sbmv(SB_Handle& sb_handle, index_t _N, - index_t _K, element_t _alpha, - container_t0 _mA, index_t _lda, - container_t1 _vx, increment_t _incx, - element_t _beta, container_t2 _vy, - increment_t _incy) { - return blas::internal::_sbmv_impl<256, uplo>( - sb_handle, _N, _K, _alpha, _mA, _lda, _vx, _incx, _beta, _vy, _incy); +typename SB_Handle::event_t inline _sbmv( + SB_Handle& sb_handle, index_t _N, index_t _K, 
element_t _alpha, + container_t0 _mA, index_t _lda, container_t1 _vx, increment_t _incx, + element_t _beta, container_t2 _vy, increment_t _incy, + const typename SB_Handle::event_t& _dependencies) { + return blas::internal::_sbmv_impl<256, uplo>(sb_handle, _N, _K, _alpha, _mA, + _lda, _vx, _incx, _beta, _vy, + _incy, _dependencies); } } // namespace backend } // namespace sbmv @@ -88,13 +89,12 @@ namespace backend { template -typename SB_Handle::event_t inline _spmv(SB_Handle& sb_handle, index_t _N, - element_t _alpha, container_t0 _mA, - container_t1 _vx, increment_t _incx, - element_t _beta, container_t2 _vy, - increment_t _incy) { - return blas::internal::_spmv_impl<4, 4, uplo>(sb_handle, _N, _alpha, _mA, _vx, - _incx, _beta, _vy, _incy); +typename SB_Handle::event_t inline _spmv( + SB_Handle& sb_handle, index_t _N, element_t _alpha, container_t0 _mA, + container_t1 _vx, increment_t _incx, element_t _beta, container_t2 _vy, + increment_t _incy, const typename SB_Handle::event_t& _dependencies) { + return blas::internal::_spmv_impl<4, 4, uplo>( + sb_handle, _N, _alpha, _mA, _vx, _incx, _beta, _vy, _incy, _dependencies); } } // namespace backend } // namespace spmv @@ -104,11 +104,12 @@ namespace backend { template -typename sb_handle_t::event_t _tbmv(sb_handle_t& sb_handle, index_t _N, - index_t _K, container_t0 _mA, index_t _lda, - container_t1 _vx, increment_t _incx) { +typename sb_handle_t::event_t _tbmv( + sb_handle_t& sb_handle, index_t _N, index_t _K, container_t0 _mA, + index_t _lda, container_t1 _vx, increment_t _incx, + typename sb_handle_t::event_t _dependencies) { return blas::internal::_tbmv_impl<256, uplo, trn, diag>( - sb_handle, _N, _K, _mA, _lda, _vx, _incx); + sb_handle, _N, _K, _mA, _lda, _vx, _incx, _dependencies); } } // namespace backend } // namespace tbmv @@ -118,11 +119,11 @@ namespace backend { template -typename sb_handle_t::event_t _tpmv(sb_handle_t& sb_handle, index_t _N, - container_t0 _mA, container_t1 _vx, - increment_t _incx) { - return blas::internal::_tpmv_impl<4, 4, uplo, trn, diag>(sb_handle, _N, _mA, - _vx, _incx); +typename sb_handle_t::event_t _tpmv( + sb_handle_t& sb_handle, index_t _N, container_t0 _mA, container_t1 _vx, + increment_t _incx, typename sb_handle_t::event_t _dependencies) { + return blas::internal::_tpmv_impl<4, 4, uplo, trn, diag>( + sb_handle, _N, _mA, _vx, _incx, _dependencies); } } // namespace backend } // namespace tpmv @@ -132,11 +133,12 @@ namespace backend { template -typename sb_handle_t::event_t _trsv(sb_handle_t& sb_handle, index_t _N, - container_t0 _mA, index_t _lda, - container_t1 _vx, increment_t _incx) { - return blas::internal::_trsv_impl<4, 2, uplo, trn, diag>(sb_handle, _N, _mA, - _lda, _vx, _incx); +typename sb_handle_t::event_t _trsv( + sb_handle_t& sb_handle, index_t _N, container_t0 _mA, index_t _lda, + container_t1 _vx, increment_t _incx, + typename sb_handle_t::event_t _dependencies) { + return blas::internal::_trsv_impl<4, 2, uplo, trn, diag>( + sb_handle, _N, _mA, _lda, _vx, _incx, _dependencies); } } // namespace backend } // namespace trsv @@ -146,11 +148,12 @@ namespace backend { template -typename sb_handle_t::event_t _tbsv(sb_handle_t& sb_handle, index_t _N, - index_t _K, container_t0 _mA, index_t _lda, - container_t1 _vx, increment_t _incx) { +typename sb_handle_t::event_t _tbsv( + sb_handle_t& sb_handle, index_t _N, index_t _K, container_t0 _mA, + index_t _lda, container_t1 _vx, increment_t _incx, + const typename sb_handle_t::event_t& _dependencies) { return blas::internal::_tbsv_impl<4, 2, uplo, 
trn, diag>( - sb_handle, _N, _K, _mA, _lda, _vx, _incx); + sb_handle, _N, _K, _mA, _lda, _vx, _incx, _dependencies); } } // namespace backend } // namespace tbsv @@ -162,9 +165,10 @@ template typename sb_handle_t::event_t _tpsv(sb_handle_t& sb_handle, index_t _N, container_t0 _mA, container_t1 _vx, - increment_t _incx) { + increment_t _incx, + const typename sb_handle_t::event_t& _dependencies) { return blas::internal::_tpsv_impl<4, 2, uplo, trn, diag>(sb_handle, _N, _mA, - _vx, _incx); + _vx, _incx, _dependencies); } } // namespace backend } // namespace tpsv diff --git a/src/interface/blas2/backend/intel_gpu.hpp b/src/interface/blas2/backend/intel_gpu.hpp index 3571e37cb..252ec66d6 100644 --- a/src/interface/blas2/backend/intel_gpu.hpp +++ b/src/interface/blas2/backend/intel_gpu.hpp @@ -32,25 +32,29 @@ namespace backend { template -typename SB_Handle::event_t _gemv(SB_Handle& sb_handle, index_t _M, index_t _N, - element_t _alpha, container_t0 _mA, - index_t _lda, container_t1 _vx, - increment_t _incx, element_t _beta, - container_t2 _vy, increment_t _incy) { +typename SB_Handle::event_t _gemv( + SB_Handle& sb_handle, index_t _M, index_t _N, element_t _alpha, + container_t0 _mA, index_t _lda, container_t1 _vx, increment_t _incx, + element_t _beta, container_t2 _vy, increment_t _incy, + const typename SB_Handle::event_t& _dependencies) { if (trn == transpose_type::Normal) { if (_N < 8192) { return blas::internal::_gemv_impl<128, 64, gemv_memory_t::local, trn>( - sb_handle, _M, _N, _alpha, _mA, _lda, _vx, _incx, _beta, _vy, _incy); + sb_handle, _M, _N, _alpha, _mA, _lda, _vx, _incx, _beta, _vy, _incy, + _dependencies); } else if (_N < 16384) { return blas::internal::_gemv_impl<256, 64, gemv_memory_t::local, trn>( - sb_handle, _M, _N, _alpha, _mA, _lda, _vx, _incx, _beta, _vy, _incy); + sb_handle, _M, _N, _alpha, _mA, _lda, _vx, _incx, _beta, _vy, _incy, + _dependencies); } else { return blas::internal::_gemv_impl<512, 64, gemv_memory_t::local, trn>( - sb_handle, _M, _N, _alpha, _mA, _lda, _vx, _incx, _beta, _vy, _incy); + sb_handle, _M, _N, _alpha, _mA, _lda, _vx, _incx, _beta, _vy, _incy, + _dependencies); } } else { return blas::internal::_gemv_impl<128, 64, gemv_memory_t::local, trn>( - sb_handle, _M, _N, _alpha, _mA, _lda, _vx, _incx, _beta, _vy, _incy); + sb_handle, _M, _N, _alpha, _mA, _lda, _vx, _incx, _beta, _vy, _incy, + _dependencies); } } } // namespace backend @@ -61,15 +65,14 @@ namespace backend { template -typename SB_Handle::event_t inline _gbmv(SB_Handle& sb_handle, index_t _M, - index_t _N, index_t _KL, index_t _KU, - element_t _alpha, container_t0 _mA, - index_t _lda, container_t1 _vx, - increment_t _incx, element_t _beta, - container_t2 _vy, increment_t _incy) { +typename SB_Handle::event_t inline _gbmv( + SB_Handle& sb_handle, index_t _M, index_t _N, index_t _KL, index_t _KU, + element_t _alpha, container_t0 _mA, index_t _lda, container_t1 _vx, + increment_t _incx, element_t _beta, container_t2 _vy, increment_t _incy, + const typename SB_Handle::event_t& _dependencies) { return blas::internal::_gbmv_impl<64, trn>(sb_handle, _M, _N, _KL, _KU, _alpha, _mA, _lda, _vx, _incx, - _beta, _vy, _incy); + _beta, _vy, _incy, _dependencies); } } // namespace backend } // namespace gbmv @@ -79,14 +82,14 @@ namespace backend { template -typename SB_Handle::event_t inline _sbmv(SB_Handle& sb_handle, index_t _N, - index_t _K, element_t _alpha, - container_t0 _mA, index_t _lda, - container_t1 _vx, increment_t _incx, - element_t _beta, container_t2 _vy, - increment_t _incy) { - return 
blas::internal::_sbmv_impl<64, uplo>( - sb_handle, _N, _K, _alpha, _mA, _lda, _vx, _incx, _beta, _vy, _incy); +typename SB_Handle::event_t inline _sbmv( + SB_Handle& sb_handle, index_t _N, index_t _K, element_t _alpha, + container_t0 _mA, index_t _lda, container_t1 _vx, increment_t _incx, + element_t _beta, container_t2 _vy, increment_t _incy, + const typename SB_Handle::event_t& _dependencies) { + return blas::internal::_sbmv_impl<64, uplo>(sb_handle, _N, _K, _alpha, _mA, + _lda, _vx, _incx, _beta, _vy, + _incy, _dependencies); } } // namespace backend } // namespace sbmv @@ -96,13 +99,12 @@ namespace backend { template -typename SB_Handle::event_t inline _spmv(SB_Handle& sb_handle, index_t _N, - element_t _alpha, container_t0 _mA, - container_t1 _vx, increment_t _incx, - element_t _beta, container_t2 _vy, - increment_t _incy) { - return blas::internal::_spmv_impl<16, 4, uplo>(sb_handle, _N, _alpha, _mA, - _vx, _incx, _beta, _vy, _incy); +typename SB_Handle::event_t inline _spmv( + SB_Handle& sb_handle, index_t _N, element_t _alpha, container_t0 _mA, + container_t1 _vx, increment_t _incx, element_t _beta, container_t2 _vy, + increment_t _incy, const typename SB_Handle::event_t& _dependencies) { + return blas::internal::_spmv_impl<16, 4, uplo>( + sb_handle, _N, _alpha, _mA, _vx, _incx, _beta, _vy, _incy, _dependencies); } } // namespace backend } // namespace spmv @@ -112,11 +114,12 @@ namespace backend { template -typename sb_handle_t::event_t _tbmv(sb_handle_t& sb_handle, index_t _N, - index_t _K, container_t0 _mA, index_t _lda, - container_t1 _vx, increment_t _incx) { - return blas::internal::_tbmv_impl<64, uplo, trn, diag>(sb_handle, _N, _K, _mA, - _lda, _vx, _incx); +typename sb_handle_t::event_t _tbmv( + sb_handle_t& sb_handle, index_t _N, index_t _K, container_t0 _mA, + index_t _lda, container_t1 _vx, increment_t _incx, + typename sb_handle_t::event_t _dependencies) { + return blas::internal::_tbmv_impl<64, uplo, trn, diag>( + sb_handle, _N, _K, _mA, _lda, _vx, _incx, _dependencies); } } // namespace backend } // namespace tbmv @@ -126,11 +129,11 @@ namespace backend { template -typename sb_handle_t::event_t _tpmv(sb_handle_t& sb_handle, index_t _N, - container_t0 _mA, container_t1 _vx, - increment_t _incx) { - return blas::internal::_tpmv_impl<16, 4, uplo, trn, diag>(sb_handle, _N, _mA, - _vx, _incx); +typename sb_handle_t::event_t _tpmv( + sb_handle_t& sb_handle, index_t _N, container_t0 _mA, container_t1 _vx, + increment_t _incx, typename sb_handle_t::event_t _dependencies) { + return blas::internal::_tpmv_impl<16, 4, uplo, trn, diag>( + sb_handle, _N, _mA, _vx, _incx, _dependencies); } } // namespace backend } // namespace tpmv @@ -140,11 +143,12 @@ namespace backend { template -typename sb_handle_t::event_t _trsv(sb_handle_t& sb_handle, index_t _N, - container_t0 _mA, index_t _lda, - container_t1 _vx, increment_t _incx) { - return blas::internal::_trsv_impl<8, 4, uplo, trn, diag>(sb_handle, _N, _mA, - _lda, _vx, _incx); +typename sb_handle_t::event_t _trsv( + sb_handle_t& sb_handle, index_t _N, container_t0 _mA, index_t _lda, + container_t1 _vx, increment_t _incx, + typename sb_handle_t::event_t _dependencies) { + return blas::internal::_trsv_impl<8, 4, uplo, trn, diag>( + sb_handle, _N, _mA, _lda, _vx, _incx, _dependencies); } } // namespace backend } // namespace trsv @@ -154,11 +158,12 @@ namespace backend { template -typename sb_handle_t::event_t _tbsv(sb_handle_t& sb_handle, index_t _N, - index_t _K, container_t0 _mA, index_t _lda, - container_t1 _vx, increment_t _incx) { 
+typename sb_handle_t::event_t _tbsv( + sb_handle_t& sb_handle, index_t _N, index_t _K, container_t0 _mA, + index_t _lda, container_t1 _vx, increment_t _incx, + const typename sb_handle_t::event_t& _dependencies) { return blas::internal::_tbsv_impl<8, 4, uplo, trn, diag>( - sb_handle, _N, _K, _mA, _lda, _vx, _incx); + sb_handle, _N, _K, _mA, _lda, _vx, _incx, _dependencies); } } // namespace backend } // namespace tbsv @@ -170,9 +175,10 @@ template typename sb_handle_t::event_t _tpsv(sb_handle_t& sb_handle, index_t _N, container_t0 _mA, container_t1 _vx, - increment_t _incx) { + increment_t _incx, + const typename sb_handle_t::event_t& _dependencies) { return blas::internal::_tpsv_impl<8, 4, uplo, trn, diag>(sb_handle, _N, _mA, - _vx, _incx); + _vx, _incx, _dependencies); } } // namespace backend } // namespace tpsv diff --git a/src/interface/blas2/backend/nvidia_gpu.hpp b/src/interface/blas2/backend/nvidia_gpu.hpp index df968fedc..73df8566a 100644 --- a/src/interface/blas2/backend/nvidia_gpu.hpp +++ b/src/interface/blas2/backend/nvidia_gpu.hpp @@ -32,17 +32,19 @@ namespace backend { template -typename SB_Handle::event_t _gemv(SB_Handle& sb_handle, index_t _M, index_t _N, - element_t _alpha, container_t0 _mA, - index_t _lda, container_t1 _vx, - increment_t _incx, element_t _beta, - container_t2 _vy, increment_t _incy) { +typename SB_Handle::event_t _gemv( + SB_Handle& sb_handle, index_t _M, index_t _N, element_t _alpha, + container_t0 _mA, index_t _lda, container_t1 _vx, increment_t _incx, + element_t _beta, container_t2 _vy, increment_t _incy, + const typename SB_Handle::event_t& _dependencies) { if (trn == transpose_type::Normal) { return blas::internal::_gemv_impl<256, 128, gemv_memory_t::local, trn>( - sb_handle, _M, _N, _alpha, _mA, _lda, _vx, _incx, _beta, _vy, _incy); + sb_handle, _M, _N, _alpha, _mA, _lda, _vx, _incx, _beta, _vy, _incy, + _dependencies); } else { return blas::internal::_gemv_impl<128, 128, gemv_memory_t::local, trn>( - sb_handle, _M, _N, _alpha, _mA, _lda, _vx, _incx, _beta, _vy, _incy); + sb_handle, _M, _N, _alpha, _mA, _lda, _vx, _incx, _beta, _vy, _incy, + _dependencies); } } } // namespace backend @@ -53,15 +55,14 @@ namespace backend { template -typename SB_Handle::event_t inline _gbmv(SB_Handle& sb_handle, index_t _M, - index_t _N, index_t _KL, index_t _KU, - element_t _alpha, container_t0 _mA, - index_t _lda, container_t1 _vx, - increment_t _incx, element_t _beta, - container_t2 _vy, increment_t _incy) { +typename SB_Handle::event_t inline _gbmv( + SB_Handle& sb_handle, index_t _M, index_t _N, index_t _KL, index_t _KU, + element_t _alpha, container_t0 _mA, index_t _lda, container_t1 _vx, + increment_t _incx, element_t _beta, container_t2 _vy, increment_t _incy, + const typename SB_Handle::event_t& _dependencies) { return blas::internal::_gbmv_impl<64, trn>(sb_handle, _M, _N, _KL, _KU, _alpha, _mA, _lda, _vx, _incx, - _beta, _vy, _incy); + _beta, _vy, _incy, _dependencies); } } // namespace backend } // namespace gbmv @@ -71,14 +72,14 @@ namespace backend { template -typename SB_Handle::event_t inline _sbmv(SB_Handle& sb_handle, index_t _N, - index_t _K, element_t _alpha, - container_t0 _mA, index_t _lda, - container_t1 _vx, increment_t _incx, - element_t _beta, container_t2 _vy, - increment_t _incy) { - return blas::internal::_sbmv_impl<64, uplo>( - sb_handle, _N, _K, _alpha, _mA, _lda, _vx, _incx, _beta, _vy, _incy); +typename SB_Handle::event_t inline _sbmv( + SB_Handle& sb_handle, index_t _N, index_t _K, element_t _alpha, + container_t0 _mA, index_t _lda, 
container_t1 _vx, increment_t _incx, + element_t _beta, container_t2 _vy, increment_t _incy, + const typename SB_Handle::event_t& _dependencies) { + return blas::internal::_sbmv_impl<64, uplo>(sb_handle, _N, _K, _alpha, _mA, + _lda, _vx, _incx, _beta, _vy, + _incy, _dependencies); } } // namespace backend } // namespace sbmv @@ -88,13 +89,12 @@ namespace backend { template -typename SB_Handle::event_t inline _spmv(SB_Handle& sb_handle, index_t _N, - element_t _alpha, container_t0 _mA, - container_t1 _vx, increment_t _incx, - element_t _beta, container_t2 _vy, - increment_t _incy) { +typename SB_Handle::event_t inline _spmv( + SB_Handle& sb_handle, index_t _N, element_t _alpha, container_t0 _mA, + container_t1 _vx, increment_t _incx, element_t _beta, container_t2 _vy, + increment_t _incy, const typename SB_Handle::event_t& _dependencies) { return blas::internal::_spmv_impl<32, 16, uplo>( - sb_handle, _N, _alpha, _mA, _vx, _incx, _beta, _vy, _incy); + sb_handle, _N, _alpha, _mA, _vx, _incx, _beta, _vy, _incy, _dependencies); } } // namespace backend } // namespace spmv @@ -104,11 +104,12 @@ namespace backend { template -typename sb_handle_t::event_t _tbmv(sb_handle_t& sb_handle, index_t _N, - index_t _K, container_t0 _mA, index_t _lda, - container_t1 _vx, increment_t _incx) { - return blas::internal::_tbmv_impl<64, uplo, trn, diag>(sb_handle, _N, _K, _mA, - _lda, _vx, _incx); +typename sb_handle_t::event_t _tbmv( + sb_handle_t& sb_handle, index_t _N, index_t _K, container_t0 _mA, + index_t _lda, container_t1 _vx, increment_t _incx, + typename sb_handle_t::event_t _dependencies) { + return blas::internal::_tbmv_impl<64, uplo, trn, diag>( + sb_handle, _N, _K, _mA, _lda, _vx, _incx, _dependencies); } } // namespace backend } // namespace tbmv @@ -118,11 +119,11 @@ namespace backend { template -typename sb_handle_t::event_t _tpmv(sb_handle_t& sb_handle, index_t _N, - container_t0 _mA, container_t1 _vx, - increment_t _incx) { - return blas::internal::_tpmv_impl<32, 16, uplo, trn, diag>(sb_handle, _N, _mA, - _vx, _incx); +typename sb_handle_t::event_t _tpmv( + sb_handle_t& sb_handle, index_t _N, container_t0 _mA, container_t1 _vx, + increment_t _incx, typename sb_handle_t::event_t _dependencies) { + return blas::internal::_tpmv_impl<32, 16, uplo, trn, diag>( + sb_handle, _N, _mA, _vx, _incx, _dependencies); } } // namespace backend } // namespace tpmv @@ -132,11 +133,12 @@ namespace backend { template -typename sb_handle_t::event_t _trsv(sb_handle_t& sb_handle, index_t _N, - container_t0 _mA, index_t _lda, - container_t1 _vx, increment_t _incx) { - return blas::internal::_trsv_impl<32, 4, uplo, trn, diag>(sb_handle, _N, _mA, - _lda, _vx, _incx); +typename sb_handle_t::event_t _trsv( + sb_handle_t& sb_handle, index_t _N, container_t0 _mA, index_t _lda, + container_t1 _vx, increment_t _incx, + typename sb_handle_t::event_t _dependencies) { + return blas::internal::_trsv_impl<32, 4, uplo, trn, diag>( + sb_handle, _N, _mA, _lda, _vx, _incx, _dependencies); } } // namespace backend } // namespace trsv @@ -146,11 +148,12 @@ namespace backend { template -typename sb_handle_t::event_t _tbsv(sb_handle_t& sb_handle, index_t _N, - index_t _K, container_t0 _mA, index_t _lda, - container_t1 _vx, increment_t _incx) { +typename sb_handle_t::event_t _tbsv( + sb_handle_t& sb_handle, index_t _N, index_t _K, container_t0 _mA, + index_t _lda, container_t1 _vx, increment_t _incx, + const typename sb_handle_t::event_t& _dependencies) { return blas::internal::_tbsv_impl<32, 4, uplo, trn, diag>( - sb_handle, _N, _K, _mA, 
_lda, _vx, _incx); + sb_handle, _N, _K, _mA, _lda, _vx, _incx, _dependencies); } } // namespace backend } // namespace tbsv @@ -162,9 +165,10 @@ template typename sb_handle_t::event_t _tpsv(sb_handle_t& sb_handle, index_t _N, container_t0 _mA, container_t1 _vx, - increment_t _incx) { + increment_t _incx, + const typename sb_handle_t::event_t& _dependencies) { return blas::internal::_tpsv_impl<32, 4, uplo, trn, diag>(sb_handle, _N, _mA, - _vx, _incx); + _vx, _incx, _dependencies); } } // namespace backend } // namespace tpsv diff --git a/src/interface/blas2/backend/power_vr.hpp b/src/interface/blas2/backend/power_vr.hpp index e52c802d7..5e2d9f5b2 100644 --- a/src/interface/blas2/backend/power_vr.hpp +++ b/src/interface/blas2/backend/power_vr.hpp @@ -32,17 +32,19 @@ namespace backend { template -typename SB_Handle::event_t _gemv(SB_Handle& sb_handle, index_t _M, index_t _N, - element_t _alpha, container_t0 _mA, - index_t _lda, container_t1 _vx, - increment_t _incx, element_t _beta, - container_t2 _vy, increment_t _incy) { +typename SB_Handle::event_t _gemv( + SB_Handle& sb_handle, index_t _M, index_t _N, element_t _alpha, + container_t0 _mA, index_t _lda, container_t1 _vx, increment_t _incx, + element_t _beta, container_t2 _vy, increment_t _incy, + const typename SB_Handle::event_t& _dependencies) { if (trn == transpose_type::Normal) { return blas::internal::_gemv_impl<256, 32, gemv_memory_t::local, trn>( - sb_handle, _M, _N, _alpha, _mA, _lda, _vx, _incx, _beta, _vy, _incy); + sb_handle, _M, _N, _alpha, _mA, _lda, _vx, _incx, _beta, _vy, _incy, + _dependencies); } else { return blas::internal::_gemv_impl<64, 32, gemv_memory_t::local, trn>( - sb_handle, _M, _N, _alpha, _mA, _lda, _vx, _incx, _beta, _vy, _incy); + sb_handle, _M, _N, _alpha, _mA, _lda, _vx, _incx, _beta, _vy, _incy, + _dependencies); } } } // namespace backend @@ -53,15 +55,14 @@ namespace backend { template -typename SB_Handle::event_t inline _gbmv(SB_Handle& sb_handle, index_t _M, - index_t _N, index_t _KL, index_t _KU, - element_t _alpha, container_t0 _mA, - index_t _lda, container_t1 _vx, - increment_t _incx, element_t _beta, - container_t2 _vy, increment_t _incy) { +typename SB_Handle::event_t inline _gbmv( + SB_Handle& sb_handle, index_t _M, index_t _N, index_t _KL, index_t _KU, + element_t _alpha, container_t0 _mA, index_t _lda, container_t1 _vx, + increment_t _incx, element_t _beta, container_t2 _vy, increment_t _incy, + const typename SB_Handle::event_t& _dependencies) { return blas::internal::_gbmv_impl<256, trn>(sb_handle, _M, _N, _KL, _KU, _alpha, _mA, _lda, _vx, _incx, - _beta, _vy, _incy); + _beta, _vy, _incy, _dependencies); } } // namespace backend } // namespace gbmv @@ -71,14 +72,14 @@ namespace backend { template -typename SB_Handle::event_t inline _sbmv(SB_Handle& sb_handle, index_t _N, - index_t _K, element_t _alpha, - container_t0 _mA, index_t _lda, - container_t1 _vx, increment_t _incx, - element_t _beta, container_t2 _vy, - increment_t _incy) { - return blas::internal::_sbmv_impl<256, uplo>( - sb_handle, _N, _K, _alpha, _mA, _lda, _vx, _incx, _beta, _vy, _incy); +typename SB_Handle::event_t inline _sbmv( + SB_Handle& sb_handle, index_t _N, index_t _K, element_t _alpha, + container_t0 _mA, index_t _lda, container_t1 _vx, increment_t _incx, + element_t _beta, container_t2 _vy, increment_t _incy, + const typename SB_Handle::event_t& _dependencies) { + return blas::internal::_sbmv_impl<256, uplo>(sb_handle, _N, _K, _alpha, _mA, + _lda, _vx, _incx, _beta, _vy, + _incy, _dependencies); } } // namespace 
backend } // namespace sbmv @@ -88,13 +89,12 @@ namespace backend { template -typename SB_Handle::event_t inline _spmv(SB_Handle& sb_handle, index_t _N, - element_t _alpha, container_t0 _mA, - container_t1 _vx, increment_t _incx, - element_t _beta, container_t2 _vy, - increment_t _incy) { - return blas::internal::_spmv_impl<32, 4, uplo>(sb_handle, _N, _alpha, _mA, - _vx, _incx, _beta, _vy, _incy); +typename SB_Handle::event_t inline _spmv( + SB_Handle& sb_handle, index_t _N, element_t _alpha, container_t0 _mA, + container_t1 _vx, increment_t _incx, element_t _beta, container_t2 _vy, + increment_t _incy, const typename SB_Handle::event_t& _dependencies) { + return blas::internal::_spmv_impl<32, 4, uplo>( + sb_handle, _N, _alpha, _mA, _vx, _incx, _beta, _vy, _incy, _dependencies); } } // namespace backend } // namespace spmv @@ -104,11 +104,12 @@ namespace backend { template -typename sb_handle_t::event_t _tbmv(sb_handle_t& sb_handle, index_t _N, - index_t _K, container_t0 _mA, index_t _lda, - container_t1 _vx, increment_t _incx) { +typename sb_handle_t::event_t _tbmv( + sb_handle_t& sb_handle, index_t _N, index_t _K, container_t0 _mA, + index_t _lda, container_t1 _vx, increment_t _incx, + typename sb_handle_t::event_t _dependencies) { return blas::internal::_tbmv_impl<256, uplo, trn, diag>( - sb_handle, _N, _K, _mA, _lda, _vx, _incx); + sb_handle, _N, _K, _mA, _lda, _vx, _incx, _dependencies); } } // namespace backend } // namespace tbmv @@ -118,11 +119,11 @@ namespace backend { template -typename sb_handle_t::event_t _tpmv(sb_handle_t& sb_handle, index_t _N, - container_t0 _mA, container_t1 _vx, - increment_t _incx) { - return blas::internal::_tpmv_impl<32, 4, uplo, trn, diag>(sb_handle, _N, _mA, - _vx, _incx); +typename sb_handle_t::event_t _tpmv( + sb_handle_t& sb_handle, index_t _N, container_t0 _mA, container_t1 _vx, + increment_t _incx, typename sb_handle_t::event_t _dependencies) { + return blas::internal::_tpmv_impl<32, 4, uplo, trn, diag>( + sb_handle, _N, _mA, _vx, _incx, _dependencies); } } // namespace backend } // namespace tpmv @@ -132,11 +133,12 @@ namespace backend { template -typename sb_handle_t::event_t _trsv(sb_handle_t& sb_handle, index_t _N, - container_t0 _mA, index_t _lda, - container_t1 _vx, increment_t _incx) { - return blas::internal::_trsv_impl<64, 4, uplo, trn, diag>(sb_handle, _N, _mA, - _lda, _vx, _incx); +typename sb_handle_t::event_t _trsv( + sb_handle_t& sb_handle, index_t _N, container_t0 _mA, index_t _lda, + container_t1 _vx, increment_t _incx, + typename sb_handle_t::event_t _dependencies) { + return blas::internal::_trsv_impl<64, 4, uplo, trn, diag>( + sb_handle, _N, _mA, _lda, _vx, _incx, _dependencies); } } // namespace backend } // namespace trsv @@ -148,9 +150,10 @@ template typename sb_handle_t::event_t _tbsv(sb_handle_t& sb_handle, index_t _N, index_t _K, container_t0 _mA, index_t _lda, - container_t1 _vx, increment_t _incx) { + container_t1 _vx, increment_t _incx, + const typename sb_handle_t::event_t& _dependencies) { return blas::internal::_tbsv_impl<4, 2, uplo, trn, diag>( - sb_handle, _N, _K, _mA, _lda, _vx, _incx); + sb_handle, _N, _K, _mA, _lda, _vx, _incx, _dependencies); } } // namespace backend } // namespace tbsv @@ -162,9 +165,10 @@ template typename sb_handle_t::event_t _tpsv(sb_handle_t& sb_handle, index_t _N, container_t0 _mA, container_t1 _vx, - increment_t _incx) { + increment_t _incx, + const typename sb_handle_t::event_t& _dependencies) { return blas::internal::_tpsv_impl<4, 2, uplo, trn, diag>(sb_handle, _N, _mA, - _vx, _incx); 
+ _vx, _incx, _dependencies); } } // namespace backend } // namespace tpsv diff --git a/src/interface/blas2/gbmv.cpp.in b/src/interface/blas2/gbmv.cpp.in index 54455c49e..c767e44ca 100644 --- a/src/interface/blas2/gbmv.cpp.in +++ b/src/interface/blas2/gbmv.cpp.in @@ -23,22 +23,18 @@ * **************************************************************************/ #include "container/sycl_iterator.hpp" -#include "sb_handle/sycl_blas_handle.hpp" -#include "sb_handle/kernel_constructor.hpp" #include "interface/blas2_interface.hpp" -#include "operations/blas1_trees.hpp" -#include "operations/blas2_trees.hpp" -#include "operations/blas_constants.hpp" -#include "views/view_sycl.hpp" +#include "sb_handle/kernel_constructor.hpp" +#include "sb_handle/sycl_blas_handle.hpp" namespace blas { namespace internal { template typename SB_Handle::event_t _gbmv( SB_Handle& sb_handle, char _trans, ${INDEX_TYPE} _M, ${INDEX_TYPE} _N, - ${INDEX_TYPE} _KL, ${INDEX_TYPE} _KU, - ${DATA_TYPE} _alpha, ${container_t0} _mA, ${INDEX_TYPE} _lda, - ${container_t1} _vx, ${INCREMENT_TYPE} _incx, ${DATA_TYPE} _beta, - ${container_t2} _vy, ${INCREMENT_TYPE} _incy); + ${INDEX_TYPE} _KL, ${INDEX_TYPE} _KU, ${DATA_TYPE} _alpha, + ${container_t0} _mA, ${INDEX_TYPE} _lda, ${container_t1} _vx, + ${INCREMENT_TYPE} _incx, ${DATA_TYPE} _beta, ${container_t2} _vy, + ${INCREMENT_TYPE} _incy, const typename SB_Handle::event_t& _dependencies); } // namespace internal } // namespace blas diff --git a/src/interface/blas2/gemv.cpp.in b/src/interface/blas2/gemv.cpp.in index d19fe0a60..24a070592 100644 --- a/src/interface/blas2/gemv.cpp.in +++ b/src/interface/blas2/gemv.cpp.in @@ -33,7 +33,8 @@ template typename SB_Handle::event_t _gemv( SB_Handle& sb_handle, char _trans, ${INDEX_TYPE} _M, ${INDEX_TYPE} _N, ${DATA_TYPE} _alpha, ${container_t0} _mA, ${INDEX_TYPE} _lda, ${container_t1} _vx, ${INCREMENT_TYPE} _incx, ${DATA_TYPE} _beta, - ${container_t2} _vy, ${INCREMENT_TYPE} _incy); + ${container_t2} _vy, ${INCREMENT_TYPE} _incy, + const typename SB_Handle::event_t& _dependencies); } // namespace internal } // namespace blas diff --git a/src/interface/blas2/ger.cpp.in b/src/interface/blas2/ger.cpp.in index 70b46cd5b..330cac853 100644 --- a/src/interface/blas2/ger.cpp.in +++ b/src/interface/blas2/ger.cpp.in @@ -23,13 +23,9 @@ * **************************************************************************/ #include "container/sycl_iterator.hpp" -#include "sb_handle/sycl_blas_handle.hpp" -#include "sb_handle/kernel_constructor.hpp" #include "interface/blas2_interface.hpp" -#include "operations/blas1_trees.hpp" -#include "operations/blas2_trees.hpp" -#include "operations/blas_constants.hpp" -#include "views/view_sycl.hpp" +#include "sb_handle/kernel_constructor.hpp" +#include "sb_handle/sycl_blas_handle.hpp" namespace blas { namespace internal { @@ -38,7 +34,7 @@ template typename SB_Handle::event_t _ger( SB_Handle& sb_handle, ${INDEX_TYPE} _M, ${INDEX_TYPE} _N, ${DATA_TYPE} _alpha, ${container_t0} _vx, ${INCREMENT_TYPE} _incx, ${container_t1} _vy, ${INCREMENT_TYPE} _incy, ${container_t2} _mA, - ${INDEX_TYPE} _lda); + ${INDEX_TYPE} _lda, const typename SB_Handle::event_t& _dependencies); } // namespace internal } // namespace blas diff --git a/src/interface/blas2/sbmv.cpp.in b/src/interface/blas2/sbmv.cpp.in index 82cfe93cc..1ef874a47 100644 --- a/src/interface/blas2/sbmv.cpp.in +++ b/src/interface/blas2/sbmv.cpp.in @@ -23,22 +23,18 @@ * **************************************************************************/ #include "container/sycl_iterator.hpp" 
-#include "sb_handle/sycl_blas_handle.hpp" -#include "sb_handle/kernel_constructor.hpp" #include "interface/blas2_interface.hpp" -#include "operations/blas1_trees.hpp" -#include "operations/blas2_trees.hpp" -#include "operations/blas_constants.hpp" -#include "views/view_sycl.hpp" +#include "sb_handle/kernel_constructor.hpp" +#include "sb_handle/sycl_blas_handle.hpp" namespace blas { namespace internal { template typename SB_Handle::event_t _sbmv( - SB_Handle& sb_handle, char _Uplo, ${INDEX_TYPE} _N, - ${INDEX_TYPE} _K, + SB_Handle& sb_handle, char _Uplo, ${INDEX_TYPE} _N, ${INDEX_TYPE} _K, ${DATA_TYPE} _alpha, ${container_t0} _mA, ${INDEX_TYPE} _lda, ${container_t1} _vx, ${INCREMENT_TYPE} _incx, ${DATA_TYPE} _beta, - ${container_t2} _vy, ${INCREMENT_TYPE} _incy); + ${container_t2} _vy, ${INCREMENT_TYPE} _incy, + const typename SB_Handle::event_t& _dependencies); } // namespace internal } // namespace blas diff --git a/src/interface/blas2/spmv.cpp.in b/src/interface/blas2/spmv.cpp.in index c47cf594c..43cfd8d92 100644 --- a/src/interface/blas2/spmv.cpp.in +++ b/src/interface/blas2/spmv.cpp.in @@ -31,7 +31,8 @@ namespace internal { template typename SB_Handle::event_t _spmv( SB_Handle& sb_handle, char _Uplo, ${INDEX_TYPE} _N, ${DATA_TYPE} _alpha, ${container_t0} _mA, ${container_t1} _vx, ${INCREMENT_TYPE} _incx, - ${DATA_TYPE} _beta, ${container_t2} _vy, ${INCREMENT_TYPE} _incy); + ${DATA_TYPE} _beta, ${container_t2} _vy, ${INCREMENT_TYPE} _incy, + const typename SB_Handle::event_t& _dependencies); } // namespace internal } // namespace blas diff --git a/src/interface/blas2/spr.cpp.in b/src/interface/blas2/spr.cpp.in index bae1d711d..7d2f9668c 100644 --- a/src/interface/blas2/spr.cpp.in +++ b/src/interface/blas2/spr.cpp.in @@ -24,19 +24,16 @@ **************************************************************************/ #include "container/sycl_iterator.hpp" #include "interface/blas2_interface.hpp" -#include "operations/blas2_trees.hpp" #include "sb_handle/kernel_constructor.hpp" #include "sb_handle/sycl_blas_handle.hpp" -#include "views/view_sycl.hpp" namespace blas { namespace internal { -template typename SB_Handle::event_t -_spr(SB_Handle& sb_handle, char _Uplo, ${INDEX_TYPE} _N, - ${DATA_TYPE} _alpha, ${container_t0} _vx, - ${INCREMENT_TYPE} _incx, ${container_t1} _mPA); +template typename SB_Handle::event_t _spr( + SB_Handle& sb_handle, char _Uplo, ${INDEX_TYPE} _N, ${DATA_TYPE} _alpha, + ${container_t0} _vx, ${INCREMENT_TYPE} _incx, ${container_t1} _mPA, + const typename SB_Handle::event_t& _dependencies); } // namespace internal } // namespace blas diff --git a/src/interface/blas2/spr2.cpp.in b/src/interface/blas2/spr2.cpp.in index 6167c75dd..6a63d8d00 100644 --- a/src/interface/blas2/spr2.cpp.in +++ b/src/interface/blas2/spr2.cpp.in @@ -34,7 +34,8 @@ _spr2( SB_Handle& sb_handle, char _Uplo, ${INDEX_TYPE} _N, ${DATA_TYPE} _alpha, ${container_t0} _vx, ${INCREMENT_TYPE} _incx, ${container_t1} _vy, - ${INCREMENT_TYPE} _incy, ${container_t2} _mPA); + ${INCREMENT_TYPE} _incy, ${container_t2} _mPA, + const typename SB_Handle::event_t& _dependencies); } // namespace internal } // namespace blas diff --git a/src/interface/blas2/symv.cpp.in b/src/interface/blas2/symv.cpp.in index 963632ece..3e9f2b954 100644 --- a/src/interface/blas2/symv.cpp.in +++ b/src/interface/blas2/symv.cpp.in @@ -33,7 +33,7 @@ template typename SB_Handle::event_t _symv( SB_Handle& sb_handle, char _Uplo, ${INDEX_TYPE} _N, ${DATA_TYPE} _alpha, ${container_t0} _mA, ${INDEX_TYPE} _lda, ${container_t1} _vx, ${INCREMENT_TYPE} 
_incx, ${DATA_TYPE} _beta, ${container_t2} _vy, - ${INCREMENT_TYPE} _incy); + ${INCREMENT_TYPE} _incy, const typename SB_Handle::event_t& _dependencies); } // namespace internal } // namespace blas diff --git a/src/interface/blas2/syr.cpp.in b/src/interface/blas2/syr.cpp.in index dfa828675..1c73ffe89 100644 --- a/src/interface/blas2/syr.cpp.in +++ b/src/interface/blas2/syr.cpp.in @@ -23,21 +23,17 @@ * **************************************************************************/ #include "container/sycl_iterator.hpp" -#include "sb_handle/sycl_blas_handle.hpp" -#include "sb_handle/kernel_constructor.hpp" #include "interface/blas2_interface.hpp" -#include "operations/blas1_trees.hpp" -#include "operations/blas2_trees.hpp" -#include "operations/blas_constants.hpp" -#include "views/view_sycl.hpp" +#include "sb_handle/kernel_constructor.hpp" +#include "sb_handle/sycl_blas_handle.hpp" namespace blas { namespace internal { template typename SB_Handle::event_t _syr( - SB_Handle& sb_handle, char _Uplo, ${INDEX_TYPE} _N, - ${DATA_TYPE} _alpha, ${container_t0} _vx, ${INCREMENT_TYPE} _incx, - ${container_t1} _mA, ${INDEX_TYPE} _lda); + SB_Handle& sb_handle, char _Uplo, ${INDEX_TYPE} _N, ${DATA_TYPE} _alpha, + ${container_t0} _vx, ${INCREMENT_TYPE} _incx, ${container_t1} _mA, + ${INDEX_TYPE} _lda, const typename SB_Handle::event_t& _dependencies); } // namespace internal } // namespace blas diff --git a/src/interface/blas2/syr2.cpp.in b/src/interface/blas2/syr2.cpp.in index c4f7db49f..c23689c5e 100644 --- a/src/interface/blas2/syr2.cpp.in +++ b/src/interface/blas2/syr2.cpp.in @@ -23,21 +23,17 @@ * **************************************************************************/ #include "container/sycl_iterator.hpp" -#include "sb_handle/sycl_blas_handle.hpp" -#include "sb_handle/kernel_constructor.hpp" #include "interface/blas2_interface.hpp" -#include "operations/blas1_trees.hpp" -#include "operations/blas2_trees.hpp" -#include "operations/blas_constants.hpp" -#include "views/view_sycl.hpp" +#include "sb_handle/kernel_constructor.hpp" +#include "sb_handle/sycl_blas_handle.hpp" namespace blas { namespace internal { template typename SB_Handle::event_t _syr2( - SB_Handle& sb_handle, char _Uplo, ${INDEX_TYPE} _N, - ${DATA_TYPE} _alpha, ${container_t0} _vx, ${INCREMENT_TYPE} _incx, - ${container_t1} _vy, ${INCREMENT_TYPE} _incy, ${container_t2} _mA, - ${INDEX_TYPE} _lda); + SB_Handle& sb_handle, char _Uplo, ${INDEX_TYPE} _N, ${DATA_TYPE} _alpha, + ${container_t0} _vx, ${INCREMENT_TYPE} _incx, ${container_t1} _vy, + ${INCREMENT_TYPE} _incy, ${container_t2} _mA, ${INDEX_TYPE} _lda, + const typename SB_Handle::event_t& _dependencies); } } // end namespace blas diff --git a/src/interface/blas2/tbmv.cpp.in b/src/interface/blas2/tbmv.cpp.in index 5d97a5bc1..9d9211dac 100644 --- a/src/interface/blas2/tbmv.cpp.in +++ b/src/interface/blas2/tbmv.cpp.in @@ -23,21 +23,17 @@ * **************************************************************************/ #include "container/sycl_iterator.hpp" -#include "sb_handle/sycl_blas_handle.hpp" -#include "sb_handle/kernel_constructor.hpp" #include "interface/blas2_interface.hpp" -#include "operations/blas1_trees.hpp" -#include "operations/blas2_trees.hpp" -#include "operations/blas_constants.hpp" -#include "views/view_sycl.hpp" +#include "sb_handle/kernel_constructor.hpp" +#include "sb_handle/sycl_blas_handle.hpp" namespace blas { namespace internal { template typename SB_Handle::event_t _tbmv( - SB_Handle& sb_handle, char _Uplo, char _trans, char _Diag, - ${INDEX_TYPE} _N, ${INDEX_TYPE} 
_K, - ${container_t0} _mA, ${INDEX_TYPE} _lda, - ${container_t1} _vx, ${INCREMENT_TYPE} _incx); + SB_Handle& sb_handle, char _Uplo, char _trans, char _Diag, ${INDEX_TYPE} _N, + ${INDEX_TYPE} _K, ${container_t0} _mA, ${INDEX_TYPE} _lda, + ${container_t1} _vx, ${INCREMENT_TYPE} _incx, + const typename SB_Handle::event_t& _dependencies); } // namespace internal } // namespace blas diff --git a/src/interface/blas2/tbsv.cpp.in b/src/interface/blas2/tbsv.cpp.in index 2f3eb26bf..dc8f105bc 100644 --- a/src/interface/blas2/tbsv.cpp.in +++ b/src/interface/blas2/tbsv.cpp.in @@ -22,17 +22,17 @@ * @filename tbsv.cpp.in * **************************************************************************/ -#include "sb_handle/sycl_blas_handle.hpp" -#include "sb_handle/kernel_constructor.hpp" +#include "container/sycl_iterator.hpp" #include "interface/blas2_interface.hpp" +#include "sb_handle/kernel_constructor.hpp" +#include "sb_handle/sycl_blas_handle.hpp" namespace blas { namespace internal { template typename SB_Handle::event_t _tbsv( - SB_Handle& sb_handle, char _Uplo, char _trans, char _Diag, - ${INDEX_TYPE} _N, ${INDEX_TYPE} _K, - ${container_t0} _mA, ${INDEX_TYPE} _lda, - ${container_t1} _vx, ${INCREMENT_TYPE} _incx); + SB_Handle& sb_handle, char _Uplo, char _trans, char _Diag, ${INDEX_TYPE} _N, + ${INDEX_TYPE} _K, ${container_t0} _mA, ${INDEX_TYPE} _lda, ${container_t1} _vx, + ${INCREMENT_TYPE} _incx, const typename SB_Handle::event_t& _dependencies); } // namespace internal } // namespace blas diff --git a/src/interface/blas2/tpmv.cpp.in b/src/interface/blas2/tpmv.cpp.in index e55ad2298..73afdb5a8 100644 --- a/src/interface/blas2/tpmv.cpp.in +++ b/src/interface/blas2/tpmv.cpp.in @@ -29,7 +29,8 @@ namespace blas { namespace internal { template typename SB_Handle::event_t _tpmv( SB_Handle& sb_handle, char _Uplo, char _trans, char _Diag, ${INDEX_TYPE} _N, - ${container_t0} _mA, ${container_t1} _vx, ${INCREMENT_TYPE} _incx); + ${container_t0} _mA, ${container_t1} _vx, ${INCREMENT_TYPE} _incx, + const typename SB_Handle::event_t& _dependencies); } // namespace internal } // namespace blas diff --git a/src/interface/blas2/tpsv.cpp.in b/src/interface/blas2/tpsv.cpp.in index 495666ef7..f75e1b0bb 100644 --- a/src/interface/blas2/tpsv.cpp.in +++ b/src/interface/blas2/tpsv.cpp.in @@ -30,7 +30,8 @@ namespace blas { namespace internal { template typename SB_Handle::event_t _tpsv( SB_Handle& sb_handle, char _Uplo, char _trans, char _Diag, ${INDEX_TYPE} _N, - ${container_t0} _mA, ${container_t1} _vx, ${INCREMENT_TYPE} _incx); + ${container_t0} _mA, ${container_t1} _vx, ${INCREMENT_TYPE} _incx, + const typename SB_Handle::event_t& _dependencies); } // namespace internal } // namespace blas diff --git a/src/interface/blas2/trmv.cpp.in b/src/interface/blas2/trmv.cpp.in index 62cda27fc..ab1500e10 100644 --- a/src/interface/blas2/trmv.cpp.in +++ b/src/interface/blas2/trmv.cpp.in @@ -32,6 +32,6 @@ namespace internal { template typename SB_Handle::event_t _trmv( SB_Handle& sb_handle, char _Uplo, char _trans, char _Diag, ${INDEX_TYPE} _N, ${container_t0} _mA, ${INDEX_TYPE} _lda, ${container_t1} _vx, - ${INCREMENT_TYPE} _incx); + ${INCREMENT_TYPE} _incx, const typename SB_Handle::event_t& _dependencies); } // namespace internal } // end namespace blas diff --git a/src/interface/blas2/trsv.cpp.in b/src/interface/blas2/trsv.cpp.in index f0d81a333..e88b67b6b 100644 --- a/src/interface/blas2/trsv.cpp.in +++ b/src/interface/blas2/trsv.cpp.in @@ -22,16 +22,17 @@ * @filename trsv.cpp.in * 
**************************************************************************/ -#include "sb_handle/sycl_blas_handle.hpp" -#include "sb_handle/kernel_constructor.hpp" +#include "container/sycl_iterator.hpp" #include "interface/blas2_interface.hpp" +#include "sb_handle/kernel_constructor.hpp" +#include "sb_handle/sycl_blas_handle.hpp" namespace blas { namespace internal { template typename SB_Handle::event_t _trsv( - SB_Handle& sb_handle, char _Uplo, char _trans, char _Diag, - ${INDEX_TYPE} _N, ${container_t0} _mA, ${INDEX_TYPE} _lda, - ${container_t1} _vx, ${INCREMENT_TYPE} _incx); + SB_Handle& sb_handle, char _Uplo, char _trans, char _Diag, ${INDEX_TYPE} _N, + ${container_t0} _mA, ${INDEX_TYPE} _lda, ${container_t1} _vx, + ${INCREMENT_TYPE} _incx, const typename SB_Handle::event_t& _dependencies); } // namespace internal } // end namespace blas diff --git a/src/interface/blas2_interface.hpp b/src/interface/blas2_interface.hpp index f0aefb84a..d04da477c 100644 --- a/src/interface/blas2_interface.hpp +++ b/src/interface/blas2_interface.hpp @@ -68,12 +68,11 @@ template -typename sb_handle_t::event_t _gemv_impl(sb_handle_t& sb_handle, index_t _M, - index_t _N, element_t _alpha, - container_t0 _mA, index_t _lda, - container_t1 _vx, increment_t _incx, - element_t _beta, container_t2 _vy, - increment_t _incy) { +typename sb_handle_t::event_t _gemv_impl( + sb_handle_t& sb_handle, index_t _M, index_t _N, element_t _alpha, + container_t0 _mA, index_t _lda, container_t1 _vx, increment_t _incx, + element_t _beta, container_t2 _vy, increment_t _incy, + const typename sb_handle_t::event_t& _dependencies) { constexpr int cl_elems = cache_line_size / sizeof(element_t); constexpr bool is_transposed = trn != transpose_type::Normal; @@ -84,13 +83,18 @@ typename sb_handle_t::event_t _gemv_impl(sb_handle_t& sb_handle, index_t _M, auto vx = make_vector_view(_vx, _incx, x_vector_size); auto vy = make_vector_view(_vy, _incy, y_vector_size); + constexpr bool is_usm = std::is_pointer::value; + typename sb_handle_t::event_t ret; // Non-local memory kernel if (memory_type != gemv_memory_t::local) { // Leading dimension for dot products matrix const auto ld = is_transposed ? _N : _M; constexpr index_t one = 1; - auto dot_products_buffer = blas::make_sycl_iterator_buffer(ld); + auto dot_products_buffer = blas::helper::allocate < is_usm + ? 
helper::AllocType::usm + : helper::AllocType::buffer, + element_t > (ld, sb_handle.get_queue()); auto dot_products_matrix = make_matrix_view(dot_products_buffer, ld, one, ld); @@ -101,8 +105,8 @@ typename sb_handle_t::event_t _gemv_impl(sb_handle_t& sb_handle, index_t _M, // Execute the GEMV kernel that calculate the partial dot products of rows // auto gemvEvent = sb_handle.execute(gemv, local_range, global_size); - auto gemvEvent = - sb_handle.execute(gemv, static_cast(local_range), global_size); + auto gemvEvent = sb_handle.execute(gemv, static_cast(local_range), + global_size, _dependencies); if (_beta != static_cast(0)) { // vec_y * b @@ -119,16 +123,19 @@ typename sb_handle_t::event_t _gemv_impl(sb_handle_t& sb_handle, index_t _M, auto assignOp = make_op(vy, addOp); // exectutes the above expression tree to yield the final GEMV result - return concatenate_vectors(gemvEvent, - sb_handle.execute(assignOp, local_range)); + ret = concatenate_vectors( + gemvEvent, sb_handle.execute(assignOp, local_range, gemvEvent)); + } else { auto alphaMulDotsOp = make_op(_alpha, dot_products_matrix); auto assignOp = make_op(vy, alphaMulDotsOp); - return concatenate_vectors(gemvEvent, - sb_handle.execute(assignOp, local_range)); + ret = concatenate_vectors( + gemvEvent, sb_handle.execute(assignOp, local_range, gemvEvent)); } + blas::helper::enqueue_deallocate(ret, dot_products_buffer, + sb_handle.get_queue()); } else // Local memory kernel { // Calculate number of work groups per each dimension based on the local @@ -150,8 +157,10 @@ typename sb_handle_t::event_t _gemv_impl(sb_handle_t& sb_handle, index_t _M, const auto dot_products_buffer_size = ld * WGs_per_C; // Create the dot products buffer and matrix view - auto dot_products_buffer = - blas::make_sycl_iterator_buffer(dot_products_buffer_size); + auto dot_products_buffer = blas::helper::allocate < is_usm + ? helper::AllocType::usm + : helper::AllocType::buffer, + element_t > (dot_products_buffer_size, sb_handle.get_queue()); auto dot_products_matrix = make_matrix_view(dot_products_buffer, ld, WGs_per_C, ld); @@ -162,8 +171,9 @@ typename sb_handle_t::event_t _gemv_impl(sb_handle_t& sb_handle, index_t _M, dot_products_matrix, mA, vx, WGs_per_NC, WGs_per_C); // Execute the GEMV kernel that calculate the partial dot products of rows - auto gemvEvent = sb_handle.execute(gemv, static_cast(local_range), - global_size, kernel_scratch_size); + auto gemvEvent = + sb_handle.execute(gemv, static_cast(local_range), global_size, + kernel_scratch_size, _dependencies); // Sum the partial dot products results from the GEMV kernel auto sumColsOp = make_sum_matrix_columns(dot_products_matrix); @@ -183,16 +193,20 @@ typename sb_handle_t::event_t _gemv_impl(sb_handle_t& sb_handle, index_t _M, auto assignOp = make_op(vy, addOp); // exectutes the above expression tree to yield the final GEMV result - return concatenate_vectors(gemvEvent, - sb_handle.execute(assignOp, local_range)); + ret = concatenate_vectors( + gemvEvent, sb_handle.execute(assignOp, local_range, gemvEvent)); } else { auto alphaMulDotsOp = make_op(_alpha, sumColsOp); auto assignOp = make_op(vy, alphaMulDotsOp); - return concatenate_vectors(gemvEvent, - sb_handle.execute(assignOp, local_range)); + ret = concatenate_vectors( + gemvEvent, sb_handle.execute(assignOp, local_range, gemvEvent)); } + + blas::helper::enqueue_deallocate(ret, dot_products_buffer, + sb_handle.get_queue()); } + return ret; } /*! _TRMV. 
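// Illustrative sketch only — not part of the original diff. The _gemv_impl hunk above is
// representative of the scratch-memory pattern this patch applies across blas2_interface.hpp:
// temporaries are allocated as USM or as a buffer to match the operand container type, kernels
// are chained on the caller's `_dependencies`, and the temporary is freed once the returned
// events complete. The helper names (blas::helper::allocate, AllocType, enqueue_deallocate)
// come from the surrounding hunks; the wrapper function, its parameter names, and the
// availability of the helper headers are assumptions made for illustration.
#include <type_traits>

template <typename sb_handle_t, typename container_t, typename element_t,
          typename index_t>
typename sb_handle_t::event_t scratch_lifetime_sketch(
    sb_handle_t& sb_handle, container_t /*operand*/, index_t scratch_size,
    const typename sb_handle_t::event_t& _dependencies) {
  // Raw USM pointer operands get a USM scratch allocation, buffer iterators get a buffer.
  constexpr auto alloc = std::is_pointer<container_t>::value
                             ? blas::helper::AllocType::usm
                             : blas::helper::AllocType::buffer;
  auto scratch = blas::helper::allocate<alloc, element_t>(scratch_size,
                                                          sb_handle.get_queue());

  // ... expression trees over `scratch` would be built and submitted here,
  // forwarding `_dependencies` to the first sb_handle.execute(...) call ...
  typename sb_handle_t::event_t ret;

  // Free the temporary asynchronously once `ret` completes, replacing the implicit
  // lifetime of the old make_sycl_iterator_buffer scratch.
  blas::helper::enqueue_deallocate(ret, scratch, sb_handle.get_queue());
  return ret;
}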
@@ -204,8 +218,8 @@ template ::type; - auto valT1 = blas::make_sycl_iterator_buffer(N * scratchSize); + constexpr bool is_usm = std::is_pointer::value; + auto valT1 = blas::helper::allocate < is_usm ? helper::AllocType::usm + : helper::AllocType::buffer, + element_t > (N * scratchSize, sb_handle.get_queue()); auto mat1 = make_matrix_view(valT1, N, scratchSize, scratchSize); if (data_layout_t::is_col_major()) { @@ -254,28 +271,28 @@ typename sb_handle_t::event_t _trmv_impl( auto gemvC = make_gemv_col( mat1, mA, vx, nWGPerRow, nWGPerCol, scratchPadSize); ret = concatenate_vectors( - ret, - sb_handle.execute(gemvC, localSize, globalSize, scratchPadSize)); + ret, sb_handle.execute(gemvC, localSize, globalSize, scratchPadSize, + _dependencies)); } else { auto gemvC = make_gemv_col( mat1, mA, vx, nWGPerRow, nWGPerCol, scratchPadSize); ret = concatenate_vectors( - ret, - sb_handle.execute(gemvC, localSize, globalSize, scratchPadSize)); + ret, sb_handle.execute(gemvC, localSize, globalSize, scratchPadSize, + _dependencies)); } } else { if (unitDiag == 1) { auto gemvC = make_gemv_col( mat1, mA, vx, nWGPerRow, nWGPerCol, scratchPadSize); ret = concatenate_vectors( - ret, - sb_handle.execute(gemvC, localSize, globalSize, scratchPadSize)); + ret, sb_handle.execute(gemvC, localSize, globalSize, scratchPadSize, + _dependencies)); } else { auto gemvC = make_gemv_col( mat1, mA, vx, nWGPerRow, nWGPerCol, scratchPadSize); ret = concatenate_vectors( - ret, - sb_handle.execute(gemvC, localSize, globalSize, scratchPadSize)); + ret, sb_handle.execute(gemvC, localSize, globalSize, scratchPadSize, + _dependencies)); } } } else { // row_major @@ -284,34 +301,37 @@ typename sb_handle_t::event_t _trmv_impl( auto gemvR = make_gemv_row( mat1, mA, vx, nWGPerRow, nWGPerCol, scratchPadSize); ret = concatenate_vectors( - ret, - sb_handle.execute(gemvR, localSize, globalSize, scratchPadSize)); + ret, sb_handle.execute(gemvR, localSize, globalSize, scratchPadSize, + _dependencies)); } else { auto gemvR = make_gemv_row( mat1, mA, vx, nWGPerRow, nWGPerCol, scratchPadSize); ret = concatenate_vectors( - ret, - sb_handle.execute(gemvR, localSize, globalSize, scratchPadSize)); + ret, sb_handle.execute(gemvR, localSize, globalSize, scratchPadSize, + _dependencies)); } } else { if (unitDiag == 1) { auto gemvR = make_gemv_row( mat1, mA, vx, nWGPerRow, nWGPerCol, scratchPadSize); ret = concatenate_vectors( - ret, - sb_handle.execute(gemvR, localSize, globalSize, scratchPadSize)); + ret, sb_handle.execute(gemvR, localSize, globalSize, scratchPadSize, + _dependencies)); } else { auto gemvR = make_gemv_row( mat1, mA, vx, nWGPerRow, nWGPerCol, scratchPadSize); ret = concatenate_vectors( - ret, - sb_handle.execute(gemvR, localSize, globalSize, scratchPadSize)); + ret, sb_handle.execute(gemvR, localSize, globalSize, scratchPadSize, + _dependencies)); } } } auto addMOp = make_sum_matrix_columns(mat1); auto assignOp = make_op(vx, addMOp); - ret = concatenate_vectors(ret, sb_handle.execute(assignOp, localSize)); + ret = concatenate_vectors(ret, sb_handle.execute(assignOp, localSize, ret)); + + blas::helper::enqueue_deallocate(ret, valT1, sb_handle.get_queue()); + return ret; } @@ -324,13 +344,13 @@ template typename sb_handle_t::event_t _trsv_impl(sb_handle_t& sb_handle, index_t _N, container_t0 _mA, index_t _lda, - container_t1 _vx, increment_t _incx) { + container_t1 _vx, increment_t _incx, + const typename sb_handle_t::event_t& _dependencies) { #if (SYCL_LANGUAGE_VERSION < 202000) || (defined __HIPSYCL__) throw std::runtime_error("trsv 
requires SYCL 2020"); #else static_assert(subgroup_size % subgroups == 0, "`subgroups` needs to be a multiple of `subgroup_size`."); - using one = constant; constexpr bool is_upper = (uplo == uplo_type::Upper); constexpr bool is_transposed = (trn != transpose_type::Normal); @@ -348,19 +368,32 @@ typename sb_handle_t::event_t _trsv_impl(sb_handle_t& sb_handle, index_t _N, : ((roundUp(_N, subgroup_size) / subgroup_size) - 1); sync_vec[1] = sync_vec[0]; - auto sync_buffer = - blas::make_sycl_iterator_buffer(sync_vec, sync_vec.size()); - auto sync = make_vector_view(sync_buffer, one::value(), sync_vec.size()); + auto queue = sb_handle.get_queue(); + constexpr bool is_usm = std::is_pointer::value; + auto sync_buffer = blas::helper::allocate < is_usm + ? blas::helper::AllocType::usm + : blas::helper::AllocType::buffer, + int32_t > (sync_vec.size(), queue); + auto copy_sync = blas::helper::copy_to_device( + queue, sync_vec.data(), sync_buffer, sync_vec.size()); + sb_handle.wait(copy_sync); + + auto sync = make_vector_view(sync_buffer, 1, sync_vec.size()); auto trsv = make_trsv( vx, mA, sync); const index_t sub_num = subgroups; - return sb_handle.execute( + + auto ret = sb_handle.execute( trsv, static_cast(sub_num * subgroup_size), roundUp(sub_num * _N, sub_num * subgroup_size), - static_cast(subgroup_size * (subgroup_size + 2 + sub_num))); + static_cast(subgroup_size * (subgroup_size + 2 + sub_num)), _dependencies); + + blas::helper::enqueue_deallocate(ret, sync_buffer, queue); + + return ret; #endif } @@ -386,8 +419,8 @@ typename sb_handle_t::event_t _symv_impl( sb_handle_t& sb_handle, char _Uplo, index_t _N, element_t _alpha, container_t0 _mA, index_t _lda, container_t1 _vx, increment_t _incx, element_t _beta, container_t2 _vy, increment_t _incy, - index_t _localSize = 0, index_t _scratchPadSize = 0, index_t _nRowsWG = 0, - index_t _nColsWG = 0) { + const typename sb_handle_t::event_t& _dependencies, index_t _localSize = 0, + index_t _scratchPadSize = 0, index_t _nRowsWG = 0, index_t _nColsWG = 0) { _Uplo = tolower(_Uplo); typename sb_handle_t::event_t ret; if ((_Uplo != 'u') && (_Uplo != 'l')) { @@ -424,13 +457,18 @@ typename sb_handle_t::event_t _symv_impl( const index_t scratchSize_R = ((scratchPadSize == 0) ? std::min(N, localSize) : 1) * nWGPerCol_R; - auto valTR = blas::make_sycl_iterator_buffer(N * scratchSize_R); + constexpr bool is_usm = std::is_pointer::value; + auto valTR = blas::helper::allocate < is_usm ? helper::AllocType::usm + : helper::AllocType::buffer, + element_t > (N * scratchSize_R, sb_handle.get_queue()); auto matR = make_matrix_view(valTR, N, scratchSize_R, scratchSize_R); const index_t scratchSize_C = nWGPerCol_C; - auto valTC = blas::make_sycl_iterator_buffer(N * scratchSize_C); + auto valTC = blas::helper::allocate < is_usm ? 
helper::AllocType::usm + : helper::AllocType::buffer, + element_t > (N * scratchSize_C, sb_handle.get_queue()); auto matC = make_matrix_view(valTC, N, scratchSize_C, scratchSize_C); @@ -440,18 +478,22 @@ typename sb_handle_t::event_t _symv_impl( auto gemvR = make_gemv_row( matR, mAT, vx, nWGPerRow_R, nWGPerCol_R, scratchPadSize); ret = concatenate_vectors( - ret, sb_handle.execute(gemvC, localSize, globalSize_C, scratchPadSize)); + ret, sb_handle.execute(gemvC, localSize, globalSize_C, scratchPadSize, + _dependencies)); ret = concatenate_vectors( - ret, sb_handle.execute(gemvR, localSize, globalSize_R, scratchPadSize)); + ret, + sb_handle.execute(gemvR, localSize, globalSize_R, scratchPadSize, ret)); } else { auto gemvC = make_gemv_col(matC, mA, vx, nWGPerRow_C, nWGPerCol_C, scratchPadSize); auto gemvR = make_gemv_row( matR, mAT, vx, nWGPerRow_R, nWGPerCol_R, scratchPadSize); ret = concatenate_vectors( - ret, sb_handle.execute(gemvC, localSize, globalSize_C, scratchPadSize)); + ret, sb_handle.execute(gemvC, localSize, globalSize_C, scratchPadSize, + _dependencies)); ret = concatenate_vectors( - ret, sb_handle.execute(gemvR, localSize, globalSize_R, scratchPadSize)); + ret, + sb_handle.execute(gemvR, localSize, globalSize_R, scratchPadSize, ret)); } auto scalOp1 = make_op(_beta, vy); @@ -461,7 +503,11 @@ typename sb_handle_t::event_t _symv_impl( auto scalOp2 = make_op(_alpha, addMOp); auto addOp = make_op(scalOp1, scalOp2); auto assignOp = make_op(vy, addOp); - ret = concatenate_vectors(ret, sb_handle.execute(assignOp, localSize)); + ret = concatenate_vectors(ret, sb_handle.execute(assignOp, localSize, ret)); + + blas::helper::enqueue_deallocate(ret, valTR, sb_handle.get_queue()); + blas::helper::enqueue_deallocate(ret, valTC, sb_handle.get_queue()); + return ret; } @@ -472,12 +518,11 @@ typename sb_handle_t::event_t _symv_impl( template -typename sb_handle_t::event_t _gbmv_impl(sb_handle_t& sb_handle, index_t _M, - index_t _N, index_t _KL, index_t _KU, - element_t _alpha, container_t0 _mA, - index_t _lda, container_t1 _vx, - increment_t _incx, element_t _beta, - container_t2 _vy, increment_t _incy) { +typename sb_handle_t::event_t _gbmv_impl( + sb_handle_t& sb_handle, index_t _M, index_t _N, index_t _KL, index_t _KU, + element_t _alpha, container_t0 _mA, index_t _lda, container_t1 _vx, + increment_t _incx, element_t _beta, container_t2 _vy, increment_t _incy, + const typename sb_handle_t::event_t& _dependencies) { if ((_KL >= _M) || (_KU >= _N)) { throw std::invalid_argument("Erroneous parameter: _KL >= _M || _KU >= _N"); } @@ -496,7 +541,8 @@ typename sb_handle_t::event_t _gbmv_impl(sb_handle_t& sb_handle, index_t _M, _beta, vy); return sb_handle.execute(gbmv, static_cast(local_range), - roundUp(y_vector_size, local_range)); + roundUp(y_vector_size, local_range), + _dependencies); } /*! _sbmv_impl. 
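// Illustrative caller-side sketch — not part of the original diff. It shows how the
// trailing `_dependencies` argument added throughout this patch lets a BLAS-2 call wait
// on prior host-to-device copies without an intermediate sb_handle.wait(). The helper
// calls mirror those used elsewhere in the patch; the umbrella include, the assumption
// that the public `_gemv` entry point forwards to the internal signature shown here, and
// the m/n/alpha/beta/a/x/y names are placeholders for illustration.
#include <vector>
#include "sycl_blas.hpp"  // assumed umbrella header for blas::SB_Handle and _gemv

blas::SB_Handle::event_t gemv_usm_with_deps(blas::SB_Handle& sb_handle, int m,
                                            int n, float alpha, float beta,
                                            const std::vector<float>& a,
                                            const std::vector<float>& x,
                                            std::vector<float>& y) {
  auto q = sb_handle.get_queue();
  auto d_a = blas::helper::allocate<blas::helper::AllocType::usm, float>(m * n, q);
  auto d_x = blas::helper::allocate<blas::helper::AllocType::usm, float>(n, q);
  auto d_y = blas::helper::allocate<blas::helper::AllocType::usm, float>(m, q);

  auto cp_a = blas::helper::copy_to_device(q, a.data(), d_a, m * n);
  auto cp_x = blas::helper::copy_to_device(q, x.data(), d_x, n);
  auto cp_y = blas::helper::copy_to_device(q, y.data(), d_y, m);

  // The copy events become the gemv dependency vector (empty by default), so no
  // blocking wait is needed between the copies and the kernel launch.
  auto gemv_ev = blas::_gemv(sb_handle, 'n', m, n, alpha, d_a, m, d_x, 1, beta,
                             d_y, 1, {cp_a, cp_x, cp_y});
  sb_handle.wait(gemv_ev);

  auto cp_back = blas::helper::copy_to_host(q, d_y, y.data(), m);
  sb_handle.wait(cp_back);

  // Device temporaries are released once all work has completed.
  blas::helper::deallocate(d_a, q);
  blas::helper::deallocate(d_x, q);
  blas::helper::deallocate(d_y, q);
  return gemv_ev;
}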
@@ -506,12 +552,11 @@ typename sb_handle_t::event_t _gbmv_impl(sb_handle_t& sb_handle, index_t _M, template -typename sb_handle_t::event_t _sbmv_impl(sb_handle_t& sb_handle, index_t _N, - index_t _K, element_t _alpha, - container_t0 _mA, index_t _lda, - container_t1 _vx, increment_t _incx, - element_t _beta, container_t2 _vy, - increment_t _incy) { +typename sb_handle_t::event_t _sbmv_impl( + sb_handle_t& sb_handle, index_t _N, index_t _K, element_t _alpha, + container_t0 _mA, index_t _lda, container_t1 _vx, increment_t _incx, + element_t _beta, container_t2 _vy, increment_t _incy, + const typename sb_handle_t::event_t& _dependencies) { if (_K >= _N) { throw std::invalid_argument("Erroneous parameter: _K >= _N"); } @@ -526,7 +571,8 @@ typename sb_handle_t::event_t _sbmv_impl(sb_handle_t& sb_handle, index_t _N, vx, _beta, vy); return sb_handle.execute(sbmv, static_cast(local_range), - roundUp(vector_size, local_range)); + roundUp(vector_size, local_range), + _dependencies); } /*! _spmv_impl. @@ -537,11 +583,10 @@ template -typename sb_handle_t::event_t _spmv_impl(sb_handle_t& sb_handle, index_t _N, - element_t _alpha, container_t0 _mA, - container_t1 _vx, increment_t _incx, - element_t _beta, container_t2 _vy, - increment_t _incy) { +typename sb_handle_t::event_t _spmv_impl( + sb_handle_t& sb_handle, index_t _N, element_t _alpha, container_t0 _mA, + container_t1 _vx, increment_t _incx, element_t _beta, container_t2 _vy, + increment_t _incy, const typename sb_handle_t::event_t& _dependencies) { static_assert(local_range_x % local_range_y == 0, "Local y range needs to be a multiple of local x range."); @@ -566,16 +611,17 @@ typename sb_handle_t::event_t _spmv_impl(sb_handle_t& sb_handle, index_t _N, spmv, static_cast(local_range_y * local_range_x), roundUp(local_range_y * vector_size, local_range_y * local_range_x), - static_cast(local_range_x * (loc_mem_leading_dim + 2))); + static_cast(local_range_x * (loc_mem_leading_dim + 2)), + _dependencies); } template -typename sb_handle_t::event_t _tbmv_impl(sb_handle_t& sb_handle, index_t _N, - index_t _K, container_t0 _mA, - index_t _lda, container_t1 _vx, - increment_t _incx) { +typename sb_handle_t::event_t _tbmv_impl( + sb_handle_t& sb_handle, index_t _N, index_t _K, container_t0 _mA, + index_t _lda, container_t1 _vx, increment_t _incx, + const typename sb_handle_t::event_t& _dependencies) { constexpr bool is_upper = (uplo == uplo_type::Upper); constexpr bool is_transposed = (trn != transpose_type::Normal); constexpr bool is_unit = (diag == diag_type::Unit); @@ -585,10 +631,12 @@ typename sb_handle_t::event_t _tbmv_impl(sb_handle_t& sb_handle, index_t _N, } using one = constant; + constexpr bool is_usm = std::is_pointer::value; + using element_t = typename ValueType::type; auto x_vector_size = _N; - auto res_buffer = - blas::make_sycl_iterator_buffer( - x_vector_size); + auto res_buffer = blas::helper::allocate < is_usm ? 
helper::AllocType::usm + : helper::AllocType::buffer, + element_t > (x_vector_size, sb_handle.get_queue()); auto mA = make_matrix_view(_mA, _K + 1, _N, _lda); auto vx = make_vector_view(_vx, _incx, x_vector_size); @@ -598,21 +646,25 @@ typename sb_handle_t::event_t _tbmv_impl(sb_handle_t& sb_handle, index_t _N, auto tbmv = make_tbmv(vres, mA, _K, vx); - auto tbmvEvent = - sb_handle.execute(tbmv, static_cast(local_range), global_size); + auto tbmvEvent = sb_handle.execute(tbmv, static_cast(local_range), + global_size, _dependencies); auto assignOp = make_op(vx, vres); - return concatenate_vectors(tbmvEvent, - sb_handle.execute(assignOp, local_range)); + auto ret = concatenate_vectors( + tbmvEvent, sb_handle.execute(assignOp, local_range, _dependencies)); + + blas::helper::enqueue_deallocate(ret, res_buffer, sb_handle.get_queue()); + + return ret; } template -typename sb_handle_t::event_t _tpmv_impl(sb_handle_t& sb_handle, index_t _N, - container_t0 _mA, container_t1 _vx, - increment_t _incx) { +typename sb_handle_t::event_t _tpmv_impl( + sb_handle_t& sb_handle, index_t _N, container_t0 _mA, container_t1 _vx, + increment_t _incx, const typename sb_handle_t::event_t& _dependencies) { static_assert(local_range_x % local_range_y == 0, "Local y range needs to be a multiple of local x range."); @@ -624,16 +676,18 @@ typename sb_handle_t::event_t _tpmv_impl(sb_handle_t& sb_handle, index_t _N, index_t vector_size = _N; index_t matrix_size = ((_N + 1) * _N) / 2; + using element_t = typename ValueType::type; + constexpr bool is_usm = std::is_pointer::value; - auto res_buffer = - blas::make_sycl_iterator_buffer( - vector_size); + auto res_buffer = blas::helper::allocate < is_usm ? helper::AllocType::usm + : helper::AllocType::buffer, + element_t > (vector_size, sb_handle.get_queue()); auto mA = make_matrix_view(_mA, one, matrix_size, matrix_size); auto vx = make_vector_view(_vx, _incx, vector_size); auto vres = make_vector_view(res_buffer, one, vector_size); - typename container_t0::scalar_t unused; + element_t unused; auto tpmv = make_xpmv(unused, mA, vx, unused, vres); @@ -644,10 +698,16 @@ typename sb_handle_t::event_t _tpmv_impl(sb_handle_t& sb_handle, index_t _N, tpmv, static_cast(local_range_y * local_range_x), roundUp(local_range_y * vector_size, local_range_y * local_range_x), - static_cast(local_range_x * (loc_mem_leading_dim + 2))); + static_cast(local_range_x * (loc_mem_leading_dim + 2)), + _dependencies); auto assignOp = make_op(vx, vres); - return concatenate_vectors(tpmvEvent, sb_handle.execute(assignOp)); + auto ret = + concatenate_vectors(tpmvEvent, sb_handle.execute(assignOp, tpmvEvent)); + + blas::helper::enqueue_deallocate(ret, res_buffer, sb_handle.get_queue()); + + return ret; } template = _N) throw std::invalid_argument("Erroneous parameter: _K >= _N"); using one = constant; @@ -683,19 +745,32 @@ typename sb_handle_t::event_t _tbsv_impl(sb_handle_t& sb_handle, index_t _N, : ((roundUp(_N, subgroup_size) / subgroup_size) - 1); sync_vec[1] = sync_vec[0]; - auto sync_buffer = - blas::make_sycl_iterator_buffer(sync_vec, sync_vec.size()); - auto sync = make_vector_view(sync_buffer, one::value(), sync_vec.size()); + constexpr bool is_usm = std::is_pointer::value; + auto queue = sb_handle.get_queue(); + + auto sync_buffer = blas::helper::allocate < is_usm + ? 
blas::helper::AllocType::usm + : blas::helper::AllocType::buffer, + int32_t > (sync_vec.size(), queue); + auto copy_sync = blas::helper::copy_to_device( + queue, sync_vec.data(), sync_buffer, sync_vec.size()); + sb_handle.wait(copy_sync); + + auto sync = make_vector_view(sync_buffer, 1, sync_vec.size()); auto tbsv = make_tbsv( vx, mA, _K, sync); const index_t sub_num = subgroups; - return sb_handle.execute( + auto ret = sb_handle.execute( tbsv, static_cast(sub_num * subgroup_size), roundUp(sub_num * _N, sub_num * subgroup_size), - static_cast(subgroup_size * (subgroup_size + 2 + sub_num))); + static_cast(subgroup_size * (subgroup_size + 2 + sub_num)), _dependencies); + + blas::helper::enqueue_deallocate(ret, sync_buffer, queue); + + return ret; #endif } @@ -705,7 +780,8 @@ template typename sb_handle_t::event_t _tpsv_impl(sb_handle_t& sb_handle, index_t _N, container_t0 _mA, container_t1 _vx, - increment_t _incx) { + increment_t _incx, + const typename sb_handle_t::event_t& _dependencies) { #if (SYCL_LANGUAGE_VERSION < 202000) || (defined __HIPSYCL__) throw std::runtime_error("tpsv requires SYCL 2020"); #else @@ -746,7 +822,8 @@ typename sb_handle_t::event_t _tpsv_impl(sb_handle_t& sb_handle, index_t _N, return sb_handle.execute( tpsv, static_cast(sub_num * subgroup_size), roundUp(sub_num * _N, sub_num * subgroup_size), - static_cast(subgroup_size * (subgroup_size + 2 + sub_num))); + static_cast(subgroup_size * (subgroup_size + 2 + sub_num)), + _dependencies); #endif } @@ -758,7 +835,8 @@ template ( mA, _alpha, vx, vx, nWGPerRow, nWGPerCol, scratchPadSize); return ret = concatenate_vectors( ret, sb_handle.execute(assignOp, localSize, globalSize, - scratchPadSize)); + scratchPadSize, _dependencies)); } } @@ -852,10 +931,10 @@ sspr ( character UPLO, */ template -typename sb_handle_t::event_t _spr_impl(sb_handle_t& sb_handle, char _Uplo, - index_t _N, element_t _alpha, - container_t0 _vx, increment_t _incx, - container_t1 _mPA) { +typename sb_handle_t::event_t _spr_impl( + sb_handle_t& sb_handle, char _Uplo, index_t _N, element_t _alpha, + container_t0 _vx, increment_t _incx, container_t1 _mPA, + const typename sb_handle_t::event_t& _dependencies) { // throw exception if invalid arguments if (_N <= 0) { throw std::invalid_argument("Invalid vector size"); @@ -882,11 +961,13 @@ typename sb_handle_t::event_t _spr_impl(sb_handle_t& sb_handle, char _Uplo, if (Upper) { auto spr = make_spr(mA, _N, _alpha, vx, _incx, vx, _incx); return ret = concatenate_vectors( - ret, sb_handle.execute(spr, localSize, globalSize)); + ret, + sb_handle.execute(spr, localSize, globalSize, _dependencies)); } else { auto spr = make_spr(mA, _N, _alpha, vx, _incx, vx, _incx); return ret = concatenate_vectors( - ret, sb_handle.execute(spr, localSize, globalSize)); + ret, + sb_handle.execute(spr, localSize, globalSize, _dependencies)); } } @@ -907,11 +988,10 @@ sspr2 ( character UPLO, template -typename sb_handle_t::event_t _spr2_impl(sb_handle_t& sb_handle, char _Uplo, - index_t _N, element_t _alpha, - container_t0 _vx, increment_t _incx, - container_t1 _vy, increment_t _incy, - container_t2 _mPA) { +typename sb_handle_t::event_t _spr2_impl( + sb_handle_t& sb_handle, char _Uplo, index_t _N, element_t _alpha, + container_t0 _vx, increment_t _incx, container_t1 _vy, increment_t _incy, + container_t2 _mPA, const typename sb_handle_t::event_t& _dependencies) { // throw exception if invalid arguments if (_N <= 0) { throw std::invalid_argument("Invalid vector size"); @@ -939,11 +1019,13 @@ typename sb_handle_t::event_t 
_spr2_impl(sb_handle_t& sb_handle, char _Uplo, if (Upper) { auto spr2 = make_spr(mA, _N, _alpha, vx, _incx, vy, _incy); return ret = concatenate_vectors( - ret, sb_handle.execute(spr2, localSize, globalSize)); + ret, + sb_handle.execute(spr2, localSize, globalSize, _dependencies)); } else { auto spr2 = make_spr(mA, _N, _alpha, vx, _incx, vy, _incy); return ret = concatenate_vectors( - ret, sb_handle.execute(spr2, localSize, globalSize)); + ret, + sb_handle.execute(spr2, localSize, globalSize, _dependencies)); } } @@ -965,7 +1047,8 @@ template ( mA, _alpha, vx, vy, nWGPerRow, nWGPerCol, scratchPadSize); - return sb_handle.execute(assignOp, localSize, globalSize, scratchPadSize); + return sb_handle.execute(assignOp, localSize, globalSize, scratchPadSize, + _dependencies); } else { auto assignOp = make_ger_col( mA, _alpha, vx, vy, nWGPerRow, nWGPerCol, scratchPadSize); - return sb_handle.execute(assignOp, localSize, globalSize, scratchPadSize); + return sb_handle.execute(assignOp, localSize, globalSize, scratchPadSize, + _dependencies); } } @@ -1030,31 +1115,32 @@ typename sb_handle_t::event_t inline _gemv( // when trans = "n" and (1+(n-1)*abs(incy) otherwise, // containing the vector "y" (if beta is nonzero). When // finished, y is overwritten with the updated vector. - increment_t _incy // The increment for elements in y (nonzero). -) { + increment_t _incy, // The increment for elements in y (nonzero). + const typename sb_handle_t::event_t& _dependencies) { return tolower(_trans) == 'n' ? blas::gemv::backend::_gemv( sb_handle, _M, _N, _alpha, _mA, _lda, _vx, _incx, _beta, _vy, - _incy) + _incy, _dependencies) : blas::gemv::backend::_gemv( sb_handle, _M, _N, _alpha, _mA, _lda, _vx, _incx, _beta, _vy, - _incy); + _incy, _dependencies); } template -typename sb_handle_t::event_t inline _trmv(sb_handle_t& sb_handle, char _Uplo, - char _trans, char _Diag, index_t _N, - container_t0 _mA, index_t _lda, - container_t1 _vx, - increment_t _incx) { +typename sb_handle_t::event_t inline _trmv( + sb_handle_t& sb_handle, char _Uplo, char _trans, char _Diag, index_t _N, + container_t0 _mA, index_t _lda, container_t1 _vx, increment_t _incx, + const typename sb_handle_t::event_t& _dependencies) { // TODO: Here we can use some heuristics to select localn global, local, and // scratch size per device return tolower(_trans) == 'n' ? _trmv_impl(sb_handle, _Uplo, _Diag, _N, - _mA, _lda, _vx, _incx) - : _trmv_impl( - sb_handle, _Uplo, _Diag, _N, _mA, _lda, _vx, _incx); + _mA, _lda, _vx, _incx, + _dependencies) + : _trmv_impl(sb_handle, _Uplo, _Diag, + _N, _mA, _lda, _vx, _incx, + _dependencies); } #define INST_UPLO_TRANS_DIAG(func, ...) 
\ @@ -1098,61 +1184,57 @@ typename sb_handle_t::event_t inline _trmv(sb_handle_t& sb_handle, char _Uplo, template -typename sb_handle_t::event_t inline _trsv(sb_handle_t& sb_handle, char _Uplo, - char _trans, char _Diag, index_t _N, - container_t0 _mA, index_t _lda, - container_t1 _vx, - increment_t _incx) { +typename sb_handle_t::event_t inline _trsv( + sb_handle_t& sb_handle, char _Uplo, char _trans, char _Diag, index_t _N, + container_t0 _mA, index_t _lda, container_t1 _vx, increment_t _incx, + const typename sb_handle_t::event_t& _dependencies) { INST_UPLO_TRANS_DIAG(blas::trsv::backend::_trsv, sb_handle, _N, _mA, _lda, - _vx, _incx) + _vx, _incx, _dependencies) } template -typename sb_handle_t::event_t inline _symv(sb_handle_t& sb_handle, char _Uplo, - index_t _N, element_t _alpha, - container_t0 _mA, index_t _lda, - container_t1 _vx, increment_t _incx, - element_t _beta, container_t2 _vy, - increment_t _incy) { +typename sb_handle_t::event_t inline _symv( + sb_handle_t& sb_handle, char _Uplo, index_t _N, element_t _alpha, + container_t0 _mA, index_t _lda, container_t1 _vx, increment_t _incx, + element_t _beta, container_t2 _vy, increment_t _incy, + const typename sb_handle_t::event_t& _dependencies) { // TODO: Here we can use some heuristics to select localn global, local, and // scratch size per device return _symv_impl(sb_handle, _Uplo, _N, _alpha, _mA, _lda, _vx, _incx, _beta, - _vy, _incy); + _vy, _incy, _dependencies); } template -typename sb_handle_t::event_t inline _gbmv(sb_handle_t& sb_handle, char _trans, - index_t _M, index_t _N, index_t _KL, - index_t _KU, element_t _alpha, - container_t0 _mA, index_t _lda, - container_t1 _vx, increment_t _incx, - element_t _beta, container_t2 _vy, - increment_t _incy) { +typename sb_handle_t::event_t inline _gbmv( + sb_handle_t& sb_handle, char _trans, index_t _M, index_t _N, index_t _KL, + index_t _KU, element_t _alpha, container_t0 _mA, index_t _lda, + container_t1 _vx, increment_t _incx, element_t _beta, container_t2 _vy, + increment_t _incy, const typename sb_handle_t::event_t& _dependencies) { return tolower(_trans) == 'n' ? 
blas::gbmv::backend::_gbmv( sb_handle, _M, _N, _KL, _KU, _alpha, _mA, _lda, _vx, _incx, - _beta, _vy, _incy) + _beta, _vy, _incy, _dependencies) : blas::gbmv::backend::_gbmv( sb_handle, _M, _N, _KL, _KU, _alpha, _mA, _lda, _vx, _incx, - _beta, _vy, _incy); + _beta, _vy, _incy, _dependencies); } template -typename sb_handle_t::event_t inline _ger(sb_handle_t& sb_handle, index_t _M, - index_t _N, element_t _alpha, - container_t0 _vx, increment_t _incx, - container_t1 _vy, increment_t _incy, - container_t2 _mA, index_t _lda) { +typename sb_handle_t::event_t inline _ger( + sb_handle_t& sb_handle, index_t _M, index_t _N, element_t _alpha, + container_t0 _vx, increment_t _incx, container_t1 _vy, increment_t _incy, + container_t2 _mA, index_t _lda, + const typename sb_handle_t::event_t& _dependencies) { // TODO: Here we can use some heuristics to select localn global, local, and // scratch size per device - return _ger_impl(sb_handle, _M, _N, _alpha, _vx, _incx, _vy, _incy, _mA, - _lda); + return _ger_impl(sb_handle, _M, _N, _alpha, _vx, _incx, _vy, _incy, _mA, _lda, + _dependencies); } template ( sb_handle, _N, _K, _alpha, _mA, _lda, _vx, - _incx, _beta, _vy, _incy) + _incx, _beta, _vy, _incy, _dependencies) : blas::sbmv::backend::_sbmv( sb_handle, _N, _K, _alpha, _mA, _lda, _vx, - _incx, _beta, _vy, _incy); + _incx, _beta, _vy, _incy, _dependencies); } template -typename sb_handle_t::event_t inline _spmv(sb_handle_t& sb_handle, char _Uplo, - index_t _N, element_t _alpha, - container_t0 _mA, container_t1 _vx, - increment_t _incx, element_t _beta, - container_t2 _vy, - increment_t _incy) { - return tolower(_Uplo) == 'u' - ? blas::spmv::backend::_spmv( - sb_handle, _N, _alpha, _mA, _vx, _incx, _beta, _vy, _incy) - : blas::spmv::backend::_spmv( - sb_handle, _N, _alpha, _mA, _vx, _incx, _beta, _vy, _incy); +typename sb_handle_t::event_t inline _spmv( + sb_handle_t& sb_handle, char _Uplo, index_t _N, element_t _alpha, + container_t0 _mA, container_t1 _vx, increment_t _incx, element_t _beta, + container_t2 _vy, increment_t _incy, + const typename sb_handle_t::event_t& _dependencies) { + return tolower(_Uplo) == 'u' ? 
blas::spmv::backend::_spmv( + sb_handle, _N, _alpha, _mA, _vx, _incx, + _beta, _vy, _incy, _dependencies) + : blas::spmv::backend::_spmv( + sb_handle, _N, _alpha, _mA, _vx, _incx, + _beta, _vy, _incy, _dependencies); } template -typename sb_handle_t::event_t inline _syr(sb_handle_t& sb_handle, char _Uplo, - index_t _N, element_t _alpha, - container_t0 _vx, increment_t _incx, - container_t1 _mA, index_t _lda) { +typename sb_handle_t::event_t inline _syr( + sb_handle_t& sb_handle, char _Uplo, index_t _N, element_t _alpha, + container_t0 _vx, increment_t _incx, container_t1 _mA, index_t _lda, + const typename sb_handle_t::event_t& _dependencies) { // TODO: Here we can use some heuristics to select localn global, local, and // scratch size per device - return _syr_impl(sb_handle, _Uplo, _N, _alpha, _vx, _incx, _mA, _lda); + return _syr_impl(sb_handle, _Uplo, _N, _alpha, _vx, _incx, _mA, _lda, + _dependencies); } template -typename sb_handle_t::event_t inline _spr(sb_handle_t& sb_handle, char _Uplo, - index_t _N, element_t _alpha, - container_t0 _vx, increment_t _incx, - container_t1 _mPA) { - return _spr_impl(sb_handle, _Uplo, _N, _alpha, _vx, _incx, - _mPA); +typename sb_handle_t::event_t inline _spr( + sb_handle_t& sb_handle, char _Uplo, index_t _N, element_t _alpha, + container_t0 _vx, increment_t _incx, container_t1 _mPA, + const typename sb_handle_t::event_t& _dependencies) { + return _spr_impl(sb_handle, _Uplo, _N, _alpha, _vx, _incx, _mPA, + _dependencies); } template -typename sb_handle_t::event_t inline _spr2(sb_handle_t& sb_handle, char _Uplo, - index_t _N, element_t _alpha, - container_t0 _vx, increment_t _incx, - container_t1 _vy, increment_t _incy, - container_t2 _mPA) { +typename sb_handle_t::event_t inline _spr2( + sb_handle_t& sb_handle, char _Uplo, index_t _N, element_t _alpha, + container_t0 _vx, increment_t _incx, container_t1 _vy, increment_t _incy, + container_t2 _mPA, const typename sb_handle_t::event_t& _dependencies) { return _spr2_impl(sb_handle, _Uplo, _N, _alpha, - _vx, _incx, _vy, _incy, _mPA); + _vx, _incx, _vy, _incy, _mPA, + _dependencies); } template -typename sb_handle_t::event_t inline _syr2(sb_handle_t& sb_handle, char _Uplo, - index_t _N, element_t _alpha, - container_t0 _vx, increment_t _incx, - container_t1 _vy, increment_t _incy, - container_t2 _mA, index_t _lda) { +typename sb_handle_t::event_t inline _syr2( + sb_handle_t& sb_handle, char _Uplo, index_t _N, element_t _alpha, + container_t0 _vx, increment_t _incx, container_t1 _vy, increment_t _incy, + container_t2 _mA, index_t _lda, + const typename sb_handle_t::event_t& _dependencies) { // TODO: Here we can use some heuristics to select localn global, local, and // scratch size per device return _syr2_impl(sb_handle, _Uplo, _N, _alpha, _vx, _incx, _vy, _incy, _mA, - _lda); + _lda, _dependencies); } + template -typename sb_handle_t::event_t _tbmv(sb_handle_t& sb_handle, char _Uplo, - char _trans, char _Diag, index_t _N, - index_t _K, container_t0 _mA, index_t _lda, - container_t1 _vx, increment_t _incx) { +typename sb_handle_t::event_t _tbmv( + sb_handle_t& sb_handle, char _Uplo, char _trans, char _Diag, index_t _N, + index_t _K, container_t0 _mA, index_t _lda, container_t1 _vx, + increment_t _incx, const typename sb_handle_t::event_t& _dependencies) { INST_UPLO_TRANS_DIAG(blas::tbmv::backend::_tbmv, sb_handle, _N, _K, _mA, _lda, - _vx, _incx) + _vx, _incx, _dependencies) } template @@ -1254,23 +1338,35 @@ typename sb_handle_t::event_t _tpmv(sb_handle_t& sb_handle, char _Uplo, } template -typename 
sb_handle_t::event_t _tbsv(sb_handle_t& sb_handle, char _Uplo, - char _trans, char _Diag, index_t _N, - index_t _K, container_t0 _mA, index_t _lda, - container_t1 _vx, increment_t _incx) { +typename sb_handle_t::event_t _tbsv( + sb_handle_t& sb_handle, char _Uplo, char _trans, char _Diag, index_t _N, + index_t _K, container_t0 _mA, index_t _lda, container_t1 _vx, + increment_t _incx, const typename sb_handle_t::event_t& _dependencies) { INST_UPLO_TRANS_DIAG(blas::tbsv::backend::_tbsv, sb_handle, _N, _K, _mA, _lda, - _vx, _incx) + _vx, _incx, _dependencies) +} + +template +typename sb_handle_t::event_t _tpmv( + sb_handle_t& sb_handle, char _Uplo, char _trans, char _Diag, index_t _N, + container_t0 _mA, container_t1 _vx, increment_t _incx, + const typename sb_handle_t::event_t& _dependencies) { +INST_UPLO_TRANS_DIAG(blas::tpmv::backend::_tpmv, sb_handle, _N, _mA, _vx, + _incx, _dependencies) } + template typename sb_handle_t::event_t _tpsv(sb_handle_t& sb_handle, char _Uplo, char _trans, char _Diag, index_t _N, container_t0 _mA, container_t1 _vx, - increment_t _incx) { + increment_t _incx, +const typename sb_handle_t::event_t& _dependencies) { INST_UPLO_TRANS_DIAG(blas::tpsv::backend::_tpsv, sb_handle, _N, _mA, _vx, - _incx) + _incx, _dependencies) } } // namespace internal } // namespace blas -#endif // BLAS2_INTERFACE_HPP +#endif // SYCL_BLAS_BLAS2_INTERFACE_HPP diff --git a/src/interface/blas3/backend/amd_gpu.hpp b/src/interface/blas3/backend/amd_gpu.hpp index b2afcd849..2f603e164 100644 --- a/src/interface/blas3/backend/amd_gpu.hpp +++ b/src/interface/blas3/backend/amd_gpu.hpp @@ -38,19 +38,21 @@ typename sb_handle_t::event_t _gemm( element_t _alpha, container_0_t _a, index_t _lda, index_t _stridea, container_1_t _b, index_t _ldb, index_t _strideb, element_t _beta, container_2_t _c, index_t _ldc, index_t _stridec, index_t batch_size, - gemm_batch_type_t batch_type) { + gemm_batch_type_t batch_type, + const typename sb_handle_t::event_t& _dependencies) { static constexpr int ClSize = 64; static constexpr int tileWgSize = ClSize / sizeof(element_t); if (batch_type == gemm_batch_type_t::interleaved) { return blas::Gemm_Launcher< - 64, false, false, false, 64, Tile<4, 4, 4, 4, 1, 1, 1, 1, 4, 4>, _t_a, - _t_b, s_a, s_b, static_cast(gemm_memory_t::no_local), + container_0_t, container_1_t, container_2_t, 64, false, false, false, + 64, Tile<4, 4, 4, 4, 1, 1, 1, 1, 4, 4>, _t_a, _t_b, s_a, s_b, + static_cast(gemm_memory_t::no_local), static_cast(gemm_algorithm_t::standard), static_cast(gemm_vectorization_t::full), is_beta_zero, 4, static_cast(gemm_batch_type_t::interleaved)>:: template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda, _stridea, _b, _ldb, _strideb, _beta, _c, _ldc, _stridec, - batch_size); + batch_size, _dependencies); } /* Tall & Skinny matrices. 
*/ #ifdef GEMM_TALL_SKINNY_SUPPORT @@ -60,83 +62,84 @@ typename sb_handle_t::event_t _gemm( (!s_a && !s_b)) { if (_M <= 16 && _N > 32) { return blas::Gemm_Launcher< - 256, true, true, true, ClSize, Tile<1, 4, tileWgSize, tileWgSize>, - _t_a, _t_b, s_a, s_b, static_cast(gemm_memory_t::local), + container_0_t, container_1_t, container_2_t, 256, true, true, true, + ClSize, Tile<1, 4, tileWgSize, tileWgSize>, _t_a, _t_b, s_a, s_b, + static_cast(gemm_memory_t::local), static_cast(gemm_algorithm_t::tall_skinny), static_cast(gemm_vectorization_t::none), is_beta_zero, 2, static_cast(gemm_batch_type_t::strided)>:: template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda, _stridea, _b, _ldb, _strideb, _beta, _c, _ldc, - _stridec, batch_size); + _stridec, batch_size, _dependencies); } else if (_M > 64 && _N <= 32) { return blas::Gemm_Launcher< - 256, true, true, true, ClSize, Tile<4, 1, tileWgSize, tileWgSize>, - _t_a, _t_b, s_a, s_b, static_cast(gemm_memory_t::local), + container_0_t, container_1_t, container_2_t, 256, true, true, true, + ClSize, Tile<4, 1, tileWgSize, tileWgSize>, _t_a, _t_b, s_a, s_b, + static_cast(gemm_memory_t::local), static_cast(gemm_algorithm_t::tall_skinny), static_cast(gemm_vectorization_t::none), is_beta_zero, 2, static_cast(gemm_batch_type_t::strided)>:: template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda, _stridea, _b, _ldb, _strideb, _beta, _c, _ldc, - _stridec, batch_size); + _stridec, batch_size, _dependencies); } else if (_M <= 16 || _N <= 16) { return blas::Gemm_Launcher< - 256, true, true, true, ClSize, Tile<1, 1, tileWgSize, tileWgSize>, - _t_a, _t_b, s_a, s_b, static_cast(gemm_memory_t::local), + container_0_t, container_1_t, container_2_t, 256, true, true, true, + ClSize, Tile<1, 1, tileWgSize, tileWgSize>, _t_a, _t_b, s_a, s_b, + static_cast(gemm_memory_t::local), static_cast(gemm_algorithm_t::tall_skinny), static_cast(gemm_vectorization_t::none), is_beta_zero, 2, static_cast(gemm_batch_type_t::strided)>:: template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda, _stridea, _b, _ldb, _strideb, _beta, _c, _ldc, - _stridec, batch_size); + _stridec, batch_size, _dependencies); } else if (_M <= 32 || _N <= 32) { return blas::Gemm_Launcher< - 256, true, true, true, ClSize, Tile<2, 2, tileWgSize, tileWgSize>, - _t_a, _t_b, s_a, s_b, static_cast(gemm_memory_t::local), + container_0_t, container_1_t, container_2_t, 256, true, true, true, + ClSize, Tile<2, 2, tileWgSize, tileWgSize>, _t_a, _t_b, s_a, s_b, + static_cast(gemm_memory_t::local), static_cast(gemm_algorithm_t::tall_skinny), static_cast(gemm_vectorization_t::none), is_beta_zero, 2, static_cast(gemm_batch_type_t::strided)>:: template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda, _stridea, _b, _ldb, _strideb, _beta, _c, _ldc, - _stridec, batch_size); + _stridec, batch_size, _dependencies); } else { return blas::Gemm_Launcher< - 256, true, true, true, ClSize, Tile<4, 4, tileWgSize, tileWgSize>, - _t_a, _t_b, s_a, s_b, static_cast(gemm_memory_t::local), + container_0_t, container_1_t, container_2_t, 256, true, true, true, + ClSize, Tile<4, 4, tileWgSize, tileWgSize>, _t_a, _t_b, s_a, s_b, + static_cast(gemm_memory_t::local), static_cast(gemm_algorithm_t::tall_skinny), static_cast(gemm_vectorization_t::none), is_beta_zero, 2, static_cast(gemm_batch_type_t::strided)>:: template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda, _stridea, _b, _ldb, _strideb, _beta, _c, _ldc, - _stridec, batch_size); + _stridec, batch_size, _dependencies); } } else #endif // GEMM_TALL_SKINNY_SUPPORT if 
(_M * _N <= 65536) { - return blas::Gemm_Launcher< - 256, false, false, false, ClSize, - Tile<1, 1, tileWgSize, tileWgSize>, _t_a, _t_b, s_a, s_b, - static_cast(gemm_memory_t::local), - static_cast(gemm_algorithm_t::standard), - static_cast(gemm_vectorization_t::full), is_beta_zero, 1, - static_cast(gemm_batch_type_t::strided)>:: - template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda, - _stridea, _b, _ldb, _strideb, _beta, _c, _ldc, - _stridec, batch_size); + return blas::Gemm_Launcher< + container_0_t, container_1_t, container_2_t, 256, false, false, false, + ClSize, Tile<1, 1, tileWgSize, tileWgSize>, _t_a, _t_b, s_a, s_b, + static_cast(gemm_memory_t::local), + static_cast(gemm_algorithm_t::standard), + static_cast(gemm_vectorization_t::full), is_beta_zero, 1, + static_cast(gemm_batch_type_t::strided)>:: + template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda, _stridea, + _b, _ldb, _strideb, _beta, _c, _ldc, _stridec, + batch_size, _dependencies); } else { return blas::Gemm_Launcher< - 256, false, false, false, ClSize, Tile<4, 4, tileWgSize, tileWgSize>, - _t_a, _t_b, s_a, s_b, static_cast(gemm_memory_t::local), + container_0_t, container_1_t, container_2_t, 256, false, false, false, + ClSize, Tile<4, 4, tileWgSize, tileWgSize>, _t_a, _t_b, s_a, s_b, + static_cast(gemm_memory_t::local), static_cast(gemm_algorithm_t::standard), static_cast(gemm_vectorization_t::full), is_beta_zero, 2, - static_cast( - gemm_batch_type_t::strided)>::template _select_gemm(sb_handle, _M, - _N, _K, _alpha, - _a, _lda, - _stridea, _b, - _ldb, _strideb, - _beta, _c, _ldc, - _stridec, - batch_size); + static_cast(gemm_batch_type_t::strided)>:: + template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda, _stridea, + _b, _ldb, _strideb, _beta, _c, _ldc, _stridec, + batch_size, _dependencies); } } } // namespace backend diff --git a/src/interface/blas3/backend/default_cpu.hpp b/src/interface/blas3/backend/default_cpu.hpp index 6fd5c8ca5..a079dae77 100644 --- a/src/interface/blas3/backend/default_cpu.hpp +++ b/src/interface/blas3/backend/default_cpu.hpp @@ -38,79 +38,65 @@ typename sb_handle_t::event_t _gemm( element_t _alpha, container_0_t _a, index_t _lda, index_t _stridea, container_1_t _b, index_t _ldb, index_t _strideb, element_t _beta, container_2_t _c, index_t _ldc, index_t _stridec, index_t batch_size, - gemm_batch_type_t batch_type) { + gemm_batch_type_t batch_type, + const typename sb_handle_t::event_t& _dependencies) { if (batch_type == gemm_batch_type_t::interleaved) { return blas::Gemm_Launcher< - 64, false, false, false, 64, Tile<2, 2, 4, 4, 1, 1, 1, 1, 4, 4>, _t_a, - _t_b, s_a, s_b, static_cast(gemm_memory_t::no_local), + container_0_t, container_1_t, container_2_t, 64, false, false, false, + 64, Tile<2, 2, 4, 4, 1, 1, 1, 1, 4, 4>, _t_a, _t_b, s_a, s_b, + static_cast(gemm_memory_t::no_local), static_cast(gemm_algorithm_t::standard), static_cast(gemm_vectorization_t::full), is_beta_zero, 4, static_cast(gemm_batch_type_t::interleaved)>:: template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda, _stridea, _b, _ldb, _strideb, _beta, _c, _ldc, _stridec, - batch_size); + batch_size, _dependencies); } #if defined(NAIVE_GEMM) return blas::Gemm_Launcher< - 64, false, false, false, 64, Tile<8, 8, 8, 8>, _t_a, _t_b, s_a, s_b, + container_0_t, container_1_t, container_2_t, 64, false, false, false, 64, + Tile<8, 8, 8, 8>, _t_a, _t_b, s_a, s_b, static_cast(gemm_memory_t::no_local), static_cast(gemm_algorithm_t::naive), static_cast(gemm_vectorization_t::partial), is_beta_zero, 1, - 
static_cast( - gemm_batch_type_t::strided)>::template _select_gemm(sb_handle, _M, _N, - _K, _alpha, _a, - _lda, _stridea, - _b, _ldb, - _strideb, _beta, - _c, _ldc, - _stridec, - batch_size); + static_cast(gemm_batch_type_t::strided)>:: + template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda, _stridea, + _b, _ldb, _strideb, _beta, _c, _ldc, _stridec, + batch_size, _dependencies); #else if (_M <= 128 && _N <= 128 && _K <= 128 && !s_a && !s_b) { return blas::Gemm_Launcher< - 64, false, false, false, 64, Tile<2, 2, 8, 8>, _t_a, _t_b, s_a, s_b, + container_0_t, container_1_t, container_2_t, 64, false, false, false, + 64, Tile<2, 2, 8, 8>, _t_a, _t_b, s_a, s_b, static_cast(gemm_memory_t::no_local), static_cast(gemm_algorithm_t::standard), static_cast(gemm_vectorization_t::full), is_beta_zero, 2, - static_cast( - gemm_batch_type_t::strided)>::template _select_gemm(sb_handle, _M, - _N, _K, _alpha, - _a, _lda, - _stridea, _b, - _ldb, _strideb, - _beta, _c, _ldc, - _stridec, - batch_size); + static_cast(gemm_batch_type_t::strided)>:: + template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda, _stridea, + _b, _ldb, _strideb, _beta, _c, _ldc, _stridec, + batch_size, _dependencies); } else if (!s_a && !s_b) { return blas::Gemm_Launcher< - 64, false, false, false, 64, Tile<8, 8, 8, 8>, _t_a, _t_b, s_a, s_b, + container_0_t, container_1_t, container_2_t, 64, false, false, false, + 64, Tile<8, 8, 8, 8>, _t_a, _t_b, s_a, s_b, static_cast(gemm_memory_t::no_local), static_cast(gemm_algorithm_t::standard), static_cast(gemm_vectorization_t::partial), is_beta_zero, 1, - static_cast( - gemm_batch_type_t::strided)>::template _select_gemm(sb_handle, _M, - _N, _K, _alpha, - _a, _lda, - _stridea, _b, - _ldb, _strideb, - _beta, _c, _ldc, - _stridec, - batch_size); + static_cast(gemm_batch_type_t::strided)>:: + template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda, _stridea, + _b, _ldb, _strideb, _beta, _c, _ldc, _stridec, + batch_size, _dependencies); } else { return blas::Gemm_Launcher< - 64, false, false, false, 64, Tile<2, 2, 8, 8>, _t_a, _t_b, s_a, s_b, + container_0_t, container_1_t, container_2_t, 64, false, false, false, + 64, Tile<2, 2, 8, 8>, _t_a, _t_b, s_a, s_b, static_cast(gemm_memory_t::local), static_cast(gemm_algorithm_t::standard), static_cast(gemm_vectorization_t::full), is_beta_zero, 2, - static_cast( - gemm_batch_type_t::strided)>::template _select_gemm(sb_handle, _M, - _N, _K, _alpha, - _a, _lda, - _stridea, _b, - _ldb, _strideb, - _beta, _c, _ldc, - _stridec, - batch_size); + static_cast(gemm_batch_type_t::strided)>:: + template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda, _stridea, + _b, _ldb, _strideb, _beta, _c, _ldc, _stridec, + batch_size, _dependencies); } #endif diff --git a/src/interface/blas3/backend/intel_gpu.hpp b/src/interface/blas3/backend/intel_gpu.hpp index a42615d4a..22fbd786c 100644 --- a/src/interface/blas3/backend/intel_gpu.hpp +++ b/src/interface/blas3/backend/intel_gpu.hpp @@ -37,17 +37,19 @@ typename sb_handle_t::event_t _gemm( element_t _alpha, container_0_t _a, index_t _lda, index_t _stridea, container_1_t _b, index_t _ldb, index_t _strideb, element_t _beta, container_2_t _c, index_t _ldc, index_t _stridec, index_t batch_size, - gemm_batch_type_t batch_type) { + gemm_batch_type_t batch_type, + const typename sb_handle_t::event_t& _dependencies) { if (batch_type == gemm_batch_type_t::interleaved) { return blas::Gemm_Launcher< - 64, false, false, false, 64, Tile<4, 4, 4, 4, 1, 1, 1, 1, 4, 4>, _t_a, - _t_b, s_a, s_b, 
static_cast(gemm_memory_t::no_local), + container_0_t, container_1_t, container_2_t, 64, false, false, false, + 64, Tile<4, 4, 4, 4, 1, 1, 1, 1, 4, 4>, _t_a, _t_b, s_a, s_b, + static_cast(gemm_memory_t::no_local), static_cast(gemm_algorithm_t::standard), static_cast(gemm_vectorization_t::full), is_beta_zero, 4, static_cast(gemm_batch_type_t::interleaved)>:: template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda, _stridea, _b, _ldb, _strideb, _beta, _c, _ldc, _stridec, - batch_size); + batch_size, _dependencies); } #ifdef GEMM_TALL_SKINNY_SUPPORT if (!s_a && !s_b) { @@ -56,155 +58,152 @@ typename sb_handle_t::event_t _gemm( ((_K >= 4096 && _M * _N <= 16384) || (_K >= 1024 && _M * _N <= 4096))) { if (_M >= 16 && _N <= 4) { return blas::Gemm_Launcher< - 32, true, true, true, 64, Tile<2, 1, 8, 4>, _t_a, _t_b, s_a, s_b, + container_0_t, container_1_t, container_2_t, 32, true, true, true, + 64, Tile<2, 1, 8, 4>, _t_a, _t_b, s_a, s_b, static_cast(gemm_memory_t::local), static_cast(gemm_algorithm_t::tall_skinny), static_cast(gemm_vectorization_t::none), is_beta_zero, 4, static_cast(gemm_batch_type_t::strided)>:: template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda, _stridea, _b, _ldb, _strideb, _beta, _c, _ldc, - _stridec, batch_size); + _stridec, batch_size, _dependencies); } else if (_M <= 4 || _N <= 4) { // Need to increase the work group size for cl::sycl::half for the // launcher to be instancianted constexpr int wg_size = sizeof(element_t) == 2 ? 8 : 4; return blas::Gemm_Launcher< - 16, true, false, false, 64, Tile<1, 1, wg_size, wg_size>, _t_a, - _t_b, s_a, s_b, static_cast(gemm_memory_t::local), + container_0_t, container_1_t, container_2_t, 16, true, false, false, + 64, Tile<1, 1, wg_size, wg_size>, _t_a, _t_b, s_a, s_b, + static_cast(gemm_memory_t::local), static_cast(gemm_algorithm_t::tall_skinny), static_cast(gemm_vectorization_t::none), is_beta_zero, 4, static_cast(gemm_batch_type_t::strided)>:: template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda, _stridea, _b, _ldb, _strideb, _beta, _c, _ldc, - _stridec, batch_size); + _stridec, batch_size, _dependencies); } else if (_M >= 16 && _N <= 8) { return blas::Gemm_Launcher< - 32, true, true, true, 64, Tile<2, 2, 8, 4>, _t_a, _t_b, s_a, s_b, + container_0_t, container_1_t, container_2_t, 32, true, true, true, + 64, Tile<2, 2, 8, 4>, _t_a, _t_b, s_a, s_b, static_cast(gemm_memory_t::local), static_cast(gemm_algorithm_t::tall_skinny), static_cast(gemm_vectorization_t::none), is_beta_zero, 4, static_cast(gemm_batch_type_t::strided)>:: template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda, _stridea, _b, _ldb, _strideb, _beta, _c, _ldc, - _stridec, batch_size); + _stridec, batch_size, _dependencies); } else if (_M <= 8 || _N <= 8) { // Need to increase the work group size for cl::sycl::half for the // launcher to be instancianted constexpr int wg_size = sizeof(element_t) == 2 ? 
8 : 4; return blas::Gemm_Launcher< - 16, true, false, false, 64, Tile<2, 2, wg_size, wg_size>, _t_a, - _t_b, s_a, s_b, static_cast(gemm_memory_t::local), + container_0_t, container_1_t, container_2_t, 16, true, false, false, + 64, Tile<2, 2, wg_size, wg_size>, _t_a, _t_b, s_a, s_b, + static_cast(gemm_memory_t::local), static_cast(gemm_algorithm_t::tall_skinny), static_cast(gemm_vectorization_t::none), is_beta_zero, 4, static_cast(gemm_batch_type_t::strided)>:: template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda, _stridea, _b, _ldb, _strideb, _beta, _c, _ldc, - _stridec, batch_size); + _stridec, batch_size, _dependencies); } else if (_M <= 16 || _N <= 16) { return blas::Gemm_Launcher< - 64, true, true, true, 64, Tile<2, 2, 8, 8>, _t_a, _t_b, s_a, s_b, + container_0_t, container_1_t, container_2_t, 64, true, true, true, + 64, Tile<2, 2, 8, 8>, _t_a, _t_b, s_a, s_b, static_cast(gemm_memory_t::local), static_cast(gemm_algorithm_t::tall_skinny), static_cast(gemm_vectorization_t::none), is_beta_zero, 4, static_cast(gemm_batch_type_t::strided)>:: template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda, _stridea, _b, _ldb, _strideb, _beta, _c, _ldc, - _stridec, batch_size); + _stridec, batch_size, _dependencies); } else if (_M <= 32 || _N <= 32) { return blas::Gemm_Launcher< - 64, true, true, true, 64, Tile<4, 4, 8, 8>, _t_a, _t_b, s_a, s_b, + container_0_t, container_1_t, container_2_t, 64, true, true, true, + 64, Tile<4, 4, 8, 8>, _t_a, _t_b, s_a, s_b, static_cast(gemm_memory_t::local), static_cast(gemm_algorithm_t::tall_skinny), static_cast(gemm_vectorization_t::none), is_beta_zero, 4, static_cast(gemm_batch_type_t::strided)>:: template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda, _stridea, _b, _ldb, _strideb, _beta, _c, _ldc, - _stridec, batch_size); + _stridec, batch_size, _dependencies); } else { constexpr int wg_size = sizeof(element_t) == 8 ? 8 : 16; return blas::Gemm_Launcher< - 256, true, true, true, 64, Tile<4, 4, wg_size, wg_size>, _t_a, _t_b, - s_a, s_b, static_cast(gemm_memory_t::local), + container_0_t, container_1_t, container_2_t, 256, true, true, true, + 64, Tile<4, 4, wg_size, wg_size>, _t_a, _t_b, s_a, s_b, + static_cast(gemm_memory_t::local), static_cast(gemm_algorithm_t::tall_skinny), static_cast(gemm_vectorization_t::none), is_beta_zero, 4, static_cast(gemm_batch_type_t::strided)>:: template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda, _stridea, _b, _ldb, _strideb, _beta, _c, _ldc, - _stridec, batch_size); + _stridec, batch_size, _dependencies); } } else if (batch_size == 1 && (_t_a || (_t_b && _M * _N > 1048576))) { if (_M <= 64 || _N <= 64) { return blas::Gemm_Launcher< - 64, true, true, true, 64, Tile<4, 4, 8, 8>, _t_a, _t_b, s_a, s_b, + container_0_t, container_1_t, container_2_t, 64, true, true, true, + 64, Tile<4, 4, 8, 8>, _t_a, _t_b, s_a, s_b, static_cast(gemm_memory_t::local), static_cast(gemm_algorithm_t::tall_skinny), static_cast(gemm_vectorization_t::none), is_beta_zero, 4, static_cast(gemm_batch_type_t::strided)>:: template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda, _stridea, _b, _ldb, _strideb, _beta, _c, _ldc, - _stridec, batch_size); + _stridec, batch_size, _dependencies); } else { // Need to increase the work group size for double for the // launcher to be instancianted constexpr int wg_size = sizeof(element_t) == 8 ? 
8 : 16; return blas::Gemm_Launcher< - 256, true, true, true, 64, Tile<4, 4, wg_size, wg_size>, _t_a, _t_b, - s_a, s_b, static_cast(gemm_memory_t::local), + container_0_t, container_1_t, container_2_t, 256, true, true, true, + 64, Tile<4, 4, wg_size, wg_size>, _t_a, _t_b, s_a, s_b, + static_cast(gemm_memory_t::local), static_cast(gemm_algorithm_t::tall_skinny), static_cast(gemm_vectorization_t::none), is_beta_zero, 4, static_cast(gemm_batch_type_t::strided)>:: template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda, _stridea, _b, _ldb, _strideb, _beta, _c, _ldc, - _stridec, batch_size); + _stridec, batch_size, _dependencies); } } } #endif if (_M <= 128 && _N <= 128) { return blas::Gemm_Launcher< - 64, true, false, false, 64, Tile<4, 4, 8, 8>, _t_a, _t_b, s_a, s_b, + container_0_t, container_1_t, container_2_t, 64, true, false, false, 64, + Tile<4, 4, 8, 8>, _t_a, _t_b, s_a, s_b, static_cast(gemm_memory_t::local), static_cast(gemm_algorithm_t::standard), static_cast(gemm_vectorization_t::full), is_beta_zero, 4, - static_cast( - gemm_batch_type_t::strided)>::template _select_gemm(sb_handle, _M, - _N, _K, _alpha, - _a, _lda, - _stridea, _b, - _ldb, _strideb, - _beta, _c, _ldc, - _stridec, - batch_size); + static_cast(gemm_batch_type_t::strided)>:: + template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda, _stridea, + _b, _ldb, _strideb, _beta, _c, _ldc, _stridec, + batch_size, _dependencies); } else if (_t_b && !_t_a && !s_a && !s_b) { return blas::Gemm_Launcher< - 64, false, false, false, 64, Tile<8, 8, 8, 8>, _t_a, _t_b, s_a, s_b, + container_0_t, container_1_t, container_2_t, 64, false, false, false, + 64, Tile<8, 8, 8, 8>, _t_a, _t_b, s_a, s_b, static_cast(gemm_memory_t::no_local), static_cast(gemm_algorithm_t::standard), static_cast(gemm_vectorization_t::partial), is_beta_zero, 4, - static_cast( - gemm_batch_type_t::strided)>::template _select_gemm(sb_handle, _M, - _N, _K, _alpha, - _a, _lda, - _stridea, _b, - _ldb, _strideb, - _beta, _c, _ldc, - _stridec, - batch_size); + static_cast(gemm_batch_type_t::strided)>:: + template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda, _stridea, + _b, _ldb, _strideb, _beta, _c, _ldc, _stridec, + batch_size, _dependencies); } else { return blas::Gemm_Launcher< - 64, false, false, false, 64, Tile<4, 8, 16, 8>, _t_a, _t_b, s_a, s_b, + container_0_t, container_1_t, container_2_t, 64, false, false, false, + 64, Tile<4, 8, 16, 8>, _t_a, _t_b, s_a, s_b, static_cast(gemm_memory_t::local), static_cast(gemm_algorithm_t::standard), static_cast(gemm_vectorization_t::full), is_beta_zero, 4, - static_cast( - gemm_batch_type_t::strided)>::template _select_gemm(sb_handle, _M, - _N, _K, _alpha, - _a, _lda, - _stridea, _b, - _ldb, _strideb, - _beta, _c, _ldc, - _stridec, - batch_size); + static_cast(gemm_batch_type_t::strided)>:: + template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda, _stridea, + _b, _ldb, _strideb, _beta, _c, _ldc, _stridec, + batch_size, _dependencies); } } } // namespace backend diff --git a/src/interface/blas3/backend/nvidia_gpu.hpp b/src/interface/blas3/backend/nvidia_gpu.hpp index c84f7964f..6584dfc63 100644 --- a/src/interface/blas3/backend/nvidia_gpu.hpp +++ b/src/interface/blas3/backend/nvidia_gpu.hpp @@ -38,18 +38,19 @@ typename sb_handle_t::event_t _gemm( element_t _alpha, container_0_t _a, index_t _lda, index_t _stridea, container_1_t _b, index_t _ldb, index_t _strideb, element_t _beta, container_2_t _c, index_t _ldc, index_t _stridec, index_t batch_size, - gemm_batch_type_t batch_type) { + gemm_batch_type_t 
batch_type, + const typename sb_handle_t::event_t& _dependencies) { if (batch_type == gemm_batch_type_t::interleaved) { return blas::Gemm_Launcher< - 64, false, false, false, 64, - Tile<2, 2, 4, 4, 1, 1, 1, 1, 4, 4, 1, 1, 1, float, float>, _t_a, _t_b, - s_a, s_b, static_cast(gemm_memory_t::no_local), + container_0_t, container_1_t, container_2_t, 64, false, false, false, + 64, Tile<2, 2, 4, 4, 1, 1, 1, 1, 4, 4, 1, 1, 1, float, float>, _t_a, + _t_b, s_a, s_b, static_cast(gemm_memory_t::no_local), static_cast(gemm_algorithm_t::standard), static_cast(gemm_vectorization_t::full), is_beta_zero, 4, static_cast(gemm_batch_type_t::interleaved)>:: template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda, _stridea, _b, _ldb, _strideb, _beta, _c, _ldc, _stridec, - batch_size); + batch_size, _dependencies); } #ifdef SB_ENABLE_JOINT_MATRIX @@ -57,7 +58,8 @@ typename sb_handle_t::event_t _gemm( if (en_joint_matrix != NULL && *en_joint_matrix == '1') { if (_M > 1024 && _N > 1024) { return blas::Gemm_Launcher< - 256, false, true, true, 128, + container_0_t, container_1_t, container_2_t, 256, false, true, true, + 128, Tile<8, 8, 16, 16, 16, 2, 1, 1, 1, 1, 16, 16, 16, cl::sycl::half, float>, _t_a, _t_b, s_a, s_b, static_cast(gemm_memory_t::local), @@ -66,10 +68,12 @@ typename sb_handle_t::event_t _gemm( static_cast(gemm_batch_type_t::strided), true>::template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda, _stridea, _b, _ldb, _strideb, _beta, _c, - _ldc, _stridec, batch_size); + _ldc, _stridec, batch_size, + _dependencies); } else if (_M > 64 && _N > 64) { return blas::Gemm_Launcher< - 128, false, true, true, 128, + container_0_t, container_1_t, container_2_t, 128, false, true, true, + 128, Tile<4, 8, 16, 8, 16, 2, 1, 1, 1, 1, 16, 16, 16, cl::sycl::half, float>, _t_a, _t_b, s_a, s_b, static_cast(gemm_memory_t::local), @@ -78,11 +82,13 @@ typename sb_handle_t::event_t _gemm( static_cast(gemm_batch_type_t::strided), true>::template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda, _stridea, _b, _ldb, _strideb, _beta, _c, - _ldc, _stridec, batch_size); + _ldc, _stridec, batch_size, + _dependencies); } else { return blas::Gemm_Launcher< - 128, false, true, true, 128, + container_0_t, container_1_t, container_2_t, 128, false, true, true, + 128, Tile<2, 4, 16, 8, 16, 2, 1, 1, 1, 1, 16, 16, 16, cl::sycl::half, float>, _t_a, _t_b, s_a, s_b, static_cast(gemm_memory_t::local), @@ -91,11 +97,12 @@ typename sb_handle_t::event_t _gemm( static_cast(gemm_batch_type_t::strided), true>::template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda, _stridea, _b, _ldb, _strideb, _beta, _c, - _ldc, _stridec, batch_size); + _ldc, _stridec, batch_size, + _dependencies); } } else { return blas::Gemm_Launcher< - 64, false, false, true, 64, + container_0_t, container_1_t, container_2_t, 64, false, false, true, 64, Tile<8, 8, 8, 8, 1, 1, 2, 2, 1, 1, 1, 1, 1, float, float>, _t_a, _t_b, s_a, s_b, static_cast(gemm_memory_t::local), static_cast(gemm_algorithm_t::standard), @@ -103,13 +110,14 @@ typename sb_handle_t::event_t _gemm( static_cast(gemm_batch_type_t::strided), false>::template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda, _stridea, _b, _ldb, _strideb, _beta, _c, - _ldc, _stridec, batch_size); + _ldc, _stridec, batch_size, + _dependencies); } #else // SB_ENABLE_JOINT_MATRIX else { return blas::Gemm_Launcher< - 64, false, false, true, 64, + container_0_t, container_1_t, container_2_t, 64, false, false, true, 64, Tile<8, 8, 8, 8, 1, 1, 2, 2, 1, 1, 1, 1, 1, float, float>, _t_a, _t_b, s_a, s_b, 
static_cast(gemm_memory_t::local), static_cast(gemm_algorithm_t::standard), @@ -117,7 +125,8 @@ typename sb_handle_t::event_t _gemm( static_cast(gemm_batch_type_t::strided), false>::template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda, _stridea, _b, _ldb, _strideb, _beta, _c, - _ldc, _stridec, batch_size); + _ldc, _stridec, batch_size, + _dependencies); } #endif } diff --git a/src/interface/blas3/backend/power_vr.hpp b/src/interface/blas3/backend/power_vr.hpp index bec1ed5d8..e255b67cf 100644 --- a/src/interface/blas3/backend/power_vr.hpp +++ b/src/interface/blas3/backend/power_vr.hpp @@ -51,7 +51,8 @@ struct Gemm_Launcher { static inline typename sb_handle_t::event_t _select_gemm( sb_handle_t& sb_handle, index_t _M, index_t _N, index_t _K, value_t _alpha, container_0_t _A, container_1_t _B, value_t _beta, - container_2_t _C, index_t batch_size) { + container_2_t _C, index_t batch_size, + const typename sb_handle_t::event_t& _dependencies) { auto m = static_cast(_M); auto n = static_cast(_N); auto k = static_cast(_K); @@ -306,18 +307,19 @@ typename sb_handle_t::event_t _gemm( } return blas::gemm::backend::sycl_imagination_nn_api::Gemm_Launcher< _t_a, _t_b>::template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _b, - _beta, _c, batch_size); + _beta, _c, batch_size, _dependencies); #else if (batch_type == gemm_batch_type_t::interleaved) { return blas::Gemm_Launcher< - 64, false, false, false, 64, Tile<4, 4, 4, 4, 1, 1, 1, 1, 4, 4>, _t_a, - _t_b, s_a, s_b, static_cast(gemm_memory_t::no_local), + container_0_t, container_1_t, container_2_t, 64, false, false, false, + 64, Tile<4, 4, 4, 4, 1, 1, 1, 1, 4, 4>, _t_a, _t_b, s_a, s_b, + static_cast(gemm_memory_t::no_local), static_cast(gemm_algorithm_t::standard), static_cast(gemm_vectorization_t::full), is_beta_zero, 4, static_cast(gemm_batch_type_t::interleaved)>:: template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda, _stridea, _b, _ldb, _strideb, _beta, _c, _ldc, _stridec, - batch_size); + batch_size, _dependencies); } // The following _M, _N ,and _K is used for SSD + Mobilenet v2 (TF version) // We computed the best tile combination for each sizes -(4-March-2018) @@ -326,7 +328,8 @@ typename sb_handle_t::event_t _gemm( (_M == 273 && _K == 576 && _N == 100) || (_M == 384 && _K == 64 && _N == 361)) { return blas::Gemm_Launcher< - 96, true, false, false, 16, Tile<4, 6, 12, 8>, _t_a, _t_b, s_a, s_b, + container_0_t, container_1_t, container_2_t, 96, true, false, false, 16, + Tile<4, 6, 12, 8>, _t_a, _t_b, s_a, s_b, static_cast(gemm_memory_t::local), static_cast(gemm_algorithm_t::standard), static_cast(gemm_vectorization_t::full), is_beta_zero, 1, @@ -336,7 +339,8 @@ typename sb_handle_t::event_t _gemm( _a, _lda, _stridea, _b, _ldb, _strideb, _beta, _c, _ldc, _stridec, - batch_size); + batch_size, + _dependencies); } // The following _M, _N ,and _K is used for SSD + Mobilenet v2 (TF version) // We computed the best tile combination for each sizes -(4-March-2018) // POWER_VR Rogue @@ -347,38 +351,30 @@ typename sb_handle_t::event_t _gemm( (_M == 24 && _K == 256 && _N == 1) || (_M == 128 && _K == 64 && _N == 1)) { return blas::Gemm_Launcher< - 64, false, false, false, 128, Tile<1, 1, 8, 8>, _t_a, _t_b, s_a, s_b, + container_0_t, container_1_t, container_2_t, 64, false, false, false, + 128, Tile<1, 1, 8, 8>, _t_a, _t_b, s_a, s_b, static_cast(gemm_memory_t::local), static_cast(gemm_algorithm_t::standard), static_cast(gemm_vectorization_t::full), is_beta_zero, 1, - static_cast( - gemm_batch_type_t::strided)>::template 
_select_gemm(sb_handle, _M, - _N, _K, _alpha, - _a, _lda, - _stridea, _b, - _ldb, _strideb, - _beta, _c, _ldc, - _stridec, - batch_size); + static_cast(gemm_batch_type_t::strided)>:: + template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda, _stridea, + _b, _ldb, _strideb, _beta, _c, _ldc, _stridec, + batch_size, _dependencies); } // The following _M, _N ,and _K is used for SSD + Mobilenet v2 (TF version) // We computed the best tile combination for each sizes -(4-March-2018) // POWER_VR Rogue else if ((_M == 546 && _K == 128 && _N == 1) || (_M == 546 && _K == 256 && _N == 1)) { return blas::Gemm_Launcher< - 64, false, false, false, 64, Tile<4, 4, 8, 8>, _t_a, _t_b, s_a, s_b, + container_0_t, container_1_t, container_2_t, 64, false, false, false, + 64, Tile<4, 4, 8, 8>, _t_a, _t_b, s_a, s_b, static_cast(gemm_memory_t::no_local), static_cast(gemm_algorithm_t::standard), static_cast(gemm_vectorization_t::full), is_beta_zero, 1, - static_cast( - gemm_batch_type_t::strided)>::template _select_gemm(sb_handle, _M, - _N, _K, _alpha, - _a, _lda, - _stridea, _b, - _ldb, _strideb, - _beta, _c, _ldc, - _stridec, - batch_size); + static_cast(gemm_batch_type_t::strided)>:: + template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda, _stridea, + _b, _ldb, _strideb, _beta, _c, _ldc, _stridec, + batch_size, _dependencies); } // The following _M, _N ,and _K is used for SSD + Mobilenet v2 (TF version) // We computed the best tile combination for each sizes -(4-March-2018) // POWER_VR Rogue @@ -392,34 +388,26 @@ typename sb_handle_t::event_t _gemm( (_M > 64 && _K > 64 && _N > 64 && is_power_of_2(_M) && is_power_of_2(_K) && is_power_of_2(_N))) { return blas::Gemm_Launcher< - 128, false, false, false, 16, Tile<4, 8, 16, 8>, _t_a, _t_b, s_a, s_b, + container_0_t, container_1_t, container_2_t, 128, false, false, false, + 16, Tile<4, 8, 16, 8>, _t_a, _t_b, s_a, s_b, static_cast(gemm_memory_t::local), static_cast(gemm_algorithm_t::standard), static_cast(gemm_vectorization_t::full), is_beta_zero, 1, - static_cast( - gemm_batch_type_t::strided)>::template _select_gemm(sb_handle, _M, - _N, _K, _alpha, - _a, _lda, - _stridea, _b, - _ldb, _strideb, - _beta, _c, _ldc, - _stridec, - batch_size); + static_cast(gemm_batch_type_t::strided)>:: + template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda, _stridea, + _b, _ldb, _strideb, _beta, _c, _ldc, _stridec, + batch_size, _dependencies); } else { return blas::Gemm_Launcher< - 64, false, false, false, 32, Tile<4, 4, 8, 8>, _t_a, _t_b, s_a, s_b, + container_0_t, container_1_t, container_2_t, 64, false, false, false, + 32, Tile<4, 4, 8, 8>, _t_a, _t_b, s_a, s_b, static_cast(gemm_memory_t::local), static_cast(gemm_algorithm_t::standard), static_cast(gemm_vectorization_t::full), is_beta_zero, 1, - static_cast( - gemm_batch_type_t::strided)>::template _select_gemm(sb_handle, _M, - _N, _K, _alpha, - _a, _lda, - _stridea, _b, - _ldb, _strideb, - _beta, _c, _ldc, - _stridec, - batch_size); + static_cast(gemm_batch_type_t::strided)>:: + template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda, _stridea, + _b, _ldb, _strideb, _beta, _c, _ldc, _stridec, + batch_size, _dependencies); } #endif } diff --git a/src/interface/blas3/gemm.cpp.in b/src/interface/blas3/gemm.cpp.in index b6108a8ef..25999119b 100644 --- a/src/interface/blas3/gemm.cpp.in +++ b/src/interface/blas3/gemm.cpp.in @@ -23,9 +23,8 @@ * **************************************************************************/ #include "container/sycl_iterator.hpp" -#include "sb_handle/sycl_blas_handle.hpp" #include 
"interface/gemm_interface.hpp" -#include "operations/blas_constants.hpp" +#include "sb_handle/sycl_blas_handle.hpp" #include "views/view_sycl.hpp" namespace blas { @@ -35,14 +34,16 @@ template typename SB_Handle::event_t _gemm( SB_Handle& sb_handle, char _TransA, char _TransB, ${INDEX_TYPE} _M, ${INDEX_TYPE} _N, ${INDEX_TYPE} _K, ${DATA_TYPE} _alpha, ${container_t0} a_, ${INDEX_TYPE} _lda, ${container_t1} b_, ${INDEX_TYPE} _ldb, - ${DATA_TYPE} _beta, ${container_t2} _C, ${INDEX_TYPE} _ldc); + ${DATA_TYPE} _beta, ${container_t2} _C, ${INDEX_TYPE} _ldc, + const typename SB_Handle::event_t& _dependencies); // batched gemm template typename SB_Handle::event_t _gemm_batched( SB_Handle& sb_handle, char _TransA, char _TransB, ${INDEX_TYPE} _M, ${INDEX_TYPE} _N, ${INDEX_TYPE} _K, ${DATA_TYPE} _alpha, ${container_t0} a_, ${INDEX_TYPE} _lda, ${container_t1} b_, ${INDEX_TYPE} _ldb, ${DATA_TYPE} _beta, ${container_t2} _C, ${INDEX_TYPE} _ldc, - ${INDEX_TYPE} batch_size, gemm_batch_type_t batch_type); + ${INDEX_TYPE} batch_size, gemm_batch_type_t batch_type, + const typename SB_Handle::event_t& _dependencies); // strided batched gemm template typename SB_Handle::event_t _gemm_strided_batched( SB_Handle& sb_handle, char _TransA, char _TransB, ${INDEX_TYPE} _M, @@ -50,6 +51,7 @@ template typename SB_Handle::event_t _gemm_strided_batched( ${INDEX_TYPE} _lda, ${INDEX_TYPE} _stridea, ${container_t1} b_, ${INDEX_TYPE} _ldb, ${INDEX_TYPE} _strideb, ${DATA_TYPE} _beta, ${container_t2} _C, ${INDEX_TYPE} _ldc, ${INDEX_TYPE} _stridec, - ${INDEX_TYPE} batch_size); + ${INDEX_TYPE} batch_size, const typename SB_Handle::event_t& _dependencies); + } // namespace internal } // namespace blas diff --git a/src/interface/blas3/gemm_launcher.cpp.in b/src/interface/blas3/gemm_launcher.cpp.in index 971f605d4..eeb3db330 100644 --- a/src/interface/blas3/gemm_launcher.cpp.in +++ b/src/interface/blas3/gemm_launcher.cpp.in @@ -24,73 +24,45 @@ **************************************************************************/ #include "container/sycl_iterator.hpp" -#include "sb_handle/sycl_blas_handle.hpp" -#include "sb_handle/kernel_constructor.hpp" #include "interface/gemm_launcher.hpp" #include "operations/blas3_trees.hpp" -#include "operations/extension/reduction.hpp" #include "operations/blas_constants.hpp" +#include "operations/extension/reduction.hpp" +#include "sb_handle/kernel_constructor.hpp" +#include "sb_handle/sycl_blas_handle.hpp" #include "views/view_sycl.hpp" namespace blas { template class Gemm_Launcher< - ${WG_SIZE}, ${DOUBLE_BUFFER}, ${CONFLICT_A}, ${CONFLICT_B}, ${CL_SIZE}, - Tile<${TIR}, ${TIC}, ${TWR}, ${TWC}, ${TSR}, ${TSC}, ${TLR}, ${TLC}, ${TIB}, ${TWB}, - ${JM_M}, ${JM_N}, ${JM_K}, ${JM_IN_T}, ${JM_OUT_T}>, + ${container_t0}, ${container_t1}, ${container_t2}, ${WG_SIZE}, + ${DOUBLE_BUFFER}, ${CONFLICT_A}, ${CONFLICT_B}, ${CL_SIZE}, + Tile<${TIR}, ${TIC}, ${TWR}, ${TWC}, ${TSR}, ${TSC}, ${TLR}, ${TLC}, ${TIB}, + ${TWB}, ${JM_M}, ${JM_N}, ${JM_K}, ${JM_IN_T}, ${JM_OUT_T}>, ${TRANS_A}, ${TRANS_B}, ${SYMM_A}, ${SYMM_B}, static_cast(gemm_memory_t::${GEMM_MEMORY_TYPE}), static_cast(gemm_algorithm_t::${GEMM_SHAPE_TYPE}), static_cast(gemm_vectorization_t::${GEMM_VECTORIZE_TYPE}), ${IS_BETA_ZERO}, ${VECTOR_SIZE}, - static_cast(gemm_batch_type_t::${BATCH_TYPE}), - ${USE_JOINT_MATRIX}>; - -template typename SB_Handle::event_t Gemm_Launcher< - ${WG_SIZE}, ${DOUBLE_BUFFER}, ${CONFLICT_A}, ${CONFLICT_B}, ${CL_SIZE}, - Tile<${TIR}, ${TIC}, ${TWR}, ${TWC}, ${TSR}, ${TSC}, ${TLR}, ${TLC}, ${TIB}, ${TWB}, - ${JM_M}, ${JM_N}, 
${JM_K}, ${JM_IN_T}, ${JM_OUT_T}>, - ${TRANS_A}, ${TRANS_B}, ${SYMM_A}, ${SYMM_B}, - static_cast(gemm_memory_t::${GEMM_MEMORY_TYPE}), - static_cast(gemm_algorithm_t::${GEMM_SHAPE_TYPE}), - static_cast(gemm_vectorization_t::${GEMM_VECTORIZE_TYPE}), - ${IS_BETA_ZERO}, ${VECTOR_SIZE}, - static_cast(gemm_batch_type_t::${BATCH_TYPE}), - ${USE_JOINT_MATRIX}>:: - _select_gemm, - BufferIterator<${DATA_TYPE}>, - BufferIterator<${DATA_TYPE}>, ${DATA_TYPE}, - ${INDEX_TYPE}>( - SB_Handle& sb_handle, ${INDEX_TYPE} _M, ${INDEX_TYPE} _N, - ${INDEX_TYPE} _K, ${DATA_TYPE} _alpha, - BufferIterator<${DATA_TYPE}> a_, ${INDEX_TYPE} _lda, - ${INDEX_TYPE} _stridea, BufferIterator<${DATA_TYPE}> b_, - ${INDEX_TYPE} _ldb, ${INDEX_TYPE} _strideb, - ${DATA_TYPE} _beta, BufferIterator<${DATA_TYPE}> _C, - ${INDEX_TYPE} _ldc, ${INDEX_TYPE} _stridec, - ${INDEX_TYPE} batch_size); + static_cast(gemm_batch_type_t::${BATCH_TYPE}), ${USE_JOINT_MATRIX}>; template typename SB_Handle::event_t Gemm_Launcher< - ${WG_SIZE}, ${DOUBLE_BUFFER}, ${CONFLICT_A}, ${CONFLICT_B}, ${CL_SIZE}, - Tile<${TIR}, ${TIC}, ${TWR}, ${TWC}, ${TSR}, ${TSC}, ${TLR}, ${TLC}, ${TIB}, ${TWB}, - ${JM_M}, ${JM_N}, ${JM_K}, ${JM_IN_T}, ${JM_OUT_T}>, + ${container_t0}, ${container_t1}, ${container_t2}, ${WG_SIZE}, + ${DOUBLE_BUFFER}, ${CONFLICT_A}, ${CONFLICT_B}, ${CL_SIZE}, + Tile<${TIR}, ${TIC}, ${TWR}, ${TWC}, ${TSR}, ${TSC}, ${TLR}, ${TLC}, ${TIB}, + ${TWB}, ${JM_M}, ${JM_N}, ${JM_K}, ${JM_IN_T}, ${JM_OUT_T}>, ${TRANS_A}, ${TRANS_B}, ${SYMM_A}, ${SYMM_B}, static_cast(gemm_memory_t::${GEMM_MEMORY_TYPE}), static_cast(gemm_algorithm_t::${GEMM_SHAPE_TYPE}), static_cast(gemm_vectorization_t::${GEMM_VECTORIZE_TYPE}), ${IS_BETA_ZERO}, ${VECTOR_SIZE}, static_cast(gemm_batch_type_t::${BATCH_TYPE}), - ${USE_JOINT_MATRIX}>:: - _select_gemm, - BufferIterator<${DATA_TYPE} const>, - BufferIterator<${DATA_TYPE}>, ${DATA_TYPE}, - ${INDEX_TYPE}>( - SB_Handle& sb_handle, ${INDEX_TYPE} _M, ${INDEX_TYPE} _N, - ${INDEX_TYPE} _K, ${DATA_TYPE} _alpha, - BufferIterator<${DATA_TYPE} const> a_, ${INDEX_TYPE} _lda, ${INDEX_TYPE} _stridea, - BufferIterator<${DATA_TYPE} const> b_, ${INDEX_TYPE} _ldb, ${INDEX_TYPE} _strideb, - ${DATA_TYPE} _beta, BufferIterator<${DATA_TYPE}> _C, - ${INDEX_TYPE} _ldc, ${INDEX_TYPE} _stridec, ${INDEX_TYPE} batch_size); + ${USE_JOINT_MATRIX}>::_select_gemm( + SB_Handle& sb_handle, ${INDEX_TYPE} _M, ${INDEX_TYPE} _N, ${INDEX_TYPE} _K, + ${DATA_TYPE} _alpha, ${container_t0} a_, ${INDEX_TYPE} _lda, + ${INDEX_TYPE} _stridea, ${container_t1} b_, ${INDEX_TYPE} _ldb, + ${INDEX_TYPE} _strideb, ${DATA_TYPE} _beta, ${container_t2} _C, + ${INDEX_TYPE} _ldc, ${INDEX_TYPE} _stridec, ${INDEX_TYPE} batch_size, + const typename SB_Handle::event_t& _dependencies); } // namespace blas diff --git a/src/interface/blas3/symm.cpp.in b/src/interface/blas3/symm.cpp.in index d3f0fcd98..8c6101f8b 100644 --- a/src/interface/blas3/symm.cpp.in +++ b/src/interface/blas3/symm.cpp.in @@ -23,19 +23,19 @@ * **************************************************************************/ #include "container/sycl_iterator.hpp" -#include "sb_handle/sycl_blas_handle.hpp" -#include "interface/gemm_interface.hpp" #include "interface/symm_interface.hpp" -#include "operations/blas_constants.hpp" +#include "sb_handle/sycl_blas_handle.hpp" #include "views/view_sycl.hpp" namespace blas { namespace internal { template typename SB_Handle::event_t _symm( - SB_Handle& sb_handle, char _side, char _uplo, ${INDEX_TYPE} _M, ${INDEX_TYPE} _N, - ${DATA_TYPE} _alpha, ${container_t0} a_, ${INDEX_TYPE} _lda, 
${container_t1} b_, - ${INDEX_TYPE} _ldb, ${DATA_TYPE} _beta, ${container_t2} _C, ${INDEX_TYPE} _ldc); + SB_Handle& sb_handle, char _side, char _uplo, ${INDEX_TYPE} _M, + ${INDEX_TYPE} _N, ${DATA_TYPE} _alpha, ${container_t0} a_, + ${INDEX_TYPE} _lda, ${container_t1} b_, ${INDEX_TYPE} _ldb, + ${DATA_TYPE} _beta, ${container_t2} _C, ${INDEX_TYPE} _ldc, + const typename SB_Handle::event_t& dependencies); } // namespace internal } // namespace blas diff --git a/src/interface/blas3/trsm.cpp.in b/src/interface/blas3/trsm.cpp.in index 0ffeabd39..ed4f74b35 100644 --- a/src/interface/blas3/trsm.cpp.in +++ b/src/interface/blas3/trsm.cpp.in @@ -22,12 +22,10 @@ **************************************************************************/ #include "container/sycl_iterator.hpp" -#include "sb_handle/sycl_blas_handle.hpp" -#include "sb_handle/kernel_constructor.hpp" -#include "interface/blas1_interface.hpp" #include "interface/trsm_interface.hpp" #include "operations/blas3/trsm.hpp" -#include "operations/blas_constants.hpp" +#include "sb_handle/kernel_constructor.hpp" +#include "sb_handle/sycl_blas_handle.hpp" #include "views/view_sycl.hpp" namespace blas { @@ -36,7 +34,8 @@ namespace internal { template typename SB_Handle::event_t _trsm( SB_Handle& sb_handle, char side, char uplo, char trans, char diag, ${INDEX_TYPE} M, ${INDEX_TYPE} N, ${DATA_TYPE} alpha, ${container_t0} A, - ${INDEX_TYPE} lda, ${container_t1} B, ${INDEX_TYPE} ldb); + ${INDEX_TYPE} lda, ${container_t1} B, ${INDEX_TYPE} ldb, + const typename SB_Handle::event_t& _dependencies); } // namespace internal } // namespace blas diff --git a/src/interface/extension/backend/amd_gpu.hpp b/src/interface/extension/backend/amd_gpu.hpp index 51ceba109..c4df3b6a0 100644 --- a/src/interface/extension/backend/amd_gpu.hpp +++ b/src/interface/extension/backend/amd_gpu.hpp @@ -35,16 +35,16 @@ template (1 << 18)) { return blas::internal::_transpose_outplace_impl<16, 256, 64, true>( sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, out_, _ld_out, - _inc_out); + _inc_out, _dependencies); } else { return blas::internal::_transpose_outplace_impl<16, 64, 64, true>( sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, out_, _ld_out, - _inc_out); + _inc_out, _dependencies); } } diff --git a/src/interface/extension/backend/default_cpu.hpp b/src/interface/extension/backend/default_cpu.hpp index bf717a0dd..c2c2fb9ad 100644 --- a/src/interface/extension/backend/default_cpu.hpp +++ b/src/interface/extension/backend/default_cpu.hpp @@ -35,15 +35,15 @@ template ( sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, out_, _ld_out, - _inc_out); + _inc_out, _dependencies); } else { return blas::internal::_transpose_outplace_impl<32, 128, 64, false>( sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, out_, _ld_out, - _inc_out); + _inc_out, _dependencies); } } diff --git a/src/interface/extension/backend/intel_gpu.hpp b/src/interface/extension/backend/intel_gpu.hpp index cd2734840..0bac8a3d4 100644 --- a/src/interface/extension/backend/intel_gpu.hpp +++ b/src/interface/extension/backend/intel_gpu.hpp @@ -35,16 +35,16 @@ template (1 << 18)) { return blas::internal::_transpose_outplace_impl<32, 256, 128, true>( sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, out_, _ld_out, - _inc_out); + _inc_out, _dependencies); } else { return blas::internal::_transpose_outplace_impl<16, 64, 64, true>( sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, out_, _ld_out, - _inc_out); + _inc_out, _dependencies); } } diff --git a/src/interface/extension/backend/nvidia_gpu.hpp 
b/src/interface/extension/backend/nvidia_gpu.hpp index a25206a2b..2b0e2085c 100644 --- a/src/interface/extension/backend/nvidia_gpu.hpp +++ b/src/interface/extension/backend/nvidia_gpu.hpp @@ -35,16 +35,16 @@ template (1 << 18)) { return blas::internal::_transpose_outplace_impl<32, 512, 128, true>( sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, out_, _ld_out, - _inc_out); + _inc_out, _dependencies); } else { return blas::internal::_transpose_outplace_impl<32, 128, 128, true>( sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, out_, _ld_out, - _inc_out); + _inc_out, _dependencies); } } diff --git a/src/interface/extension/matcopy.cpp.in b/src/interface/extension/matcopy.cpp.in index 8f90649f5..464350679 100644 --- a/src/interface/extension/matcopy.cpp.in +++ b/src/interface/extension/matcopy.cpp.in @@ -23,6 +23,7 @@ * **************************************************************************/ +#include "container/sycl_iterator.hpp" #include "interface/extension_interface.hpp" #include "sb_handle/kernel_constructor.hpp" #include "sb_handle/sycl_blas_handle.hpp" @@ -36,14 +37,14 @@ template typename SB_Handle::event_t _matcopy( SB_Handle& sb_handle, char trans, ${INDEX_TYPE} m, ${INDEX_TYPE} n, ${DATA_TYPE} alpha, ${container_t0} in_memory, ${INDEX_TYPE} ld_in, ${INDEX_TYPE} inc_in, ${container_t0} out_memory, ${INDEX_TYPE} ld_out, - ${INDEX_TYPE} inc_out); + ${INDEX_TYPE} inc_out, const typename SB_Handle::event_t& _dependencies); // In-place template typename SB_Handle::event_t _matcopy( SB_Handle& sb_handle, char trans, ${INDEX_TYPE} m, ${INDEX_TYPE} n, ${DATA_TYPE} alpha, ${container_t0} in_memory, ${INDEX_TYPE} ld_in, ${INDEX_TYPE} inc_in, ${container_t0} out_memory, ${INDEX_TYPE} ld_out, - ${INDEX_TYPE} inc_out); + ${INDEX_TYPE} inc_out, const typename SB_Handle::event_t& _dependencies); } // namespace internal } // namespace blas diff --git a/src/interface/extension/omatadd.cpp.in b/src/interface/extension/omatadd.cpp.in index e1fd5be1b..21dffafcc 100644 --- a/src/interface/extension/omatadd.cpp.in +++ b/src/interface/extension/omatadd.cpp.in @@ -35,7 +35,8 @@ template typename SB_Handle::event_t _omatadd( SB_Handle& sb_handle, char transA, char transB, ${INDEX_TYPE} m, ${INDEX_TYPE} N, ${DATA_TYPE} alpha, ${container_t0} a, ${INDEX_TYPE} lda, ${DATA_TYPE} beta, ${container_t0} b, - ${INDEX_TYPE} ldb, ${container_t0} C, ${INDEX_TYPE} ldc); + ${INDEX_TYPE} ldb, ${container_t0} C, ${INDEX_TYPE} ldc, + const typename SB_Handle::event_t& _dependencies); } // namespace internal } // namespace blas diff --git a/src/interface/extension/reduction.cpp.in b/src/interface/extension/reduction.cpp.in index 59603fc0a..6e454e2f2 100644 --- a/src/interface/extension/reduction.cpp.in +++ b/src/interface/extension/reduction.cpp.in @@ -35,10 +35,11 @@ namespace blas { namespace internal { -template -typename SB_Handle::event_t _reduction<${OPERATOR}, ${DATA_TYPE}>( +template typename SB_Handle::event_t _reduction<${OPERATOR}, ${DATA_TYPE}>( SB_Handle& sb_handle, ${container_t0} buffer_in, ${INDEX_TYPE} ld, - ${container_t1} buffer_out, ${INDEX_TYPE} rows, ${INDEX_TYPE} cols, reduction_dim_t reduction_type); + ${container_t1} buffer_out, ${INDEX_TYPE} rows, ${INDEX_TYPE} cols, + reduction_dim_t reduction_dim, + const typename SB_Handle::event_t& _dependencies); } // namespace internal } // namespace blas diff --git a/src/interface/extension/transpose.cpp.in b/src/interface/extension/transpose.cpp.in index 5a8ac8cc2..674e01893 100644 --- a/src/interface/extension/transpose.cpp.in +++ 
b/src/interface/extension/transpose.cpp.in @@ -23,6 +23,7 @@ * **************************************************************************/ +#include "container/sycl_iterator.hpp" #include "interface/extension_interface.hpp" #include "sb_handle/kernel_constructor.hpp" #include "sb_handle/sycl_blas_handle.hpp" @@ -35,13 +36,15 @@ namespace internal { template typename SB_Handle::event_t _transpose (SB_Handle& sb_handle, ${INDEX_TYPE} m, ${INDEX_TYPE} n, ${container_t0} A, ${INDEX_TYPE} ld_a, - ${container_t0} B, ${INDEX_TYPE} ld_b); + ${container_t0} B, ${INDEX_TYPE} ld_b, + const typename SB_Handle::event_t& _dependencies); // In-place template typename SB_Handle::event_t _transpose (SB_Handle& sb_handle, ${INDEX_TYPE} m, ${INDEX_TYPE} n, ${container_t0} A, ${INDEX_TYPE} ld_a, - ${container_t0} B, ${INDEX_TYPE} ld_b); + ${container_t0} B, ${INDEX_TYPE} ld_b, + const typename SB_Handle::event_t& _dependencies); } // namespace internal } // namespace blas diff --git a/src/interface/extension_interface.hpp b/src/interface/extension_interface.hpp index f05f6716b..23740a513 100644 --- a/src/interface/extension_interface.hpp +++ b/src/interface/extension_interface.hpp @@ -59,7 +59,7 @@ template (cl_size / sizeof(element_t))); constexpr const index_t num_tiles_per_line = num_line_elems / Tile_size; @@ -97,12 +97,12 @@ template ::type _matcopy_impl(sb_handle_t& sb_handle, index_t m, index_t n, element_t alpha, in_t in_memory, index_t ld_in, index_t inc_in, out_t out_memory, - index_t ld_out, index_t inc_out) { + index_t ld_out, index_t inc_out, const typename sb_handle_t::event_t& _dependencies) { if constexpr (!in_place) { return blas::transpose::backend::_transpose_outplace< sb_handle_t, in_t, out_t, element_t, index_t>( sb_handle, m, n, alpha, in_memory, ld_in, inc_in, out_memory, ld_out, - inc_out); + inc_out, _dependencies); } else { // TODO @@ -120,7 +120,7 @@ template ::type _matcopy_impl(sb_handle_t& sb_handle, index_t m, index_t n, element_t alpha, in_t in_memory, index_t ld_in, index_t inc_in, out_t out_memory, - index_t ld_out, index_t inc_out) { + index_t ld_out, index_t inc_out, const typename sb_handle_t::event_t& _dependencies) { typename sb_handle_t::event_t ret; // if alpha=1 no need to multiply if (alpha == 1) { @@ -128,14 +128,14 @@ _matcopy_impl(sb_handle_t& sb_handle, index_t m, index_t n, element_t alpha, auto out_view = make_matrix_view(out_memory, m, n, ld_out, inc_out); auto copy_op = make_op(out_view, in_view); - ret = sb_handle.execute(copy_op); + ret = sb_handle.execute(copy_op, _dependencies); } else { auto in_view = make_matrix_view(in_memory, m, n, ld_in, inc_in); auto out_view = make_matrix_view(out_memory, m, n, ld_out, inc_out); auto scal_op = make_op(alpha, in_view); auto copy_op = make_op(out_view, scal_op); - ret = sb_handle.execute(copy_op); + ret = sb_handle.execute(copy_op, _dependencies); } return ret; } @@ -245,14 +245,9 @@ template typename sb_handle_t::event_t launch_type_based_reduction( sb_handle_t& sb_handle, input_t buffer_in, index_t ld, output_t buffer_out, - index_t rows, index_t cols) { -#ifdef POWER_VR - constexpr int ClSize = 32; - constexpr int WgSize = 64; -#else + index_t rows, index_t cols, const typename SB_Handle::event_t& dependencies) { constexpr int ClSize = 64; constexpr int WgSize = 256; -#endif constexpr index_t reductions_per_thread = 64; using params_t = blas::ReductionParams(matrix_buffer_in, temp_); reduction_event = - concatenate_vectors(reduction_event, sb_handle.execute(reduction)); + concatenate_vectors(reduction_event, 
sb_handle.execute(reduction, dependencies)); /* 2nd step */ auto reduction_step_2 = blas::make_reduction::type, params_t>(temp_, matrix_buffer_out); reduction_event = concatenate_vectors(reduction_event, - sb_handle.execute(reduction_step_2)); + sb_handle.execute(reduction_step_2, reduction_event)); } else { /* 1-step reduction */ auto reduction = blas::make_reduction( matrix_buffer_in, matrix_buffer_out); reduction_event = - concatenate_vectors(reduction_event, sb_handle.execute(reduction)); + concatenate_vectors(reduction_event, sb_handle.execute(reduction, dependencies)); } return reduction_event; @@ -318,7 +313,8 @@ typename sb_handle_t::event_t _matcopy(sb_handle_t& sb_handle, char trans, index_t m, index_t n, element_t alpha, in_t in_memory, index_t ld_in, index_t inc_in, out_t out_memory, - index_t ld_out, index_t inc_out) { + index_t ld_out, index_t inc_out, + const typename sb_handle_t::event_t& _dependencies) { // bail out early if the leading dimensions are not correct if (ld_in < (inc_in * (m - 1) + 1) || (ld_out - 1) < (trans == 't' ? inc_out * (n - 1) : inc_out * (m - 1))) { @@ -329,11 +325,11 @@ typename sb_handle_t::event_t _matcopy(sb_handle_t& sb_handle, char trans, if (trans == 't') { return _matcopy_impl(sb_handle, m, n, alpha, in_memory, ld_in, inc_in, out_memory, ld_out, - inc_out); + inc_out, _dependencies); } else { return _matcopy_impl(sb_handle, m, n, alpha, in_memory, ld_in, inc_in, out_memory, ld_out, - inc_out); + inc_out, _dependencies); } } @@ -344,7 +340,8 @@ typename sb_handle_t::event_t _omatadd(sb_handle_t& sb_handle, char trans_a, element_t alpha, container_t a, index_t lda, element_t beta, container_t b, index_t ldb, - container_t c, index_t ldc) { + container_t c, index_t ldc, + const typename sb_handle_t::event_t& _dependencies) { if (trans_a == 't') { if (trans_b == 't') { return _omatadd_impl(sb_handle, m, n, alpha, a, lda, beta, b, @@ -370,7 +367,8 @@ template typename sb_handle_t::event_t _transpose(sb_handle_t& sb_handle, index_t m, index_t n, in_t A, index_t ld_a, - out_t B, index_t ld_b) { + out_t B, index_t ld_b, + const typename sb_handle_t::event_t& _dependencies) { // bail out early if the leading dimensions are not correct if (ld_a < m || ld_b < n) { typename sb_handle_t::event_t ret; @@ -381,7 +379,7 @@ typename sb_handle_t::event_t _transpose(sb_handle_t& sb_handle, index_t m, const element_t alpha = element_t(1); return _matcopy_impl(sb_handle, m, n, alpha, A, ld_a, inc, B, - ld_b, inc); + ld_b, inc, _dependencies); } template (sb_handle, buffer_in, ld, - buffer_out, rows, cols); + buffer_out, rows, cols, + dependencies); } else { // reduction_dim_t::outer return launch_type_based_reduction(sb_handle, buffer_in, ld, - buffer_out, rows, cols); + buffer_out, rows, cols, + dependencies); } } diff --git a/src/interface/gemm_interface.hpp b/src/interface/gemm_interface.hpp index 1bd56f3f3..9bdd8186f 100644 --- a/src/interface/gemm_interface.hpp +++ b/src/interface/gemm_interface.hpp @@ -56,10 +56,11 @@ typename sb_handle_t::event_t _gemm_platform_specific( element_t _alpha, container_0_t a_, index_t _lda, index_t _stridea, container_1_t b_, index_t _ldb, index_t _strideb, element_t _beta, container_2_t _C, index_t _ldc, index_t _stridec, index_t batch_size, - gemm_batch_type_t batch_type) { + gemm_batch_type_t batch_type, + const typename sb_handle_t::event_t& _dependencies) { return blas::gemm::backend::_gemm<_t_a, _t_b, s_a, s_b, is_beta_zero>( sb_handle, _M, _N, _K, _alpha, a_, _lda, _stridea, b_, _ldb, _strideb, - _beta, _C, _ldc, 
_stridec, batch_size, batch_type); + _beta, _C, _ldc, _stridec, batch_size, batch_type, _dependencies); } template (0)) ? _gemm_platform_specific<_t_a, _t_b, s_a, s_b, true>( sb_handle, _M, _N, _K, _alpha, a_, _lda, _stridea, b_, _ldb, - _strideb, _beta, _C, _ldc, _stridec, batch_size, batch_type) + _strideb, _beta, _C, _ldc, _stridec, batch_size, batch_type, + _dependencies) : _gemm_platform_specific<_t_a, _t_b, s_a, s_b, false>( sb_handle, _M, _N, _K, _alpha, a_, _lda, _stridea, b_, _ldb, - _strideb, _beta, _C, _ldc, _stridec, batch_size, - batch_type)); + _strideb, _beta, _C, _ldc, _stridec, batch_size, batch_type, + _dependencies)); } template ( sb_handle, _M, _N, _K, _alpha, a_, _lda, _stridea, b_, _ldb, _strideb, - _beta, _C, _ldc, _stridec, batch_size, batch_type); + _beta, _C, _ldc, _stridec, batch_size, batch_type, _dependencies); } else if (!_TrA && _TrB) { return _gemm_is_beta_zero( sb_handle, _M, _N, _K, _alpha, a_, _lda, _stridea, b_, _ldb, _strideb, - _beta, _C, _ldc, _stridec, batch_size, batch_type); + _beta, _C, _ldc, _stridec, batch_size, batch_type, _dependencies); } else if (_TrA && !_TrB) { return _gemm_is_beta_zero( sb_handle, _M, _N, _K, _alpha, a_, _lda, _stridea, b_, _ldb, _strideb, - _beta, _C, _ldc, _stridec, batch_size, batch_type); + _beta, _C, _ldc, _stridec, batch_size, batch_type, _dependencies); } else { return _gemm_is_beta_zero( sb_handle, _M, _N, _K, _alpha, a_, _lda, _stridea, b_, _ldb, _strideb, - _beta, _C, _ldc, _stridec, batch_size, batch_type); + _beta, _C, _ldc, _stridec, batch_size, batch_type, _dependencies); } } template -typename sb_handle_t::event_t _gemm(sb_handle_t& sb_handle, char _TransA, - char _TransB, index_t _M, index_t _N, - index_t _K, element_t _alpha, - container_0_t a_, index_t _lda, - container_1_t b_, index_t _ldb, - element_t _beta, container_2_t _C, - index_t _ldc) { - return _gemm_backend(sb_handle, _TransA, _TransB, _M, _N, _K, - _alpha, a_, _lda, index_t(0), b_, _ldb, - index_t(0), _beta, _C, _ldc, index_t(0), - index_t(1), gemm_batch_type_t::strided); +typename sb_handle_t::event_t _gemm( + sb_handle_t& sb_handle, char _TransA, char _TransB, index_t _M, index_t _N, + index_t _K, element_t _alpha, container_0_t a_, index_t _lda, + container_1_t b_, index_t _ldb, element_t _beta, container_2_t _C, + index_t _ldc, const typename sb_handle_t::event_t& _dependencies) { + return _gemm_backend( + sb_handle, _TransA, _TransB, _M, _N, _K, _alpha, a_, _lda, index_t(0), b_, + _ldb, index_t(0), _beta, _C, _ldc, index_t(0), index_t(1), + gemm_batch_type_t::strided, _dependencies); } template ( - sb_handle, _TransA, _TransB, _M, _N, _K, _alpha, a_, _lda, _stridea, b_, - _ldb, _strideb, _beta, _C, _ldc, _stridec, batch_size, batch_type); + return _gemm_backend(sb_handle, _TransA, _TransB, _M, _N, _K, + _alpha, a_, _lda, _stridea, b_, _ldb, + _strideb, _beta, _C, _ldc, _stridec, + batch_size, batch_type, _dependencies); } template (sb_handle, _TransA, _TransB, _M, _N, _K, - _alpha, a_, _lda, _stridea, b_, _ldb, - _strideb, _beta, _C, _ldc, _stridec, - batch_size, gemm_batch_type_t::strided); + index_t batch_size, const typename sb_handle_t::event_t& _dependencies) { + return _gemm_backend( + sb_handle, _TransA, _TransB, _M, _N, _K, _alpha, a_, _lda, _stridea, b_, + _ldb, _strideb, _beta, _C, _ldc, _stridec, batch_size, + gemm_batch_type_t::strided, _dependencies); } } // namespace internal diff --git a/src/interface/gemm_launcher.hpp b/src/interface/gemm_launcher.hpp index a62ac6031..3ead89c04 100644 --- a/src/interface/gemm_launcher.hpp 
+++ b/src/interface/gemm_launcher.hpp @@ -34,26 +34,27 @@ namespace blas { /*! * @brief Wrapper around Gemm. Creates the views, then makes and launches Gemm */ -template -template -typename sb_handle_t::event_t -Gemm_Launcher::_select_gemm(sb_handle_t& sb_handle, index_t _M, - index_t _N, index_t _K, - element_t _alpha, container_t0 a_, - index_t _lda, index_t _stridea, - container_t1 b_, index_t _ldb, - index_t _strideb, element_t _beta, - container_t2 _C, index_t _ldc, - index_t _stridec, - index_t batch_size) { +template +typename sb_handle_t::event_t Gemm_Launcher< + container_t0, container_t1, container_t2, WgSize, DoubleBuffer, ConflictA, + ConflictB, ClSize, TileT, TransA, TransB, SymmA, SymmB, GemmMemoryType, + GemmAlgorithm, GemmVectorization, is_beta_zero, VectorSize, BatchType, + UseJointMatrix>::_select_gemm(sb_handle_t& sb_handle, index_t _M, + index_t _N, index_t _K, element_t _alpha, + container_t0 a_, index_t _lda, + index_t _stridea, container_t1 b_, + index_t _ldb, index_t _strideb, + element_t _beta, container_t2 _C, + index_t _ldc, index_t _stridec, + index_t batch_size, + const typename sb_handle_t::event_t& + _dependencies) { auto buffer_a = make_matrix_view(a_, _M, _K, _lda); auto buffer_b = make_matrix_view(b_, _K, _N, _ldb); auto buffer_c = make_matrix_view(_C, _M, _N, _ldc); @@ -65,7 +66,7 @@ Gemm_Launcher -typename sb_handle_t::event_t _symm(sb_handle_t& sb_handle, char _side, - char _uplo, index_t _M, index_t _N, - element_t _alpha, container_0_t a_, - index_t _lda, container_1_t b_, - index_t _ldb, element_t _beta, - container_2_t _C, index_t _ldc) { +typename sb_handle_t::event_t _symm( + sb_handle_t& sb_handle, char _side, char _uplo, index_t _M, index_t _N, + element_t _alpha, container_0_t a_, index_t _lda, container_1_t b_, + index_t _ldb, element_t _beta, container_2_t _C, index_t _ldc, + const typename sb_handle_t::event_t& _dependencies) { const char TRANS_NO = 'n'; const char TRANS_YES = 't'; const char SIDE_RIGHT = 'r'; @@ -57,7 +56,7 @@ typename sb_handle_t::event_t _symm(sb_handle_t& sb_handle, char _side, return _gemm_backend( sb_handle, trans_symm, TRANS_NO, _M, _N, _M, _alpha, a_, _lda, index_t(0), b_, _ldb, index_t(0), _beta, _C, _ldc, index_t(0), - index_t(1), gemm_batch_type_t::strided); + index_t(1), gemm_batch_type_t::strided, _dependencies); } else if (_side == SIDE_RIGHT) { // C <- alpha * B * A + beta * C // if the valid values are in the upper side, transpose the matrix // to make gemm to start reading rows on a valid value. 
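
The `_dependencies` argument threaded through `_symm` here, like every other entry point touched by this patch, is what lets a caller chain operations through returned events instead of waiting between calls. A minimal caller-side sketch of that pattern, assuming the public `blas::_gemm` and `blas::_symm` wrappers mirror the internal signatures shown in these hunks; `q`, the size variables and the USM pointers (`a_usm`, `b_usm`, `c_usm`, `s_usm`, `d_usm`) are illustrative placeholders, not names from the patch:

    blas::SB_Handle sb_handle(q);

    // First call: nothing to depend on yet, so pass an empty event list.
    auto gemm_events = blas::_gemm(sb_handle, 'n', 'n', m, n, k, alpha,
                                   a_usm, lda, b_usm, ldb, beta, c_usm, ldc,
                                   {});

    // Second call: handing the events returned by the GEMM to the SYMM as its
    // last argument makes the SYMM kernel wait on the GEMM result (c_usm)
    // without an explicit sb_handle.wait() in between.
    auto symm_events = blas::_symm(sb_handle, 'l', 'u', m, n, alpha,
                                   s_usm, lda, c_usm, ldc, beta, d_usm, ldc,
                                   gemm_events);

    sb_handle.wait(symm_events);
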
@@ -67,7 +66,7 @@ typename sb_handle_t::event_t _symm(sb_handle_t& sb_handle, char _side, return _gemm_backend( sb_handle, TRANS_NO, trans_symm, _M, _N, _N, _alpha, b_, _ldb, index_t(0), a_, _lda, index_t(0), _beta, _C, _ldc, index_t(0), - index_t(1), gemm_batch_type_t::strided); + index_t(1), gemm_batch_type_t::strided, _dependencies); } else { throw std::invalid_argument("invalid _side"); } diff --git a/src/interface/trsm_interface.hpp b/src/interface/trsm_interface.hpp index 3af818d78..19f59412b 100644 --- a/src/interface/trsm_interface.hpp +++ b/src/interface/trsm_interface.hpp @@ -46,9 +46,9 @@ namespace internal { * @param M The number of rows of matrix B, must be at least 1 * @param N The number of columns of B, must be at least 1 * @param alpha The scalar alpha that is applied to B - * @param A Buffer that holds the input matrix A + * @param A Memory object that holds the input matrix A * @param lda Leading dimension of matrix A - * @param B Buffer that holds the input/output matrix B + * @param B Memory object that holds the input/output matrix B * @param ldb Leading dimension of matrix B * * @note both matrices A and B are expected to be stored in column major order @@ -105,10 +105,11 @@ namespace internal { */ template -typename sb_handle_t::event_t _trsm(sb_handle_t& sb_handle, char side, - char uplo, char trans, char diag, index_t M, - index_t N, element_t alpha, container_0_t A, - index_t lda, container_1_t B, index_t ldb) { +typename sb_handle_t::event_t _trsm( + sb_handle_t& sb_handle, char side, char uplo, char trans, char diag, + index_t M, index_t N, element_t alpha, container_0_t A, index_t lda, + container_1_t B, index_t ldb, + const typename sb_handle_t::event_t& _dependencies) { // Makes sure all dimensions are larger than zero if ((M == 0) || (N == 0) || (lda == 0) || (ldb == 0)) { throw std::invalid_argument("invalid matrix size argument"); @@ -145,16 +146,18 @@ typename sb_handle_t::event_t _trsm(sb_handle_t& sb_handle, char side, // Temporary buffer for the inverse of the diagonal blocks of the matrix A // filled with zeroes const index_t invASize = roundUp(K, blockSize) * blockSize; - auto invA = make_sycl_iterator_buffer(invASize); - std::vector event = { - blas::helper::fill(sb_handle.get_queue(), invA, element_t{0}, invASize)}; + constexpr bool is_usm = std::is_pointer::value; + auto invA = helper::allocate < is_usm ? 
helper::AllocType::usm + : helper::AllocType::buffer, + element_t > (invASize, sb_handle.get_queue()); + typename sb_handle_t::event_t event = {blas::helper::fill( + sb_handle.get_queue(), invA, element_t{0}, invASize, _dependencies)}; trsmEvents = concatenate_vectors(trsmEvents, event); // Create the matrix views from the input buffers auto bufferA = make_matrix_view(A, K, K, lda); auto bufferInvA = make_matrix_view(invA, blockSize, blockSize, lda); - auto bufferB = make_matrix_view(B, M, N, ldb); // Calculate the parameters for the diagonal blocks inversion const index_t numBlocks = roundUp(K, blockSize) / blockSize; @@ -169,23 +172,23 @@ typename sb_handle_t::event_t _trsm(sb_handle_t& sb_handle, char side, if (isUnitDiag && isUpper) { auto diagInverter = make_diag_blocks_inverter(bufferA, bufferInvA); - invertBlocksEvent = - sb_handle.execute(diagInverter, localSize, globalSize, localMemSize); + invertBlocksEvent = sb_handle.execute(diagInverter, localSize, globalSize, + localMemSize, event); } else if (!isUnitDiag && isUpper) { auto diagInverter = make_diag_blocks_inverter(bufferA, bufferInvA); - invertBlocksEvent = - sb_handle.execute(diagInverter, localSize, globalSize, localMemSize); + invertBlocksEvent = sb_handle.execute(diagInverter, localSize, globalSize, + localMemSize, event); } else if (isUnitDiag && !isUpper) { auto diagInverter = make_diag_blocks_inverter(bufferA, bufferInvA); - invertBlocksEvent = - sb_handle.execute(diagInverter, localSize, globalSize, localMemSize); + invertBlocksEvent = sb_handle.execute(diagInverter, localSize, globalSize, + localMemSize, event); } else if (!isUnitDiag && !isUpper) { auto diagInverter = make_diag_blocks_inverter(bufferA, bufferInvA); - invertBlocksEvent = - sb_handle.execute(diagInverter, localSize, globalSize, localMemSize); + invertBlocksEvent = sb_handle.execute(diagInverter, localSize, globalSize, + localMemSize, event); } trsmEvents = concatenate_vectors(trsmEvents, invertBlocksEvent); @@ -193,9 +196,13 @@ typename sb_handle_t::event_t _trsm(sb_handle_t& sb_handle, char side, // output X will hold the TRSM result and will be copied to B at the end const index_t BSize = ldb * (N - 1) + M; const index_t ldx = ldb; - auto X = make_sycl_iterator_buffer(BSize); + auto X = helper::allocate < is_usm ? helper::AllocType::usm + : helper::AllocType::buffer, + element_t > (BSize, sb_handle.get_queue()); trsmEvents = concatenate_vectors( - trsmEvents, internal::_copy(sb_handle, BSize, B, 1, X, 1)); + trsmEvents, + internal::_copy( + sb_handle, BSize, B, 1, X, 1, trsmEvents)); if (isLeft) { if ((isUpper && isTranspose) || (!isUpper && !isTranspose)) { @@ -220,7 +227,7 @@ typename sb_handle_t::event_t _trsm(sb_handle_t& sb_handle, char side, sb_handle, isTranspose ? 't' : 'n', 'n', currentBlockSize, N, currentBlockSize, (i == 0) ? alpha : element_t{1}, invA + i * blockSize, blockSize, B + i, ldb, element_t{0}, X + i, - ldx); + ldx, trsmEvents); trsmEvents = concatenate_vectors(trsmEvents, gemmEvent); if ((i + blockSize) >= M) { @@ -230,10 +237,10 @@ typename sb_handle_t::event_t _trsm(sb_handle_t& sb_handle, char side, const std::ptrdiff_t offsetA = !isTranspose ? ((i + blockSize) + (i * lda)) : (i + (blockSize + i) * lda); - internal::_gemm( + gemmEvent = internal::_gemm( sb_handle, isTranspose ? 't' : 'n', 'n', M - i - blockSize, N, blockSize, element_t{-1}, A + offsetA, lda, X + i, ldx, - (i == 0) ? alpha : element_t{1}, B + i + blockSize, ldb); + (i == 0) ? 
alpha : element_t{1}, B + i + blockSize, ldb, gemmEvent); trsmEvents = concatenate_vectors(trsmEvents, gemmEvent); } } else { @@ -261,7 +268,7 @@ typename sb_handle_t::event_t _trsm(sb_handle_t& sb_handle, char side, sb_handle, isTranspose ? 't' : 'n', 'n', currentBlockSize, N, currentBlockSize, (i == iStart) ? alpha : element_t{1}, invA + i * blockSize, blockSize, B + i, ldb, element_t{0}, X + i, - ldx); + ldx, trsmEvents); trsmEvents = concatenate_vectors(trsmEvents, gemmEvent); if ((i - blockSize) < 0) { @@ -271,7 +278,7 @@ typename sb_handle_t::event_t _trsm(sb_handle_t& sb_handle, char side, gemmEvent = internal::_gemm( sb_handle, isTranspose ? 't' : 'n', 'n', i, N, currentBlockSize, element_t{-1}, A + (!isTranspose ? (i * lda) : i), lda, X + i, ldx, - (i == iStart) ? alpha : element_t{1}, B, ldb); + (i == iStart) ? alpha : element_t{1}, B, ldb, gemmEvent); trsmEvents = concatenate_vectors(trsmEvents, gemmEvent); } } @@ -303,7 +310,7 @@ typename sb_handle_t::event_t _trsm(sb_handle_t& sb_handle, char side, sb_handle, 'n', isTranspose ? 't' : 'n', M, currentBlockSize, currentBlockSize, (i == iStart) ? alpha : element_t{1}, B + i * ldb, ldb, invA + i * blockSize, blockSize, element_t{0}, X + i * ldx, - ldx); + ldx, trsmEvents); trsmEvents = concatenate_vectors(trsmEvents, gemmEvent); if ((i - blockSize) < 0) { @@ -313,7 +320,7 @@ typename sb_handle_t::event_t _trsm(sb_handle_t& sb_handle, char side, gemmEvent = internal::_gemm( sb_handle, 'n', isTranspose ? 't' : 'n', M, i, currentBlockSize, element_t{-1}, X + i * ldx, ldx, A + (!isTranspose ? i : (i * lda)), - lda, (i == iStart) ? alpha : element_t{1}, B, ldb); + lda, (i == iStart) ? alpha : element_t{1}, B, ldb, gemmEvent); trsmEvents = concatenate_vectors(trsmEvents, gemmEvent); } @@ -338,7 +345,8 @@ typename sb_handle_t::event_t _trsm(sb_handle_t& sb_handle, char side, auto gemmEvent = internal::_gemm( sb_handle, 'n', isTranspose ? 't' : 'n', M, currentBlockSize, currentBlockSize, (i == 0) ? alpha : element_t{1}, B + i * ldb, ldb, - invA + i * blockSize, blockSize, element_t{0}, X + i * ldx, ldx); + invA + i * blockSize, blockSize, element_t{0}, X + i * ldx, ldx, + trsmEvents); trsmEvents = concatenate_vectors(trsmEvents, gemmEvent); if ((i + blockSize) > N) { @@ -348,10 +356,11 @@ typename sb_handle_t::event_t _trsm(sb_handle_t& sb_handle, char side, const std::ptrdiff_t offset = !isTranspose ? (i + (blockSize + i) * lda) : (i + blockSize) + (i * lda); - gemmEvent = internal::_gemm( - sb_handle, 'n', isTranspose ? 't' : 'n', M, N - i - blockSize, - blockSize, element_t{-1}, X + i * ldx, ldx, A + offset, lda, - (i == 0) ? alpha : element_t{1}, B + (i + blockSize) * ldb, ldb); + gemmEvent = internal::_gemm(sb_handle, 'n', isTranspose ? 't' : 'n', M, + N - i - blockSize, blockSize, element_t{-1}, + X + i * ldx, ldx, A + offset, lda, + (i == 0) ? 
alpha : element_t{1}, + B + (i + blockSize) * ldb, ldb, gemmEvent); trsmEvents = concatenate_vectors(trsmEvents, gemmEvent); } } @@ -359,7 +368,13 @@ typename sb_handle_t::event_t _trsm(sb_handle_t& sb_handle, char side, // Copy bufferX to bufferB as the TRSM result trsmEvents = concatenate_vectors( - trsmEvents, internal::_copy(sb_handle, BSize, X, 1, B, 1)); + trsmEvents, + internal::_copy( + sb_handle, BSize, X, 1, B, 1, trsmEvents)); + + helper::enqueue_deallocate(trsmEvents, invA, sb_handle.get_queue()); + + helper::enqueue_deallocate(trsmEvents, X, sb_handle.get_queue()); return trsmEvents; } diff --git a/src/operations/blas1_trees.hpp b/src/operations/blas1_trees.hpp index 9fd795113..66d8666a2 100644 --- a/src/operations/blas1_trees.hpp +++ b/src/operations/blas1_trees.hpp @@ -28,6 +28,7 @@ #include "operations/blas1_trees.h" #include "operations/blas_operators.hpp" +#include "views/view.hpp" #include "views/view_sycl.hpp" #include #include diff --git a/src/operations/blas3/gemm_local.hpp b/src/operations/blas3/gemm_local.hpp index 0837e07c7..f4b1cbca9 100644 --- a/src/operations/blas3/gemm_local.hpp +++ b/src/operations/blas3/gemm_local.hpp @@ -278,12 +278,9 @@ class Gemm; + auto ptr_A = multi_ptr_(a_.get_pointer()) + (wg_batch_id * stridea_); + auto ptr_B = multi_ptr_(b_.get_pointer()) + (wg_batch_id * strideb_); + auto ptr_C = multi_ptr_(c_.get_pointer()) + (wg_batch_id * stridec_); auto sg = id.get_sub_group(); const index_t sg_id = sg.get_group_linear_id(); @@ -373,7 +372,6 @@ class Gemm *>(&scratch); diff --git a/src/operations/blas3/trsm.hpp b/src/operations/blas3/trsm.hpp index 1fc423a23..2c2ab5526 100644 --- a/src/operations/blas3/trsm.hpp +++ b/src/operations/blas3/trsm.hpp @@ -62,8 +62,8 @@ template SYCL_BLAS_INLINE void DiagonalBlocksInverter::eval( local_memory_t localMem, cl::sycl::nd_item<1> item) noexcept { - auto A = A_.get_data().get_pointer() + A_.get_access_displacement(); - auto invA = invA_.get_data().get_pointer() + invA_.get_access_displacement(); + auto A = A_.get_pointer(); + auto invA = invA_.get_pointer(); value_t* local = localMem.localAcc.get_pointer(); const index_t i = item.get_local_id(0); diff --git a/src/sb_handle/kernel_constructor.hpp b/src/sb_handle/kernel_constructor.hpp index 4f635466e..352037b8d 100644 --- a/src/sb_handle/kernel_constructor.hpp +++ b/src/sb_handle/kernel_constructor.hpp @@ -223,11 +223,9 @@ struct ExpressionTreeFunctor { }; template -static SYCL_BLAS_INLINE cl::sycl::event execute_tree(queue_t q_, - expression_tree_t t, - size_t _localSize, - size_t _globalSize, - size_t _shMem) { +static SYCL_BLAS_INLINE cl::sycl::event execute_tree( + queue_t q_, expression_tree_t t, size_t _localSize, size_t _globalSize, + size_t _shMem, std::vector dependencies) { using value_t = typename LocalMemoryType::type; @@ -237,6 +235,11 @@ static SYCL_BLAS_INLINE cl::sycl::event execute_tree(queue_t q_, cl::sycl::event ev; try { auto cg1 = [=](cl::sycl::handler &h) mutable { +#if SYCL_LANGUAGE_VERSION < 202000 + cl::sycl::event::wait(dependencies); +#else + h.depends_on(dependencies); +#endif t.bind(h); auto scratch = LocalMemory(shMem, h); diff --git a/src/sb_handle/sycl_blas_handle.hpp b/src/sb_handle/sycl_blas_handle.hpp index f8662bec8..6248df775 100644 --- a/src/sb_handle/sycl_blas_handle.hpp +++ b/src/sb_handle/sycl_blas_handle.hpp @@ -43,14 +43,15 @@ namespace blas { * @brief Executes the tree without defining required shared memory. 
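The trsm rewrite above repeats a pattern that shows up throughout this patch: derive the allocation type from the container (a raw pointer implies USM), make the initial fill depend on the caller's events, and enqueue the deallocation behind the events the routine is about to return. A condensed, hedged restatement of that pattern follows; the function name and `workspace_size` are illustrative, while the helper calls are the ones introduced by the patch.

// Hedged sketch (not part of the patch) of the workspace pattern used by
// _trsm above; sb_handle_t, container_t and element_t mirror the interface.
template <typename sb_handle_t, typename container_t, typename element_t,
          typename index_t>
typename sb_handle_t::event_t zeroed_workspace_example(
    sb_handle_t& sb_handle, index_t workspace_size,
    const typename sb_handle_t::event_t& _dependencies) {
  // A raw pointer means the caller is using USM; otherwise fall back to buffers.
  constexpr bool is_usm = std::is_pointer<container_t>::value;
  auto q = sb_handle.get_queue();
  auto workspace = blas::helper::allocate<
      is_usm ? blas::helper::AllocType::usm : blas::helper::AllocType::buffer,
      element_t>(workspace_size, q);

  // The zero-fill only starts once the caller's dependencies are satisfied.
  typename sb_handle_t::event_t events = {blas::helper::fill(
      q, workspace, element_t{0}, workspace_size, _dependencies)};

  // ... kernels that read/write `workspace` would append their events here ...

  // Deallocation is enqueued behind every event the routine returns.
  blas::helper::enqueue_deallocate(events, workspace, q);
  return events;
}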
*/ template -inline typename SB_Handle::event_t SB_Handle::execute(expression_tree_t t) { +inline typename SB_Handle::event_t SB_Handle::execute( + expression_tree_t t, const typename SB_Handle::event_t& dependencies) { const auto localSize = get_work_group_size(); auto _N = t.get_size(); auto nWG = (_N + localSize - 1) / localSize; auto globalSize = nWG * localSize; - return {execute_tree(get_queue(), t, localSize, - globalSize, 0)}; + return {execute_tree( + get_queue(), t, localSize, globalSize, 0, dependencies)}; }; /*! @@ -58,13 +59,14 @@ inline typename SB_Handle::event_t SB_Handle::execute(expression_tree_t t) { * required shared memory. */ template -inline typename SB_Handle::event_t SB_Handle::execute(expression_tree_t t, - index_t localSize) { +inline typename SB_Handle::event_t SB_Handle::execute( + expression_tree_t t, index_t localSize, + const typename SB_Handle::event_t& dependencies) { auto _N = t.get_size(); auto nWG = (_N + localSize - 1) / localSize; auto globalSize = nWG * localSize; - return {execute_tree(q_, t, localSize, - globalSize, 0)}; + return {execute_tree( + q_, t, localSize, globalSize, 0, dependencies)}; }; /*! @@ -72,11 +74,11 @@ inline typename SB_Handle::event_t SB_Handle::execute(expression_tree_t t, * required shared memory. */ template -inline typename SB_Handle::event_t SB_Handle::execute(expression_tree_t t, - index_t localSize, - index_t globalSize) { - return {execute_tree(q_, t, localSize, - globalSize, 0)}; +inline typename SB_Handle::event_t SB_Handle::execute( + expression_tree_t t, index_t localSize, index_t globalSize, + const typename SB_Handle::event_t& dependencies) { + return {execute_tree( + q_, t, localSize, globalSize, 0, dependencies)}; } /*! @@ -84,12 +86,11 @@ inline typename SB_Handle::event_t SB_Handle::execute(expression_tree_t t, * memory values. */ template -inline typename SB_Handle::event_t SB_Handle::execute(expression_tree_t t, - index_t localSize, - index_t globalSize, - index_t shMem) { - return {execute_tree(q_, t, localSize, - globalSize, shMem)}; +inline typename SB_Handle::event_t SB_Handle::execute( + expression_tree_t t, index_t localSize, index_t globalSize, index_t shMem, + const typename SB_Handle::event_t& dependencies) { + return {execute_tree( + q_, t, localSize, globalSize, shMem, dependencies)}; } /*! @@ -97,7 +98,8 @@ inline typename SB_Handle::event_t SB_Handle::execute(expression_tree_t t, */ template inline typename SB_Handle::event_t SB_Handle::execute( - AssignReduction t) { + AssignReduction t, + const typename SB_Handle::event_t& dependencies) { using expression_tree_t = AssignReduction; auto _N = t.get_size(); auto localSize = t.local_num_thread_; @@ -111,15 +113,19 @@ inline typename SB_Handle::event_t SB_Handle::execute( // Two accessors to local memory auto sharedSize = ((nWG < localSize) ? localSize : nWG); - auto shMem1 = make_sycl_iterator_buffer(sharedSize); - auto shMem2 = make_sycl_iterator_buffer(sharedSize); - - auto opShMem1 = lhs_t( - shMem1.template get_range_accessor(), - (typename lhs_t::index_t)shMem1.get_offset(), 1, sharedSize); - auto opShMem2 = lhs_t( - shMem2.template get_range_accessor(), - (typename lhs_t::index_t)shMem2.get_offset(), 1, sharedSize); + constexpr bool is_usm = std::is_same::value; + auto shMem1 = blas::helper::allocate < is_usm ? helper::AllocType::usm + : helper::AllocType::buffer, + typename lhs_t::value_t > (sharedSize, q_); + auto shMem2 = blas::helper::allocate < is_usm ? 
helper::AllocType::usm + : helper::AllocType::buffer, + typename lhs_t::value_t > (sharedSize, q_); + + auto opShMem1 = + make_vector_view(shMem1, typename rhs_t::index_t{1}, sharedSize); + auto opShMem2 = + make_vector_view(shMem2, typename rhs_t::index_t{1}, sharedSize); typename SB_Handle::event_t event; bool frst = true; bool even = false; @@ -130,20 +136,25 @@ inline typename SB_Handle::event_t SB_Handle::execute( auto localTree = expression_tree_t(((nWG == 1) ? lhs : opShMem1), rhs, localSize, globalSize); event.push_back(execute_tree( - q_, localTree, localSize, globalSize, sharedSize)); + q_, localTree, localSize, globalSize, sharedSize, dependencies)); } else { // THE OTHER CASES ALWAYS USE THE BINARY FUNCTION auto localTree = AssignReduction( ((nWG == 1) ? lhs : (even ? opShMem2 : opShMem1)), (even ? opShMem1 : opShMem2), localSize, globalSize); event.push_back(execute_tree( - q_, localTree, localSize, globalSize, sharedSize)); + q_, localTree, localSize, globalSize, sharedSize, event)); } _N = nWG; nWG = (_N + (2 * localSize) - 1) / (2 * localSize); frst = false; even = !even; } while (_N > 1); + + blas::helper::enqueue_deallocate(event, shMem1, q_); + + blas::helper::enqueue_deallocate(event, shMem2, q_); + return event; } @@ -154,7 +165,8 @@ inline typename SB_Handle::event_t SB_Handle::execute( template inline typename SB_Handle::event_t SB_Handle::execute( - AssignReduction t, local_memory_t scr) { + AssignReduction t, local_memory_t scr, + const typename SB_Handle::event_t& dependencies) { using expression_tree_t = AssignReduction; auto _N = t.get_size(); auto localSize = t.local_num_thread_; @@ -180,14 +192,14 @@ inline typename SB_Handle::event_t SB_Handle::execute( auto localTree = expression_tree_t(((nWG == 1) ? lhs : opShMem1), rhs, localSize, globalSize); event.push_back(execute_tree( - q_, localTree, localSize, globalSize, sharedSize)); + q_, localTree, localSize, globalSize, sharedSize, dependencies)); } else { // THE OTHER CASES ALWAYS USE THE BINARY FUNCTION auto localTree = AssignReduction( ((nWG == 1) ? lhs : (even ? opShMem2 : opShMem1)), (even ? 
opShMem1 : opShMem2), localSize, globalSize); event.push_back(execute_tree( - q_, localTree, localSize, globalSize, sharedSize)); + q_, localTree, localSize, globalSize, sharedSize, dependencies)); } _N = nWG; nWG = (_N + (2 * localSize) - 1) / (2 * localSize); @@ -207,7 +219,8 @@ inline typename SB_Handle::event_t SB_Handle::execute( TransB, SymmA, SymmB, element_t, is_beta_zero, GemmMemoryType, GemmAlgorithm, GemmVectorization, VectorSize, BatchType, UseJointMatrix> - gemm_tree) { + gemm_tree, + const typename SB_Handle::event_t& dependencies) { using gemm_t = Gemm(gemm_memory_t::local), int, - using_local_memory::enabled, using_local_memory::disabled>::type>( + using_local_memory::enabled, using_local_memory::disabled>::type>( q_, gemm_tree, rng.get_local_range()[0], rng.get_global_range()[0], - gemm_t::local_memory_size)}; + gemm_t::local_memory_size, dependencies)}; } /* Tall and skinny Gemm */ @@ -231,7 +244,8 @@ inline typename SB_Handle::event_t SB_Handle::execute( TransB, SymmA, SymmB, element_t, is_beta_zero, GemmMemoryType, static_cast(gemm_algorithm_t::tall_skinny), GemmVectorization, VectorSize, BatchType> - gemm_wrapper) { + gemm_wrapper, + const typename SB_Handle::event_t& dependencies) { using index_t = typename std::make_signed::type; const index_t rows = gemm_wrapper.m_; @@ -251,15 +265,19 @@ inline typename SB_Handle::event_t SB_Handle::execute( TransA, TransB, true, is_beta_zero, element_t, GemmMemoryType> gemm_partial(gemm_wrapper.a_, gemm_wrapper.b_, gemm_wrapper.c_, gemm_wrapper.alpha_, gemm_wrapper.beta_, 1); - auto events = execute(gemm_partial); + auto events = execute(gemm_partial, dependencies); return events; } /* Else use the tall and skinny algorithm */ + constexpr bool is_usm = + std::is_same::value; /* First step: partial gemm */ /* Create the cube buffer that will hold the output of the partial gemm */ - auto cube_buffer = make_sycl_iterator_buffer(rows * cols * depth); + auto cube_buffer = helper::allocate < is_usm ? helper::AllocType::usm + : helper::AllocType::buffer, + element_t > (rows * cols * depth, q_); /* Create a first matrix view used for the partial gemm */ auto cube_gemm = @@ -271,7 +289,7 @@ inline typename SB_Handle::event_t SB_Handle::execute( TransA, TransB, false, true, element_t, GemmMemoryType> gemm_partial(gemm_wrapper.a_, gemm_wrapper.b_, cube_gemm, gemm_wrapper.alpha_, gemm_wrapper.beta_, depth); - auto events = execute(gemm_partial); + auto events = execute(gemm_partial, dependencies); /* Create a second view used for the reduction */ auto cube_reduction = @@ -288,23 +306,25 @@ inline typename SB_Handle::event_t SB_Handle::execute( if (is_beta_zero && ldc == rows) { Reduction reduction( cube_reduction, gemm_wrapper.c_); - events = concatenate_vectors(events, execute(reduction)); + events = concatenate_vectors(events, execute(reduction, events)); } /* Otherwise we reduce to a temporary buffer */ else { /* Create a temporary buffer to hold alpha * A * B */ - auto temp_buffer = make_sycl_iterator_buffer(rows * cols); + auto temp_buffer = helper::allocate < is_usm ? 
helper::AllocType::usm + : helper::AllocType::buffer, + element_t > (rows * cols, q_); auto temp = make_matrix_view(temp_buffer, rows, cols, rows); /* Execute the reduction */ Reduction reduction( cube_reduction, temp); - events = concatenate_vectors(events, execute(reduction)); + events = concatenate_vectors(events, execute(reduction, events)); /* If beta is zero, simply do a 2D copy from the temp buffer to C */ if (is_beta_zero) { auto assignOp = make_op(gemm_wrapper.c_, temp); - events = concatenate_vectors(events, execute(assignOp)); + events = concatenate_vectors(events, execute(assignOp, events)); } /* Else add temp and beta * C and then assign to C */ else { @@ -312,10 +332,14 @@ inline typename SB_Handle::event_t SB_Handle::execute( gemm_wrapper.c_); auto addOp = make_op(temp, scalOp); auto assignOp = make_op(gemm_wrapper.c_, addOp); - events = concatenate_vectors(events, execute(assignOp)); + events = concatenate_vectors(events, execute(assignOp, events)); } + + helper::enqueue_deallocate(events, temp_buffer, q_); } + helper::enqueue_deallocate(events, cube_buffer, q_); + return events; } @@ -326,27 +350,30 @@ template - gemm_partial) { + gemm_partial, + const typename SB_Handle::event_t& dependencies) { auto gemm_partial_range = gemm_partial.get_nd_range(SB_Handle::get_num_compute_units()); return {execute_tree< Choose(gemm_memory_t::local), int, using_local_memory::enabled, using_local_memory::disabled>::type>( q_, gemm_partial, gemm_partial_range.get_local_range()[0], - gemm_partial_range.get_global_range()[0], - gemm_partial.local_memory_size)}; + gemm_partial_range.get_global_range()[0], gemm_partial.local_memory_size, + dependencies)}; } /* ReductionPartial */ template inline typename SB_Handle::event_t SB_Handle::execute( - Reduction reduction) { + Reduction reduction, + const typename SB_Handle::event_t& dependencies) { auto step_range = reduction.get_nd_range(SB_Handle::get_num_compute_units()); return {execute_tree( q_, reduction, step_range.get_local_range()[0], - step_range.get_global_range()[0], params_t::get_local_memory_size())}; + step_range.get_global_range()[0], params_t::get_local_memory_size(), + dependencies)}; } } // namespace blas diff --git a/src/sycl_blas.hpp b/src/sycl_blas.hpp index 008d53ca1..3ccac13c5 100644 --- a/src/sycl_blas.hpp +++ b/src/sycl_blas.hpp @@ -55,3 +55,5 @@ #include "operations/blas_operators.hpp" #include "views/view_sycl.hpp" + +#include "views/view.hpp" diff --git a/src/views/view.hpp b/src/views/view.hpp index b00c85d83..e29f1bdb0 100644 --- a/src/views/view.hpp +++ b/src/views/view.hpp @@ -33,22 +33,6 @@ namespace blas { -/*! -@brief Constructs a view from the given container re-using the container size. -@param data -@param disp -@param strd -*/ -template -VectorView<_value_t, _container_t, _IndexType, _IncrementType>::VectorView( - _container_t &data, _IndexType disp, _IncrementType strd) - : data_(data), - size_data_(data_.size()), - size_(data_.size()), - disp_(disp), - strd_(strd) {} - /*! @brief Creates a view with a size smaller than the container size. 
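The handle changes above feed the accumulated event list into each subsequent `execute` call rather than synchronising between steps. The same idea applies at the user level: pass the events returned by one operation as the dependency list of the next. A hedged sketch (made-up sizes and data, assuming an existing `sb_handle`) that chains an axpy into a dot purely through the returned events:

// Sketch: serialise dependent calls through their event lists instead of
// blocking waits, mirroring the concatenate_vectors(events, execute(..., events))
// pattern used inside the handle above.
auto q = sb_handle.get_queue();
constexpr int n = 1024;
std::vector<float> h_x(n, 1.0f), h_y(n, 2.0f);

auto x = blas::helper::allocate<blas::helper::AllocType::usm, float>(n, q);
auto y = blas::helper::allocate<blas::helper::AllocType::usm, float>(n, q);
auto r = blas::helper::allocate<blas::helper::AllocType::usm, float>(1, q);

auto cp_x = blas::helper::copy_to_device(q, h_x.data(), x, n);
auto cp_y = blas::helper::copy_to_device(q, h_y.data(), y, n);

// axpy waits for both copies; dot waits for axpy, via the event lists.
auto axpy_ev = blas::_axpy(sb_handle, n, 1.5f, x, 1, y, 1, {cp_x, cp_y});
auto dot_ev = blas::_dot(sb_handle, n, x, 1, y, 1, r, axpy_ev);
sb_handle.wait(dot_ev);

float result = 0;
sb_handle.wait(blas::helper::copy_to_host(q, r, &result, 1));

blas::helper::deallocate(x, q);
blas::helper::deallocate(y, q);
blas::helper::deallocate(r, q);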
@param data @@ -58,62 +42,29 @@ VectorView<_value_t, _container_t, _IndexType, _IncrementType>::VectorView( */ template -VectorView<_value_t, _container_t, _IndexType, _IncrementType>::VectorView( - _container_t &data, _IndexType disp, _IncrementType strd, _IndexType size) - : data_(data), - size_data_(data_.size()), - size_(0), - disp_(disp), - strd_(strd) { - initialize(size); -} +SYCL_BLAS_INLINE VectorView<_value_t, _container_t, _IndexType, + _IncrementType>::VectorView(_container_t data, + _IncrementType strd, + _IndexType size) + : data_(data), size_(size), strd_(strd), ptr_(data) {} /*! @brief Creates a view from an existing view. */ template +SYCL_BLAS_INLINE VectorView<_value_t, _container_t, _IndexType, _IncrementType>::VectorView( VectorView<_value_t, _container_t, _IndexType, _IncrementType> opV, - _IndexType disp, _IncrementType strd, _IndexType size) - : data_(opV.get_data()), - size_data_(opV.get_data().size()), - size_(0), - disp_(disp), - strd_(strd) { - initialize(size); -} - -/*! -@brief Initializes the view using the indexing values. -@param originalSize The original size of the container -*/ -template -inline void VectorView<_value_t, _container_t, _IndexType, - _IncrementType>::initialize(_IndexType originalSize) { - if (strd_ > 0) { - auto sizeV = (size_data_ - disp_); - auto quot = (sizeV + strd_ - 1) / strd_; // ceiling - size_ = quot; - } else if (strd_ > 0) { - auto nstrd = -strd_; - auto quot = (disp_ + nstrd) / nstrd; // ceiling - size_ = quot; - } else { - // Stride is zero, not valid! - throw std::invalid_argument("Cannot create view with 0 stride"); - } - if (originalSize < size_) size_ = originalSize; - if (strd_ < 0) disp_ += (size_ - 1) * strd_; -} + _IncrementType strd, _IndexType size) + : data_(opV.get_data()), size_(size), strd_(strd), ptr_(opV.get_data()) {} /*! * @brief Returns a reference to the container */ template -inline _container_t & +SYCL_BLAS_INLINE _container_t VectorView<_value_t, _container_t, _IndexType, _IncrementType>::get_data() { return data_; } @@ -123,38 +74,19 @@ VectorView<_value_t, _container_t, _IndexType, _IncrementType>::get_data() { */ template -inline _value_t * -VectorView<_value_t, _container_t, _IndexType, _IncrementType>::get_pointer() { - return data_; -} -/*! - * @brief Returns the displacement - */ -template -inline _IndexType VectorView<_value_t, _container_t, _IndexType, - _IncrementType>::get_access_displacement() { - return disp_; +SYCL_BLAS_INLINE _value_t* +VectorView<_value_t, _container_t, _IndexType, _IncrementType>::get_pointer() const { + return ptr_; } -/*! - * @brief Returns the displacement +/*! adjust_access_displacement + * @brief adjust pointer offset + * The user is responsible to adjust pointer offset for USM. */ template -inline void VectorView<_value_t, _container_t, _IndexType, - _IncrementType>::adjust_access_displacement() { - return data_ += disp_; -} - -/*! - * @brief Returns the size of the underlying container. - */ -template -inline _IndexType VectorView<_value_t, _container_t, _IndexType, - _IncrementType>::get_data_size() { - return size_data_; +SYCL_BLAS_INLINE void VectorView<_value_t, _container_t, _IndexType, + _IncrementType>::adjust_access_displacement() { } /*! 
@@ -162,8 +94,8 @@ inline _IndexType VectorView<_value_t, _container_t, _IndexType, */ template -inline _IndexType VectorView<_value_t, _container_t, _IndexType, - _IncrementType>::get_size() const { +SYCL_BLAS_INLINE _IndexType VectorView<_value_t, _container_t, _IndexType, + _IncrementType>::get_size() const { return size_; } @@ -172,177 +104,88 @@ inline _IndexType VectorView<_value_t, _container_t, _IndexType, */ template -inline _IncrementType +SYCL_BLAS_INLINE _IncrementType VectorView<_value_t, _container_t, _IndexType, _IncrementType>::get_stride() { return strd_; } -/**** EVALUATING ****/ -template -_value_t &VectorView<_value_t, _container_t, _IndexType, _IncrementType>::eval( - index_t i) { - auto ind = disp_; - if (strd_ > 0) { - ind += strd_ * i; - } else { - ind -= strd_ * (size_ - i - 1); - } - if (ind >= size_data_) { - // out of range access - throw std::invalid_argument("Out of range access"); - } - return data_[ind]; -} -template -void VectorView<_value_t, _container_t, _IndexType, _IncrementType>::print_h( - const char *name) { - int frst = 1; - std::cout << name << " = [ "; - for (index_t i = 0; i < size_; i++) { - if (frst) - std::cout << eval(i); - else - std::cout << " , " << eval(i); - frst = 0; - } - std::cout << " ]" << std::endl; -} - /*! * @brief Constructs a matrix view on the container. - * @param data Reference to the container. + * @param data Pointer to memory. * @param sizeR Number of rows. * @param sizeC Nummber of columns. */ -template -MatrixView<_value_t, _container_t, _IndexType, layout>::MatrixView( - _container_t &data, _IndexType sizeR, _IndexType sizeC) - : data_(data), - size_data_(data_.get_size()), +template +SYCL_BLAS_INLINE MatrixView<_value_t, _container_t, _IndexType, layout, has_inc>::MatrixView(_container_t data, _IndexType sizeR, _IndexType sizeC) : data_(data), sizeR_(sizeR), sizeC_(sizeC), - sizeL_((MatrixView<_value_t, _container_t, _IndexType, layout>::is_col_major()) - ? sizeR_ - : sizeC_)), - disp_(0) {} + sizeL_((layout::is_col_major()) ? sizeR_ : sizeC_), + ptr_(data) {} /*! * @brief Constructs a matrix view on the container. - * @param data Reference to the container. - * @param accessDev Row-major or column-major. + * @param data Pointer to memory. * @param sizeR Number of rows. * @param sizeC Number of columns. - * @param accessOpr * @param sizeL Size of the leading dimension. * @param disp Displacement from the start. */ template -MatrixView<_value_t, _container_t, _IndexType, layout>::MatrixView( - _container_t &data, _IndexType sizeR, _IndexType sizeC, _IndexType sizeL, - _IndexType disp) - : data_(data + disp), - size_data_(data_.size()), - sizeR_(sizeR), - sizeC_(sizeC), - sizeL_(sizeL), - disp_(0) {} - -/*! - * @brief Constructs a matrix view on the container. - * @param data Reference to the container. - * @param sizeR Number of rows. - * @param sizeC Number of columns. - * @param sizeL Size of the leading dimension. - * @param disp Displacement from the start. 
- */ -template -MatrixView<_value_t, _container_t, _IndexType, layout>::MatrixView( - _container_t &data, _IndexType sizeR, _IndexType sizeC, _IndexType sizeL, - _IndexType disp) - : data_(data + disp), - size_data_(data_.size()), - sizeR_(sizeR), - sizeC_(sizeC), - sizeL_(sizeL), - disp_(0) {} + typename layout, bool has_inc> +SYCL_BLAS_INLINE +MatrixView<_value_t, _container_t, _IndexType, layout, has_inc>::MatrixView( + _container_t data, _IndexType sizeR, _IndexType sizeC, _IndexType sizeL) + : data_(data), sizeR_(sizeR), sizeC_(sizeC), sizeL_(sizeL), ptr_(data) {} /*! *@brief Creates a matrix view from the given one but with different access * parameters. * @param opM Matrix view. - * @param accessDev Row-major or column-major. * @param sizeR Number of rows. * @param sizeC Number of columns. - * @param accessorOpr * @param sizeL Size of the leading dimension. * @param disp Displacement from the start. */ template -MatrixView<_value_t, _container_t, _IndexType, layout>::MatrixView( - MatrixView<_value_t, _container_t, _IndexType, layout> opM, - _IndexType sizeR, _IndexType sizeC, _IndexType sizeL, _IndexType disp) - : data_(opM.data_ + disp), - size_data_(opM.size_data_), - sizeR_(sizeR), - sizeC_(sizeC), - sizeL_(sizeL), - disp_(0) {} - -/*! - * @brief Creates a matrix view from the given one. - * @param opM Matrix view. - * @param sizeR Number of rows. - * @param sizeC Number of columns. - * @param accessorOpr - * @param sizeL Size of leading dimension. - * @param disp Displacement from the start. - */ -template -MatrixView<_value_t, _container_t, _IndexType, layout>::MatrixView( - MatrixView<_value_t, _container_t, _IndexType, layout> opM, - _IndexType sizeR, _IndexType sizeC, _IndexType sizeL, _IndexType disp) - : data_(opM.data_ + disp), - size_data_(opM.size_data_), + typename layout, bool has_inc> +SYCL_BLAS_INLINE +MatrixView<_value_t, _container_t, _IndexType, layout, has_inc>::MatrixView( + MatrixView<_value_t, _container_t, _IndexType, layout, has_inc> opM, + _IndexType sizeR, _IndexType sizeC, _IndexType sizeL) + : data_(opM.get_data()), sizeR_(sizeR), sizeC_(sizeC), sizeL_(sizeL), - disp_(0) {} + ptr_(opM.get_data()) {} /*! * @brief Returns the container */ template -inline _container_t & -MatrixView<_value_t, _container_t, _IndexType, layout>::get_data() { + typename layout, bool has_inc> +SYCL_BLAS_INLINE _container_t +MatrixView<_value_t, _container_t, _IndexType, layout, has_inc>::get_data() { return data_; } /*! - * @brief Returns the data size + * @brief Returns the size of the view. */ template -inline _IndexType -MatrixView<_value_t, _container_t, _IndexType, layout>::get_data_size() const { - return size_data_; + typename layout, bool has_inc> +SYCL_BLAS_INLINE _IndexType +MatrixView<_value_t, _container_t, _IndexType, layout, has_inc>::get_size() const { + return sizeR_ * sizeC_; } /*! - * @brief Returns the size of the view. + * @brief Returns a pointer to the container */ template -inline _IndexType -MatrixView<_value_t, _container_t, _IndexType, layout>::get_size() const { - return sizeR_ * sizeC_; + typename layout, bool has_inc> +SYCL_BLAS_INLINE _value_t* +MatrixView<_value_t, _container_t, _IndexType, layout, has_inc>::get_pointer() const { + return ptr_; } /*! get_size_row. @@ -351,9 +194,9 @@ MatrixView<_value_t, _container_t, _IndexType, layout>::get_size() const { * is currently set to Rows. 
*/ template -inline _IndexType -MatrixView<_value_t, _container_t, _IndexType, layout>::get_size_row() const { + typename layout, bool has_inc> +SYCL_BLAS_INLINE _IndexType +MatrixView<_value_t, _container_t, _IndexType, layout, has_inc>::get_size_row() const { return sizeR_; } @@ -363,51 +206,30 @@ MatrixView<_value_t, _container_t, _IndexType, layout>::get_size_row() const { * is currently set to Rows. */ template -inline _IndexType -MatrixView<_value_t, _container_t, _IndexType, layout>::get_size_col() const { + typename layout, bool has_inc> +SYCL_BLAS_INLINE _IndexType +MatrixView<_value_t, _container_t, _IndexType, layout, has_inc>::get_size_col() const { return sizeC_; } -/*! get_access_displacement. - * @brief get displacement from the origin. - */ -template -inline _IndexType MatrixView<_value_t, _container_t, _IndexType, - layout>::get_access_displacement() const { - return disp_; -} - -/*! get_access_displacement. - * @brief get displacement from the origin. - */ -template -inline void MatrixView<_value_t, _container_t, _IndexType, - layout>::adjust_access_displacement() { - return data_ += disp_; -} - -/*! eval. - * @brief Evaluation for the given linear value. +/*! getSizeL. + * @brief Return the leading dimension. */ template -_value_t &MatrixView<_value_t, _container_t, _IndexType, layout>::eval( - _IndexType ind) { - return data_[ind]; + typename layout, bool has_inc> +SYCL_BLAS_INLINE const _IndexType +MatrixView<_value_t, _container_t, _IndexType, layout, has_inc>::getSizeL() const { + return sizeL_; } -/*! eval. - * @brief Evaluation for the pair of row/col. +/*! adjust_access_displacement. + * @brief adjust pointer offset + * The user is responsible to adjust pointer offset for USM. */ template -_value_t &MatrixView<_value_t, _container_t, _IndexType, layout>::eval( - _IndexType i, _IndexType j) { - return ((layout::is_col_major()) ? data_[(sizeL_ * i) + j] - : data_[(sizeL_ * j) + i]); + typename layout, bool has_inc> +SYCL_BLAS_INLINE void MatrixView<_value_t, _container_t, _IndexType, + layout, has_inc>::adjust_access_displacement() { } } // namespace blas diff --git a/src/views/view_sycl.hpp b/src/views/view_sycl.hpp index 266b0649f..451f8e7e1 100644 --- a/src/views/view_sycl.hpp +++ b/src/views/view_sycl.hpp @@ -139,11 +139,6 @@ struct VectorView< */ SYCL_BLAS_INLINE index_t get_size() const { return size_; } - /*! - * @brief See VectorView. - */ - SYCL_BLAS_INLINE index_t get_access_displacement() const { return disp_; } - /*! * @brief See VectorView. 
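With the view rework above, a view wraps a raw pointer (or buffer iterator) plus size and stride only; the displacement bookkeeping is gone and `adjust_access_displacement()` is now a no-op, so any offset is applied by the caller before the view is created. A small hedged sketch with illustrative names and sizes:

// Sketch: the offset is applied before the view is built. For a USM pointer
// this is plain pointer arithmetic; for a buffer iterator the same `+`
// advances the iterator, as the trsm code above does with expressions like
// `invA + i * blockSize`.
auto q = sb_handle.get_queue();
auto x_ptr = blas::helper::allocate<blas::helper::AllocType::usm, float>(512, q);

int offset = 16, stride = 1, n = 256;
auto x_view = blas::make_vector_view(x_ptr + offset, stride, n);

// Views no longer store a displacement, so this call is a no-op kept for
// interface compatibility; the offset above already did the work.
x_view.adjust_access_displacement();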
*/ @@ -267,8 +262,6 @@ struct MatrixView< SYCL_BLAS_INLINE const index_t get_size_col() const { return sizeC_; } - SYCL_BLAS_INLINE index_t get_access_displacement() const { return disp_; } - SYCL_BLAS_INLINE scalar_t *get_pointer() const { return ptr_; } /**** EVALUATING ***/ diff --git a/test/unittest/blas1/blas1_asum_test.cpp b/test/unittest/blas1/blas1_asum_test.cpp index 2c64630f5..1dac65eaa 100644 --- a/test/unittest/blas1/blas1_asum_test.cpp +++ b/test/unittest/blas1/blas1_asum_test.cpp @@ -26,14 +26,15 @@ #include "blas_test.hpp" template -using combination_t = std::tuple; +using combination_t = std::tuple; -template +template void run_test(const combination_t combi) { + std::string alloc; api_type api; index_t size; index_t incX; - std::tie(api, size, incX) = combi; + std::tie(alloc, api, size, incX) = combi; // Input vector std::vector x_v(size * incX); @@ -55,37 +56,67 @@ void run_test(const combination_t combi) { blas::SB_Handle sb_handle(q); // Iterators - auto gpu_x_v = blas::make_sycl_iterator_buffer(x_v, size * incX); + auto gpu_x_v = helper::allocate(size * incX, q); + auto copy_x = + helper::copy_to_device(q, x_v.data(), gpu_x_v, size * incX); if (api == api_type::async) { - auto gpu_out_s = blas::make_sycl_iterator_buffer(&out_s, 1); - _asum(sb_handle, size, gpu_x_v, incX, gpu_out_s); - auto event = blas::helper::copy_to_host(sb_handle.get_queue(), - gpu_out_s, &out_s, 1); + auto gpu_out_s = helper::allocate(1, q); + auto copy_out = helper::copy_to_device(q, &out_s, gpu_out_s, 1); + auto asum_event = + _asum(sb_handle, size, gpu_x_v, incX, gpu_out_s, {copy_x, copy_out}); + sb_handle.wait(asum_event); + auto event = helper::copy_to_host(sb_handle.get_queue(), + gpu_out_s, &out_s, 1); sb_handle.wait(event); + helper::deallocate(gpu_out_s, q); } else { - out_s = _asum(sb_handle, size, gpu_x_v, incX); + out_s = _asum(sb_handle, size, gpu_x_v, incX, {copy_x}); } // Validate the result const bool is_almost_equal = utils::almost_equal(out_s, out_cpu_s); ASSERT_TRUE(is_almost_equal); + + helper::deallocate(gpu_x_v, q); +} + +template +static void run_test(const combination_t combi) { + std::string alloc; + api_type api; + index_t size; + index_t incX; + std::tie(alloc, api, size, incX) = combi; + + if (alloc == "usm") { +#ifdef SB_ENABLE_USM + run_test(combi); +#else + GTEST_SKIP(); +#endif + } else { + run_test(combi); + } } template -const auto combi = ::testing::Combine(::testing::Values(api_type::async, - api_type::sync), // Api - ::testing::Values(11, 65, 10000, - 1002400), // size - ::testing::Values(1, 4) // incX -); +const auto combi = + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values(api_type::async, + api_type::sync), // Api + ::testing::Values(11, 65, 10000, + 1002400), // size + ::testing::Values(1, 4) // incX + ); template static std::string generate_name( const ::testing::TestParamInfo>& info) { + std::string alloc; api_type api; int size, incX; - BLAS_GENERATE_NAME(info.param, api, size, incX); + BLAS_GENERATE_NAME(info.param, alloc, api, size, incX); } BLAS_REGISTER_TEST_ALL(Asum, combination_t, combi, generate_name); diff --git a/test/unittest/blas1/blas1_axpy_test.cpp b/test/unittest/blas1/blas1_axpy_test.cpp index 1df8671ec..cb395fb92 100644 --- a/test/unittest/blas1/blas1_axpy_test.cpp +++ b/test/unittest/blas1/blas1_axpy_test.cpp @@ -26,15 +26,16 @@ #include "blas_test.hpp" template -using combination_t = std::tuple; +using combination_t = std::tuple; -template +template void run_test(const combination_t combi) { + 
std::string alloc; index_t size; scalar_t alpha; index_t incX; index_t incY; - std::tie(size, alpha, incX, incY) = combi; + std::tie(alloc, size, alpha, incX, incY) = combi; // Input vector std::vector x_v(size * incX); @@ -52,23 +53,52 @@ void run_test(const combination_t combi) { blas::SB_Handle sb_handle(q); // Iterators - auto gpu_x_v = blas::make_sycl_iterator_buffer(x_v, size * incX); - auto gpu_y_v = blas::make_sycl_iterator_buffer(y_v, size * incY); + auto gpu_x_v = helper::allocate(size * incX, q); + auto gpu_y_v = helper::allocate(size * incY, q); + + auto copy_x = helper::copy_to_device(q, x_v.data(), gpu_x_v, size * incX); + auto copy_y = helper::copy_to_device(q, y_v.data(), gpu_y_v, size * incY); - _axpy(sb_handle, size, alpha, gpu_x_v, incX, gpu_y_v, incY); - auto event = blas::helper::copy_to_host(sb_handle.get_queue(), gpu_y_v, - y_v.data(), size * incY); + auto axpy_event = _axpy(sb_handle, size, alpha, gpu_x_v, incX, gpu_y_v, incY, + {copy_x, copy_y}); + sb_handle.wait(axpy_event); + + auto event = helper::copy_to_host(q, gpu_y_v, y_v.data(), size * incY); sb_handle.wait(event); // Validate the result const bool isAlmostEqual = utils::compare_vectors(y_v, y_cpu_v); ASSERT_TRUE(isAlmostEqual); + + helper::deallocate(gpu_x_v, q); + helper::deallocate(gpu_y_v, q); +} + +template +void run_test(const combination_t combi) { + std::string alloc; + index_t size; + scalar_t alpha; + index_t incX; + index_t incY; + std::tie(alloc, size, alpha, incX, incY) = combi; + + if (alloc == "usm") { // usm alloc +#ifdef SB_ENABLE_USM + run_test(combi); +#else + GTEST_SKIP(); +#endif + } else { // buffer alloc + run_test(combi); + } } #ifdef STRESS_TESTING template const auto combi = - ::testing::Combine(::testing::Values(11, 65, 1002, 1002400), // size + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values(11, 65, 1002, 1002400), // size ::testing::Values(0.0, 1.0, 1.5), // alpha ::testing::Values(1, 4), // incX ::testing::Values(1, 3) // incY @@ -76,7 +106,8 @@ const auto combi = #else template const auto combi = - ::testing::Combine(::testing::Values(11, 1002), // size + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values(11, 1002), // size ::testing::Values(0.0, 1.5), // alpha ::testing::Values(1, 4), // incX ::testing::Values(1, 3) // incY @@ -86,9 +117,10 @@ const auto combi = template static std::string generate_name( const ::testing::TestParamInfo>& info) { + std::string alloc; int size, incX, incY; T alpha; - BLAS_GENERATE_NAME(info.param, size, alpha, incX, incY); + BLAS_GENERATE_NAME(info.param, alloc, size, alpha, incX, incY); } BLAS_REGISTER_TEST_ALL(Axpy, combination_t, combi, generate_name); diff --git a/test/unittest/blas1/blas1_copy_test.cpp b/test/unittest/blas1/blas1_copy_test.cpp index caa68a5d9..7e0d03953 100644 --- a/test/unittest/blas1/blas1_copy_test.cpp +++ b/test/unittest/blas1/blas1_copy_test.cpp @@ -26,16 +26,17 @@ #include "blas_test.hpp" template -using combination_t = std::tuple; +using combination_t = std::tuple; -template +template void run_test(const combination_t combi) { + std::string alloc; index_t size; index_t incX; index_t incY; scalar_t unused; /* Work around dpcpp compiler bug (https://github.com/intel/llvm/issues/7075) */ - std::tie(size, incX, incY, unused) = combi; + std::tie(alloc, size, incX, incY, unused) = combi; // Input vector std::vector x_v(size * incX); @@ -53,10 +54,18 @@ void run_test(const combination_t combi) { blas::SB_Handle sb_handle(q); // Iterators - 
auto gpu_x_v = blas::make_sycl_iterator_buffer(x_v, size * incX); - auto gpu_y_v = blas::make_sycl_iterator_buffer(y_v, size * incY); + auto gpu_x_v = blas::helper::allocate(size * incX, q); + auto gpu_y_v = blas::helper::allocate(size * incY, q); + + auto copy_event_x = + blas::helper::copy_to_device(q, x_v.data(), gpu_x_v, size * incX); - _copy(sb_handle, size, gpu_x_v, incX, gpu_y_v, incY); + auto copy_event_y = + blas::helper::copy_to_device(q, y_v.data(), gpu_y_v, size * incY); + + auto kernel_event = _copy(sb_handle, size, gpu_x_v, incX, gpu_y_v, incY, + {copy_event_x, copy_event_y}); + sb_handle.wait(kernel_event); auto event = blas::helper::copy_to_host(sb_handle.get_queue(), gpu_y_v, y_v.data(), size * incY); sb_handle.wait(event); @@ -64,30 +73,59 @@ void run_test(const combination_t combi) { // Validate the result // For copy, the float tolerances are ok ASSERT_TRUE(utils::compare_vectors(y_v, y_cpu_v)); + + helper::deallocate(gpu_x_v, q); + helper::deallocate(gpu_y_v, q); +} + +template +void run_test(const combination_t combi) { + std::string alloc; + index_t size; + index_t incX; + index_t incY; + scalar_t unused; + std::tie(alloc, size, incX, incY, unused) = combi; + + if (alloc == "usm") { // usm alloc +#ifdef SB_ENABLE_USM + run_test(combi); +#else + GTEST_SKIP(); +#endif + } else { // buffer alloc + run_test(combi); + } } #ifdef STRESS_TESTING template -const auto combi = ::testing::Combine(::testing::Values(11, 65, 1002, - 1002400), // size - ::testing::Values(1, 4), // incX - ::testing::Values(1, 3) // incY -); +const auto combi = + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values(11, 65, 1002, + 1002400), // size + ::testing::Values(1, 4), // incX + ::testing::Values(1, 3), // incY + ::testing::Values(0) // unused + ); #else template -const auto combi = ::testing::Combine(::testing::Values(11, 1002), // size - ::testing::Values(1, 4), // incX - ::testing::Values(1, 3), // incY - ::testing::Values(0) // unused -); +const auto combi = + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values(11, 1002), // size + ::testing::Values(1, 4), // incX + ::testing::Values(1, 3), // incY + ::testing::Values(0) // unused + ); #endif template static std::string generate_name( const ::testing::TestParamInfo>& info) { + std::string alloc; int size, incX, incY; T unused; - BLAS_GENERATE_NAME(info.param, size, incX, incY, unused); + BLAS_GENERATE_NAME(info.param, alloc, size, incX, incY, unused); } BLAS_REGISTER_TEST_ALL(Copy, combination_t, combi, generate_name); diff --git a/test/unittest/blas1/blas1_dot_test.cpp b/test/unittest/blas1/blas1_dot_test.cpp index deb1a40c7..6a8b3bc40 100644 --- a/test/unittest/blas1/blas1_dot_test.cpp +++ b/test/unittest/blas1/blas1_dot_test.cpp @@ -26,15 +26,16 @@ #include "blas_test.hpp" template -using combination_t = std::tuple; +using combination_t = std::tuple; -template +template void run_test(const combination_t combi) { + std::string alloc; api_type api; index_t size; index_t incX; index_t incY; - std::tie(api, size, incX, incY) = combi; + std::tie(alloc, api, size, incX, incY) = combi; // Input vectors std::vector x_v(size * incX); @@ -53,50 +54,86 @@ void run_test(const combination_t combi) { auto q = make_queue(); blas::SB_Handle sb_handle(q); - // Iterators - auto gpu_x_v = blas::make_sycl_iterator_buffer(x_v, size * incX); - auto gpu_y_v = blas::make_sycl_iterator_buffer(y_v, size * incY); + // Iterators + auto gpu_x_v = helper::allocate(size * incX, q); + auto 
gpu_y_v = helper::allocate(size * incY, q); + + auto copy_x = helper::copy_to_device(q, x_v.data(), gpu_x_v, size * incX); + auto copy_y = helper::copy_to_device(q, y_v.data(), gpu_y_v, size * incY); - if (api == api_type::async) { - auto gpu_out_s = blas::make_sycl_iterator_buffer(&out_s, 1); - _dot(sb_handle, size, gpu_x_v, incX, gpu_y_v, incY, gpu_out_s); - auto event = blas::helper::copy_to_host(sb_handle.get_queue(), gpu_out_s, - &out_s, 1); - sb_handle.wait(event); - } else { - out_s = _dot(sb_handle, size, gpu_x_v, incX, gpu_y_v, incY); - } + if (api == api_type::async) { + auto gpu_out_s = helper::allocate(1, q); + auto copy_out = helper::copy_to_device(q, &out_s, gpu_out_s, 1); + auto dot_event = _dot(sb_handle, size, gpu_x_v, incX, gpu_y_v, incY, + gpu_out_s, {copy_x, copy_y, copy_out}); + sb_handle.wait(dot_event); + auto event = + helper::copy_to_host(sb_handle.get_queue(), gpu_out_s, &out_s, 1); + sb_handle.wait(event); + helper::deallocate(gpu_out_s, q); + } else { + out_s = + _dot(sb_handle, size, gpu_x_v, incX, gpu_y_v, incY, {copy_x, copy_y}); + } // Validate the result const bool isAlmostEqual = utils::almost_equal(out_s, out_cpu_s); ASSERT_TRUE(isAlmostEqual); + + helper::deallocate(gpu_x_v, q); + helper::deallocate(gpu_y_v, q); +} + +template +void run_test(const combination_t combi) { + std::string alloc; + api_type api; + index_t size; + index_t incX; + index_t incY; + std::tie(alloc, api, size, incX, incY) = combi; + + if (alloc == "usm") { // usm alloc +#ifdef SB_ENABLE_USM + run_test(combi); +#else + GTEST_SKIP(); +#endif + } else { // buffer alloc + run_test(combi); + } } #ifdef STRESS_TESTING template -const auto combi = ::testing::Combine(::testing::Values(api_type::async, - api_type::sync), // Api - ::testing::Values(11, 65, 1002, - 1002400), // size - ::testing::Values(1, 4), // incX - ::testing::Values(1, 3) // incY -); +const auto combi = + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values(api_type::async, + api_type::sync), // Api + ::testing::Values(11, 65, 1002, + 1002400), // size + ::testing::Values(1, 4), // incX + ::testing::Values(1, 3) // incY + ); #else template -const auto combi = ::testing::Combine(::testing::Values(api_type::async, - api_type::sync), // Api - ::testing::Values(11, 1002), // size - ::testing::Values(1, 4), // incX - ::testing::Values(1, 3) // incY -); +const auto combi = + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values(api_type::async, + api_type::sync), // Api + ::testing::Values(11, 1002), // size + ::testing::Values(1, 4), // incX + ::testing::Values(1, 3) // incY + ); #endif template static std::string generate_name( const ::testing::TestParamInfo>& info) { + std::string alloc; api_type api; int size, incX, incY; - BLAS_GENERATE_NAME(info.param, api, size, incX, incY); + BLAS_GENERATE_NAME(info.param, alloc, api, size, incX, incY); } BLAS_REGISTER_TEST_ALL(Dot, combination_t, combi, generate_name); diff --git a/test/unittest/blas1/blas1_iamax_test.cpp b/test/unittest/blas1/blas1_iamax_test.cpp index 2872b9f61..385f451c4 100644 --- a/test/unittest/blas1/blas1_iamax_test.cpp +++ b/test/unittest/blas1/blas1_iamax_test.cpp @@ -2,15 +2,16 @@ #include "unittest/blas1/blas1_iaminmax_common.hpp" #include -template +template void run_test(const combination_t combi) { using tuple_t = IndexValueTuple; + std::string alloc; api_type api; index_t size; index_t incX; generation_mode_t mode; - std::tie(api, size, incX, mode) = combi; + std::tie(alloc, api, 
size, incX, mode) = combi; // Input vector std::vector x_v(size * incX); @@ -32,20 +33,48 @@ void run_test(const combination_t combi) { blas::SB_Handle sb_handle(q); // Iterators - auto gpu_x_v = blas::make_sycl_iterator_buffer(x_v, size * incX); + auto gpu_x_v = helper::allocate(size * incX, q); + + auto copy_x = helper::copy_to_device(q, x_v.data(), gpu_x_v, size * incX); if (api == api_type::async) { - auto gpu_out_s = blas::make_sycl_iterator_buffer(&out_s, 1); - _iamax(sb_handle, size, gpu_x_v, incX, gpu_out_s); - auto event = - blas::helper::copy_to_host(sb_handle.get_queue(), gpu_out_s, &out_s, 1); + auto gpu_out_s = helper::allocate(1, q); + auto copy_out = helper::copy_to_device(q, &out_s, gpu_out_s, 1); + auto iamax_event = + _iamax(sb_handle, size, gpu_x_v, incX, gpu_out_s, {copy_x, copy_out}); + sb_handle.wait(iamax_event); + auto event = helper::copy_to_host(sb_handle.get_queue(), gpu_out_s, + &out_s, 1); sb_handle.wait(event); + helper::deallocate(gpu_out_s, q); } else { - out_s.ind = _iamax(sb_handle, size, gpu_x_v, incX); + out_s.ind = _iamax(sb_handle, size, gpu_x_v, incX, {copy_x}); } // Validate the result ASSERT_EQ(out_cpu_s, out_s.ind); + + helper::deallocate(gpu_x_v, q); +} + +template +void run_test(const combination_t combi) { + std::string alloc; + api_type api; + index_t size; + index_t incX; + generation_mode_t mode; + std::tie(alloc, api, size, incX, mode) = combi; + + if (alloc == "usm") { // usm alloc +#ifdef SB_ENABLE_USM + run_test(combi); +#else + GTEST_SKIP(); +#endif + } else { // buffer alloc + run_test(combi); + } } BLAS_REGISTER_TEST_ALL(Iamax, combination_t, combi, generate_name); diff --git a/test/unittest/blas1/blas1_iamin_test.cpp b/test/unittest/blas1/blas1_iamin_test.cpp index df71a66d9..5b32a6f12 100644 --- a/test/unittest/blas1/blas1_iamin_test.cpp +++ b/test/unittest/blas1/blas1_iamin_test.cpp @@ -27,15 +27,16 @@ #include "unittest/blas1/blas1_iaminmax_common.hpp" #include -template +template void run_test(const combination_t combi) { using tuple_t = IndexValueTuple; + std::string alloc; api_type api; index_t size; index_t incX; generation_mode_t mode; - std::tie(api, size, incX, mode) = combi; + std::tie(alloc, api, size, incX, mode) = combi; const scalar_t max = std::numeric_limits::max(); @@ -65,20 +66,49 @@ void run_test(const combination_t combi) { blas::SB_Handle sb_handle(q); // Iterators - auto gpu_x_v = blas::make_sycl_iterator_buffer(x_v, size * incX); + auto gpu_x_v = blas::helper::allocate(size * incX, q); + + auto copy_x = + blas::helper::copy_to_device(q, x_v.data(), gpu_x_v, size * incX); if (api == api_type::async) { - auto gpu_out_s = blas::make_sycl_iterator_buffer(&out_s, 1); - _iamin(sb_handle, size, gpu_x_v, incX, gpu_out_s); - auto event = blas::helper::copy_to_host(sb_handle.get_queue(), gpu_out_s, - &out_s, 1); + auto gpu_out_s = blas::helper::allocate(1, q); + auto copy_out = + blas::helper::copy_to_device(q, &out_s, gpu_out_s, 1); + auto iamin_event = + _iamin(sb_handle, size, gpu_x_v, incX, gpu_out_s, {copy_x, copy_out}); + sb_handle.wait(iamin_event); + auto event = blas::helper::copy_to_host(sb_handle.get_queue(), + gpu_out_s, &out_s, 1); sb_handle.wait(event); + helper::deallocate(gpu_out_s, q); } else { - out_s.ind = _iamin(sb_handle, size, gpu_x_v, incX); + out_s.ind = _iamin(sb_handle, size, gpu_x_v, incX, {copy_x}); } // Validate the result ASSERT_EQ(out_cpu_s, out_s.ind); + helper::deallocate(gpu_x_v, q); +} + +template +void run_test(const combination_t combi) { + std::string alloc; + api_type api; + index_t 
size; + index_t incX; + generation_mode_t mode; + std::tie(alloc, api, size, incX, mode) = combi; + + if (alloc == "usm") { // usm alloc +#ifdef SB_ENABLE_USM + run_test(combi); +#else + GTEST_SKIP(); +#endif + } else { // buffer alloc + run_test(combi); + } } BLAS_REGISTER_TEST_ALL(Iamin, combination_t, combi, generate_name); diff --git a/test/unittest/blas1/blas1_iaminmax_common.hpp b/test/unittest/blas1/blas1_iaminmax_common.hpp index 3c28c4059..346886f47 100644 --- a/test/unittest/blas1/blas1_iaminmax_common.hpp +++ b/test/unittest/blas1/blas1_iaminmax_common.hpp @@ -36,7 +36,8 @@ enum class generation_mode_t : char { }; template -using combination_t = std::tuple; +using combination_t = + std::tuple; template void populate_data(generation_mode_t mode, scalar_t limit, @@ -64,6 +65,7 @@ void populate_data(generation_mode_t mode, scalar_t limit, #ifdef STRESS_TESTING template const auto combi = ::testing::Combine( + ::testing::Values("usm", "buf"), // allocation type ::testing::Values(api_type::async, api_type::sync), // Api ::testing::Values(11, 65, 10000, 1002400), // size ::testing::Values(1, 5), // incX @@ -73,6 +75,7 @@ const auto combi = ::testing::Combine( #else template const auto combi = ::testing::Combine( + ::testing::Values("usm", "buf"), // allocation type ::testing::Values(api_type::async, api_type::sync), // Api ::testing::Values(11, 65, 1000000), // size ::testing::Values(5), // incX @@ -90,10 +93,11 @@ inline void dump_arg(std::ostream &ss, template static std::string generate_name( const ::testing::TestParamInfo> &info) { + std::string alloc; api_type api; int size, incX; generation_mode_t mode; - BLAS_GENERATE_NAME(info.param, api, size, incX, mode); + BLAS_GENERATE_NAME(info.param, alloc, api, size, incX, mode); } #endif diff --git a/test/unittest/blas1/blas1_nrm2_test.cpp b/test/unittest/blas1/blas1_nrm2_test.cpp index efa83fcae..4efd61725 100644 --- a/test/unittest/blas1/blas1_nrm2_test.cpp +++ b/test/unittest/blas1/blas1_nrm2_test.cpp @@ -26,14 +26,15 @@ #include "blas_test.hpp" template -using combination_t = std::tuple; +using combination_t = std::tuple; -template +template void run_test(const combination_t combi) { + std::string alloc; api_type api; index_t size; index_t incX; - std::tie(api, size, incX) = combi; + std::tie(alloc, api, size, incX) = combi; // Input vectors std::vector x_v(size * incX); @@ -50,36 +51,67 @@ void run_test(const combination_t combi) { blas::SB_Handle sb_handle(q); // Iterators - auto gpu_x_v = blas::make_sycl_iterator_buffer(x_v, size * incX); + auto gpu_x_v = blas::helper::allocate(size * incX, q); + + auto copy_x = + blas::helper::copy_to_device(q, x_v.data(), gpu_x_v, size * incX); if (api == api_type::async) { - auto gpu_out_s = blas::make_sycl_iterator_buffer(&out_s, 1); - _nrm2(sb_handle, size, gpu_x_v, incX, gpu_out_s); + auto gpu_out_s = blas::helper::allocate(1, q); + auto copy_out = + blas::helper::copy_to_device(q, &out_s, gpu_out_s, 1); + auto nrm2_event = + _nrm2(sb_handle, size, gpu_x_v, incX, gpu_out_s, {copy_x, copy_out}); + sb_handle.wait(nrm2_event); auto event = blas::helper::copy_to_host(sb_handle.get_queue(), gpu_out_s, &out_s, 1); sb_handle.wait(event); + helper::deallocate(gpu_out_s, q); } else { - out_s = _nrm2(sb_handle, size, gpu_x_v, incX); + out_s = _nrm2(sb_handle, size, gpu_x_v, incX, {copy_x}); } // Validate the result const bool isAlmostEqual = utils::almost_equal(out_s, out_cpu_s); ASSERT_TRUE(isAlmostEqual); + + helper::deallocate(gpu_x_v, q); } template -const auto combi = 
::testing::Combine(::testing::Values(api_type::async, - api_type::sync), // Api - ::testing::Values(11, 1002), // size - ::testing::Values(1, 4) // incX -); +void run_test(const combination_t combi) { + std::string alloc; + api_type api; + index_t size; + index_t incX; + std::tie(alloc, api, size, incX) = combi; + + if (alloc == "usm") { // usm alloc +#ifdef SB_ENABLE_USM + run_test(combi); +#else + GTEST_SKIP(); +#endif + } else { // buffer alloc + run_test(combi); + } +} +template +const auto combi = + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values(api_type::async, + api_type::sync), // Api + ::testing::Values(11, 1002), // size + ::testing::Values(1, 4) // incX + ); template static std::string generate_name( const ::testing::TestParamInfo>& info) { + std::string alloc; api_type api; int size, incX; - BLAS_GENERATE_NAME(info.param, api, size, incX); + BLAS_GENERATE_NAME(info.param, alloc, api, size, incX); } BLAS_REGISTER_TEST_ALL(Nrm2, combination_t, combi, generate_name); diff --git a/test/unittest/blas1/blas1_rot_test.cpp b/test/unittest/blas1/blas1_rot_test.cpp index b33b701d7..41a068ee9 100644 --- a/test/unittest/blas1/blas1_rot_test.cpp +++ b/test/unittest/blas1/blas1_rot_test.cpp @@ -26,15 +26,16 @@ #include "blas_test.hpp" template -using combination_t = std::tuple; +using combination_t = std::tuple; -template +template void run_test(const combination_t combi) { + std::string alloc; index_t size; index_t incX; index_t incY; scalar_t unused; /* Necessary to work around dpcpp compiler bug */ - std::tie(size, incX, incY, unused) = combi; + std::tie(alloc, size, incX, incY, unused) = combi; // Input vectors std::vector a_v(size * incX); @@ -66,48 +67,82 @@ void run_test(const combination_t combi) { blas::SB_Handle sb_handle(q); // Iterators - auto gpu_a_v = blas::make_sycl_iterator_buffer(a_v, size * incX); - auto gpu_b_v = blas::make_sycl_iterator_buffer(b_v, size * incY); - auto gpu_out_s = blas::make_sycl_iterator_buffer(1); + auto gpu_a_v = helper::allocate(size * incX, q); + auto gpu_b_v = helper::allocate(size * incY, q); + auto gpu_out_s = helper::allocate(1, q); + + auto copy_a = helper::copy_to_device(q, a_v.data(), gpu_a_v, size * incX); + auto copy_b = helper::copy_to_device(q, b_v.data(), gpu_b_v, size * incY); auto c = static_cast(c_d); auto s = static_cast(s_d); - _rot(sb_handle, size, gpu_a_v, incX, gpu_b_v, incY, c, s); - _dot(sb_handle, size, gpu_a_v, incX, gpu_b_v, incY, gpu_out_s); - auto event = blas::helper::copy_to_host(sb_handle.get_queue(), gpu_out_s, - out_s.data(), 1); + auto rot_event = _rot(sb_handle, size, gpu_a_v, incX, gpu_b_v, incY, c, s, + {copy_a, copy_b}); + auto dot_event = _dot(sb_handle, size, gpu_a_v, incX, gpu_b_v, incY, + gpu_out_s, {rot_event}); + sb_handle.wait(dot_event); + auto event = helper::copy_to_host(q, gpu_out_s, out_s.data(), 1); sb_handle.wait(event); // Validate the result const bool isAlmostEqual = utils::almost_equal(out_s[0], out_cpu_s); ASSERT_TRUE(isAlmostEqual); + + helper::deallocate(gpu_a_v, q); + helper::deallocate(gpu_b_v, q); + helper::deallocate(gpu_out_s, q); +} + +template +void run_test(const combination_t combi) { + std::string alloc; + index_t size; + index_t incX; + index_t incY; + scalar_t unused; /* Necessary to work around dpcpp compiler bug */ + std::tie(alloc, size, incX, incY, unused) = combi; + + if (alloc == "usm") { // usm alloc +#ifdef SB_ENABLE_USM + run_test(combi); +#else + GTEST_SKIP(); +#endif + } else { // buffer alloc + run_test(combi); + } } #ifdef 
STRESS_TESTING template -const auto combi = ::testing::Combine(::testing::Values(11, 65, 1002, - 1002400), // size - ::testing::Values(1, 4), // incX - ::testing::Values(1, 3), // incY - ::testing::Values(0) // unused -); +const auto combi = + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values(11, 65, 1002, + 1002400), // size + ::testing::Values(1, 4), // incX + ::testing::Values(1, 3), // incY + ::testing::Values(0) // unused + ); #else template -const auto combi = ::testing::Combine(::testing::Values(11, 1002), // size - ::testing::Values(4), // incX - ::testing::Values(3), // incY - ::testing::Values(0) // unused -); +const auto combi = + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values(11, 1002), // size + ::testing::Values(4), // incX + ::testing::Values(3), // incY + ::testing::Values(0) // unused + ); #endif template static std::string generate_name( const ::testing::TestParamInfo>& info) { + std::string alloc; int size, incX, incY; T unused; - BLAS_GENERATE_NAME(info.param, size, incX, incY, unused); + BLAS_GENERATE_NAME(info.param, alloc, size, incX, incY, unused); } BLAS_REGISTER_TEST_ALL(Rot, combination_t, combi, generate_name); diff --git a/test/unittest/blas1/blas1_rotg_test.cpp b/test/unittest/blas1/blas1_rotg_test.cpp index 0fb5c18dc..4e2f62aea 100644 --- a/test/unittest/blas1/blas1_rotg_test.cpp +++ b/test/unittest/blas1/blas1_rotg_test.cpp @@ -26,15 +26,18 @@ #include "blas_test.hpp" template -using combination_t = std::tuple; +using combination_t = std::tuple; -template +template void run_test(const combination_t combi) { + std::string alloc; api_type api; scalar_t a_input; scalar_t b_input; + scalar_t c_input = 1; + scalar_t s_input = 1; - std::tie(api, a_input, b_input) = combi; + std::tie(alloc, api, a_input, b_input) = combi; scalar_t c_ref; scalar_t s_ref; @@ -50,28 +53,38 @@ void run_test(const combination_t combi) { scalar_t a = a_input; scalar_t b = b_input; if (api == api_type::async) { - auto device_a = blas::make_sycl_iterator_buffer(&a_input, 1); - auto device_b = blas::make_sycl_iterator_buffer(&b_input, 1); - auto device_c = blas::make_sycl_iterator_buffer(1); - auto device_s = blas::make_sycl_iterator_buffer(1); - auto event0 = _rotg(sb_handle, device_a, device_b, device_c, device_s); - sb_handle.wait(event0); - - auto event1 = - blas::helper::copy_to_host(sb_handle.get_queue(), device_c, &c, 1); - auto event2 = - blas::helper::copy_to_host(sb_handle.get_queue(), device_s, &s, 1); - auto event3 = - blas::helper::copy_to_host(sb_handle.get_queue(), device_a, &a, 1); - auto event4 = - blas::helper::copy_to_host(sb_handle.get_queue(), device_b, &b, 1); + auto device_a = helper::allocate(1, q); + auto device_b = helper::allocate(1, q); + auto device_c = helper::allocate(1, q); + auto device_s = helper::allocate(1, q); + + auto copy_a = helper::copy_to_device(q, &a_input, device_a, 1); + auto copy_b = helper::copy_to_device(q, &b_input, device_b, 1); + auto set_c = helper::copy_to_device(q, &c_input, device_c, 1); + auto set_s = helper::copy_to_device(q, &s_input, device_s, 1); + + auto rotg_event = _rotg(sb_handle, device_a, device_b, device_c, device_s, + {copy_a, copy_b, set_c, set_s}); + sb_handle.wait(rotg_event); + + auto event1 = helper::copy_to_host(sb_handle.get_queue(), device_c, &c, 1); + auto event2 = helper::copy_to_host(sb_handle.get_queue(), device_s, &s, 1); + auto event3 = helper::copy_to_host(sb_handle.get_queue(), device_a, &a, 1); + auto event4 = 
helper::copy_to_host(sb_handle.get_queue(), device_b, &b, 1); sb_handle.wait({event1, event2, event3, event4}); + + helper::deallocate(device_a, q); + helper::deallocate(device_b, q); + helper::deallocate(device_c, q); + helper::deallocate(device_s, q); + } else { _rotg(sb_handle, a, b, c, s); } - /* When there is an overflow in the calculation of the hypotenuse, results are - * implementation defined but r should return inf like reference_blas does */ + /* When there is an overflow in the calculation of the hypotenuse, results + * are implementation defined but r should return inf like reference_blas + * does */ if (std::isinf(a_ref)) { ASSERT_TRUE(std::isinf(a)); return; @@ -83,8 +96,31 @@ void run_test(const combination_t combi) { ASSERT_TRUE(utils::almost_equal(s, s_ref)); } +template +void run_test(const combination_t combi) { + std::string alloc; + api_type api; + scalar_t a_input; + scalar_t b_input; + scalar_t c_input = 1; + scalar_t s_input = 1; + + std::tie(alloc, api, a_input, b_input) = combi; + + if (alloc == "usm") { // usm alloc +#ifdef SB_ENABLE_USM + run_test(combi); +#else + GTEST_SKIP(); +#endif + } else { // buffer alloc + run_test(combi); + } +} + template const auto combi = ::testing::Combine( + ::testing::Values("usm", "buf"), // allocation type ::testing::Values(api_type::async, api_type::sync), // Api ::testing::Values(0, 2.5, -7.3, std::numeric_limits::max()), // a @@ -95,9 +131,10 @@ const auto combi = ::testing::Combine( template static std::string generate_name( const ::testing::TestParamInfo>& info) { + std::string alloc; api_type api; T a, b; - BLAS_GENERATE_NAME(info.param, api, a, b); + BLAS_GENERATE_NAME(info.param, alloc, api, a, b); } BLAS_REGISTER_TEST_ALL(Rotg, combination_t, combi, generate_name); diff --git a/test/unittest/blas1/blas1_rotm_test.cpp b/test/unittest/blas1/blas1_rotm_test.cpp index f6eeffcb2..37e5dee4a 100644 --- a/test/unittest/blas1/blas1_rotm_test.cpp +++ b/test/unittest/blas1/blas1_rotm_test.cpp @@ -26,15 +26,16 @@ #include "blas_test.hpp" template -using combination_t = std::tuple; +using combination_t = std::tuple; -template +template void run_test(const combination_t combi) { + std::string alloc; index_t size; index_t incX; index_t incY; scalar_t flag; - std::tie(size, incX, incY, flag) = combi; + std::tie(alloc, size, incX, incY, flag) = combi; // Setup param constexpr size_t param_size = 5; @@ -63,27 +64,61 @@ void run_test(const combination_t combi) { blas::SB_Handle sb_handle(q); // Iterators - auto gpu_x_v = blas::make_sycl_iterator_buffer(x_v, size * incX); - auto gpu_y_v = blas::make_sycl_iterator_buffer(y_v, size * incY); - auto gpu_param = blas::make_sycl_iterator_buffer(param, param_size); + auto gpu_x_v = helper::allocate(size * incX, q); + auto gpu_y_v = helper::allocate(size * incY, q); + auto gpu_param = helper::allocate(param_size, q); + + auto copy_x = helper::copy_to_device(q, x_v.data(), gpu_x_v, size * incX); + auto copy_y = helper::copy_to_device(q, y_v.data(), gpu_y_v, size * incY); + auto copy_param = + helper::copy_to_device(q, param.data(), gpu_param, param_size); - _rotm(sb_handle, size, gpu_x_v, incX, gpu_y_v, incY, gpu_param); + sb_handle.wait(copy_param); - auto event1 = blas::helper::copy_to_host( - sb_handle.get_queue(), gpu_x_v, x_v.data(), size * incX); - auto event2 = blas::helper::copy_to_host( - sb_handle.get_queue(), gpu_y_v, y_v.data(), size * incY); + auto rotm_event = _rotm(sb_handle, size, gpu_x_v, incX, gpu_y_v, incY, + gpu_param, {copy_x, copy_y}); + sb_handle.wait(rotm_event); + + auto 
event1 = + helper::copy_to_host(q, gpu_x_v, x_v.data(), size * incX); + auto event2 = + helper::copy_to_host(q, gpu_y_v, y_v.data(), size * incY); sb_handle.wait({event1, event2}); // Validate the result const bool isAlmostEqual = utils::compare_vectors(x_cpu_v, x_v) && utils::compare_vectors(y_cpu_v, y_v); ASSERT_TRUE(isAlmostEqual); + + helper::deallocate(gpu_x_v, q); + helper::deallocate(gpu_y_v, q); + helper::deallocate(gpu_param, q); +} + +template +void run_test(const combination_t combi) { + std::string alloc; + index_t size; + index_t incX; + index_t incY; + scalar_t flag; + std::tie(alloc, size, incX, incY, flag) = combi; + + if (alloc == "usm") { // usm alloc +#ifdef SB_ENABLE_USM + run_test(combi); +#else + GTEST_SKIP(); +#endif + } else { // buffer alloc + run_test(combi); + } } #ifdef STRESS_TESTING template const auto combi = ::testing::Combine( + ::testing::Values("usm", "buf"), // allocation type ::testing::Values(11, 65, 1002, 1002400), // size ::testing::Values(1, 4), // incX ::testing::Values(1, 3), // incY @@ -91,21 +126,23 @@ const auto combi = ::testing::Combine( ); #else template -const auto combi = ::testing::Combine(::testing::Values(11, 1002), // size - ::testing::Values(4), // incX - ::testing::Values(3), // incY - ::testing::Values(-2.0, -1.0, - 0.0, 1.0, - -4.0) // flag -); +const auto combi = + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values(11, 1002), // size + ::testing::Values(4), // incX + ::testing::Values(3), // incY + ::testing::Values(-2.0, -1.0, 0.0, 1.0, + -4.0) // flag + ); #endif template static std::string generate_name( const ::testing::TestParamInfo>& info) { + std::string alloc; int size, incX, incY; T flag; - BLAS_GENERATE_NAME(info.param, size, incX, incY, flag); + BLAS_GENERATE_NAME(info.param, alloc, size, incX, incY, flag); } BLAS_REGISTER_TEST_ALL(Rotm, combination_t, combi, generate_name); diff --git a/test/unittest/blas1/blas1_rotmg_test.cpp b/test/unittest/blas1/blas1_rotmg_test.cpp index 0d710f101..4ba2ecd65 100644 --- a/test/unittest/blas1/blas1_rotmg_test.cpp +++ b/test/unittest/blas1/blas1_rotmg_test.cpp @@ -25,7 +25,7 @@ #include "blas_test.hpp" -template +template struct RotmgTest { /* Magic numbers used by the rotmg algorithm */ static constexpr scalar_t gamma = static_cast(4096.0); @@ -57,38 +57,48 @@ struct RotmgTest { void validate_with_rotm(); }; -template -void RotmgTest::run_sycl_blas_rotmg() { +template +void RotmgTest::run_sycl_blas_rotmg() { auto q = make_queue(); blas::SB_Handle sb_handle(q); sycl_out = RotmgParameters{input.d1, input.d2, input.x1, input.y1}; - auto device_d1 = blas::make_sycl_iterator_buffer(&sycl_out.d1, 1); - auto device_d2 = blas::make_sycl_iterator_buffer(&sycl_out.d2, 1); - auto device_x1 = blas::make_sycl_iterator_buffer(&sycl_out.x1, 1); - auto device_y1 = blas::make_sycl_iterator_buffer(&sycl_out.y1, 1); - auto device_param = - blas::make_sycl_iterator_buffer(sycl_out.param, param_size); - auto event0 = _rotmg(sb_handle, device_d1, device_d2, device_x1, device_y1, - device_param); - sb_handle.wait(event0); - - auto event1 = blas::helper::copy_to_host(sb_handle.get_queue(), device_d1, - &sycl_out.d1, 1); - auto event2 = blas::helper::copy_to_host(sb_handle.get_queue(), device_d2, - &sycl_out.d2, 1); - auto event3 = blas::helper::copy_to_host(sb_handle.get_queue(), device_x1, - &sycl_out.x1, 1); - auto event4 = blas::helper::copy_to_host(sb_handle.get_queue(), device_y1, - &sycl_out.y1, 1); - auto event5 = 
blas::helper::copy_to_host(sb_handle.get_queue(), device_param, - sycl_out.param.data(), param_size); + auto device_d1 = helper::allocate(1, q); + auto device_d2 = helper::allocate(1, q); + auto device_x1 = helper::allocate(1, q); + auto device_y1 = helper::allocate(1, q); + auto device_param = helper::allocate(param_size, q); + + auto copy_d1 = helper::copy_to_device(q, &sycl_out.d1, device_d1, 1); + auto copy_d2 = helper::copy_to_device(q, &sycl_out.d2, device_d2, 1); + auto copy_x1 = helper::copy_to_device(q, &sycl_out.x1, device_x1, 1); + auto copy_y1 = helper::copy_to_device(q, &sycl_out.y1, device_y1, 1); + auto copy_params = helper::copy_to_device(q, sycl_out.param.data(), + device_param, param_size); + + auto rotmg_event = + _rotmg(sb_handle, device_d1, device_d2, device_x1, device_y1, + device_param, {copy_d1, copy_d2, copy_x1, copy_y1, copy_params}); + sb_handle.wait(rotmg_event); + + auto event1 = helper::copy_to_host(q, device_d1, &sycl_out.d1, 1); + auto event2 = helper::copy_to_host(q, device_d2, &sycl_out.d2, 1); + auto event3 = helper::copy_to_host(q, device_x1, &sycl_out.x1, 1); + auto event4 = helper::copy_to_host(q, device_y1, &sycl_out.y1, 1); + auto event5 = + helper::copy_to_host(q, device_param, sycl_out.param.data(), param_size); sb_handle.wait({event1, event2, event3, event4, event5}); + + helper::deallocate(device_d1, q); + helper::deallocate(device_d2, q); + helper::deallocate(device_x1, q); + helper::deallocate(device_y1, q); + helper::deallocate(device_param, q); } -template -void RotmgTest::validate_with_reference() { +template +void RotmgTest::validate_with_reference() { scalar_t d1_ref = input.d1; scalar_t d2_ref = input.d2; scalar_t x1_ref = input.x1; @@ -165,8 +175,8 @@ void RotmgTest::validate_with_reference() { * x1_output * sqrt(d1_output) = [ h11 h12 ] * [ x1_input] * 0.0 * sqrt(d2_output) [ h21 h22 ] [ y1_input] */ -template -void RotmgTest::validate_with_rotm() { +template +void RotmgTest::validate_with_rotm() { if (sycl_out.param[0] == 2 || sycl_out.d2 < 0) { return; } @@ -189,29 +199,55 @@ void RotmgTest::validate_with_rotm() { } template -using combination_t = std::tuple; +using combination_t = + std::tuple; -template +template void run_test(const combination_t combi) { + std::string alloc; scalar_t d1_input; scalar_t d2_input; scalar_t x1_input; scalar_t y1_input; bool will_overflow; - std::tie(d1_input, d2_input, x1_input, y1_input, will_overflow) = combi; + std::tie(alloc, d1_input, d2_input, x1_input, y1_input, will_overflow) = + combi; - RotmgTest test{d1_input, d2_input, x1_input, y1_input}; + RotmgTest test{d1_input, d2_input, x1_input, y1_input}; test.run_sycl_blas_rotmg(); - /* Do not test with things that might overflow or underflow. Results will not - * make sense if that happens */ + /* Do not test with things that might overflow or underflow. 
Results will + * not make sense if that happens */ if (!will_overflow) { test.validate_with_reference(); test.validate_with_rotm(); } } +template +void run_test(const combination_t combi) { + std::string alloc; + scalar_t d1_input; + scalar_t d2_input; + scalar_t x1_input; + scalar_t y1_input; + bool will_overflow; + + std::tie(alloc, d1_input, d2_input, x1_input, y1_input, will_overflow) = + combi; + + if (alloc == "usm") { // usm alloc +#ifdef SB_ENABLE_USM + run_test(combi); +#else + GTEST_SKIP(); +#endif + } else { // buffer alloc + run_test(combi); + } +} + template constexpr scalar_t min_rng = 0.5; template @@ -236,132 +272,173 @@ scalar_t r_gen() { /* Generate large enough number so that rotmg will scale it down */ template scalar_t scale_down_gen() { - return random_scalar(RotmgTest::gamma_sq, - RotmgTest::gamma_sq * 2); + /* The mem_alloc parameter is set to helper::AllocType::usm here; the buffer + * allocation type works just as well */ + return random_scalar( + RotmgTest::gamma_sq, + RotmgTest::gamma_sq * 2); } /* Generate small enough number so that rotmg will scale it up */ template scalar_t scale_up_gen() { - return random_scalar(RotmgTest::inv_gamma_sq / 2, - RotmgTest::inv_gamma_sq); + /* The mem_alloc parameter is set to helper::AllocType::usm here; the buffer + * allocation type works just as well */ + return random_scalar( + RotmgTest::inv_gamma_sq / 2, + RotmgTest::inv_gamma_sq); } /* These tests try to cover every code path of the rotmg algorithm */ -template -const auto combi = ::testing::Values( - /* d1 < 0 */ - std::make_tuple(-2.5, p_gen(), r_gen(), - r_gen(), false), - /* Input point (c, 0) */ - std::make_tuple(p_gen(), p_gen(), r_gen(), - 0.0, false), - /* Input point (c, 0) && d2 == 0 */ - std::make_tuple(p_gen(), 0.0, r_gen(), 0.0, false), - /* Input point (c, 0) && d2 == 0 */ - std::make_tuple(p_gen(), 0.0, r_gen(), - r_gen(), false), - /* Input point (c, 0) and big numbers (test that no rescaling happened) */ - std::make_tuple(scale_up_gen(), scale_up_gen(), - scale_up_gen(), 0.0, false), - std::make_tuple(scale_down_gen(), scale_down_gen(), - scale_down_gen(), 0.0, false), - /* Input point (0, c) */ - std::make_tuple(p_gen(), p_gen(), 0.0, - r_gen(), false), - /* Input point (0, c) && d1 == 0 */ - std::make_tuple(0.0, p_gen(), 0.0, r_gen(), false), - /* Input point (0, c) && d2 == 0 */ - std::make_tuple(p_gen(), 0.0, 0.0, r_gen(), false), - /* Input point (0, c) && d2 < 0 */ - std::make_tuple(p_gen(), -3.4, 0.0, r_gen(), false), - /* Input point (0, c) && rescaling */ - std::make_tuple(p_gen(), scale_up_gen(), 0.0, - r_gen(), false), - std::make_tuple(p_gen(), scale_down_gen(), 0.0, - r_gen(), false), - std::make_tuple(scale_up_gen(), p_gen(), 0.0, - r_gen(), false), - std::make_tuple(scale_down_gen(), p_gen(), 0.0, - r_gen(), false), - /* d1 == 0 */ - std::make_tuple(0.0, p_gen(), r_gen(), - r_gen(), false), - /* d1 == 0 && d2 < 0 */ - std::make_tuple(0.0, -3.4, r_gen(), r_gen(), false), - /* d1 * x1 > d2 * y1 (i.e. abs_c > abs_s) */ - std::make_tuple(4.0, 2.1, 3.4, 1.5, false), - std::make_tuple(4.0, 1.5, -3.4, 2.1, false), - std::make_tuple(4.0, -1.5, 3.4, 2.1, false), - std::make_tuple(4.0, -1.5, 3.4, -2.1, false), - std::make_tuple(4.0, -1.5, -3.4, -2.1, false), - /* d1 * x1 > d2 * y1 (i.e. 
abs_c > abs_s) && rescaling */ - std::make_tuple(scale_down_gen(), 2.1, 3.4, 1.5, false), - std::make_tuple(scale_down_gen(), 2.1, scale_down_gen(), - 1.5, false), - std::make_tuple(scale_up_gen(), 2.1, scale_down_gen(), - 1.5, false), - std::make_tuple(scale_down_gen(), 2.1, scale_up_gen(), - 1.5, false), - /* d1 * x1 > d2 * y1 (i.e. abs_c > abs_s) && Underflow */ - std::make_tuple(0.01, 0.01, std::numeric_limits::min(), - std::numeric_limits::min(), true), - /* d1 * x1 > d2 * y1 && Overflow */ - std::make_tuple(std::numeric_limits::max(), - std::numeric_limits::max(), 0.01, 0.01, true), - /* d1 * x1 <= d2 * y1 (i.e. abs_c <= abs_s) */ - std::make_tuple(2.1, 4.0, 1.5, 3.4, false), - std::make_tuple(2.1, 4.0, -1.5, 3.4, false), - std::make_tuple(2.1, -4.0, 1.5, 3.4, false), - std::make_tuple(2.1, -4.0, 1.5, -3.4, false), - std::make_tuple(2.1, -4.0, -1.5, -3.4, false), - /* d1 * x1 <= d2 * y1 (i.e. abs_c <= abs_s) && rescaling */ - std::make_tuple(2.1, scale_down_gen(), 1.5, 3.4, false), - std::make_tuple(2.1, scale_down_gen(), 1.5, - scale_down_gen(), false), - std::make_tuple(2.1, scale_up_gen(), 1.5, - scale_down_gen(), false), - std::make_tuple(2.1, scale_down_gen(), 1.5, - scale_up_gen(), false), - /* d1 * x1 <= d2 * y1 (i.e. abs_c <= abs_s) && Underflow */ - std::make_tuple(std::numeric_limits::min(), - std::numeric_limits::min(), 0.01, 0.01, true), - /* d1 * x1 <= d2 * y1 (i.e. abs_c <= abs_s) && Overflow */ - std::make_tuple(0.01, 0.01, std::numeric_limits::max(), - std::numeric_limits::max(), true), - /* Overflow all */ - std::make_tuple(std::numeric_limits::max(), - std::numeric_limits::max(), - std::numeric_limits::max(), - std::numeric_limits::max(), true), - /* Underflow all */ - std::make_tuple(std::numeric_limits::min(), - std::numeric_limits::min(), - std::numeric_limits::min(), - std::numeric_limits::min(), true), - /* Numeric limits of one parameter */ - std::make_tuple(1.0, 1.0, 1.0, std::numeric_limits::max(), false), - std::make_tuple(1.0, 1.0, std::numeric_limits::max(), 1.0, false), - std::make_tuple(1.0, std::numeric_limits::max(), 1.0, 1.0, false), - std::make_tuple(std::numeric_limits::max(), 1.0, 1.0, 1.0, false), - /* Case that creates an infinite loop on cblas */ - std::make_tuple(std::numeric_limits::min(), -2.2, - std::numeric_limits::min(), - std::numeric_limits::min(), true), - /* Case that triggers underflow detection on abs_c <= abs_s && s >= 0 */ - std::make_tuple(15.5, -2.2, std::numeric_limits::min(), - std::numeric_limits::min(), false), - /* Test for previous errors */ - std::make_tuple(0.0516274, -0.197215, -0.270436, - -0.157621, false) - ); +#define INSTANTIATE_ROTMG_TESTS(NAME, C) \ + template \ + const auto NAME = ::testing:: \ + Values(/* d1 < 0 */ \ + std::make_tuple(C, -2.5, p_gen(), r_gen(), \ + r_gen(), \ + false), /* Input point (c, 0) */ \ + std::make_tuple(C, p_gen(), p_gen(), \ + r_gen(), 0.0, \ + false), /* Input point (c, 0) && d2 == 0 */ \ + std::make_tuple(C, p_gen(), 0.0, r_gen(), \ + 0.0, false), /* Input point (c, 0) && d2 == 0 */ \ + std::make_tuple(C, p_gen(), 0.0, r_gen(), \ + r_gen(), \ + false), /* Input point (c, 0) and big numbers \ + (test that no rescaling happened) */ \ + std::make_tuple(C, scale_up_gen(), \ + scale_up_gen(), \ + scale_up_gen(), 0.0, false), \ + std::make_tuple(C, scale_down_gen(), \ + scale_down_gen(), \ + scale_down_gen(), 0.0, \ + false), /* Input point (0, c) */ \ + std::make_tuple(C, p_gen(), p_gen(), 0.0, \ + r_gen(), \ + false), /* Input point (0, c) && d1 == 0 */ \ + std::make_tuple(C, 0.0, 
p_gen(), 0.0, \ + r_gen(), \ + false), /* Input point (0, c) && d2 == 0 */ \ + std::make_tuple(C, p_gen(), 0.0, 0.0, \ + r_gen(), \ + false), /* Input point (0, c) && d2 < 0 */ \ + std::make_tuple(C, p_gen(), -3.4, 0.0, \ + r_gen(), \ + false), /* Input point (0, c) && rescaling */ \ + std::make_tuple(C, p_gen(), scale_up_gen(), \ + 0.0, r_gen(), false), \ + std::make_tuple(C, p_gen(), scale_down_gen(), \ + 0.0, r_gen(), false), \ + std::make_tuple(C, scale_up_gen(), p_gen(), \ + 0.0, r_gen(), false), \ + std::make_tuple(C, scale_down_gen(), p_gen(), \ + 0.0, r_gen(), false), /* d1 == 0 */ \ + std::make_tuple(C, 0.0, p_gen(), r_gen(), \ + r_gen(), \ + false), /* d1 == 0 && d2 < 0 */ \ + std::make_tuple( \ + C, 0.0, -3.4, r_gen(), r_gen(), \ + false), /* d1 * x1 > d2 * y1 (i.e. abs_c > abs_s) */ \ + std::make_tuple(C, 4.0, 2.1, 3.4, 1.5, false), \ + std::make_tuple(C, 4.0, 1.5, -3.4, 2.1, false), \ + std::make_tuple(C, 4.0, -1.5, 3.4, 2.1, false), \ + std::make_tuple(C, 4.0, -1.5, 3.4, -2.1, false), \ + std::make_tuple(C, 4.0, -1.5, -3.4, -2.1, \ + false), /* d1 * x1 > d2 * y1 (i.e. abs_c > abs_s) \ + && rescaling */ \ + std::make_tuple(C, scale_down_gen(), 2.1, 3.4, 1.5, \ + false), \ + std::make_tuple(C, scale_down_gen(), 2.1, \ + scale_down_gen(), 1.5, false), \ + std::make_tuple(C, scale_up_gen(), 2.1, \ + scale_down_gen(), 1.5, false), \ + std::make_tuple(C, scale_down_gen(), 2.1, \ + scale_up_gen(), 1.5, \ + false), /* d1 * x1 > d2 * y1 (i.e. abs_c > abs_s) \ + && Underflow */ \ + std::make_tuple(C, 0.01, 0.01, \ + std::numeric_limits::min(), \ + std::numeric_limits::min(), \ + true), /* d1 * x1 > d2 * y1 && Overflow */ \ + std::make_tuple( \ + C, std::numeric_limits::max(), \ + std::numeric_limits::max(), 0.01, 0.01, \ + true), /* d1 * x1 <= d2 * y1 (i.e. abs_c <= abs_s) */ \ + std::make_tuple(C, 2.1, 4.0, 1.5, 3.4, false), \ + std::make_tuple(C, 2.1, 4.0, -1.5, 3.4, false), \ + std::make_tuple(C, 2.1, -4.0, 1.5, 3.4, false), \ + std::make_tuple(C, 2.1, -4.0, 1.5, -3.4, false), \ + std::make_tuple(C, 2.1, -4.0, -1.5, -3.4, \ + false), /* d1 * x1 <= d2 * y1 (i.e. abs_c <= \ + abs_s) && rescaling */ \ + std::make_tuple(C, 2.1, scale_down_gen(), 1.5, 3.4, \ + false), \ + std::make_tuple(C, 2.1, scale_down_gen(), 1.5, \ + scale_down_gen(), false), \ + std::make_tuple(C, 2.1, scale_up_gen(), 1.5, \ + scale_down_gen(), false), \ + std::make_tuple(C, 2.1, scale_down_gen(), 1.5, \ + scale_up_gen(), \ + false), /* d1 * x1 <= d2 * y1 (i.e. abs_c <= \ + abs_s) && Underflow */ \ + std::make_tuple(C, std::numeric_limits::min(), \ + std::numeric_limits::min(), 0.01, 0.01, \ + true), /* d1 * x1 <= d2 * y1 (i.e. 
abs_c <= \ + abs_s) && Overflow */ \ + std::make_tuple(C, 0.01, 0.01, \ + std::numeric_limits::max(), \ + std::numeric_limits::max(), \ + true), /* Overflow all */ \ + std::make_tuple(C, std::numeric_limits::max(), \ + std::numeric_limits::max(), \ + std::numeric_limits::max(), \ + std::numeric_limits::max(), \ + true), /* Underflow all */ \ + std::make_tuple(C, std::numeric_limits::min(), \ + std::numeric_limits::min(), \ + std::numeric_limits::min(), \ + std::numeric_limits::min(), \ + true), /* Numeric limits of one parameter */ \ + std::make_tuple(C, 1.0, 1.0, 1.0, \ + std::numeric_limits::max(), false), \ + std::make_tuple(C, 1.0, 1.0, \ + std::numeric_limits::max(), 1.0, \ + false), \ + std::make_tuple(C, 1.0, std::numeric_limits::max(), \ + 1.0, 1.0, false), \ + std::make_tuple( \ + C, std::numeric_limits::max(), 1.0, 1.0, 1.0, \ + false), /* Case that creates an infinite loop on cblas */ \ + std::make_tuple(C, std::numeric_limits::min(), -2.2, \ + std::numeric_limits::min(), \ + std::numeric_limits::min(), \ + true), /* Case that triggers underflow detection \ + on abs_c <= abs_s && s >= 0 */ \ + std::make_tuple(C, 15.5, -2.2, \ + std::numeric_limits::min(), \ + std::numeric_limits::min(), \ + false), /* Test for previous errors */ \ + std::make_tuple(C, 0.0516274, -0.197215, -0.270436, -0.157621, \ + false)) + +#ifdef SB_ENABLE_USM +INSTANTIATE_ROTMG_TESTS(combi_usm, "usm"); // instantiate usm tests +#endif +INSTANTIATE_ROTMG_TESTS(combi_buffer, "buf"); // instantiate buffer tests template static std::string generate_name( const ::testing::TestParamInfo>& info) { + std::string alloc; T d1, d2, x1, y1; bool will_overflow; - BLAS_GENERATE_NAME(info.param, d1, d2, x1, y1, will_overflow); + BLAS_GENERATE_NAME(info.param, alloc, d1, d2, x1, y1, will_overflow); } -BLAS_REGISTER_TEST_ALL(Rotmg, combination_t, combi, generate_name); +#ifdef SB_ENABLE_USM +BLAS_REGISTER_TEST_ALL(Rotmg_Usm, combination_t, combi_usm, generate_name); +#endif +BLAS_REGISTER_TEST_ALL(Rotmg_Buffer, combination_t, combi_buffer, + generate_name); + +#undef INSTANTIATE_ROTMG_TESTS diff --git a/test/unittest/blas1/blas1_scal_test.cpp b/test/unittest/blas1/blas1_scal_test.cpp index b648ae20d..a4f95f6bc 100644 --- a/test/unittest/blas1/blas1_scal_test.cpp +++ b/test/unittest/blas1/blas1_scal_test.cpp @@ -26,14 +26,15 @@ #include "blas_test.hpp" template -using combination_t = std::tuple; +using combination_t = std::tuple; -template +template void run_test(const combination_t combi) { + std::string alloc; index_t size; scalar_t alpha; index_t incX; - std::tie(size, alpha, incX) = combi; + std::tie(alloc, size, alpha, incX) = combi; // Input/output vector std::vector x_v(size * incX); @@ -47,29 +48,55 @@ void run_test(const combination_t combi) { blas::SB_Handle sb_handle(q); // Iterators - auto gpu_x_v = blas::make_sycl_iterator_buffer(x_v, size * incX); + auto gpu_x_v = blas::helper::allocate(size * incX, q); + + auto copy_event = + blas::helper::copy_to_device(q, x_v.data(), gpu_x_v, size * incX); - _scal(sb_handle, size, alpha, gpu_x_v, incX); - auto event = blas::helper::copy_to_host(sb_handle.get_queue(), gpu_x_v, - x_v.data(), size * incX); + auto scal_event = _scal(sb_handle, size, alpha, gpu_x_v, incX, {copy_event}); + sb_handle.wait(scal_event); + + auto event = blas::helper::copy_to_host(q, gpu_x_v, x_v.data(), size * incX); sb_handle.wait(event); // Validate the result const bool isAlmostEqual = utils::compare_vectors(x_v, x_cpu_v); ASSERT_TRUE(isAlmostEqual); + + helper::deallocate(gpu_x_v, q); +} + 
+template +void run_test(const combination_t combi) { + std::string alloc; + index_t size; + scalar_t alpha; + index_t incX; + std::tie(alloc, size, alpha, incX) = combi; + if (alloc == "usm") { // usm alloc +#ifdef SB_ENABLE_USM + run_test(combi); +#else + GTEST_SKIP(); +#endif + } else { // buffer alloc + run_test(combi); + } } #ifdef STRESS_TESTING template const auto combi = - ::testing::Combine(::testing::Values(11, 65, 1002, 1002400), // size + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values(11, 65, 1002, 1002400), // size ::testing::Values(0.0, 1.0, 1.5), // alpha ::testing::Values(1, 4) // incX ); #else template const auto combi = - ::testing::Combine(::testing::Values(11, 1002), // size + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values(11, 1002), // size ::testing::Values(0.0, 1.5), // alpha ::testing::Values(4) // incX ); @@ -78,9 +105,10 @@ const auto combi = template static std::string generate_name( const ::testing::TestParamInfo>& info) { + std::string alloc; int size, incX; T alpha; - BLAS_GENERATE_NAME(info.param, size, alpha, incX); + BLAS_GENERATE_NAME(info.param, alloc, size, alpha, incX); } BLAS_REGISTER_TEST_ALL(Scal, combination_t, combi, generate_name); diff --git a/test/unittest/blas1/blas1_sdsdot_test.cpp b/test/unittest/blas1/blas1_sdsdot_test.cpp index 8b676d8ab..6814454e8 100644 --- a/test/unittest/blas1/blas1_sdsdot_test.cpp +++ b/test/unittest/blas1/blas1_sdsdot_test.cpp @@ -28,19 +28,18 @@ #include template -using combination_t = std::tuple; +using combination_t = + std::tuple; -template +template void run_test(const combination_t combi) { - /* sdsdot is only valid when using floats */ - static_assert(std::is_same::value); - + std::string alloc; index_t N; float sb; index_t incX; index_t incY; api_type api; - std::tie(api, N, sb, incX, incY) = combi; + std::tie(alloc, api, N, sb, incX, incY) = combi; /* Sycl Buffers do not work with size = 0. 
So setting input vectors to size * one to test the edge case where if size equals 0 then sb should be @@ -66,32 +65,66 @@ void run_test(const combination_t combi) { // Iterators auto gpu_x_v = - blas::make_sycl_iterator_buffer(int(vectorSize * incX)); - blas::helper::copy_to_device(sb_handle.get_queue(), x_v.data(), gpu_x_v, - vectorSize * incX); + helper::allocate(int(vectorSize * incX), q); auto gpu_y_v = - blas::make_sycl_iterator_buffer(int(vectorSize * incY)); - blas::helper::copy_to_device(sb_handle.get_queue(), y_v.data(), gpu_y_v, - vectorSize * incY); + helper::allocate(int(vectorSize * incY), q); + + auto copy_x = + helper::copy_to_device(q, x_v.data(), gpu_x_v, vectorSize * incX); + auto copy_y = + helper::copy_to_device(q, y_v.data(), gpu_y_v, vectorSize * incY); if (api == api_type::async) { - auto gpu_out_s = blas::make_sycl_iterator_buffer(&out_s, 1); - _sdsdot(sb_handle, N, sb, gpu_x_v, incX, gpu_y_v, incY, gpu_out_s); - auto event = blas::helper::copy_to_host(sb_handle.get_queue(), - gpu_out_s, &out_s, 1); + auto gpu_out_s = helper::allocate(1, q); + auto copy_out = helper::copy_to_device(q, &out_s, gpu_out_s, 1); + auto sdsdot_event = _sdsdot(sb_handle, N, sb, gpu_x_v, incX, gpu_y_v, incY, + gpu_out_s, {copy_x, copy_y, copy_out}); + sb_handle.wait(sdsdot_event); + auto event = helper::copy_to_host(sb_handle.get_queue(), + gpu_out_s, &out_s, 1); sb_handle.wait(event); + helper::deallocate(gpu_out_s, q); } else { - out_s = _sdsdot(sb_handle, N, sb, gpu_x_v, incX, gpu_y_v, incY); + out_s = _sdsdot(sb_handle, N, sb, gpu_x_v, incX, gpu_y_v, incY, + {copy_x, copy_y}); } // Validate the result const bool isAlmostEqual = utils::almost_equal(out_s, out_cpu_s); ASSERT_TRUE(isAlmostEqual); + + helper::deallocate(gpu_x_v, q); + helper::deallocate(gpu_y_v, q); +} + +template +void run_test(const combination_t combi) { + /* sdsdot is only valid when using floats */ + static_assert(std::is_same::value); + + std::string alloc; + index_t N; + float sb; + index_t incX; + index_t incY; + api_type api; + std::tie(alloc, api, N, sb, incX, incY) = combi; + + if (alloc == "usm") { // usm alloc +#ifdef SB_ENABLE_USM + run_test(combi); +#else + GTEST_SKIP(); +#endif + } else { // buffer alloc + run_test(combi); + } } #ifdef STRESS_TESTING template const auto combi = ::testing::Combine( + ::testing::Values("usm", "buf"), // allocation type ::testing::Values(api_type::async, api_type::sync), // Api ::testing::Values(11, 65, 1002, 1002400), // N ::testing::Values(9.5f, 0.5f), // sb @@ -101,6 +134,7 @@ const auto combi = ::testing::Combine( #else template const auto combi = ::testing::Combine( + ::testing::Values("usm", "buf"), // allocation type ::testing::Values(api_type::async, api_type::sync), // Api ::testing::Values(11, 1002, 0), // N ::testing::Values(9.5f, 0.5f, 0.0f), // sb @@ -113,10 +147,11 @@ const auto combi = ::testing::Combine( template static std::string generate_name( const ::testing::TestParamInfo>& info) { + std::string alloc; int size, incX, incY; float sb; api_type api; - BLAS_GENERATE_NAME(info.param, api, size, sb, incX, incY); + BLAS_GENERATE_NAME(info.param, alloc, api, size, sb, incX, incY); } BLAS_REGISTER_TEST_FLOAT(Sdsdot, combination_t, combi, generate_name); diff --git a/test/unittest/blas1/blas1_swap_test.cpp b/test/unittest/blas1/blas1_swap_test.cpp index 1cb0ab463..8f63c45cf 100644 --- a/test/unittest/blas1/blas1_swap_test.cpp +++ b/test/unittest/blas1/blas1_swap_test.cpp @@ -26,14 +26,15 @@ #include "blas_test.hpp" template -using combination_t = std::tuple; +using 
combination_t = std::tuple; -template +template void run_test(const combination_t combi) { + std::string alloc; index_t size; index_t incX; index_t incY; - std::tie(size, incX, incY) = combi; + std::tie(alloc, size, incX, incY) = combi; // Input/Output vector std::vector x_v(size * incX); @@ -52,43 +53,75 @@ void run_test(const combination_t combi) { blas::SB_Handle sb_handle(q); // Iterators - auto gpu_x_v = blas::make_sycl_iterator_buffer(x_v, size * incX); - auto gpu_y_v = blas::make_sycl_iterator_buffer(y_v, size * incY); - - _swap(sb_handle, size, gpu_x_v, incX, gpu_y_v, incY); - auto event = blas::helper::copy_to_host(sb_handle.get_queue(), gpu_x_v, - x_v.data(), size * incX); - sb_handle.wait(event); - event = blas::helper::copy_to_host(sb_handle.get_queue(), gpu_y_v, y_v.data(), + auto gpu_x_v = helper::allocate(size * incX, q); + auto gpu_y_v = helper::allocate(size * incY, q); + + auto copy_x = helper::copy_to_device(q, x_v.data(), gpu_x_v, size * incX); + auto copy_y = helper::copy_to_device(q, y_v.data(), gpu_y_v, size * incY); + + auto swap_event = + _swap(sb_handle, size, gpu_x_v, incX, gpu_y_v, incY, {copy_x, copy_y}); + sb_handle.wait(swap_event); + + auto event0 = helper::copy_to_host(sb_handle.get_queue(), gpu_x_v, x_v.data(), + size * incX); + auto event1 = helper::copy_to_host(sb_handle.get_queue(), gpu_y_v, y_v.data(), size * incY); - sb_handle.wait(event); + sb_handle.wait({event0, event1}); // Validate the result // Since this is just a swap operation, float tolerances are fine ASSERT_TRUE(utils::compare_vectors(y_v, y_cpu_v)); ASSERT_TRUE(utils::compare_vectors(x_v, x_cpu_v)); + + helper::deallocate(gpu_x_v, q); + helper::deallocate(gpu_y_v, q); +} + +template +void run_test(const combination_t combi) { + std::string alloc; + index_t size; + index_t incX; + index_t incY; + std::tie(alloc, size, incX, incY) = combi; + + if (alloc == "usm") { // usm alloc +#ifdef SB_ENABLE_USM + run_test(combi); +#else + GTEST_SKIP(); +#endif + } else { // buffer alloc + run_test(combi); + } } #ifdef STRESS_TESTING template -const auto combi = ::testing::Combine(::testing::Values(11, 65, 1002, - 1002400), // size - ::testing::Values(1, 4), // incX - ::testing::Values(1, 3) // incY -); +const auto combi = + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values(11, 65, 1002, + 1002400), // size + ::testing::Values(1, 4), // incX + ::testing::Values(1, 3) // incY + ); #else template -const auto combi = ::testing::Combine(::testing::Values(11, 1002), // size - ::testing::Values(1, 4), // incX - ::testing::Values(1, 3) // incY -); +const auto combi = + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values(11, 1002), // size + ::testing::Values(1, 4), // incX + ::testing::Values(1, 3) // incY + ); #endif template static std::string generate_name( const ::testing::TestParamInfo>& info) { + std::string alloc; int size, incX, incY; - BLAS_GENERATE_NAME(info.param, size, incX, incY); + BLAS_GENERATE_NAME(info.param, alloc, size, incX, incY); } BLAS_REGISTER_TEST_ALL(Swap, combination_t, combi, generate_name); diff --git a/test/unittest/blas2/blas2_gbmv_test.cpp b/test/unittest/blas2/blas2_gbmv_test.cpp index a6886c52b..d3ef48af6 100644 --- a/test/unittest/blas2/blas2_gbmv_test.cpp +++ b/test/unittest/blas2/blas2_gbmv_test.cpp @@ -26,10 +26,12 @@ #include "blas_test.hpp" template -using combination_t = std::tuple; +using combination_t = + std::tuple; -template +template void run_test(const combination_t combi) { + 
std::string alloc; index_t m; index_t n; index_t kl; @@ -40,7 +42,8 @@ void run_test(const combination_t combi) { index_t incX; index_t incY; index_t lda_mul; - std::tie(m, n, kl, ku, alpha, beta, trans, incX, incY, lda_mul) = combi; + std::tie(alloc, m, n, kl, ku, alpha, beta, trans, incX, incY, lda_mul) = + combi; const char* t_str = trans ? "t" : "n"; @@ -67,30 +70,73 @@ void run_test(const combination_t combi) { auto q = make_queue(); blas::SB_Handle sb_handle(q); - auto m_a_gpu = blas::make_sycl_iterator_buffer(a_m, a_size); - auto v_x_gpu = blas::make_sycl_iterator_buffer(x_v, x_size); - auto v_y_gpu = - blas::make_sycl_iterator_buffer(y_v_gpu_result, y_size); + auto m_a_gpu = helper::allocate(a_size, q); + auto v_x_gpu = helper::allocate(x_size, q); + auto v_y_gpu = helper::allocate(y_size, q); + + auto copy_m = + helper::copy_to_device(q, a_m.data(), m_a_gpu, a_size); + auto copy_x = + helper::copy_to_device(q, x_v.data(), v_x_gpu, x_size); + auto copy_y = helper::copy_to_device(q, y_v_gpu_result.data(), + v_y_gpu, y_size); + + sb_handle.wait({copy_m, copy_x, copy_y}); // SYCLGBMV - _gbmv(sb_handle, *t_str, m, n, kl, ku, alpha, m_a_gpu, - (kl + ku + 1) * lda_mul, v_x_gpu, incX, beta, v_y_gpu, incY); + auto gbmv_event = + _gbmv(sb_handle, *t_str, m, n, kl, ku, alpha, m_a_gpu, + (kl + ku + 1) * lda_mul, v_x_gpu, incX, beta, v_y_gpu, incY); - auto event = blas::helper::copy_to_host(sb_handle.get_queue(), v_y_gpu, - y_v_gpu_result.data(), y_size); + sb_handle.wait(gbmv_event); + + auto event = + blas::helper::copy_to_host(q, v_y_gpu, y_v_gpu_result.data(), y_size); sb_handle.wait(event); const bool isAlmostEqual = utils::compare_vectors(y_v_gpu_result, y_v_cpu); ASSERT_TRUE(isAlmostEqual); + + helper::deallocate(m_a_gpu, q); + helper::deallocate(v_x_gpu, q); + helper::deallocate(v_y_gpu, q); +} + +template +void run_test(const combination_t combi) { + std::string alloc; + index_t m; + index_t n; + index_t kl; + index_t ku; + bool trans; + scalar_t alpha; + scalar_t beta; + index_t incX; + index_t incY; + index_t lda_mul; + std::tie(alloc, m, n, kl, ku, alpha, beta, trans, incX, incY, lda_mul) = + combi; + + if (alloc == "usm") { +#ifdef SB_ENABLE_USM + run_test(combi); +#else + GTEST_SKIP(); +#endif + } else { + run_test(combi); + } } #ifdef STRESS_TESTING template const auto combi = - ::testing::Combine(::testing::Values(11, 65, 255, 1023), // m - ::testing::Values(14, 63, 257, 1010), // n - ::testing::Values(3, 4, 9), // kl - ::testing::Values(2, 5, 7), // ku + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values(11, 65, 255, 1023), // m + ::testing::Values(14, 63, 257, 1010), // n + ::testing::Values(3, 4, 9), // kl + ::testing::Values(2, 5, 7), // ku ::testing::Values(0.0, 1.0, 1.5), // alpha ::testing::Values(0.0, 1.0, 1.5), // beta ::testing::Values(true, false), // trans @@ -103,11 +149,12 @@ const auto combi = // (the stress_test above takes about ~5 minutes) template const auto combi = - ::testing::Combine(::testing::Values(11, 1023), // m - ::testing::Values(14, 1010), // n - ::testing::Values(3, 4), // kl - ::testing::Values(2, 3), // ku - ::testing::Values(1.5), // alpha + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values(11, 1023), // m + ::testing::Values(14, 1010), // n + ::testing::Values(3, 4), // kl + ::testing::Values(2, 3), // ku + ::testing::Values(1.5), // alpha ::testing::Values(0.0, 1.5), // beta ::testing::Values(false, true), // trans ::testing::Values(2), // incX @@ -119,11 +166,12 
@@ const auto combi = template static std::string generate_name( const ::testing::TestParamInfo>& info) { + std::string alloc; int m, n, kl, ku, incX, incY, ldaMul; T alpha, beta; bool trans; - BLAS_GENERATE_NAME(info.param, m, n, kl, ku, alpha, beta, trans, incX, incY, - ldaMul); + BLAS_GENERATE_NAME(info.param, alloc, m, n, kl, ku, alpha, beta, trans, incX, + incY, ldaMul); } BLAS_REGISTER_TEST_ALL(Gbmv, combination_t, combi, generate_name); diff --git a/test/unittest/blas2/blas2_gemv_test.cpp b/test/unittest/blas2/blas2_gemv_test.cpp index 75c2a9853..ced6bb972 100644 --- a/test/unittest/blas2/blas2_gemv_test.cpp +++ b/test/unittest/blas2/blas2_gemv_test.cpp @@ -26,10 +26,12 @@ #include "blas_test.hpp" template -using combination_t = std::tuple; +using combination_t = + std::tuple; -template +template void run_test(const combination_t combi) { + std::string alloc; index_t m; index_t n; bool trans; @@ -38,7 +40,7 @@ void run_test(const combination_t combi) { index_t incX; index_t incY; index_t lda_mul; - std::tie(m, n, alpha, beta, trans, incX, incY, lda_mul) = combi; + std::tie(alloc, m, n, alpha, beta, trans, incX, incY, lda_mul) = combi; const char* t_str = trans ? "t" : "n"; @@ -64,27 +66,66 @@ void run_test(const combination_t combi) { auto q = make_queue(); blas::SB_Handle sb_handle(q); - auto m_a_gpu = blas::make_sycl_iterator_buffer(a_m, a_size); - auto v_x_gpu = blas::make_sycl_iterator_buffer(x_v, x_size); - auto v_y_gpu = - blas::make_sycl_iterator_buffer(y_v_gpu_result, y_size); + auto m_a_gpu = helper::allocate(a_size, q); + auto v_x_gpu = helper::allocate(x_size, q); + auto v_y_gpu = helper::allocate(y_size, q); + + auto copy_m = + helper::copy_to_device(q, a_m.data(), m_a_gpu, a_size); + auto copy_x = + helper::copy_to_device(q, x_v.data(), v_x_gpu, x_size); + auto copy_y = helper::copy_to_device(q, y_v_gpu_result.data(), + v_y_gpu, y_size); + + sb_handle.wait({copy_m, copy_x, copy_y}); // SYCLGEMV - _gemv(sb_handle, *t_str, m, n, alpha, m_a_gpu, lda_mul * m, v_x_gpu, incX, - beta, v_y_gpu, incY); - auto event = blas::helper::copy_to_host(sb_handle.get_queue(), v_y_gpu, - y_v_gpu_result.data(), y_size); + auto gemv_event = _gemv(sb_handle, *t_str, m, n, alpha, m_a_gpu, lda_mul * m, + v_x_gpu, incX, beta, v_y_gpu, incY); + sb_handle.wait(gemv_event); + + auto event = + blas::helper::copy_to_host(q, v_y_gpu, y_v_gpu_result.data(), y_size); sb_handle.wait(event); const bool isAlmostEqual = utils::compare_vectors(y_v_gpu_result, y_v_cpu); ASSERT_TRUE(isAlmostEqual); + + helper::deallocate(m_a_gpu, q); + helper::deallocate(v_x_gpu, q); + helper::deallocate(v_y_gpu, q); +} + +template +void run_test(const combination_t combi) { + std::string alloc; + index_t m; + index_t n; + bool trans; + scalar_t alpha; + scalar_t beta; + index_t incX; + index_t incY; + index_t lda_mul; + std::tie(alloc, m, n, alpha, beta, trans, incX, incY, lda_mul) = combi; + + if (alloc == "usm") { +#ifdef SB_ENABLE_USM + run_test(combi); +#else + GTEST_SKIP(); +#endif + } else { + run_test(combi); + } } #ifdef STRESS_TESTING template const auto combi = - ::testing::Combine(::testing::Values(11, 65, 255, 1023), // m - ::testing::Values(14, 63, 257, 1010), // n + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values(11, 65, 255, 1023), // m + ::testing::Values(14, 63, 257, 1010), // n ::testing::Values(0.0, 1.0, 1.5), // alpha ::testing::Values(0.0, 1.0, 1.5), // beta ::testing::Values(true, false), // trans @@ -97,9 +138,10 @@ const auto combi = // (the stress_test above 
takes about ~5 minutes) template const auto combi = - ::testing::Combine(::testing::Values(11, 1023), // m - ::testing::Values(14, 1010), // n - ::testing::Values(1.5), // alpha + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values(11, 1023), // m + ::testing::Values(14, 1010), // n + ::testing::Values(1.5), // alpha ::testing::Values(0.0, 1.5), // beta ::testing::Values(false, true), // trans ::testing::Values(2), // incX @@ -111,10 +153,12 @@ const auto combi = template static std::string generate_name( const ::testing::TestParamInfo>& info) { + std::string alloc; int m, n, incX, incY, ldaMul; T alpha, beta; bool trans; - BLAS_GENERATE_NAME(info.param, m, n, alpha, beta, trans, incX, incY, ldaMul); + BLAS_GENERATE_NAME(info.param, alloc, m, n, alpha, beta, trans, incX, incY, + ldaMul); } BLAS_REGISTER_TEST_ALL(Gemv, combination_t, combi, generate_name); diff --git a/test/unittest/blas2/blas2_ger_test.cpp b/test/unittest/blas2/blas2_ger_test.cpp index bcef2fa3d..b4b8343d9 100644 --- a/test/unittest/blas2/blas2_ger_test.cpp +++ b/test/unittest/blas2/blas2_ger_test.cpp @@ -26,17 +26,19 @@ #include "blas_test.hpp" template -using combination_t = std::tuple; +using combination_t = + std::tuple; -template +template void run_test(const combination_t combi) { + std::string alloc; index_t m; index_t n; index_t lda_mul; index_t incX; index_t incY; scalar_t alpha; - std::tie(m, n, alpha, incX, incY, lda_mul) = combi; + std::tie(alloc, m, n, alpha, incX, incY, lda_mul) = combi; index_t lda = m * lda_mul; // Input matrix @@ -56,26 +58,64 @@ void run_test(const combination_t combi) { auto q = make_queue(); blas::SB_Handle sb_handle(q); - auto v_a_gpu = blas::make_sycl_iterator_buffer(a_v, m * incX); - auto v_b_gpu = blas::make_sycl_iterator_buffer(b_v, n * incY); - auto m_c_gpu = - blas::make_sycl_iterator_buffer(c_m_gpu_result, lda * n); + auto v_a_gpu = helper::allocate(m * incX, q); + auto v_b_gpu = helper::allocate(n * incY, q); + auto m_c_gpu = helper::allocate(lda * n, q); + + auto copy_a = + helper::copy_to_device(q, a_v.data(), v_a_gpu, m * incX); + auto copy_b = + helper::copy_to_device(q, b_v.data(), v_b_gpu, n * incY); + auto copy_c = helper::copy_to_device(q, c_m_gpu_result.data(), + m_c_gpu, lda * n); + + sb_handle.wait({copy_a, copy_b, copy_c}); // SYCLger - _ger(sb_handle, m, n, alpha, v_a_gpu, incX, v_b_gpu, incY, m_c_gpu, lda); + auto ger_event = + _ger(sb_handle, m, n, alpha, v_a_gpu, incX, v_b_gpu, incY, m_c_gpu, lda); - auto event = blas::helper::copy_to_host(sb_handle.get_queue(), m_c_gpu, - c_m_gpu_result.data(), lda * n); + sb_handle.wait(ger_event); + + auto event = + blas::helper::copy_to_host(q, m_c_gpu, c_m_gpu_result.data(), lda * n); sb_handle.wait(event); const bool isAlmostEqual = utils::compare_vectors(c_m_gpu_result, c_m_cpu); ASSERT_TRUE(isAlmostEqual); + + helper::deallocate(v_a_gpu, q); + helper::deallocate(v_b_gpu, q); + helper::deallocate(m_c_gpu, q); +} + +template +void run_test(const combination_t combi) { + std::string alloc; + index_t m; + index_t n; + index_t lda_mul; + index_t incX; + index_t incY; + scalar_t alpha; + std::tie(alloc, m, n, alpha, incX, incY, lda_mul) = combi; + + if (alloc == "usm") { +#ifdef SB_ENABLE_USM + run_test(combi); +#else + GTEST_SKIP(); +#endif + } else { + run_test(combi); + } } #ifdef STRESS_TESTING template const auto combi = - ::testing::Combine(::testing::Values(11, 65, 255, 1023, 1024 * 1024), // m + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + 
::testing::Values(11, 65, 255, 1023, 1024 * 1024), // m ::testing::Values(14, 63, 257, 1010, 1024 * 1024), // n ::testing::Values(0.0, 1.0, 1.5), // alpha ::testing::Values(1, 2), // incX @@ -86,8 +126,9 @@ const auto combi = // For the purpose of travis and other slower platforms, we need a faster test template const auto combi = - ::testing::Combine(::testing::Values(11, 1023), // m - ::testing::Values(14, 1010), // n + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values(11, 1023), // m + ::testing::Values(14, 1010), // n ::testing::Values(0.0, 1.5), // alpha ::testing::Values(2), // incX ::testing::Values(3), // incY @@ -98,9 +139,10 @@ const auto combi = template static std::string generate_name( const ::testing::TestParamInfo>& info) { + std::string alloc; int m, n, incX, incY, ldaMul; T alpha; - BLAS_GENERATE_NAME(info.param, m, n, alpha, incX, incY, ldaMul); + BLAS_GENERATE_NAME(info.param, alloc, m, n, alpha, incX, incY, ldaMul); } BLAS_REGISTER_TEST_ALL(Ger, combination_t, combi, generate_name); diff --git a/test/unittest/blas2/blas2_sbmv_test.cpp b/test/unittest/blas2/blas2_sbmv_test.cpp index 5988662d2..1a5bdfc9b 100644 --- a/test/unittest/blas2/blas2_sbmv_test.cpp +++ b/test/unittest/blas2/blas2_sbmv_test.cpp @@ -26,10 +26,12 @@ #include "blas_test.hpp" template -using combination_t = std::tuple; +using combination_t = + std::tuple; -template +template void run_test(const combination_t combi) { + std::string alloc; index_t n; index_t k; bool upper; @@ -38,7 +40,7 @@ void run_test(const combination_t combi) { index_t incX; index_t incY; index_t lda_mul; - std::tie(n, k, alpha, beta, upper, incX, incY, lda_mul) = combi; + std::tie(alloc, n, k, alpha, beta, upper, incX, incY, lda_mul) = combi; const char* uplo_str = upper ? 
"u" : "l"; @@ -64,28 +66,67 @@ void run_test(const combination_t combi) { auto q = make_queue(); blas::SB_Handle sb_handle(q); - auto m_a_gpu = blas::make_sycl_iterator_buffer(a_m, a_size); - auto v_x_gpu = blas::make_sycl_iterator_buffer(x_v, x_size); - auto v_y_gpu = - blas::make_sycl_iterator_buffer(y_v_gpu_result, y_size); + auto m_a_gpu = helper::allocate(a_size, q); + auto v_x_gpu = helper::allocate(x_size, q); + auto v_y_gpu = helper::allocate(y_size, q); + + auto copy_m = + helper::copy_to_device(q, a_m.data(), m_a_gpu, a_size); + auto copy_x = + helper::copy_to_device(q, x_v.data(), v_x_gpu, x_size); + auto copy_y = helper::copy_to_device(q, y_v_gpu_result.data(), + v_y_gpu, y_size); + sb_handle.wait({copy_m, copy_x, copy_y}); // SYCL SBMV - _sbmv(sb_handle, *uplo_str, n, k, alpha, m_a_gpu, (k + 1) * lda_mul, v_x_gpu, - incX, beta, v_y_gpu, incY); + auto sbmv_event = + _sbmv(sb_handle, *uplo_str, n, k, alpha, m_a_gpu, (k + 1) * lda_mul, + v_x_gpu, incX, beta, v_y_gpu, incY); + + sb_handle.wait(sbmv_event); - auto event = blas::helper::copy_to_host(sb_handle.get_queue(), v_y_gpu, - y_v_gpu_result.data(), y_size); + auto event = + blas::helper::copy_to_host(q, v_y_gpu, y_v_gpu_result.data(), y_size); sb_handle.wait(event); const bool isAlmostEqual = utils::compare_vectors(y_v_gpu_result, y_v_cpu); ASSERT_TRUE(isAlmostEqual); + + helper::deallocate(m_a_gpu, q); + helper::deallocate(v_x_gpu, q); + helper::deallocate(v_y_gpu, q); +} + +template +void run_test(const combination_t combi) { + std::string alloc; + index_t n; + index_t k; + bool upper; + scalar_t alpha; + scalar_t beta; + index_t incX; + index_t incY; + index_t lda_mul; + std::tie(alloc, n, k, alpha, beta, upper, incX, incY, lda_mul) = combi; + + if (alloc == "usm") { +#ifdef SB_ENABLE_USM + run_test(combi); +#else + GTEST_SKIP(); +#endif + } else { + run_test(combi); + } } #ifdef STRESS_TESTING template const auto combi = - ::testing::Combine(::testing::Values(14, 63, 257, 1010), // n - ::testing::Values(3, 4, 9), // k + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values(14, 63, 257, 1010), // n + ::testing::Values(3, 4, 9), // k ::testing::Values(0.0, 1.0, 1.5), // alpha ::testing::Values(0.0, 1.0, 1.5), // beta ::testing::Values(true, false), // upper @@ -98,9 +139,10 @@ const auto combi = // (the stress_test above takes about ~5 minutes) template const auto combi = - ::testing::Combine(::testing::Values(14, 1010), // n - ::testing::Values(3, 4), // kl - ::testing::Values(1.5), // alpha + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values(14, 1010), // n + ::testing::Values(3, 4), // kl + ::testing::Values(1.5), // alpha ::testing::Values(0.0, 1.5), // beta ::testing::Values(true, false), // upper ::testing::Values(2), // incX @@ -112,10 +154,12 @@ const auto combi = template static std::string generate_name( const ::testing::TestParamInfo>& info) { + std::string alloc; int n, k, incX, incY, ldaMul; T alpha, beta; bool upper; - BLAS_GENERATE_NAME(info.param, n, k, alpha, beta, upper, incX, incY, ldaMul); + BLAS_GENERATE_NAME(info.param, alloc, n, k, alpha, beta, upper, incX, incY, + ldaMul); } BLAS_REGISTER_TEST_ALL(Sbmv, combination_t, combi, generate_name); diff --git a/test/unittest/blas2/blas2_spmv_test.cpp b/test/unittest/blas2/blas2_spmv_test.cpp index 79d383ee5..d137f4a12 100644 --- a/test/unittest/blas2/blas2_spmv_test.cpp +++ b/test/unittest/blas2/blas2_spmv_test.cpp @@ -26,17 +26,19 @@ #include "blas_test.hpp" template -using 
combination_t = std::tuple; +using combination_t = + std::tuple; -template +template void run_test(const combination_t combi) { + std::string alloc; index_t n; bool upper; scalar_t alpha; scalar_t beta; index_t incX; index_t incY; - std::tie(n, alpha, beta, upper, incX, incY) = combi; + std::tie(alloc, n, alpha, beta, upper, incX, incY) = combi; const char* uplo_str = upper ? "u" : "l"; @@ -62,14 +64,22 @@ void run_test(const combination_t combi) { auto q = make_queue(); blas::SB_Handle sb_handle(q); - auto m_a_gpu = blas::make_sycl_iterator_buffer(a_m, a_size); - auto v_x_gpu = blas::make_sycl_iterator_buffer(x_v, x_size); - auto v_y_gpu = - blas::make_sycl_iterator_buffer(y_v_gpu_result, y_size); + auto m_a_gpu = blas::helper::allocate(a_size, q); + auto v_x_gpu = blas::helper::allocate(x_size, q); + auto v_y_gpu = blas::helper::allocate(y_size, q); + + auto copy_m = + helper::copy_to_device(q, a_m.data(), m_a_gpu, a_size); + auto copy_x = + helper::copy_to_device(q, x_v.data(), v_x_gpu, x_size); + auto copy_y = helper::copy_to_device(q, y_v_gpu_result.data(), + v_y_gpu, y_size); // SYCL SPMV - _spmv(sb_handle, *uplo_str, n, alpha, m_a_gpu, v_x_gpu, incX, beta, v_y_gpu, - incY); + auto spmv_event = _spmv(sb_handle, *uplo_str, n, alpha, m_a_gpu, v_x_gpu, + incX, beta, v_y_gpu, incY, {copy_m, copy_x, copy_y}); + + sb_handle.wait(spmv_event); auto event = blas::helper::copy_to_host(sb_handle.get_queue(), v_y_gpu, y_v_gpu_result.data(), y_size); @@ -78,12 +88,39 @@ void run_test(const combination_t combi) { const bool isAlmostEqual = utils::compare_vectors(y_v_gpu_result, y_v_cpu); ASSERT_TRUE(isAlmostEqual); + + helper::deallocate(m_a_gpu, q); + helper::deallocate(v_x_gpu, q); + helper::deallocate(v_y_gpu, q); +} + +template +void run_test(const combination_t combi) { + std::string alloc; + index_t n; + bool upper; + scalar_t alpha; + scalar_t beta; + index_t incX; + index_t incY; + std::tie(alloc, n, alpha, beta, upper, incX, incY) = combi; + + if (alloc == "usm") { +#ifdef SB_ENABLE_USM + run_test(combi); +#else + GTEST_SKIP(); +#endif + } else { + run_test(combi); + } } #ifdef STRESS_TESTING template const auto combi = - ::testing::Combine(::testing::Range(1, 999), // n + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Range(1, 999), // n ::testing::Values(1.0, 1.5, 6.0), // alpha ::testing::Values(0.0, 1.0, 1.5), // beta ::testing::Values(true, false), // upper @@ -95,7 +132,8 @@ const auto combi = // (the stress_test above takes about ~5 minutes) template const auto combi = - ::testing::Combine(::testing::Values(14, 63, 257, 1010, 7717), // n + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values(14, 63, 257, 1010, 7717), // n ::testing::Values(1.0, 6.0), // alpha ::testing::Values(0.0, 1.0), // beta ::testing::Values(true, false), // upper @@ -107,10 +145,11 @@ const auto combi = template static std::string generate_name( const ::testing::TestParamInfo>& info) { + std::string alloc; index_t n, k, incX, incY; T alpha, beta; bool upper; - BLAS_GENERATE_NAME(info.param, n, alpha, beta, upper, incX, incY); + BLAS_GENERATE_NAME(info.param, alloc, n, alpha, beta, upper, incX, incY); } BLAS_REGISTER_TEST_ALL(Spmv, combination_t, combi, generate_name); diff --git a/test/unittest/blas2/blas2_spr2_test.cpp b/test/unittest/blas2/blas2_spr2_test.cpp index 0fe5996e9..b6621afbc 100644 --- a/test/unittest/blas2/blas2_spr2_test.cpp +++ b/test/unittest/blas2/blas2_spr2_test.cpp @@ -27,15 +27,16 @@ template using combination_t = 
- std::tuple; + std::tuple; -template +template void run_test(const combination_t combi) { + std::string alloc; int n; scalar_t alpha; char layout, uplo; int incX, incY; - std::tie(layout, uplo, n, alpha, incX, incY) = combi; + std::tie(alloc, layout, uplo, n, alpha, incX, incY) = combi; const size_t x_size = 1 + (n - 1) * incX; const size_t y_size = 1 + (n - 1) * incY; @@ -61,12 +62,22 @@ void run_test(const combination_t combi) { auto q = make_queue(); SB_Handle sb_handle(q); - auto vx_gpu = blas::make_sycl_iterator_buffer(vx_cpu, x_size); - auto vy_gpu = blas::make_sycl_iterator_buffer(vy_cpu, y_size); + auto vx_gpu = helper::allocate(x_size, q); + auto vy_gpu = helper::allocate(y_size, q); + + auto a_mp_gpu = helper::allocate(m_size, q); - auto a_mp_gpu = blas::make_sycl_iterator_buffer(a_mp, m_size); + auto copy_x = + helper::copy_to_device(q, vx_cpu.data(), vx_gpu, x_size); + auto copy_y = + helper::copy_to_device(q, vy_cpu.data(), vy_gpu, y_size); + auto copy_a = + helper::copy_to_device(q, a_mp.data(), a_mp_gpu, m_size); - _spr2(sb_handle, uplo, n, alpha, vx_gpu, incX, vy_gpu, incY, a_mp_gpu); + auto spr2_event = _spr2(sb_handle, uplo, n, alpha, vx_gpu, incX, vy_gpu, incY, + a_mp_gpu, {copy_x, copy_y, copy_a}); + + sb_handle.wait(spr2_event); auto event = helper::copy_to_host(sb_handle.get_queue(), a_mp_gpu, a_mp.data(), m_size); @@ -75,13 +86,38 @@ void run_test(const combination_t combi) { const bool isAlmostEqual = utils::compare_vectors(a_mp, a_cpu_mp); ASSERT_TRUE(isAlmostEqual); + + helper::deallocate(vx_gpu, q); + helper::deallocate(vy_gpu, q); + helper::deallocate(a_mp_gpu, q); +} + +template +void run_test(const combination_t combi) { + std::string alloc; + int n; + scalar_t alpha; + char layout, uplo; + int incX, incY; + std::tie(alloc, layout, uplo, n, alpha, incX, incY) = combi; + + if (alloc == "usm") { +#ifdef SB_ENABLE_USM + run_test(combi); +#else + GTEST_SKIP(); +#endif + } else { + run_test(combi); + } } #ifdef STRESS_TESTING template const auto combi = - ::testing::Combine(::testing::Values('r', 'c'), // matrix layout - ::testing::Values('u', 'l'), // UPLO + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values('r', 'c'), // matrix layout + ::testing::Values('u', 'l'), // UPLO ::testing::Values(1024, 2048, 4096, 8192, 16384), // n ::testing::Values(0.0, 1.0, 1.5), // alpha ::testing::Values(1, 2), // incX @@ -91,7 +127,8 @@ const auto combi = // For the purpose of travis and other slower platforms, we need a faster test template const auto combi = - ::testing::Combine(::testing::Values('r', 'c'), // matrix layout + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values('r', 'c'), // matrix layout ::testing::Values('u', 'l'), // UPLO ::testing::Values(14, 63, 257, 1010), // n ::testing::Values(1.0), // alpha @@ -103,10 +140,11 @@ const auto combi = template static std::string generate_name( const ::testing::TestParamInfo>& info) { + std::string alloc; char layout, uplo; int n, incX, incY; T alpha; - BLAS_GENERATE_NAME(info.param, layout, uplo, n, alpha, incX, incY); + BLAS_GENERATE_NAME(info.param, alloc, layout, uplo, n, alpha, incX, incY); } BLAS_REGISTER_TEST_ALL(Spr2, combination_t, combi, generate_name); diff --git a/test/unittest/blas2/blas2_spr_test.cpp b/test/unittest/blas2/blas2_spr_test.cpp index 94177b2b2..93fcf3ea4 100644 --- a/test/unittest/blas2/blas2_spr_test.cpp +++ b/test/unittest/blas2/blas2_spr_test.cpp @@ -26,16 +26,18 @@ #include "blas_test.hpp" template -using 
combination_t = std::tuple; +using combination_t = + std::tuple; -template +template void run_test(const combination_t combi) { + std::string alloc; index_t n; index_t lda_mul; index_t incX; char layout, uplo; scalar_t alpha; - std::tie(layout, uplo, n, alpha, incX) = combi; + std::tie(alloc, layout, uplo, n, alpha, incX) = combi; index_t mA_size = n * n; index_t x_size = 1 + (n - 1) * std::abs(incX); @@ -56,26 +58,60 @@ void run_test(const combination_t combi) { auto q = make_queue(); blas::SB_Handle sb_handle(q); - auto x_v_gpu = blas::make_sycl_iterator_buffer(x_v, x_size); - auto a_mp_gpu = blas::make_sycl_iterator_buffer(a_mp, mA_size); + auto x_v_gpu = helper::allocate(x_size, q); + auto a_mp_gpu = helper::allocate(mA_size, q); + + auto copy_x = + helper::copy_to_device(q, x_v.data(), x_v_gpu, x_size); + auto copy_a = + helper::copy_to_device(q, a_mp.data(), a_mp_gpu, mA_size); + + sb_handle.wait({copy_x, copy_a}); // SYCLspr - _spr(sb_handle, uplo, n, alpha, x_v_gpu, incX, a_mp_gpu); + auto spr_event = _spr(sb_handle, uplo, n, alpha, + x_v_gpu, incX, a_mp_gpu); - auto event = blas::helper::copy_to_host(sb_handle.get_queue(), a_mp_gpu, - a_mp.data(), mA_size); + sb_handle.wait(spr_event); + + auto event = blas::helper::copy_to_host(q, a_mp_gpu, a_mp.data(), mA_size); sb_handle.wait(event); const bool isAlmostEqual = utils::compare_vectors(a_mp, a_cpu_mp); ASSERT_TRUE(isAlmostEqual); + + helper::deallocate(x_v_gpu, q); + helper::deallocate(a_mp_gpu, q); +} + +template +void run_test(const combination_t combi) { + std::string alloc; + index_t n; + index_t lda_mul; + index_t incX; + char layout, uplo; + scalar_t alpha; + std::tie(alloc, layout, uplo, n, alpha, incX) = combi; + + if (alloc == "usm") { +#ifdef SB_ENABLE_USM + run_test(combi); +#else + GTEST_SKIP(); +#endif + } else { + run_test(combi); + } } #ifdef STRESS_TESTING template const auto combi = - ::testing::Combine(::testing::Values('r', 'c'), // matrix layout - ::testing::Values('u', 'l'), // UPLO + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values('r', 'c'), // matrix layout + ::testing::Values('u', 'l'), // UPLO ::testing::Values(1024, 2048, 4096, 8192, 16384), // n ::testing::Values(0.0, 1.0, 1.5), // alpha ::testing::Values(1, 2) // incX @@ -84,7 +120,8 @@ const auto combi = // For the purpose of travis and other slower platforms, we need a faster test template const auto combi = - ::testing::Combine(::testing::Values('r', 'c'), // matrix layout + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values('r', 'c'), // matrix layout ::testing::Values('u', 'l'), // UPLO ::testing::Values(14, 63, 257, 1010), // n ::testing::Values(1.0), // alpha @@ -95,10 +132,11 @@ const auto combi = template static std::string generate_name( const ::testing::TestParamInfo>& info) { + std::string alloc; char layout, uplo; int n, incX; T alpha; - BLAS_GENERATE_NAME(info.param, layout, uplo, n, alpha, incX); + BLAS_GENERATE_NAME(info.param, alloc, layout, uplo, n, alpha, incX); } BLAS_REGISTER_TEST_ALL(Spr, combination_t, combi, generate_name); diff --git a/test/unittest/blas2/blas2_symv_test.cpp b/test/unittest/blas2/blas2_symv_test.cpp index f59da57ca..39e6b0417 100644 --- a/test/unittest/blas2/blas2_symv_test.cpp +++ b/test/unittest/blas2/blas2_symv_test.cpp @@ -26,10 +26,12 @@ #include "blas_test.hpp" template -using combination_t = std::tuple; +using combination_t = + std::tuple; -template +template void run_test(const combination_t combi) { + std::string alloc; 
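// A minimal sketch of the staging pattern each converted test follows:
// allocate device storage for the chosen allocation type, copy the host data
// in, run the operator, wait on the returned events, copy the result back and
// deallocate. It is written with plain SYCL 2020 device USM
// (sycl::malloc_device / queue::memcpy / sycl::free) rather than the
// blas::helper wrappers used in this patch, so it stands on its own.
#include <sycl/sycl.hpp>
#include <vector>

inline void stage_run_and_fetch(sycl::queue& q, const std::vector<float>& in,
                                std::vector<float>& out) {
  const std::size_t n = in.size();
  float* dev = sycl::malloc_device<float>(n, q);        // device-only USM
  sycl::event copy_in = q.memcpy(dev, in.data(), n * sizeof(float));
  copy_in.wait();                // or forward as a dependency of the operator
  /* ... launch the BLAS operator on `dev` here and wait on its events ... */
  out.resize(n);
  q.memcpy(out.data(), dev, n * sizeof(float)).wait();  // copy result to host
  sycl::free(dev, q);                                   // release the device memory
}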
index_t n; index_t lda_mul; index_t incX; @@ -37,7 +39,7 @@ void run_test(const combination_t combi) { char uplo; scalar_t alpha; scalar_t beta; - std::tie(uplo, n, alpha, lda_mul, incX, beta, incY) = combi; + std::tie(alloc, uplo, n, alpha, lda_mul, incX, beta, incY) = combi; index_t lda = n * lda_mul; // Input matrix @@ -58,26 +60,65 @@ void run_test(const combination_t combi) { auto q = make_queue(); blas::SB_Handle sb_handle(q); - auto a_m_gpu = blas::make_sycl_iterator_buffer(a_m, lda * n); - auto x_v_gpu = blas::make_sycl_iterator_buffer(x_v, n * incX); - auto y_v_gpu = blas::make_sycl_iterator_buffer(y_v, n * incY); + auto a_m_gpu = helper::allocate(lda * n, q); + auto x_v_gpu = helper::allocate(n * incX, q); + auto y_v_gpu = helper::allocate(n * incY, q); + + auto copy_a = + helper::copy_to_device(q, a_m.data(), a_m_gpu, lda * n); + auto copy_x = + helper::copy_to_device(q, x_v.data(), x_v_gpu, n * incX); + auto copy_y = + helper::copy_to_device(q, y_v.data(), y_v_gpu, n * incY); + + sb_handle.wait({copy_a, copy_x, copy_y}); // SYCLsymv - _symv(sb_handle, uplo, n, alpha, a_m_gpu, lda, x_v_gpu, incX, beta, y_v_gpu, - incY); + auto symv_event = _symv(sb_handle, uplo, n, alpha, a_m_gpu, lda, x_v_gpu, + incX, beta, y_v_gpu, incY); - auto event = blas::helper::copy_to_host(sb_handle.get_queue(), y_v_gpu, - y_v.data(), n * incY); + sb_handle.wait(symv_event); + + auto event = blas::helper::copy_to_host(q, y_v_gpu, y_v.data(), n * incY); sb_handle.wait(event); const bool isAlmostEqual = utils::compare_vectors(y_v, y_cpu_v); ASSERT_TRUE(isAlmostEqual); + + helper::deallocate(a_m_gpu, q); + helper::deallocate(x_v_gpu, q); + helper::deallocate(y_v_gpu, q); +} + +template +void run_test(const combination_t combi) { + std::string alloc; + index_t n; + index_t lda_mul; + index_t incX; + index_t incY; + char uplo; + scalar_t alpha; + scalar_t beta; + std::tie(alloc, uplo, n, alpha, lda_mul, incX, beta, incY) = combi; + index_t lda = n * lda_mul; + + if (alloc == "usm") { +#ifdef SB_ENABLE_USM + run_test(combi); +#else + GTEST_SKIP(); +#endif + } else { + run_test(combi); + } } #ifdef STRESS_TESTING template const auto combi = - ::testing::Combine(::testing::Values('u', 'l'), // UPLO + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values('u', 'l'), // UPLO ::testing::Values(14, 63, 257, 1010, 2025), // n ::testing::Values(0.0, 1.0, 1.5), // alpha ::testing::Values(1, 2), // lda_mul @@ -89,8 +130,9 @@ const auto combi = // For the purpose of travis and other slower platforms, we need a faster test template const auto combi = - ::testing::Combine(::testing::Values('u', 'l'), // UPLO - ::testing::Values(2025), // n + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values('u', 'l'), // UPLO + ::testing::Values(2025), // n ::testing::Values(0.0, 1.5), // alpha ::testing::Values(2), // lda_mul ::testing::Values(2), // incX @@ -102,10 +144,12 @@ const auto combi = template static std::string generate_name( const ::testing::TestParamInfo>& info) { + std::string alloc; char upl0; int n, ldaMul, incX, incY; T alpha, beta; - BLAS_GENERATE_NAME(info.param, upl0, n, alpha, ldaMul, incX, beta, incY); + BLAS_GENERATE_NAME(info.param, alloc, upl0, n, alpha, ldaMul, incX, beta, + incY); } BLAS_REGISTER_TEST_ALL(Symv, combination_t, combi, generate_name); diff --git a/test/unittest/blas2/blas2_syr2_test.cpp b/test/unittest/blas2/blas2_syr2_test.cpp index d08e80f36..cd978a773 100644 --- a/test/unittest/blas2/blas2_syr2_test.cpp +++ 
b/test/unittest/blas2/blas2_syr2_test.cpp @@ -26,17 +26,19 @@ #include "blas_test.hpp" template -using combination_t = std::tuple; +using combination_t = + std::tuple; -template +template void run_test(const combination_t combi) { + std::string alloc; index_t n; index_t lda_mul; index_t incX; index_t incY; char uplo; scalar_t alpha; - std::tie(uplo, n, alpha, incX, incY, lda_mul) = combi; + std::tie(alloc, uplo, n, alpha, incX, incY, lda_mul) = combi; index_t lda = n * lda_mul; // Input vector @@ -55,25 +57,63 @@ void run_test(const combination_t combi) { auto q = make_queue(); blas::SB_Handle sb_handle(q); - auto x_v_gpu = blas::make_sycl_iterator_buffer(x_v, n * incX); - auto y_v_gpu = blas::make_sycl_iterator_buffer(y_v, n * incY); - auto a_m_gpu = blas::make_sycl_iterator_buffer(a_m, lda * n); + auto x_v_gpu = helper::allocate(n * incX, q); + auto y_v_gpu = helper::allocate(n * incY, q); + auto a_m_gpu = helper::allocate(lda * n, q); + + auto copy_x = + helper::copy_to_device(q, x_v.data(), x_v_gpu, n * incX); + auto copy_y = + helper::copy_to_device(q, y_v.data(), y_v_gpu, n * incY); + auto copy_a = + helper::copy_to_device(q, a_m.data(), a_m_gpu, lda * n); + + sb_handle.wait({copy_x, copy_y, copy_a}); // SYCLsyr2 - _syr2(sb_handle, uplo, n, alpha, x_v_gpu, incX, y_v_gpu, incY, a_m_gpu, lda); + auto syr2_event = _syr2(sb_handle, uplo, n, alpha, x_v_gpu, incX, y_v_gpu, + incY, a_m_gpu, lda); - auto event = blas::helper::copy_to_host(sb_handle.get_queue(), a_m_gpu, - a_m.data(), n * lda); + sb_handle.wait(syr2_event); + + auto event = blas::helper::copy_to_host(q, a_m_gpu, a_m.data(), n * lda); sb_handle.wait(event); const bool isAlmostEqual = utils::compare_vectors(a_m, a_cpu_m); ASSERT_TRUE(isAlmostEqual); + + helper::deallocate(x_v_gpu, q); + helper::deallocate(y_v_gpu, q); + helper::deallocate(a_m_gpu, q); +} + +template +void run_test(const combination_t combi) { + std::string alloc; + index_t n; + index_t lda_mul; + index_t incX; + index_t incY; + char uplo; + scalar_t alpha; + std::tie(alloc, uplo, n, alpha, incX, incY, lda_mul) = combi; + + if (alloc == "usm") { +#ifdef SB_ENABLE_USM + run_test(combi); +#else + GTEST_SKIP(); +#endif + } else { + run_test(combi); + } } #ifdef STRESS_TESTING template const auto combi = - ::testing::Combine(::testing::Values('u', 'l'), // UPLO + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values('u', 'l'), // UPLO ::testing::Values(14, 63, 257, 1010, 2025), // n ::testing::Values(0.0, 1.0, 1.5), // alpha ::testing::Values(1, 2), // incX @@ -84,8 +124,9 @@ const auto combi = // For the purpose of travis and other slower platforms, we need a faster test template const auto combi = - ::testing::Combine(::testing::Values('u', 'l'), // UPLO - ::testing::Values(2025), // n + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values('u', 'l'), // UPLO + ::testing::Values(2025), // n ::testing::Values(0.0, 1.5), // alpha ::testing::Values(2), // incX ::testing::Values(2), // incY @@ -96,10 +137,11 @@ const auto combi = template static std::string generate_name( const ::testing::TestParamInfo>& info) { + std::string alloc; char upl0; int n, incX, incY, ldaMul; T alpha; - BLAS_GENERATE_NAME(info.param, upl0, n, alpha, incX, incY, ldaMul); + BLAS_GENERATE_NAME(info.param, alloc, upl0, n, alpha, incX, incY, ldaMul); } BLAS_REGISTER_TEST_ALL(Syr2, combination_t, combi, generate_name); diff --git a/test/unittest/blas2/blas2_syr_test.cpp b/test/unittest/blas2/blas2_syr_test.cpp index 
3bb9f01b6..f5fa8e3db 100644 --- a/test/unittest/blas2/blas2_syr_test.cpp +++ b/test/unittest/blas2/blas2_syr_test.cpp @@ -26,16 +26,17 @@ #include "blas_test.hpp" template -using combination_t = std::tuple; +using combination_t = std::tuple; -template +template void run_test(const combination_t combi) { + std::string alloc; index_t n; index_t lda_mul; index_t incX; char uplo; scalar_t alpha; - std::tie(uplo, n, alpha, incX, lda_mul) = combi; + std::tie(alloc, uplo, n, alpha, incX, lda_mul) = combi; index_t lda = n * lda_mul; // Input vector @@ -51,24 +52,57 @@ void run_test(const combination_t combi) { auto q = make_queue(); blas::SB_Handle sb_handle(q); - auto x_v_gpu = blas::make_sycl_iterator_buffer(x_v, n * incX); - auto a_m_gpu = blas::make_sycl_iterator_buffer(a_m, lda * n); + auto x_v_gpu = helper::allocate(n * incX, q); + auto a_m_gpu = helper::allocate(lda * n, q); + + auto copy_x = + helper::copy_to_device(q, x_v.data(), x_v_gpu, n * incX); + auto copy_a = + helper::copy_to_device(q, a_m.data(), a_m_gpu, lda * n); + + sb_handle.wait({copy_x, copy_a}); // SYCLsyr - _syr(sb_handle, uplo, n, alpha, x_v_gpu, incX, a_m_gpu, lda); + auto syr_event = _syr(sb_handle, uplo, n, alpha, x_v_gpu, incX, a_m_gpu, lda); - auto event = blas::helper::copy_to_host(sb_handle.get_queue(), a_m_gpu, - a_m.data(), n * lda); + sb_handle.wait(syr_event); + + auto event = blas::helper::copy_to_host(q, a_m_gpu, a_m.data(), n * lda); sb_handle.wait(event); const bool isAlmostEqual = utils::compare_vectors(a_m, a_cpu_m); ASSERT_TRUE(isAlmostEqual); + + helper::deallocate(x_v_gpu, q); + helper::deallocate(a_m_gpu, q); +} + +template +void run_test(const combination_t combi) { + std::string alloc; + index_t n; + index_t lda_mul; + index_t incX; + char uplo; + scalar_t alpha; + std::tie(alloc, uplo, n, alpha, incX, lda_mul) = combi; + + if (alloc == "usm") { +#ifdef SB_ENABLE_USM + run_test(combi); +#else + GTEST_SKIP(); +#endif + } else { + run_test(combi); + } } #ifdef STRESS_TESTING template const auto combi = - ::testing::Combine(::testing::Values('u', 'l'), // UPLO + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values('u', 'l'), // UPLO ::testing::Values(14, 63, 257, 1010, 2025), // n ::testing::Values(0.0, 1.0, 1.5), // alpha ::testing::Values(1, 2), // incX @@ -78,8 +112,9 @@ const auto combi = // For the purpose of travis and other slower platforms, we need a faster test template const auto combi = - ::testing::Combine(::testing::Values('u', 'l'), // UPLO - ::testing::Values(14, 1010), // n + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values('u', 'l'), // UPLO + ::testing::Values(14, 1010), // n ::testing::Values(0.0, 1.5), // alpha ::testing::Values(2), // incX ::testing::Values(2) // lda_mul @@ -89,10 +124,11 @@ const auto combi = template static std::string generate_name( const ::testing::TestParamInfo>& info) { + std::string alloc; char upl0; int n, incX, ldaMul; T alpha; - BLAS_GENERATE_NAME(info.param, upl0, n, alpha, incX, ldaMul); + BLAS_GENERATE_NAME(info.param, alloc, upl0, n, alpha, incX, ldaMul); } BLAS_REGISTER_TEST_ALL(Syr, combination_t, combi, generate_name); diff --git a/test/unittest/blas2/blas2_tbmv_test.cpp b/test/unittest/blas2/blas2_tbmv_test.cpp index e6808f773..8a4d10d71 100644 --- a/test/unittest/blas2/blas2_tbmv_test.cpp +++ b/test/unittest/blas2/blas2_tbmv_test.cpp @@ -26,10 +26,12 @@ #include "blas_test.hpp" template -using combination_t = std::tuple; +using combination_t = + std::tuple; -template 
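// A minimal sketch of the dispatch shape used by the new run_test overloads:
// the first tuple element selects the allocation type, the USM instantiation
// is only reachable when SB_ENABLE_USM is defined, and otherwise the case is
// skipped rather than failed. AllocType below is a stand-in name for the
// helper allocation enum; the real template arguments belong to the library.
#include <gtest/gtest.h>
#include <string>
#include <tuple>

enum class AllocType { buffer, usm };  // stand-in for the helper allocation enum

template <typename scalar_t, AllocType mem_alloc>
void run_test_impl(int n) {
  /* allocate for mem_alloc, copy to device, run the operator, compare vectors */
}

template <typename scalar_t>
void run_test_dispatch(const std::tuple<std::string, int>& combi) {
  const std::string alloc = std::get<0>(combi);
  const int n = std::get<1>(combi);
  if (alloc == "usm") {
#ifdef SB_ENABLE_USM
    run_test_impl<scalar_t, AllocType::usm>(n);
#else
    GTEST_SKIP();  // USM support not compiled in: mark the case as skipped
#endif
  } else {
    run_test_impl<scalar_t, AllocType::buffer>(n);
  }
}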
+template void run_test(const combination_t combi) { + std::string alloc; index_t n; index_t k; bool trans; @@ -37,7 +39,7 @@ void run_test(const combination_t combi) { bool is_unit; index_t incX; index_t lda_mul; - std::tie(n, k, is_upper, trans, is_unit, incX, lda_mul) = combi; + std::tie(alloc, n, k, is_upper, trans, is_unit, incX, lda_mul) = combi; const char* t_str = trans ? "t" : "n"; const char* uplo_str = is_upper ? "u" : "l"; @@ -64,25 +66,60 @@ void run_test(const combination_t combi) { auto q = make_queue(); blas::SB_Handle sb_handle(q); - auto m_a_gpu = blas::make_sycl_iterator_buffer(a_m, a_size); - auto v_x_gpu = blas::make_sycl_iterator_buffer(x_v, x_size); + auto m_a_gpu = helper::allocate(a_size, q); + auto v_x_gpu = helper::allocate(x_size, q); + + auto copy_a = + helper::copy_to_device(q, a_m.data(), m_a_gpu, a_size); + auto copy_x = + helper::copy_to_device(q, x_v.data(), v_x_gpu, x_size); + + sb_handle.wait({copy_a, copy_x}); // SYCL TBMV - _tbmv(sb_handle, *uplo_str, *t_str, *diag_str, n, k, m_a_gpu, - (k + 1) * lda_mul, v_x_gpu, incX); + auto tbmv_event = _tbmv(sb_handle, *uplo_str, *t_str, *diag_str, n, k, + m_a_gpu, (k + 1) * lda_mul, v_x_gpu, incX); - auto event = blas::helper::copy_to_host(sb_handle.get_queue(), v_x_gpu, - x_v.data(), x_size); + sb_handle.wait(tbmv_event); + + auto event = blas::helper::copy_to_host(q, v_x_gpu, x_v.data(), x_size); sb_handle.wait(event); const bool isAlmostEqual = utils::compare_vectors(x_v, x_v_cpu); ASSERT_TRUE(isAlmostEqual); + + helper::deallocate(m_a_gpu, q); + helper::deallocate(v_x_gpu, q); +} + +template +void run_test(const combination_t combi) { + std::string alloc; + index_t n; + index_t k; + bool trans; + bool is_upper; + bool is_unit; + index_t incX; + index_t lda_mul; + std::tie(alloc, n, k, is_upper, trans, is_unit, incX, lda_mul) = combi; + + if (alloc == "usm") { +#ifdef SB_ENABLE_USM + run_test(combi); +#else + GTEST_SKIP(); +#endif + } else { + run_test(combi); + } } #ifdef STRESS_TESTING template const auto combi = - ::testing::Combine(::testing::Values(14, 63, 257, 1010), // n + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values(14, 63, 257, 1010), // n ::testing::Values(3, 4, 9), // k ::testing::Values(true, false), // is_upper ::testing::Values(true, false), // trans @@ -95,24 +132,27 @@ const auto combi = // (the stress_test above takes about ~5 minutes) template const auto combi = - ::testing::Combine(::testing::Values(14, 1010), // n - ::testing::Values(3, 4), // k - ::testing::Values(true, false), // is_upper - ::testing::Values(true, false), // trans - ::testing::Values(true, false), // is_unit - ::testing::Values(2), // incX - ::testing::Values(2) // lda_mul + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values(14, 1010), // n + ::testing::Values(3, 4), // k + ::testing::Values(true, false), // is_upper + ::testing::Values(true, false), // trans + ::testing::Values(true, false), // is_unit + ::testing::Values(2), // incX + ::testing::Values(2) // lda_mul ); #endif template static std::string generate_name( const ::testing::TestParamInfo>& info) { + std::string alloc; int n, k, incX, ldaMul; bool is_upper; bool trans; bool is_unit; - BLAS_GENERATE_NAME(info.param, n, k, is_upper, trans, is_unit, incX, ldaMul); + BLAS_GENERATE_NAME(info.param, alloc, n, k, is_upper, trans, is_unit, incX, + ldaMul); } BLAS_REGISTER_TEST_ALL(Tbmv, combination_t, combi, generate_name); diff --git a/test/unittest/blas2/blas2_tbsv_test.cpp 
b/test/unittest/blas2/blas2_tbsv_test.cpp index e9c0536c7..f9dec5f12 100644 --- a/test/unittest/blas2/blas2_tbsv_test.cpp +++ b/test/unittest/blas2/blas2_tbsv_test.cpp @@ -27,10 +27,11 @@ template using combination_t = - std::tuple; + std::tuple; -template +template void run_test(const combination_t combi) { + std::string alloc; index_t n; index_t k; bool trans; @@ -38,9 +39,10 @@ void run_test(const combination_t combi) { bool is_unit; index_t incX; index_t lda_mul; + scalar_t wa; scalar_t unused; /* Work around dpcpp compiler bug (https://github.com/intel/llvm/issues/7075) */ - std::tie(n, k, is_upper, trans, is_unit, incX, lda_mul, unused) = combi; + std::tie(alloc, n, k, is_upper, trans, is_unit, incX, lda_mul, unused) = combi; const char* t_str = trans ? "t" : "n"; const char* uplo_str = is_upper ? "u" : "l"; @@ -74,12 +76,19 @@ void run_test(const combination_t combi) { auto q = make_queue(); blas::SB_Handle sb_handle(q); - auto m_a_gpu = blas::make_sycl_iterator_buffer(a_m, a_size); - auto v_x_gpu = blas::make_sycl_iterator_buffer(x_v, x_size); + auto m_a_gpu = blas::helper::allocate(a_size, q); + auto v_x_gpu = blas::helper::allocate(x_size, q); + + auto copy_m = + blas::helper::copy_to_device(q, a_m.data(), m_a_gpu, a_size); + auto copy_v = + blas::helper::copy_to_device(q, x_v.data(), v_x_gpu, x_size); // SYCL TBSV - _tbsv(sb_handle, *uplo_str, *t_str, *diag_str, n, k, m_a_gpu, - (k + 1) * lda_mul, v_x_gpu, incX); + auto tbsv_event = + _tbsv(sb_handle, *uplo_str, *t_str, *diag_str, n, k, m_a_gpu, + (k + 1) * lda_mul, v_x_gpu, incX, {copy_m, copy_v}); + sb_handle.wait(tbsv_event); auto event = blas::helper::copy_to_host(sb_handle.get_queue(), v_x_gpu, x_v.data(), x_size); @@ -87,12 +96,37 @@ void run_test(const combination_t combi) { const bool isAlmostEqual = utils::compare_vectors(x_v, x_v_cpu); ASSERT_TRUE(isAlmostEqual); + + blas::helper::deallocate(m_a_gpu, q); + blas::helper::deallocate(v_x_gpu, q); +} + +template +void run_test(const combination_t combi) { + std::string alloc; + int n, k, incX, ldaMul; + bool is_upper; + bool trans; + bool is_unit; + scalar_t wa; + std::tie(alloc, n, k, is_upper, trans, is_unit, incX, ldaMul, wa) = combi; + + if (alloc == "usm") { +#ifdef SB_ENABLE_USM + run_test(combi); +#else + GTEST_SKIP(); +#endif + } else { + run_test(combi); + } } #ifdef STRESS_TESTING template const auto combi = - ::testing::Combine(::testing::Range(1000, 1200), // n + ::testing::Combine(::testing::Values("usm", "buffer"), // allocation type + ::testing::Range(1000, 1200), // n ::testing::Values(1, 23, 32, 34, 38, 72, 89, 120), // k ::testing::Values(true, false), // is_upper ::testing::Values(true, false), // trans @@ -106,6 +140,7 @@ const auto combi = // (the stress_test above takes about ~5 minutes) template const auto combi = ::testing::Combine( + ::testing::Values("usm", "buffer"), // allocation type ::testing::Values(121, 288, 448, 553, 600, 996, 1024, 1999, 5252), // n ::testing::Values(1, 23, 32, 34, 38, 72, 89, 120), // k ::testing::Values(true, false), // is_upper @@ -120,12 +155,14 @@ const auto combi = ::testing::Combine( template static std::string generate_name( const ::testing::TestParamInfo>& info) { + std::string alloc; index_t n, k, incX, ldaMul; bool is_upper; bool trans; bool is_unit; + T wa; T unused; - BLAS_GENERATE_NAME(info.param, n, k, is_upper, trans, is_unit, incX, ldaMul, + BLAS_GENERATE_NAME(info.param, alloc, n, k, is_upper, trans, is_unit, incX, ldaMul, unused); } diff --git a/test/unittest/blas2/blas2_tpmv_test.cpp 
b/test/unittest/blas2/blas2_tpmv_test.cpp index 28ee9bbd0..6ca51ce34 100644 --- a/test/unittest/blas2/blas2_tpmv_test.cpp +++ b/test/unittest/blas2/blas2_tpmv_test.cpp @@ -26,10 +26,12 @@ #include "blas_test.hpp" template -using combination_t = std::tuple; +using combination_t = + std::tuple; -template +template void run_test(const combination_t combi) { + std::string alloc; index_t n; bool trans; bool is_upper; @@ -37,7 +39,7 @@ void run_test(const combination_t combi) { index_t incX; scalar_t unused; /* Work around dpcpp compiler bug (https://github.com/intel/llvm/issues/7075) */ - std::tie(n, is_upper, trans, is_unit, incX, unused) = combi; + std::tie(alloc, n, is_upper, trans, is_unit, incX, unused) = combi; const char* t_str = trans ? "t" : "n"; const char* uplo_str = is_upper ? "u" : "l"; @@ -64,11 +66,18 @@ void run_test(const combination_t combi) { auto q = make_queue(); blas::SB_Handle sb_handle(q); - auto m_a_gpu = blas::make_sycl_iterator_buffer(a_m, a_size); - auto v_x_gpu = blas::make_sycl_iterator_buffer(x_v, x_size); + auto m_a_gpu = helper::allocate(a_size, q); + auto v_x_gpu = helper::allocate(x_size, q); + + auto copy_m = + helper::copy_to_device(q, a_m.data(), m_a_gpu, a_size); + auto copy_v = + helper::copy_to_device(q, x_v.data(), v_x_gpu, x_size); // SYCL TPMV - _tpmv(sb_handle, *uplo_str, *t_str, *diag_str, n, m_a_gpu, v_x_gpu, incX); + auto tpmv_event = _tpmv(sb_handle, *uplo_str, *t_str, *diag_str, n, m_a_gpu, + v_x_gpu, incX, {copy_m, copy_v}); + sb_handle.wait(tpmv_event); auto event = blas::helper::copy_to_host(sb_handle.get_queue(), v_x_gpu, x_v.data(), x_size); @@ -76,24 +85,52 @@ void run_test(const combination_t combi) { const bool isAlmostEqual = utils::compare_vectors(x_v, x_v_cpu); ASSERT_TRUE(isAlmostEqual); + + helper::deallocate(m_a_gpu, q); + helper::deallocate(v_x_gpu, q); +} + +template +void run_test(const combination_t combi) { + std::string alloc; + index_t n; + bool trans; + bool is_upper; + bool is_unit; + index_t incX; + scalar_t unused; /* Work around dpcpp compiler bug + (https://github.com/intel/llvm/issues/7075) */ + std::tie(alloc, n, is_upper, trans, is_unit, incX, unused) = combi; + + if (alloc == "usm") { +#ifdef SB_ENABLE_USM + run_test(combi); +#else + GTEST_SKIP(); +#endif + } else { + run_test(combi); + } } #ifdef STRESS_TESTING template const auto combi = - ::testing::Combine(::testing::Range(1, 999), // n - ::testing::Values(true, false), // is_upper - ::testing::Values(true, false), // trans - ::testing::Values(true, false), // is_unit - ::testing::Values(1, 2, 3), // incX - ::testing::Values(0) // unused + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Range(1, 999), // n + ::testing::Values(true, false), // is_upper + ::testing::Values(true, false), // trans + ::testing::Values(true, false), // is_unit + ::testing::Values(1, 2, 3), // incX + ::testing::Values(0) // unused ); #else // For the purpose of travis and other slower platforms, we need a faster test // (the stress_test above takes about ~5 minutes) template const auto combi = - ::testing::Combine(::testing::Values(14, 63, 257, 1010, 7717), // n + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values(14, 63, 257, 1010, 7717), // n ::testing::Values(true, false), // is_upper ::testing::Values(true, false), // trans ::testing::Values(true, false), // is_unit @@ -105,12 +142,14 @@ const auto combi = template static std::string generate_name( const ::testing::TestParamInfo>& info) { + std::string 
alloc; index_t n, incX; bool is_upper; bool trans; bool is_unit; T unused; - BLAS_GENERATE_NAME(info.param, n, is_upper, trans, is_unit, incX, unused); + BLAS_GENERATE_NAME(info.param, alloc, n, is_upper, trans, is_unit, incX, + unused); } BLAS_REGISTER_TEST_ALL(Tpmv, combination_t, combi, generate_name); diff --git a/test/unittest/blas2/blas2_tpsv_test.cpp b/test/unittest/blas2/blas2_tpsv_test.cpp index 5797f163a..dbc834b75 100644 --- a/test/unittest/blas2/blas2_tpsv_test.cpp +++ b/test/unittest/blas2/blas2_tpsv_test.cpp @@ -26,10 +26,11 @@ #include "blas_test.hpp" template -using combination_t = std::tuple; +using combination_t = std::tuple; -template +template void run_test(const combination_t combi) { + std::string alloc; index_t n; bool trans; bool is_upper; @@ -37,7 +38,7 @@ void run_test(const combination_t combi) { index_t incX; scalar_t unused; /* Work around dpcpp compiler bug (https://github.com/intel/llvm/issues/7075) */ - std::tie(n, is_upper, trans, is_unit, incX, unused) = combi; + std::tie(alloc, n, is_upper, trans, is_unit, incX, unused) = combi; const char* t_str = trans ? "t" : "n"; const char* uplo_str = is_upper ? "u" : "l"; @@ -84,24 +85,59 @@ void run_test(const combination_t combi) { auto q = make_queue(); blas::SB_Handle sb_handle(q); - auto m_a_gpu = blas::make_sycl_iterator_buffer(a_m, a_size); - auto v_x_gpu = blas::make_sycl_iterator_buffer(x_v, x_size); + auto m_a_gpu = helper::allocate(a_size, q); + auto v_x_gpu = helper::allocate(x_size, q); + + auto copy_m = + helper::copy_to_device(q, a_m.data(), m_a_gpu, a_size); + auto copy_v = + helper::copy_to_device(q, x_v.data(), v_x_gpu, x_size); // SYCL TPSV - _tpsv(sb_handle, *uplo_str, *t_str, *diag_str, n, m_a_gpu, v_x_gpu, incX); + auto tpsv_event = _tpsv(sb_handle, *uplo_str, *t_str, + *diag_str, n, m_a_gpu, v_x_gpu, + incX, {copy_m, copy_v}); + sb_handle.wait(tpsv_event); auto event = blas::helper::copy_to_host(sb_handle.get_queue(), v_x_gpu, x_v.data(), x_size); sb_handle.wait(event); const bool isAlmostEqual = utils::compare_vectors(x_v, x_v_cpu); ASSERT_TRUE(isAlmostEqual); + + helper::deallocate(m_a_gpu, q); + helper::deallocate(v_x_gpu, q); +} + +template +void run_test(const combination_t combi) { + std::string alloc; + index_t n, incX; + bool is_upper; + bool trans; + bool is_unit; + scalar_t unused; /* Work around dpcpp compiler bug + (https://github.com/intel/llvm/issues/7075) */ + std::tie(alloc, n, is_upper, trans, is_unit, incX, unused) = combi; + + if (alloc == "usm") { +#ifdef SB_ENABLE_USM + run_test(combi); +#else + GTEST_SKIP(); +#endif + } else { + run_test(combi); + } } + #ifdef STRESS_TESTING template const auto combi = - ::testing::Combine(::testing::Range(1000, 1200), // n + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Range(1000, 1200), // n ::testing::Values(true, false), // is_upper ::testing::Values(true, false), // trans ::testing::Values(true, false), // is_unit @@ -113,7 +149,8 @@ const auto combi = // (the stress_test above takes about ~5 minutes) template const auto combi = - ::testing::Combine(::testing::Values(32, 64, 128, 512, 14, 127, 504, 780, + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values(32, 64, 128, 512, 14, 127, 504, 780, 1010, 1140, 2300, 8192), // n ::testing::Values(true, false), // is_upper ::testing::Values(true, false), // trans @@ -126,12 +163,13 @@ const auto combi = template static std::string generate_name( const ::testing::TestParamInfo>& info) { + std::string alloc; index_t 
n, incX; bool is_upper; bool trans; bool is_unit; T unused; - BLAS_GENERATE_NAME(info.param, n, is_upper, trans, is_unit, incX, unused); + BLAS_GENERATE_NAME(info.param, alloc, n, is_upper, trans, is_unit, incX, unused); } BLAS_REGISTER_TEST_ALL(Tpsv, combination_t, combi, generate_name); diff --git a/test/unittest/blas2/blas2_trmv_test.cpp b/test/unittest/blas2/blas2_trmv_test.cpp index 529cb177d..cd1aa6a63 100644 --- a/test/unittest/blas2/blas2_trmv_test.cpp +++ b/test/unittest/blas2/blas2_trmv_test.cpp @@ -31,17 +31,18 @@ // Seems to have been fixed in modern OpenBLASes template -using combination_t = std::tuple; +using combination_t = std::tuple; -template +template void run_test(const combination_t combi) { + std::string alloc; index_t n; index_t lda_mul; index_t incX; char uplo; char trans; char diag; - std::tie(uplo, trans, diag, n, incX, lda_mul) = combi; + std::tie(alloc, uplo, trans, diag, n, incX, lda_mul) = combi; index_t lda = n * lda_mul; // Input matrix @@ -66,49 +67,88 @@ void run_test(const combination_t combi) { auto q = make_queue(); blas::SB_Handle sb_handle(q); - auto a_m_gpu = blas::make_sycl_iterator_buffer(a_m, lda * n); - auto x_v_gpu = blas::make_sycl_iterator_buffer(x_v, n * incX); + auto a_m_gpu = helper::allocate(lda * n, q); + auto x_v_gpu = helper::allocate(n * incX, q); + + auto copy_a = + helper::copy_to_device(q, a_m.data(), a_m_gpu, lda * n); + auto copy_x = + helper::copy_to_device(q, x_v.data(), x_v_gpu, n * incX); + + sb_handle.wait({copy_a, copy_x}); // SYCLtrmv - _trmv(sb_handle, uplo, trans, diag, n, a_m_gpu, lda, x_v_gpu, incX); + auto trmv_event = + _trmv(sb_handle, uplo, trans, diag, n, a_m_gpu, lda, x_v_gpu, incX); - auto event = blas::helper::copy_to_host(sb_handle.get_queue(), x_v_gpu, - x_v.data(), n * incX); + sb_handle.wait(trmv_event); + + auto event = blas::helper::copy_to_host(q, x_v_gpu, x_v.data(), n * incX); sb_handle.wait(event); const bool isAlmostEqual = utils::compare_vectors(x_v, x_cpu_v); ASSERT_TRUE(isAlmostEqual); + + helper::deallocate(a_m_gpu, q); + helper::deallocate(x_v_gpu, q); +} + +template +void run_test(const combination_t combi) { + std::string alloc; + index_t n; + index_t lda_mul; + index_t incX; + char uplo; + char trans; + char diag; + std::tie(alloc, uplo, trans, diag, n, incX, lda_mul) = combi; + + if (alloc == "usm") { +#ifdef SB_ENABLE_USM + run_test(combi); +#else + GTEST_SKIP(); +#endif + } else { + run_test(combi); + } } #ifdef STRESS_TESTING // For the purpose of travis and other slower platforms, we need a faster test template -const auto combi = ::testing::Combine(::testing::Values('u', 'l'), // UPLO - ::testing::Values('n', 't'), // TRANS - ::testing::Values('u', 'n'), // DIAG - ::testing::Values(14, 63, 257, 1010, - 1024 * 5), // n - ::testing::Values(1, 2), // incX - ::testing::Values(1, 2) // lda_mul -); +const auto combi = + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values('u', 'l'), // UPLO + ::testing::Values('n', 't'), // TRANS + ::testing::Values('u', 'n'), // DIAG + ::testing::Values(14, 63, 257, 1010, + 1024 * 5), // n + ::testing::Values(1, 2), // incX + ::testing::Values(1, 2) // lda_mul + ); #else // For the purpose of travis and other slower platforms, we need a faster test template -const auto combi = ::testing::Combine(::testing::Values('u', 'l'), // UPLO - ::testing::Values('n', 't'), // TRANS - ::testing::Values('u', 'n'), // DIAG - ::testing::Values(2025), // n - ::testing::Values(2), // incX - ::testing::Values(2) // lda_mul -); +const auto 
combi = + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values('u', 'l'), // UPLO + ::testing::Values('n', 't'), // TRANS + ::testing::Values('u', 'n'), // DIAG + ::testing::Values(2025), // n + ::testing::Values(2), // incX + ::testing::Values(2) // lda_mul + ); #endif template static std::string generate_name( const ::testing::TestParamInfo>& info) { + std::string alloc; char upl0, trans, diag; int n, incX, ldaMul; - BLAS_GENERATE_NAME(info.param, upl0, trans, diag, n, incX, ldaMul); + BLAS_GENERATE_NAME(info.param, alloc, upl0, trans, diag, n, incX, ldaMul); } BLAS_REGISTER_TEST_ALL(Trmv, combination_t, combi, generate_name); diff --git a/test/unittest/blas2/blas2_trsv_test.cpp b/test/unittest/blas2/blas2_trsv_test.cpp index 326163bf1..2fa8878e9 100644 --- a/test/unittest/blas2/blas2_trsv_test.cpp +++ b/test/unittest/blas2/blas2_trsv_test.cpp @@ -27,10 +27,11 @@ template using combination_t = - std::tuple; + std::tuple; -template +template void run_test(const combination_t combi) { + std::string alloc; index_t n; bool trans; bool is_upper; @@ -39,7 +40,7 @@ void run_test(const combination_t combi) { index_t lda_mul; scalar_t unused; /* Work around dpcpp compiler bug (https://github.com/intel/llvm/issues/7075) */ - std::tie(n, is_upper, trans, is_unit, incX, lda_mul, unused) = combi; + std::tie(alloc, n, is_upper, trans, is_unit, incX, lda_mul, unused) = combi; const char* t_str = trans ? "t" : "n"; const char* uplo_str = is_upper ? "u" : "l"; @@ -78,12 +79,18 @@ void run_test(const combination_t combi) { auto q = make_queue(); blas::SB_Handle sb_handle(q); - auto m_a_gpu = blas::make_sycl_iterator_buffer(a_m, a_size); - auto v_x_gpu = blas::make_sycl_iterator_buffer(x_v, x_size); + auto m_a_gpu = blas::helper::allocate(a_size, q); + auto v_x_gpu = blas::helper::allocate(x_size, q); + + auto copy_m = + blas::helper::copy_to_device(q, a_m.data(), m_a_gpu, a_size); + auto copy_v = + blas::helper::copy_to_device(q, x_v.data(), v_x_gpu, x_size); // SYCL TRSV - _trsv(sb_handle, *uplo_str, *t_str, *diag_str, n, m_a_gpu, n * lda_mul, - v_x_gpu, incX); + auto trsv_event = _trsv(sb_handle, *uplo_str, *t_str, *diag_str, n, m_a_gpu, + n * lda_mul, v_x_gpu, incX, {copy_m, copy_v}); + sb_handle.wait(trsv_event); auto event = blas::helper::copy_to_host(sb_handle.get_queue(), v_x_gpu, x_v.data(), x_size); @@ -91,12 +98,40 @@ void run_test(const combination_t combi) { const bool isAlmostEqual = utils::compare_vectors(x_v, x_v_cpu); ASSERT_TRUE(isAlmostEqual); + + blas::helper::deallocate(m_a_gpu, q); + blas::helper::deallocate(v_x_gpu, q); +} + +template +void run_test(const combination_t combi) { + std::string alloc; + index_t n; + bool trans; + bool is_upper; + bool is_unit; + index_t incX; + index_t lda_mul; + scalar_t wa; + std::tie(alloc, n, is_upper, trans, is_unit, incX, lda_mul, wa) = combi; + + if (alloc == "usm") { +#ifdef SB_ENABLE_USM + run_test(combi); +#else + GTEST_SKIP(); +#endif + } else { + run_test(combi); + } } #ifdef STRESS_TESTING template const auto combi = - ::testing::Combine(::testing::Range(1000, 1200), // n + ::testing::Combine(::testing::Values("usm", "buffer"), // allocation type + ::testing::Values(32, 64, 128, 512, 14, 127, 504, 780, + 1010, 1140, 2300, 8192), // n ::testing::Values(true, false), // is_upper ::testing::Values(true, false), // trans ::testing::Values(true, false), // is_unit @@ -109,13 +144,13 @@ const auto combi = // (the stress_test above takes about ~5 minutes) template const auto combi = - 
::testing::Combine(::testing::Values(32, 64, 128, 512, 14, 127, 504, 780, - 1010, 1140, 2300, 8192), // n + ::testing::Combine(::testing::Values("usm", "buffer"), // allocation type + ::testing::Values(14, 64, 33, 515, 1024, 1200, 3000), // n ::testing::Values(true, false), // is_upper ::testing::Values(true, false), // trans ::testing::Values(true, false), // is_unit - ::testing::Values(1, 3), // incX - ::testing::Values(1, 2), // lda_mul + ::testing::Values(4), // incX + ::testing::Values(3), // lda_mul ::testing::Values(0) // unused ); #endif @@ -123,12 +158,13 @@ const auto combi = template static std::string generate_name( const ::testing::TestParamInfo>& info) { + std::string alloc; index_t n, incX, ldaMul; bool is_upper; bool trans; bool is_unit; T unused; - BLAS_GENERATE_NAME(info.param, n, is_upper, trans, is_unit, incX, ldaMul, + BLAS_GENERATE_NAME(info.param, alloc, n, is_upper, trans, is_unit, incX, ldaMul, unused); } diff --git a/test/unittest/blas3/blas3_gemm_batched_test.cpp b/test/unittest/blas3/blas3_gemm_batched_test.cpp index f38922025..8e7b50619 100644 --- a/test/unittest/blas3/blas3_gemm_batched_test.cpp +++ b/test/unittest/blas3/blas3_gemm_batched_test.cpp @@ -28,6 +28,7 @@ template const auto BetaNonZeroLDMatch = ::testing::Combine( + ::testing::Values("usm", "buf"), // allocation type ::testing::Values(0), // offset ::testing::Values(5), // batch ::testing::Values(63, 128), // m @@ -46,6 +47,7 @@ GENERATE_GEMM_TEST(BatchGemm, BetaNonZeroLDMatch); template const auto BetaNonZeroLDMultiplied = ::testing::Combine( + ::testing::Values("usm", "buf"), // allocation type ::testing::Values(0), // offset ::testing::Values(1, 5), // batch ::testing::Values(63, 128, 129), // m @@ -65,6 +67,7 @@ GENERATE_GEMM_TEST(BatchGemm, BetaNonZeroLDMultiplied); template const auto BetaNonZeroLDMatchAlpha0 = ::testing::Combine( + ::testing::Values("usm", "buf"), // allocation type ::testing::Values(0), // offset ::testing::Values(5), // batch ::testing::Values(128), // m @@ -83,6 +86,7 @@ GENERATE_GEMM_TEST(BatchGemm, BetaNonZeroLDMatchAlpha0); template const auto BetaNonZeroLDMultipliedAlpha0 = ::testing::Combine( + ::testing::Values("usm", "buf"), // allocation type ::testing::Values(0), // offset ::testing::Values(5), // batch ::testing::Values(63), // m @@ -102,7 +106,8 @@ GENERATE_GEMM_TEST(BatchGemm, BetaNonZeroLDMultipliedAlpha0); // GEMM STRIDED BATCHED tests template const auto DefaultGemmAndGemmBatched = - ::testing::Combine(::testing::Values(0), // offset + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values(0), // offset ::testing::Values(1, 5), // batch ::testing::Values(63, 128), // m ::testing::Values(63, 128), // n @@ -122,7 +127,8 @@ GENERATE_GEMM_STRIDED_BATCHED_TEST(BatchStridedGemm, DefaultGemmAndGemmBatched); template const auto AllStridedBatched = - ::testing::Combine(::testing::Values(0), // offset + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values(0), // offset ::testing::Values(5), // batch ::testing::Values(128), // m ::testing::Values(128), // n diff --git a/test/unittest/blas3/blas3_gemm_common.hpp b/test/unittest/blas3/blas3_gemm_common.hpp index 1eca8bfec..7deebbf4c 100644 --- a/test/unittest/blas3/blas3_gemm_common.hpp +++ b/test/unittest/blas3/blas3_gemm_common.hpp @@ -28,13 +28,14 @@ #include "blas_test.hpp" template -using gemm_arguments_t = std::tuple; +using gemm_arguments_t = + std::tuple; template using gemm_batched_strided_arguments_t = - std::tuple; + std::tuple; // 
Convert batch_type=strided to interleaved on the host template @@ -70,8 +71,9 @@ inline std::vector interleaved_to_strided( return output; } -template +template inline void verify_gemm(const gemm_arguments_t arguments) { + std::string alloc; index_t offset; index_t batch; index_t m; @@ -85,7 +87,7 @@ inline void verify_gemm(const gemm_arguments_t arguments) { index_t ldb_mul; index_t ldc_mul; gemm_batch_type_t batch_type; - std::tie(offset, batch, m, n, k, transa, transb, alpha, beta, lda_mul, + std::tie(alloc, offset, batch, m, n, k, transa, transb, alpha, beta, lda_mul, ldb_mul, ldc_mul, batch_type) = arguments; const char ta_str[2] = {transa, '\0'}; @@ -131,29 +133,33 @@ inline void verify_gemm(const gemm_arguments_t arguments) { c_m_gpu = strided_to_interleaved(c_m_gpu, offset, ldc, n, batch); } - auto m_a_gpu = blas::make_sycl_iterator_buffer(buffer_size_a); - auto m_b_gpu = blas::make_sycl_iterator_buffer(buffer_size_b); - auto m_c_gpu = blas::make_sycl_iterator_buffer(buffer_size_c); + auto m_a_gpu = blas::helper::allocate(buffer_size_a, q); + auto m_b_gpu = blas::helper::allocate(buffer_size_b, q); + auto m_c_gpu = blas::helper::allocate(buffer_size_c, q); - blas::helper::copy_to_device(sb_handle.get_queue(), a_m.data(), m_a_gpu, - buffer_size_a); - blas::helper::copy_to_device(sb_handle.get_queue(), b_m.data(), m_b_gpu, - buffer_size_b); - blas::helper::copy_to_device(sb_handle.get_queue(), c_m_gpu.data(), m_c_gpu, - buffer_size_c); + auto copy_a = + blas::helper::copy_to_device(q, a_m.data(), m_a_gpu, buffer_size_a); + auto copy_b = + blas::helper::copy_to_device(q, b_m.data(), m_b_gpu, buffer_size_b); + auto copy_c = + blas::helper::copy_to_device(q, c_m_gpu.data(), m_c_gpu, buffer_size_c); // SYCL BLAS GEMM implementation + typename blas::SB_Handle::event_t gemm_event; if (batch == 1) { - _gemm(sb_handle, transa, transb, m, n, k, alpha, m_a_gpu + offset, lda, - m_b_gpu + offset, ldb, beta, m_c_gpu + offset, ldc); + gemm_event = _gemm(sb_handle, transa, transb, m, n, k, alpha, + m_a_gpu + offset, lda, m_b_gpu + offset, ldb, beta, + m_c_gpu + offset, ldc, {copy_a, copy_b, copy_c}); } else { - _gemm_batched(sb_handle, transa, transb, m, n, k, alpha, m_a_gpu + offset, - lda, m_b_gpu + offset, ldb, beta, m_c_gpu + offset, ldc, - batch, batch_type); + gemm_event = _gemm_batched(sb_handle, transa, transb, m, n, k, alpha, + m_a_gpu + offset, lda, m_b_gpu + offset, ldb, + beta, m_c_gpu + offset, ldc, batch, batch_type, + {copy_a, copy_b, copy_c}); } + sb_handle.wait(gemm_event); - auto event = blas::helper::copy_to_host(sb_handle.get_queue(), m_c_gpu, - c_m_gpu.data(), buffer_size_c); + auto event = + blas::helper::copy_to_host(q, m_c_gpu, c_m_gpu.data(), buffer_size_c); sb_handle.wait(event); if (batch > 1 && batch_type == gemm_batch_type_t::interleaved) { @@ -164,6 +170,40 @@ inline void verify_gemm(const gemm_arguments_t arguments) { const bool isAlmostEqual = utils::compare_vectors(c_m_gpu, c_m_cpu); ASSERT_TRUE(isAlmostEqual); + + helper::deallocate(m_a_gpu, q); + helper::deallocate(m_b_gpu, q); + helper::deallocate(m_c_gpu, q); +} + +template +inline void verify_gemm(const gemm_arguments_t arguments) { + std::string alloc; + index_t offset; + index_t batch; + index_t m; + index_t n; + index_t k; + char transa; + char transb; + scalar_t alpha; + scalar_t beta; + index_t lda_mul; + index_t ldb_mul; + index_t ldc_mul; + gemm_batch_type_t batch_type; + std::tie(alloc, offset, batch, m, n, k, transa, transb, alpha, beta, lda_mul, + ldb_mul, ldc_mul, batch_type) = arguments; + + if 
(alloc == "usm") { +#ifdef SB_ENABLE_USM + verify_gemm(arguments); +#else + GTEST_SKIP(); +#endif + } else { + verify_gemm(arguments); + } } template <> @@ -175,17 +215,19 @@ inline void dump_arg(std::ostream& ss, template static std::string generate_name( const ::testing::TestParamInfo>& info) { + std::string alloc; int offset, batch, m, n, k, ldaMul, ldbMul, ldcMul; char transa, transb; T alpha, beta; gemm_batch_type_t batchType; - BLAS_GENERATE_NAME(info.param, offset, batch, m, n, k, transa, transb, alpha, - beta, ldaMul, ldbMul, ldcMul, batchType); + BLAS_GENERATE_NAME(info.param, alloc, offset, batch, m, n, k, transa, transb, + alpha, beta, ldaMul, ldbMul, ldcMul, batchType); } -template +template inline void verify_gemm( const gemm_batched_strided_arguments_t arguments) { + std::string alloc; index_t offset; index_t batch; index_t m; @@ -201,7 +243,7 @@ inline void verify_gemm( index_t stride_a_mul; index_t stride_b_mul; index_t stride_c_mul; - std::tie(offset, batch, m, n, k, transa, transb, alpha, beta, lda_mul, + std::tie(alloc, offset, batch, m, n, k, transa, transb, alpha, beta, lda_mul, ldb_mul, ldc_mul, stride_a_mul, stride_b_mul, stride_c_mul) = arguments; @@ -244,24 +286,26 @@ inline void verify_gemm( c_m_cpu.data() + i * stride_c + offset, ldc); } - auto m_a_gpu = blas::make_sycl_iterator_buffer(buffer_size_a); - auto m_b_gpu = blas::make_sycl_iterator_buffer(buffer_size_b); - auto m_c_gpu = blas::make_sycl_iterator_buffer(buffer_size_c); + auto m_a_gpu = blas::helper::allocate(buffer_size_a, q); + auto m_b_gpu = blas::helper::allocate(buffer_size_b, q); + auto m_c_gpu = blas::helper::allocate(buffer_size_c, q); - blas::helper::copy_to_device(sb_handle.get_queue(), a_m.data(), m_a_gpu, - buffer_size_a); - blas::helper::copy_to_device(sb_handle.get_queue(), b_m.data(), m_b_gpu, - buffer_size_b); - blas::helper::copy_to_device(sb_handle.get_queue(), c_m_gpu.data(), m_c_gpu, - buffer_size_c); + auto copy_a = + blas::helper::copy_to_device(q, a_m.data(), m_a_gpu, buffer_size_a); + auto copy_b = + blas::helper::copy_to_device(q, b_m.data(), m_b_gpu, buffer_size_b); + auto copy_c = + blas::helper::copy_to_device(q, c_m_gpu.data(), m_c_gpu, buffer_size_c); // SYCL BLAS GEMM STRIDED BATCHED implementation - _gemm_strided_batched(sb_handle, transa, transb, m, n, k, alpha, - m_a_gpu + offset, lda, stride_a, m_b_gpu + offset, ldb, - stride_b, beta, m_c_gpu + offset, ldc, stride_c, batch); - - auto event = blas::helper::copy_to_host(sb_handle.get_queue(), m_c_gpu, - c_m_gpu.data(), buffer_size_c); + auto gemm_batched_event = _gemm_strided_batched( + sb_handle, transa, transb, m, n, k, alpha, m_a_gpu + offset, lda, + stride_a, m_b_gpu + offset, ldb, stride_b, beta, m_c_gpu + offset, ldc, + stride_c, batch, {copy_a, copy_b, copy_c}); + + sb_handle.wait({gemm_batched_event}); + auto event = + blas::helper::copy_to_host(q, m_c_gpu, c_m_gpu.data(), buffer_size_c); sb_handle.wait(event); const bool isAlmostEqual = @@ -269,18 +313,55 @@ inline void verify_gemm( ? 
utils::compare_vectors(c_m_gpu, c_m_cpu) : utils::compare_vectors_strided(c_m_gpu, c_m_cpu, stride_c, size_c); ASSERT_TRUE(isAlmostEqual); + + helper::deallocate(m_a_gpu, q); + helper::deallocate(m_b_gpu, q); + helper::deallocate(m_c_gpu, q); +} + +template +inline void verify_gemm( + const gemm_batched_strided_arguments_t arguments) { + std::string alloc; + index_t offset; + index_t batch; + index_t m; + index_t n; + index_t k; + char transa; + char transb; + scalar_t alpha; + scalar_t beta; + index_t lda_mul; + index_t ldb_mul; + index_t ldc_mul; + index_t stride_a_mul; + index_t stride_b_mul; + index_t stride_c_mul; + std::tie(alloc, offset, batch, m, n, k, transa, transb, alpha, beta, lda_mul, + ldb_mul, ldc_mul, stride_a_mul, stride_b_mul, stride_c_mul) = + arguments; + + if (alloc == "usm") { +#ifdef SB_ENABLE_USM + verify_gemm(arguments); +#endif + } else { + verify_gemm(arguments); + } } template static std::string generate_batched_strided_name( const ::testing::TestParamInfo>& info) { + std::string alloc; int offset, batch, m, n, k, ldaMul, ldbMul, ldcMul, stride_a_mul, stride_b_mul, stride_c_mul; char transa, transb; T alpha, beta; - BLAS_GENERATE_NAME(info.param, offset, batch, m, n, k, transa, transb, alpha, - beta, ldaMul, ldbMul, ldcMul, stride_a_mul, stride_b_mul, - stride_c_mul); + BLAS_GENERATE_NAME(info.param, alloc, offset, batch, m, n, k, transa, transb, + alpha, beta, ldaMul, ldbMul, ldcMul, stride_a_mul, + stride_b_mul, stride_c_mul); } /** Registers GEMM test for all supported data types diff --git a/test/unittest/blas3/blas3_gemm_tall_skinny_test.cpp b/test/unittest/blas3/blas3_gemm_tall_skinny_test.cpp index c717f05c2..417a8118c 100644 --- a/test/unittest/blas3/blas3_gemm_tall_skinny_test.cpp +++ b/test/unittest/blas3/blas3_gemm_tall_skinny_test.cpp @@ -28,6 +28,7 @@ template const auto BetaNonZeroLDMatch = ::testing::Combine( + ::testing::Values("usm", "buf"), // allocation type ::testing::Values(0), // offset ::testing::Values(1), // batch ::testing::Values(7, 65), // m @@ -46,6 +47,7 @@ GENERATE_GEMM_TEST(TallSkinnyGemm, BetaNonZeroLDMatch); template const auto BetaNonZeroLDMultiplied = ::testing::Combine( + ::testing::Values("usm", "buf"), // allocation type ::testing::Values(0), // offset ::testing::Values(1), // batch ::testing::Values(7, 65), // m @@ -64,6 +66,7 @@ GENERATE_GEMM_TEST(TallSkinnyGemm, BetaNonZeroLDMultiplied); template const auto BetaZero = ::testing::Combine( + ::testing::Values("usm", "buf"), // allocation type ::testing::Values(0), // offset ::testing::Values(1), // batch ::testing::Values(7), // m @@ -82,6 +85,7 @@ GENERATE_GEMM_TEST(TallSkinnyGemm, BetaZero); template const auto OffsetNonZero = ::testing::Combine( + ::testing::Values("usm", "buf"), // allocation type ::testing::Values(10), // offset ::testing::Values(1), // batch ::testing::Values(7), // m diff --git a/test/unittest/blas3/blas3_gemm_test.cpp b/test/unittest/blas3/blas3_gemm_test.cpp index e7c14c9de..8237bd629 100644 --- a/test/unittest/blas3/blas3_gemm_test.cpp +++ b/test/unittest/blas3/blas3_gemm_test.cpp @@ -28,6 +28,7 @@ template const auto SmallBetaNonZeroLDMatch = ::testing::Combine( + ::testing::Values("usm", "buf"), // allocation type ::testing::Values(0), // offset ::testing::Values(1), // batch ::testing::Values(11, 16, 32), // m @@ -46,6 +47,7 @@ GENERATE_GEMM_TEST(Gemm, SmallBetaNonZeroLDMatch); template const auto SmallBetaZeroLDMatch = ::testing::Combine( + ::testing::Values("usm", "buf"), // allocation type ::testing::Values(0), // offset 
::testing::Values(1), // batch ::testing::Values(11, 32), // m @@ -64,6 +66,7 @@ GENERATE_GEMM_TEST(Gemm, SmallBetaZeroLDMatch); template const auto SmallBetaZeroLDMultiplied = ::testing::Combine( + ::testing::Values("usm", "buf"), // allocation type ::testing::Values(0), // offset ::testing::Values(1), // batch ::testing::Values(11, 32), // m @@ -82,6 +85,7 @@ GENERATE_GEMM_TEST(Gemm, SmallBetaZeroLDMultiplied); template const auto AlphaZero = ::testing::Combine( + ::testing::Values("usm", "buf"), // allocation type ::testing::Values(0, 10), // offset ::testing::Values(1), // batch ::testing::Values(16), // m @@ -100,6 +104,7 @@ GENERATE_GEMM_TEST(Gemm, AlphaZero); template const auto OffsetNonZero = ::testing::Combine( + ::testing::Values("usm", "buf"), // allocation type ::testing::Values(1, 10), // offset ::testing::Values(1), // batch ::testing::Values(16, 63), // m @@ -118,6 +123,7 @@ GENERATE_GEMM_TEST(Gemm, OffsetNonZero); template const auto LargeBetaNonZeroLDMatch = ::testing::Combine( + ::testing::Values("usm", "buf"), // allocation type ::testing::Values(0), // offset ::testing::Values(1), // batch ::testing::Values(253, 511), // m diff --git a/test/unittest/blas3/blas3_symm_test.cpp b/test/unittest/blas3/blas3_symm_test.cpp index 136ff3bf6..c7652dcf0 100644 --- a/test/unittest/blas3/blas3_symm_test.cpp +++ b/test/unittest/blas3/blas3_symm_test.cpp @@ -28,10 +28,12 @@ #include "blas_test.hpp" template -using symm_arguments_t = std::tuple; +using symm_arguments_t = + std::tuple; -template +template inline void verify_symm(const symm_arguments_t arguments) { + std::string alloc; index_t m; index_t n; char side; @@ -41,7 +43,7 @@ inline void verify_symm(const symm_arguments_t arguments) { index_t lda_mul; index_t ldb_mul; index_t ldc_mul; - std::tie(m, n, side, uplo, alpha, beta, lda_mul, ldb_mul, ldc_mul) = + std::tie(alloc, m, n, side, uplo, alpha, beta, lda_mul, ldb_mul, ldc_mul) = arguments; auto q = make_queue(); @@ -74,37 +76,68 @@ inline void verify_symm(const symm_arguments_t arguments) { reference_blas::symm(side_str, uplo_str, m, n, alpha, a_m.data(), lda, b_m.data(), ldb, beta, c_m_cpu.data(), ldc); - auto m_a_gpu = blas::make_sycl_iterator_buffer(size_a); - auto m_b_gpu = blas::make_sycl_iterator_buffer(size_b); - auto m_c_gpu = blas::make_sycl_iterator_buffer(size_c); + auto m_a_gpu = blas::helper::allocate(size_a, q); + auto m_b_gpu = blas::helper::allocate(size_b, q); + auto m_c_gpu = blas::helper::allocate(size_c, q); - blas::helper::copy_to_device(sb_handle.get_queue(), a_m.data(), m_a_gpu, - size_a); - blas::helper::copy_to_device(sb_handle.get_queue(), b_m.data(), m_b_gpu, - size_b); - blas::helper::copy_to_device(sb_handle.get_queue(), c_m_gpu.data(), m_c_gpu, - size_c); + auto copy_a = blas::helper::copy_to_device(q, a_m.data(), m_a_gpu, size_a); + auto copy_b = blas::helper::copy_to_device(q, b_m.data(), m_b_gpu, size_b); + auto copy_c = + blas::helper::copy_to_device(q, c_m_gpu.data(), m_c_gpu, size_c); // SYCL BLAS SYMM implementation - _symm(sb_handle, side, uplo, m, n, alpha, m_a_gpu, lda, m_b_gpu, ldb, beta, - m_c_gpu, ldc); + auto symm_event = + _symm(sb_handle, side, uplo, m, n, alpha, m_a_gpu, lda, m_b_gpu, ldb, + beta, m_c_gpu, ldc, {copy_a, copy_b, copy_c}); + + sb_handle.wait(symm_event); - auto event = blas::helper::copy_to_host(sb_handle.get_queue(), m_c_gpu, - c_m_gpu.data(), size_c); + auto event = blas::helper::copy_to_host(q, m_c_gpu, c_m_gpu.data(), size_c); sb_handle.wait(event); const bool isAlmostEqual = utils::compare_vectors(c_m_gpu, 
c_m_cpu); ASSERT_TRUE(isAlmostEqual); + + helper::deallocate(m_a_gpu, q); + helper::deallocate(m_b_gpu, q); + helper::deallocate(m_c_gpu, q); +} + +template +inline void verify_symm(const symm_arguments_t arguments) { + std::string alloc; + index_t m; + index_t n; + char side; + char uplo; + scalar_t alpha; + scalar_t beta; + index_t lda_mul; + index_t ldb_mul; + index_t ldc_mul; + std::tie(alloc, m, n, side, uplo, alpha, beta, lda_mul, ldb_mul, ldc_mul) = + arguments; + + if (alloc == "usm") { +#ifdef SB_ENABLE_USM + verify_symm(arguments); +#else + GTEST_SKIP(); +#endif + } else { + verify_symm(arguments); + } } template static std::string generate_name( const ::testing::TestParamInfo>& info) { + std::string alloc; int m, n, ldaMul, ldbMul, ldcMul; char side, uplo; T alpha, beta; - BLAS_GENERATE_NAME(info.param, m, n, side, uplo, alpha, beta, ldaMul, ldbMul, - ldcMul); + BLAS_GENERATE_NAME(info.param, alloc, m, n, side, uplo, alpha, beta, ldaMul, + ldbMul, ldcMul); } /** Registers SYMM test for all supported data types @@ -119,7 +152,8 @@ static std::string generate_name( template const auto SmallBetaNonZeroLDMatch = - ::testing::Combine(::testing::Values(11, 16, 32), // m + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values(11, 16, 32), // m ::testing::Values(11, 16, 32), // n ::testing::Values('l', 'r'), // side ::testing::Values('l', 'u'), // uplo @@ -133,11 +167,12 @@ GENERATE_SYMM_TEST(Symm, SmallBetaNonZeroLDMatch); template const auto AlphaZero = - ::testing::Combine(::testing::Values(16), // m - ::testing::Values(16), // n - ::testing::Values('l', 'r'), // side - ::testing::Values('l', 'u'), // uplo - ::testing::Values(0.0), // alpha + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values(16), // m + ::testing::Values(16), // n + ::testing::Values('l', 'r'), // side + ::testing::Values('l', 'u'), // uplo + ::testing::Values(0.0), // alpha ::testing::Values(0.0, 1.0), // beta ::testing::Values(1, 2), // lda_mul ::testing::Values(1, 2), // ldb_mul @@ -147,7 +182,8 @@ GENERATE_SYMM_TEST(Symm, AlphaZero); template const auto OffsetNonZero = - ::testing::Combine(::testing::Values(16, 63), // m + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values(16, 63), // m ::testing::Values(16, 63), // n ::testing::Values('l', 'r'), // side ::testing::Values('l', 'u'), // uplo @@ -161,7 +197,8 @@ GENERATE_SYMM_TEST(Symm, OffsetNonZero); template const auto LargeBetaNonZeroLDMatch = - ::testing::Combine(::testing::Values(253, 511), // m + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values(253, 511), // m ::testing::Values(257, 511), // n ::testing::Values('l', 'r'), // side ::testing::Values('l', 'u'), // uplo diff --git a/test/unittest/blas3/blas3_trsm_test.cpp b/test/unittest/blas3/blas3_trsm_test.cpp index ceb38cd00..6090466f1 100644 --- a/test/unittest/blas3/blas3_trsm_test.cpp +++ b/test/unittest/blas3/blas3_trsm_test.cpp @@ -24,11 +24,12 @@ #include "blas_test.hpp" template -using combination_t = std::tuple; +using combination_t = std::tuple; -template +template void run_test(const combination_t combi) { + std::string alloc; index_t m; index_t n; char trans; @@ -39,8 +40,8 @@ void run_test(const combination_t combi) { scalar_t ldaMul; scalar_t ldbMul; scalar_t unusedValue; - std::tie(m, n, trans, side, diag, uplo, alpha, ldaMul, ldbMul, unusedValue) = - combi; + std::tie(alloc, m, n, trans, side, diag, uplo, alpha, ldaMul, ldbMul, + 
unusedValue) = combi; const index_t lda = (side == 'l' ? m : n) * ldaMul; const index_t ldb = m * ldbMul; @@ -69,48 +70,102 @@ void run_test(const combination_t combi) { auto q = make_queue(); blas::SB_Handle sb_handle(q); - auto a_gpu = blas::make_sycl_iterator_buffer(A, A.size()); - auto b_gpu = blas::make_sycl_iterator_buffer(B, B.size()); + auto a_gpu = helper::allocate(A.size(), q); + auto b_gpu = helper::allocate(B.size(), q); + + auto copy_a = helper::copy_to_device(q, A.data(), a_gpu, A.size()); + auto copy_b = helper::copy_to_device(q, B.data(), b_gpu, B.size()); - _trsm(sb_handle, side, uplo, trans, diag, m, n, alpha, a_gpu, lda, b_gpu, - ldb); + auto trsm_event = _trsm(sb_handle, side, uplo, trans, diag, m, n, alpha, + a_gpu, lda, b_gpu, ldb, {copy_a, copy_b}); + sb_handle.wait({trsm_event}); - auto event = blas::helper::copy_to_host(sb_handle.get_queue(), - b_gpu, B.data(), B.size()); + auto event = + blas::helper::copy_to_host(q, b_gpu, B.data(), B.size()); sb_handle.wait(event); bool isAlmostEqual = utils::compare_vectors(cpu_B, B); ASSERT_TRUE(isAlmostEqual); + + helper::deallocate(a_gpu, q); + helper::deallocate(b_gpu, q); +} + +template +void run_test(const combination_t combi) { + std::string alloc; + index_t m; + index_t n; + char trans; + char side; + char diag; + char uplo; + scalar_t alpha; + scalar_t ldaMul; + scalar_t ldbMul; + scalar_t unusedValue; + std::tie(alloc, m, n, trans, side, diag, uplo, alpha, ldaMul, ldbMul, + unusedValue) = combi; + + if (alloc == "usm") { +#ifdef SB_ENABLE_USM + run_test(combi); +#else + GTEST_SKIP(); +#endif + } else { + run_test(combi); + } } static constexpr double NaN = std::numeric_limits::quiet_NaN(); +#ifdef STRESS_TESTING template const auto combi = - ::testing::Combine(::testing::Values(7, 513, 1027), // m - ::testing::Values(7, 513, 1027), // n - ::testing::Values('n', 't'), // trans - ::testing::Values('l', 'r'), // side - ::testing::Values('u', 'n'), // diag - ::testing::Values('l', 'u'), // uplo - ::testing::Values(1.0, 2.0), // alpha - ::testing::Values(1.0, 2.0), // lda_mul - ::testing::Values(1.0, 2.0), // ldb_mul - ::testing::Values(0.0, NaN) // unused + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Range(7, 513), // m + ::testing::Range(7, 513), // n + ::testing::Values('n', 't'), // trans + ::testing::Values('l', 'r'), // side + ::testing::Values('u', 'n'), // diag + ::testing::Values('l', 'u'), // uplo + ::testing::Values(2.0), // alpha + ::testing::Values(2.0), // lda_mul + ::testing::Values(2.0), // ldb_mul + ::testing::Values(0.0, NaN) // unused ); - +#else +// TODO (Tanvir): Reduce no. of tests to avoid timeout. +// Enable more tests later on. 
+template +const auto combi = + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values(7, 513, 1027), // m + ::testing::Values(7, 513, 1027), // n + ::testing::Values('n', 't'), // trans + ::testing::Values('l', 'r'), // side + ::testing::Values('u', 'n'), // diag + ::testing::Values('l', 'u'), // uplo + ::testing::Values(2.0), // alpha + ::testing::Values(2.0), // lda_mul + ::testing::Values(2.0), // ldb_mul + ::testing::Values(0.0, NaN) // unused + ); +#endif // unused is a value that will be placed in the input matrix and is not meant to // be accessed by the trsm implementation template static std::string generate_name( const ::testing::TestParamInfo>& info) { + std::string alloc; int m, n; char trans, side, diag, uplo; T alpha, ldaMul, ldbMul, unusedValue; - BLAS_GENERATE_NAME(info.param, m, n, trans, side, diag, uplo, alpha, ldaMul, - ldbMul, unusedValue); + BLAS_GENERATE_NAME(info.param, alloc, m, n, trans, side, diag, uplo, alpha, + ldaMul, ldbMul, unusedValue); } BLAS_REGISTER_TEST_ALL(Trsm, combination_t, combi, generate_name); diff --git a/test/unittest/extension/omatadd_test.cpp b/test/unittest/extension/omatadd_test.cpp index eb92e8417..6392e74e5 100644 --- a/test/unittest/extension/omatadd_test.cpp +++ b/test/unittest/extension/omatadd_test.cpp @@ -27,16 +27,17 @@ #include "extension_reference.hpp" template -using combination_t = std::tuple; -template +template void run_test(const combination_t combi) { + std::string alloc; char trans_a, trans_b; index_t m, n, ld_a_mul, ld_b_mul, ld_c_mul; scalar_t alpha, beta; - std::tie(trans_a, trans_b, m, n, alpha, beta, ld_a_mul, ld_b_mul, ld_c_mul) = + std::tie(alloc, trans_a, trans_b, m, n, alpha, beta, ld_a_mul, ld_b_mul, ld_c_mul) = combi; auto q = make_queue(); @@ -61,23 +62,54 @@ void run_test(const combination_t combi) { reference_blas::ext_omatadd(trans_a, trans_b, m, n, alpha, A, lda, beta, B, ldb, C_ref, ldc); - auto m_a_gpu = - blas::make_sycl_iterator_buffer(A, base_size * ld_a_mul); - auto m_b_gpu = - blas::make_sycl_iterator_buffer(B, base_size * ld_b_mul); - auto m_c_gpu = - blas::make_sycl_iterator_buffer(C, base_size * ld_c_mul); + const auto size_m_a = base_size * ld_a_mul; + const auto size_m_b = base_size * ld_b_mul; + const auto size_m_c = base_size * ld_c_mul; + + auto m_a_gpu = helper::allocate(size_m_a, q); + auto m_b_gpu = helper::allocate(size_m_b, q); + auto m_c_gpu = helper::allocate(size_m_c, q); - blas::_omatadd(sb_handle, trans_a, trans_b, m, n, alpha, m_a_gpu, lda, beta, - m_b_gpu, ldb, m_c_gpu, ldc); + auto copy_m_a = helper::copy_to_device(q, A.data(), m_a_gpu, size_m_a); + auto copy_m_b = helper::copy_to_device(q, B.data(), m_b_gpu, size_m_b); + auto copy_m_c = helper::copy_to_device(q, C.data(), m_c_gpu, size_m_c); + + auto omatadd_event = blas::_omatadd(sb_handle, trans_a, trans_b, m, n, alpha, m_a_gpu, lda, beta, + m_b_gpu, ldb, m_c_gpu, ldc, {copy_m_a, copy_m_b, copy_m_c}); + sb_handle.wait(omatadd_event); auto event = blas::helper::copy_to_host( - sb_handle.get_queue(), m_c_gpu, C.data(), base_size * ld_c_mul); + sb_handle.get_queue(), m_c_gpu, C.data(), size_m_c); sb_handle.wait(event); // Validate the result const bool isAlmostEqual = utils::compare_vectors(C, C_ref); ASSERT_TRUE(isAlmostEqual); + + helper::deallocate(m_a_gpu, q); + helper::deallocate(m_b_gpu, q); + helper::deallocate(m_c_gpu, q); +} + +template +void run_test(const combination_t combi) { + std::string alloc; + char trans_a, trans_b; + index_t m, n, ld_a_mul, ld_b_mul, ld_c_mul; + scalar_t 
alpha, beta; + + std::tie(alloc, trans_a, trans_b, m, n, alpha, beta, ld_a_mul, ld_b_mul, ld_c_mul) = + combi; + + if (alloc == "usm") { +#ifdef SB_ENABLE_USM + run_test(combi); +#else + GTEST_SKIP(); +#endif + } else { + run_test(combi); + } } #ifdef STRESS_TESTING @@ -95,7 +127,8 @@ const auto combi = #else template const auto combi = - ::testing::Combine(::testing::Values('n', 't'), // trans_a + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values('n', 't'), // trans_a ::testing::Values('n', 't'), // trans_b ::testing::Values(64, 129, 255), // m ::testing::Values(64, 129, 255), // n @@ -109,10 +142,11 @@ const auto combi = template static std::string generate_name( const ::testing::TestParamInfo> &info) { + std::string alloc; char trans_a, trans_b; index_t m, n, lda_mul, ldb_mul, ldc_mul; T alpha, beta; - BLAS_GENERATE_NAME(info.param, trans_a, trans_b, m, n, alpha, beta, lda_mul, + BLAS_GENERATE_NAME(info.param, alloc, trans_a, trans_b, m, n, alpha, beta, lda_mul, ldb_mul, ldc_mul); } diff --git a/test/unittest/extension/omatcopy2_test.cpp b/test/unittest/extension/omatcopy2_test.cpp index f3ed06f8b..da5d0556a 100644 --- a/test/unittest/extension/omatcopy2_test.cpp +++ b/test/unittest/extension/omatcopy2_test.cpp @@ -27,16 +27,17 @@ #include "extension_reference.hpp" template -using combination_t = std::tuple; -template +template void run_test(const combination_t combi) { + std::string alloc; char trans; index_t m, n, inc_in, ld_in_m, inc_out, ld_out_m; scalar_t alpha; - std::tie(trans, m, n, alpha, inc_in, ld_in_m, inc_out, ld_out_m) = combi; + std::tie(alloc, trans, m, n, alpha, inc_in, ld_in_m, inc_out, ld_out_m) = combi; // Leading dimensions are computed as multiples of the minimum value specified // in the oneMKL documentation at : @@ -68,11 +69,18 @@ void run_test(const combination_t combi) { reference_blas::ext_omatcopy2(trans, m, n, alpha, A_ref, ld_in, inc_in, B_ref, ld_out, inc_out); - auto matrix_in = blas::make_sycl_iterator_buffer(A, m_a_size); - auto matrix_out = blas::make_sycl_iterator_buffer(B, m_b_size); + auto matrix_in = helper::allocate(m_a_size, q); + auto matrix_out = helper::allocate(m_b_size, q); + + auto copy_in = + helper::copy_to_device(q, A.data(), matrix_in, m_a_size); + auto copy_out = + helper::copy_to_device(q, B.data(), matrix_out, m_b_size); - blas::_omatcopy2(sb_handle, trans, m, n, alpha, matrix_in, ld_in, inc_in, - matrix_out, ld_out, inc_out); + auto omatcopy2_event = blas::_omatcopy2(sb_handle, trans, m, n, alpha, matrix_in, ld_in, inc_in, + matrix_out, ld_out, inc_out, {copy_in, copy_out}); + + sb_handle.wait(omatcopy2_event); auto event = blas::helper::copy_to_host( sb_handle.get_queue(), matrix_out, B.data(), m_b_size); @@ -81,12 +89,36 @@ void run_test(const combination_t combi) { // Validate the result const bool isAlmostEqual = utils::compare_vectors(B, B_ref); ASSERT_TRUE(isAlmostEqual); + + helper::deallocate(matrix_in, q); + helper::deallocate(matrix_out, q); +} + +template +void run_test(const combination_t combi) { + std::string alloc; + char trans; + index_t m, n, inc_in, ld_in_m, inc_out, ld_out_m; + scalar_t alpha; + + std::tie(alloc, trans, m, n, alpha, inc_in, ld_in_m, inc_out, ld_out_m) = combi; + + if (alloc == "usm") { +#ifdef SB_ENABLE_USM + run_test(combi); +#else + GTEST_SKIP(); +#endif + } else { + run_test(combi); + } } #ifdef STRESS_TESTING template const auto combi = - ::testing::Combine(::testing::Values('n', 't'), // trans + ::testing::Combine(::testing::Values("usm", "buf"), + 
::testing::Values('n', 't'), // trans ::testing::Values(1024, 4050, 16380), // m ::testing::Values(1024, 4050, 16380), // n ::testing::Values(0, 2.5), // alpha @@ -97,7 +129,8 @@ const auto combi = #else template const auto combi = - ::testing::Combine(::testing::Values('n', 't'), // trans + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values('n', 't'), // trans ::testing::Values(64, 129, 255), // m ::testing::Values(64, 129, 255), // n ::testing::Values(0, 2), // alpha @@ -110,10 +143,11 @@ const auto combi = template static std::string generate_name( const ::testing::TestParamInfo>& info) { + std::string alloc; char trans; index_t m, n, inc_in, ld_in_m, inc_out, ld_out_m; T alpha; - BLAS_GENERATE_NAME(info.param, trans, m, n, alpha, inc_in, ld_in_m, inc_out, + BLAS_GENERATE_NAME(info.param, alloc, trans, m, n, alpha, inc_in, ld_in_m, inc_out, ld_out_m); } diff --git a/test/unittest/extension/omatcopy_test.cpp b/test/unittest/extension/omatcopy_test.cpp index 0ed4f6559..32f9355ca 100644 --- a/test/unittest/extension/omatcopy_test.cpp +++ b/test/unittest/extension/omatcopy_test.cpp @@ -28,15 +28,16 @@ template using combination_t = - std::tuple; + std::tuple; -template +template void run_test(const combination_t combi) { + std::string alloc; char trans; index_t m, n, ld_in_m, ld_out_m; scalar_t alpha; - std::tie(trans, m, n, alpha, ld_in_m, ld_out_m) = combi; + std::tie(alloc, trans, m, n, alpha, ld_in_m, ld_out_m) = combi; // Compute leading dimensions using ld multipliers index_t ld_in = ld_in_m * m; @@ -59,11 +60,18 @@ void run_test(const combination_t combi) { // Reference implementation reference_blas::ext_omatcopy(trans, m, n, alpha, A_ref, ld_in, B_ref, ld_out); - auto matrix_in = blas::make_sycl_iterator_buffer(A, size_a); - auto matrix_out = blas::make_sycl_iterator_buffer(B, size_b); + auto matrix_in = helper::allocate(size_a, q); + auto matrix_out = helper::allocate(size_b, q); + + auto copy_in = + helper::copy_to_device(q, A.data(), matrix_in, size_a); + auto copy_out = + helper::copy_to_device(q, B.data(), matrix_out, size_b); - blas::_omatcopy(sb_handle, trans, m, n, alpha, matrix_in, ld_in, matrix_out, - ld_out); + auto omatcopy_event = blas::_omatcopy(sb_handle, trans, m, n, alpha, matrix_in, ld_in, matrix_out, + ld_out, {copy_in, copy_out}); + + sb_handle.wait(omatcopy_event); auto event = blas::helper::copy_to_host( sb_handle.get_queue(), matrix_out, B.data(), size_b); @@ -72,12 +80,37 @@ void run_test(const combination_t combi) { // Validate the result const bool isAlmostEqual = utils::compare_vectors(B, B_ref); ASSERT_TRUE(isAlmostEqual); + + helper::deallocate(matrix_in, q); + helper::deallocate(matrix_out, q); +} + +template +void run_test(const combination_t combi) { + std::string alloc; + char trans; + index_t m, n, ld_in_m, ld_out_m; + scalar_t alpha; + + std::tie(alloc, trans, m, n, alpha, ld_in_m, ld_out_m) = combi; + + if (alloc == "usm") { +#ifdef SB_ENABLE_USM + run_test(combi); +#else + GTEST_SKIP(); +#endif + } else { + run_test(combi); + } } + #ifdef STRESS_TESTING template const auto combi = - ::testing::Combine(::testing::Values('n', 't'), // trans + ::testing::Combine(::testing::Values("usm", "buf"), + ::testing::Values('n', 't'), // trans ::testing::Values(1024, 4050, 16380), // m ::testing::Values(1024, 4050, 16380), // n ::testing::Values(0, 1.05, 2.01), // alpha @@ -86,10 +119,11 @@ const auto combi = #else template const auto combi = - ::testing::Combine(::testing::Values('n', 't'), // trans + 
::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values('n', 't'), // trans ::testing::Values(64, 129, 255), // m ::testing::Values(64, 129, 255), // n - ::testing::Values(0, 1, 2), // alpha + ::testing::Values(0, 1, 2), // alpha ::testing::Values(1, 2, 3), // ld_in_m ::testing::Values(1, 2, 3)); // ld_in_n #endif @@ -97,10 +131,11 @@ const auto combi = template static std::string generate_name( const ::testing::TestParamInfo>& info) { + std::string alloc; char trans; index_t m, n, ld_in_m, ld_out_m; T alpha; - BLAS_GENERATE_NAME(info.param, trans, m, n, alpha, ld_in_m, ld_out_m); + BLAS_GENERATE_NAME(info.param, alloc, trans, m, n, alpha, ld_in_m, ld_out_m); } BLAS_REGISTER_TEST_ALL(OmatCopy, combination_t, combi, generate_name); diff --git a/test/unittest/extension/reduction_test.cpp b/test/unittest/extension/reduction_test.cpp index f2cfcbd7d..a94319894 100644 --- a/test/unittest/extension/reduction_test.cpp +++ b/test/unittest/extension/reduction_test.cpp @@ -39,11 +39,12 @@ enum operator_t : int { using index_t = int; template -using combination_t = std::tuple; +using combination_t = std::tuple; template const auto combi = ::testing::Combine( + ::testing::Values("usm", "buf"), // allocation type ::testing::Values(1, 7, 513), // rows ::testing::Values(1, 15, 1000, 1337, 8195), // columns ::testing::Values(1, 2, 3), // ld_mul @@ -67,21 +68,23 @@ inline void dump_arg(std::ostream& ss, template static std::string generate_name( const ::testing::TestParamInfo>& info) { + std::string alloc; index_t rows, cols, ldMul; operator_t op; reduction_dim_t reductionDim; T unused; - BLAS_GENERATE_NAME(info.param, rows, cols, ldMul, op, reductionDim, unused); + BLAS_GENERATE_NAME(info.param, alloc, rows, cols, ldMul, op, reductionDim, unused); } -template +template void run_test(const combination_t combi) { + std::string alloc; index_t rows, cols, ld_mul; operator_t op; reduction_dim_t reduction_dim; scalar_t unused; /* Work around dpcpp compiler bug (https://github.com/intel/llvm/issues/7075) */ - std::tie(rows, cols, ld_mul, op, reduction_dim, unused) = combi; + std::tie(alloc, rows, cols, ld_mul, op, reduction_dim, unused) = combi; auto q = make_queue(); blas::SB_Handle sb_handle(q); @@ -172,52 +175,85 @@ void run_test(const combination_t combi) { if (op == operator_t::Mean) { const auto nelems = reduction_dim == reduction_dim_t::outer ? 
cols : rows; std::transform(out_v_cpu.begin(), out_v_cpu.end(), out_v_cpu.begin(), - [=](scalar_t val) -> scalar_t { - return val / static_cast(nelems); - }); + [=](scalar_t val) -> scalar_t { + return val / static_cast(nelems); + }); } - auto m_in_gpu = blas::make_sycl_iterator_buffer(in_m, ld * cols); + auto m_in_gpu = + blas::helper::allocate(ld * cols, q); // in_m, auto v_out_gpu = - blas::make_sycl_iterator_buffer(out_v_gpu, out_size); + blas::helper::allocate(out_size, q); // out_v_gpu + + auto copy_m = blas::helper::copy_to_device(q, in_m.data(), + m_in_gpu, ld * cols); + auto copy_v = blas::helper::copy_to_device(q, out_v_gpu.data(), + v_out_gpu, out_size); blas::SB_Handle::event_t ev; try { switch (op) { case operator_t::Add: ev = extension::_reduction( - sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim); + sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim, {copy_m, copy_v}); break; case operator_t::Product: ev = extension::_reduction( - sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim); + sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim, {copy_m, copy_v}); break; case operator_t::Max: ev = extension::_reduction( - sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim); + sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim, {copy_m, copy_v}); break; case operator_t::Min: ev = extension::_reduction( - sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim); + sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim, {copy_m, copy_v}); break; case operator_t::AbsoluteAdd: ev = extension::_reduction( - sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim); + sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim, {copy_m, copy_v}); break; case operator_t::Mean: ev = extension::_reduction( - sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim); + sb_handle, m_in_gpu, ld, v_out_gpu, rows, cols, reduction_dim, {copy_m, copy_v}); break; } } catch (cl::sycl::exception& e) { std::cerr << "Exception occured:" << std::endl; std::cerr << e.what() << std::endl; } + + sb_handle.wait(ev); + auto event = blas::helper::copy_to_host( sb_handle.get_queue(), v_out_gpu, out_v_gpu.data(), out_size); sb_handle.wait(event); ASSERT_TRUE(utils::compare_vectors(out_v_gpu, out_v_cpu)); + + helper::deallocate(m_in_gpu, q); + helper::deallocate(v_out_gpu, q); +} + +template +void run_test(const combination_t combi) { + std::string alloc; + index_t rows, cols, ld_mul; + operator_t op; + reduction_dim_t reduction_dim; + scalar_t unused; + std::tie(alloc, rows, cols, ld_mul, op, reduction_dim, unused) = combi; + + if (alloc == "usm") { +#ifdef SB_ENABLE_USM + run_test(combi); +#else + GTEST_SKIP(); +#endif + } else { + run_test(combi); + } } -BLAS_REGISTER_TEST_ALL(ReductionPartial, combination_t, combi, generate_name); +BLAS_REGISTER_TEST_ALL(ReductionPartial, combination_t, combi, + generate_name); diff --git a/test/unittest/extension/transpose_test.cpp b/test/unittest/extension/transpose_test.cpp index 829045af6..bc1e5d11c 100644 --- a/test/unittest/extension/transpose_test.cpp +++ b/test/unittest/extension/transpose_test.cpp @@ -28,15 +28,16 @@ template using combination_t = - std::tuple; + std::tuple; -template +template void run_test(const combination_t& combi) { + std::string alloc; char place; index_t m, n, ld_in_m, ld_out_m; scalar_t unused; /* Work around dpcpp compiler bug (https://github.com/intel/llvm/issues/7075) */ - std::tie(place, m, n, ld_in_m, ld_out_m, unused) = combi; + std::tie(alloc, 
            place, m, n, ld_in_m, ld_out_m, unused) = combi;
 
   // Compute leading dimensions using ld multipliers
   index_t ld_in = ld_in_m * m;
@@ -61,11 +62,18 @@ void run_test(const combination_t& combi) {
            n);
 
   if (place == 'o') {
-    auto matrix_in = blas::make_sycl_iterator_buffer(A, size_a);
-    auto matrix_out = blas::make_sycl_iterator_buffer(B, size_b);
+    auto matrix_in = helper::allocate(size_a, q);
+    auto matrix_out = helper::allocate(size_b, q);
+
+    auto copy_in =
+        helper::copy_to_device(q, A.data(), matrix_in, size_a);
+    auto copy_out =
+        helper::copy_to_device(q, B.data(), matrix_out, size_b);
 
-    blas::extension::_transpose(sb_handle, m, n, matrix_in, ld_in,
-                                matrix_out, ld_out);
+    auto trans_event = blas::extension::_transpose(sb_handle, m, n, matrix_in, ld_in,
+                                matrix_out, ld_out, {copy_in, copy_out});
+
+    sb_handle.wait(trans_event);
 
     auto event = blas::helper::copy_to_host(
         sb_handle.get_queue(), matrix_out, B.data(), size_b);
@@ -75,14 +83,38 @@ void run_test(const combination_t& combi) {
     const bool isAlmostEqual = utils::compare_vectors(B, B_ref);
     ASSERT_TRUE(isAlmostEqual);
 
+    helper::deallocate(matrix_in, q);
+    helper::deallocate(matrix_out, q);
+
   } else {
     // Inplace Transpose: TODO
   }
 }
 
+template
+void run_test(const combination_t combi) {
+  std::string alloc;
+  char place;
+  index_t m, n, ld_in_m, ld_out_m;
+  scalar_t unused; /* Work around dpcpp compiler bug
+                      (https://github.com/intel/llvm/issues/7075) */
+  std::tie(alloc, place, m, n, ld_in_m, ld_out_m, unused) = combi;
+
+  if (alloc == "usm") {
+#ifdef SB_ENABLE_USM
+    run_test(combi);
+#else
+    GTEST_SKIP();
+#endif
+  } else {
+    run_test(combi);
+  }
+}
+
 template
 const auto combi =
-    ::testing::Combine(::testing::Values('i', 'o'),       // Inplace | Outplace
+    ::testing::Combine(::testing::Values("usm", "buf"),   // allocation type
+                       ::testing::Values('i', 'o'),       // Inplace | Outplace
                        ::testing::Values(64, 129, 255),   // m
                        ::testing::Values(64, 129, 255),   // n
                        ::testing::Values(1, 2, 3),        // ld_in_m
@@ -92,10 +124,11 @@ const auto combi =
 template
 static std::string generate_name(
     const ::testing::TestParamInfo>& info) {
+  std::string alloc;
   index_t m, n, ld_in_m, ld_out_m;
   T unused;
   char place;
-  BLAS_GENERATE_NAME(info.param, place, m, n, ld_in_m, ld_out_m, unused);
+  BLAS_GENERATE_NAME(info.param, alloc, place, m, n, ld_in_m, ld_out_m, unused);
 }
 
 BLAS_REGISTER_TEST_ALL(TransposeTest, combination_t, combi, generate_name);
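
Every test file converted above follows the same memory-handling pattern: device memory is obtained from `blas::helper::allocate`, inputs are staged with `blas::helper::copy_to_device`, the returned copy events are forwarded as the trailing dependency list of the BLAS call, the operation's event is waited on through `sb_handle.wait`, results are read back with `blas::helper::copy_to_host`, and the allocation is released with `blas::helper::deallocate`; the `"usm"`/`"buf"` test parameter selects the backend at run time, with the USM path compiled only under `SB_ENABLE_USM`. The sketch below illustrates that flow for `_symm`. It is illustrative only and not part of the patch: the `run_symm_once` helper is made up for this example, and the `blas::helper::AllocType` enumerator names and the template arguments on `allocate` are assumptions, since the angle-bracket contents are not visible in the hunks above.

```c++
// Illustrative sketch only -- not part of the patch. It mirrors the pattern the
// converted tests use. Assumptions: the allocation-type enum is spelled
// blas::helper::AllocType with `usm` and `buffer` enumerators, and
// blas::helper::allocate takes that enumerator plus the element type as
// template arguments; the diff above does not show the angle-bracket contents.
#include <string>
#include <vector>

#include "blas_test.hpp"

using index_t = int;

template <typename scalar_t, blas::helper::AllocType mem_alloc>
void run_symm_once(blas::SB_Handle& sb_handle, index_t m, index_t n,
                   scalar_t alpha, scalar_t beta) {
  auto q = sb_handle.get_queue();
  std::vector<scalar_t> a(m * m, scalar_t{1});  // symmetric input (side == 'l')
  std::vector<scalar_t> b(m * n, scalar_t{1});
  std::vector<scalar_t> c(m * n, scalar_t{0});

  // Device memory: a USM device pointer or a buffer iterator, depending on
  // mem_alloc.
  auto a_gpu = blas::helper::allocate<mem_alloc, scalar_t>(m * m, q);
  auto b_gpu = blas::helper::allocate<mem_alloc, scalar_t>(m * n, q);
  auto c_gpu = blas::helper::allocate<mem_alloc, scalar_t>(m * n, q);

  // Host-to-device copies return events...
  auto copy_a = blas::helper::copy_to_device(q, a.data(), a_gpu, m * m);
  auto copy_b = blas::helper::copy_to_device(q, b.data(), b_gpu, m * n);
  auto copy_c = blas::helper::copy_to_device(q, c.data(), c_gpu, m * n);

  // ...which are forwarded as the trailing dependency list of the operation.
  auto symm_event = _symm(sb_handle, 'l', 'u', m, n, alpha, a_gpu, m, b_gpu, m,
                          beta, c_gpu, m, {copy_a, copy_b, copy_c});
  sb_handle.wait(symm_event);

  // Read the result back and release the device allocations explicitly.
  auto copy_back = blas::helper::copy_to_host(q, c_gpu, c.data(), m * n);
  sb_handle.wait(copy_back);

  blas::helper::deallocate(a_gpu, q);
  blas::helper::deallocate(b_gpu, q);
  blas::helper::deallocate(c_gpu, q);
}

// Runtime dispatch on the "usm" / "buf" test parameter; the USM variant is
// only compiled when SB_ENABLE_USM is defined and is skipped otherwise.
template <typename scalar_t>
void run_symm_once(blas::SB_Handle& sb_handle, const std::string& alloc,
                   index_t m, index_t n, scalar_t alpha, scalar_t beta) {
  if (alloc == "usm") {
#ifdef SB_ENABLE_USM
    run_symm_once<scalar_t, blas::helper::AllocType::usm>(sb_handle, m, n,
                                                          alpha, beta);
#else
    GTEST_SKIP();
#endif
  } else {
    run_symm_once<scalar_t, blas::helper::AllocType::buffer>(sb_handle, m, n,
                                                             alpha, beta);
  }
}
```

Forwarding the copy events explicitly matters most for the USM path: buffer arguments let the SYCL runtime infer ordering from accessors, but raw device pointers carry no such information, so the dependency list and the explicit waits are what keep the copies, the kernel, and the read-back correctly ordered.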