diff --git a/benchmark/portblas/CMakeLists.txt b/benchmark/portblas/CMakeLists.txt index 4ac3fdeaa..87fc58eaf 100644 --- a/benchmark/portblas/CMakeLists.txt +++ b/benchmark/portblas/CMakeLists.txt @@ -69,6 +69,7 @@ set(sources extension/omatcopy_batched.cpp extension/omatadd.cpp extension/omatadd_batched.cpp + extension/axpy_batch.cpp ) if(${BLAS_ENABLE_EXTENSIONS}) diff --git a/benchmark/portblas/extension/axpy_batch.cpp b/benchmark/portblas/extension/axpy_batch.cpp new file mode 100644 index 000000000..2dcbe3451 --- /dev/null +++ b/benchmark/portblas/extension/axpy_batch.cpp @@ -0,0 +1,182 @@ +/*************************************************************************** + * + * @license + * Copyright (C) Codeplay Software Limited + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * For your convenience, a copy of the License has been included in this + * repository. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * portBLAS: BLAS implementation using SYCL + * + * @filename axpy_batch.cpp + * + **************************************************************************/ + +#include "../utils.hpp" + +constexpr blas_benchmark::utils::ExtensionOp benchmark_op = + blas_benchmark::utils::ExtensionOp::axpy_batch; + +template +void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, + scalar_t alpha, index_t inc_x, index_t inc_y, index_t stride_x_mul, + index_t stride_y_mul, index_t batch_size, bool* success) { + // initialize the state label + blas_benchmark::utils::set_benchmark_label( + state, sb_handle_ptr->get_queue()); + + // Google-benchmark counters are double. + blas_benchmark::utils::init_extension_counters( + state, size, batch_size); + + blas::SB_Handle& sb_handle = *sb_handle_ptr; + auto q = sb_handle.get_queue(); + + const auto stride_x{size * std::abs(inc_x) * stride_x_mul}; + const auto stride_y{size * std::abs(inc_y) * stride_y_mul}; + + const index_t size_x{stride_x * batch_size}; + const index_t size_y{stride_y * batch_size}; + // Create data + std::vector vx = + blas_benchmark::utils::random_data(size_x); + std::vector vy = + blas_benchmark::utils::random_data(size_y); + + auto inx = blas::helper::allocate(size_x, q); + auto iny = blas::helper::allocate(size_y, q); + + auto copy_x = + blas::helper::copy_to_device(q, vx.data(), inx, size_x); + auto copy_y = + blas::helper::copy_to_device(q, vy.data(), iny, size_y); + + sb_handle.wait({copy_x, copy_y}); + +#ifdef BLAS_VERIFY_BENCHMARK + // Run a first time with a verification of the results + std::vector y_ref = vy; + for (auto i = 0; i < batch_size; ++i) { + reference_blas::axpy(size, static_cast(alpha), + vx.data() + i * stride_x, inc_x, + y_ref.data() + i * stride_y, inc_y); + } + std::vector y_temp = vy; + { + auto y_temp_gpu = blas::helper::allocate(size_y, q); + auto copy_temp = blas::helper::copy_to_device(q, y_temp.data(), + y_temp_gpu, size_y); + sb_handle.wait(copy_temp); + 
auto axpy_batch_event = + _axpy_batch(sb_handle, size, alpha, inx, inc_x, stride_x, y_temp_gpu, + inc_y, stride_y, batch_size); + sb_handle.wait(axpy_batch_event); + auto copy_output = + blas::helper::copy_to_host(q, y_temp_gpu, y_temp.data(), size_y); + sb_handle.wait(copy_output); + + blas::helper::deallocate(y_temp_gpu, q); + } + + std::ostringstream err_stream; + if (!utils::compare_vectors(y_temp, y_ref, err_stream, "")) { + const std::string& err_str = err_stream.str(); + state.SkipWithError(err_str.c_str()); + *success = false; + }; +#endif + + auto blas_method_def = [&]() -> std::vector { + auto event = _axpy_batch(sb_handle, size, alpha, inx, inc_x, stride_x, iny, + inc_y, stride_y, batch_size); + sb_handle.wait(event); + return event; + }; + + // Warmup + blas_benchmark::utils::warmup(blas_method_def); + sb_handle.wait(); + + blas_benchmark::utils::init_counters(state); + + // Measure + for (auto _ : state) { + // Run + std::tuple times = + blas_benchmark::utils::timef(blas_method_def); + + // Report + blas_benchmark::utils::update_counters(state, times); + } + + state.SetItemsProcessed(state.iterations() * state.counters["n_fl_ops"]); + state.SetBytesProcessed(state.iterations() * + state.counters["bytes_processed"]); + + blas_benchmark::utils::calc_avg_counters(state); + + blas::helper::deallocate(inx, q); + blas::helper::deallocate(iny, q); +} + +template +void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success, + std::string mem_type, + std::vector> params) { + for (auto p : params) { + index_t n, inc_x, inc_y, stride_x_mul, stride_y_mul, batch_size; + scalar_t alpha; + std::tie(n, alpha, inc_x, inc_y, stride_x_mul, stride_y_mul, batch_size) = + p; + auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr, + index_t size, scalar_t alpha, index_t inc_x, + index_t inc_y, index_t stride_x_mul, + index_t stride_y_mul, index_t batch_size, + bool* success) { + run(st, sb_handle_ptr, size, alpha, inc_x, inc_y, + stride_x_mul, 
stride_y_mul, batch_size, success); + }; + benchmark::RegisterBenchmark( + blas_benchmark::utils::get_name( + n, alpha, inc_x, inc_y, stride_x_mul, stride_y_mul, batch_size, + mem_type) + .c_str(), + BM_lambda, sb_handle_ptr, n, alpha, inc_x, inc_y, stride_x_mul, + stride_y_mul, batch_size, success) + ->UseRealTime(); + } +} + +template +void register_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { + auto axpy_batch_params = + blas_benchmark::utils::get_axpy_batch_params(args); + + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, + axpy_batch_params); +#ifdef SB_ENABLE_USM + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, + axpy_batch_params); +#endif +} + +namespace blas_benchmark { +void create_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { + BLAS_REGISTER_BENCHMARK(args, sb_handle_ptr, success); +} +} // namespace blas_benchmark diff --git a/benchmark/rocblas/CMakeLists.txt b/benchmark/rocblas/CMakeLists.txt index 64a559931..8332590c4 100644 --- a/benchmark/rocblas/CMakeLists.txt +++ b/benchmark/rocblas/CMakeLists.txt @@ -74,7 +74,7 @@ set(sources # Extension blas extension/omatcopy.cpp extension/omatadd.cpp - + extension/axpy_batch.cpp ) # Operators supporting COMPLEX types benchmarking diff --git a/benchmark/rocblas/extension/axpy_batch.cpp b/benchmark/rocblas/extension/axpy_batch.cpp new file mode 100644 index 000000000..3ce877721 --- /dev/null +++ b/benchmark/rocblas/extension/axpy_batch.cpp @@ -0,0 +1,175 @@ +/*************************************************************************** + * + * @license + * Copyright (C) Codeplay Software Limited + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * For your convenience, a copy of the License has been included in this + * repository. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * portBLAS: BLAS implementation using SYCL + * + * @filename axpy_batch.cpp + * + **************************************************************************/ + +#include "../utils.hpp" +#include "common/common_utils.hpp" + +constexpr blas_benchmark::utils::ExtensionOp benchmark_op = + blas_benchmark::utils::ExtensionOp::axpy_batch; + +template +static inline void rocblas_axpy_strided_batched_f(args_t&&... args) { + if constexpr (std::is_same_v) { + CHECK_ROCBLAS_STATUS( + rocblas_saxpy_strided_batched(std::forward(args)...)); + } else if constexpr (std::is_same_v) { + CHECK_ROCBLAS_STATUS( + rocblas_daxpy_strided_batched(std::forward(args)...)); + } + return; +} + +template +void run(benchmark::State& state, rocblas_handle& rb_handle, index_t size, + scalar_t alpha, index_t inc_x, index_t inc_y, index_t stride_x_mul, + index_t stride_y_mul, index_t batch_size, bool* success) { + // initialize the state label + blas_benchmark::utils::set_benchmark_label(state); + + // Google-benchmark counters are double. 
+ blas_benchmark::utils::init_extension_counters( + state, size, batch_size); + + const auto stride_x{size * std::abs(inc_x) * stride_x_mul}; + const auto stride_y{size * std::abs(inc_y) * stride_y_mul}; + + const index_t size_x{stride_x * batch_size}; + const index_t size_y{stride_y * batch_size}; + // Create data + std::vector vx = + blas_benchmark::utils::random_data(size_x); + std::vector vy = + blas_benchmark::utils::random_data(size_y); + + blas_benchmark::utils::HIPVector inx(size_x, vx.data()); + blas_benchmark::utils::HIPVector iny(size_y, vy.data()); + +#ifdef BLAS_VERIFY_BENCHMARK + // Run a first time with a verification of the results + std::vector y_ref = vy; + for (auto i = 0; i < batch_size; ++i) { + reference_blas::axpy(size, static_cast(alpha), + vx.data() + i * stride_x, inc_x, + y_ref.data() + i * stride_y, inc_y); + } + std::vector y_temp = vy; + { + blas_benchmark::utils::HIPVector y_temp_gpu(size_y, + y_temp.data()); + rocblas_axpy_strided_batched_f(rb_handle, size, &alpha, inx, + inc_x, stride_x, y_temp_gpu, inc_y, + stride_y, batch_size); + } + + std::ostringstream err_stream; + if (!utils::compare_vectors(y_temp, y_ref, err_stream, "")) { + const std::string& err_str = err_stream.str(); + state.SkipWithError(err_str.c_str()); + *success = false; + }; +#endif + + auto blas_warmup = [&]() -> void { + rocblas_axpy_strided_batched_f(rb_handle, size, &alpha, inx, + inc_x, stride_x, iny, inc_y, + stride_y, batch_size); + return; + }; + + hipEvent_t start, stop; + CHECK_HIP_ERROR(hipEventCreate(&start)); + CHECK_HIP_ERROR(hipEventCreate(&stop)); + + auto blas_method_def = [&]() -> std::vector { + CHECK_HIP_ERROR(hipEventRecord(start, NULL)); + rocblas_axpy_strided_batched_f(rb_handle, size, &alpha, inx, + inc_x, stride_x, iny, inc_y, + stride_y, batch_size); + CHECK_HIP_ERROR(hipEventRecord(stop, NULL)); + CHECK_HIP_ERROR(hipEventSynchronize(stop)); + return std::vector{start, stop}; + }; + + // Warmup + 
blas_benchmark::utils::warmup(blas_method_def); + CHECK_HIP_ERROR(hipStreamSynchronize(NULL)); + + blas_benchmark::utils::init_counters(state); + + // Measure + for (auto _ : state) { + // Run + std::tuple times = + blas_benchmark::utils::timef_hip(blas_method_def); + + // Report + blas_benchmark::utils::update_counters(state, times); + } + + state.SetItemsProcessed(state.iterations() * state.counters["n_fl_ops"]); + state.SetBytesProcessed(state.iterations() * + state.counters["bytes_processed"]); + + blas_benchmark::utils::calc_avg_counters(state); + + CHECK_HIP_ERROR(hipEventDestroy(start)); + CHECK_HIP_ERROR(hipEventDestroy(stop)); +} + +template +void register_benchmark(blas_benchmark::Args& args, rocblas_handle& rb_handle, + bool* success) { + auto axpy_batch_params = + blas_benchmark::utils::get_axpy_batch_params(args); + + for (auto p : axpy_batch_params) { + index_t n, inc_x, inc_y, stride_x_mul, stride_y_mul, batch_size; + scalar_t alpha; + std::tie(n, alpha, inc_x, inc_y, stride_x_mul, stride_y_mul, batch_size) = + p; + auto BM_lambda = + [&](benchmark::State& st, rocblas_handle rb_handle, index_t size, + scalar_t alpha, index_t inc_x, index_t inc_y, index_t stride_x_mul, + index_t stride_y_mul, index_t batch_size, bool* success) { + run(st, rb_handle, size, alpha, inc_x, inc_y, stride_x_mul, + stride_y_mul, batch_size, success); + }; + benchmark::RegisterBenchmark( + blas_benchmark::utils::get_name( + n, alpha, inc_x, inc_y, stride_x_mul, stride_y_mul, batch_size, + blas_benchmark::utils::MEM_TYPE_USM) + .c_str(), + BM_lambda, rb_handle, n, alpha, inc_x, inc_y, stride_x_mul, + stride_y_mul, batch_size, success) + ->UseRealTime(); + } +} + +namespace blas_benchmark { +void create_benchmark(blas_benchmark::Args& args, rocblas_handle& rb_handle, + bool* success) { + BLAS_REGISTER_BENCHMARK(args, rb_handle, success); +} +} // namespace blas_benchmark diff --git a/cmake/CmakeFunctionHelper.cmake b/cmake/CmakeFunctionHelper.cmake index 3bed39572..f2d3244dc 
100644 --- a/cmake/CmakeFunctionHelper.cmake +++ b/cmake/CmakeFunctionHelper.cmake @@ -701,7 +701,8 @@ function (build_library LIB_NAME ENABLE_EXTENSIONS) $ $ $ - $) + $ + $) if (${ENABLE_EXTENSIONS}) list(APPEND LIB_SRCS $) diff --git a/common/include/common/benchmark_identifier.hpp b/common/include/common/benchmark_identifier.hpp index 8d85dcb6f..46e75e3fd 100644 --- a/common/include/common/benchmark_identifier.hpp +++ b/common/include/common/benchmark_identifier.hpp @@ -82,7 +82,8 @@ enum class ExtensionOp : int { imatcopy_batch = 4, omatadd_batch = 5, omatcopy2 = 6, - reduction = 7 + reduction = 7, + axpy_batch = 8 }; template @@ -195,6 +196,8 @@ std::string get_operator_name() { return "Omatcopy2"; else if constexpr (op == ExtensionOp::reduction) return "Reduction"; + else if constexpr (op == ExtensionOp::axpy_batch) + return "Axpy_batch"; else throw std::runtime_error("Unknown BLAS extension operator"); } diff --git a/common/include/common/benchmark_names.hpp b/common/include/common/benchmark_names.hpp index 87d6424d9..f06a6dee6 100644 --- a/common/include/common/benchmark_names.hpp +++ b/common/include/common/benchmark_names.hpp @@ -296,6 +296,15 @@ get_name(index_t rows, index_t cols, std::string reduction_dim, return internal::get_name(rows, cols, reduction_dim, mem_type); } +template +inline typename std::enable_if::type +get_name(index_t n, scalar_t alpha, index_t inc_x, index_t inc_y, + index_t stride_x_mul, index_t stride_y_mul, index_t batch_size, + std::string mem_type) { + return internal::get_name(n, alpha, inc_x, inc_y, stride_x_mul, + stride_y_mul, batch_size, mem_type); +} + } // namespace utils } // namespace blas_benchmark diff --git a/common/include/common/blas_extension_state_counters.hpp b/common/include/common/blas_extension_state_counters.hpp index 7b02654f2..f6720a1f5 100644 --- a/common/include/common/blas_extension_state_counters.hpp +++ b/common/include/common/blas_extension_state_counters.hpp @@ -91,6 +91,21 @@ 
init_extension_counters(benchmark::State& state, const char* t_a, } return; } + +template +inline typename std::enable_if::type +init_extension_counters(benchmark::State& state, index_t n, + index_t batch_size) { + // The way counters are computed are the same as axpy but multiplied + // by the batch_size + // Google-benchmark counters are double. + double size_d = static_cast(n); + state.counters["size"] = size_d * batch_size; + state.counters["n_fl_ops"] = 2.0 * size_d * batch_size; + state.counters["bytes_processed"] = + 3 * size_d * sizeof(scalar_t) * batch_size; + return; +} } // namespace utils } // namespace blas_benchmark diff --git a/common/include/common/common_utils.hpp b/common/include/common/common_utils.hpp index 251ee9b7f..fc4562aa8 100644 --- a/common/include/common/common_utils.hpp +++ b/common/include/common/common_utils.hpp @@ -141,6 +141,10 @@ using omatadd_batch_param_t = std::tuple; +template +using axpy_batch_param_t = + std::tuple; + namespace blas_benchmark { namespace utils { @@ -1483,6 +1487,49 @@ get_omatadd_batch_params(Args& args) { } } +/** + * @fn get_axpy_batch_params + * @brief Returns a vector containing the axpy_batch benchmark parameters, + * either read from a file according to the command-line args, or the default + * ones. 
+ */ +template +static inline std::vector> get_axpy_batch_params( + Args& args) { + if (args.csv_param.empty()) { + warning_no_csv(); + std::vector> axpy_batch_default; + constexpr index_t dmin = 1 << 10, dmax = 1 << 22; + constexpr index_t batch_size{5}; + constexpr index_t incX{1}; + constexpr index_t incY{1}; + constexpr index_t stride_x_mul{1}; + constexpr index_t stride_y_mul{1}; + constexpr scalar_t alpha{1}; + for (auto n = dmin; n <= dmax; n *= 2) { + axpy_batch_default.push_back(std::make_tuple( + n, alpha, incX, incY, stride_x_mul, stride_y_mul, batch_size)); + } + return axpy_batch_default; + } else { + return parse_csv_file>( + args.csv_param, [&](std::vector& v) { + if (v.size() != 7) { + throw std::runtime_error( + "invalid number of parameters (7 expected)"); + } + try { + return std::make_tuple( + str_to_int(v[0]), str_to_scalar(v[1]), + str_to_int(v[2]), str_to_int(v[3]), + str_to_int(v[4]), str_to_int(v[5]), + str_to_int(v[6])); + } catch (...) { + throw std::runtime_error("invalid parameter"); + } + }); + } +} /** * @fn get_type_name * @brief Returns a string with the given type. 
The C++ specification diff --git a/include/interface/extension_interface.h b/include/interface/extension_interface.h index f59d3f289..2e78d7935 100644 --- a/include/interface/extension_interface.h +++ b/include/interface/extension_interface.h @@ -146,6 +146,23 @@ typename sb_handle_t::event_t _transpose_add_impl( index_t _ldc, index_t _stride_c, index_t _batch_size, const typename sb_handle_t::event_t& _dependencies); +template +typename sb_handle_t::event_t _axpy_batch( + sb_handle_t& sb_handle, index_t _N, element_t _alpha, container_0_t _vx, + index_t _incx, index_t _stride_x, container_1_t _vy, index_t _incy, + index_t _stride_y, index_t _batch_size, + const typename sb_handle_t::event_t& _dependencies); + +template +typename sb_handle_t::event_t _axpy_batch_impl( + sb_handle_t& sb_handle, index_t _N, element_t _alpha, container_0_t _vx, + index_t _incx, index_t _stride_x, container_1_t _vy, index_t _incy, + index_t _stride_y, index_t _batch_size, + const typename sb_handle_t::event_t& _dependencies, index_t global_size); + } // namespace internal /** @@ -349,6 +366,34 @@ typename sb_handle_t::event_t _omatadd_batch( stride_c, batch_size, _dependencies); } +/** + * \brief Compute a batch of AXPY operation all together + * + * Implements AXPY \f$y = ax + y\f$ + * + * @param sb_handle SB_Handle + * @param _alpha scalar + * @param _vx BufferIterator or USM pointer + * @param _incx Increment for the vector X + * @param _stride_x Stride distance of two consecutive vectors in X + * @param _vy BufferIterator or USM pointer + * @param _incy Increment for the vector Y + * @param _stride_y Stride distance of two consecutive vectors in Y + * @param _batch_size number of axpy operations to compute + * @param _dependencies Vector of events + */ +template +typename sb_handle_t::event_t _axpy_batch( + sb_handle_t& sb_handle, index_t _N, element_t _alpha, container_0_t _vx, + index_t _incx, index_t _stride_x, container_1_t _vy, index_t _incy, + index_t _stride_y, index_t 
_batch_size, + const typename sb_handle_t::event_t& _dependencies = {}) { + return internal::_axpy_batch(sb_handle, _N, _alpha, _vx, _incx, _stride_x, + _vy, _incy, _stride_y, _batch_size, + _dependencies); +} + namespace extension { /** * \brief Transpose a Matrix in-place @@ -417,7 +462,6 @@ typename sb_handle_t::event_t _reduction( } } // namespace extension - } // namespace blas #endif // PORTBLAS_EXTENSION_INTERFACE_H diff --git a/include/operations/extension/axpy_batch.h b/include/operations/extension/axpy_batch.h new file mode 100644 index 000000000..a034201ab --- /dev/null +++ b/include/operations/extension/axpy_batch.h @@ -0,0 +1,90 @@ +/*************************************************************************** + * @license + * Copyright (C) Codeplay Software Limited + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * For your convenience, a copy of the License has been included in this + * repository. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * portBLAS: BLAS implementation using SYCL + * + * @filename axpy_batch.h + * + **************************************************************************/ + +#ifndef PORTBLAS_EXTENSION_AXPY_BATCH_H +#define PORTBLAS_EXTENSION_AXPY_BATCH_H + +namespace blas { + +/*! + * This class holds the kernel implementation to perform axpy_batch + * operator. + * + * It has three additional template parameters to keep the operation simple and + * to avoid some computation or code divergence inside the kernel code. 
 + * + * If sameSign is false the kernel always assumes that inc_r is negative. This + * is true by construction. When the increments have different signs the result + * positions are swapped and indexes must be computed accordingly. Keeping + * inc_r always negative and inc_l always positive keeps the index + * computation consistent, obtaining the correct result. + * + * sameSign indicates whether inc_r and inc_l have the same sign. The code + * implementation needs to follow a different index computation; this template + * allows resolving the condition at compile time, avoiding code divergence. + * + * localSize is the local size of the work-group, allowing some device tailoring + * at compile time. + * + * maxBlockPerBatch sets the number of device work-groups to use for each + * batch. If possible, multiple batches are computed concurrently. + */ + +template +struct Axpy_batch { + using value_t = typename lhs_t::value_t; + using index_t = typename rhs_t::index_t; + + lhs_t lhs_; + rhs_t rhs_; + value_t alpha_; + index_t n_, inc_r, inc_l, lhs_stride_, rhs_stride_, batch_size_, + n_block_per_loop; + + Axpy_batch(lhs_t _lhs, rhs_t _rhs_1, value_t _alpha, index_t _N, + index_t _inc_l, index_t _lhs_stride, index_t _inc_r, + index_t _rhs_stride, index_t _batch_size); + index_t get_size() const; + bool valid_thread(cl::sycl::nd_item<1> ndItem) const; + value_t eval(cl::sycl::nd_item<1> ndItem); + void bind(cl::sycl::handler &h); + void adjust_access_displacement(); +}; + +template +Axpy_batch make_axpy_batch( + lhs_t _lhs, rhs_t _rhs_1, typename rhs_t::value_t _alpha, + typename rhs_t::index_t _N, typename rhs_t::index_t _inc_l, + typename rhs_t::index_t _lhs_stride, typename rhs_t::index_t _inc_r, + typename rhs_t::index_t _rhs_stride, typename rhs_t::index_t _batch_size) { + return Axpy_batch( + _lhs, _rhs_1, _alpha, _N, _inc_l, _lhs_stride, _inc_r, _rhs_stride, + _batch_size); +} + +} // namespace blas + +#endif // PORTBLAS_EXTENSION_AXPY_BATCH_H diff --git a/include/portblas.h b/include/portblas.h index 
1c212166a..93719cfd6 100644 --- a/include/portblas.h +++ b/include/portblas.h @@ -55,6 +55,8 @@ #include "operations/extension/matcopy_batch.h" +#include "operations/extension/axpy_batch.h" + #include "operations/blas_constants.h" #include "operations/blas_operators.h" diff --git a/src/interface/extension/CMakeLists.txt b/src/interface/extension/CMakeLists.txt index 836b68cbd..132628f70 100644 --- a/src/interface/extension/CMakeLists.txt +++ b/src/interface/extension/CMakeLists.txt @@ -28,5 +28,6 @@ generate_blas_objects(extension transpose) generate_blas_objects(extension omatadd) generate_blas_objects(extension matcopy_batch) generate_blas_objects(extension omatadd_batch) +generate_blas_objects(extension axpy_batch) generate_blas_reduction_objects(extension reduction) diff --git a/src/interface/extension/axpy_batch.cpp.in b/src/interface/extension/axpy_batch.cpp.in new file mode 100644 index 000000000..89b0940d3 --- /dev/null +++ b/src/interface/extension/axpy_batch.cpp.in @@ -0,0 +1,76 @@ +/*************************************************************************** + * + * @license + * Copyright (C) Codeplay Software Limited + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * For your convenience, a copy of the License has been included in this + * repository. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * portBLAS: BLAS implementation using SYCL + * + * @filename axpy_batch.cpp.in + * + **************************************************************************/ + +#include "interface/extension_interface.hpp" +#include "operations/extension/axpy_batch.hpp" +#include "sb_handle/kernel_constructor.hpp" +#include "sb_handle/portblas_handle.hpp" +#include "sb_handle/kernel_constructor.hpp" +#include "sb_handle/portblas_handle.hpp" + +namespace blas { +namespace internal { + +/** + * \brief AXPY_BATCH constant times a vector plus a vector. + * + * Implements AXPY_BATCH \f$y = ax + y\f$ + * + * @param SB_Handle + * @param _vx ${DATA_TYPE} + * @param _incx Increment in X axis + * @param _stridex Stride distance of vector in X + * @param _vy ${DATA_TYPE} + * @param _incy Increment in Y axis + * @param _stridey Stride distance of vector in Y + * @param _batch_size number of batches + */ + +template typename SB_Handle::event_t _axpy_batch( + SB_Handle& sb_handle, ${INDEX_TYPE} _N, ${DATA_TYPE} _alpha, + BufferIterator<${DATA_TYPE}> _vx, ${INDEX_TYPE} _incx, + ${INDEX_TYPE} _stridex, BufferIterator<${DATA_TYPE}> _vy, + ${INDEX_TYPE} _incy, ${INDEX_TYPE} _stridey, + ${INDEX_TYPE} _batch_size, + const typename SB_Handle::event_t& dependencies); + +#ifdef SB_ENABLE_USM +template typename SB_Handle::event_t _axpy_batch( + SB_Handle& sb_handle, ${INDEX_TYPE} _N, ${DATA_TYPE} _alpha, + ${DATA_TYPE} * _vx, ${INDEX_TYPE} _incx, ${INDEX_TYPE} _stridex, + ${DATA_TYPE} * _vy, ${INDEX_TYPE} _incy, ${INDEX_TYPE} _stridey, + ${INDEX_TYPE} _batch_size, + const typename SB_Handle::event_t& dependencies); + +template typename SB_Handle::event_t _axpy_batch( + SB_Handle& sb_handle, ${INDEX_TYPE} _N, ${DATA_TYPE} _alpha, + const ${DATA_TYPE} * _vx, ${INDEX_TYPE} _incx, + ${INDEX_TYPE} _stridex, ${DATA_TYPE} * _vy, ${INDEX_TYPE} _incy, + ${INDEX_TYPE} _stridey, ${INDEX_TYPE} _batch_size, + const typename SB_Handle::event_t& dependencies); +#endif + +} // namespace internal +} // end 
namespace blas diff --git a/src/interface/extension/backend/amd_gpu.hpp b/src/interface/extension/backend/amd_gpu.hpp index 3ee9db746..f969f77a1 100644 --- a/src/interface/extension/backend/amd_gpu.hpp +++ b/src/interface/extension/backend/amd_gpu.hpp @@ -138,6 +138,31 @@ typename sb_handle_t::event_t _omatadd_batch( } } // namespace backend } // namespace omatadd_batch + +namespace axpy_batch { +namespace backend { +template +typename sb_handle_t::event_t _axpy_batch( + sb_handle_t& sb_handle, index_t _N, element_t _alpha, container_0_t _vx, + index_t _incx, index_t _stride_x, container_1_t _vy, index_t _incy, + index_t _stride_y, index_t _batch_size, + const typename sb_handle_t::event_t& _dependencies) { + // local_size taken empirically + constexpr index_t local_size = static_cast(256); + const auto nWG = (_N + local_size - 1) / local_size; + // the limit for _N*batch_size is taken empirically from test on AMDW6800 + const index_t global_size = + (_N * _batch_size >= 163840) + ? (_N > (1 << 19)) ? 
(local_size * nWG) / 4 : local_size * nWG + : local_size * nWG * _batch_size; + return blas::internal::_axpy_batch_impl<256, 32>( + sb_handle, _N, _alpha, _vx, _incx, _stride_x, _vy, _incy, _stride_y, + _batch_size, _dependencies, global_size); +} +} // namespace backend +} // namespace axpy_batch + } // namespace blas #endif diff --git a/src/interface/extension/backend/default_cpu.hpp b/src/interface/extension/backend/default_cpu.hpp index b168bb6c5..d8a2f6c24 100644 --- a/src/interface/extension/backend/default_cpu.hpp +++ b/src/interface/extension/backend/default_cpu.hpp @@ -111,6 +111,29 @@ typename sb_handle_t::event_t _omatadd_batch( } } // namespace backend } // namespace omatadd_batch + +namespace axpy_batch { +namespace backend { +template +typename sb_handle_t::event_t _axpy_batch( + sb_handle_t& sb_handle, index_t _N, element_t _alpha, container_0_t _vx, + index_t _incx, index_t _stride_x, container_1_t _vy, index_t _incy, + index_t _stride_y, index_t _batch_size, + const typename sb_handle_t::event_t& _dependencies) { + // local_size taken empirically + constexpr index_t local_size = static_cast(256); + const auto nWG = (_N + local_size - 1) / local_size; + // the limit for _N*batch_size is taken empirically from test on i9 CPU + const index_t global_size = (_N * _batch_size >= 163840) + ? 
local_size * nWG + : local_size * nWG * _batch_size; + return blas::internal::_axpy_batch_impl<256, 32>( + sb_handle, _N, _alpha, _vx, _incx, _stride_x, _vy, _incy, _stride_y, + _batch_size, _dependencies, global_size); +} +} // namespace backend +} // namespace axpy_batch } // namespace blas #endif diff --git a/src/interface/extension/backend/intel_gpu.hpp b/src/interface/extension/backend/intel_gpu.hpp index 9e2566aa7..90ec53746 100644 --- a/src/interface/extension/backend/intel_gpu.hpp +++ b/src/interface/extension/backend/intel_gpu.hpp @@ -132,6 +132,29 @@ typename sb_handle_t::event_t _omatadd_batch( } } // namespace backend } // namespace omatadd_batch + +namespace axpy_batch { +namespace backend { +template +typename sb_handle_t::event_t _axpy_batch( + sb_handle_t& sb_handle, index_t _N, element_t _alpha, container_0_t _vx, + index_t _incx, index_t _stride_x, container_1_t _vy, index_t _incy, + index_t _stride_y, index_t _batch_size, + const typename sb_handle_t::event_t& _dependencies) { + // local_size taken empirically + constexpr index_t local_size = static_cast(256); + const auto nWG = (_N + local_size - 1) / local_size; + // the limit for _N*batch_size is taken empirically from test on intelGPU + const index_t global_size = (_N * _batch_size > 327680) + ? 
local_size * nWG + : local_size * nWG * _batch_size; + return blas::internal::_axpy_batch_impl<256, 128>( + sb_handle, _N, _alpha, _vx, _incx, _stride_x, _vy, _incy, _stride_y, + _batch_size, _dependencies, global_size); +} +} // namespace backend +} // namespace axpy_batch } // namespace blas #endif diff --git a/src/interface/extension/backend/nvidia_gpu.hpp b/src/interface/extension/backend/nvidia_gpu.hpp index e3aac7028..b21228f5d 100644 --- a/src/interface/extension/backend/nvidia_gpu.hpp +++ b/src/interface/extension/backend/nvidia_gpu.hpp @@ -138,6 +138,39 @@ typename sb_handle_t::event_t _omatadd_batch( } } // namespace backend } // namespace omatadd_batch + +namespace axpy_batch { +namespace backend { +template +typename sb_handle_t::event_t _axpy_batch( + sb_handle_t& sb_handle, index_t _N, element_t _alpha, container_0_t _vx, + index_t _incx, index_t _stride_x, container_1_t _vy, index_t _incy, + index_t _stride_y, index_t _batch_size, + const typename sb_handle_t::event_t& _dependencies) { + // local_size taken empirically + constexpr index_t local_size = static_cast(256); + const auto nWG = (_N + local_size - 1) / local_size; + // the limit for _N*batch_size is taken empirically from test on A100 + if (_N * _batch_size <= 81920 || _N <= 16384) { + const index_t global_size = local_size * nWG * _batch_size; + return blas::internal::_axpy_batch_impl<256, 32>( + sb_handle, _N, _alpha, _vx, _incx, _stride_x, _vy, _incy, _stride_y, + _batch_size, _dependencies, global_size); + } else if (_N <= (1 << 19)) { + const index_t global_size = local_size * nWG; + return blas::internal::_axpy_batch_impl<256, 64>( + sb_handle, _N, _alpha, _vx, _incx, _stride_x, _vy, _incy, _stride_y, + _batch_size, _dependencies, global_size); + } else { + const index_t global_size = (local_size * nWG); + return blas::internal::_axpy_batch_impl<256, 128>( + sb_handle, _N, _alpha, _vx, _incx, _stride_x, _vy, _incy, _stride_y, + _batch_size, _dependencies, global_size); + } +} +} // 
namespace backend +} // namespace axpy_batch } // namespace blas #endif diff --git a/src/interface/extension_interface.hpp b/src/interface/extension_interface.hpp index 4ebeafc01..9613a4aeb 100644 --- a/src/interface/extension_interface.hpp +++ b/src/interface/extension_interface.hpp @@ -31,6 +31,7 @@ #include "interface/extension_interface.h" #include "operations/blas1_trees.h" #include "operations/blas_operators.hpp" +#include "operations/extension/axpy_batch.h" #include "operations/extension/matcopy_batch.h" #include "operations/extension/reduction.h" #include "operations/extension/transpose.h" @@ -596,6 +597,55 @@ typename sb_handle_t::event_t _reduction( sb_handle, buffer_in, ld, buffer_out, rows, cols, dependencies); } } +template +typename sb_handle_t::event_t _axpy_batch( + sb_handle_t& sb_handle, index_t _N, element_t _alpha, container_0_t _vx, + index_t _incx, index_t _stride_x, container_1_t _vy, index_t _incy, + index_t _stride_y, index_t _batch_size, + const typename sb_handle_t::event_t& _dependencies) { + return blas::axpy_batch::backend::_axpy_batch( + sb_handle, _N, _alpha, _vx, _incx, _stride_x, _vy, _incy, _stride_y, + _batch_size, _dependencies); +} + +template +typename sb_handle_t::event_t _axpy_batch_impl( + sb_handle_t& sb_handle, index_t _N, element_t _alpha, container_0_t _vx, + index_t _incx, index_t _stride_x, container_1_t _vy, index_t _incy, + index_t _stride_y, index_t _batch_size, + const typename sb_handle_t::event_t& _dependencies, index_t global_size) { + // if inc are of opposite sign the values are exchanged. It doesn't matter + // which one is positive or negative, so to simplify index computation in + // kernel we always set incx to be negative and incy to be positive. + if (_incx > 0 && _incy < 0) { + _incx = -_incx; + _incy = -_incy; + } + // if _stride_x is zero use _N as vx size + const index_t overall_vx_size = (_stride_x) ? 
_stride_x * _batch_size : _N; + typename VectorViewType::type vx = + make_vector_view(_vx, static_cast(_incx), overall_vx_size); + auto vy = make_vector_view(_vy, _incy, _stride_y * _batch_size); + // If both vectors are read from the same side it doesn't matter the sign of + // the increment + if (_incx * _incy > 0) { + auto op = make_axpy_batch( + vy, vx, _alpha, _N, std::abs(_incy), _stride_y, std::abs(_incx), + _stride_x, _batch_size); + typename sb_handle_t::event_t ret = sb_handle.execute( + op, static_cast(localSize), global_size, _dependencies); + return ret; + } else { + auto op = make_axpy_batch( + vy, vx, _alpha, _N, _incy, _stride_y, _incx, _stride_x, _batch_size); + typename sb_handle_t::event_t ret = sb_handle.execute( + op, static_cast(localSize), global_size, _dependencies); + return ret; + } +} } // namespace internal } // namespace blas diff --git a/src/operations/extension/axpy_batch.hpp b/src/operations/extension/axpy_batch.hpp new file mode 100644 index 000000000..d5a70a7d6 --- /dev/null +++ b/src/operations/extension/axpy_batch.hpp @@ -0,0 +1,141 @@ +/*************************************************************************** + * @license + * Copyright (C) Codeplay Software Limited + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * For your convenience, a copy of the License has been included in this + * repository. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * SYCL-BLAS: BLAS implementation using SYCL + * + * @filename axpy_batch.hpp + * + **************************************************************************/ + +#ifndef PORTBLAS_EXTENSION_AXPY_BATCH_HPP +#define PORTBLAS_EXTENSION_AXPY_BATCH_HPP + +#include "blas_meta.h" +#include "operations/extension/axpy_batch.h" + +namespace blas { + +template +Axpy_batch::Axpy_batch( + lhs_t _lhs, rhs_t _rhs, typename lhs_t::value_t _alpha, + typename rhs_t::index_t _N, typename rhs_t::index_t _inc_l, + typename rhs_t::index_t _lhs_stride, typename rhs_t::index_t _inc_r, + typename rhs_t::index_t _rhs_stride, typename rhs_t::index_t _batch_size) + : lhs_(_lhs), + rhs_(_rhs), + alpha_(_alpha), + n_(_N), + inc_l(_inc_l), + lhs_stride_(_lhs_stride), + inc_r(_inc_r), + rhs_stride_(_rhs_stride), + batch_size_(_batch_size), + n_block_per_loop(std::min((n_ + localSize - 1) / localSize, + static_cast(maxBlockPerBatch))){}; + +template +PORTBLAS_INLINE typename lhs_t::value_t +Axpy_batch::eval( + cl::sycl::nd_item<1> ndItem) { + const index_t n{n_}; + const value_t alpha{alpha_}; + const auto vx = rhs_.get_data(); + const auto vy = lhs_.get_data(); + const auto nbl{n_block_per_loop}; + + const index_t block_id = ndItem.get_group(0) % nbl; + const index_t l_id = + static_cast(ndItem.get_local_range(0)) * block_id + + ndItem.get_local_id(0); + const index_t group_id = static_cast(ndItem.get_group(0) / nbl); + + const index_t size_compute_rateo = + (n > nbl * localSize) ? 
n / (nbl * localSize) : batch_size_; + const index_t jump_value{sycl::min(batch_size_, size_compute_rateo)}; + + if (group_id >= jump_value || l_id > n) return {}; + + const index_t stride_x = ndItem.get_local_range(0) * nbl * inc_r; + const index_t stride_y = ndItem.get_local_range(0) * nbl * inc_l; + index_t x_index{}; + index_t y_index{}; + int j{}; + + if constexpr (sameSign) { + for (auto out_loop = group_id; out_loop < batch_size_; + out_loop += jump_value) { + x_index = out_loop * rhs_stride_ + l_id * inc_r; + y_index = out_loop * lhs_stride_ + l_id * inc_l; + j = y_index; + for (auto i = x_index; i < (out_loop * rhs_stride_) + n * inc_r; + i += stride_x, j += stride_y) { + vy[j] += alpha * vx[i]; + } + } + + } else { + for (auto out_loop = group_id; out_loop < batch_size_; + out_loop += jump_value) { + x_index = out_loop * rhs_stride_ + inc_r + n * (-inc_r) + l_id * inc_r; + y_index = out_loop * lhs_stride_ + l_id * inc_l; + j = y_index; + for (auto i = x_index; i >= (out_loop * rhs_stride_); + i += stride_x, j += stride_y) { + vy[j] += alpha * vx[i]; + } + } + } + + return {}; +} + +template +PORTBLAS_INLINE void Axpy_batch::bind(cl::sycl::handler& h) { + lhs_.bind(h); + rhs_.bind(h); +} + +template +PORTBLAS_INLINE void Axpy_batch::adjust_access_displacement() { + lhs_.adjust_access_displacement(); + rhs_.adjust_access_displacement(); +} + +template +PORTBLAS_INLINE typename rhs_t::index_t Axpy_batch< + sameSign, localSize, maxBlockPerBatch, lhs_t, rhs_t>::get_size() const { + return n_ * batch_size_; +} + +template +PORTBLAS_INLINE bool +Axpy_batch::valid_thread( + cl::sycl::nd_item<1> ndItem) const { + return true; +} +} // namespace blas + +#endif diff --git a/src/portblas.hpp b/src/portblas.hpp index 623a0e302..5ed50d98d 100644 --- a/src/portblas.hpp +++ b/src/portblas.hpp @@ -52,6 +52,8 @@ #include "operations/extension/matcopy_batch.hpp" +#include "operations/extension/axpy_batch.hpp" + #include "operations/blas_constants.hpp" #include 
"operations/blas_operators.hpp" diff --git a/test/unittest/CMakeLists.txt b/test/unittest/CMakeLists.txt index 54b386df8..b5629e059 100644 --- a/test/unittest/CMakeLists.txt +++ b/test/unittest/CMakeLists.txt @@ -64,6 +64,7 @@ set(SYCL_UNITTEST_SRCS ${PORTBLAS_UNITTEST}/extension/omatcopy2_test.cpp ${PORTBLAS_UNITTEST}/extension/omatcopy_batched_test.cpp ${PORTBLAS_UNITTEST}/extension/omatadd_batched_test.cpp + ${PORTBLAS_UNITTEST}/extension/axpy_batch_test.cpp ) if(${BLAS_ENABLE_EXTENSIONS}) diff --git a/test/unittest/extension/axpy_batch_test.cpp b/test/unittest/extension/axpy_batch_test.cpp new file mode 100644 index 000000000..7c7bce441 --- /dev/null +++ b/test/unittest/extension/axpy_batch_test.cpp @@ -0,0 +1,151 @@ +/*************************************************************************** + * + * @license + * Copyright (C) Codeplay Software Limited + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * For your convenience, a copy of the License has been included in this + * repository. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * portBLAS: BLAS implementation using SYCL + * + * @filename axpy_batch_test.cpp + * + **************************************************************************/ + +#include "blas_test.hpp" + +template +using combination_t = std::tuple; + +template +void run_test(const combination_t combi) { + std::string alloc; + index_t size; + scalar_t alpha; + index_t incX; + index_t incY; + index_t stride_mul_x; + index_t stride_mul_y; + index_t batch_size; + std::tie(alloc, size, alpha, incX, incY, stride_mul_x, stride_mul_y, + batch_size) = combi; + + const index_t stride_x{size * std::abs(incX) * stride_mul_x}; + const index_t stride_y{size * std::abs(incY) * stride_mul_y}; + + auto x_size = (stride_x) ? stride_x * batch_size : size * std::abs(incX); + auto y_size = stride_y * batch_size; + // Input vector + std::vector x_v(x_size); + fill_random(x_v); + + // Output vector + std::vector y_v(y_size, 10.0); + std::vector y_cpu_v(y_size, 10.0); + + // Reference implementation + for (index_t i = 0; i < batch_size; ++i) { + reference_blas::axpy(size, alpha, x_v.data() + i * stride_x, incX, + y_cpu_v.data() + i * stride_y, incY); + } + + // SYCL implementation + auto q = make_queue(); + blas::SB_Handle sb_handle(q); + + // Iterators + auto gpu_x_v = helper::allocate(x_size, q); + auto gpu_y_v = helper::allocate(y_size, q); + + auto copy_x = helper::copy_to_device(q, x_v.data(), gpu_x_v, x_size); + auto copy_y = helper::copy_to_device(q, y_v.data(), gpu_y_v, y_size); + + auto axpy_batch_event = + _axpy_batch(sb_handle, size, alpha, gpu_x_v, incX, stride_x, gpu_y_v, + incY, stride_y, batch_size, {copy_x, copy_y}); + sb_handle.wait(axpy_batch_event); + + auto event = helper::copy_to_host(q, gpu_y_v, y_v.data(), y_size); + sb_handle.wait(event); + + // Validate the result + const bool isAlmostEqual = utils::compare_vectors(y_v, y_cpu_v); + ASSERT_TRUE(isAlmostEqual); + + helper::deallocate(gpu_x_v, q); + helper::deallocate(gpu_y_v, q); +} + +template +void run_test(const 
combination_t combi) { + std::string alloc; + index_t size; + scalar_t alpha; + index_t incX; + index_t incY; + index_t stride_mul_x; + index_t stride_mul_y; + index_t batch_size; + std::tie(alloc, size, alpha, incX, incY, stride_mul_x, stride_mul_y, + batch_size) = combi; + + if (alloc == "usm") { // usm alloc +#ifdef SB_ENABLE_USM + run_test(combi); +#else + GTEST_SKIP(); +#endif + } else { // buffer alloc + run_test(combi); + } +} + +#ifdef STRESS_TESTING +template +const auto combi = + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values(11, 65, 10020, 1000240), // size + ::testing::Values(0.0, 1.3, 2.5), // alpha + ::testing::Values(1, -1, 2, -7), // incX + ::testing::Values(1, -1, 3, -5), // incY + ::testing::Values(1, 2, 3), // stride_mul_x + ::testing::Values(1, 2, 3), // stride_mul_y + ::testing::Values(5) // batch_size + ); +#else +template +const auto combi = + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values(11, 65, 1002, 10240), // size + ::testing::Values(0.0, 1.3), // alpha + ::testing::Values(1, -1, 2, -4), // incX + ::testing::Values(1, -1, 3, -5), // incY + ::testing::Values(0, 1, 2, 3), // stride_mul_x + ::testing::Values(1, 2, 3), // stride_mul_y + ::testing::Values(5) // batch_size + ); +#endif + +template +static std::string generate_name( + const ::testing::TestParamInfo>& info) { + std::string alloc; + index_t size, incX, incY, stride_mul_x, stride_mul_y, batch_size; + T alpha; + BLAS_GENERATE_NAME(info.param, alloc, size, alpha, incX, incY, stride_mul_x, + stride_mul_y, batch_size); +} + +BLAS_REGISTER_TEST_ALL(Axpy_batch, combination_t, combi, generate_name);