Skip to content

Commit

Permalink
Add Axpy_batch implementation (#479)
Browse files Browse the repository at this point in the history
Added axpy_batch extension BLAS operator with benchmarks.
  • Loading branch information
s-Nick authored Nov 23, 2023
1 parent ece6336 commit a6d044a
Show file tree
Hide file tree
Showing 23 changed files with 1,099 additions and 4 deletions.
1 change: 1 addition & 0 deletions benchmark/portblas/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ set(sources
extension/omatcopy_batched.cpp
extension/omatadd.cpp
extension/omatadd_batched.cpp
extension/axpy_batch.cpp
)

if(${BLAS_ENABLE_EXTENSIONS})
Expand Down
182 changes: 182 additions & 0 deletions benchmark/portblas/extension/axpy_batch.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
/***************************************************************************
*
* @license
* Copyright (C) Codeplay Software Limited
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* For your convenience, a copy of the License has been included in this
* repository.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* portBLAS: BLAS implementation using SYCL
*
* @filename axpy_batch.cpp
*
**************************************************************************/

#include "../utils.hpp"

constexpr blas_benchmark::utils::ExtensionOp benchmark_op =
blas_benchmark::utils::ExtensionOp::axpy_batch;

template <typename scalar_t, blas::helper::AllocType mem_alloc>
void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size,
scalar_t alpha, index_t inc_x, index_t inc_y, index_t stride_x_mul,
index_t stride_y_mul, index_t batch_size, bool* success) {
// initialize the state label
blas_benchmark::utils::set_benchmark_label<scalar_t>(
state, sb_handle_ptr->get_queue());

// Google-benchmark counters are double.
blas_benchmark::utils::init_extension_counters<benchmark_op, scalar_t>(
state, size, batch_size);

blas::SB_Handle& sb_handle = *sb_handle_ptr;
auto q = sb_handle.get_queue();

const auto stride_x{size * std::abs(inc_x) * stride_x_mul};
const auto stride_y{size * std::abs(inc_y) * stride_y_mul};

const index_t size_x{stride_x * batch_size};
const index_t size_y{stride_y * batch_size};
// Create data
std::vector<scalar_t> vx =
blas_benchmark::utils::random_data<scalar_t>(size_x);
std::vector<scalar_t> vy =
blas_benchmark::utils::random_data<scalar_t>(size_y);

auto inx = blas::helper::allocate<mem_alloc, scalar_t>(size_x, q);
auto iny = blas::helper::allocate<mem_alloc, scalar_t>(size_y, q);

auto copy_x =
blas::helper::copy_to_device<scalar_t>(q, vx.data(), inx, size_x);
auto copy_y =
blas::helper::copy_to_device<scalar_t>(q, vy.data(), iny, size_y);

sb_handle.wait({copy_x, copy_y});

#ifdef BLAS_VERIFY_BENCHMARK
// Run a first time with a verification of the results
std::vector<scalar_t> y_ref = vy;
for (auto i = 0; i < batch_size; ++i) {
reference_blas::axpy(size, static_cast<scalar_t>(alpha),
vx.data() + i * stride_x, inc_x,
y_ref.data() + i * stride_y, inc_y);
}
std::vector<scalar_t> y_temp = vy;
{
auto y_temp_gpu = blas::helper::allocate<mem_alloc, scalar_t>(size_y, q);
auto copy_temp = blas::helper::copy_to_device<scalar_t>(q, y_temp.data(),
y_temp_gpu, size_y);
sb_handle.wait(copy_temp);
auto axpy_batch_event =
_axpy_batch(sb_handle, size, alpha, inx, inc_x, stride_x, y_temp_gpu,
inc_y, stride_y, batch_size);
sb_handle.wait(axpy_batch_event);
auto copy_output =
blas::helper::copy_to_host(q, y_temp_gpu, y_temp.data(), size_y);
sb_handle.wait(copy_output);

blas::helper::deallocate<mem_alloc>(y_temp_gpu, q);
}

std::ostringstream err_stream;
if (!utils::compare_vectors(y_temp, y_ref, err_stream, "")) {
const std::string& err_str = err_stream.str();
state.SkipWithError(err_str.c_str());
*success = false;
};
#endif

auto blas_method_def = [&]() -> std::vector<cl::sycl::event> {
auto event = _axpy_batch(sb_handle, size, alpha, inx, inc_x, stride_x, iny,
inc_y, stride_y, batch_size);
sb_handle.wait(event);
return event;
};

// Warmup
blas_benchmark::utils::warmup(blas_method_def);
sb_handle.wait();

blas_benchmark::utils::init_counters(state);

// Measure
for (auto _ : state) {
// Run
std::tuple<double, double> times =
blas_benchmark::utils::timef(blas_method_def);

// Report
blas_benchmark::utils::update_counters(state, times);
}

state.SetItemsProcessed(state.iterations() * state.counters["n_fl_ops"]);
state.SetBytesProcessed(state.iterations() *
state.counters["bytes_processed"]);

blas_benchmark::utils::calc_avg_counters(state);

blas::helper::deallocate<mem_alloc>(inx, q);
blas::helper::deallocate<mem_alloc>(iny, q);
}

template <typename scalar_t, blas::helper::AllocType mem_alloc>
void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success,
std::string mem_type,
std::vector<axpy_batch_param_t<scalar_t>> params) {
for (auto p : params) {
index_t n, inc_x, inc_y, stride_x_mul, stride_y_mul, batch_size;
scalar_t alpha;
std::tie(n, alpha, inc_x, inc_y, stride_x_mul, stride_y_mul, batch_size) =
p;
auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr,
index_t size, scalar_t alpha, index_t inc_x,
index_t inc_y, index_t stride_x_mul,
index_t stride_y_mul, index_t batch_size,
bool* success) {
run<scalar_t, mem_alloc>(st, sb_handle_ptr, size, alpha, inc_x, inc_y,
stride_x_mul, stride_y_mul, batch_size, success);
};
benchmark::RegisterBenchmark(
blas_benchmark::utils::get_name<benchmark_op, scalar_t, index_t>(
n, alpha, inc_x, inc_y, stride_x_mul, stride_y_mul, batch_size,
mem_type)
.c_str(),
BM_lambda, sb_handle_ptr, n, alpha, inc_x, inc_y, stride_x_mul,
stride_y_mul, batch_size, success)
->UseRealTime();
}
}

template <typename scalar_t>
void register_benchmark(blas_benchmark::Args& args,
blas::SB_Handle* sb_handle_ptr, bool* success) {
auto axpy_batch_params =
blas_benchmark::utils::get_axpy_batch_params<scalar_t>(args);

register_benchmark<scalar_t, blas::helper::AllocType::buffer>(
sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER,
axpy_batch_params);
#ifdef SB_ENABLE_USM
register_benchmark<scalar_t, blas::helper::AllocType::usm>(
sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM,
axpy_batch_params);
#endif
}

namespace blas_benchmark {
void create_benchmark(blas_benchmark::Args& args,
blas::SB_Handle* sb_handle_ptr, bool* success) {
BLAS_REGISTER_BENCHMARK(args, sb_handle_ptr, success);
}
} // namespace blas_benchmark
2 changes: 1 addition & 1 deletion benchmark/rocblas/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ set(sources
# Extension blas
extension/omatcopy.cpp
extension/omatadd.cpp

extension/axpy_batch.cpp
)

# Operators supporting COMPLEX types benchmarking
Expand Down
175 changes: 175 additions & 0 deletions benchmark/rocblas/extension/axpy_batch.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
/***************************************************************************
*
* @license
* Copyright (C) Codeplay Software Limited
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* For your convenience, a copy of the License has been included in this
* repository.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* portBLAS: BLAS implementation using SYCL
*
* @filename axpy_batch.cpp
*
**************************************************************************/

#include "../utils.hpp"
#include "common/common_utils.hpp"

constexpr blas_benchmark::utils::ExtensionOp benchmark_op =
blas_benchmark::utils::ExtensionOp::axpy_batch;

template <typename scalar_t, typename... args_t>
static inline void rocblas_axpy_strided_batched_f(args_t&&... args) {
if constexpr (std::is_same_v<scalar_t, float>) {
CHECK_ROCBLAS_STATUS(
rocblas_saxpy_strided_batched(std::forward<args_t>(args)...));
} else if constexpr (std::is_same_v<scalar_t, double>) {
CHECK_ROCBLAS_STATUS(
rocblas_daxpy_strided_batched(std::forward<args_t>(args)...));
}
return;
}

template <typename scalar_t>
void run(benchmark::State& state, rocblas_handle& rb_handle, index_t size,
scalar_t alpha, index_t inc_x, index_t inc_y, index_t stride_x_mul,
index_t stride_y_mul, index_t batch_size, bool* success) {
// initialize the state label
blas_benchmark::utils::set_benchmark_label<scalar_t>(state);

// Google-benchmark counters are double.
blas_benchmark::utils::init_extension_counters<benchmark_op, scalar_t>(
state, size, batch_size);

const auto stride_x{size * std::abs(inc_x) * stride_x_mul};
const auto stride_y{size * std::abs(inc_y) * stride_y_mul};

const index_t size_x{stride_x * batch_size};
const index_t size_y{stride_y * batch_size};
// Create data
std::vector<scalar_t> vx =
blas_benchmark::utils::random_data<scalar_t>(size_x);
std::vector<scalar_t> vy =
blas_benchmark::utils::random_data<scalar_t>(size_y);

blas_benchmark::utils::HIPVector<scalar_t> inx(size_x, vx.data());
blas_benchmark::utils::HIPVector<scalar_t> iny(size_y, vy.data());

#ifdef BLAS_VERIFY_BENCHMARK
// Run a first time with a verification of the results
std::vector<scalar_t> y_ref = vy;
for (auto i = 0; i < batch_size; ++i) {
reference_blas::axpy(size, static_cast<scalar_t>(alpha),
vx.data() + i * stride_x, inc_x,
y_ref.data() + i * stride_y, inc_y);
}
std::vector<scalar_t> y_temp = vy;
{
blas_benchmark::utils::HIPVector<scalar_t, true> y_temp_gpu(size_y,
y_temp.data());
rocblas_axpy_strided_batched_f<scalar_t>(rb_handle, size, &alpha, inx,
inc_x, stride_x, y_temp_gpu, inc_y,
stride_y, batch_size);
}

std::ostringstream err_stream;
if (!utils::compare_vectors(y_temp, y_ref, err_stream, "")) {
const std::string& err_str = err_stream.str();
state.SkipWithError(err_str.c_str());
*success = false;
};
#endif

auto blas_warmup = [&]() -> void {
rocblas_axpy_strided_batched_f<scalar_t>(rb_handle, size, &alpha, inx,
inc_x, stride_x, iny, inc_y,
stride_y, batch_size);
return;
};

hipEvent_t start, stop;
CHECK_HIP_ERROR(hipEventCreate(&start));
CHECK_HIP_ERROR(hipEventCreate(&stop));

auto blas_method_def = [&]() -> std::vector<hipEvent_t> {
CHECK_HIP_ERROR(hipEventRecord(start, NULL));
rocblas_axpy_strided_batched_f<scalar_t>(rb_handle, size, &alpha, inx,
inc_x, stride_x, iny, inc_y,
stride_y, batch_size);
CHECK_HIP_ERROR(hipEventRecord(stop, NULL));
CHECK_HIP_ERROR(hipEventSynchronize(stop));
return std::vector{start, stop};
};

// Warmup
blas_benchmark::utils::warmup(blas_method_def);
CHECK_HIP_ERROR(hipStreamSynchronize(NULL));

blas_benchmark::utils::init_counters(state);

// Measure
for (auto _ : state) {
// Run
std::tuple<double, double> times =
blas_benchmark::utils::timef_hip(blas_method_def);

// Report
blas_benchmark::utils::update_counters(state, times);
}

state.SetItemsProcessed(state.iterations() * state.counters["n_fl_ops"]);
state.SetBytesProcessed(state.iterations() *
state.counters["bytes_processed"]);

blas_benchmark::utils::calc_avg_counters(state);

CHECK_HIP_ERROR(hipEventDestroy(start));
CHECK_HIP_ERROR(hipEventDestroy(stop));
}

template <typename scalar_t>
void register_benchmark(blas_benchmark::Args& args, rocblas_handle& rb_handle,
bool* success) {
auto axpy_batch_params =
blas_benchmark::utils::get_axpy_batch_params<scalar_t>(args);

for (auto p : axpy_batch_params) {
index_t n, inc_x, inc_y, stride_x_mul, stride_y_mul, batch_size;
scalar_t alpha;
std::tie(n, alpha, inc_x, inc_y, stride_x_mul, stride_y_mul, batch_size) =
p;
auto BM_lambda =
[&](benchmark::State& st, rocblas_handle rb_handle, index_t size,
scalar_t alpha, index_t inc_x, index_t inc_y, index_t stride_x_mul,
index_t stride_y_mul, index_t batch_size, bool* success) {
run<scalar_t>(st, rb_handle, size, alpha, inc_x, inc_y, stride_x_mul,
stride_y_mul, batch_size, success);
};
benchmark::RegisterBenchmark(
blas_benchmark::utils::get_name<benchmark_op, scalar_t, index_t>(
n, alpha, inc_x, inc_y, stride_x_mul, stride_y_mul, batch_size,
blas_benchmark::utils::MEM_TYPE_USM)
.c_str(),
BM_lambda, rb_handle, n, alpha, inc_x, inc_y, stride_x_mul,
stride_y_mul, batch_size, success)
->UseRealTime();
}
}

namespace blas_benchmark {
void create_benchmark(blas_benchmark::Args& args, rocblas_handle& rb_handle,
bool* success) {
BLAS_REGISTER_BENCHMARK(args, rb_handle, success);
}
} // namespace blas_benchmark
3 changes: 2 additions & 1 deletion cmake/CmakeFunctionHelper.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -701,7 +701,8 @@ function (build_library LIB_NAME ENABLE_EXTENSIONS)
$<TARGET_OBJECTS:matcopy_batch>
$<TARGET_OBJECTS:transpose>
$<TARGET_OBJECTS:omatadd>
$<TARGET_OBJECTS:omatadd_batch>)
$<TARGET_OBJECTS:omatadd_batch>
$<TARGET_OBJECTS:axpy_batch>)

if (${ENABLE_EXTENSIONS})
list(APPEND LIB_SRCS $<TARGET_OBJECTS:reduction>)
Expand Down
Loading

0 comments on commit a6d044a

Please sign in to comment.