Skip to content

Commit

Permalink
Added Omatcopy_batch BLAS Extension operator (#445)
Browse files Browse the repository at this point in the history
Co-authored-by: Ouadie El Farouki <[email protected]>
  • Loading branch information
s-Nick and OuadiElfarouki authored Sep 13, 2023
1 parent 1f3c107 commit 5c11158
Show file tree
Hide file tree
Showing 35 changed files with 1,351 additions and 330 deletions.
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -311,7 +311,7 @@ For all these operations:
* `A`, `B` and `C` are containers for the column-major matrices A, B and C.
* `lda`, `ldb` and `ldc` are the leading dimensions of the matrices A, B and C
(cf BLAS 2). The leading dimension of a matrix must be greater than or equal
to its number of rows. In the case of in-place transpose, the same matrix `A`
to its number of rows. In the case of in-place copy/transpose, the same matrix `A`
is used with two different leading dimensions for input & output.
* `stride_a`, `stride_b` and `stride_c` are the stride sizes between consecutive
matrices in a batched entry for inputs/outputs A, B and C, respectively.
Expand All @@ -327,6 +327,8 @@ matrices in a batched entry for inputs/outputs A, B and C.
| `_omatcopy` | `sb_handle`, `transa`, `M`, `N`, `alpha`, `A`, `lda`, `B`, `ldb` | Perform an out-of-place scaled matrix transpose or copy operation using a general dense matrix. |
| `_omatcopy2`| `sb_handle`, `transa`, `M`, `N`, `alpha`, `A`, `lda`, `inc_a`, `B`, `ldb`, `inc_b` | Computes two-strided scaling and out-of-place transposition or copying of general dense matrices. |
| `_omatadd`| `sb_handle`, `transa`, `transb`, `M`, `N`, `alpha`, `A`, `lda`, `beta`, `B`, `ldb`, `C`,`ldc` | Computes scaled general dense matrix addition with possibly transposed arguments. |
| `_omatcopy_batch` | `sb_handle`, `transa`, `M`, `N`, `alpha`, `A`, `lda`, `stride_a`, `B`, `ldb`, `stride_b`, `batch_size` | Perform an out-of-place scaled batched-strided matrix transpose or copy operation using a general dense matrix. |
| `_imatcopy_batch` | `sb_handle`, `transa`, `M`, `N`, `alpha`, `A`, `lda`, `ldb`, `stride`, `batch_size` | Perform an in-place scaled batched-strided matrix transpose* or copy operation using a general dense matrix. (*: Currently the transpose case is not supported). |

Other non-official extension operators:
| operation | arguments | description |
Expand Down
22 changes: 6 additions & 16 deletions benchmark/cublas/extension/omatadd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,17 +26,6 @@
#include "../../../test/unittest/extension/extension_reference.hpp"
#include "../utils.hpp"

/// Builds the display name under which an omatadd benchmark is registered.
///
/// The resulting string has the layout
/// `BM_omatadd<TYPE>/ts_a/ts_b/m/n/alpha/beta/lda_mul/ldb_mul/ldc_mul`,
/// where `TYPE` is whatever `blas_benchmark::utils::get_type_name<scalar_t>()`
/// produces and every other field is streamed with `operator<<`.
///
/// @param ts_a    Transposition tag of the first input matrix.
/// @param ts_b    Transposition tag of the second input matrix.
/// @param m, n    Matrix dimensions.
/// @param alpha   Scaling factor streamed after the dimensions.
/// @param beta    Scaling factor streamed after `alpha`.
/// @param lda_mul, ldb_mul, ldc_mul  Leading-dimension multipliers.
/// @return The assembled benchmark name.
template <typename scalar_t>
std::string get_name(std::string ts_a, std::string ts_b, int m, int n,
                     scalar_t alpha, scalar_t beta, index_t lda_mul,
                     index_t ldb_mul, index_t ldc_mul) {
  std::ostringstream label{};
  // Every field that follows the type tag is introduced by a "/" separator.
  const auto add_field = [&label](const auto& field) { label << "/" << field; };

  label << "BM_omatadd<" << blas_benchmark::utils::get_type_name<scalar_t>()
        << ">";
  add_field(ts_a);
  add_field(ts_b);
  add_field(m);
  add_field(n);
  add_field(alpha);
  add_field(beta);
  add_field(lda_mul);
  add_field(ldb_mul);
  add_field(ldc_mul);
  return label.str();
}

template <typename scalar_t, typename... args_t>
static inline void cublas_routine(args_t&&... args) {
if constexpr (std::is_same_v<scalar_t, float>) {
Expand Down Expand Up @@ -71,7 +60,7 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int ti_a,
const auto size_c = ldc * n;

blas_benchmark::utils::init_extension_counters<
blas_benchmark::utils::ExtensionOP::omatadd, scalar_t>(
blas_benchmark::utils::ExtensionOp::omatadd, scalar_t>(
state, t_str_a, t_str_b, m, n, lda_mul, ldb_mul, ldc_mul);

cublasHandle_t& cuda_handle = *cuda_handle_ptr;
Expand Down Expand Up @@ -182,11 +171,12 @@ void register_benchmark(blas_benchmark::Args& args,
lda_mul, ldb_mul, ldc_mul, success);
};
benchmark::RegisterBenchmark(
get_name<scalar_t>(ts_a, ts_b, m, n, alpha, beta, lda_mul, ldb_mul,
ldc_mul)
blas_benchmark::utils::get_name<
blas_benchmark::utils::ExtensionOp::omatadd, scalar_t>(
ts_a, ts_b, m, n, alpha, beta, lda_mul, ldb_mul, ldc_mul)
.c_str(),
BM_lambda, cublas_handle_ptr, t_a, t_b, m, n, alpha, beta, lda_mul,
ldb_mul, ldc_mul, success)
BM_lambda, cublas_handle_ptr, t_a, t_b, m, n, alpha, beta, lda_mul, ldb_mul,
ldc_mul, success)
->UseRealTime();
}
}
Expand Down
23 changes: 8 additions & 15 deletions benchmark/cublas/extension/omatcopy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,16 +26,6 @@
#include "../../../test/unittest/extension/extension_reference.hpp"
#include "../utils.hpp"

/// Builds the display name under which an omatcopy benchmark is registered.
///
/// The resulting string has the layout
/// `BM_omatcopy<TYPE>/ts_a/m/n/alpha/lda_mul/ldb_mul`, where `TYPE` is
/// whatever `blas_benchmark::utils::get_type_name<scalar_t>()` produces and
/// every other field is streamed with `operator<<`.
///
/// @param ts_a    Transposition tag of the input matrix.
/// @param m, n    Matrix dimensions.
/// @param alpha   Scaling factor.
/// @param lda_mul, ldb_mul  Leading-dimension multipliers.
/// @return The assembled benchmark name.
template <typename scalar_t>
std::string get_name(std::string ts_a, int m, int n, scalar_t alpha,
                     index_t lda_mul, index_t ldb_mul) {
  std::ostringstream label{};
  // Every field that follows the type tag is introduced by a "/" separator.
  const auto add_field = [&label](const auto& field) { label << "/" << field; };

  label << "BM_omatcopy<" << blas_benchmark::utils::get_type_name<scalar_t>()
        << ">";
  add_field(ts_a);
  add_field(m);
  add_field(n);
  add_field(alpha);
  add_field(lda_mul);
  add_field(ldb_mul);
  return label.str();
}

template <typename scalar_t, typename... args_t>
static inline void cublas_routine(args_t&&... args) {
if constexpr (std::is_same_v<scalar_t, float>) {
Expand Down Expand Up @@ -66,7 +56,7 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int ti,
const auto cuda_size_b = cuda_ldb * n;

blas_benchmark::utils::init_extension_counters<
blas_benchmark::utils::ExtensionOP::omatcopy, scalar_t>(
blas_benchmark::utils::ExtensionOp::omatcopy, scalar_t>(
state, t_str, m, n, lda_mul, ldb_mul);

cublasHandle_t& cuda_handle = *cuda_handle_ptr;
Expand All @@ -89,10 +79,10 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int ti,

#ifdef BLAS_VERIFY_BENCHMARK
// Run a first time with a verification of the results
std::vector<scalar_t> m_b_ref = m_b; // m_b;
std::vector<scalar_t> m_b_ref = m_b;

reference_blas::ext_omatcopy<false>(*t_str, m, n, alpha, m_a, cuda_lda,
m_b_ref, cuda_ldb);
reference_blas::ext_omatcopy<false>(*t_str, m, n, alpha, m_a.data(), cuda_lda,
m_b_ref.data(), cuda_ldb);

std::vector<scalar_t> m_b_temp = m_b;
{
Expand Down Expand Up @@ -180,7 +170,10 @@ void register_benchmark(blas_benchmark::Args& args,
success);
};
benchmark::RegisterBenchmark(
get_name<scalar_t>(ts_a, m, n, alpha, lda_mul, ldb_mul).c_str(),
blas_benchmark::utils::get_name<
blas_benchmark::utils::ExtensionOp::omatcopy, scalar_t>(
ts_a, m, n, alpha, lda_mul, ldb_mul)
.c_str(),
BM_lambda, cublas_handle_ptr, t_a, m, n, alpha, lda_mul, ldb_mul,
success)
->UseRealTime();
Expand Down
1 change: 1 addition & 0 deletions benchmark/portblas/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ set(sources
extension/omatcopy.cpp
extension/omatcopy2.cpp
extension/omatadd.cpp
extension/omatcopy_batched.cpp
)

if(${BLAS_ENABLE_EXTENSIONS})
Expand Down
19 changes: 4 additions & 15 deletions benchmark/portblas/extension/omatadd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,18 +26,6 @@
#include "../../../test/unittest/extension/extension_reference.hpp"
#include "../utils.hpp"

/// Builds the display name under which an omatadd benchmark is registered.
///
/// The resulting string has the layout
/// `BM_omatadd<TYPE>/ts_a/ts_b/m/n/alpha/beta/lda_mul/ldb_mul/ldc_mul/mem_type`,
/// where `TYPE` is whatever `blas_benchmark::utils::get_type_name<scalar_t>()`
/// produces and every other field is streamed with `operator<<`.
///
/// @param ts_a     Transposition tag of the first input matrix.
/// @param ts_b     Transposition tag of the second input matrix.
/// @param m, n     Matrix dimensions.
/// @param alpha    Scaling factor streamed after the dimensions.
/// @param beta     Scaling factor streamed after `alpha`.
/// @param lda_mul, ldb_mul, ldc_mul  Leading-dimension multipliers.
/// @param mem_type Memory-type tag appended as the final field.
/// @return The assembled benchmark name.
template <typename scalar_t>
std::string get_name(std::string ts_a, std::string ts_b, int m, int n,
                     scalar_t alpha, scalar_t beta, index_t lda_mul,
                     index_t ldb_mul, index_t ldc_mul, std::string mem_type) {
  std::ostringstream label{};
  // Every field that follows the type tag is introduced by a "/" separator.
  const auto add_field = [&label](const auto& field) { label << "/" << field; };

  label << "BM_omatadd<" << blas_benchmark::utils::get_type_name<scalar_t>()
        << ">";
  add_field(ts_a);
  add_field(ts_b);
  add_field(m);
  add_field(n);
  add_field(alpha);
  add_field(beta);
  add_field(lda_mul);
  add_field(ldb_mul);
  add_field(ldc_mul);
  // The memory-type tag always closes the name.
  add_field(mem_type);
  return label.str();
}

template <typename scalar_t, blas::helper::AllocType mem_alloc>
void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti_a,
int ti_b, index_t m, index_t n, scalar_t alpha, scalar_t beta,
Expand All @@ -63,7 +51,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti_a,
const auto size_c = ldc * n;

blas_benchmark::utils::init_extension_counters<
blas_benchmark::utils::ExtensionOP::omatadd, scalar_t>(
blas_benchmark::utils::ExtensionOp::omatadd, scalar_t>(
state, t_str_a, t_str_b, m, n, lda_mul, ldb_mul, ldc_mul);

blas::SB_Handle& sb_handle = *sb_handle_ptr;
Expand Down Expand Up @@ -179,8 +167,9 @@ void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success,
lda_mul, ldb_mul, ldc_mul, success);
};
benchmark::RegisterBenchmark(
get_name<scalar_t>(ts_a, ts_b, m, n, alpha, beta, lda_mul, ldb_mul,
ldc_mul, mem_type)
blas_benchmark::utils::get_name<
blas_benchmark::utils::ExtensionOp::omatadd, scalar_t>(
ts_a, ts_b, m, n, alpha, beta, lda_mul, ldb_mul, ldc_mul, mem_type)
.c_str(),
BM_lambda, sb_handle_ptr, t_a, t_b, m, n, alpha, beta, lda_mul, ldb_mul,
ldc_mul, success)
Expand Down
21 changes: 7 additions & 14 deletions benchmark/portblas/extension/omatcopy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,17 +26,6 @@
#include "../../../test/unittest/extension/extension_reference.hpp"
#include "../utils.hpp"

/// Builds the display name under which an omatcopy benchmark is registered.
///
/// The resulting string has the layout
/// `BM_omatcopy<TYPE>/t/m/n/alpha/lda_mul/ldb_mul/mem_type`, where `TYPE` is
/// whatever `blas_benchmark::utils::get_type_name<scalar_t>()` produces and
/// every other field is streamed with `operator<<`.
///
/// @param t        Transposition tag of the input matrix.
/// @param m, n     Matrix dimensions.
/// @param alpha    Scaling factor.
/// @param lda_mul, ldb_mul  Leading-dimension multipliers.
/// @param mem_type Memory-type tag appended as the final field.
/// @return The assembled benchmark name.
template <typename scalar_t>
std::string get_name(std::string t, int m, int n, scalar_t alpha,
                     index_t lda_mul, index_t ldb_mul, std::string mem_type) {
  std::ostringstream label{};
  // Every field that follows the type tag is introduced by a "/" separator.
  const auto add_field = [&label](const auto& field) { label << "/" << field; };

  label << "BM_omatcopy<" << blas_benchmark::utils::get_type_name<scalar_t>()
        << ">";
  add_field(t);
  add_field(m);
  add_field(n);
  add_field(alpha);
  add_field(lda_mul);
  add_field(ldb_mul);
  // The memory-type tag always closes the name.
  add_field(mem_type);
  return label.str();
}

template <typename scalar_t, blas::helper::AllocType mem_alloc>
void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti,
index_t m, index_t n, scalar_t alpha, index_t lda_mul, index_t ldb_mul,
Expand All @@ -57,7 +46,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti,
const auto size_b = ldb * ((*t_str == 't') ? m : n);

blas_benchmark::utils::init_extension_counters<
blas_benchmark::utils::ExtensionOP::omatcopy, scalar_t>(
blas_benchmark::utils::ExtensionOp::omatcopy, scalar_t>(
state, t_str, m, n, lda_mul, ldb_mul);

blas::SB_Handle& sb_handle = *sb_handle_ptr;
Expand All @@ -83,7 +72,8 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti,
// Run a first time with a verification of the results
std::vector<scalar_t> m_b_ref = m_b;

reference_blas::ext_omatcopy(*t_str, m, n, alpha, m_a, lda, m_b_ref, ldb);
reference_blas::ext_omatcopy(*t_str, m, n, alpha, m_a.data(), lda,
m_b_ref.data(), ldb);

std::vector<scalar_t> m_b_temp = m_b;
{
Expand Down Expand Up @@ -163,7 +153,10 @@ void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success,
ldb_mul, success);
};
benchmark::RegisterBenchmark(
get_name<scalar_t>(ts, m, n, alpha, lda_mul, ldb_mul, mem_type).c_str(),
blas_benchmark::utils::get_name<
blas_benchmark::utils::ExtensionOp::omatcopy, scalar_t>(
ts, m, n, alpha, lda_mul, ldb_mul, mem_type)
.c_str(),
BM_lambda, sb_handle_ptr, t, m, n, alpha, lda_mul, ldb_mul, success)
->UseRealTime();
}
Expand Down
23 changes: 6 additions & 17 deletions benchmark/portblas/extension/omatcopy2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,18 +26,6 @@
#include "../../../test/unittest/extension/extension_reference.hpp"
#include "../utils.hpp"

/// Builds the display name under which an omatcopy2 benchmark is registered.
///
/// The resulting string has the layout
/// `BM_omatcopy2<TYPE>/t/m/n/alpha/lda_mul/ldb_mul/inc_a/inc_b/mem_type`,
/// where `TYPE` is whatever `blas_benchmark::utils::get_type_name<scalar_t>()`
/// produces and every other field is streamed with `operator<<`.
///
/// @param t        Transposition tag of the input matrix.
/// @param m, n     Matrix dimensions.
/// @param alpha    Scaling factor.
/// @param lda_mul, ldb_mul  Leading-dimension multipliers.
/// @param inc_a, inc_b      Increments for the input and output matrices.
/// @param mem_type Memory-type tag appended as the final field.
/// @return The assembled benchmark name.
template <typename scalar_t>
std::string get_name(std::string t, int m, int n, scalar_t alpha,
                     index_t lda_mul, index_t ldb_mul, index_t inc_a,
                     index_t inc_b, std::string mem_type) {
  std::ostringstream label{};
  // Every field that follows the type tag is introduced by a "/" separator.
  const auto add_field = [&label](const auto& field) { label << "/" << field; };

  label << "BM_omatcopy2<" << blas_benchmark::utils::get_type_name<scalar_t>()
        << ">";
  add_field(t);
  add_field(m);
  add_field(n);
  add_field(alpha);
  add_field(lda_mul);
  add_field(ldb_mul);
  add_field(inc_a);
  add_field(inc_b);
  // The memory-type tag always closes the name.
  add_field(mem_type);
  return label.str();
}

template <typename scalar_t, blas::helper::AllocType mem_alloc>
void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti,
index_t m, index_t n, scalar_t alpha, index_t lda_mul, index_t ldb_mul,
Expand All @@ -59,7 +47,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti,
const auto size_b = ldb * ((*t_str == 't') ? m : n);

blas_benchmark::utils::init_extension_counters<
blas_benchmark::utils::ExtensionOP::omatcopy2, scalar_t>(
blas_benchmark::utils::ExtensionOp::omatcopy2, scalar_t>(
state, t_str, m, n, lda_mul, ldb_mul, inc_a, inc_b);

blas::SB_Handle& sb_handle = *sb_handle_ptr;
Expand All @@ -85,8 +73,8 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti,
// Run a first time with a verification of the results
std::vector<scalar_t> m_b_ref = m_b;

reference_blas::ext_omatcopy2(*t_str, m, n, alpha, m_a, lda, inc_a, m_b_ref,
ldb, inc_b);
reference_blas::ext_omatcopy2(*t_str, m, n, alpha, m_a.data(), lda, inc_a,
m_b_ref.data(), ldb, inc_b);

std::vector<scalar_t> m_b_temp = m_b;
{
Expand Down Expand Up @@ -167,8 +155,9 @@ void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success,
ldb_mul, inc_a, inc_b, success);
};
benchmark::RegisterBenchmark(
get_name<scalar_t>(ts, m, n, alpha, lda_mul, ldb_mul, inc_a, inc_b,
mem_type)
blas_benchmark::utils::get_name<
blas_benchmark::utils::ExtensionOp::omatcopy2, scalar_t>(
ts, m, n, alpha, lda_mul, ldb_mul, inc_a, inc_b, mem_type)
.c_str(),
BM_lambda, sb_handle_ptr, t, m, n, alpha, lda_mul, ldb_mul, inc_a,
inc_b, success)
Expand Down
Loading

0 comments on commit 5c11158

Please sign in to comment.