From 5c11158b7df41539aa7320f5777bd05434701533 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Scipione?= <9421873+s-Nick@users.noreply.github.com> Date: Wed, 13 Sep 2023 16:04:12 +0200 Subject: [PATCH] Added Omatcopy_batch BLAS Extension operator (#445) Co-authored-by: Ouadie El Farouki --- README.md | 4 +- benchmark/cublas/extension/omatadd.cpp | 22 +- benchmark/cublas/extension/omatcopy.cpp | 23 +- benchmark/portblas/CMakeLists.txt | 1 + benchmark/portblas/extension/omatadd.cpp | 19 +- benchmark/portblas/extension/omatcopy.cpp | 21 +- benchmark/portblas/extension/omatcopy2.cpp | 23 +- .../portblas/extension/omatcopy_batched.cpp | 202 ++++++++++++++++ benchmark/rocblas/extension/omatadd.cpp | 18 +- benchmark/rocblas/extension/omatcopy.cpp | 21 +- cmake/CmakeFunctionHelper.cmake | 3 +- .../include/common/benchmark_identifier.hpp | 27 ++- common/include/common/benchmark_names.hpp | 35 +++ .../common/blas_extension_state_counters.hpp | 42 ++-- common/include/common/common_utils.hpp | 56 +++++ include/interface/extension_interface.h | 197 ++++++++++----- include/operations/extension/matcopy_batch.h | 72 ++++++ include/operations/extension/transpose.h | 30 ++- include/portblas.h | 2 + src/interface/extension/CMakeLists.txt | 2 + src/interface/extension/backend/amd_gpu.hpp | 43 +++- .../extension/backend/default_cpu.hpp | 31 ++- src/interface/extension/backend/intel_gpu.hpp | 43 +++- .../extension/backend/nvidia_gpu.hpp | 43 +++- src/interface/extension/matcopy_batch.cpp.in | 87 +++++++ src/interface/extension_interface.hpp | 227 +++++++++++------- src/operations/extension/matcopy_batch.hpp | 171 +++++++++++++ src/operations/extension/transpose.hpp | 35 ++- src/portblas.hpp | 2 + test/unittest/CMakeLists.txt | 1 + .../extension/extension_reference.hpp | 8 +- test/unittest/extension/omatcopy2_test.cpp | 4 +- .../extension/omatcopy_batched_test.cpp | 156 ++++++++++++ test/unittest/extension/omatcopy_test.cpp | 7 +- test/unittest/extension/transpose_test.cpp | 3 +- 35 files changed, 1351 insertions(+), 330 deletions(-) create mode 100644 benchmark/portblas/extension/omatcopy_batched.cpp create mode 100644 include/operations/extension/matcopy_batch.h create mode 100644 src/interface/extension/matcopy_batch.cpp.in create mode 100644 src/operations/extension/matcopy_batch.hpp create mode 100644 test/unittest/extension/omatcopy_batched_test.cpp diff --git a/README.md b/README.md index adebe5426..5b8541b18 100644 --- a/README.md +++ b/README.md @@ -311,7 +311,7 @@ For all these operations: * `A`, `B` and `C` are containers for the column-major matrices A, B and C. * `lda`, `ldb` and `ldc` are the leading dimensions of the matrices A, B and C (cf BLAS 2). The leading dimension of a matrix must be greater than or equal - to its number of rows. In the case of in-place transpose, the same matrix `A` + to its number of rows. In the case of in-place copy/transpose, the same matrix `A` is used with two different leading dimensions for input & output. * `stride_a`, `stride_b` and `stride_c` are the striding size between consecutive matrices in a batched entry for inputs/outputs A, B and C. @@ -327,6 +327,8 @@ matrices in a batched entry for inputs/outputs A, B and C. | `_omatcopy` | `sb_handle`, `transa`, `M`, `N`, `alpha`, `A`, `lda`, `B`, `ldb` | Perform an out-of-place scaled matrix transpose or copy operation using a general dense matrix. | | `_omatcopy2`| `sb_handle`, `transa`, `M`, `N`, `alpha`, `A`, `lda`, `inc_a`, `B`, `ldb`, `inc_b` | Computes two-strided scaling and out-of-place transposition or copying of general dense matrices. | | `_omatadd`| `sb_handle`, `transa`, `transb`, `M`, `N`, `alpha`, `A`, `lda`, `beta`, `B`, `ldb`, `C`,`ldc` | Computes scaled general dense matrix addition with possibly transposed arguments. | +| `_omatcopy_batch` | `sb_handle`, `transa`, `M`, `N`, `alpha`, `A`, `lda`, `stride_a`, `B`, `ldb`, `stride_b`, `batch_size` | Perform an out-of-place scaled batched-strided matrix transpose or copy operation using a general dense matrix. | +| `_imatcopy_batch` | `sb_handle`, `transa`, `M`, `N`, `alpha`, `A`, `lda`, `ldb`, `stride`, `batch_size` | Perform an in-place scaled batched-strided matrix transpose* or copy operation using a general dense matrix. (*: Currently the transpose case is not supported). | Other non-official extension operators : | operation | arguments | description | diff --git a/benchmark/cublas/extension/omatadd.cpp b/benchmark/cublas/extension/omatadd.cpp index 67cc544cf..7a4202cd2 100644 --- a/benchmark/cublas/extension/omatadd.cpp +++ b/benchmark/cublas/extension/omatadd.cpp @@ -26,17 +26,6 @@ #include "../../../test/unittest/extension/extension_reference.hpp" #include "../utils.hpp" -template -std::string get_name(std::string ts_a, std::string ts_b, int m, int n, - scalar_t alpha, scalar_t beta, index_t lda_mul, - index_t ldb_mul, index_t ldc_mul) { - std::ostringstream str{}; - str << "BM_omatadd<" << blas_benchmark::utils::get_type_name() - << ">/" << ts_a << "/" << ts_b << "/" << m << "/" << n << "/" << alpha - << "/" << beta << "/" << lda_mul << "/" << ldb_mul << "/" << ldc_mul; - return str.str(); -} - template static inline void cublas_routine(args_t&&... args) { if constexpr (std::is_same_v) { @@ -71,7 +60,7 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int ti_a, const auto size_c = ldc * n; blas_benchmark::utils::init_extension_counters< - blas_benchmark::utils::ExtensionOP::omatadd, scalar_t>( + blas_benchmark::utils::ExtensionOp::omatadd, scalar_t>( state, t_str_a, t_str_b, m, n, lda_mul, ldb_mul, ldc_mul); cublasHandle_t& cuda_handle = *cuda_handle_ptr; @@ -182,11 +171,12 @@ void register_benchmark(blas_benchmark::Args& args, lda_mul, ldb_mul, ldc_mul, success); }; benchmark::RegisterBenchmark( - get_name(ts_a, ts_b, m, n, alpha, beta, lda_mul, ldb_mul, - ldc_mul) + blas_benchmark::utils::get_name< + blas_benchmark::utils::ExtensionOp::omatadd, scalar_t>( + ts_a, ts_b, m, n, alpha, beta, lda_mul, ldb_mul, ldc_mul) .c_str(), - BM_lambda, cublas_handle_ptr, t_a, t_b, m, n, alpha, beta, lda_mul, - ldb_mul, ldc_mul, success) + BM_lambda, cublas_handle_ptr, t_a, t_b, m, n, alpha, beta, lda_mul, ldb_mul, + ldc_mul, success) ->UseRealTime(); } } diff --git a/benchmark/cublas/extension/omatcopy.cpp b/benchmark/cublas/extension/omatcopy.cpp index 2704ffe1b..02d3527d1 100644 --- a/benchmark/cublas/extension/omatcopy.cpp +++ b/benchmark/cublas/extension/omatcopy.cpp @@ -26,16 +26,6 @@ #include "../../../test/unittest/extension/extension_reference.hpp" #include "../utils.hpp" -template -std::string get_name(std::string ts_a, int m, int n, scalar_t alpha, - index_t lda_mul, index_t ldb_mul) { - std::ostringstream str{}; - str << "BM_omatcopy<" << blas_benchmark::utils::get_type_name() - << ">/" << ts_a << "/" << m << "/" << n << "/" << alpha << "/" << lda_mul - << "/" << ldb_mul; - return str.str(); -} - template static inline void cublas_routine(args_t&&... args) { if constexpr (std::is_same_v) { @@ -66,7 +56,7 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int ti, const auto cuda_size_b = cuda_ldb * n; blas_benchmark::utils::init_extension_counters< - blas_benchmark::utils::ExtensionOP::omatcopy, scalar_t>( + blas_benchmark::utils::ExtensionOp::omatcopy, scalar_t>( state, t_str, m, n, lda_mul, ldb_mul); cublasHandle_t& cuda_handle = *cuda_handle_ptr; @@ -89,10 +79,10 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int ti, #ifdef BLAS_VERIFY_BENCHMARK // Run a first time with a verification of the results - std::vector m_b_ref = m_b; // m_b; + std::vector m_b_ref = m_b; - reference_blas::ext_omatcopy(*t_str, m, n, alpha, m_a, cuda_lda, - m_b_ref, cuda_ldb); + reference_blas::ext_omatcopy(*t_str, m, n, alpha, m_a.data(), cuda_lda, + m_b_ref.data(), cuda_ldb); std::vector m_b_temp = m_b; { @@ -180,7 +170,10 @@ void register_benchmark(blas_benchmark::Args& args, success); }; benchmark::RegisterBenchmark( - get_name(ts_a, m, n, alpha, lda_mul, ldb_mul).c_str(), + blas_benchmark::utils::get_name< + blas_benchmark::utils::ExtensionOp::omatcopy, scalar_t>( + ts_a, m, n, alpha, lda_mul, ldb_mul) + .c_str(), BM_lambda, cublas_handle_ptr, t_a, m, n, alpha, lda_mul, ldb_mul, success) ->UseRealTime(); diff --git a/benchmark/portblas/CMakeLists.txt b/benchmark/portblas/CMakeLists.txt index 00e73379d..03d86997e 100644 --- a/benchmark/portblas/CMakeLists.txt +++ b/benchmark/portblas/CMakeLists.txt @@ -67,6 +67,7 @@ set(sources extension/omatcopy.cpp extension/omatcopy2.cpp extension/omatadd.cpp + extension/omatcopy_batched.cpp ) if(${BLAS_ENABLE_EXTENSIONS}) diff --git a/benchmark/portblas/extension/omatadd.cpp b/benchmark/portblas/extension/omatadd.cpp index 393304d6b..da2879299 100644 --- a/benchmark/portblas/extension/omatadd.cpp +++ b/benchmark/portblas/extension/omatadd.cpp @@ -26,18 +26,6 @@ #include "../../../test/unittest/extension/extension_reference.hpp" #include "../utils.hpp" -template -std::string get_name(std::string ts_a, std::string ts_b, int m, int n, - scalar_t alpha, scalar_t beta, index_t lda_mul, - index_t ldb_mul, index_t ldc_mul, std::string mem_type) { - std::ostringstream str{}; - str << "BM_omatadd<" << blas_benchmark::utils::get_type_name() - << ">/" << ts_a << "/" << ts_b << "/" << m << "/" << n << "/" << alpha - << "/" << beta << "/" << lda_mul << "/" << ldb_mul << "/" << ldc_mul; - str << "/" << mem_type; - return str.str(); -} - template void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti_a, int ti_b, index_t m, index_t n, scalar_t alpha, scalar_t beta, @@ -63,7 +51,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti_a, const auto size_c = ldc * n; blas_benchmark::utils::init_extension_counters< - blas_benchmark::utils::ExtensionOP::omatadd, scalar_t>( + blas_benchmark::utils::ExtensionOp::omatadd, scalar_t>( state, t_str_a, t_str_b, m, n, lda_mul, ldb_mul, ldc_mul); blas::SB_Handle& sb_handle = *sb_handle_ptr; @@ -179,8 +167,9 @@ void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success, lda_mul, ldb_mul, ldc_mul, success); }; benchmark::RegisterBenchmark( - get_name(ts_a, ts_b, m, n, alpha, beta, lda_mul, ldb_mul, - ldc_mul, mem_type) + blas_benchmark::utils::get_name< + blas_benchmark::utils::ExtensionOp::omatadd, scalar_t>( + ts_a, ts_b, m, n, alpha, beta, lda_mul, ldb_mul, ldc_mul, mem_type) .c_str(), BM_lambda, sb_handle_ptr, t_a, t_b, m, n, alpha, beta, lda_mul, ldb_mul, ldc_mul, success) diff --git a/benchmark/portblas/extension/omatcopy.cpp b/benchmark/portblas/extension/omatcopy.cpp index 6f214ff57..17e990d90 100644 --- a/benchmark/portblas/extension/omatcopy.cpp +++ b/benchmark/portblas/extension/omatcopy.cpp @@ -26,17 +26,6 @@ #include "../../../test/unittest/extension/extension_reference.hpp" #include "../utils.hpp" -template -std::string get_name(std::string t, int m, int n, scalar_t alpha, - index_t lda_mul, index_t ldb_mul, std::string mem_type) { - std::ostringstream str{}; - str << "BM_omatcopy<" << blas_benchmark::utils::get_type_name() - << ">/" << t << "/" << m << "/" << n << "/" << alpha << "/" << lda_mul - << "/" << ldb_mul; - str << "/" << mem_type; - return str.str(); -} - template void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti, index_t m, index_t n, scalar_t alpha, index_t lda_mul, index_t ldb_mul, @@ -57,7 +46,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti, const auto size_b = ldb * ((*t_str == 't') ? m : n); blas_benchmark::utils::init_extension_counters< - blas_benchmark::utils::ExtensionOP::omatcopy, scalar_t>( + blas_benchmark::utils::ExtensionOp::omatcopy, scalar_t>( state, t_str, m, n, lda_mul, ldb_mul); blas::SB_Handle& sb_handle = *sb_handle_ptr; @@ -83,7 +72,8 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti, // Run a first time with a verification of the results std::vector m_b_ref = m_b; - reference_blas::ext_omatcopy(*t_str, m, n, alpha, m_a, lda, m_b_ref, ldb); + reference_blas::ext_omatcopy(*t_str, m, n, alpha, m_a.data(), lda, + m_b_ref.data(), ldb); std::vector m_b_temp = m_b; { @@ -163,7 +153,10 @@ void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success, ldb_mul, success); }; benchmark::RegisterBenchmark( - get_name(ts, m, n, alpha, lda_mul, ldb_mul, mem_type).c_str(), + blas_benchmark::utils::get_name< + blas_benchmark::utils::ExtensionOp::omatcopy, scalar_t>( + ts, m, n, alpha, lda_mul, ldb_mul, mem_type) + .c_str(), BM_lambda, sb_handle_ptr, t, m, n, alpha, lda_mul, ldb_mul, success) ->UseRealTime(); } diff --git a/benchmark/portblas/extension/omatcopy2.cpp b/benchmark/portblas/extension/omatcopy2.cpp index 783438a9b..121eef9f7 100644 --- a/benchmark/portblas/extension/omatcopy2.cpp +++ b/benchmark/portblas/extension/omatcopy2.cpp @@ -26,18 +26,6 @@ #include "../../../test/unittest/extension/extension_reference.hpp" #include "../utils.hpp" -template -std::string get_name(std::string t, int m, int n, scalar_t alpha, - index_t lda_mul, index_t ldb_mul, index_t inc_a, - index_t inc_b, std::string mem_type) { - std::ostringstream str{}; - str << "BM_omatcopy2<" << blas_benchmark::utils::get_type_name() - << ">/" << t << "/" << m << "/" << n << "/" << alpha << "/" << lda_mul - << "/" << ldb_mul << "/" << inc_a << "/" << inc_b; - str << "/" << mem_type; - return str.str(); -} - template void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti, index_t m, index_t n, scalar_t alpha, index_t lda_mul, index_t ldb_mul, @@ -59,7 +47,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti, const auto size_b = ldb * ((*t_str == 't') ? m : n); blas_benchmark::utils::init_extension_counters< - blas_benchmark::utils::ExtensionOP::omatcopy2, scalar_t>( + blas_benchmark::utils::ExtensionOp::omatcopy2, scalar_t>( state, t_str, m, n, lda_mul, ldb_mul, inc_a, inc_b); blas::SB_Handle& sb_handle = *sb_handle_ptr; @@ -85,8 +73,8 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti, // Run a first time with a verification of the results std::vector m_b_ref = m_b; - reference_blas::ext_omatcopy2(*t_str, m, n, alpha, m_a, lda, inc_a, m_b_ref, - ldb, inc_b); + reference_blas::ext_omatcopy2(*t_str, m, n, alpha, m_a.data(), lda, inc_a, + m_b_ref.data(), ldb, inc_b); std::vector m_b_temp = m_b; { @@ -167,8 +155,9 @@ void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success, ldb_mul, inc_a, inc_b, success); }; benchmark::RegisterBenchmark( - get_name(ts, m, n, alpha, lda_mul, ldb_mul, inc_a, inc_b, - mem_type) + blas_benchmark::utils::get_name< + blas_benchmark::utils::ExtensionOp::omatcopy2, scalar_t>( + ts, m, n, alpha, lda_mul, ldb_mul, inc_a, inc_b, mem_type) .c_str(), BM_lambda, sb_handle_ptr, t, m, n, alpha, lda_mul, ldb_mul, inc_a, inc_b, success) diff --git a/benchmark/portblas/extension/omatcopy_batched.cpp b/benchmark/portblas/extension/omatcopy_batched.cpp new file mode 100644 index 000000000..c88ce624a --- /dev/null +++ b/benchmark/portblas/extension/omatcopy_batched.cpp @@ -0,0 +1,202 @@ +/************************************************************************** + * + * @license + * Copyright (C) Codeplay Software Limited + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * For your convenience, a copy of the License has been included in this + * repository. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SYCL-BLAS: BLAS implementation using SYCL + * + * @filename omatcopy_batched.cpp + * + **************************************************************************/ + +#include "../../../test/unittest/extension/extension_reference.hpp" +#include "../utils.hpp" + +template +void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti, + index_t m, index_t n, scalar_t alpha, index_t lda_mul, index_t ldb_mul, + index_t stride_a_mul, index_t stride_b_mul, index_t batch_size, + bool* success) { + // initialize the state label + blas_benchmark::utils::set_benchmark_label( + state, sb_handle_ptr->get_queue()); + + // Standard test setup. + std::string ts = blas_benchmark::utils::from_transpose_enum( + static_cast(ti)); + const char* t_str = ts.c_str(); + + const auto lda = lda_mul * m; + const auto ldb = (*t_str == 't') ? ldb_mul * n : ldb_mul * m; + + const auto stride_a = lda * n * stride_a_mul; + const auto stride_b = ((*t_str == 't') ? ldb * m : ldb * n) * stride_b_mul; + + const auto size_a = stride_a * batch_size; + const auto size_b = stride_b * batch_size; + + blas_benchmark::utils::init_extension_counters< + blas_benchmark::utils::ExtensionOp::omatcopy_batch, scalar_t>( + state, t_str, m, n, lda_mul, ldb_mul, stride_a_mul, stride_b_mul, + batch_size); + + blas::SB_Handle& sb_handle = *sb_handle_ptr; + auto q = sb_handle.get_queue(); + + // Input matrix/vector, output vector. + std::vector m_a = + blas_benchmark::utils::random_data(size_a); + std::vector m_b = + blas_benchmark::utils::random_data(size_b); + + auto m_a_gpu = blas::helper::allocate(size_a, q); + auto m_b_gpu = blas::helper::allocate(size_b, q); + + auto copy_a = blas::helper::copy_to_device(q, m_a.data(), m_a_gpu, size_a); + auto copy_b = blas::helper::copy_to_device(q, m_b.data(), m_b_gpu, size_b); + + sb_handle.wait({copy_a, copy_b}); + +#ifdef BLAS_VERIFY_BENCHMARK + // Run a first time with a verification of the results + std::vector m_b_ref = m_b; + + for (int i = 0; i < batch_size; ++i) { + reference_blas::ext_omatcopy(*t_str, m, n, alpha, m_a.data() + i * stride_a, + lda, m_b_ref.data() + i * stride_b, ldb); + } + + std::vector m_b_temp = m_b; + { + auto m_b_temp_gpu = blas::helper::allocate(size_b, q); + auto copy_tmp = blas::helper::copy_to_device( + q, m_b_temp.data(), m_b_temp_gpu, size_b); + + auto event = blas::_omatcopy_batch(sb_handle, *t_str, m, n, alpha, m_a_gpu, + lda, stride_a, m_b_temp_gpu, ldb, + stride_b, batch_size, {copy_tmp}); + sb_handle.wait(event); + auto copy_res = blas::helper::copy_to_host( + q, m_b_temp_gpu, m_b_temp.data(), size_b); + + sb_handle.wait(copy_res); + blas::helper::deallocate(m_b_temp_gpu, q); + } + + std::ostringstream err_stream; + if (!utils::compare_vectors(m_b_temp, m_b_ref, err_stream, "")) { + const std::string& err_str = err_stream.str(); + state.SkipWithError(err_str.c_str()); + *success = false; + }; +#endif + + auto blas_warmup_method_def = [&]() -> void { + auto event = + blas::_omatcopy_batch(sb_handle, *t_str, m, n, alpha, m_a_gpu, lda, + stride_a, m_b_gpu, ldb, stride_b, batch_size); + return; + }; + + auto blas_method_def = [&]() -> std::vector { + auto event = + blas::_omatcopy_batch(sb_handle, *t_str, m, n, alpha, m_a_gpu, lda, + stride_a, m_b_gpu, ldb, stride_b, batch_size); + sb_handle.wait(event); + return event; + }; + + // Warmup + blas_benchmark::utils::warmup(blas_warmup_method_def); + sb_handle.wait(); + + blas_benchmark::utils::init_counters(state); + + // Measure + for (auto _ : state) { + // Run + std::tuple times = + blas_benchmark::utils::timef(blas_method_def); + + // Report + blas_benchmark::utils::update_counters(state, times); + } + + state.SetItemsProcessed(state.iterations() * state.counters["n_fl_ops"]); + state.SetBytesProcessed(state.iterations() * + state.counters["bytes_processed"]); + + blas_benchmark::utils::calc_avg_counters(state); + + blas::helper::deallocate(m_a_gpu, q); + blas::helper::deallocate(m_b_gpu, q); +} + +template +void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success, + std::string mem_type, + std::vector> params) { + for (auto p : params) { + std::string ts; + index_t m, n, lda_mul, ldb_mul, stride_a_mul, stride_b_mul, batch_size; + scalar_t alpha; + std::tie(ts, m, n, alpha, lda_mul, ldb_mul, stride_a_mul, stride_b_mul, + batch_size) = p; + int t = static_cast(blas_benchmark::utils::to_transpose_enum(ts)); + + auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr, + int t, index_t m, index_t n, scalar_t alpha, + index_t lda_mul, index_t ldb_mul, index_t stride_a_mul, + index_t stride_b_mul, index_t batch_size, + bool* success) { + run(st, sb_handle_ptr, t, m, n, alpha, lda_mul, + ldb_mul, stride_a_mul, stride_b_mul, batch_size, + success); + }; + benchmark::RegisterBenchmark( + blas_benchmark::utils::get_name< + blas_benchmark::utils::ExtensionOp::omatcopy_batch, scalar_t, + index_t>(ts, m, n, alpha, lda_mul, ldb_mul, stride_a_mul, + stride_b_mul, batch_size, mem_type) + .c_str(), + BM_lambda, sb_handle_ptr, t, m, n, alpha, lda_mul, ldb_mul, + stride_a_mul, stride_b_mul, batch_size, success) + ->UseRealTime(); + } +} + +template +void register_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { + auto omatcopy_batch_params = + blas_benchmark::utils::get_matcopy_batch_params(args); + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, + omatcopy_batch_params); +#ifdef SB_ENABLE_USM + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, + omatcopy_batch_params); +#endif +} + +namespace blas_benchmark { +void create_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { + BLAS_REGISTER_BENCHMARK(args, sb_handle_ptr, success); +} +} // namespace blas_benchmark diff --git a/benchmark/rocblas/extension/omatadd.cpp b/benchmark/rocblas/extension/omatadd.cpp index f2d1bb39b..71f0ca483 100644 --- a/benchmark/rocblas/extension/omatadd.cpp +++ b/benchmark/rocblas/extension/omatadd.cpp @@ -26,17 +26,6 @@ #include "../../../../test/unittest/extension/extension_reference.hpp" #include "../utils.hpp" -template -std::string get_name(std::string ts_a, std::string ts_b, int m, int n, - scalar_t alpha, scalar_t beta, index_t lda_mul, - index_t ldb_mul, index_t ldc_mul) { - std::ostringstream str{}; - str << "BM_omatadd<" << blas_benchmark::utils::get_type_name() - << ">/" << ts_a << "/" << ts_b << "/" << m << "/" << n << "/" << alpha - << "/" << beta << "/" << lda_mul << "/" << ldb_mul << "/" << ldc_mul; - return str.str(); -} - template static inline void rocblas_geam_f(args_t&&... args) { if constexpr (std::is_same_v) { @@ -71,7 +60,7 @@ void run(benchmark::State& state, rocblas_handle& rb_handle, int ti_a, int ti_b, const auto size_c = ldc * n; blas_benchmark::utils::init_extension_counters< - blas_benchmark::utils::ExtensionOP::omatadd, scalar_t>( + blas_benchmark::utils::ExtensionOp::omatadd, scalar_t>( state, t_str_a, t_str_b, m, n, lda_mul, ldb_mul, ldc_mul); // Input matrix/vector, output vector. @@ -183,8 +172,9 @@ void register_benchmark(blas_benchmark::Args& args, rocblas_handle& rb_handle, ldb_mul, ldc_mul, success); }; benchmark::RegisterBenchmark( - get_name(ts_a, ts_b, m, n, alpha, beta, lda_mul, ldb_mul, - ldc_mul) + blas_benchmark::utils::get_name< + blas_benchmark::utils::ExtensionOp::omatadd, scalar_t>( + ts_a, ts_b, m, n, alpha, beta, lda_mul, ldb_mul, ldc_mul) .c_str(), BM_lambda, rb_handle, t_a, t_b, m, n, alpha, beta, lda_mul, ldb_mul, ldc_mul, success) diff --git a/benchmark/rocblas/extension/omatcopy.cpp b/benchmark/rocblas/extension/omatcopy.cpp index 5d0e7862d..8d87dd7a4 100644 --- a/benchmark/rocblas/extension/omatcopy.cpp +++ b/benchmark/rocblas/extension/omatcopy.cpp @@ -26,16 +26,6 @@ #include "../../../../test/unittest/extension/extension_reference.hpp" #include "../utils.hpp" -template -std::string get_name(std::string t, int m, int n, scalar_t alpha, - index_t lda_mul, index_t ldb_mul) { - std::ostringstream str{}; - str << "BM_omatcopy<" << blas_benchmark::utils::get_type_name() - << ">/" << t << "/" << m << "/" << n << "/" << alpha << "/" << lda_mul - << "/" << ldb_mul; - return str.str(); -} - template static inline void rocblas_geam_f(args_t&&... args) { if constexpr (std::is_same_v) { @@ -65,7 +55,7 @@ void run(benchmark::State& state, rocblas_handle& rb_handle, int ti, index_t m, const auto size_b = ldb * n; blas_benchmark::utils::init_extension_counters< - blas_benchmark::utils::ExtensionOP::omatcopy, scalar_t>( + blas_benchmark::utils::ExtensionOp::omatcopy, scalar_t>( state, t_str, m, n, lda_mul, ldb_mul); // Input matrix/vector, output vector. @@ -90,8 +80,8 @@ void run(benchmark::State& state, rocblas_handle& rb_handle, int ti, index_t m, // Run a first time with a verification of the results std::vector m_b_ref = m_b; - reference_blas::ext_omatcopy(*t_str, m, n, alpha, m_a, lda, m_b_ref, - ldb); + reference_blas::ext_omatcopy(*t_str, m, n, alpha, m_a.data(), lda, + m_b_ref.data(), ldb); std::vector m_b_temp = m_b; { @@ -176,7 +166,10 @@ void register_benchmark(blas_benchmark::Args& args, rocblas_handle& rb_handle, run(st, rb_handle_, t, m, n, alpha, lda_mul, ldb_mul, success); }; benchmark::RegisterBenchmark( - get_name(ts, m, n, alpha, lda_mul, ldb_mul).c_str(), + blas_benchmark::utils::get_name< + blas_benchmark::utils::ExtensionOp::omatcopy, scalar_t>( + ts, m, n, alpha, lda_mul, ldb_mul) + .c_str(), BM_lambda, rb_handle, t, m, n, alpha, lda_mul, ldb_mul, success) ->UseRealTime(); } diff --git a/cmake/CmakeFunctionHelper.cmake b/cmake/CmakeFunctionHelper.cmake index df35c8c37..638ce8a8e 100644 --- a/cmake/CmakeFunctionHelper.cmake +++ b/cmake/CmakeFunctionHelper.cmake @@ -562,7 +562,8 @@ function (build_library LIB_NAME ENABLE_EXTENSIONS) $ $ $ - $) + $ + $) if (${ENABLE_EXTENSIONS}) list(APPEND LIB_SRCS $) diff --git a/common/include/common/benchmark_identifier.hpp b/common/include/common/benchmark_identifier.hpp index d0fa906cf..832f00614 100644 --- a/common/include/common/benchmark_identifier.hpp +++ b/common/include/common/benchmark_identifier.hpp @@ -74,7 +74,16 @@ enum class Level3Op : int { trsm = 8 }; -enum class ExtensionOp : int { reduction = 0 }; +enum class ExtensionOp : int { + omatcopy = 0, + imatcopy = 1, + omatadd = 2, + omatcopy_batch = 3, + imatcopy_batch = 4, + omatadd_batch = 5, + omatcopy2 = 6, + reduction = 7 +}; template std::string get_operator_name() { @@ -170,7 +179,21 @@ std::string get_operator_name() { template std::string get_operator_name() { - if constexpr (op == ExtensionOp::reduction) + if constexpr ( op == ExtensionOp::omatcopy) + return "Omatcopy"; + else if constexpr (op == ExtensionOp::imatcopy) + return "Imatcopy"; + else if constexpr (op == ExtensionOp::omatadd) + return "Omatadd"; + else if constexpr (op == ExtensionOp::omatcopy_batch) + return "Omatcopy_batch"; + else if constexpr (op == ExtensionOp::imatcopy_batch) + return "Imatcopy_batch"; + else if constexpr (op == ExtensionOp::omatadd_batch) + return "Omatadd_batch"; + else if constexpr (op == ExtensionOp::omatcopy2) + return "Omatcopy2"; + else if constexpr (op == ExtensionOp::reduction) return "Reduction"; else throw std::runtime_error("Unknown BLAS extension operator"); diff --git a/common/include/common/benchmark_names.hpp b/common/include/common/benchmark_names.hpp index 0e3d06c01..d23f140e1 100644 --- a/common/include/common/benchmark_names.hpp +++ b/common/include/common/benchmark_names.hpp @@ -241,6 +241,41 @@ get_name(char side, char uplo, char trans, char diag, index_t m, index_t n, stride_b_mul, mem_type); } +template +inline typename std::enable_if::type +get_name(std::string trans, int m, int n, scalar_t alpha, index_t lda_mul, + index_t ldb_mul, std::string mem_type) { + return internal::get_name(trans, m, n, alpha, lda_mul, ldb_mul, + mem_type); +} + +template +inline typename std::enable_if::type +get_name(std::string trans, int m, int n, scalar_t alpha, index_t lda_mul, + index_t ldb_mul, index_t inc_a, index_t inc_b, std::string mem_type) { + return internal::get_name(trans, m, n, alpha, lda_mul, ldb_mul, + inc_a, inc_b, mem_type); +} + +template +inline typename std::enable_if::type +get_name(std::string trans_a, std::string trans_b, int m, int n, scalar_t alpha, + scalar_t beta, index_t lda_mul, index_t ldb_mul, index_t ldc_mul, + std::string mem_type) { + return internal::get_name(trans_a, trans_b, m, n, alpha, beta, + lda_mul, ldb_mul, ldc_mul, mem_type); +} +template +inline typename std::enable_if::type +get_name(std::string trans, int m, int n, scalar_t alpha, index_t lda_mul, + index_t ldb_mul, index_t stride_a_mul, index_t stride_b_mul, + index_t batch_size, std::string mem_type) { + return internal::get_name(trans, m, n, alpha, lda_mul, ldb_mul, + stride_a_mul, stride_b_mul, + batch_size, mem_type); +} + template inline typename std::enable_if::type get_name(index_t rows, index_t cols, std::string reduction_dim, diff --git a/common/include/common/blas_extension_state_counters.hpp b/common/include/common/blas_extension_state_counters.hpp index 3fe58134c..65761255b 100644 --- a/common/include/common/blas_extension_state_counters.hpp +++ b/common/include/common/blas_extension_state_counters.hpp @@ -26,44 +26,46 @@ #ifndef COMMON_BLAS_EXTENSION_STATE_COUNTERS #define COMMON_BLAS_EXTENSION_STATE_COUNTERS +#include "benchmark_identifier.hpp" + namespace blas_benchmark { namespace utils { -enum class ExtensionOP : int { - omatcopy = 0, - imatcopy = 1, - omatadd = 2, - omatcopy_batch = 3, - imatcopy_batch = 4, - omatadd_batch = 5, - omatcopy2 = 6 -}; - -template -inline typename std::enable_if::type +template +inline typename std::enable_if< + op == ExtensionOp::omatcopy || op == ExtensionOp::imatcopy || + op == ExtensionOp::omatcopy2 || op == ExtensionOp::omatcopy_batch || + op == ExtensionOp::imatcopy_batch>::type init_extension_counters(benchmark::State& state, const char* trans, index_t m, index_t n, index_t lda_mul, index_t ldb_mul, - index_t inc_a = 1, index_t inc_b = 1) { + index_t inc_a = 1, index_t inc_b = 1, + index_t stride_a_mul = 1, index_t stride_b_mul = 1, + index_t batch_size = 1) { // Google-benchmark counters are double. double size_d = static_cast(m * n); state.counters["m"] = static_cast(m); state.counters["n"] = static_cast(n); - state.counters["n_fl_ops"] = size_d; + state.counters["n_fl_ops"] = size_d * batch_size; state.counters["lda_m"] = static_cast(lda_mul); state.counters["ldb_m"] = static_cast(ldb_mul); state.counters["trans"] = static_cast((*trans == 't') ? 1 : 0); - state.counters["bytes_processed"] = (2 * size_d + 1) * sizeof(scalar_t); - if constexpr (op == ExtensionOP::omatcopy2) { + state.counters["bytes_processed"] = + (2 * size_d + 1) * sizeof(scalar_t) * batch_size; + if constexpr (op == ExtensionOp::omatcopy_batch || + op == ExtensionOp::imatcopy_batch) { + state.counters["stride_a_mul"] = static_cast(stride_a_mul); + state.counters["stride_b_mul"] = static_cast(stride_b_mul); + state.counters["batch_size"] = static_cast(batch_size); + } + if constexpr (op == ExtensionOp::omatcopy2) { state.counters["inc_a"] = static_cast(inc_a); state.counters["inc_b"] = static_cast(inc_b); } return; } -template -inline typename std::enable_if::type +template +inline typename std::enable_if::type init_extension_counters(benchmark::State& state, const char* t_a, const char* t_b, index_t m, index_t n, index_t lda_mul, index_t ldb_mul, index_t ldc_mul) { diff --git a/common/include/common/common_utils.hpp b/common/include/common/common_utils.hpp index 3c5161c2e..f4b9b88f6 100644 --- a/common/include/common/common_utils.hpp +++ b/common/include/common/common_utils.hpp @@ -113,6 +113,11 @@ template using omatadd_param_t = std::tuple; +template +using matcopy_batch_param_t = + std::tuple; + namespace blas_benchmark { namespace utils { @@ -1193,6 +1198,57 @@ static inline std::vector> get_omatadd_params( } } +/** + * @fn get_matcopy_batch_params + * @brief Returns a vector containing the matcopy_batch benchmark parameters, + * either read from a file according to the command - line args, or the default + * ones. + */ +template +static inline std::vector> +get_matcopy_batch_params(Args& args) { + if (args.csv_param.empty()) { + warning_no_csv(); + std::vector> matcopy_batch_default; + constexpr index_t dmin = 256, dmax = 8192; + constexpr scalar_t alpha{2}; + constexpr index_t batch_size{5}; + constexpr index_t stride_a_mul{1}; + constexpr index_t stride_b_mul{1}; + for (char trans : {'n', 't'}) { + for (index_t m = dmin; m <= dmax; m *= 2) { + for (index_t n = dmin; n <= dmax; n *= 2) { + for (index_t lda_mul = 1; lda_mul < 2; ++lda_mul) { + for (index_t ldb_mul = 1; ldb_mul < 2; ++ldb_mul) { + matcopy_batch_default.push_back( + std::make_tuple(trans, m, n, alpha, lda_mul, ldb_mul, + stride_a_mul, stride_b_mul, batch_size)); + } + } + } + } + } + return matcopy_batch_default; + } else { + return parse_csv_file>( + args.csv_param, [&](std::vector& v) { + if (v.size() != 9) { + throw std::runtime_error( + "invalid number of parameters (9 expected)"); + } + try { + return std::make_tuple( + v[0][0], str_to_int(v[1]), str_to_int(v[2]), + str_to_scalar(v[3]), str_to_int(v[4]), + str_to_int(v[5]), str_to_int(v[6]), + str_to_int(v[7]), str_to_int(v[8])); + } catch (...) { + throw std::runtime_error("invalid parameter"); + } + }); + } +} + /** * @fn get_type_name * @brief Returns a string with the given type. The C++ specification diff --git a/include/interface/extension_interface.h b/include/interface/extension_interface.h index 600fb728e..646858ade 100644 --- a/include/interface/extension_interface.h +++ b/include/interface/extension_interface.h @@ -61,46 +61,58 @@ namespace internal { */ template -typename sb_handle_t::event_t _matcopy(sb_handle_t& sb_handle, char trans, - index_t m, index_t n, element_t alpha, - in_t in_memory, index_t ld_in, - index_t inc_in, out_t out_memory, - index_t ld_out, index_t inc_out, - const typename sb_handle_t::event_t& _dependencies); +typename sb_handle_t::event_t _matcopy( + sb_handle_t& sb_handle, char trans, index_t m, index_t n, element_t alpha, + in_t in_memory, index_t ld_in, index_t inc_in, out_t out_memory, + index_t ld_out, index_t inc_out, + const typename sb_handle_t::event_t& _dependencies); template -typename sb_handle_t::event_t _omatadd(sb_handle_t& sb_handle, char trans_a, - char trans_b, index_t m, index_t n, - element_t alpha, container_0_t a, - index_t lda, element_t beta, - container_1_t b, index_t ldb, - container_2_t c, index_t ldc, - const typename sb_handle_t::event_t& _dependencies); + typename container_0_t, typename container_1_t, + typename container_2_t> +typename sb_handle_t::event_t _omatadd( + sb_handle_t& sb_handle, char trans_a, char trans_b, index_t m, index_t n, + element_t alpha, container_0_t a, index_t lda, element_t beta, + container_1_t b, index_t ldb, container_2_t c, index_t ldc, + const typename sb_handle_t::event_t& _dependencies); template -typename sb_handle_t::event_t _transpose(sb_handle_t& sb_handle, index_t m, - index_t n, in_t A, index_t ld_a, - out_t B, index_t ld_b, - const typename sb_handle_t::event_t& _dependencies); +typename sb_handle_t::event_t _transpose( + sb_handle_t& sb_handle, index_t m, index_t n, in_t A, index_t ld_a, out_t B, + index_t ld_b, const typename sb_handle_t::event_t& _dependencies); + +template +typename sb_handle_t::event_t _matcopy_batch( + sb_handle_t& sb_handle, char trans, index_t m, index_t n, element_t alpha, + in_t in_memory, index_t ld_in, index_t stride_in, out_t out_memory, + index_t ld_out, index_t stride_out, index_t batch_size, + const typename sb_handle_t::event_t& _dependencies); + +template +typename sb_handle_t::event_t _matcopy_batch_impl( + sb_handle_t& sb_handle, index_t m, index_t n, element_t alpha, in_t memory, + index_t ld_in, index_t in_stride, out_t out_memory, index_t ld_out, + index_t out_stride, index_t batch_size, + const typename sb_handle_t::event_t& _dependencies); template -typename sb_handle_t::event_t _reduction(sb_handle_t& sb_handle, - input_t buffer_in, index_t ld, - output_t buffer_out, index_t rows, - index_t cols, - reduction_dim_t reduction_dim, - const typename sb_handle_t::event_t& _dependencies); +typename sb_handle_t::event_t _reduction( + sb_handle_t& sb_handle, input_t buffer_in, index_t ld, output_t buffer_out, + index_t rows, index_t cols, reduction_dim_t reduction_dim, + const typename sb_handle_t::event_t& _dependencies); template typename sb_handle_t::event_t _transpose_outplace_impl( sb_handle_t& sb_handle, index_t _M, index_t _N, element_t _alpha, - container_0_t in_, index_t _ld_in, index_t _inc_in, container_1_t out_, - index_t _ld_out, index_t _inc_out, const typename sb_handle_t::event_t& _dependencies); + container_0_t in_, index_t _ld_in, index_t _inc_in, index_t _stride_in, + container_1_t out_, index_t _ld_out, index_t _inc_out, index_t _stride_out, + index_t _batch_size, const typename sb_handle_t::event_t& _dependencies); template -typename sb_handle_t::event_t _omatcopy(sb_handle_t& sb_handle, char trans, - index_t m, index_t n, element_t alpha, - in_t in_memory, index_t ld_in, - out_t out_memory, index_t ld_out, - const typename sb_handle_t::event_t& _dependencies = {}) { - return internal::_matcopy(sb_handle, trans, m, n, alpha, in_memory, - ld_in, static_cast(1), out_memory, - ld_out, static_cast(1), - _dependencies); +typename sb_handle_t::event_t _omatcopy( + sb_handle_t& sb_handle, char trans, index_t m, index_t n, element_t alpha, + in_t in_memory, index_t ld_in, out_t out_memory, index_t ld_out, + const typename sb_handle_t::event_t& _dependencies = {}) { + return internal::_matcopy( + sb_handle, trans, m, n, alpha, in_memory, ld_in, static_cast(1), + out_memory, ld_out, static_cast(1), _dependencies); } /** @@ -172,12 +182,11 @@ typename sb_handle_t::event_t _omatcopy(sb_handle_t& sb_handle, char trans, */ template -typename sb_handle_t::event_t _omatcopy2(sb_handle_t& sb_handle, char trans, - index_t m, index_t n, element_t alpha, - in_t in_memory, index_t ld_in, - index_t inc_in, out_t out_memory, - index_t ld_out, index_t inc_out, - const typename sb_handle_t::event_t& _dependencies = {}) { +typename sb_handle_t::event_t _omatcopy2( + sb_handle_t& sb_handle, char trans, index_t m, index_t n, element_t alpha, + in_t in_memory, index_t ld_in, index_t inc_in, out_t out_memory, + index_t ld_out, index_t inc_out, + const typename sb_handle_t::event_t& _dependencies = {}) { return internal::_matcopy(sb_handle, trans, m, n, alpha, in_memory, ld_in, inc_in, out_memory, ld_out, inc_out, _dependencies); @@ -205,17 +214,80 @@ typename sb_handle_t::event_t _omatcopy2(sb_handle_t& sb_handle, char trans, * @param ldc Matrix C leading dimension */ template -typename sb_handle_t::event_t _omatadd(sb_handle_t& sb_handle, char trans_a, - char trans_b, index_t m, index_t n, - element_t alpha, container_0_t A, - index_t lda, element_t beta, - container_1_t B, index_t ldb, - container_2_t C, index_t ldc, - const typename sb_handle_t::event_t& _dependencies = {}) { + typename container_0_t, typename container_1_t, + typename container_2_t> +typename sb_handle_t::event_t _omatadd( + sb_handle_t& sb_handle, char trans_a, char trans_b, index_t m, index_t n, + element_t alpha, container_0_t A, index_t lda, element_t beta, + container_1_t B, index_t ldb, container_2_t C, index_t ldc, + const typename sb_handle_t::event_t& _dependencies = {}) { return internal::_omatadd(sb_handle, trans_a, trans_b, m, n, alpha, A, lda, beta, B, ldb, C, ldc, _dependencies); } +/** + * \brief COPY batch of matrices inplace with scaling factor of alpha + * + * @tparam sb_handle_t SB_Handle type + * @tparam element_t Scaling factor type + * @tparam index_t Index type + * @tparam in_out_t input/output type + * @param sb_handle SB_Handle + * @param trans compute matrix transpose or not + * @param m rows of matrix + * @param n cols of matrix + * @param alpha Scaling factor + * @param memory container of input & output matrices + * @param ld_in leading dimension at input + * @param ld_out leading dimention at output + * @param stride stride distance between matrices inside batch + * @param batch_size number of matrices to compute + */ +template +typename sb_handle_t::event_t _imatcopy_batch(sb_handle_t& sb_handle, + char trans, index_t m, index_t n, + element_t alpha, in_out_t memory, + index_t ld_in, index_t ld_out, + index_t stride, + index_t batch_size) { + return internal::_matcopy_batch(sb_handle, trans, m, n, alpha, memory, + ld_in, stride, memory, ld_out, stride, + batch_size); +} + +/** + * \brief COPY batch of matrices outplace from in_memory to out_memory with + * scaling factor of alpha + * + * @tparam sb_handle_t SB_Handle type + * @tparam element_t Scaling factor type + * @tparam index_t Index type + * @tparam in_t container input type + * @tparam out_t container output type + * @param sb_handle SB_Handle + * @param trans compute matrix transpose or not + * @param m rows of matrix + * @param n cols of matrix + * @param alpha Scaling factor + * @param in_memory input matrix container + * @param ld_in leading dimension of input + * @param stride_in stride distance between matrices inside batch + * @param out_memory output matrix container + * @param ld_out leading dimention of output + * @param stride_out stride distance between matrices inside batch + * @param batch_size number of matrices to compute + */ +template +typename sb_handle_t::event_t _omatcopy_batch( + sb_handle_t& sb_handle, char trans, index_t m, index_t n, element_t alpha, + in_t in_memory, index_t ld_in, index_t stride_in, out_t out_memory, + index_t ld_out, index_t stride_out, index_t batch_size, + const typename sb_handle_t::event_t& _dependencies = {}) { + return internal::_matcopy_batch( + sb_handle, trans, m, n, alpha, in_memory, ld_in, stride_in, out_memory, + ld_out, stride_out, batch_size, _dependencies); +} namespace extension { /** @@ -238,10 +310,9 @@ namespace extension { */ template -typename sb_handle_t::event_t _transpose(sb_handle_t& sb_handle, index_t m, - index_t n, in_t A, index_t ld_in, - index_t ld_out, - const typename sb_handle_t::event_t& _dependencies = {}) { +typename sb_handle_t::event_t _transpose( + sb_handle_t& sb_handle, index_t m, index_t n, in_t A, index_t ld_in, + index_t ld_out, const typename sb_handle_t::event_t& _dependencies = {}) { return blas::internal::_transpose(sb_handle, m, n, A, ld_in, A, ld_out, _dependencies); } @@ -267,24 +338,22 @@ typename sb_handle_t::event_t _transpose(sb_handle_t& sb_handle, index_t m, */ template -typename sb_handle_t::event_t _transpose(sb_handle_t& sb_handle, index_t m, - index_t n, in_t A, index_t ld_a, - out_t B, index_t ld_b, - const typename sb_handle_t::event_t& _dependencies = {}) { +typename sb_handle_t::event_t _transpose( + sb_handle_t& sb_handle, index_t m, index_t n, in_t A, index_t ld_a, out_t B, + index_t ld_b, const typename sb_handle_t::event_t& _dependencies = {}) { return blas::internal::_transpose(sb_handle, m, n, A, ld_a, B, ld_b, _dependencies); } template -typename sb_handle_t::event_t _reduction(sb_handle_t& sb_handle, - input_t buffer_in, index_t ld, - output_t buffer_out, index_t rows, - index_t cols, - reduction_dim_t reduction_dim, - const typename sb_handle_t::event_t& _dependencies = {}) { +typename sb_handle_t::event_t _reduction( + sb_handle_t& sb_handle, input_t buffer_in, index_t ld, output_t buffer_out, + index_t rows, index_t cols, reduction_dim_t reduction_dim, + const typename sb_handle_t::event_t& _dependencies = {}) { return blas::internal::_reduction( - sb_handle, buffer_in, ld, buffer_out, rows, cols, reduction_dim, _dependencies); + sb_handle, buffer_in, ld, buffer_out, rows, cols, reduction_dim, + _dependencies); } } // namespace extension diff --git a/include/operations/extension/matcopy_batch.h b/include/operations/extension/matcopy_batch.h new file mode 100644 index 000000000..b09fcc3ca --- /dev/null +++ b/include/operations/extension/matcopy_batch.h @@ -0,0 +1,72 @@ +/*************************************************************************** + * @license + * Copyright (C) Codeplay Software Limited + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * For your convenience, a copy of the License has been included in this + * repository. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SYCL-BLAS: BLAS implementation using SYCL + * + * @filename Matcopy_batch.h + * + **************************************************************************/ + +#ifndef PORTBLAS_EXTENSION_MATCOPY_BATCH_H +#define PORTBLAS_EXTENSION_MATCOPY_BATCH_H + +namespace blas { + +template +struct Matcopy_batch { + public: + using value_t = typename lhs_t::value_t; + using index_t = typename rhs_t::index_t; + + lhs_t lhs_; + rhs_t rhs_1_; + rhs_t rhs_2_; + value_t alpha_, beta_; + index_t m_, n_, lhs_ld_, rhs_1_ld_, rhs_2_ld_, lhs_stride_, rhs_1_stride_, + rhs_2_stride_, batch_size_; + + Matcopy_batch(lhs_t lhs, rhs_t rhs_1, rhs_t rhs_2, value_t alpha, + value_t beta, index_t m, index_t n, index_t lhs_ld, + index_t rhs_ld, index_t rhs_2_ld, index_t lhs_stride, + index_t rhs_stride, index_t rhs_2_stride, index_t batch_size); + index_t get_size() const; + bool valid_thread(cl::sycl::nd_item<1> ndItem) const; + value_t eval(index_t i); + value_t eval(cl::sycl::nd_item<1> ndItem); + template + value_t eval(sharedT shMem, cl::sycl::nd_item<1> ndItem); + void bind(cl::sycl::handler &h); + void adjust_access_displacement(); +}; + +template +Matcopy_batch make_matcopy_batch( + lhs_t lhs, rhs_t rhs_1, rhs_t rhs_2, typename rhs_t::value_t alpha, + typename rhs_t::value_t beta, typename rhs_t::index_t m, + typename rhs_t::index_t n, typename rhs_t::index_t lhs_ld, + typename rhs_t::index_t rhs_ld, typename rhs_t::index_t rhs_2_ld, + typename rhs_t::index_t lhs_stride, typename rhs_t::index_t rhs_stride, + typename rhs_t::index_t rhs_2_stride, typename rhs_t::index_t batch_size) { + return Matcopy_batch( + lhs, rhs_1, rhs_2, alpha, beta, m, n, lhs_ld, rhs_ld, rhs_2_ld, + lhs_stride, rhs_stride, rhs_2_stride, batch_size); +} + +} // namespace blas + +#endif // PORTBLAS_EXTENSION_MATCOPY_BATCH_H diff --git a/include/operations/extension/transpose.h b/include/operations/extension/transpose.h index 471eb6814..c29d85f9e 100644 --- a/include/operations/extension/transpose.h +++ b/include/operations/extension/transpose.h @@ -67,18 +67,26 @@ class Transpose { // Increment value (denoted stride in oneMKL specification) index_t inc_a_; index_t inc_at_; + // Stride values (denoted stride in oneMKL specification for batched + // operations) + index_t stride_a_; + index_t stride_at_; // Minimum number of tiles used to cover matrices rows & columns index_t tile_count_m_; index_t tile_count_n_; // Total number of tiles used to cover the matrix index_t tile_count_total_; - // Inner WG Tiles + // Number of Inner WG Tiles static constexpr const index_t inner_tile_size_ = wg_size / Tile_size; static constexpr const index_t inner_tile_count_ = Tile_size / inner_tile_size_; // Minimum number of Tile-mutliple rows & columns to cover the matrices index_t M_pad_; index_t N_pad_; + // Total size of Tile-mutliple covering matrix + index_t size_pad_; + // Batch size when using batched transpose + index_t batch_size_; // Number of contiguous elements to be used in local memory to avoid bank // conflicts static constexpr index_t get_non_bank_conflict_line_size() { @@ -90,7 +98,9 @@ class Transpose { return get_non_bank_conflict_line_size() / Tile_size; } - Transpose(in_t &A, index_t &inc_a, out_t &At, index_t &inc_at, value_t &alpha) + Transpose(in_t &A, index_t &inc_a, index_t &stride_a, out_t &At, + index_t &inc_at, index_t &stride_at, value_t &alpha, + index_t &batch_size) : A_(A), At_(At), lda_(A_.getSizeL()), @@ -102,9 +112,13 @@ class Transpose { tile_count_n_((N_ - 1) / Tile_size + 1), tile_count_total_(tile_count_m_ * tile_count_n_), inc_a_(inc_a), + stride_a_(stride_a), + stride_at_(stride_at), inc_at_(inc_at), M_pad_(tile_count_m_ * Tile_size), - N_pad_(tile_count_n_ * Tile_size) {} + N_pad_(tile_count_n_ * Tile_size), + size_pad_(M_pad_ * N_pad_), + batch_size_(batch_size) {} index_t get_size() const; @@ -119,7 +133,7 @@ class Transpose { index_t &out_local_idx, index_t &i_block_start, index_t &j_block_start, index_t &il, index_t &jl); void get_indices(cl::sycl::nd_item<1> id, index_t &in_idx, index_t &out_idx, - index_t &il, index_t &jl); + index_t &i, index_t &j); }; /*! @@ -130,10 +144,12 @@ template Transpose -make_transpose(in_t &A, index_t inc_a, out_t &At, index_t inc_a_t, - element_t &alpha) { +make_transpose(in_t &A, index_t inc_a, index_t &stride_a, out_t &At, + index_t inc_a_t, index_t &stride_at, element_t &alpha, + index_t &batch_size) { return Transpose(A, inc_a, At, inc_a_t, alpha); + out_t, element_t>(A, inc_a, stride_a, At, inc_a_t, stride_at, + alpha, batch_size); } /*! diff --git a/include/portblas.h b/include/portblas.h index e289c55ce..1c212166a 100644 --- a/include/portblas.h +++ b/include/portblas.h @@ -53,6 +53,8 @@ #include "operations/extension/transpose.h" +#include "operations/extension/matcopy_batch.h" + #include "operations/blas_constants.h" #include "operations/blas_operators.h" diff --git a/src/interface/extension/CMakeLists.txt b/src/interface/extension/CMakeLists.txt index a1269f10d..6e9ff9fff 100644 --- a/src/interface/extension/CMakeLists.txt +++ b/src/interface/extension/CMakeLists.txt @@ -26,4 +26,6 @@ generate_blas_objects(extension matcopy) generate_blas_objects(extension transpose) generate_blas_objects(extension omatadd) +generate_blas_objects(extension matcopy_batch) + generate_blas_reduction_objects(extension reduction) diff --git a/src/interface/extension/backend/amd_gpu.hpp b/src/interface/extension/backend/amd_gpu.hpp index 8ced784df..fed86006e 100644 --- a/src/interface/extension/backend/amd_gpu.hpp +++ b/src/interface/extension/backend/amd_gpu.hpp @@ -34,17 +34,18 @@ template typename sb_handle_t::event_t _transpose_outplace( sb_handle_t& sb_handle, index_t _M, index_t _N, element_t _alpha, - container_0_t in_, index_t _ld_in, index_t _inc_in, container_1_t out_, - index_t _ld_out, index_t _inc_out, const typename sb_handle_t::event_t& _dependencies) { + container_0_t in_, index_t _ld_in, index_t _inc_in, index_t _stride_in, + container_1_t out_, index_t _ld_out, index_t _inc_out, index_t _stride_out, + index_t _batch_size, const typename sb_handle_t::event_t& _dependencies) { if (_M * _N > (1 << 18)) { return blas::internal::_transpose_outplace_impl<16, 256, 64, true>( - sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, out_, _ld_out, - _inc_out, _dependencies); + sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, _stride_in, out_, + _ld_out, _inc_out, _stride_out, _batch_size, _dependencies); } else { return blas::internal::_transpose_outplace_impl<16, 64, 64, true>( - sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, out_, _ld_out, - _inc_out, _dependencies); + sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, _stride_in, out_, + _ld_out, _inc_out, _stride_out, _batch_size, _dependencies); } } @@ -70,6 +71,36 @@ typename sb_handle_t::event_t _transpose_add( } // namespace backend } // namespace transpose + +namespace matcopy_batch { +namespace backend { +template +typename std::enable_if::type +_matcopy_batch(sb_handle_t& sb_handle, index_t m, index_t n, element_t alpha, + in_t in_memory, index_t ld_in, index_t in_stride, + out_t out_memory, index_t ld_out, index_t out_stride, + index_t batch_size, + const typename sb_handle_t::event_t& _dependencies) { + if ((m * n) > (1 << 20)) { + return blas::internal::_matcopy_batch_impl<32, 4, sb_handle_t, element_t, + index_t, in_t, out_t>( + sb_handle, m, n, alpha, in_memory, ld_in, in_stride, out_memory, ld_out, + out_stride, batch_size, _dependencies); + } else if ((m * n) > (1 << 14)) { + return blas::internal::_matcopy_batch_impl<8, 16, sb_handle_t, element_t, + index_t, in_t, out_t>( + sb_handle, m, n, alpha, in_memory, ld_in, in_stride, out_memory, ld_out, + out_stride, batch_size, _dependencies); + } else { + return blas::internal::_matcopy_batch_impl<4, 64, sb_handle_t, element_t, + index_t, in_t, out_t>( + sb_handle, m, n, alpha, in_memory, ld_in, in_stride, out_memory, ld_out, + out_stride, batch_size, _dependencies); + } +} +} // namespace backend +} // namespace matcopy_batch } // namespace blas #endif diff --git a/src/interface/extension/backend/default_cpu.hpp b/src/interface/extension/backend/default_cpu.hpp index f73e9ec1f..936a1f651 100644 --- a/src/interface/extension/backend/default_cpu.hpp +++ b/src/interface/extension/backend/default_cpu.hpp @@ -34,16 +34,17 @@ template typename sb_handle_t::event_t _transpose_outplace( sb_handle_t& sb_handle, index_t _M, index_t _N, element_t _alpha, - container_0_t in_, index_t _ld_in, index_t _inc_in, container_1_t out_, - index_t _ld_out, index_t _inc_out, const typename sb_handle_t::event_t& _dependencies) { + container_0_t in_, index_t _ld_in, index_t _inc_in, index_t _stride_in, + container_1_t out_, index_t _ld_out, index_t _inc_out, index_t _stride_out, + index_t _batch_size, const typename sb_handle_t::event_t& _dependencies) { if (_M * _N < (1 << 20)) { return blas::internal::_transpose_outplace_impl<16, 64, 64, false>( - sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, out_, _ld_out, - _inc_out, _dependencies); + sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, _stride_in, out_, + _ld_out, _inc_out, _stride_out, _batch_size, _dependencies); } else { return blas::internal::_transpose_outplace_impl<32, 128, 64, false>( - sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, out_, _ld_out, - _inc_out, _dependencies); + sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, _stride_in, out_, + _ld_out, _inc_out, _stride_out, _batch_size, _dependencies); } } @@ -69,6 +70,24 @@ typename sb_handle_t::event_t _transpose_add( } // namespace backend } // namespace transpose + +namespace matcopy_batch { +namespace backend { +template +typename std::enable_if::type +_matcopy_batch(sb_handle_t& sb_handle, index_t m, index_t n, element_t alpha, + in_t in_memory, index_t ld_in, index_t in_stride, + out_t out_memory, index_t ld_out, index_t out_stride, + index_t batch_size, + const typename sb_handle_t::event_t& _dependencies) { + return blas::internal::_matcopy_batch_impl<1, 8, sb_handle_t, element_t, + index_t, in_t, out_t>( + sb_handle, m, n, alpha, in_memory, ld_in, in_stride, out_memory, ld_out, + out_stride, batch_size, _dependencies); +} +} // namespace backend +} // namespace matcopy_batch } // namespace blas #endif diff --git a/src/interface/extension/backend/intel_gpu.hpp b/src/interface/extension/backend/intel_gpu.hpp index f17049614..bdf6ddaf6 100644 --- a/src/interface/extension/backend/intel_gpu.hpp +++ b/src/interface/extension/backend/intel_gpu.hpp @@ -34,17 +34,18 @@ template typename sb_handle_t::event_t _transpose_outplace( sb_handle_t& sb_handle, index_t _M, index_t _N, element_t _alpha, - container_0_t in_, index_t _ld_in, index_t _inc_in, container_1_t out_, - index_t _ld_out, index_t _inc_out, const typename sb_handle_t::event_t& _dependencies) { + container_0_t in_, index_t _ld_in, index_t _inc_in, index_t _stride_in, + container_1_t out_, index_t _ld_out, index_t _inc_out, index_t _stride_out, + index_t _batch_size, const typename sb_handle_t::event_t& _dependencies) { if (_M * _N > (1 << 18)) { return blas::internal::_transpose_outplace_impl<32, 256, 128, true>( - sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, out_, _ld_out, - _inc_out, _dependencies); + sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, _stride_in, out_, + _ld_out, _inc_out, _stride_out, _batch_size, _dependencies); } else { return blas::internal::_transpose_outplace_impl<16, 64, 64, true>( - sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, out_, _ld_out, - _inc_out, _dependencies); + sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, _stride_in, out_, + _ld_out, _inc_out, _stride_out, _batch_size, _dependencies); } } @@ -70,6 +71,36 @@ typename sb_handle_t::event_t _transpose_add( } // namespace backend } // namespace transpose + +namespace matcopy_batch { +namespace backend { +template +typename std::enable_if::type +_matcopy_batch(sb_handle_t& sb_handle, index_t m, index_t n, element_t alpha, + in_t in_memory, index_t ld_in, index_t in_stride, + out_t out_memory, index_t ld_out, index_t out_stride, + index_t batch_size, + const typename sb_handle_t::event_t& _dependencies) { + if ((m * n) >= (1 << 18)) { + return blas::internal::_matcopy_batch_impl<16, 4, sb_handle_t, element_t, + index_t, in_t, out_t>( + sb_handle, m, n, alpha, in_memory, ld_in, in_stride, out_memory, ld_out, + out_stride, batch_size, _dependencies); + } else if ((m * n) >= (1 << 14)) { + return blas::internal::_matcopy_batch_impl<4, 16, sb_handle_t, element_t, + index_t, in_t, out_t>( + sb_handle, m, n, alpha, in_memory, ld_in, in_stride, out_memory, ld_out, + out_stride, batch_size, _dependencies); + } else { + return blas::internal::_matcopy_batch_impl<1, 256, sb_handle_t, element_t, + index_t, in_t, out_t>( + sb_handle, m, n, alpha, in_memory, ld_in, in_stride, out_memory, ld_out, + out_stride, batch_size, _dependencies); + } +} +} // namespace backend +} // namespace matcopy_batch } // namespace blas #endif diff --git a/src/interface/extension/backend/nvidia_gpu.hpp b/src/interface/extension/backend/nvidia_gpu.hpp index 994c48964..ce4307f3b 100644 --- a/src/interface/extension/backend/nvidia_gpu.hpp +++ b/src/interface/extension/backend/nvidia_gpu.hpp @@ -34,17 +34,18 @@ template typename sb_handle_t::event_t _transpose_outplace( sb_handle_t& sb_handle, index_t _M, index_t _N, element_t _alpha, - container_0_t in_, index_t _ld_in, index_t _inc_in, container_1_t out_, - index_t _ld_out, index_t _inc_out, const typename sb_handle_t::event_t& _dependencies) { + container_0_t in_, index_t _ld_in, index_t _inc_in, index_t _stride_in, + container_1_t out_, index_t _ld_out, index_t _inc_out, index_t _stride_out, + index_t _batch_size, const typename sb_handle_t::event_t& _dependencies) { if (_M * _N > (1 << 18)) { return blas::internal::_transpose_outplace_impl<32, 512, 128, true>( - sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, out_, _ld_out, - _inc_out, _dependencies); + sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, _stride_in, out_, + _ld_out, _inc_out, _stride_out, _batch_size, _dependencies); } else { return blas::internal::_transpose_outplace_impl<32, 128, 128, true>( - sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, out_, _ld_out, - _inc_out, _dependencies); + sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, _stride_in, out_, + _ld_out, _inc_out, _stride_out, _batch_size, _dependencies); } } @@ -70,6 +71,34 @@ typename sb_handle_t::event_t _transpose_add( } // namespace backend } // namespace transpose -} // namespace blas +namespace matcopy_batch { +namespace backend { +template +typename std::enable_if::type +_matcopy_batch(sb_handle_t& sb_handle, index_t m, index_t n, element_t alpha, + in_t in_memory, index_t ld_in, index_t in_stride, + out_t out_memory, index_t ld_out, index_t out_stride, + index_t batch_size, const typename sb_handle_t::event_t& _dependencies) { + if ((m * n) > (1 << 20)) { + return blas::internal::_matcopy_batch_impl<32, 4, sb_handle_t, element_t, + index_t, in_t, out_t>( + sb_handle, m, n, alpha, in_memory, ld_in, in_stride, out_memory, ld_out, + out_stride, batch_size, _dependencies); + } else if ((m * n) > (1 << 14)) { + return blas::internal::_matcopy_batch_impl<8, 16, sb_handle_t, element_t, + index_t, in_t, out_t>( + sb_handle, m, n, alpha, in_memory, ld_in, in_stride, out_memory, ld_out, + out_stride, batch_size, _dependencies); + } else { + return blas::internal::_matcopy_batch_impl<4, 64, sb_handle_t, element_t, + index_t, in_t, out_t>( + sb_handle, m, n, alpha, in_memory, ld_in, in_stride, out_memory, ld_out, + out_stride, batch_size, _dependencies); + } +} +} // namespace backend +} // namespace matcopy_batch +} // namespace blas #endif diff --git a/src/interface/extension/matcopy_batch.cpp.in b/src/interface/extension/matcopy_batch.cpp.in new file mode 100644 index 000000000..7fd441947 --- /dev/null +++ b/src/interface/extension/matcopy_batch.cpp.in @@ -0,0 +1,87 @@ +/*************************************************************************** + * + * @license + * Copyright (C) Codeplay Software Limited + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * For your convenience, a copy of the License has been included in this + * repository. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SYCL-BLAS: BLAS implementation using SYCL + * + * @filename matcopy.cpp.in + * + **************************************************************************/ + +#include "interface/extension_interface.hpp" +#include "sb_handle/kernel_constructor.hpp" +#include "sb_handle/portblas_handle.hpp" +#include "operations/extension/matcopy_batch.hpp" +#include "operations/extension/transpose.hpp" + +namespace blas { +namespace internal { + +// This function represents both imatcopy and omatcopy operators +template typename SB_Handle::event_t _matcopy_batch( + SB_Handle& sb_handle, char trans, ${INDEX_TYPE} m, ${INDEX_TYPE} n, + ${DATA_TYPE} alpha, BufferIterator<${DATA_TYPE}> in_memory, ${INDEX_TYPE} ld_in, + ${INDEX_TYPE} in_stride, BufferIterator<${DATA_TYPE}> out_memory, ${INDEX_TYPE} ld_out, + ${INDEX_TYPE} out_stride, ${INDEX_TYPE} batch_size, + const typename SB_Handle::event_t& _dependencies); + + +// This function represents both imatcopy and omatcopy operators +template typename SB_Handle::event_t _matcopy_batch( + SB_Handle& sb_handle, char trans, ${INDEX_TYPE} m, ${INDEX_TYPE} n, + ${DATA_TYPE} alpha, BufferIterator<${DATA_TYPE}> in_memory, ${INDEX_TYPE} ld_in, + ${INDEX_TYPE} in_stride, BufferIterator<${DATA_TYPE}> out_memory, ${INDEX_TYPE} ld_out, + ${INDEX_TYPE} out_stride, ${INDEX_TYPE} batch_size, + const typename SB_Handle::event_t& _dependencies); + +#ifdef SB_ENABLE_USM +// This function represents both imatcopy and omatcopy operators +template typename SB_Handle::event_t _matcopy_batch( + SB_Handle& sb_handle, char trans, ${INDEX_TYPE} m, ${INDEX_TYPE} n, + ${DATA_TYPE} alpha, const ${DATA_TYPE}* in_memory, ${INDEX_TYPE} ld_in, + ${INDEX_TYPE} in_stride, ${DATA_TYPE}* out_memory, ${INDEX_TYPE} ld_out, + ${INDEX_TYPE} out_stride, ${INDEX_TYPE} batch_size, + const typename SB_Handle::event_t& _dependencies); + +// This function represents both imatcopy and omatcopy operators +template typename SB_Handle::event_t _matcopy_batch( + SB_Handle& sb_handle, char trans, ${INDEX_TYPE} m, ${INDEX_TYPE} n, + ${DATA_TYPE} alpha, ${DATA_TYPE}* in_memory, ${INDEX_TYPE} ld_in, + ${INDEX_TYPE} in_stride, ${DATA_TYPE}* out_memory, ${INDEX_TYPE} ld_out, + ${INDEX_TYPE} out_stride, ${INDEX_TYPE} batch_size, + const typename SB_Handle::event_t& _dependencies); + +// This function represents both imatcopy and omatcopy operators +template typename SB_Handle::event_t _matcopy_batch( + SB_Handle& sb_handle, char trans, ${INDEX_TYPE} m, ${INDEX_TYPE} n, + ${DATA_TYPE} alpha, const ${DATA_TYPE}* in_memory, ${INDEX_TYPE} ld_in, + ${INDEX_TYPE} in_stride, ${DATA_TYPE}* out_memory, ${INDEX_TYPE} ld_out, + ${INDEX_TYPE} out_stride, ${INDEX_TYPE} batch_size, + const typename SB_Handle::event_t& _dependencies); + +// This function represents both imatcopy and omatcopy operators +template typename SB_Handle::event_t _matcopy_batch( + SB_Handle& sb_handle, char trans, ${INDEX_TYPE} m, ${INDEX_TYPE} n, + ${DATA_TYPE} alpha, ${DATA_TYPE}* in_memory, ${INDEX_TYPE} ld_in, + ${INDEX_TYPE} in_stride, ${DATA_TYPE}* out_memory, ${INDEX_TYPE} ld_out, + ${INDEX_TYPE} out_stride, ${INDEX_TYPE} batch_size, + const typename SB_Handle::event_t& _dependencies); +#endif + +} // namespace internal +} // namespace blas diff --git a/src/interface/extension_interface.hpp b/src/interface/extension_interface.hpp index c9c63a0c0..d90966099 100644 --- a/src/interface/extension_interface.hpp +++ b/src/interface/extension_interface.hpp @@ -31,10 +31,11 @@ #include "interface/extension_interface.h" #include "operations/blas1_trees.h" #include "operations/blas_operators.hpp" +#include "operations/extension/matcopy_batch.h" #include "operations/extension/reduction.h" #include "operations/extension/transpose.h" -#include "sb_handle/portblas_handle.h" #include "portblas_helper.h" +#include "sb_handle/portblas_handle.h" #include "views/view.h" namespace blas { @@ -58,25 +59,26 @@ template typename sb_handle_t::event_t _transpose_outplace_impl( sb_handle_t& sb_handle, index_t _M, index_t _N, element_t _alpha, - container_0_t in_, index_t _ld_in, index_t _inc_in, container_1_t out_, - index_t _ld_out, index_t _inc_out, const typename sb_handle_t::event_t& _dependencies) { + container_0_t in_, index_t _ld_in, index_t _inc_in, index_t _stride_in, + container_1_t out_, index_t _ld_out, index_t _inc_out, index_t _stride_out, + index_t _batch_size, const typename sb_handle_t::event_t& _dependencies) { constexpr const index_t num_line_elems = std::max(Tile_size, static_cast(cl_size / sizeof(element_t))); constexpr const index_t num_tiles_per_line = num_line_elems / Tile_size; // Matrix Views auto in_view = make_matrix_view(in_, _M, _N, _ld_in); - auto out_view = - make_matrix_view(out_, _M, _N, _ld_out); + auto out_view = make_matrix_view(out_, _M, _N, _ld_out); // Work items & groups sizes index_t n_wg = ((_M - 1) / Tile_size + 1) * ((_N - 1) / Tile_size + 1); - index_t global_size = n_wg * wg_size; + index_t global_size = n_wg * wg_size * _batch_size; // Transpose expression Tree auto trans_scale_tree = make_transpose( - in_view, _inc_in, out_view, _inc_out, _alpha); + in_view, _inc_in, _stride_in, out_view, _inc_out, _stride_out, _alpha, + _batch_size); if constexpr (local_memory) { index_t local_mem = static_cast((num_line_elems + 1) * Tile_size / @@ -96,19 +98,20 @@ template typename std::enable_if::type _matcopy_impl(sb_handle_t& sb_handle, index_t m, index_t n, element_t alpha, - in_t in_memory, index_t ld_in, index_t inc_in, out_t out_memory, - index_t ld_out, index_t inc_out, const typename sb_handle_t::event_t& _dependencies) { + in_t in_memory, index_t ld_in, index_t inc_in, index_t stride_in, + out_t out_memory, index_t ld_out, index_t inc_out, + index_t stride_out, index_t batch_size, + const typename sb_handle_t::event_t& _dependencies) { if constexpr (!in_place) { return blas::transpose::backend::_transpose_outplace< sb_handle_t, in_t, out_t, element_t, index_t>( - sb_handle, m, n, alpha, in_memory, ld_in, inc_in, out_memory, ld_out, - inc_out, _dependencies); + sb_handle, m, n, alpha, in_memory, ld_in, inc_in, stride_in, out_memory, + ld_out, inc_out, stride_out, batch_size, _dependencies); } else { // TODO // In-place transpose not implemented. - typename sb_handle_t::event_t ret; - return ret; + throw std::runtime_error("In-place transpose not implemented."); } } @@ -121,12 +124,15 @@ template ::type _matcopy_impl(sb_handle_t& sb_handle, index_t m, index_t n, element_t alpha, in_t in_memory, index_t ld_in, index_t inc_in, out_t out_memory, - index_t ld_out, index_t inc_out, const typename sb_handle_t::event_t& _dependencies) { + index_t ld_out, index_t inc_out, + const typename sb_handle_t::event_t& _dependencies) { typename sb_handle_t::event_t ret; typename MatrixViewType::type in_view = - make_matrix_view(in_memory, m, n, ld_in, inc_in); - typename MatrixViewType::type out_view = - make_matrix_view(out_memory, m, n, ld_out, inc_out); + make_matrix_view( + in_memory, m, n, ld_in, inc_in); + typename MatrixViewType::type + out_view = make_matrix_view( + out_memory, m, n, ld_out, inc_out); // if alpha=1 no need to multiply if (alpha == 1) { auto copy_op = make_op(out_view, in_view); @@ -143,31 +149,58 @@ template typename std::enable_if::type _matcopy_impl(sb_handle_t& sb_handle, index_t m, index_t n, element_t alpha, - in_t in_memory, index_t ld_in, index_t inc_in, out_t out_memory, - index_t ld_out, index_t inc_out, const typename sb_handle_t::event_t& _dependencies) { + in_t in_memory, index_t ld_in, index_t inc_in, index_t stride_in, + out_t out_memory, index_t ld_out, index_t inc_out, + index_t stride_out, index_t batch_size, + const typename sb_handle_t::event_t& _dependencies) { + // if alpha=1 no need to multiply if (inc_in == 1 && inc_out == 1) { - return _matcopy_impl(sb_handle, m, n, alpha, - in_memory, ld_in, inc_in, - out_memory, ld_out, inc_out, - _dependencies); + return _matcopy_impl( + sb_handle, m, n, alpha, in_memory, ld_in, inc_in, out_memory, ld_out, + inc_out, _dependencies); } else if (inc_in == 1) { - return _matcopy_impl(sb_handle, m, n, alpha, - in_memory, ld_in, inc_in, - out_memory, ld_out, inc_out, - _dependencies); + return _matcopy_impl( + sb_handle, m, n, alpha, in_memory, ld_in, inc_in, out_memory, ld_out, + inc_out, _dependencies); } else if (inc_out == 1) { - return _matcopy_impl(sb_handle, m, n, alpha, - in_memory, ld_in, inc_in, - out_memory, ld_out, inc_out, - _dependencies); + return _matcopy_impl( + sb_handle, m, n, alpha, in_memory, ld_in, inc_in, out_memory, ld_out, + inc_out, _dependencies); } else { - return _matcopy_impl(sb_handle, m, n, alpha, - in_memory, ld_in, inc_in, - out_memory, ld_out, inc_out, - _dependencies); + return _matcopy_impl( + sb_handle, m, n, alpha, in_memory, ld_in, inc_in, out_memory, ld_out, + inc_out, _dependencies); } } +/** + * @brief Implementation of matrix copy batch operators for non transpose cases. + */ +template +typename sb_handle_t::event_t _matcopy_batch_impl( + sb_handle_t& sb_handle, index_t m, index_t n, element_t alpha, + in_t in_memory, index_t ld_in, index_t in_stride, out_t out_memory, + index_t ld_out, index_t out_stride, index_t batch_size, + const typename sb_handle_t::event_t& _dependencies) { + typename MatrixViewType::type in_view = + make_matrix_view(in_memory, m, n, ld_in); + auto out_view = make_matrix_view(out_memory, m, n, ld_out); + const element_t beta = 0; + const index_t ld_b = 0; + const index_t stride_b = 0; + auto copy_batch_tree = make_matcopy_batch( + out_view, in_view, in_view, alpha, beta, m, n, ld_out, ld_in, ld_b, + out_stride, in_stride, stride_b, batch_size); + constexpr index_t local_size = TileSize * TilePerWG; + const index_t tile_per_matrix = + (((m - 1) / TileSize) + 1) * (((n - 1) / TileSize) + 1); + const index_t wg_size = (tile_per_matrix - 1) / TilePerWG + 1; + const index_t global_size = (wg_size)*local_size * batch_size; + return sb_handle.execute(copy_batch_tree, local_size, global_size, + _dependencies); +} + /*! * @brief Wrapper around Transpose-Add. Creates the views, then makes and * launches Transpose Add kernel @@ -280,7 +313,8 @@ template typename sb_handle_t::event_t launch_type_based_reduction( sb_handle_t& sb_handle, input_t buffer_in, index_t ld, output_t buffer_out, - index_t rows, index_t cols, const typename SB_Handle::event_t& dependencies) { + index_t rows, index_t cols, + const typename SB_Handle::event_t& dependencies) { #ifdef POWER_VR constexpr int ClSize = 32; constexpr int WgSize = 64; @@ -327,21 +361,21 @@ typename sb_handle_t::event_t launch_type_based_reduction( /* 1st step */ auto reduction = blas::make_reduction(matrix_buffer_in, temp_); - reduction_event = - concatenate_vectors(reduction_event, sb_handle.execute(reduction, dependencies)); + reduction_event = concatenate_vectors( + reduction_event, sb_handle.execute(reduction, dependencies)); /* 2nd step */ auto reduction_step_2 = blas::make_reduction::type, params_t>(temp_, matrix_buffer_out); - reduction_event = concatenate_vectors(reduction_event, - sb_handle.execute(reduction_step_2, reduction_event)); + reduction_event = concatenate_vectors( + reduction_event, sb_handle.execute(reduction_step_2, reduction_event)); } else { /* 1-step reduction */ auto reduction = blas::make_reduction( matrix_buffer_in, matrix_buffer_out); - reduction_event = - concatenate_vectors(reduction_event, sb_handle.execute(reduction, dependencies)); + reduction_event = concatenate_vectors( + reduction_event, sb_handle.execute(reduction, dependencies)); } return reduction_event; @@ -349,39 +383,68 @@ typename sb_handle_t::event_t launch_type_based_reduction( template -typename sb_handle_t::event_t _matcopy(sb_handle_t& sb_handle, char trans, - index_t m, index_t n, element_t alpha, - in_t in_memory, index_t ld_in, - index_t inc_in, out_t out_memory, - index_t ld_out, index_t inc_out, - const typename sb_handle_t::event_t& _dependencies) { +typename sb_handle_t::event_t _matcopy( + sb_handle_t& sb_handle, char trans, index_t m, index_t n, element_t alpha, + in_t in_memory, index_t ld_in, index_t inc_in, out_t out_memory, + index_t ld_out, index_t inc_out, + const typename sb_handle_t::event_t& _dependencies) { // bail out early if the leading dimensions are not correct if (ld_in < (inc_in * (m - 1) + 1) || (ld_out - 1) < (trans == 't' ? inc_out * (n - 1) : inc_out * (m - 1))) { - typename sb_handle_t::event_t ret; - return ret; + throw std::invalid_argument("invalid ld_in and/or ld_out, inc_out, inc_in"); + } + + const index_t stride = 1; + const index_t batch_size = 1; + + if (trans == 't') { + return _matcopy_impl( + sb_handle, m, n, alpha, in_memory, ld_in, inc_in, stride, out_memory, + ld_out, inc_out, stride, index_t(1), _dependencies); + } else { + return _matcopy_impl( + sb_handle, m, n, alpha, in_memory, ld_in, inc_in, stride, out_memory, + ld_out, inc_out, stride, batch_size, _dependencies); } +} + +template +typename sb_handle_t::event_t _matcopy_batch( + sb_handle_t& sb_handle, char trans, index_t m, index_t n, element_t alpha, + in_t in_memory, index_t ld_in, index_t stride_in, out_t out_memory, + index_t ld_out, index_t stride_out, index_t batch_size, + const typename sb_handle_t::event_t& _dependencies) { + // bail out early if the leading dimensions / strides are not correct + if (ld_in < m || (ld_out < (trans == 't' ? n : m))) { + throw std::invalid_argument("invalid ld_in and/or ld_out"); + } + if ((stride_in < ld_in * n) || + (stride_out < (ld_out * (trans == 't' ? m : n)))) { + throw std::invalid_argument("invalid stride_in and/or stride_out"); + } + + const index_t increment = 1; if (trans == 't') { - return _matcopy_impl(sb_handle, m, n, alpha, in_memory, - ld_in, inc_in, out_memory, ld_out, - inc_out, _dependencies); + return _matcopy_impl( + sb_handle, m, n, alpha, in_memory, ld_in, increment, stride_in, + out_memory, ld_out, increment, stride_out, batch_size, _dependencies); } else { - return _matcopy_impl(sb_handle, m, n, alpha, in_memory, - ld_in, inc_in, out_memory, ld_out, - inc_out, _dependencies); + return blas::matcopy_batch::backend::_matcopy_batch( + sb_handle, m, n, alpha, in_memory, ld_in, stride_in, out_memory, ld_out, + stride_out, batch_size, _dependencies); } } template -typename sb_handle_t::event_t _omatadd(sb_handle_t& sb_handle, char trans_a, - char trans_b, index_t m, index_t n, - element_t alpha, container_0_t a, - index_t lda, element_t beta, - container_1_t b, index_t ldb, - container_2_t c, index_t ldc, - const typename sb_handle_t::event_t& _dependencies) { + typename container_0_t, typename container_1_t, + typename container_2_t> +typename sb_handle_t::event_t _omatadd( + sb_handle_t& sb_handle, char trans_a, char trans_b, index_t m, index_t n, + element_t alpha, container_0_t a, index_t lda, element_t beta, + container_1_t b, index_t ldb, container_2_t c, index_t ldc, + const typename sb_handle_t::event_t& _dependencies) { if (trans_a == 't') { if (trans_b == 't') { return _omatadd_impl(sb_handle, m, n, alpha, a, lda, beta, b, @@ -405,41 +468,39 @@ typename sb_handle_t::event_t _omatadd(sb_handle_t& sb_handle, char trans_a, template -typename sb_handle_t::event_t _transpose(sb_handle_t& sb_handle, index_t m, - index_t n, in_t A, index_t ld_a, - out_t B, index_t ld_b, - const typename sb_handle_t::event_t& _dependencies) { +typename sb_handle_t::event_t _transpose( + sb_handle_t& sb_handle, index_t m, index_t n, in_t A, index_t ld_a, out_t B, + index_t ld_b, const typename sb_handle_t::event_t& _dependencies) { // bail out early if the leading dimensions are not correct if (ld_a < m || ld_b < n) { typename sb_handle_t::event_t ret; return ret; } + const element_t alpha = 1; const index_t inc = 1; - const element_t alpha = element_t(1); + const index_t stride = 1; + const index_t batch_size = 1; - return _matcopy_impl(sb_handle, m, n, alpha, A, ld_a, inc, B, - ld_b, inc, _dependencies); + return _matcopy_impl(sb_handle, m, n, (float)1.0, A, ld_a, + inc, stride, B, ld_b, inc, stride, + batch_size, _dependencies); } template -typename sb_handle_t::event_t _reduction(sb_handle_t& sb_handle, - input_t buffer_in, index_t ld, - output_t buffer_out, index_t rows, - index_t cols, - reduction_dim_t reduction_dim, - const typename sb_handle_t::event_t& dependencies) { +typename sb_handle_t::event_t _reduction( + sb_handle_t& sb_handle, input_t buffer_in, index_t ld, output_t buffer_out, + index_t rows, index_t cols, reduction_dim_t reduction_dim, + const typename sb_handle_t::event_t& dependencies) { if (reduction_dim == reduction_dim_t::inner) { return launch_type_based_reduction(sb_handle, buffer_in, ld, - buffer_out, rows, cols, - dependencies); + element_t>( + sb_handle, buffer_in, ld, buffer_out, rows, cols, dependencies); } else { // reduction_dim_t::outer return launch_type_based_reduction(sb_handle, buffer_in, ld, - buffer_out, rows, cols, - dependencies); + element_t>( + sb_handle, buffer_in, ld, buffer_out, rows, cols, dependencies); } } diff --git a/src/operations/extension/matcopy_batch.hpp b/src/operations/extension/matcopy_batch.hpp new file mode 100644 index 000000000..2d13d332f --- /dev/null +++ b/src/operations/extension/matcopy_batch.hpp @@ -0,0 +1,171 @@ +/*************************************************************************** + * @license + * Copyright (C) Codeplay Software Limited + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * For your convenience, a copy of the License has been included in this + * repository. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SYCL-BLAS: BLAS implementation using SYCL + * + * @filename matcopy_batch.hpp + * + **************************************************************************/ + +#ifndef PORTBLAS_EXTENSION_MATCOPY_BATCH_HPP +#define PORTBLAS_EXTENSION_MATCOPY_BATCH_HPP + +#include "operations/extension/matcopy_batch.h" + +namespace blas { + +template +Matcopy_batch::Matcopy_batch( + lhs_t lhs, rhs_t rhs_1, rhs_t rhs_2, typename lhs_t::value_t alpha, + typename lhs_t::value_t beta, typename rhs_t::index_t m, + typename rhs_t::index_t n, typename rhs_t::index_t lhs_ld, + typename rhs_t::index_t rhs_ld, typename rhs_t::index_t rhs_2_ld, + typename rhs_t::index_t lhs_stride, typename rhs_t::index_t rhs_stride, + typename rhs_t::index_t rhs_2_stride, typename rhs_t::index_t batch_size) + : lhs_(lhs), + rhs_1_(rhs_1), + rhs_2_(rhs_2), + alpha_(alpha), + beta_(beta), + m_(m), + n_(n), + lhs_ld_(lhs_ld), + rhs_1_ld_(rhs_ld), + rhs_2_ld_(rhs_2_ld), + lhs_stride_(lhs_stride), + rhs_1_stride_(rhs_stride), + rhs_2_stride_(rhs_2_stride), + batch_size_(batch_size) {} + +template +typename lhs_t::value_t Matcopy_batch::eval( + index_t i) {} + +template +typename lhs_t::value_t Matcopy_batch::eval( + cl::sycl::nd_item<1> ndItem) { + const index_t m{m_}; + const index_t n{n_}; + + const index_t required_tile = + (((m - 1) / TileSize) + 1) * (((n - 1) / TileSize) + 1); + + const index_t tile_for_matrix = ((required_tile - 1) / TilePerWG) + 1; + + const index_t wg_batch_id = + (ndItem.get_group(0)) / ((required_tile - 1) / TilePerWG + 1); + + const index_t l_lhs_stride = lhs_stride_; + const index_t l_rhs_stride = rhs_1_stride_; + + const index_t number_of_block_per_row = ((m_ - 1) / TileSize) + 1; + + const index_t wg_id = ndItem.get_local_id(0) / TileSize + + ((ndItem.get_group(0) % tile_for_matrix) * TilePerWG); + + /* row tile id per work group */ + const index_t tile_id_row = wg_id % number_of_block_per_row; + /* column tile id per work group */ + const index_t tile_id_col = wg_id / number_of_block_per_row; + /* the start position of the tile-row per work group */ + const index_t wg_row = tile_id_row * TileSize; + /* the start position of the tile-column per work group */ + const index_t wg_col = tile_id_col * TileSize; + + const index_t item_id = ndItem.get_local_id(0) % TileSize; + + auto orig_lhs = lhs_.get_pointer() + (wg_batch_id * l_lhs_stride); + auto orig_rhs = rhs_1_.get_pointer() + (wg_batch_id * l_rhs_stride); + + orig_lhs = orig_lhs + wg_row + wg_col * lhs_ld_ + item_id; + orig_rhs = orig_rhs + wg_row + wg_col * rhs_1_ld_ + item_id; + + value_t reg_rhs[TileSize]; + const index_t alpha = alpha_; + + const bool is_internal_block = + (m - wg_row >= TileSize) && (n - wg_col >= TileSize); + + // check for short&large + const bool valid_index = + (item_id > m || (item_id >= (m - wg_row))) ? false : true; + if (!valid_index) return 0; + + if (is_internal_block) { + auto A = orig_rhs; + auto B = orig_lhs; + +#pragma unroll + for (int i = 0; i < TileSize; ++i) { + reg_rhs[i] = A[i * rhs_1_ld_]; + } +#pragma unroll + for (int i = 0; i < TileSize; ++i) { + B[i * lhs_ld_] = alpha * reg_rhs[i]; + } + + } else { + const auto limit_m = m - wg_row; + const auto limit_n = n - wg_col; + auto A = orig_rhs; + auto B = orig_lhs; + + for (int i = 0; i < TileSize; ++i) { + if (i >= limit_n) break; + reg_rhs[i] = A[i * rhs_1_ld_]; + } + for (int i = 0; i < TileSize; ++i) { + if (i >= limit_n) break; + B[i * lhs_ld_] = alpha * reg_rhs[i]; + } + } + + return 0; +} + +template +PORTBLAS_INLINE void Matcopy_batch::bind( + cl::sycl::handler& h) { + lhs_.bind(h); + rhs_1_.bind(h); + rhs_2_.bind(h); +} + +template +PORTBLAS_INLINE void +Matcopy_batch::adjust_access_displacement() { + lhs_.adjust_access_displacement(); + rhs_1_.adjust_access_displacement(); + rhs_2_.adjust_access_displacement(); +} + +template +PORTBLAS_INLINE typename rhs_t::index_t +Matcopy_batch::get_size() const { + return m_ * n_ * batch_size_; +} + +template +PORTBLAS_INLINE bool +Matcopy_batch::valid_thread( + cl::sycl::nd_item<1> ndItem) const { + return true; +} +} // namespace blas + +#endif // PORTBLAS_EXTENSION_MATCOPY_BATCH_HPP diff --git a/src/operations/extension/transpose.hpp b/src/operations/extension/transpose.hpp index 0d3adba54..57be3db27 100644 --- a/src/operations/extension/transpose.hpp +++ b/src/operations/extension/transpose.hpp @@ -54,8 +54,9 @@ template ::get_size() const { - // Smallest TileSize square-multiple containing input/output matrices - return (M_pad_ * N_pad_); + // Smallest TileSize square-multiple containing input/output matrices times + // batch_size + return (size_pad_ * batch_size_); } template void ext_omatcopy(char trans, const index_t m, const index_t n, - const scalar_t alpha, std::vector& A, - const index_t lda, std::vector& B, index_t ldb) { + const scalar_t alpha, const scalar_t* A, const index_t lda, + scalar_t* B, index_t ldb) { if (trans != 't') { for (index_t j = 0; j < n; j++) { for (index_t i = 0; i < m; i++) { @@ -70,9 +70,9 @@ void ext_omatcopy(char trans, const index_t m, const index_t n, */ template void ext_omatcopy2(const char& t, const index_t& m, const index_t& n, - const scalar_t& alpha, std::vector& in_matrix, + const scalar_t& alpha, const scalar_t* in_matrix, const index_t& ld_in, const index_t& inc_in, - std::vector& out_matrix, const index_t& ld_out, + scalar_t* out_matrix, const index_t& ld_out, const index_t inc_out) { if (t == 't') { for (int i = 0; i < m; ++i) { diff --git a/test/unittest/extension/omatcopy2_test.cpp b/test/unittest/extension/omatcopy2_test.cpp index 8b1058f3f..bae7ba0e4 100644 --- a/test/unittest/extension/omatcopy2_test.cpp +++ b/test/unittest/extension/omatcopy2_test.cpp @@ -66,8 +66,8 @@ void run_test(const combination_t combi) { // TODO: There isn't a reference implementation from any library. So we // compare the results with a basic host implementation. Working on a // better comparison. - reference_blas::ext_omatcopy2(trans, m, n, alpha, A_ref, ld_in, inc_in, B_ref, - ld_out, inc_out); + reference_blas::ext_omatcopy2(trans, m, n, alpha, A_ref.data(), ld_in, inc_in, + B_ref.data(), ld_out, inc_out); auto matrix_in = helper::allocate(m_a_size, q); auto matrix_out = helper::allocate(m_b_size, q); diff --git a/test/unittest/extension/omatcopy_batched_test.cpp b/test/unittest/extension/omatcopy_batched_test.cpp new file mode 100644 index 000000000..94fb91d53 --- /dev/null +++ b/test/unittest/extension/omatcopy_batched_test.cpp @@ -0,0 +1,156 @@ +/*************************************************************************** + * + * @license + * Copyright (C) Codeplay Software Limited + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * For your convenience, a copy of the License has been included in this + * repository. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SYCL-BLAS: BLAS implementation using SYCL + * + * @filename omatcopy_batched_test.cpp + * + **************************************************************************/ + +#include "blas_test.hpp" +#include "extension_reference.hpp" + +template +using combination_t = std::tuple; + +template +void run_test(const combination_t combi) { + std::string alloc; + char trans; + index_t m, n, ld_in_m, ld_out_m, stride_in_m, stride_out_m, batch_size; + scalar_t alpha; + + std::tie(alloc, trans, m, n, alpha, ld_in_m, ld_out_m, stride_in_m, + stride_out_m, batch_size) = combi; + + // Compute leading dimensions using second_dim-ld multipliers + index_t ld_in = ld_in_m * m; + index_t ld_out = ld_out_m * (trans == 't' ? n : m); + + index_t size_a = ld_in * n; + index_t size_b = ld_out * (trans == 't' ? m : n); + + // Compute Strides using size-stride multipliers + index_t stride_in = stride_in_m * size_a; + index_t stride_out = stride_out_m * size_b; + + auto q = make_queue(); + blas::SB_Handle sb_handle(q); + + std::vector A(stride_in * batch_size); + std::vector B(stride_out * batch_size, 0); + + fill_random(A); + + std::vector A_ref = A; + std::vector B_ref = B; + + // Reference implementation + for (auto b = 0; b < batch_size; b++) { + reference_blas::ext_omatcopy(trans, m, n, alpha, + A_ref.data() + b * stride_in, ld_in, + B_ref.data() + b * stride_out, ld_out); + } + + auto matrix_in = + helper::allocate(stride_in * batch_size, q); + auto matrix_out = + helper::allocate(stride_out * batch_size, q); + + auto copy_in = helper::copy_to_device(q, A.data(), matrix_in, + stride_in * batch_size); + auto copy_out = helper::copy_to_device(q, B.data(), matrix_out, + stride_out * batch_size); + + auto operator_event = blas::_omatcopy_batch( + sb_handle, trans, m, n, alpha, matrix_in, ld_in, stride_in, matrix_out, + ld_out, stride_out, batch_size, {copy_in, copy_out}); + sb_handle.wait(operator_event); + + auto event = blas::helper::copy_to_host( + sb_handle.get_queue(), matrix_out, B.data(), stride_out * batch_size); + sb_handle.wait(event); + + // Validate the result + const bool isAlmostEqual = utils::compare_vectors(B, B_ref); + ASSERT_TRUE(isAlmostEqual); +} + +template +void run_test(const combination_t combi) { + std::string alloc; + char trans; + index_t m, n, ld_in_m, ld_out_m, stride_in_m, stride_out_m, batch_size; + scalar_t alpha; + + std::tie(alloc, trans, m, n, alpha, ld_in_m, ld_out_m, stride_in_m, + stride_out_m, batch_size) = combi; + + if (alloc == "usm") { +#ifdef SB_ENABLE_USM + run_test(combi); +#else + GTEST_SKIP(); +#endif + } else { + run_test(combi); + } +} + +#ifdef STRESS_TESTING +template +const auto combi = + ::testing::Combine(::testing::Values("usm", "buf"), + ::testing::Values('n', 't'), // trans + ::testing::Values(1024, 4050, 16380), // m + ::testing::Values(1024, 4050, 16380), // n + ::testing::Values(0, 1.05, -20.01), // alpha + ::testing::Values(3, 5), // ld_in_m + ::testing::Values(3, 5), // ld_out_m + ::testing::Values(5, 10), // stride_in_m + ::testing::Values(5, 10), // stride_out_m + ::testing::Values(10, 21)); // batch_size +#else +template +const auto combi = + ::testing::Combine(::testing::Values("usm", "buf"), + ::testing::Values('n', 't'), // trans + ::testing::Values(64, 129, 255), // m + ::testing::Values(64, 129, 255), // n + ::testing::Values(0, 2), // alpha + ::testing::Values(1, 2, 3), // ld_in_m + ::testing::Values(1, 2, 3), // ld_out_m + ::testing::Values(1, 3), // stride_in_m + ::testing::Values(1, 3), // stride_out_m + ::testing::Values(1, 2, 5)); // batch_size +#endif + +template +static std::string generate_name( + const ::testing::TestParamInfo>& info) { + std::string alloc; + char trans; + index_t m, n, ld_in_m, ld_out_m, stride_in_m, stride_out_m, batch_size; + T alpha; + BLAS_GENERATE_NAME(info.param, alloc, trans, m, n, alpha, ld_in_m, ld_out_m, + stride_in_m, stride_out_m, batch_size); +} + +BLAS_REGISTER_TEST_ALL(OmatCopyBatched, combination_t, combi, generate_name); diff --git a/test/unittest/extension/omatcopy_test.cpp b/test/unittest/extension/omatcopy_test.cpp index 4f0eb1e35..a7a1e49a8 100644 --- a/test/unittest/extension/omatcopy_test.cpp +++ b/test/unittest/extension/omatcopy_test.cpp @@ -58,7 +58,8 @@ void run_test(const combination_t combi) { std::vector B_ref = B; // Reference implementation - reference_blas::ext_omatcopy(trans, m, n, alpha, A_ref, ld_in, B_ref, ld_out); + reference_blas::ext_omatcopy(trans, m, n, alpha, A_ref.data(), ld_in, + B_ref.data(), ld_out); auto matrix_in = helper::allocate(size_a, q); auto matrix_out = helper::allocate(size_b, q); @@ -115,7 +116,7 @@ const auto combi = ::testing::Values(1024, 4050, 16380), // n ::testing::Values(0, 1.05, 2.01), // alpha ::testing::Values(1, 3), // ld_in_m - ::testing::Values(1, 3)); // ld_in_n + ::testing::Values(1, 3)); // ld_out_m #else template const auto combi = @@ -125,7 +126,7 @@ const auto combi = ::testing::Values(64, 129, 255), // n ::testing::Values(0, 1, 2), // alpha ::testing::Values(1, 2, 3), // ld_in_m - ::testing::Values(1, 2, 3)); // ld_in_n + ::testing::Values(1, 2, 3)); // ld_out_m #endif template diff --git a/test/unittest/extension/transpose_test.cpp b/test/unittest/extension/transpose_test.cpp index ebbc54858..d3c6a0163 100644 --- a/test/unittest/extension/transpose_test.cpp +++ b/test/unittest/extension/transpose_test.cpp @@ -87,7 +87,8 @@ void run_test(const combination_t& combi) { helper::deallocate(matrix_out, q); } else { - // Inplace Transpose: TODO + // Inplace Transpose currently disabled (TODO) + GTEST_SKIP(); } }