From 796efc2822faa653905663d55f603f3a79db8002 Mon Sep 17 00:00:00 2001 From: Ouadie EL FAROUKI Date: Fri, 16 Jun 2023 12:32:26 +0100 Subject: [PATCH] Added CL & WG size as transposeAdd params & switched to per backend config --- include/interface/transpose_launcher.h | 3 +- include/operations/extension/transpose.h | 49 ++++++++---- src/interface/extension/backend/amd_gpu.hpp | 21 +++++ .../extension/backend/default_cpu.hpp | 14 ++++ src/interface/extension/backend/intel_gpu.hpp | 21 +++++ .../extension/backend/nvidia_gpu.hpp | 21 +++++ .../extension/transpose_launcher.cpp.in | 9 +-- src/interface/extension_interface.hpp | 45 +---------- src/interface/transpose_launcher.hpp | 7 +- src/operations/extension/transpose.hpp | 76 ++++++++++--------- 10 files changed, 165 insertions(+), 101 deletions(-) diff --git a/include/interface/transpose_launcher.h b/include/interface/transpose_launcher.h index 8acc75a9a..06f6f82df 100644 --- a/include/interface/transpose_launcher.h +++ b/include/interface/transpose_launcher.h @@ -49,7 +49,8 @@ struct Transpose_Launcher { * @brief Wrapper around TransposeAdd. Creates the views, then makes and * launches Transpose Add. */ -template +template struct TransposeAdd_Launcher { template +template class TransposeAdd { public: using index_t = typename in1_t::index_t; @@ -160,21 +165,32 @@ class TransposeAdd { in1_t A_; in2_t B_; out_t C_; - - index_t lda_; - index_t ldb_; - index_t ldc_; - index_t N_; index_t M_; value_t alpha_; value_t beta_; + // Leading dimensions + index_t lda_; + index_t ldb_; + index_t ldc_; // Minimum number of tiles used to cover output matrix rows & columns index_t tile_count_m_; index_t tile_count_n_; + // Inner WG Tiles + static constexpr const index_t inner_tile_size_ = wg_size / Tile_size; + static constexpr const index_t inner_tile_count_ = + Tile_size / inner_tile_size_; // Minimum number of Tile-mutliple rows & columns to cover the output matrix index_t M_pad_; index_t N_pad_; + // The number of elements per cache line size depends on the element type + static constexpr index_t get_num_cache_line_elems() { + return cl_size / sizeof(element_t); + } + // The number of Tile-sides per cache line + static constexpr index_t get_num_tiles_per_cache_line() { + return get_num_cache_line_elems() / Tile_size; + } TransposeAdd(in1_t &A, in2_t &B, out_t &C, value_t &alpha, value_t &beta) : A_(A), @@ -209,14 +225,15 @@ class TransposeAdd { /*! * @brief Generator/factory for Transpose-Add trees. */ -template -TransposeAdd +template +TransposeAdd make_transpose_add(in1_t &A, in2_t &B, out_t &C, element_t &alpha, element_t &beta) { - return TransposeAdd(A, B, C, alpha, beta); + return TransposeAdd(A, B, C, alpha, beta); } } // namespace blas diff --git a/src/interface/extension/backend/amd_gpu.hpp b/src/interface/extension/backend/amd_gpu.hpp index 5bf0315ae..35dfaf818 100644 --- a/src/interface/extension/backend/amd_gpu.hpp +++ b/src/interface/extension/backend/amd_gpu.hpp @@ -53,6 +53,27 @@ typename sb_handle_t::event_t _transpose_outplace( } } +template +typename sb_handle_t::event_t _transpose_add( + sb_handle_t& sb_handle, index_t _M, index_t _N, element_t _alpha, + container_0_t a_, index_t _ld_a, index_t _a_rows, index_t _a_cols, + element_t _beta, container_1_t b_, index_t _ld_b, index_t _b_rows, + index_t _b_cols, container_2_t c_, index_t _ld_c) { + if (_M * _N > (1 << 18)) { + return TransposeAdd_Launcher:: + template _select_transpose_add(sb_handle, _M, _N, _alpha, a_, _ld_a, + _a_rows, _a_cols, _beta, b_, _ld_b, + _b_rows, _b_cols, c_, _ld_c); + } else { + return TransposeAdd_Launcher:: + template _select_transpose_add(sb_handle, _M, _N, _alpha, a_, _ld_a, + _a_rows, _a_cols, _beta, b_, _ld_b, + _b_rows, _b_cols, c_, _ld_c); + } +} + } // namespace backend } // namespace extension } // namespace blas diff --git a/src/interface/extension/backend/default_cpu.hpp b/src/interface/extension/backend/default_cpu.hpp index 8e1f7bcbe..c8c59c4e6 100644 --- a/src/interface/extension/backend/default_cpu.hpp +++ b/src/interface/extension/backend/default_cpu.hpp @@ -44,6 +44,20 @@ typename sb_handle_t::event_t _transpose_outplace( _inc_out); } +template +typename sb_handle_t::event_t _transpose_add( + sb_handle_t& sb_handle, index_t _M, index_t _N, element_t _alpha, + container_0_t a_, index_t _ld_a, index_t _a_rows, index_t _a_cols, + element_t _beta, container_1_t b_, index_t _ld_b, index_t _b_rows, + index_t _b_cols, container_2_t c_, index_t _ld_c) { + return TransposeAdd_Launcher:: + template _select_transpose_add(sb_handle, _M, _N, _alpha, a_, _ld_a, + _a_rows, _a_cols, _beta, b_, _ld_b, + _b_rows, _b_cols, c_, _ld_c); +} + } // namespace backend } // namespace extension } // namespace blas diff --git a/src/interface/extension/backend/intel_gpu.hpp b/src/interface/extension/backend/intel_gpu.hpp index a0e4bd643..57e77af52 100644 --- a/src/interface/extension/backend/intel_gpu.hpp +++ b/src/interface/extension/backend/intel_gpu.hpp @@ -54,6 +54,27 @@ typename sb_handle_t::event_t _transpose_outplace( } } +template +typename sb_handle_t::event_t _transpose_add( + sb_handle_t& sb_handle, index_t _M, index_t _N, element_t _alpha, + container_0_t a_, index_t _ld_a, index_t _a_rows, index_t _a_cols, + element_t _beta, container_1_t b_, index_t _ld_b, index_t _b_rows, + index_t _b_cols, container_2_t c_, index_t _ld_c) { + if (_M * _N > (1 << 18)) { + return TransposeAdd_Launcher:: + template _select_transpose_add(sb_handle, _M, _N, _alpha, a_, _ld_a, + _a_rows, _a_cols, _beta, b_, _ld_b, + _b_rows, _b_cols, c_, _ld_c); + } else { + return TransposeAdd_Launcher:: + template _select_transpose_add(sb_handle, _M, _N, _alpha, a_, _ld_a, + _a_rows, _a_cols, _beta, b_, _ld_b, + _b_rows, _b_cols, c_, _ld_c); + } +} + } // namespace backend } // namespace extension } // namespace blas diff --git a/src/interface/extension/backend/nvidia_gpu.hpp b/src/interface/extension/backend/nvidia_gpu.hpp index accfe8708..3e9648dca 100644 --- a/src/interface/extension/backend/nvidia_gpu.hpp +++ b/src/interface/extension/backend/nvidia_gpu.hpp @@ -55,6 +55,27 @@ typename sb_handle_t::event_t _transpose_outplace( } } +template +typename sb_handle_t::event_t _transpose_add( + sb_handle_t& sb_handle, index_t _M, index_t _N, element_t _alpha, + container_0_t a_, index_t _ld_a, index_t _a_rows, index_t _a_cols, + element_t _beta, container_1_t b_, index_t _ld_b, index_t _b_rows, + index_t _b_cols, container_2_t c_, index_t _ld_c) { + if (_M * _N > (1 << 18)) { + return TransposeAdd_Launcher:: + template _select_transpose_add(sb_handle, _M, _N, _alpha, a_, _ld_a, + _a_rows, _a_cols, _beta, b_, _ld_b, + _b_rows, _b_cols, c_, _ld_c); + } else { + return TransposeAdd_Launcher:: + template _select_transpose_add(sb_handle, _M, _N, _alpha, a_, _ld_a, + _a_rows, _a_cols, _beta, b_, _ld_b, + _b_rows, _b_cols, c_, _ld_c); + } +} + } // namespace backend } // namespace extension } // namespace blas diff --git a/src/interface/extension/transpose_launcher.cpp.in b/src/interface/extension/transpose_launcher.cpp.in index 89dc4967d..aed2b4c0b 100644 --- a/src/interface/extension/transpose_launcher.cpp.in +++ b/src/interface/extension/transpose_launcher.cpp.in @@ -33,8 +33,8 @@ namespace blas { namespace extension { template class Transpose_Launcher<${TILE_SIZE}, ${WG_SIZE}, ${CL_SIZE}, ${LOCAL_MEM}>; -template class TransposeAdd_Launcher; -template class TransposeAdd_Launcher; +template class TransposeAdd_Launcher; +template class TransposeAdd_Launcher; template typename SB_Handle::event_t Transpose_Launcher< ${TILE_SIZE}, ${WG_SIZE}, ${CL_SIZE}, ${LOCAL_MEM}>:: @@ -52,7 +52,7 @@ template typename SB_Handle::event_t Transpose_Launcher< ${INDEX_TYPE} _ld_out, ${INDEX_TYPE} _inc_out); template typename SB_Handle::event_t TransposeAdd_Launcher< - true, ${TILE_SIZE}, ${LOCAL_MEM}>:: + true, ${TILE_SIZE}, ${WG_SIZE}, ${CL_SIZE}, ${LOCAL_MEM}>:: _select_transpose_add(SB_Handle& sb_handle, ${INDEX_TYPE} _M, ${INDEX_TYPE} _N, ${DATA_TYPE} _alpha, @@ -69,7 +69,7 @@ template typename SB_Handle::event_t TransposeAdd_Launcher< ${INDEX_TYPE} _ldc); template typename SB_Handle::event_t TransposeAdd_Launcher< - false, ${TILE_SIZE}, ${LOCAL_MEM}>:: + false, ${TILE_SIZE}, ${WG_SIZE}, ${CL_SIZE}, ${LOCAL_MEM}>:: _select_transpose_add(SB_Handle& sb_handle, ${INDEX_TYPE} _M, ${INDEX_TYPE} _N, ${DATA_TYPE} _alpha, @@ -86,6 +86,5 @@ template typename SB_Handle::event_t TransposeAdd_Launcher< ${INDEX_TYPE} _ldc); -} // namespace internal } // namespace extension } // namespace blas diff --git a/src/interface/extension_interface.hpp b/src/interface/extension_interface.hpp index e8a2bb9ac..d6af6fcd3 100644 --- a/src/interface/extension_interface.hpp +++ b/src/interface/extension_interface.hpp @@ -133,48 +133,11 @@ _omatadd_impl(sb_handle_t& sb_handle, index_t m, index_t n, element_t alpha, constexpr const bool both_trans = trans_a && trans_b; - bool use_local_memory = sb_handle.has_local_memory(); - - if (use_local_memory) { - // Using local Memory - if (m > 1024 && n > 1024) { - ret = TransposeAdd_Launcher< - both_trans, 32, true>::template _select_transpose_add(sb_handle, m, n, - alpha, a, lda, - a_rows, a_cols, - beta, b, ldb, - b_rows, b_cols, - c, ldc); - } else if (m > 64 && n > 64) { - ret = TransposeAdd_Launcher< - both_trans, 16, true>::template _select_transpose_add(sb_handle, m, n, - alpha, a, lda, - a_rows, a_cols, - beta, b, ldb, - b_rows, b_cols, - c, ldc); - } else { - ret = TransposeAdd_Launcher< - both_trans, 8, true>::template _select_transpose_add(sb_handle, m, n, - alpha, a, lda, - a_rows, a_cols, - beta, b, ldb, - b_rows, b_cols, - c, ldc); - } - } else { - // With no local Memory - ret = TransposeAdd_Launcher< - both_trans, 16, false>::template _select_transpose_add(sb_handle, m, n, - alpha, a, lda, - a_rows, a_cols, - beta, b, ldb, - b_rows, b_cols, - c, ldc); - } - - return ret; + return blas::extension::backend::_transpose_add( + sb_handle, m, n, alpha, a, lda, a_rows, a_cols, beta, b, ldb, b_rows, + b_cols, c, ldc); } + template typename std::enable_if:: * @brief Wrapper around Transpose-Add. Creates the views, then makes and * launches Transpose Add kernel */ -template +template template typename sb_handle_t::event_t -TransposeAdd_Launcher:: +TransposeAdd_Launcher:: _select_transpose_add(sb_handle_t& sb_handle, index_t _M, index_t _N, element_t _alpha, container_0_t a_, index_t _lda, index_t _nrows_a, index_t _ncols_a, element_t _beta, @@ -103,7 +104,7 @@ TransposeAdd_Launcher:: // Transpose Add expression Tree auto trans_scale_tree = - make_transpose_add( + make_transpose_add( A_view, B_view, C_view, _alpha, _beta); if constexpr (local_memory) { diff --git a/src/operations/extension/transpose.hpp b/src/operations/extension/transpose.hpp index 8cb288beb..b769f2b5f 100644 --- a/src/operations/extension/transpose.hpp +++ b/src/operations/extension/transpose.hpp @@ -217,49 +217,54 @@ Transpose -SYCL_BLAS_INLINE bool -TransposeAdd::valid_thread(cl::sycl::nd_item<1> item) const { +template +SYCL_BLAS_INLINE bool TransposeAdd< + both_trans, Tile_size, wg_size, cl_size, local_memory, in1_t, in2_t, out_t, + element_t>::valid_thread(cl::sycl::nd_item<1> item) const { // Valid threads are established by ::eval() return true; } -template +template SYCL_BLAS_INLINE void -TransposeAdd::bind(cl::sycl::handler &cgh) { +TransposeAdd::bind(cl::sycl::handler &cgh) { A_.bind(cgh); B_.bind(cgh); C_.bind(cgh); } -template +template SYCL_BLAS_INLINE typename in1_t::index_t -TransposeAdd::get_size() const { +TransposeAdd::get_size() const { // Smallest TileSize square-multiple containing input/output matrices return (M_pad_ * N_pad_); } -template +template SYCL_BLAS_INLINE void -TransposeAdd::adjust_access_displacement() { +TransposeAdd::adjust_access_displacement() { A_.adjust_access_displacement(); B_.adjust_access_displacement(); C_.adjust_access_displacement(); } -template +template SYCL_BLAS_INLINE void -TransposeAdd::eval(cl::sycl::nd_item<1> id) { +TransposeAdd::eval(cl::sycl::nd_item<1> id) { auto idx = id.get_global_linear_id(); if (idx < get_size()) { @@ -318,15 +323,15 @@ TransposeAdd -SYCL_BLAS_INLINE void -TransposeAdd::get_indices(cl::sycl::nd_item<1> id, index_t &in_a_idx, - index_t &in_b_idx, index_t &in_local_idx, - index_t &out_idx, index_t &out_local_idx, - bool &valid_index_in, - bool &valid_index_out) { +template +SYCL_BLAS_INLINE void TransposeAdd< + both_trans, Tile_size, wg_size, cl_size, local_memory, in1_t, in2_t, out_t, + element_t>::get_indices(cl::sycl::nd_item<1> id, index_t &in_a_idx, + index_t &in_b_idx, index_t &in_local_idx, + index_t &out_idx, index_t &out_local_idx, + bool &valid_index_in, bool &valid_index_out) { index_t M = both_trans ? N_ : M_; index_t N = both_trans ? M_ : N_; index_t m_tiles = both_trans ? tile_count_n_ : tile_count_m_; @@ -362,13 +367,14 @@ TransposeAdd +template template SYCL_BLAS_INLINE void -TransposeAdd::eval(local_memory_t local_mem, - cl::sycl::nd_item<1> id) { +TransposeAdd::eval(local_memory_t local_mem, + cl::sycl::nd_item<1> id) { auto idx = id.get_global_linear_id(); if (idx < get_size()) {