Skip to content

Commit

Permalink
Added CL & WG size as transposeAdd params & switched to per backend c…
Browse files Browse the repository at this point in the history
…onfig
  • Loading branch information
OuadiElfarouki committed Jun 16, 2023
1 parent c3d0389 commit 796efc2
Show file tree
Hide file tree
Showing 10 changed files with 165 additions and 101 deletions.
3 changes: 2 additions & 1 deletion include/interface/transpose_launcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ struct Transpose_Launcher {
* @brief Wrapper around TransposeAdd. Creates the views, then makes and
* launches Transpose Add.
*/
template <bool both_trans, int Tile_size, bool local_memory>
template <bool both_trans, int Tile_size, int wg_size, int cl_size,
bool local_memory>
struct TransposeAdd_Launcher {
template <typename sb_handle_t, typename container_0_t,
typename container_1_t, typename container_2_t, typename element_t,
Expand Down
49 changes: 33 additions & 16 deletions include/operations/extension/transpose.h
Original file line number Diff line number Diff line change
Expand Up @@ -142,39 +142,55 @@ make_transpose(in_t &A, index_t inc_a, out_t &At, index_t inc_a_t,
* input matrices are transposed, with and without the use of local memory,
* while remaining customizable Tiling-size wise.
*
* @tparam in_place Whether the transpose is in or out of place
* @tparam both_trans Whether both A & B matrices are transposed (or just the
* first one)
* @tparam Tile_size Tiling size used explicitly in the local memory kernel, and
* used to compute work-group size in the non-local memory case.
* @tparam wg_size work group size
* @tparam cl_size cache line size
* @tparam local_memory Whether to use local memory
* @tparam in_t The input matrix type
* @tparam out_t The output matrix type
* @tparam in1_t The input matrix A type
* @tparam in2_t The input matrix B type
* @tparam out_t The output matrix C type
* @tparam element_t The scaling factor type
*
*/
template <bool both_trans, int Tile_size, bool local_memory, typename in1_t,
typename in2_t, typename out_t, typename element_t>
template <bool both_trans, int Tile_size, int wg_size, int cl_size,
bool local_memory, typename in1_t, typename in2_t, typename out_t,
typename element_t>
class TransposeAdd {
public:
using index_t = typename in1_t::index_t;
using value_t = element_t;
in1_t A_;
in2_t B_;
out_t C_;

index_t lda_;
index_t ldb_;
index_t ldc_;

index_t N_;
index_t M_;
value_t alpha_;
value_t beta_;
// Leading dimensions
index_t lda_;
index_t ldb_;
index_t ldc_;
// Minimum number of tiles used to cover output matrix rows & columns
index_t tile_count_m_;
index_t tile_count_n_;
// Inner WG Tiles
static constexpr const index_t inner_tile_size_ = wg_size / Tile_size;
static constexpr const index_t inner_tile_count_ =
Tile_size / inner_tile_size_;
// Minimum number of Tile-multiple rows & columns to cover the output matrix
index_t M_pad_;
index_t N_pad_;
// The number of elements per cache line size depends on the element type
static constexpr index_t get_num_cache_line_elems() {
return cl_size / sizeof(element_t);
}
// The number of Tile-sides per cache line
static constexpr index_t get_num_tiles_per_cache_line() {
return get_num_cache_line_elems() / Tile_size;
}

TransposeAdd(in1_t &A, in2_t &B, out_t &C, value_t &alpha, value_t &beta)
: A_(A),
Expand Down Expand Up @@ -209,14 +225,15 @@ class TransposeAdd {
/*!
* @brief Generator/factory for Transpose-Add trees.
*/
template <bool both_trans, int Tile_size, bool local_memory, typename in1_t,
typename in2_t, typename out_t, typename element_t>
TransposeAdd<both_trans, Tile_size, local_memory, in1_t, in2_t, out_t,
element_t>
template <bool both_trans, int Tile_size, int wg_size, int cl_size,
bool local_memory, typename in1_t, typename in2_t, typename out_t,
typename element_t>
TransposeAdd<both_trans, Tile_size, wg_size, cl_size, local_memory, in1_t,
in2_t, out_t, element_t>
make_transpose_add(in1_t &A, in2_t &B, out_t &C, element_t &alpha,
element_t &beta) {
return TransposeAdd<both_trans, Tile_size, local_memory, in1_t, in2_t, out_t,
element_t>(A, B, C, alpha, beta);
return TransposeAdd<both_trans, Tile_size, wg_size, cl_size, local_memory,
in1_t, in2_t, out_t, element_t>(A, B, C, alpha, beta);
}

} // namespace blas
Expand Down
21 changes: 21 additions & 0 deletions src/interface/extension/backend/amd_gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,27 @@ typename sb_handle_t::event_t _transpose_outplace(
}
}

/*!
 * @brief AMD GPU backend dispatch for the Transpose-Add operation.
 *
 * Selects a TransposeAdd_Launcher configuration from the output size
 * (_M * _N) and forwards every argument, unchanged, to its
 * _select_transpose_add entry point.
 *
 * Launcher template arguments are, in order: both_trans, Tile_size, wg_size,
 * cl_size, local_memory.
 *
 * @tparam both_trans Forwarded to the launcher as its first template argument
 * @param sb_handle Handle forwarded to the launcher
 * @param _M, _N Sizes whose product drives the configuration choice
 * @param _alpha, _beta Scaling factors forwarded to the launcher
 * @param a_, b_, c_ Containers forwarded to the launcher
 * @param _ld_a, _ld_b, _ld_c Leading dimensions forwarded to the launcher
 * @param _a_rows, _a_cols, _b_rows, _b_cols Forwarded to the launcher
 * @return The event returned by the selected launcher
 */
template <bool both_trans, typename sb_handle_t, typename container_0_t,
          typename container_1_t, typename container_2_t, typename element_t,
          typename index_t>
typename sb_handle_t::event_t _transpose_add(
    sb_handle_t& sb_handle, index_t _M, index_t _N, element_t _alpha,
    container_0_t a_, index_t _ld_a, index_t _a_rows, index_t _a_cols,
    element_t _beta, container_1_t b_, index_t _ld_b, index_t _b_rows,
    index_t _b_cols, container_2_t c_, index_t _ld_c) {
  // Widen before multiplying: when index_t is a 32-bit signed type the product
  // _M * _N overflows (undefined behaviour) for large matrices, which could
  // select the wrong configuration.
  constexpr long long large_size_threshold = 1LL << 18;
  const long long output_size =
      static_cast<long long>(_M) * static_cast<long long>(_N);

  if (output_size > large_size_threshold) {
    // Large outputs: Tile_size = 16, wg_size = 256, cl_size = 64.
    return TransposeAdd_Launcher<both_trans, 16, 256, 64, true>::
        template _select_transpose_add(sb_handle, _M, _N, _alpha, a_, _ld_a,
                                       _a_rows, _a_cols, _beta, b_, _ld_b,
                                       _b_rows, _b_cols, c_, _ld_c);
  } else {
    // Smaller outputs: Tile_size = 16, wg_size = 64, cl_size = 64.
    return TransposeAdd_Launcher<both_trans, 16, 64, 64, true>::
        template _select_transpose_add(sb_handle, _M, _N, _alpha, a_, _ld_a,
                                       _a_rows, _a_cols, _beta, b_, _ld_b,
                                       _b_rows, _b_cols, c_, _ld_c);
  }
}

} // namespace backend
} // namespace extension
} // namespace blas
Expand Down
14 changes: 14 additions & 0 deletions src/interface/extension/backend/default_cpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,20 @@ typename sb_handle_t::event_t _transpose_outplace(
_inc_out);
}

template <bool both_trans, typename sb_handle_t, typename container_0_t,
typename container_1_t, typename container_2_t, typename element_t,
typename index_t>
typename sb_handle_t::event_t _transpose_add(
sb_handle_t& sb_handle, index_t _M, index_t _N, element_t _alpha,
container_0_t a_, index_t _ld_a, index_t _a_rows, index_t _a_cols,
element_t _beta, container_1_t b_, index_t _ld_b, index_t _b_rows,
index_t _b_cols, container_2_t c_, index_t _ld_c) {
return TransposeAdd_Launcher<both_trans, 16, 64, 64, false>::
template _select_transpose_add(sb_handle, _M, _N, _alpha, a_, _ld_a,
_a_rows, _a_cols, _beta, b_, _ld_b,
_b_rows, _b_cols, c_, _ld_c);
}

} // namespace backend
} // namespace extension
} // namespace blas
Expand Down
21 changes: 21 additions & 0 deletions src/interface/extension/backend/intel_gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,27 @@ typename sb_handle_t::event_t _transpose_outplace(
}
}

/*!
 * @brief Intel GPU backend dispatch for the Transpose-Add operation.
 *
 * Selects a TransposeAdd_Launcher configuration from the output size
 * (_M * _N) and forwards every argument, unchanged, to its
 * _select_transpose_add entry point.
 *
 * Launcher template arguments are, in order: both_trans, Tile_size, wg_size,
 * cl_size, local_memory.
 *
 * @tparam both_trans Forwarded to the launcher as its first template argument
 * @param sb_handle Handle forwarded to the launcher
 * @param _M, _N Sizes whose product drives the configuration choice
 * @param _alpha, _beta Scaling factors forwarded to the launcher
 * @param a_, b_, c_ Containers forwarded to the launcher
 * @param _ld_a, _ld_b, _ld_c Leading dimensions forwarded to the launcher
 * @param _a_rows, _a_cols, _b_rows, _b_cols Forwarded to the launcher
 * @return The event returned by the selected launcher
 */
template <bool both_trans, typename sb_handle_t, typename container_0_t,
          typename container_1_t, typename container_2_t, typename element_t,
          typename index_t>
typename sb_handle_t::event_t _transpose_add(
    sb_handle_t& sb_handle, index_t _M, index_t _N, element_t _alpha,
    container_0_t a_, index_t _ld_a, index_t _a_rows, index_t _a_cols,
    element_t _beta, container_1_t b_, index_t _ld_b, index_t _b_rows,
    index_t _b_cols, container_2_t c_, index_t _ld_c) {
  // Widen before multiplying: when index_t is a 32-bit signed type the product
  // _M * _N overflows (undefined behaviour) for large matrices, which could
  // select the wrong configuration.
  constexpr long long large_size_threshold = 1LL << 18;
  const long long output_size =
      static_cast<long long>(_M) * static_cast<long long>(_N);

  if (output_size > large_size_threshold) {
    // Large outputs: Tile_size = 32, wg_size = 256, cl_size = 128.
    return TransposeAdd_Launcher<both_trans, 32, 256, 128, true>::
        template _select_transpose_add(sb_handle, _M, _N, _alpha, a_, _ld_a,
                                       _a_rows, _a_cols, _beta, b_, _ld_b,
                                       _b_rows, _b_cols, c_, _ld_c);
  } else {
    // Smaller outputs: Tile_size = 16, wg_size = 64, cl_size = 64.
    return TransposeAdd_Launcher<both_trans, 16, 64, 64, true>::
        template _select_transpose_add(sb_handle, _M, _N, _alpha, a_, _ld_a,
                                       _a_rows, _a_cols, _beta, b_, _ld_b,
                                       _b_rows, _b_cols, c_, _ld_c);
  }
}

} // namespace backend
} // namespace extension
} // namespace blas
Expand Down
21 changes: 21 additions & 0 deletions src/interface/extension/backend/nvidia_gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,27 @@ typename sb_handle_t::event_t _transpose_outplace(
}
}

/*!
 * @brief NVIDIA GPU backend dispatch for the Transpose-Add operation.
 *
 * Selects a TransposeAdd_Launcher configuration from the output size
 * (_M * _N) and forwards every argument, unchanged, to its
 * _select_transpose_add entry point.
 *
 * Launcher template arguments are, in order: both_trans, Tile_size, wg_size,
 * cl_size, local_memory.
 *
 * @tparam both_trans Forwarded to the launcher as its first template argument
 * @param sb_handle Handle forwarded to the launcher
 * @param _M, _N Sizes whose product drives the configuration choice
 * @param _alpha, _beta Scaling factors forwarded to the launcher
 * @param a_, b_, c_ Containers forwarded to the launcher
 * @param _ld_a, _ld_b, _ld_c Leading dimensions forwarded to the launcher
 * @param _a_rows, _a_cols, _b_rows, _b_cols Forwarded to the launcher
 * @return The event returned by the selected launcher
 */
template <bool both_trans, typename sb_handle_t, typename container_0_t,
          typename container_1_t, typename container_2_t, typename element_t,
          typename index_t>
typename sb_handle_t::event_t _transpose_add(
    sb_handle_t& sb_handle, index_t _M, index_t _N, element_t _alpha,
    container_0_t a_, index_t _ld_a, index_t _a_rows, index_t _a_cols,
    element_t _beta, container_1_t b_, index_t _ld_b, index_t _b_rows,
    index_t _b_cols, container_2_t c_, index_t _ld_c) {
  // Widen before multiplying: when index_t is a 32-bit signed type the product
  // _M * _N overflows (undefined behaviour) for large matrices, which could
  // select the wrong configuration.
  constexpr long long large_size_threshold = 1LL << 18;
  const long long output_size =
      static_cast<long long>(_M) * static_cast<long long>(_N);

  if (output_size > large_size_threshold) {
    // Large outputs: Tile_size = 32, wg_size = 512, cl_size = 128.
    return TransposeAdd_Launcher<both_trans, 32, 512, 128, true>::
        template _select_transpose_add(sb_handle, _M, _N, _alpha, a_, _ld_a,
                                       _a_rows, _a_cols, _beta, b_, _ld_b,
                                       _b_rows, _b_cols, c_, _ld_c);
  } else {
    // Smaller outputs: Tile_size = 32, wg_size = 128, cl_size = 128.
    return TransposeAdd_Launcher<both_trans, 32, 128, 128, true>::
        template _select_transpose_add(sb_handle, _M, _N, _alpha, a_, _ld_a,
                                       _a_rows, _a_cols, _beta, b_, _ld_b,
                                       _b_rows, _b_cols, c_, _ld_c);
  }
}

} // namespace backend
} // namespace extension
} // namespace blas
Expand Down
9 changes: 4 additions & 5 deletions src/interface/extension/transpose_launcher.cpp.in
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@
namespace blas {
namespace extension {
template class Transpose_Launcher<${TILE_SIZE}, ${WG_SIZE}, ${CL_SIZE}, ${LOCAL_MEM}>;
template class TransposeAdd_Launcher<true, ${TILE_SIZE}, ${LOCAL_MEM}>;
template class TransposeAdd_Launcher<false, ${TILE_SIZE}, ${LOCAL_MEM}>;
template class TransposeAdd_Launcher<true, ${TILE_SIZE}, ${WG_SIZE}, ${CL_SIZE}, ${LOCAL_MEM}>;
template class TransposeAdd_Launcher<false, ${TILE_SIZE}, ${WG_SIZE}, ${CL_SIZE}, ${LOCAL_MEM}>;

template typename SB_Handle::event_t Transpose_Launcher<
${TILE_SIZE}, ${WG_SIZE}, ${CL_SIZE}, ${LOCAL_MEM}>::
Expand All @@ -52,7 +52,7 @@ template typename SB_Handle::event_t Transpose_Launcher<
${INDEX_TYPE} _ld_out, ${INDEX_TYPE} _inc_out);

template typename SB_Handle::event_t TransposeAdd_Launcher<
true, ${TILE_SIZE}, ${LOCAL_MEM}>::
true, ${TILE_SIZE}, ${WG_SIZE}, ${CL_SIZE}, ${LOCAL_MEM}>::
_select_transpose_add(SB_Handle& sb_handle,
${INDEX_TYPE} _M,
${INDEX_TYPE} _N, ${DATA_TYPE} _alpha,
Expand All @@ -69,7 +69,7 @@ template typename SB_Handle::event_t TransposeAdd_Launcher<
${INDEX_TYPE} _ldc);

template typename SB_Handle::event_t TransposeAdd_Launcher<
false, ${TILE_SIZE}, ${LOCAL_MEM}>::
false, ${TILE_SIZE}, ${WG_SIZE}, ${CL_SIZE}, ${LOCAL_MEM}>::
_select_transpose_add(SB_Handle& sb_handle,
${INDEX_TYPE} _M,
${INDEX_TYPE} _N, ${DATA_TYPE} _alpha,
Expand All @@ -86,6 +86,5 @@ template typename SB_Handle::event_t TransposeAdd_Launcher<
${INDEX_TYPE} _ldc);


} // namespace internal
} // namespace extension
} // namespace blas
45 changes: 4 additions & 41 deletions src/interface/extension_interface.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,48 +133,11 @@ _omatadd_impl(sb_handle_t& sb_handle, index_t m, index_t n, element_t alpha,

constexpr const bool both_trans = trans_a && trans_b;

bool use_local_memory = sb_handle.has_local_memory();

if (use_local_memory) {
// Using local Memory
if (m > 1024 && n > 1024) {
ret = TransposeAdd_Launcher<
both_trans, 32, true>::template _select_transpose_add(sb_handle, m, n,
alpha, a, lda,
a_rows, a_cols,
beta, b, ldb,
b_rows, b_cols,
c, ldc);
} else if (m > 64 && n > 64) {
ret = TransposeAdd_Launcher<
both_trans, 16, true>::template _select_transpose_add(sb_handle, m, n,
alpha, a, lda,
a_rows, a_cols,
beta, b, ldb,
b_rows, b_cols,
c, ldc);
} else {
ret = TransposeAdd_Launcher<
both_trans, 8, true>::template _select_transpose_add(sb_handle, m, n,
alpha, a, lda,
a_rows, a_cols,
beta, b, ldb,
b_rows, b_cols,
c, ldc);
}
} else {
// With no local Memory
ret = TransposeAdd_Launcher<
both_trans, 16, false>::template _select_transpose_add(sb_handle, m, n,
alpha, a, lda,
a_rows, a_cols,
beta, b, ldb,
b_rows, b_cols,
c, ldc);
}

return ret;
return blas::extension::backend::_transpose_add<both_trans>(
sb_handle, m, n, alpha, a, lda, a_rows, a_cols, beta, b, ldb, b_rows,
b_cols, c, ldc);
}

template <bool trans_a, bool trans_b, typename sb_handle_t, typename element_t,
typename index_t, typename container_t>
typename std::enable_if<!trans_a && !trans_b,
Expand Down
7 changes: 4 additions & 3 deletions src/interface/transpose_launcher.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,11 +78,12 @@ Transpose_Launcher<Tile_size, wg_size, cl_size, local_memory>::
* @brief Wrapper around Transpose-Add. Creates the views, then makes and
* launches Transpose Add kernel
*/
template <bool both_trans, int Tile_size, bool local_memory>
template <bool both_trans, int Tile_size, int wg_size, int cl_size,
bool local_memory>
template <typename sb_handle_t, typename container_0_t, typename container_1_t,
typename container_2_t, typename element_t, typename index_t>
typename sb_handle_t::event_t
TransposeAdd_Launcher<both_trans, Tile_size, local_memory>::
TransposeAdd_Launcher<both_trans, Tile_size, wg_size, cl_size, local_memory>::
_select_transpose_add(sb_handle_t& sb_handle, index_t _M, index_t _N,
element_t _alpha, container_0_t a_, index_t _lda,
index_t _nrows_a, index_t _ncols_a, element_t _beta,
Expand All @@ -103,7 +104,7 @@ TransposeAdd_Launcher<both_trans, Tile_size, local_memory>::

// Transpose Add expression Tree
auto trans_scale_tree =
make_transpose_add<both_trans, Tile_size, local_memory>(
make_transpose_add<both_trans, Tile_size, wg_size, cl_size, local_memory>(
A_view, B_view, C_view, _alpha, _beta);

if constexpr (local_memory) {
Expand Down
Loading

0 comments on commit 796efc2

Please sign in to comment.