From 82d709277cd19d58ef7179107311bb2488773155 Mon Sep 17 00:00:00 2001 From: Muhammad Tanvir Date: Mon, 16 Sep 2024 16:19:57 +0100 Subject: [PATCH] Renamed files to xe_* * Removed l2 workspace alignment --- examples/sycl/pvc/pvc_gemm_streamk.cpp | 6 +++--- .../cutlass/gemm/kernel/gemm_universal.hpp | 2 +- .../cutlass/gemm/kernel/tile_scheduler.hpp | 4 ++-- ...ooperative.hpp => xe_gemm_cooperative.hpp} | 0 ...sistent_tile_scheduler_params_streamk.hpp} | 19 +++++-------------- ...eamk.hpp => xe_tile_scheduler_streamk.hpp} | 10 +++++----- 6 files changed, 16 insertions(+), 25 deletions(-) rename include/cutlass/gemm/kernel/{intel_pvc_gemm_cooperative.hpp => xe_gemm_cooperative.hpp} (100%) rename include/cutlass/gemm/kernel/{intel_pvc_persistent_tile_scheduler_params_streamk.hpp => xe_persistent_tile_scheduler_params_streamk.hpp} (97%) rename include/cutlass/gemm/kernel/{intel_pvc_tile_scheduler_streamk.hpp => xe_tile_scheduler_streamk.hpp} (98%) diff --git a/examples/sycl/pvc/pvc_gemm_streamk.cpp b/examples/sycl/pvc/pvc_gemm_streamk.cpp index a28c6b18d..5d5aa347d 100644 --- a/examples/sycl/pvc/pvc_gemm_streamk.cpp +++ b/examples/sycl/pvc/pvc_gemm_streamk.cpp @@ -47,7 +47,7 @@ #include "cutlass/util/reference/device/tensor_compare.h" #include "common.h" -#include "cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp" +#include "cutlass/gemm/kernel/xe_persistent_tile_scheduler_params_streamk.hpp" using namespace cute; /////////////////////////////////////////////////////////////////////////////////////////////////// @@ -232,8 +232,8 @@ struct ExampleRunner { {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}, hw_info, {options.splits, - options.splitk ? cutlass::gemm::kernel::detail::PersistentTileSchedulerIntelPVCStreamKParams::DecompositionMode::SplitK : - cutlass::gemm::kernel::detail::PersistentTileSchedulerIntelPVCStreamKParams::DecompositionMode::StreamK} + options.splitk ? cutlass::gemm::kernel::detail::PersistentTileSchedulerXeStreamKParams::DecompositionMode::SplitK : + cutlass::gemm::kernel::detail::PersistentTileSchedulerXeStreamKParams::DecompositionMode::StreamK} }; Gemm gemm_op; diff --git a/include/cutlass/gemm/kernel/gemm_universal.hpp b/include/cutlass/gemm/kernel/gemm_universal.hpp index 0d063db13..eced02115 100644 --- a/include/cutlass/gemm/kernel/gemm_universal.hpp +++ b/include/cutlass/gemm/kernel/gemm_universal.hpp @@ -65,6 +65,6 @@ struct IsCutlass3ArrayKernel { - using Scheduler = PersistentTileSchedulerIntelPVCStreamK; + using Scheduler = PersistentTileSchedulerXeStreamK; }; #endif diff --git a/include/cutlass/gemm/kernel/intel_pvc_gemm_cooperative.hpp b/include/cutlass/gemm/kernel/xe_gemm_cooperative.hpp similarity index 100% rename from include/cutlass/gemm/kernel/intel_pvc_gemm_cooperative.hpp rename to include/cutlass/gemm/kernel/xe_gemm_cooperative.hpp diff --git a/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp b/include/cutlass/gemm/kernel/xe_persistent_tile_scheduler_params_streamk.hpp similarity index 97% rename from include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp rename to include/cutlass/gemm/kernel/xe_persistent_tile_scheduler_params_streamk.hpp index 85fcfdbef..443e2cf10 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp +++ b/include/cutlass/gemm/kernel/xe_persistent_tile_scheduler_params_streamk.hpp @@ -48,8 +48,8 @@ namespace kernel { namespace detail { //////////////////////////////////////////////////////////////////////////////// -// Parameters for Intel PVC persistent stream-K scheduler -struct PersistentTileSchedulerIntelPVCStreamKParams { +// Parameters for Xe persistent stream-K scheduler +struct PersistentTileSchedulerXeStreamKParams { // Strategies for computing reductions between work-groups computing portions of a given output tile enum class ReductionMode { @@ -88,7 +88,7 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { FastDivmodU64 divmod_blk_major_{}; // Divide up the number of stream-K tiles amongst G groups of stream-K units. - // Currently defaults to 1 since we don't create groups for PVC. + // Currently defaults to 1 since we don't create groups for Xe. FastDivmodU64 divmod_sk_groups_{}; // Number of stream-K units in each group @@ -464,7 +464,7 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { static size_t get_barrier_workspace_size(uint64_t num_tiles, uint32_t barrier_bits) { size_t workspace_bits = num_tiles * static_cast(barrier_bits); - return round_up_to_l2_alignment(bits_to_bytes(workspace_bits)); + return bits_to_bytes(workspace_bits); } // Calculates the size of the workspace needed for holding partial outputs from splits @@ -473,7 +473,7 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { get_reduction_workspace_size(uint64_t num_tiles, GemmCoord tile_shape, uint32_t accumulator_bits, uint32_t num_accumulator_mtxs = 1) { size_t output_tile_size = tile_shape.m() * tile_shape.n(); size_t workspace_bits = accumulator_bits * output_tile_size * num_tiles * num_accumulator_mtxs; - return round_up_to_l2_alignment(bits_to_bytes(workspace_bits)); + return bits_to_bytes(workspace_bits); } static void @@ -695,15 +695,6 @@ struct PersistentTileSchedulerIntelPVCStreamKParams { sk_units_ = 0; divmod_sk_units_per_group_ = FastDivmodU64(blocks_m * blocks_n * blocks_l); } - - private: - // Round up number of bytes to the nearest multiple of L2 cache line alignment - CUTLASS_HOST_DEVICE - static size_t - round_up_to_l2_alignment(size_t bytes) { - constexpr size_t L2CacheLineSizeBytes = 128u; - return (bytes + L2CacheLineSizeBytes - 1) / L2CacheLineSizeBytes * L2CacheLineSizeBytes; - } }; //////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp b/include/cutlass/gemm/kernel/xe_tile_scheduler_streamk.hpp similarity index 98% rename from include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp rename to include/cutlass/gemm/kernel/xe_tile_scheduler_streamk.hpp index 69ad69a37..e1423e2bb 100644 --- a/include/cutlass/gemm/kernel/intel_pvc_tile_scheduler_streamk.hpp +++ b/include/cutlass/gemm/kernel/xe_tile_scheduler_streamk.hpp @@ -38,7 +38,7 @@ #include "cutlass/kernel_hardware_info.hpp" #include "cute/layout.hpp" #include "cute/tensor.hpp" -#include "cutlass/gemm/kernel/intel_pvc_persistent_tile_scheduler_params_streamk.hpp" +#include "cutlass/gemm/kernel/xe_persistent_tile_scheduler_params_streamk.hpp" namespace cutlass::gemm::kernel::detail { @@ -46,7 +46,7 @@ namespace cutlass::gemm::kernel::detail { template < class TileShape > -class PersistentTileSchedulerIntelPVCStreamK { +class PersistentTileSchedulerXeStreamK { // // Data members // @@ -59,7 +59,7 @@ class PersistentTileSchedulerIntelPVCStreamK { // Use a dummy barrier manager to simply get the type used to store the barrier using BarrierType = typename NamedBarrierManager<1>::T; - using Params = PersistentTileSchedulerIntelPVCStreamKParams; + using Params = PersistentTileSchedulerXeStreamKParams; using ReductionMode = Params::ReductionMode; using DecompositionMode = Params::DecompositionMode; @@ -180,10 +180,10 @@ class PersistentTileSchedulerIntelPVCStreamK { } CUTLASS_HOST_DEVICE - PersistentTileSchedulerIntelPVCStreamK() { }; + PersistentTileSchedulerXeStreamK() { }; CUTLASS_HOST_DEVICE - PersistentTileSchedulerIntelPVCStreamK(Params const& params_) : scheduler_params(params_) { + PersistentTileSchedulerXeStreamK(Params const& params_) : scheduler_params(params_) { current_work_linear_idx_ = uint64_t(BlockIdxX()); }