Add support for SYCL on example 35 #142

Closed
68 changes: 21 additions & 47 deletions examples/35_gemm_softmax/gemm_softmax.cu
@@ -52,7 +52,11 @@
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_norm.h"
#include "cutlass/util/reference/host/tensor_copy.h"
#if defined(CUTLASS_ENABLE_SYCL)
#include "cutlass/util/reference/device/sycl_tensor_fill.h"
#else
#include "cutlass/util/reference/device/tensor_fill.h"
#endif
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/host/error_metrics.h"
#include "cutlass/util/tensor_view_io.h"
@@ -61,6 +65,8 @@
#include "cutlass/epilogue/thread/linear_combination.h"
/////////////////////////////////////////////////////////////////////////////////////////////////

#include <helper.h>

#include "gemm_with_softmax.h"

/////////////////////////////////////////////////////////////////////////////////////////////////
@@ -159,6 +165,8 @@ struct Options {
/// Returns true if the environment and Toolkit support this
bool supported(bool verbose = true) const {

#if !defined(CUTLASS_ENABLE_SYCL)
Collaborator: This is only temporarily needed until we rework how __CUDACC_VER_MAJOR__ works for syclcompat, right? If so, a code comment would be good so that we can find these guards easily in the future.


// Ampere Tensor Core operations exposed with mma.sync and ldmatrix are first available
// in CUDA 11.0.
//
@@ -187,7 +195,7 @@ struct Options {
}
return false;
}

#endif
return true;
}
};
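The review thread above asks for a breadcrumb so that temporary guards like this one stay findable. A tagged comment of roughly this shape would do it (the tag format is illustrative, not an existing project convention):

```c++
// TODO(sycl): temporary guard, remove once __CUDACC_VER_MAJOR__ handling
// for syclcompat is reworked (see review discussion on PR #142).
#if !defined(CUTLASS_ENABLE_SYCL)
```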
@@ -333,12 +341,16 @@ struct Testbed {
return disposition;
}

#if defined(CUTLASS_ENABLE_SYCL)
syclcompat::wait();
#else
cudaError_t result = cudaDeviceSynchronize();
if (result != cudaSuccess) {
std::cerr << "Device synchronize failed with error "
<< cudaGetErrorString(result) << std::endl;
return disposition;
}
#endif

//
// Verify
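The wait-versus-synchronize split above recurs at every synchronization point in the example. A small helper could collapse the two paths into one call site; the following is a sketch under the same CUTLASS_ENABLE_SYCL convention, with `device_synchronize` as a hypothetical name rather than part of the change:

```c++
// Hypothetical helper: one synchronization call for both backends.
// syclcompat::wait() blocks on the default queue (errors surface as
// exceptions), while the CUDA path reports failure via the return value.
inline bool device_synchronize() {
#if defined(CUTLASS_ENABLE_SYCL)
  syclcompat::wait();
  return true;
#else
  cudaError_t result = cudaDeviceSynchronize();
  if (result != cudaSuccess) {
    std::cerr << "Device synchronize failed with error "
              << cudaGetErrorString(result) << std::endl;
    return false;
  }
  return true;
#endif
}
```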
@@ -513,6 +525,10 @@ struct Testbed {
ElementCompute(0)
);

#if defined(CUTLASS_ENABLE_SYCL)
syclcompat::wait();
Collaborator: Why is that extra wait needed? It should probably be a code comment.
#endif

// Copy reference results to host memory for verification
std::vector<ElementD> matrix_D_Ref(layout_C.capacity(extent_C));
cutlass::device_memory::copy_to_host(matrix_D_Ref.data(), block_Ref.get(), matrix_D_Ref.size());
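A plausible answer to the reviewer's question: kernel launches are asynchronous on both backends, and while the CUDA path orders the subsequent copy behind the kernel via the stream, the syclcompat launch and the host copy are not necessarily ordered against each other, so the wait makes the reference results visible before they are read. Illustrated minimally (`launch_reference_kernel` and `host_buf` are placeholders):

```c++
// Sketch of the hazard the extra wait guards against.
launch_reference_kernel(block_Ref_ptr);  // asynchronous device work (placeholder)
#if defined(CUTLASS_ENABLE_SYCL)
syclcompat::wait();  // ensure the kernel has finished writing the reference output
#endif
// Without the wait, the copy could observe partially written device memory
// when the runtime does not order it after the kernel.
cutlass::device_memory::copy_to_host(host_buf.data(), block_Ref_ptr, host_buf.size());
```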
@@ -597,25 +613,10 @@ struct Testbed {
//

  cutlass::Status status = cutlass::Status::kSuccess;
- cudaError_t result;
- cudaEvent_t events[2];
+ GpuTimer timer;
  int const kIterations = options.iterations;

- for (cudaEvent_t &evt : events) {
-   result = cudaEventCreate(&evt);
-   if (result != cudaSuccess) {
-     std::cerr << "cudaEventCreate failed with error " << cudaGetErrorString(result) << std::endl;
-     return false;
-   }
- }
-
- result = cudaEventRecord(events[0]);
-
- if (result != cudaSuccess) {
-   std::cerr << "cudaEventRecord() failed with error " << cudaGetErrorString(result) << std::endl;
-   return false;
- }
-
+ timer.start();
  for (int iter = 0; iter < kIterations; ++iter) {

    status = execute_device_kernel();
@@ -625,36 +626,9 @@
return false;
}
}
+ timer.stop();

- result = cudaEventRecord(events[1]);
-
- if (result != cudaSuccess) {
-   std::cerr << "cudaEventRecord() failed with error " << cudaGetErrorString(result) << std::endl;
-   return false;
- }
-
- result = cudaDeviceSynchronize();
-
- if (result != cudaSuccess) {
-   std::cerr << "cudaDeviceSynchronize() failed with error " << cudaGetErrorString(result) << std::endl;
-   return false;
- }
-
- float elapsed_ms = 0;
- result = cudaEventElapsedTime(&elapsed_ms, events[0], events[1]);
-
- if (result != cudaSuccess) {
-   std::cerr << "cudaEventElapsedTime() failed with error " << cudaGetErrorString(result) << std::endl;
-   return false;
- }
-
- for (cudaEvent_t &evt : events) {
-   result = cudaEventDestroy(evt);
-   if (result != cudaSuccess) {
-     std::cerr << "cudaEventDestroy() failed with error " << cudaGetErrorString(result) << std::endl;
-     return false;
-   }
- }
+ float elapsed_ms = timer.elapsed_millis();

int64_t flops = int64_t(options.problem_size.m()) * options.problem_size.n() * options.problem_size.k() * 2;
int64_t bytes = (sizeof(ElementD) * 2 + sizeof(ElementSoftmax)) * options.problem_size.m() * options.problem_size.n();
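GpuTimer, pulled in through the new <helper.h> include, absorbs the cudaEvent bookkeeping deleted above. Its shape can be approximated as follows; this is a sketch assuming a CUDA-event implementation, and the actual helper (plus whatever the SYCL build substitutes) may differ:

```c++
// Approximate shape of a GpuTimer-style helper (sketch, CUDA path only).
struct GpuTimerSketch {
  cudaEvent_t _start, _stop;
  GpuTimerSketch()  { cudaEventCreate(&_start); cudaEventCreate(&_stop); }
  ~GpuTimerSketch() { cudaEventDestroy(_start); cudaEventDestroy(_stop); }
  void start() { cudaEventRecord(_start, 0); }
  void stop()  { cudaEventRecord(_stop, 0); }
  float elapsed_millis() {
    float ms = 0;
    cudaEventSynchronize(_stop);               // wait for the stop event to land
    cudaEventElapsedTime(&ms, _start, _stop);  // milliseconds between the events
    return ms;
  }
};
```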
8 changes: 4 additions & 4 deletions examples/35_gemm_softmax/gemm_with_epilogue_visitor.h
@@ -428,7 +428,7 @@ struct GemmWithEpilogueVisitor {
};

// Compute position within threadblock
-    int thread_idx = threadIdx.x;
+    int thread_idx = ThreadIdxX();

// Construct iterators to A and B operands
typename Mma::IteratorA iterator_A(
@@ -447,9 +447,9 @@

// Broadcast the warp_id computed by lane 0 to ensure dependent code
// is compiled as warp-uniform.
-    int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);
+    int warp_idx = shfl_sync(0xffffffff, ThreadIdxX() / 32, 0);

-    int lane_idx = threadIdx.x % 32;
+    int lane_idx = ThreadIdxX() % 32;

//
// Main loop
@@ -505,7 +505,7 @@
params.ptr_Max,
params.ptr_Sum,
threadblock_offset,
-      blockIdx.y *params.problem_size.m() );
+      BlockIdxY() *params.problem_size.m() );

if (params.mode == GemmUniversalMode::kGemm) {
// Indicate which position in a serial reduction the output operator is currently updating
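The changes in this file are mechanical: CUDA builtins (threadIdx.x, blockIdx.y, __shfl_sync) become portable wrappers (ThreadIdxX(), BlockIdxY(), shfl_sync()). One plausible shape for such wrappers, dispatching to the builtins under CUDA and to syclcompat index queries under SYCL; a sketch, not the port's actual definitions:

```c++
// Illustrative only; the real definitions ship with the SYCL-enabled CUTLASS headers.
CUTLASS_DEVICE int ThreadIdxX() {
#if defined(CUTLASS_ENABLE_SYCL)
  return static_cast<int>(syclcompat::local_id::x());       // thread index in the work-group
#else
  return threadIdx.x;
#endif
}

CUTLASS_DEVICE int BlockIdxY() {
#if defined(CUTLASS_ENABLE_SYCL)
  return static_cast<int>(syclcompat::work_group_id::y());  // work-group index along y
#else
  return blockIdx.y;
#endif
}
// shfl_sync / shfl_xor_sync wrap __shfl_sync / __shfl_xor_sync on CUDA and
// SYCL sub-group shuffles (e.g. sycl::select_from_group) on the SYCL path.
```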
44 changes: 40 additions & 4 deletions examples/35_gemm_softmax/gemm_with_softmax.h
@@ -201,12 +201,12 @@ class ApplySoftmax {

using AccessTypeD = AlignedArray<ElementD, kAlignment>;

-    int block_batch = blockIdx.z;
-    int block_m = blockIdx.x * ApplyShape::kRow;
+    int block_batch = BlockIdxZ();
+    int block_m = BlockIdxX() * ApplyShape::kRow;
     int block_n = 0;

-    int thread_m = threadIdx.y;
-    int thread_n = threadIdx.x * kAlignment;
+    int thread_m = ThreadIdxY();
+    int thread_n = ThreadIdxX() * kAlignment;

int idx_m = block_m + thread_m;
int idx_n = block_n + thread_n;
@@ -580,6 +580,17 @@ class GemmSoftmax {

cudaError_t result;

#if defined(CUTLASS_ENABLE_SYCL)
const auto sycl_block = syclcompat::dim3(gemm_block.x, gemm_block.y, gemm_block.z);
const auto sycl_grid = syclcompat::dim3(gemm_grid.x, gemm_grid.y, gemm_grid.z);

using namespace syclcompat::experimental;

auto gemm_event = launch<cutlass::Kernel<GemmKernel>>(launch_policy{
sycl_grid, sycl_block, local_mem_size{static_cast<std::size_t>(gemm_smem_size)}},
params_.gemm);
EventManager::getInstance().addEvent(gemm_event);
rolandschulz (Collaborator), Oct 16, 2024: Why does the event need to be recorded? (A general question for each launch.)

#else
if (gemm_smem_size >= (48 << 10)) {
result = cudaFuncSetAttribute(cutlass::Kernel<GemmKernel>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
@@ -591,6 +602,7 @@
}

cutlass::Kernel<GemmKernel><<<gemm_grid, gemm_block, gemm_smem_size, stream>>>(params_.gemm);
#endif

result = cudaGetLastError();

@@ -613,9 +625,21 @@
dim3 final_reduction_grid(block_per_row, 1, params_.softmax.args.batch_count);
dim3 final_reduction_block(thread_per_block);

#if defined(CUTLASS_ENABLE_SYCL)
const auto sycl_final_reduction_block = syclcompat::dim3(final_reduction_block.x, final_reduction_block.y, final_reduction_block.z);
const auto sycl_final_reduction_grid = syclcompat::dim3(final_reduction_grid.x, final_reduction_grid.y, final_reduction_grid.z);

using namespace syclcompat::experimental;

auto final_reduction_event = launch<Kernel<ApplyFinalReductionKernel>>(launch_policy{
sycl_final_reduction_grid, sycl_final_reduction_block, local_mem_size{sizeof(typename ApplyFinalReductionKernel::SharedStorage)}},
params_.reduction);
EventManager::getInstance().addEvent(final_reduction_event);
#else
Kernel<ApplyFinalReductionKernel><<<
final_reduction_grid, final_reduction_block, sizeof(typename ApplyFinalReductionKernel::SharedStorage), stream
>>>(params_.reduction);
#endif

result = cudaGetLastError();

@@ -637,9 +661,21 @@
(params_.softmax.args.extent.column() + threadblock_columns - 1) / threadblock_columns,
params_.softmax.args.batch_count);

#if defined(CUTLASS_ENABLE_SYCL)
const auto sycl_apply_block = syclcompat::dim3(apply_block.x, apply_block.y, apply_block.z);
const auto sycl_apply_grid = syclcompat::dim3(apply_grid.x, apply_grid.y, apply_grid.z);

using namespace syclcompat::experimental;

auto apply_event = launch<Kernel<SoftmaxApplyKernel>>(launch_policy{
sycl_apply_grid, sycl_apply_block, local_mem_size{sizeof(typename SoftmaxApplyKernel::SharedStorage)}},
params_.softmax);
EventManager::getInstance().addEvent(apply_event);
#else
Kernel<SoftmaxApplyKernel><<<
apply_grid, apply_block, sizeof(typename SoftmaxApplyKernel::SharedStorage), stream
>>>(params_.softmax);
#endif

result = cudaGetLastError();

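All three launches in this file follow one recipe: translate dim3 to syclcompat::dim3, bundle grid, block, and the dynamic shared-memory size into a launch_policy, launch, and register the returned event. Condensed from the PR's own calls (`MyKernel`, `grid`, `block`, `smem_bytes`, and `params` are placeholders):

```c++
#if defined(CUTLASS_ENABLE_SYCL)
using namespace syclcompat::experimental;

const auto sycl_grid  = syclcompat::dim3(grid.x,  grid.y,  grid.z);
const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z);

// local_mem_size takes the role of the <<<...>>> dynamic shared-memory
// argument; the cudaFuncSetAttribute opt-in has no counterpart here.
auto event = launch<cutlass::Kernel<MyKernel>>(
    launch_policy{sycl_grid, sycl_block,
                  local_mem_size{static_cast<std::size_t>(smem_bytes)}},
    params);

// Keeps the event around for later synchronization or profiling; whether
// every launch needs this is the reviewer's open question above.
EventManager::getInstance().addEvent(event);
#else
cutlass::Kernel<MyKernel><<<grid, block, smem_bytes, stream>>>(params);
#endif
```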
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
@@ -155,6 +155,7 @@ if (NOT CUTLASS_ENABLE_SYCL)
else()
foreach(EXAMPLE
14_ampere_tf32_tensorop_gemm
35_gemm_softmax
cute
sycl
)
2 changes: 1 addition & 1 deletion include/cute/util/debug.hpp
@@ -36,7 +36,7 @@
*/

#if defined(CUTLASS_ENABLE_SYCL)
-#include <sycl.hpp>
+#include <sycl/sycl.hpp>
#include <syclcompat.hpp>
#else
#include <cuda_runtime_api.h>
2 changes: 2 additions & 0 deletions include/cutlass/arch/memory.h
@@ -65,6 +65,8 @@ struct global_load;
(__CUDACC_VER_MAJOR__ > 11)) && \
defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750)
#define CUTLASS_ENABLE_L2_PREFETCH 1
#elif defined(__SYCL_CUDA_ARCH__) && (__SYCL_CUDA_ARCH__ >= 750)
#define CUTLASS_ENABLE_L2_PREFETCH 1
#else
#define CUTLASS_ENABLE_L2_PREFETCH 0
#endif
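The new #elif keeps CUTLASS_ENABLE_L2_PREFETCH on when SYCL targets NVIDIA hardware, where __SYCL_CUDA_ARCH__ plays the role of __CUDA_ARCH__. The macro gates prefetch-hinted PTX loads of roughly this form (a representative sketch; the real variants live in this header):

```c++
// Representative guarded load: with the macro set, the load carries an
// .L2::128B prefetch hint; otherwise it is a plain global load.
__device__ unsigned load_b32(void const *ptr) {
  unsigned dst;
#if CUTLASS_ENABLE_L2_PREFETCH
  asm volatile("ld.global.L2::128B.b32 %0, [%1];" : "=r"(dst) : "l"(ptr));
#else
  asm volatile("ld.global.b32 %0, [%1];" : "=r"(dst) : "l"(ptr));
#endif
  return dst;
}
```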
14 changes: 7 additions & 7 deletions include/cutlass/arch/memory_sm80.h
@@ -41,7 +41,7 @@
#include "cutlass/arch/memory_sm75.h"
#include "cutlass/arch/cache_operation.h"

-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) || (defined(__SYCL_CUDA_ARCH__) && (__SYCL_CUDA_ARCH__ >= 800))
#define CUDA_CP_ASYNC_ACTIVATED 1
#else
#define CUDA_CP_ASYNC_ACTIVATED 0
@@ -194,7 +194,7 @@ struct cp_async_nan<16, CacheOperation::Always> {
cp_async_nan(void *smem_ptr, void const *global_ptr, bool pred_guard) {
#if CUDA_CP_ASYNC_ACTIVATED

-    static __constant__ uint4 OOB_NAN_F16x8 = {OOB_NAN_F16x2, OOB_NAN_F16x2,
+    static CUTLASS_CONSTANT uint4 OOB_NAN_F16x8 = {OOB_NAN_F16x2, OOB_NAN_F16x2,
OOB_NAN_F16x2, OOB_NAN_F16x2};

unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr);
@@ -236,9 +236,9 @@ struct cp_async_diag <Element_, false> {
#if CUDA_CP_ASYNC_ACTIVATED

/// Values for the diagonal elements of the triangular input matrix
-    static __constant__ uint2 DIAG_DATA_DOUBLE_ONE = {0x3ff00000, 0x00000000};
-    static __constant__ uint1 DIAG_DATA_FLOAT_ONE = {0x3f800000};
-    static __constant__ uint1 DIAG_DATA_ZERO = {0x00000000};
+    static CUTLASS_CONSTANT uint2 DIAG_DATA_DOUBLE_ONE = {0x3ff00000, 0x00000000};
+    static CUTLASS_CONSTANT uint1 DIAG_DATA_FLOAT_ONE = {0x3f800000};
+    static CUTLASS_CONSTANT uint1 DIAG_DATA_ZERO = {0x00000000};

unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr);

@@ -283,7 +283,7 @@ struct cp_async_diag <Element_, true> {
#if CUDA_CP_ASYNC_ACTIVATED

/// Values for the diagonal elements of the triangular input matrix
-    static __constant__ uint1 DIAG_DATA_ZERO = {0x00000000};
+    static CUTLASS_CONSTANT uint1 DIAG_DATA_ZERO = {0x00000000};

unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr);

@@ -397,7 +397,7 @@ struct cp_async_nan<16, CacheOperation::Global> {
cp_async_nan(void *smem_ptr, void const *global_ptr, bool pred_guard) {
#if CUDA_CP_ASYNC_ACTIVATED

-    static __constant__ uint4 OOB_NAN_F16x8 = {OOB_NAN_F16x2, OOB_NAN_F16x2,
+    static CUTLASS_CONSTANT uint4 OOB_NAN_F16x8 = {OOB_NAN_F16x2, OOB_NAN_F16x2,
OOB_NAN_F16x2, OOB_NAN_F16x2};

unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr);
6 changes: 4 additions & 2 deletions include/cutlass/arch/mma_sm80.h
@@ -47,11 +47,13 @@

////////////////////////////////////////////////////////////////////////////////

-#if ((__CUDACC_VER_MAJOR__ > 11) || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0))
+#if ((__CUDACC_VER_MAJOR__ > 11) || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0)) || \
+    defined(CUTLASS_ENABLE_SYCL)

#define CUTLASS_ARCH_MMA_SM80_SUPPORTED 1

-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) || \
+    (defined(__SYCL_CUDA_ARCH__) && (__SYCL_CUDA_ARCH__ >= 800))
#define CUTLASS_ARCH_MMA_SM80_ENABLED

#if (__CUDA_ARCH__ <= 900)
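As in the memory headers, the two macros split toolchain support from target support: CUTLASS_ARCH_MMA_SM80_SUPPORTED is now also granted to SYCL builds, while CUTLASS_ARCH_MMA_SM80_ENABLED additionally requires a device pass targeting SM80 or newer through either __CUDA_ARCH__ or __SYCL_CUDA_ARCH__. Downstream code keeps the usual guard shape (sketch):

```c++
#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
  // safe to emit Ampere mma.sync PTX here
#else
  // fall back, or report the architecture as unsupported
#endif
```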
7 changes: 7 additions & 0 deletions include/cutlass/detail/helper_macros.hpp
@@ -69,6 +69,13 @@
#define CUTLASS_GLOBAL __global__ static
#define CUTLASS_SHARED __shared__
#endif

#if defined(__CUDA_ARCH__)
#define CUTLASS_CONSTANT __constant__
#else
#define CUTLASS_CONSTANT constexpr
#endif

////////////////////////////////////////////////////////////////////////////////////////////////////

template<typename T>
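CUTLASS_CONSTANT is what the memory_sm80.h changes above rely on: an NVCC device pass keeps the __constant__ qualifier, while host and SYCL compilation get a plain constexpr. One declaration therefore expands both ways (shown with the PR's own example):

```c++
// CUDA device pass (__CUDA_ARCH__ defined):
//   static __constant__ uint4 OOB_NAN_F16x8 = {...};
// Host / SYCL pass:
//   static constexpr uint4 OOB_NAN_F16x8 = {...};
static CUTLASS_CONSTANT uint4 OOB_NAN_F16x8 = {OOB_NAN_F16x2, OOB_NAN_F16x2,
                                               OOB_NAN_F16x2, OOB_NAN_F16x2};
```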
@@ -381,11 +381,11 @@ class EpilogueVisitorSoftmax {
// Compute accumulate sum only in the last step
accum_sum_ = warp_reduce_sum_(accum_sum_);

-      bool is_first_thread_in_tile = ((threadIdx.x % kThreadsPerRow) == 0);
+      bool is_first_thread_in_tile = ((ThreadIdxX() % kThreadsPerRow) == 0);
bool row_guard = thread_offset_.row() < extent_.row();
bool is_write_thread = row_guard && is_first_thread_in_tile;

-      int block_batch = blockIdx.z;
+      int block_batch = BlockIdxZ();

ElementNorm *curr_ptr_max = ptr_Max_ + thread_offset_.row() + column_offset_ + block_batch * params_.batch_stride_Max;
ElementSum *curr_ptr_sum = ptr_Sum_ + thread_offset_.row() + column_offset_ + block_batch * params_.batch_stride_Sum;
@@ -434,7 +434,7 @@ class EpilogueVisitorSoftmax {
int half_thread_in_row = (kThreadsPerRow >> 1);
CUTLASS_PRAGMA_UNROLL
for (int i = half_thread_in_row; i > 0; i >>= 1) {
-        ElementSoftmaxCompute tmp = __shfl_xor_sync(0xFFFFFFFF, sum_, i);
+        ElementSoftmaxCompute tmp = shfl_xor_sync(0xFFFFFFFF, sum_, i);
sum_ += tmp;
}
return sum_;
@@ -445,7 +445,7 @@
int half_thread_in_row = (kThreadsPerRow >> 1);
CUTLASS_PRAGMA_UNROLL
for (int i = half_thread_in_row; i > 0; i >>= 1) {
-        ElementSoftmaxCompute tmp = __shfl_xor_sync(0xFFFFFFFF, max_, i);
+        ElementSoftmaxCompute tmp = shfl_xor_sync(0xFFFFFFFF, max_, i);
max_ = fast_max(max_, tmp);
}
return max_;
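Both reduction helpers are butterfly (XOR-shuffle) reductions: each round exchanges values across half the previous stride, so after log2(kThreadsPerRow) rounds every lane in the row group holds the full sum or max. The same idiom as a self-contained CUDA sketch:

```c++
// Butterfly warp reduction: after log2(Width) XOR-shuffle rounds, every lane
// in each Width-wide group holds the sum of that group's inputs.
template <int Width>  // power of two, at most the warp size (here kThreadsPerRow)
__device__ float warp_reduce_sum(float value) {
  #pragma unroll
  for (int i = Width / 2; i > 0; i >>= 1) {
    value += __shfl_xor_sync(0xFFFFFFFF, value, i);
  }
  return value;
}
```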