Add support for SYCL on example 35 #142

Closed
68 changes: 21 additions & 47 deletions examples/35_gemm_softmax/gemm_softmax.cu
@@ -52,7 +52,11 @@
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_norm.h"
#include "cutlass/util/reference/host/tensor_copy.h"
#if defined(CUTLASS_ENABLE_SYCL)
#include "cutlass/util/reference/device/sycl_tensor_fill.h"
#else
#include "cutlass/util/reference/device/tensor_fill.h"
#endif
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/host/error_metrics.h"
#include "cutlass/util/tensor_view_io.h"
@@ -61,6 +65,8 @@
#include "cutlass/epilogue/thread/linear_combination.h"
/////////////////////////////////////////////////////////////////////////////////////////////////

#include <helper.h>

#include "gemm_with_softmax.h"

/////////////////////////////////////////////////////////////////////////////////////////////////
@@ -159,6 +165,8 @@ struct Options {
/// Returns true if the environment and Toolkit support this
bool supported(bool verbose = true) const {

#if !defined(CUTLASS_ENABLE_SYCL)
Collaborator: This is only temporarily needed until we rework how __CUDACC_VER_MAJOR__ works for syclcompat, right? If so, a code comment would be good so that we can find these guards easily in the future.


// Ampere Tensor Core operations exposed with mma.sync and ldmatrix are first available
// in CUDA 11.0.
//
@@ -187,7 +195,7 @@ struct Options {
}
return false;
}

#endif
return true;
}
};
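The review thread above asks for a breadcrumb so that temporary guards like this one stay findable. A tagged comment of roughly this shape would do it (the tag format is illustrative, not an existing project convention):

```c++
// TODO(sycl): temporary guard, remove once __CUDACC_VER_MAJOR__ handling
// for syclcompat is reworked (see review discussion on PR #142).
#if !defined(CUTLASS_ENABLE_SYCL)
```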
@@ -333,12 +341,16 @@ struct Testbed {
return disposition;
}

#if defined(CUTLASS_ENABLE_SYCL)
syclcompat::wait();
#else
cudaError_t result = cudaDeviceSynchronize();
if (result != cudaSuccess) {
std::cerr << "Device synchronize failed with error "
<< cudaGetErrorString(result) << std::endl;
return disposition;
}
#endif

//
// Verify
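The wait-versus-synchronize split above recurs at every synchronization point in the example. A small helper could collapse the two paths into one call site; the following is a sketch under the same CUTLASS_ENABLE_SYCL convention, with `device_synchronize` as a hypothetical name rather than part of the change:

```c++
// Hypothetical helper: one synchronization call for both backends.
// syclcompat::wait() blocks on the default queue (errors surface as
// exceptions), while the CUDA path reports failure via the return value.
inline bool device_synchronize() {
#if defined(CUTLASS_ENABLE_SYCL)
  syclcompat::wait();
  return true;
#else
  cudaError_t result = cudaDeviceSynchronize();
  if (result != cudaSuccess) {
    std::cerr << "Device synchronize failed with error "
              << cudaGetErrorString(result) << std::endl;
    return false;
  }
  return true;
#endif
}
```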
@@ -513,6 +525,10 @@ struct Testbed {
ElementCompute(0)
);

#if defined(CUTLASS_ENABLE_SYCL)
syclcompat::wait();
Collaborator: Why is that extra wait needed? It should probably be a code comment.
#endif

// Copy reference results to host memory for verification
std::vector<ElementD> matrix_D_Ref(layout_C.capacity(extent_C));
cutlass::device_memory::copy_to_host(matrix_D_Ref.data(), block_Ref.get(), matrix_D_Ref.size());
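A plausible answer to the reviewer's question: kernel launches are asynchronous on both backends, and while the CUDA path orders the subsequent copy behind the kernel via the stream, the syclcompat launch and the host copy are not necessarily ordered against each other, so the wait makes the reference results visible before they are read. Illustrated minimally (`launch_reference_kernel` and `host_buf` are placeholders):

```c++
// Sketch of the hazard the extra wait guards against.
launch_reference_kernel(block_Ref_ptr);  // asynchronous device work (placeholder)
#if defined(CUTLASS_ENABLE_SYCL)
syclcompat::wait();  // ensure the kernel has finished writing the reference output
#endif
// Without the wait, the copy could observe partially written device memory
// when the runtime does not order it after the kernel.
cutlass::device_memory::copy_to_host(host_buf.data(), block_Ref_ptr, host_buf.size());
```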
@@ -597,25 +613,10 @@ struct Testbed {
//

  cutlass::Status status = cutlass::Status::kSuccess;
- cudaError_t result;
- cudaEvent_t events[2];
+ GpuTimer timer;
  int const kIterations = options.iterations;

- for (cudaEvent_t &evt : events) {
-   result = cudaEventCreate(&evt);
-   if (result != cudaSuccess) {
-     std::cerr << "cudaEventCreate failed with error " << cudaGetErrorString(result) << std::endl;
-     return false;
-   }
- }
-
- result = cudaEventRecord(events[0]);
-
- if (result != cudaSuccess) {
-   std::cerr << "cudaEventRecord() failed with error " << cudaGetErrorString(result) << std::endl;
-   return false;
- }
-
+ timer.start();
  for (int iter = 0; iter < kIterations; ++iter) {

    status = execute_device_kernel();
@@ -625,36 +626,9 @@
return false;
}
}
+ timer.stop();

- result = cudaEventRecord(events[1]);
-
- if (result != cudaSuccess) {
-   std::cerr << "cudaEventRecord() failed with error " << cudaGetErrorString(result) << std::endl;
-   return false;
- }
-
- result = cudaDeviceSynchronize();
-
- if (result != cudaSuccess) {
-   std::cerr << "cudaDeviceSynchronize() failed with error " << cudaGetErrorString(result) << std::endl;
-   return false;
- }
-
- float elapsed_ms = 0;
- result = cudaEventElapsedTime(&elapsed_ms, events[0], events[1]);
-
- if (result != cudaSuccess) {
-   std::cerr << "cudaEventElapsedTime() failed with error " << cudaGetErrorString(result) << std::endl;
-   return false;
- }
-
- for (cudaEvent_t &evt : events) {
-   result = cudaEventDestroy(evt);
-   if (result != cudaSuccess) {
-     std::cerr << "cudaEventDestroy() failed with error " << cudaGetErrorString(result) << std::endl;
-     return false;
-   }
- }
+ float elapsed_ms = timer.elapsed_millis();

int64_t flops = int64_t(options.problem_size.m()) * options.problem_size.n() * options.problem_size.k() * 2;
int64_t bytes = (sizeof(ElementD) * 2 + sizeof(ElementSoftmax)) * options.problem_size.m() * options.problem_size.n();
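GpuTimer, pulled in through the new <helper.h> include, absorbs the cudaEvent bookkeeping deleted above. Its shape can be approximated as follows; this is a sketch assuming a CUDA-event implementation, and the actual helper (plus whatever the SYCL build substitutes) may differ:

```c++
// Approximate shape of a GpuTimer-style helper (sketch, CUDA path only).
struct GpuTimerSketch {
  cudaEvent_t _start, _stop;
  GpuTimerSketch()  { cudaEventCreate(&_start); cudaEventCreate(&_stop); }
  ~GpuTimerSketch() { cudaEventDestroy(_start); cudaEventDestroy(_stop); }
  void start() { cudaEventRecord(_start, 0); }
  void stop()  { cudaEventRecord(_stop, 0); }
  float elapsed_millis() {
    float ms = 0;
    cudaEventSynchronize(_stop);               // wait for the stop event to land
    cudaEventElapsedTime(&ms, _start, _stop);  // milliseconds between the events
    return ms;
  }
};
```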
8 changes: 4 additions & 4 deletions examples/35_gemm_softmax/gemm_with_epilogue_visitor.h
@@ -428,7 +428,7 @@ struct GemmWithEpilogueVisitor {
};

// Compute position within threadblock
-    int thread_idx = threadIdx.x;
+    int thread_idx = ThreadIdxX();

// Construct iterators to A and B operands
typename Mma::IteratorA iterator_A(
@@ -447,9 +447,9 @@

// Broadcast the warp_id computed by lane 0 to ensure dependent code
// is compiled as warp-uniform.
-    int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);
+    int warp_idx = shfl_sync(0xffffffff, ThreadIdxX() / 32, 0);

-    int lane_idx = threadIdx.x % 32;
+    int lane_idx = ThreadIdxX() % 32;

//
// Main loop
@@ -505,7 +505,7 @@
params.ptr_Max,
params.ptr_Sum,
threadblock_offset,
-      blockIdx.y *params.problem_size.m() );
+      BlockIdxY() *params.problem_size.m() );

if (params.mode == GemmUniversalMode::kGemm) {
// Indicate which position in a serial reduction the output operator is currently updating
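The changes in this file are mechanical: CUDA builtins (threadIdx.x, blockIdx.y, __shfl_sync) become portable wrappers (ThreadIdxX(), BlockIdxY(), shfl_sync()). One plausible shape for such wrappers, dispatching to the builtins under CUDA and to syclcompat index queries under SYCL; a sketch, not the port's actual definitions:

```c++
// Illustrative only; the real definitions ship with the SYCL-enabled CUTLASS headers.
CUTLASS_DEVICE int ThreadIdxX() {
#if defined(CUTLASS_ENABLE_SYCL)
  return static_cast<int>(syclcompat::local_id::x());       // thread index in the work-group
#else
  return threadIdx.x;
#endif
}

CUTLASS_DEVICE int BlockIdxY() {
#if defined(CUTLASS_ENABLE_SYCL)
  return static_cast<int>(syclcompat::work_group_id::y());  // work-group index along y
#else
  return blockIdx.y;
#endif
}
// shfl_sync / shfl_xor_sync wrap __shfl_sync / __shfl_xor_sync on CUDA and
// SYCL sub-group shuffles (e.g. sycl::select_from_group) on the SYCL path.
```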
44 changes: 40 additions & 4 deletions examples/35_gemm_softmax/gemm_with_softmax.h
@@ -201,12 +201,12 @@ class ApplySoftmax {

using AccessTypeD = AlignedArray<ElementD, kAlignment>;

-    int block_batch = blockIdx.z;
-    int block_m = blockIdx.x * ApplyShape::kRow;
+    int block_batch = BlockIdxZ();
+    int block_m = BlockIdxX() * ApplyShape::kRow;
     int block_n = 0;

-    int thread_m = threadIdx.y;
-    int thread_n = threadIdx.x * kAlignment;
+    int thread_m = ThreadIdxY();
+    int thread_n = ThreadIdxX() * kAlignment;

int idx_m = block_m + thread_m;
int idx_n = block_n + thread_n;
@@ -580,6 +580,17 @@ class GemmSoftmax {

cudaError_t result;

#if defined(CUTLASS_ENABLE_SYCL)
const auto sycl_block = syclcompat::dim3(gemm_block.x, gemm_block.y, gemm_block.z);
const auto sycl_grid = syclcompat::dim3(gemm_grid.x, gemm_grid.y, gemm_grid.z);

using namespace syclcompat::experimental;

auto gemm_event = launch<cutlass::Kernel<GemmKernel>>(launch_policy{
sycl_grid, sycl_block, local_mem_size{static_cast<std::size_t>(gemm_smem_size)}},
params_.gemm);
EventManager::getInstance().addEvent(gemm_event);
rolandschulz (Collaborator), Oct 16, 2024: Why does the event need to be recorded? (A general question for each launch.)

#else
if (gemm_smem_size >= (48 << 10)) {
result = cudaFuncSetAttribute(cutlass::Kernel<GemmKernel>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
@@ -591,6 +602,7 @@
}

cutlass::Kernel<GemmKernel><<<gemm_grid, gemm_block, gemm_smem_size, stream>>>(params_.gemm);
#endif

result = cudaGetLastError();

@@ -613,9 +625,21 @@
dim3 final_reduction_grid(block_per_row, 1, params_.softmax.args.batch_count);
dim3 final_reduction_block(thread_per_block);

#if defined(CUTLASS_ENABLE_SYCL)
const auto sycl_final_reduction_block = syclcompat::dim3(final_reduction_block.x, final_reduction_block.y, final_reduction_block.z);
const auto sycl_final_reduction_grid = syclcompat::dim3(final_reduction_grid.x, final_reduction_grid.y, final_reduction_grid.z);

using namespace syclcompat::experimental;

auto final_reduction_event = launch<Kernel<ApplyFinalReductionKernel>>(launch_policy{
sycl_final_reduction_grid, sycl_final_reduction_block, local_mem_size{sizeof(typename ApplyFinalReductionKernel::SharedStorage)}},
params_.reduction);
EventManager::getInstance().addEvent(final_reduction_event);
#else
Kernel<ApplyFinalReductionKernel><<<
final_reduction_grid, final_reduction_block, sizeof(typename ApplyFinalReductionKernel::SharedStorage), stream
>>>(params_.reduction);
#endif

result = cudaGetLastError();

@@ -637,9 +661,21 @@
(params_.softmax.args.extent.column() + threadblock_columns - 1) / threadblock_columns,
params_.softmax.args.batch_count);

#if defined(CUTLASS_ENABLE_SYCL)
const auto sycl_apply_block = syclcompat::dim3(apply_block.x, apply_block.y, apply_block.z);
const auto sycl_apply_grid = syclcompat::dim3(apply_grid.x, apply_grid.y, apply_grid.z);

using namespace syclcompat::experimental;

auto apply_event = launch<Kernel<SoftmaxApplyKernel>>(launch_policy{
sycl_apply_grid, sycl_apply_block, local_mem_size{sizeof(typename SoftmaxApplyKernel::SharedStorage)}},
params_.softmax);
EventManager::getInstance().addEvent(apply_event);
#else
Kernel<SoftmaxApplyKernel><<<
apply_grid, apply_block, sizeof(typename SoftmaxApplyKernel::SharedStorage), stream
>>>(params_.softmax);
#endif

result = cudaGetLastError();

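All three launches in this file follow one recipe: translate dim3 to syclcompat::dim3, bundle grid, block, and the dynamic shared-memory size into a launch_policy, launch, and register the returned event. Condensed from the PR's own calls (`MyKernel`, `grid`, `block`, `smem_bytes`, and `params` are placeholders):

```c++
#if defined(CUTLASS_ENABLE_SYCL)
using namespace syclcompat::experimental;

const auto sycl_grid  = syclcompat::dim3(grid.x,  grid.y,  grid.z);
const auto sycl_block = syclcompat::dim3(block.x, block.y, block.z);

// local_mem_size takes the role of the <<<...>>> dynamic shared-memory
// argument; the cudaFuncSetAttribute opt-in has no counterpart here.
auto event = launch<cutlass::Kernel<MyKernel>>(
    launch_policy{sycl_grid, sycl_block,
                  local_mem_size{static_cast<std::size_t>(smem_bytes)}},
    params);

// Keeps the event around for later synchronization or profiling; whether
// every launch needs this is the reviewer's open question above.
EventManager::getInstance().addEvent(event);
#else
cutlass::Kernel<MyKernel><<<grid, block, smem_bytes, stream>>>(params);
#endif
```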
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
@@ -155,6 +155,7 @@ if (NOT CUTLASS_ENABLE_SYCL)
else()
foreach(EXAMPLE
14_ampere_tf32_tensorop_gemm
35_gemm_softmax
cute
sycl
)
2 changes: 1 addition & 1 deletion include/cute/util/debug.hpp
@@ -36,7 +36,7 @@
*/

#if defined(CUTLASS_ENABLE_SYCL)
-#include <sycl.hpp>
+#include <sycl/sycl.hpp>
#include <syclcompat.hpp>
#else
#include <cuda_runtime_api.h>
2 changes: 2 additions & 0 deletions include/cutlass/arch/memory.h
@@ -65,6 +65,8 @@ struct global_load;
(__CUDACC_VER_MAJOR__ > 11)) && \
defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750)
#define CUTLASS_ENABLE_L2_PREFETCH 1
#elif defined(__SYCL_CUDA_ARCH__) && (__SYCL_CUDA_ARCH__ >= 750)
#define CUTLASS_ENABLE_L2_PREFETCH 1
#else
#define CUTLASS_ENABLE_L2_PREFETCH 0
#endif
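The new #elif keeps CUTLASS_ENABLE_L2_PREFETCH on when SYCL targets NVIDIA hardware, where __SYCL_CUDA_ARCH__ plays the role of __CUDA_ARCH__. The macro gates prefetch-hinted PTX loads of roughly this form (a representative sketch; the real variants live in this header):

```c++
// Representative guarded load: with the macro set, the load carries an
// .L2::128B prefetch hint; otherwise it is a plain global load.
__device__ unsigned load_b32(void const *ptr) {
  unsigned dst;
#if CUTLASS_ENABLE_L2_PREFETCH
  asm volatile("ld.global.L2::128B.b32 %0, [%1];" : "=r"(dst) : "l"(ptr));
#else
  asm volatile("ld.global.b32 %0, [%1];" : "=r"(dst) : "l"(ptr));
#endif
  return dst;
}
```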
14 changes: 7 additions & 7 deletions include/cutlass/arch/memory_sm80.h
@@ -41,7 +41,7 @@
#include "cutlass/arch/memory_sm75.h"
#include "cutlass/arch/cache_operation.h"

-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) || (defined(__SYCL_CUDA_ARCH__) && (__SYCL_CUDA_ARCH__ >= 800))
#define CUDA_CP_ASYNC_ACTIVATED 1
#else
#define CUDA_CP_ASYNC_ACTIVATED 0
@@ -194,7 +194,7 @@ struct cp_async_nan<16, CacheOperation::Always> {
cp_async_nan(void *smem_ptr, void const *global_ptr, bool pred_guard) {
#if CUDA_CP_ASYNC_ACTIVATED

-    static __constant__ uint4 OOB_NAN_F16x8 = {OOB_NAN_F16x2, OOB_NAN_F16x2,
+    static CUTLASS_CONSTANT uint4 OOB_NAN_F16x8 = {OOB_NAN_F16x2, OOB_NAN_F16x2,
OOB_NAN_F16x2, OOB_NAN_F16x2};

unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr);
@@ -236,9 +236,9 @@ struct cp_async_diag <Element_, false> {
#if CUDA_CP_ASYNC_ACTIVATED

/// Values for the diagonal elements of the triangular input matrix
-    static __constant__ uint2 DIAG_DATA_DOUBLE_ONE = {0x3ff00000, 0x00000000};
-    static __constant__ uint1 DIAG_DATA_FLOAT_ONE = {0x3f800000};
-    static __constant__ uint1 DIAG_DATA_ZERO = {0x00000000};
+    static CUTLASS_CONSTANT uint2 DIAG_DATA_DOUBLE_ONE = {0x3ff00000, 0x00000000};
+    static CUTLASS_CONSTANT uint1 DIAG_DATA_FLOAT_ONE = {0x3f800000};
+    static CUTLASS_CONSTANT uint1 DIAG_DATA_ZERO = {0x00000000};

unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr);

@@ -283,7 +283,7 @@ struct cp_async_diag <Element_, true> {
#if CUDA_CP_ASYNC_ACTIVATED

/// Values for the diagonal elements of the triangular input matrix
-    static __constant__ uint1 DIAG_DATA_ZERO = {0x00000000};
+    static CUTLASS_CONSTANT uint1 DIAG_DATA_ZERO = {0x00000000};

unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr);

@@ -397,7 +397,7 @@ struct cp_async_nan<16, CacheOperation::Global> {
cp_async_nan(void *smem_ptr, void const *global_ptr, bool pred_guard) {
#if CUDA_CP_ASYNC_ACTIVATED

-    static __constant__ uint4 OOB_NAN_F16x8 = {OOB_NAN_F16x2, OOB_NAN_F16x2,
+    static CUTLASS_CONSTANT uint4 OOB_NAN_F16x8 = {OOB_NAN_F16x2, OOB_NAN_F16x2,
OOB_NAN_F16x2, OOB_NAN_F16x2};

unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr);
6 changes: 4 additions & 2 deletions include/cutlass/arch/mma_sm80.h
@@ -47,11 +47,13 @@

////////////////////////////////////////////////////////////////////////////////

-#if ((__CUDACC_VER_MAJOR__ > 11) || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0))
+#if ((__CUDACC_VER_MAJOR__ > 11) || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0)) || \
+    defined(CUTLASS_ENABLE_SYCL)

#define CUTLASS_ARCH_MMA_SM80_SUPPORTED 1

-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) || \
+    (defined(__SYCL_CUDA_ARCH__) && (__SYCL_CUDA_ARCH__ >= 800))
#define CUTLASS_ARCH_MMA_SM80_ENABLED

#if (__CUDA_ARCH__ <= 900)
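As in the memory headers, the two macros split toolchain support from target support: CUTLASS_ARCH_MMA_SM80_SUPPORTED is now also granted to SYCL builds, while CUTLASS_ARCH_MMA_SM80_ENABLED additionally requires a device pass targeting SM80 or newer through either __CUDA_ARCH__ or __SYCL_CUDA_ARCH__. Downstream code keeps the usual guard shape (sketch):

```c++
#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
  // safe to emit Ampere mma.sync PTX here
#else
  // fall back, or report the architecture as unsupported
#endif
```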
7 changes: 7 additions & 0 deletions include/cutlass/detail/helper_macros.hpp
@@ -69,6 +69,13 @@
#define CUTLASS_GLOBAL __global__ static
#define CUTLASS_SHARED __shared__
#endif

#if defined(__CUDA_ARCH__)
#define CUTLASS_CONSTANT __constant__
#else
#define CUTLASS_CONSTANT constexpr
#endif

////////////////////////////////////////////////////////////////////////////////////////////////////

template<typename T>
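CUTLASS_CONSTANT is what the memory_sm80.h changes above rely on: an NVCC device pass keeps the __constant__ qualifier, while host and SYCL compilation get a plain constexpr. One declaration therefore expands both ways (shown with the PR's own example):

```c++
// CUDA device pass (__CUDA_ARCH__ defined):
//   static __constant__ uint4 OOB_NAN_F16x8 = {...};
// Host / SYCL pass:
//   static constexpr uint4 OOB_NAN_F16x8 = {...};
static CUTLASS_CONSTANT uint4 OOB_NAN_F16x8 = {OOB_NAN_F16x2, OOB_NAN_F16x2,
                                               OOB_NAN_F16x2, OOB_NAN_F16x2};
```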
@@ -381,11 +381,11 @@ class EpilogueVisitorSoftmax {
// Compute accumulate sum only in the last step
accum_sum_ = warp_reduce_sum_(accum_sum_);

-      bool is_first_thread_in_tile = ((threadIdx.x % kThreadsPerRow) == 0);
+      bool is_first_thread_in_tile = ((ThreadIdxX() % kThreadsPerRow) == 0);
bool row_guard = thread_offset_.row() < extent_.row();
bool is_write_thread = row_guard && is_first_thread_in_tile;

-      int block_batch = blockIdx.z;
+      int block_batch = BlockIdxZ();

ElementNorm *curr_ptr_max = ptr_Max_ + thread_offset_.row() + column_offset_ + block_batch * params_.batch_stride_Max;
ElementSum *curr_ptr_sum = ptr_Sum_ + thread_offset_.row() + column_offset_ + block_batch * params_.batch_stride_Sum;
@@ -434,7 +434,7 @@ class EpilogueVisitorSoftmax {
int half_thread_in_row = (kThreadsPerRow >> 1);
CUTLASS_PRAGMA_UNROLL
for (int i = half_thread_in_row; i > 0; i >>= 1) {
-        ElementSoftmaxCompute tmp = __shfl_xor_sync(0xFFFFFFFF, sum_, i);
+        ElementSoftmaxCompute tmp = shfl_xor_sync(0xFFFFFFFF, sum_, i);
sum_ += tmp;
}
return sum_;
@@ -445,7 +445,7 @@
int half_thread_in_row = (kThreadsPerRow >> 1);
CUTLASS_PRAGMA_UNROLL
for (int i = half_thread_in_row; i > 0; i >>= 1) {
-        ElementSoftmaxCompute tmp = __shfl_xor_sync(0xFFFFFFFF, max_, i);
+        ElementSoftmaxCompute tmp = shfl_xor_sync(0xFFFFFFFF, max_, i);
max_ = fast_max(max_, tmp);
}
return max_;
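Both reduction helpers are butterfly (XOR-shuffle) reductions: each round exchanges values across half the previous stride, so after log2(kThreadsPerRow) rounds every lane in the row group holds the full sum or max. The same idiom as a self-contained CUDA sketch:

```c++
// Butterfly warp reduction: after log2(Width) XOR-shuffle rounds, every lane
// in each Width-wide group holds the sum of that group's inputs.
template <int Width>  // power of two, at most the warp size (here kThreadsPerRow)
__device__ float warp_reduce_sum(float value) {
  #pragma unroll
  for (int i = Width / 2; i > 0; i >>= 1) {
    value += __shfl_xor_sync(0xFFFFFFFF, value, i);
  }
  return value;
}
```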