Skip to content

Commit

Permalink
Add sycl equivalent to cuda events for profiling (#69)
Browse files Browse the repository at this point in the history
  • Loading branch information
aacostadiaz authored Jul 16, 2024
1 parent 8f3bd67 commit 00c366c
Show file tree
Hide file tree
Showing 10 changed files with 249 additions and 25 deletions.
14 changes: 14 additions & 0 deletions benchmarks/common/benchmark_runner.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@
*
**************************************************************************************************/

#if defined(CUTLASS_ENABLE_SYCL)
#define CUTLASS_SYCLCOMPAT_PROFILING_ENABLED
#endif

#include "cutlass/gemm/device/gemm.h"
#include "cutlass/epilogue/collective/default_epilogue.hpp"
#include "cutlass/gemm/device/gemm_universal.h"
Expand Down Expand Up @@ -208,6 +212,16 @@ struct BenchmarkRunner {
auto problem_shape_MNKL = cute::append<4>(problem_size, 1);
auto [M, N, K, L] = problem_shape_MNKL;

#if defined(CUTLASS_SYCLCOMPAT_PROFILING_ENABLED)
sycl::property_list prop = {
sycl::property::queue::in_order(),
sycl::property::queue::enable_profiling()
};

auto q = sycl::queue(syclcompat::get_default_context(), syclcompat::get_current_device(), prop);
syclcompat::set_default_queue(q);
#endif

stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L));
stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L));
stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
*
**************************************************************************************************/

#define CUTLASS_SYCLCOMPAT_PROFILING_ENABLED

#include <cstdlib>
#include <cstdio>

Expand Down Expand Up @@ -121,6 +123,13 @@ run(Gemm_Op gemm_op)

void test_gemm(int m, int n, int k)
{
sycl::property_list prop = {
sycl::property::queue::in_order(),
sycl::property::queue::enable_profiling()
};

auto q = sycl::queue(syclcompat::get_default_context(), syclcompat::get_current_device(), prop);
syclcompat::set_default_queue(q);

std::cout << "M = " << m << std::endl;
std::cout << "N = " << n << std::endl;
Expand Down
16 changes: 14 additions & 2 deletions examples/cute/tutorial/sgemm_1_sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
*
**************************************************************************************************/

#define CUTLASS_SYCLCOMPAT_PROFILING_ENABLED

#include "cutlass/util/GPU_Clock.hpp"
#include "cutlass/util/print_error.hpp"

Expand Down Expand Up @@ -295,12 +297,13 @@ void gemm_nt(int m, int n, int k, Alpha alpha, TA const* A, int ldA, TB const* B
auto dimBlock = syclcompat::dim3(size(tC));
auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN)));

syclcompat::launch<
auto event = syclcompat::launch<
gemm_device<decltype(prob_shape), decltype(cta_tiler), TA, decltype(dA), decltype(sA),
decltype(tA), TB, decltype(dB), decltype(sB), decltype(tB), TC, decltype(dC),
decltype(sC), decltype(tC), Alpha, Beta>>(dimGrid, dimBlock, prob_shape,
cta_tiler, A, dA, sA, tA, B, dB, sB, tB,
C, dC, sC, tC, alpha, beta);
EventManager::getInstance().addEvent(event);
}

// Setup params for a TN GEMM
Expand Down Expand Up @@ -341,12 +344,13 @@ void gemm_tn(int m, int n, int k, Alpha alpha, TA const* A, int ldA, TB const* B
auto dimBlock = syclcompat::dim3(size(tC));
auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN)));

syclcompat::launch<
auto event = syclcompat::launch<
gemm_device<decltype(prob_shape), decltype(cta_tiler), TA, decltype(dA), decltype(sA),
decltype(tA), TB, decltype(dB), decltype(sB), decltype(tB), TC, decltype(dC),
decltype(sC), decltype(tC), Alpha, Beta>>(dimGrid, dimBlock, prob_shape,
cta_tiler, A, dA, sA, tA, B, dB, sB, tB,
C, dC, sC, tC, alpha, beta);
EventManager::getInstance().addEvent(event);
}

template <class TA, class TB, class TC, class Alpha, class Beta>
Expand Down Expand Up @@ -376,6 +380,14 @@ int main(int argc, char** argv) {
char transB = 'T';
if (argc >= 6) sscanf(argv[5], "%c", &transB);

sycl::property_list prop = {
sycl::property::queue::in_order(),
sycl::property::queue::enable_profiling()
};

auto q = sycl::queue(syclcompat::get_default_context(), syclcompat::get_current_device(), prop);
syclcompat::set_default_queue(q);

using TA = float;
using TB = float;
using TC = float;
Expand Down
16 changes: 14 additions & 2 deletions examples/cute/tutorial/sgemm_2_sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
*
**************************************************************************************************/

#define CUTLASS_SYCLCOMPAT_PROFILING_ENABLED

#include "cutlass/util/GPU_Clock.hpp"
#include "cutlass/util/print_error.hpp"

Expand Down Expand Up @@ -287,12 +289,13 @@ void gemm_nt(int m, int n, int k, Alpha alpha, TA const* A, int ldA, TB const* B

auto dimBlock = syclcompat::dim3(size(mmaC));
auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN)));
syclcompat::launch<
auto event = syclcompat::launch<
gemm_device<decltype(prob_shape), decltype(cta_tiler), TA, decltype(dA), decltype(sA),
decltype(copyA), TB, decltype(dB), decltype(sB), decltype(copyB), TC,
decltype(dC), decltype(sC), decltype(mmaC), Alpha, Beta>>(
dimGrid, dimBlock, prob_shape, cta_tiler, A, dA, sA, copyA, B, dB, sB, copyB, C, dC,
sC, mmaC, alpha, beta);
EventManager::getInstance().addEvent(event);
}

// Setup params for a TN GEMM
Expand Down Expand Up @@ -361,12 +364,13 @@ void gemm_tn(int m, int n, int k, Alpha alpha, TA const* A, int ldA, TB const* B

auto dimBlock = syclcompat::dim3(size(mmaC));
auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN)));
syclcompat::launch<
auto event = syclcompat::launch<
gemm_device<decltype(prob_shape), decltype(cta_tiler), TA, decltype(dA), decltype(sA),
decltype(copyA), TB, decltype(dB), decltype(sB), decltype(copyB), TC,
decltype(dC), decltype(sC), decltype(mmaC), Alpha, Beta>>(
dimGrid, dimBlock, prob_shape, cta_tiler, A, dA, sA, copyA, B, dB, sB, copyB, C, dC,
sC, mmaC, alpha, beta);
EventManager::getInstance().addEvent(event);
}

template <class TA, class TB, class TC, class Alpha, class Beta>
Expand Down Expand Up @@ -397,6 +401,14 @@ int main(int argc, char** argv) {
char transB = 'T';
if (argc >= 6) sscanf(argv[5], "%c", &transB);

sycl::property_list prop = {
sycl::property::queue::in_order(),
sycl::property::queue::enable_profiling()
};

auto q = sycl::queue(syclcompat::get_default_context(), syclcompat::get_current_device(), prop);
syclcompat::set_default_queue(q);

using TA = float;
using TB = float;
using TC = float;
Expand Down
16 changes: 14 additions & 2 deletions examples/cute/tutorial/sgemm_sm70_sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
*
**************************************************************************************************/

#define CUTLASS_SYCLCOMPAT_PROFILING_ENABLED

#include "cutlass/util/GPU_Clock.hpp"
#include "cutlass/util/print_error.hpp"

Expand Down Expand Up @@ -283,12 +285,13 @@ void gemm_nt(int m, int n, int k, Alpha alpha, TA const* A, int ldA, TB const* B

auto dimBlock = syclcompat::dim3(size(mmaC));
auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN)));
syclcompat::launch<
auto event = syclcompat::launch<
gemm_device<decltype(prob_shape), decltype(cta_tiler), TA, decltype(dA), decltype(sA),
decltype(copyA), TB, decltype(dB), decltype(sB), decltype(copyB), TC,
decltype(dC), decltype(sC), decltype(mmaC), Alpha, Beta>>(
dimGrid, dimBlock, prob_shape, cta_tiler, A, dA, sA, copyA, B, dB, sB, copyB, C, dC,
sC, mmaC, alpha, beta);
EventManager::getInstance().addEvent(event);
}

// Setup params for a TN GEMM
Expand Down Expand Up @@ -349,12 +352,13 @@ void gemm_tn(int m, int n, int k, Alpha alpha, TA const* A, int ldA, TB const* B

auto dimBlock = syclcompat::dim3(size(mmaC));
auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN)));
syclcompat::launch<
auto event = syclcompat::launch<
gemm_device<decltype(prob_shape), decltype(cta_tiler), TA, decltype(dA), decltype(sA),
decltype(copyA), TB, decltype(dB), decltype(sB), decltype(copyB), TC,
decltype(dC), decltype(sC), decltype(mmaC), Alpha, Beta>>(
dimGrid, dimBlock, prob_shape, cta_tiler, A, dA, sA, copyA, B, dB, sB, copyB, C, dC,
sC, mmaC, alpha, beta);
EventManager::getInstance().addEvent(event);
}

template <class TA, class TB, class TC, class Alpha, class Beta>
Expand Down Expand Up @@ -385,6 +389,14 @@ int main(int argc, char** argv) {
char transB = 'T';
if (argc >= 6) sscanf(argv[5], "%c", &transB);

sycl::property_list prop = {
sycl::property::queue::in_order(),
sycl::property::queue::enable_profiling()
};

auto q = sycl::queue(syclcompat::get_default_context(), syclcompat::get_current_device(), prop);
syclcompat::set_default_queue(q);

using TA = float;
using TB = float;
using TC = float;
Expand Down
16 changes: 14 additions & 2 deletions examples/cute/tutorial/sgemm_sm80_sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
*
**************************************************************************************************/

#define CUTLASS_SYCLCOMPAT_PROFILING_ENABLED

#include "cutlass/util/GPU_Clock.hpp"
#include "cutlass/util/print_error.hpp"

Expand Down Expand Up @@ -362,12 +364,13 @@ void gemm_nt(int m, int n, int k, Alpha alpha, TA const* A, int ldA, TB const* B

auto dimBlock = syclcompat::dim3(size(mmaC));
auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN)));
syclcompat::launch<
auto event = syclcompat::launch<
gemm_device<decltype(prob_shape), decltype(cta_tiler), TA, decltype(dA), decltype(sA),
decltype(copyA), TB, decltype(dB), decltype(sB), decltype(copyB), TC,
decltype(dC), decltype(sC), decltype(mmaC), Alpha, Beta>>(
dimGrid, dimBlock, prob_shape, cta_tiler, A, dA, sA, copyA, B, dB, sB, copyB, C, dC, sC, mmaC,
alpha, beta);
EventManager::getInstance().addEvent(event);
}

// Setup params for a NT GEMM
Expand Down Expand Up @@ -433,12 +436,13 @@ void gemm_tn(int m, int n, int k, Alpha alpha, TA const* A, int ldA, TB const* B

auto dimBlock = syclcompat::dim3(size(mmaC));
auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN)));
syclcompat::launch<
auto event = syclcompat::launch<
gemm_device<decltype(prob_shape), decltype(cta_tiler), TA, decltype(dA), decltype(sA),
decltype(copyA), TB, decltype(dB), decltype(sB), decltype(copyB), TC,
decltype(dC), decltype(sC), decltype(mmaC), Alpha, Beta>>(
dimGrid, dimBlock, prob_shape, cta_tiler, A, dA, sA, copyA, B, dB, sB, copyB, C, dC, sC, mmaC,
alpha, beta);
EventManager::getInstance().addEvent(event);
}

template <class TA, class TB, class TC, class Alpha, class Beta>
Expand Down Expand Up @@ -468,6 +472,14 @@ int main(int argc, char** argv) {
char transB = 'T';
if (argc >= 6) sscanf(argv[5], "%c", &transB);

sycl::property_list prop = {
sycl::property::queue::in_order(),
sycl::property::queue::enable_profiling()
};

auto q = sycl::queue(syclcompat::get_default_context(), syclcompat::get_current_device(), prop);
syclcompat::set_default_queue(q);

using TA = float;
using TB = float;
using TC = float;
Expand Down
10 changes: 10 additions & 0 deletions examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
*
**************************************************************************************************/

#define CUTLASS_SYCLCOMPAT_PROFILING_ENABLED

#include "cutlass/gemm/device/gemm.h"
#include "cutlass/epilogue/collective/default_epilogue.hpp"
#include "cutlass/epilogue/collective/intel_pvc_epilogue.hpp"
Expand Down Expand Up @@ -260,6 +262,14 @@ struct ExampleRunner {

initialize(problem_size);

sycl::property_list prop = {
sycl::property::queue::in_order(),
sycl::property::queue::enable_profiling()
};

auto q = sycl::queue(syclcompat::get_default_context(), syclcompat::get_current_device(), prop);
syclcompat::set_default_queue(q);

typename Gemm::GemmKernel::Arguments arguments{
cutlass::gemm::GemmUniversalMode::kGemm,
problem_size,
Expand Down
9 changes: 7 additions & 2 deletions include/cutlass/gemm/device/gemm_universal_adapter.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,10 @@
// 3.x
#include "cutlass/gemm/kernel/gemm_universal.hpp"

#if defined(CUTLASS_ENABLE_SYCL)
#include "cutlass/util/sycl_event_manager.hpp"
#endif

////////////////////////////////////////////////////////////////////////////////

namespace cutlass::gemm::device {
Expand Down Expand Up @@ -407,10 +411,11 @@ class GemmUniversalAdapter<
const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z);

#if defined (SYCL_INTEL_TARGET)
syclcompat::experimental::launch<device_kernel<GemmKernel>, DispatchPolicy::SubgroupSize>(sycl_grid, sycl_block, smem_size, params);
auto event = syclcompat::experimental::launch<device_kernel<GemmKernel>, DispatchPolicy::SubgroupSize>(sycl_grid, sycl_block, smem_size, params);
#else
syclcompat::launch<device_kernel<GemmKernel>>(sycl_grid, sycl_block, smem_size, params);
auto event = syclcompat::launch<device_kernel<GemmKernel>>(sycl_grid, sycl_block, smem_size, params);
#endif
EventManager::getInstance().addEvent(event);
#else
device_kernel<GemmKernel><<<grid, block, smem_size, stream>>>(params);
#endif
Expand Down
33 changes: 18 additions & 15 deletions tools/util/include/cutlass/util/GPU_Clock.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,42 +32,49 @@
#pragma once

#if defined(CUTLASS_ENABLE_SYCL)
#include <syclcompat.hpp>
#include <chrono>
#include "cutlass/util/sycl_event_manager.hpp"
#else
#include <cuda_runtime.h>
#endif

struct GPU_Clock
{
#if !defined(CUTLASS_ENABLE_SYCL)
GPU_Clock() {
#if defined(CUTLASS_ENABLE_SYCL)
start_ = SyclEvent{};
stop_ = SyclEvent{};
#else
cudaEventCreate(&start_);
cudaEventCreate(&stop_);
cudaEventRecord(start_);
#endif
}

~GPU_Clock() {
#if defined(CUTLASS_ENABLE_SYCL)
syclEventDestroy(start_);
syclEventDestroy(stop_);
#else
cudaEventDestroy(start_);
cudaEventDestroy(stop_);
}
#endif
}

void start() {
#if defined(CUTLASS_ENABLE_SYCL)
syclcompat::get_default_queue().wait();
start_ = std::chrono::high_resolution_clock::now();
syclEventRecord(start_);
#else
cudaEventRecord(start_);
#endif
}

float milliseconds() {
#if defined(CUTLASS_ENABLE_SYCL)
syclcompat::get_default_queue().wait();
auto stop = std::chrono::high_resolution_clock::now();
std::chrono::duration<float, std::milli> time = stop - start_;
return time.count();
syclEventRecord(stop_);
syclEventSynchronize(start_, stop_);
float time;
syclEventElapsedTime(&time, start_, stop_);
return time;
#else
cudaEventRecord(stop_);
cudaEventSynchronize(stop_);
Expand All @@ -83,11 +90,7 @@ struct GPU_Clock

private:
#if defined(CUTLASS_ENABLE_SYCL)
typedef std::chrono::nanoseconds duration;
typedef std::chrono::high_resolution_clock high_resolution_clock;
typedef std::chrono::time_point<high_resolution_clock, duration> time_point;

time_point start_ = std::chrono::high_resolution_clock::now();
SyclEvent start_, stop_;
#else
cudaEvent_t start_, stop_;
#endif
Expand Down
Loading

0 comments on commit 00c366c

Please sign in to comment.