From 00c366c5647dc1fe3737501ff1c2db10fc172390 Mon Sep 17 00:00:00 2001
From: Alejandro Acosta <alejandro.acosta@codeplay.com>
Date: Tue, 16 Jul 2024 12:08:21 +0100
Subject: [PATCH] Add sycl equivalent to cuda events for profiling (#69)

---
 benchmarks/common/benchmark_runner.hpp        |  14 ++
 .../ampere_tf32_tensorop_gemm_cute.cpp        |   9 ++
 examples/cute/tutorial/sgemm_1_sycl.cpp       |  16 ++-
 examples/cute/tutorial/sgemm_2_sycl.cpp       |  16 ++-
 examples/cute/tutorial/sgemm_sm70_sycl.cpp    |  16 ++-
 examples/cute/tutorial/sgemm_sm80_sycl.cpp    |  16 ++-
 .../sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp    |  10 ++
 .../gemm/device/gemm_universal_adapter.h      |   9 +-
 tools/util/include/cutlass/util/GPU_Clock.hpp |  33 +++--
 .../cutlass/util/sycl_event_manager.hpp       | 135 ++++++++++++++++++
 10 files changed, 249 insertions(+), 25 deletions(-)
 create mode 100644 tools/util/include/cutlass/util/sycl_event_manager.hpp
diff --git a/benchmarks/common/benchmark_runner.hpp b/benchmarks/common/benchmark_runner.hpp
index 5eb2ade3eb..decedbf75a 100644
--- a/benchmarks/common/benchmark_runner.hpp
+++ b/benchmarks/common/benchmark_runner.hpp
@@ -29,6 +29,10 @@
  *
  **************************************************************************************************/
 
+#if defined(CUTLASS_ENABLE_SYCL)
+#define CUTLASS_SYCLCOMPAT_PROFILING_ENABLED
+#endif
+
 #include "cutlass/gemm/device/gemm.h"
 #include "cutlass/epilogue/collective/default_epilogue.hpp"
 #include "cutlass/gemm/device/gemm_universal.h"
@@ -208,6 +212,16 @@ struct BenchmarkRunner {
       auto problem_shape_MNKL = cute::append<4>(problem_size, 1);
       auto [M, N, K, L] = problem_shape_MNKL;
 
+#if defined(CUTLASS_SYCLCOMPAT_PROFILING_ENABLED)
+      sycl::property_list prop = {
+              sycl::property::queue::in_order(),
+              sycl::property::queue::enable_profiling()
+      };
+
+      auto q = sycl::queue(syclcompat::get_default_context(), syclcompat::get_current_device(), prop);
+      syclcompat::set_default_queue(q);
+#endif
+
       stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L));
       stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L));
       stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L));
diff --git a/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm_cute.cpp b/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm_cute.cpp
index 77bae13ca1..8858a3c2e3 100644
--- a/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm_cute.cpp
+++ b/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm_cute.cpp
@@ -29,6 +29,8 @@
  *
  **************************************************************************************************/
 
+#define CUTLASS_SYCLCOMPAT_PROFILING_ENABLED
+
 #include <cstdlib>
 #include <cstdio>
 
@@ -121,6 +123,13 @@ run(Gemm_Op gemm_op)
 
 void test_gemm(int m, int n, int k)
 {
+  sycl::property_list prop = {
+          sycl::property::queue::in_order(),
+          sycl::property::queue::enable_profiling()
+  };
+
+  auto q = sycl::queue(syclcompat::get_default_context(), syclcompat::get_current_device(), prop);
+  syclcompat::set_default_queue(q);
 
   std::cout << "M = " << m << std::endl;
   std::cout << "N = " << n << std::endl;
diff --git a/examples/cute/tutorial/sgemm_1_sycl.cpp b/examples/cute/tutorial/sgemm_1_sycl.cpp
index 764b1f105e..2a588af0ef 100644
--- a/examples/cute/tutorial/sgemm_1_sycl.cpp
+++ b/examples/cute/tutorial/sgemm_1_sycl.cpp
@@ -29,6 +29,8 @@
  *
  **************************************************************************************************/
 
+#define CUTLASS_SYCLCOMPAT_PROFILING_ENABLED
+
 #include "cutlass/util/GPU_Clock.hpp"
 #include "cutlass/util/print_error.hpp"
 
@@ -295,12 +297,13 @@ void gemm_nt(int m, int n, int k, Alpha alpha, TA const* A, int ldA, TB const* B
   auto dimBlock = syclcompat::dim3(size(tC));
   auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN)));
 
-  syclcompat::launch<
+  auto event = syclcompat::launch<
       gemm_device<decltype(prob_shape), decltype(cta_tiler), TA, decltype(dA), decltype(sA),
                   decltype(tA), TB, decltype(dB), decltype(sB), decltype(tB), TC, decltype(dC),
                   decltype(sC), decltype(tC), Alpha, Beta>>(dimGrid, dimBlock, prob_shape,
                                                             cta_tiler, A, dA, sA, tA, B, dB, sB, tB,
                                                             C, dC, sC, tC, alpha, beta);
+  EventManager::getInstance().addEvent(event);
 }
 
 // Setup params for a TN GEMM
@@ -341,12 +344,13 @@ void gemm_tn(int m, int n, int k, Alpha alpha, TA const* A, int ldA, TB const* B
   auto dimBlock = syclcompat::dim3(size(tC));
   auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN)));
 
-  syclcompat::launch<
+  auto event = syclcompat::launch<
       gemm_device<decltype(prob_shape), decltype(cta_tiler), TA, decltype(dA), decltype(sA),
                   decltype(tA), TB, decltype(dB), decltype(sB), decltype(tB), TC, decltype(dC),
                   decltype(sC), decltype(tC), Alpha, Beta>>(dimGrid, dimBlock, prob_shape,
                                                             cta_tiler, A, dA, sA, tA, B, dB, sB, tB,
                                                             C, dC, sC, tC, alpha, beta);
+  EventManager::getInstance().addEvent(event);
 }
 
 template <class TA, class TB, class TC, class Alpha, class Beta>
@@ -376,6 +380,14 @@ int main(int argc, char** argv) {
   char transB = 'T';
   if (argc >= 6) sscanf(argv[5], "%c", &transB);
 
+  sycl::property_list prop = {
+          sycl::property::queue::in_order(),
+          sycl::property::queue::enable_profiling()
+  };
+
+  auto q = sycl::queue(syclcompat::get_default_context(), syclcompat::get_current_device(), prop);
+  syclcompat::set_default_queue(q);
+
   using TA = float;
   using TB = float;
   using TC = float;
diff --git a/examples/cute/tutorial/sgemm_2_sycl.cpp b/examples/cute/tutorial/sgemm_2_sycl.cpp
index bb6d1fe214..831bfe0881 100644
--- a/examples/cute/tutorial/sgemm_2_sycl.cpp
+++ b/examples/cute/tutorial/sgemm_2_sycl.cpp
@@ -29,6 +29,8 @@
  *
  **************************************************************************************************/
 
+#define CUTLASS_SYCLCOMPAT_PROFILING_ENABLED
+
 #include "cutlass/util/GPU_Clock.hpp"
 #include "cutlass/util/print_error.hpp"
 
@@ -287,12 +289,13 @@ void gemm_nt(int m, int n, int k, Alpha alpha, TA const* A, int ldA, TB const* B
 
   auto dimBlock = syclcompat::dim3(size(mmaC));
   auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN)));
-  syclcompat::launch<
+  auto event = syclcompat::launch<
       gemm_device<decltype(prob_shape), decltype(cta_tiler), TA, decltype(dA), decltype(sA),
                   decltype(copyA), TB, decltype(dB), decltype(sB), decltype(copyB), TC,
                   decltype(dC), decltype(sC), decltype(mmaC), Alpha, Beta>>(
       dimGrid, dimBlock, prob_shape, cta_tiler, A, dA, sA, copyA, B, dB, sB, copyB, C, dC,
       sC, mmaC, alpha, beta);
+  EventManager::getInstance().addEvent(event);
 }
 
 // Setup params for a TN GEMM
@@ -361,12 +364,13 @@ void gemm_tn(int m, int n, int k, Alpha alpha, TA const* A, int ldA, TB const* B
 
   auto dimBlock = syclcompat::dim3(size(mmaC));
   auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN)));
-  syclcompat::launch<
+  auto event = syclcompat::launch<
       gemm_device<decltype(prob_shape), decltype(cta_tiler), TA, decltype(dA), decltype(sA),
                   decltype(copyA), TB, decltype(dB), decltype(sB), decltype(copyB), TC,
                   decltype(dC), decltype(sC), decltype(mmaC), Alpha, Beta>>(
       dimGrid, dimBlock, prob_shape, cta_tiler, A, dA, sA, copyA, B, dB, sB, copyB, C, dC,
       sC, mmaC, alpha, beta);
+  EventManager::getInstance().addEvent(event);
 }
 
 template <class TA, class TB, class TC, class Alpha, class Beta>
@@ -397,6 +401,14 @@ int main(int argc, char** argv) {
   char transB = 'T';
   if (argc >= 6) sscanf(argv[5], "%c", &transB);
 
+  sycl::property_list prop = {
+          sycl::property::queue::in_order(),
+          sycl::property::queue::enable_profiling()
+  };
+
+  auto q = sycl::queue(syclcompat::get_default_context(), syclcompat::get_current_device(), prop);
+  syclcompat::set_default_queue(q);
+
   using TA = float;
   using TB = float;
   using TC = float;
diff --git a/examples/cute/tutorial/sgemm_sm70_sycl.cpp b/examples/cute/tutorial/sgemm_sm70_sycl.cpp
index 3ad03e3c9c..28e3b2948b 100644
--- a/examples/cute/tutorial/sgemm_sm70_sycl.cpp
+++ b/examples/cute/tutorial/sgemm_sm70_sycl.cpp
@@ -29,6 +29,8 @@
  *
  **************************************************************************************************/
 
+#define CUTLASS_SYCLCOMPAT_PROFILING_ENABLED
+
 #include "cutlass/util/GPU_Clock.hpp"
 #include "cutlass/util/print_error.hpp"
 
@@ -283,12 +285,13 @@ void gemm_nt(int m, int n, int k, Alpha alpha, TA const* A, int ldA, TB const* B
 
   auto dimBlock = syclcompat::dim3(size(mmaC));
   auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN)));
-  syclcompat::launch<
+  auto event = syclcompat::launch<
       gemm_device<decltype(prob_shape), decltype(cta_tiler), TA, decltype(dA), decltype(sA),
                   decltype(copyA), TB, decltype(dB), decltype(sB), decltype(copyB), TC,
                   decltype(dC), decltype(sC), decltype(mmaC), Alpha, Beta>>(
       dimGrid, dimBlock, prob_shape, cta_tiler, A, dA, sA, copyA, B, dB, sB, copyB, C, dC,
       sC, mmaC, alpha, beta);
+  EventManager::getInstance().addEvent(event);
 }
 
 // Setup params for a TN GEMM
@@ -349,12 +352,13 @@ void gemm_tn(int m, int n, int k, Alpha alpha, TA const* A, int ldA, TB const* B
 
   auto dimBlock = syclcompat::dim3(size(mmaC));
   auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN)));
-  syclcompat::launch<
+  auto event = syclcompat::launch<
       gemm_device<decltype(prob_shape), decltype(cta_tiler), TA, decltype(dA), decltype(sA),
                   decltype(copyA), TB, decltype(dB), decltype(sB), decltype(copyB), TC,
                   decltype(dC), decltype(sC), decltype(mmaC), Alpha, Beta>>(
       dimGrid, dimBlock, prob_shape, cta_tiler, A, dA, sA, copyA, B, dB, sB, copyB, C, dC,
       sC, mmaC, alpha, beta);
+  EventManager::getInstance().addEvent(event);
 }
 
 template <class TA, class TB, class TC, class Alpha, class Beta>
@@ -385,6 +389,14 @@ int main(int argc, char** argv) {
   char transB = 'T';
   if (argc >= 6) sscanf(argv[5], "%c", &transB);
 
+  sycl::property_list prop = {
+          sycl::property::queue::in_order(),
+          sycl::property::queue::enable_profiling()
+  };
+
+  auto q = sycl::queue(syclcompat::get_default_context(), syclcompat::get_current_device(), prop);
+  syclcompat::set_default_queue(q);
+
   using TA = float;
   using TB = float;
   using TC = float;
diff --git a/examples/cute/tutorial/sgemm_sm80_sycl.cpp b/examples/cute/tutorial/sgemm_sm80_sycl.cpp
index 1bd288f1c8..d59cf3ee57 100644
--- a/examples/cute/tutorial/sgemm_sm80_sycl.cpp
+++ b/examples/cute/tutorial/sgemm_sm80_sycl.cpp
@@ -29,6 +29,8 @@
  *
  **************************************************************************************************/
 
+#define CUTLASS_SYCLCOMPAT_PROFILING_ENABLED
+
 #include "cutlass/util/GPU_Clock.hpp"
 #include "cutlass/util/print_error.hpp"
 
@@ -362,12 +364,13 @@ void gemm_nt(int m, int n, int k, Alpha alpha, TA const* A, int ldA, TB const* B
 
   auto dimBlock = syclcompat::dim3(size(mmaC));
   auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN)));
-  syclcompat::launch<
+  auto event = syclcompat::launch<
       gemm_device<decltype(prob_shape), decltype(cta_tiler), TA, decltype(dA), decltype(sA),
                   decltype(copyA), TB, decltype(dB), decltype(sB), decltype(copyB), TC,
                   decltype(dC), decltype(sC), decltype(mmaC), Alpha, Beta>>(
       dimGrid, dimBlock, prob_shape, cta_tiler, A, dA, sA, copyA, B, dB, sB, copyB, C, dC, sC, mmaC,
       alpha, beta);
+  EventManager::getInstance().addEvent(event);
 }
 
 // Setup params for a NT GEMM
@@ -433,12 +436,13 @@ void gemm_tn(int m, int n, int k, Alpha alpha, TA const* A, int ldA, TB const* B
 
   auto dimBlock = syclcompat::dim3(size(mmaC));
   auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN)));
-  syclcompat::launch<
+  auto event = syclcompat::launch<
       gemm_device<decltype(prob_shape), decltype(cta_tiler), TA, decltype(dA), decltype(sA),
                   decltype(copyA), TB, decltype(dB), decltype(sB), decltype(copyB), TC,
                   decltype(dC), decltype(sC), decltype(mmaC), Alpha, Beta>>(
       dimGrid, dimBlock, prob_shape, cta_tiler, A, dA, sA, copyA, B, dB, sB, copyB, C, dC, sC, mmaC,
       alpha, beta);
+  EventManager::getInstance().addEvent(event);
 }
 
 template <class TA, class TB, class TC, class Alpha, class Beta>
@@ -468,6 +472,14 @@ int main(int argc, char** argv) {
   char transB = 'T';
   if (argc >= 6) sscanf(argv[5], "%c", &transB);
 
+  sycl::property_list prop = {
+          sycl::property::queue::in_order(),
+          sycl::property::queue::enable_profiling()
+  };
+
+  auto q = sycl::queue(syclcompat::get_default_context(), syclcompat::get_current_device(), prop);
+  syclcompat::set_default_queue(q);
+
   using TA = float;
   using TB = float;
   using TC = float;
diff --git a/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp b/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp
index 204214da11..f19c7b2165 100644
--- a/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp
+++ b/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp
@@ -29,6 +29,8 @@
  *
  **************************************************************************************************/
 
+#define CUTLASS_SYCLCOMPAT_PROFILING_ENABLED
+
 #include "cutlass/gemm/device/gemm.h"
 #include "cutlass/epilogue/collective/default_epilogue.hpp"
 #include "cutlass/epilogue/collective/intel_pvc_epilogue.hpp"
@@ -260,6 +262,14 @@ struct ExampleRunner {
 
     initialize(problem_size);
 
+    sycl::property_list prop = {
+      sycl::property::queue::in_order(),
+      sycl::property::queue::enable_profiling()
+    };
+
+    auto q = sycl::queue(syclcompat::get_default_context(), syclcompat::get_current_device(), prop);
+    syclcompat::set_default_queue(q);
+
     typename Gemm::GemmKernel::Arguments arguments{
       cutlass::gemm::GemmUniversalMode::kGemm,
       problem_size,
diff --git a/include/cutlass/gemm/device/gemm_universal_adapter.h b/include/cutlass/gemm/device/gemm_universal_adapter.h
index 628afc9af8..147edd4557 100644
--- a/include/cutlass/gemm/device/gemm_universal_adapter.h
+++ b/include/cutlass/gemm/device/gemm_universal_adapter.h
@@ -58,6 +58,10 @@
 // 3.x
 #include "cutlass/gemm/kernel/gemm_universal.hpp"
 
+#if defined(CUTLASS_ENABLE_SYCL)
+#include "cutlass/util/sycl_event_manager.hpp"
+#endif
+
 ////////////////////////////////////////////////////////////////////////////////
 
 namespace cutlass::gemm::device {
@@ -407,10 +411,11 @@ class GemmUniversalAdapter<
         const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z);
 
 #if defined (SYCL_INTEL_TARGET)
-        syclcompat::experimental::launch<device_kernel<GemmKernel>, DispatchPolicy::SubgroupSize>(sycl_grid, sycl_block, smem_size, params);
+        auto event = syclcompat::experimental::launch<device_kernel<GemmKernel>, DispatchPolicy::SubgroupSize>(sycl_grid, sycl_block, smem_size, params);
 #else
-        syclcompat::launch<device_kernel<GemmKernel>>(sycl_grid, sycl_block, smem_size, params);
+        auto event = syclcompat::launch<device_kernel<GemmKernel>>(sycl_grid, sycl_block, smem_size, params);
 #endif
+        EventManager::getInstance().addEvent(event);
 #else
         device_kernel<GemmKernel><<<grid, block, smem_size, stream>>>(params);
 #endif
diff --git a/tools/util/include/cutlass/util/GPU_Clock.hpp b/tools/util/include/cutlass/util/GPU_Clock.hpp
index bd017a777a..052c491640 100644
--- a/tools/util/include/cutlass/util/GPU_Clock.hpp
+++ b/tools/util/include/cutlass/util/GPU_Clock.hpp
@@ -32,31 +32,37 @@
 #pragma once
 
 #if defined(CUTLASS_ENABLE_SYCL)
-#include <syclcompat.hpp>
-#include <chrono>
+#include "cutlass/util/sycl_event_manager.hpp"
 #else
 #include <cuda_runtime.h>
 #endif
 
 struct GPU_Clock
 {
-#if !defined(CUTLASS_ENABLE_SYCL)
   GPU_Clock() {
+#if defined(CUTLASS_ENABLE_SYCL)
+    start_ = SyclEvent{};
+    stop_ = SyclEvent{};
+#else
     cudaEventCreate(&start_);
     cudaEventCreate(&stop_);
     cudaEventRecord(start_);
+#endif
   }
 
   ~GPU_Clock() {
+#if defined(CUTLASS_ENABLE_SYCL)
+    syclEventDestroy(start_);
+    syclEventDestroy(stop_);
+#else
     cudaEventDestroy(start_);
     cudaEventDestroy(stop_);
-  }
 #endif
+  }
 
   void start() {
 #if defined(CUTLASS_ENABLE_SYCL)
-    syclcompat::get_default_queue().wait();
-    start_ = std::chrono::high_resolution_clock::now();
+    syclEventRecord(start_);
 #else
     cudaEventRecord(start_);
 #endif
@@ -64,10 +70,11 @@ struct GPU_Clock
 
   float milliseconds() {
 #if defined(CUTLASS_ENABLE_SYCL)
-    syclcompat::get_default_queue().wait();
-    auto stop = std::chrono::high_resolution_clock::now();
-    std::chrono::duration<float, std::milli> time = stop - start_;
-    return time.count();
+    syclEventRecord(stop_);
+    syclEventSynchronize(start_, stop_);
+    float time;
+    syclEventElapsedTime(&time, start_, stop_);
+    return time;
 #else
     cudaEventRecord(stop_);
     cudaEventSynchronize(stop_);
@@ -83,11 +90,7 @@ struct GPU_Clock
 
  private:
 #if defined(CUTLASS_ENABLE_SYCL)
-    typedef std::chrono::nanoseconds				                  duration;
-    typedef std::chrono::high_resolution_clock		                  high_resolution_clock;
-    typedef std::chrono::time_point<high_resolution_clock, duration>  time_point;
-
-    time_point start_ = std::chrono::high_resolution_clock::now();
+    SyclEvent start_, stop_;
 #else
     cudaEvent_t start_, stop_;
 #endif
diff --git a/tools/util/include/cutlass/util/sycl_event_manager.hpp b/tools/util/include/cutlass/util/sycl_event_manager.hpp
new file mode 100644
index 0000000000..4b2032adfc
--- /dev/null
+++ b/tools/util/include/cutlass/util/sycl_event_manager.hpp
@@ -0,0 +1,135 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include <vector>
+#include <sycl/sycl.hpp>
+
+class SyclEvent {
+private:
+    int index;
+
+public:
+    SyclEvent() : index(-1) {
+    };
+
+    int getIndex() const {
+      return index;
+    }
+
+    SyclEvent& operator=(int const& value) {
+      index = value;
+      return *this;
+    };
+};
+
+class EventManager {
+public:
+    static EventManager& getInstance()
+    {
+      static EventManager instance;
+      return instance;
+    }
+private:
+    EventManager() {}
+    std::vector<sycl::event> events{};
+    int recorders = 0;
+
+public:
+    EventManager(EventManager const&) = delete;
+    void operator=(EventManager const&) = delete;
+
+    void startRecording(SyclEvent &event) {
+      if (event.getIndex() != -1) {
+        throw std::runtime_error("Event is already being recorded.");
+      }
+      recorders++;
+      event = static_cast<int>(events.size());
+    }
+
+    void addEvent(const sycl::event &event) {
+      events.push_back(event);
+    }
+
+    void eventDestroy() {
+      recorders--;
+      if (!recorders) {
+        events.clear();
+      }
+    }
+
+    float getEventElapsedTimeMs(SyclEvent const& begin, SyclEvent const& end) {
+      if (begin.getIndex() < 0 || begin.getIndex() > end.getIndex() || end.getIndex() > events.size()) {
+        throw std::runtime_error("Index out of bounds");
+      }
+
+      auto time_event = 0.0;
+      for (int i = begin.getIndex(); i < end.getIndex(); ++i) {
+        auto start_time = events[i].template get_profiling_info<
+                sycl::info::event_profiling::command_start>();
+
+        auto end_time = events[i].template get_profiling_info<
+                sycl::info::event_profiling::command_end>();
+
+        time_event += static_cast<float>(end_time - start_time);
+      }
+      return time_event * 1e-6;
+    }
+
+    void wait(SyclEvent const& begin, SyclEvent const& end) {
+      if (begin.getIndex() < 0 || begin.getIndex() > end.getIndex() || end.getIndex() > events.size()) {
+        throw std::runtime_error("Index out of bounds");
+      }
+
+      for (int i = begin.getIndex(); i < end.getIndex(); ++i) {
+        events[i].wait();
+      }
+    }
+
+};
+
+inline void syclEventDestroy(SyclEvent const& event) {
+  EventManager::getInstance().eventDestroy();
+}
+
+inline void syclEventRecord(SyclEvent &event) {
+  EventManager::getInstance().startRecording(event);
+}
+
+inline void syclEventSynchronize(SyclEvent const& begin, SyclEvent const& end) {
+  EventManager::getInstance().wait(begin, end);
+}
+
+inline void syclEventElapsedTime(float* time, SyclEvent const& begin, SyclEvent const& end) {
+#if defined(CUTLASS_SYCLCOMPAT_PROFILING_ENABLED)
+  *time = EventManager::getInstance().getEventElapsedTimeMs(begin, end);
+#endif
+}