From 00c366c5647dc1fe3737501ff1c2db10fc172390 Mon Sep 17 00:00:00 2001 From: Alejandro Acosta Date: Tue, 16 Jul 2024 12:08:21 +0100 Subject: [PATCH] Add sycl equivalent to cuda events for profiling (#69) --- benchmarks/common/benchmark_runner.hpp | 14 ++ .../ampere_tf32_tensorop_gemm_cute.cpp | 9 ++ examples/cute/tutorial/sgemm_1_sycl.cpp | 16 ++- examples/cute/tutorial/sgemm_2_sycl.cpp | 16 ++- examples/cute/tutorial/sgemm_sm70_sycl.cpp | 16 ++- examples/cute/tutorial/sgemm_sm80_sycl.cpp | 16 ++- .../sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp | 10 ++ .../gemm/device/gemm_universal_adapter.h | 9 +- tools/util/include/cutlass/util/GPU_Clock.hpp | 33 +++-- .../cutlass/util/sycl_event_manager.hpp | 135 ++++++++++++++++++ 10 files changed, 249 insertions(+), 25 deletions(-) create mode 100644 tools/util/include/cutlass/util/sycl_event_manager.hpp diff --git a/benchmarks/common/benchmark_runner.hpp b/benchmarks/common/benchmark_runner.hpp index 5eb2ade3eb..decedbf75a 100644 --- a/benchmarks/common/benchmark_runner.hpp +++ b/benchmarks/common/benchmark_runner.hpp @@ -29,6 +29,10 @@ * **************************************************************************************************/ +#if defined(CUTLASS_ENABLE_SYCL) +#define CUTLASS_SYCLCOMPAT_PROFILING_ENABLED +#endif + #include "cutlass/gemm/device/gemm.h" #include "cutlass/epilogue/collective/default_epilogue.hpp" #include "cutlass/gemm/device/gemm_universal.h" @@ -208,6 +212,16 @@ struct BenchmarkRunner { auto problem_shape_MNKL = cute::append<4>(problem_size, 1); auto [M, N, K, L] = problem_shape_MNKL; +#if defined(CUTLASS_SYCLCOMPAT_PROFILING_ENABLED) + sycl::property_list prop = { + sycl::property::queue::in_order(), + sycl::property::queue::enable_profiling() + }; + + auto q = sycl::queue(syclcompat::get_default_context(), syclcompat::get_current_device(), prop); + syclcompat::set_default_queue(q); +#endif + stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L)); stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L)); stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); diff --git a/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm_cute.cpp b/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm_cute.cpp index 77bae13ca1..8858a3c2e3 100644 --- a/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm_cute.cpp +++ b/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm_cute.cpp @@ -29,6 +29,8 @@ * **************************************************************************************************/ +#define CUTLASS_SYCLCOMPAT_PROFILING_ENABLED + #include #include @@ -121,6 +123,13 @@ run(Gemm_Op gemm_op) void test_gemm(int m, int n, int k) { + sycl::property_list prop = { + sycl::property::queue::in_order(), + sycl::property::queue::enable_profiling() + }; + + auto q = sycl::queue(syclcompat::get_default_context(), syclcompat::get_current_device(), prop); + syclcompat::set_default_queue(q); std::cout << "M = " << m << std::endl; std::cout << "N = " << n << std::endl; diff --git a/examples/cute/tutorial/sgemm_1_sycl.cpp b/examples/cute/tutorial/sgemm_1_sycl.cpp index 764b1f105e..2a588af0ef 100644 --- a/examples/cute/tutorial/sgemm_1_sycl.cpp +++ b/examples/cute/tutorial/sgemm_1_sycl.cpp @@ -29,6 +29,8 @@ * **************************************************************************************************/ +#define CUTLASS_SYCLCOMPAT_PROFILING_ENABLED + #include "cutlass/util/GPU_Clock.hpp" #include "cutlass/util/print_error.hpp" @@ -295,12 +297,13 @@ void gemm_nt(int m, int n, int k, Alpha alpha, TA const* A, int ldA, TB const* B auto dimBlock = syclcompat::dim3(size(tC)); auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN))); - syclcompat::launch< + auto event = syclcompat::launch< gemm_device>(dimGrid, dimBlock, prob_shape, cta_tiler, A, dA, sA, tA, B, dB, sB, tB, C, dC, sC, tC, alpha, beta); + EventManager::getInstance().addEvent(event); } // Setup params for a TN GEMM @@ -341,12 +344,13 @@ void gemm_tn(int m, int n, int k, Alpha alpha, TA const* A, int ldA, TB const* B auto dimBlock = syclcompat::dim3(size(tC)); auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN))); - syclcompat::launch< + auto event = syclcompat::launch< gemm_device>(dimGrid, dimBlock, prob_shape, cta_tiler, A, dA, sA, tA, B, dB, sB, tB, C, dC, sC, tC, alpha, beta); + EventManager::getInstance().addEvent(event); } template @@ -376,6 +380,14 @@ int main(int argc, char** argv) { char transB = 'T'; if (argc >= 6) sscanf(argv[5], "%c", &transB); + sycl::property_list prop = { + sycl::property::queue::in_order(), + sycl::property::queue::enable_profiling() + }; + + auto q = sycl::queue(syclcompat::get_default_context(), syclcompat::get_current_device(), prop); + syclcompat::set_default_queue(q); + using TA = float; using TB = float; using TC = float; diff --git a/examples/cute/tutorial/sgemm_2_sycl.cpp b/examples/cute/tutorial/sgemm_2_sycl.cpp index bb6d1fe214..831bfe0881 100644 --- a/examples/cute/tutorial/sgemm_2_sycl.cpp +++ b/examples/cute/tutorial/sgemm_2_sycl.cpp @@ -29,6 +29,8 @@ * **************************************************************************************************/ +#define CUTLASS_SYCLCOMPAT_PROFILING_ENABLED + #include "cutlass/util/GPU_Clock.hpp" #include "cutlass/util/print_error.hpp" @@ -287,12 +289,13 @@ void gemm_nt(int m, int n, int k, Alpha alpha, TA const* A, int ldA, TB const* B auto dimBlock = syclcompat::dim3(size(mmaC)); auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN))); - syclcompat::launch< + auto event = syclcompat::launch< gemm_device>( dimGrid, dimBlock, prob_shape, cta_tiler, A, dA, sA, copyA, B, dB, sB, copyB, C, dC, sC, mmaC, alpha, beta); + EventManager::getInstance().addEvent(event); } // Setup params for a TN GEMM @@ -361,12 +364,13 @@ void gemm_tn(int m, int n, int k, Alpha alpha, TA const* A, int ldA, TB const* B auto dimBlock = syclcompat::dim3(size(mmaC)); auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN))); - syclcompat::launch< + auto event = syclcompat::launch< gemm_device>( dimGrid, dimBlock, prob_shape, cta_tiler, A, dA, sA, copyA, B, dB, sB, copyB, C, dC, sC, mmaC, alpha, beta); + EventManager::getInstance().addEvent(event); } template @@ -397,6 +401,14 @@ int main(int argc, char** argv) { char transB = 'T'; if (argc >= 6) sscanf(argv[5], "%c", &transB); + sycl::property_list prop = { + sycl::property::queue::in_order(), + sycl::property::queue::enable_profiling() + }; + + auto q = sycl::queue(syclcompat::get_default_context(), syclcompat::get_current_device(), prop); + syclcompat::set_default_queue(q); + using TA = float; using TB = float; using TC = float; diff --git a/examples/cute/tutorial/sgemm_sm70_sycl.cpp b/examples/cute/tutorial/sgemm_sm70_sycl.cpp index 3ad03e3c9c..28e3b2948b 100644 --- a/examples/cute/tutorial/sgemm_sm70_sycl.cpp +++ b/examples/cute/tutorial/sgemm_sm70_sycl.cpp @@ -29,6 +29,8 @@ * **************************************************************************************************/ +#define CUTLASS_SYCLCOMPAT_PROFILING_ENABLED + #include "cutlass/util/GPU_Clock.hpp" #include "cutlass/util/print_error.hpp" @@ -283,12 +285,13 @@ void gemm_nt(int m, int n, int k, Alpha alpha, TA const* A, int ldA, TB const* B auto dimBlock = syclcompat::dim3(size(mmaC)); auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN))); - syclcompat::launch< + auto event = syclcompat::launch< gemm_device>( dimGrid, dimBlock, prob_shape, cta_tiler, A, dA, sA, copyA, B, dB, sB, copyB, C, dC, sC, mmaC, alpha, beta); + EventManager::getInstance().addEvent(event); } // Setup params for a TN GEMM @@ -349,12 +352,13 @@ void gemm_tn(int m, int n, int k, Alpha alpha, TA const* A, int ldA, TB const* B auto dimBlock = syclcompat::dim3(size(mmaC)); auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN))); - syclcompat::launch< + auto event = syclcompat::launch< gemm_device>( dimGrid, dimBlock, prob_shape, cta_tiler, A, dA, sA, copyA, B, dB, sB, copyB, C, dC, sC, mmaC, alpha, beta); + EventManager::getInstance().addEvent(event); } template @@ -385,6 +389,14 @@ int main(int argc, char** argv) { char transB = 'T'; if (argc >= 6) sscanf(argv[5], "%c", &transB); + sycl::property_list prop = { + sycl::property::queue::in_order(), + sycl::property::queue::enable_profiling() + }; + + auto q = sycl::queue(syclcompat::get_default_context(), syclcompat::get_current_device(), prop); + syclcompat::set_default_queue(q); + using TA = float; using TB = float; using TC = float; diff --git a/examples/cute/tutorial/sgemm_sm80_sycl.cpp b/examples/cute/tutorial/sgemm_sm80_sycl.cpp index 1bd288f1c8..d59cf3ee57 100644 --- a/examples/cute/tutorial/sgemm_sm80_sycl.cpp +++ b/examples/cute/tutorial/sgemm_sm80_sycl.cpp @@ -29,6 +29,8 @@ * **************************************************************************************************/ +#define CUTLASS_SYCLCOMPAT_PROFILING_ENABLED + #include "cutlass/util/GPU_Clock.hpp" #include "cutlass/util/print_error.hpp" @@ -362,12 +364,13 @@ void gemm_nt(int m, int n, int k, Alpha alpha, TA const* A, int ldA, TB const* B auto dimBlock = syclcompat::dim3(size(mmaC)); auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN))); - syclcompat::launch< + auto event = syclcompat::launch< gemm_device>( dimGrid, dimBlock, prob_shape, cta_tiler, A, dA, sA, copyA, B, dB, sB, copyB, C, dC, sC, mmaC, alpha, beta); + EventManager::getInstance().addEvent(event); } // Setup params for a NT GEMM @@ -433,12 +436,13 @@ void gemm_tn(int m, int n, int k, Alpha alpha, TA const* A, int ldA, TB const* B auto dimBlock = syclcompat::dim3(size(mmaC)); auto dimGrid = syclcompat::dim3(size(ceil_div(M, bM)), size(ceil_div(N, bN))); - syclcompat::launch< + auto event = syclcompat::launch< gemm_device>( dimGrid, dimBlock, prob_shape, cta_tiler, A, dA, sA, copyA, B, dB, sB, copyB, C, dC, sC, mmaC, alpha, beta); + EventManager::getInstance().addEvent(event); } template @@ -468,6 +472,14 @@ int main(int argc, char** argv) { char transB = 'T'; if (argc >= 6) sscanf(argv[5], "%c", &transB); + sycl::property_list prop = { + sycl::property::queue::in_order(), + sycl::property::queue::enable_profiling() + }; + + auto q = sycl::queue(syclcompat::get_default_context(), syclcompat::get_current_device(), prop); + syclcompat::set_default_queue(q); + using TA = float; using TB = float; using TC = float; diff --git a/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp b/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp index 204214da11..f19c7b2165 100644 --- a/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp +++ b/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp @@ -29,6 +29,8 @@ * **************************************************************************************************/ +#define CUTLASS_SYCLCOMPAT_PROFILING_ENABLED + #include "cutlass/gemm/device/gemm.h" #include "cutlass/epilogue/collective/default_epilogue.hpp" #include "cutlass/epilogue/collective/intel_pvc_epilogue.hpp" @@ -260,6 +262,14 @@ struct ExampleRunner { initialize(problem_size); + sycl::property_list prop = { + sycl::property::queue::in_order(), + sycl::property::queue::enable_profiling() + }; + + auto q = sycl::queue(syclcompat::get_default_context(), syclcompat::get_current_device(), prop); + syclcompat::set_default_queue(q); + typename Gemm::GemmKernel::Arguments arguments{ cutlass::gemm::GemmUniversalMode::kGemm, problem_size, diff --git a/include/cutlass/gemm/device/gemm_universal_adapter.h b/include/cutlass/gemm/device/gemm_universal_adapter.h index 628afc9af8..147edd4557 100644 --- a/include/cutlass/gemm/device/gemm_universal_adapter.h +++ b/include/cutlass/gemm/device/gemm_universal_adapter.h @@ -58,6 +58,10 @@ // 3.x #include "cutlass/gemm/kernel/gemm_universal.hpp" +#if defined(CUTLASS_ENABLE_SYCL) +#include "cutlass/util/sycl_event_manager.hpp" +#endif + //////////////////////////////////////////////////////////////////////////////// namespace cutlass::gemm::device { @@ -407,10 +411,11 @@ class GemmUniversalAdapter< const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z); #if defined (SYCL_INTEL_TARGET) - syclcompat::experimental::launch, DispatchPolicy::SubgroupSize>(sycl_grid, sycl_block, smem_size, params); + auto event = syclcompat::experimental::launch, DispatchPolicy::SubgroupSize>(sycl_grid, sycl_block, smem_size, params); #else - syclcompat::launch>(sycl_grid, sycl_block, smem_size, params); + auto event = syclcompat::launch>(sycl_grid, sycl_block, smem_size, params); #endif + EventManager::getInstance().addEvent(event); #else device_kernel<<>>(params); #endif diff --git a/tools/util/include/cutlass/util/GPU_Clock.hpp b/tools/util/include/cutlass/util/GPU_Clock.hpp index bd017a777a..052c491640 100644 --- a/tools/util/include/cutlass/util/GPU_Clock.hpp +++ b/tools/util/include/cutlass/util/GPU_Clock.hpp @@ -32,31 +32,37 @@ #pragma once #if defined(CUTLASS_ENABLE_SYCL) -#include -#include +#include "cutlass/util/sycl_event_manager.hpp" #else #include #endif struct GPU_Clock { -#if !defined(CUTLASS_ENABLE_SYCL) GPU_Clock() { +#if defined(CUTLASS_ENABLE_SYCL) + start_ = SyclEvent{}; + stop_ = SyclEvent{}; +#else cudaEventCreate(&start_); cudaEventCreate(&stop_); cudaEventRecord(start_); +#endif } ~GPU_Clock() { +#if defined(CUTLASS_ENABLE_SYCL) + syclEventDestroy(start_); + syclEventDestroy(stop_); +#else cudaEventDestroy(start_); cudaEventDestroy(stop_); - } #endif + } void start() { #if defined(CUTLASS_ENABLE_SYCL) - syclcompat::get_default_queue().wait(); - start_ = std::chrono::high_resolution_clock::now(); + syclEventRecord(start_); #else cudaEventRecord(start_); #endif @@ -64,10 +70,11 @@ struct GPU_Clock float milliseconds() { #if defined(CUTLASS_ENABLE_SYCL) - syclcompat::get_default_queue().wait(); - auto stop = std::chrono::high_resolution_clock::now(); - std::chrono::duration time = stop - start_; - return time.count(); + syclEventRecord(stop_); + syclEventSynchronize(start_, stop_); + float time; + syclEventElapsedTime(&time, start_, stop_); + return time; #else cudaEventRecord(stop_); cudaEventSynchronize(stop_); @@ -83,11 +90,7 @@ struct GPU_Clock private: #if defined(CUTLASS_ENABLE_SYCL) - typedef std::chrono::nanoseconds duration; - typedef std::chrono::high_resolution_clock high_resolution_clock; - typedef std::chrono::time_point time_point; - - time_point start_ = std::chrono::high_resolution_clock::now(); + SyclEvent start_, stop_; #else cudaEvent_t start_, stop_; #endif diff --git a/tools/util/include/cutlass/util/sycl_event_manager.hpp b/tools/util/include/cutlass/util/sycl_event_manager.hpp new file mode 100644 index 0000000000..4b2032adfc --- /dev/null +++ b/tools/util/include/cutlass/util/sycl_event_manager.hpp @@ -0,0 +1,135 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include +#include + +class SyclEvent { +private: + int index; + +public: + SyclEvent() : index(-1) { + }; + + int getIndex() const { + return index; + } + + SyclEvent& operator=(int const& value) { + index = value; + return *this; + }; +}; + +class EventManager { +public: + static EventManager& getInstance() + { + static EventManager instance; + return instance; + } +private: + EventManager() {} + std::vector events{}; + int recorders = 0; + +public: + EventManager(EventManager const&) = delete; + void operator=(EventManager const&) = delete; + + void startRecording(SyclEvent &event) { + if (event.getIndex() != -1) { + throw std::runtime_error("Event is already being recorded."); + } + recorders++; + event = static_cast(events.size()); + } + + void addEvent(const sycl::event &event) { + events.push_back(event); + } + + void eventDestroy() { + recorders--; + if (!recorders) { + events.clear(); + } + } + + float getEventElapsedTimeMs(SyclEvent const& begin, SyclEvent const& end) { + if (begin.getIndex() < 0 || begin.getIndex() > end.getIndex() || end.getIndex() > events.size()) { + throw std::runtime_error("Index out of bounds"); + } + + auto time_event = 0.0; + for (int i = begin.getIndex(); i < end.getIndex(); ++i) { + auto start_time = events[i].template get_profiling_info< + sycl::info::event_profiling::command_start>(); + + auto end_time = events[i].template get_profiling_info< + sycl::info::event_profiling::command_end>(); + + time_event += static_cast(end_time - start_time); + } + return time_event * 1e-6; + } + + void wait(SyclEvent const& begin, SyclEvent const& end) { + if (begin.getIndex() < 0 || begin.getIndex() > end.getIndex() || end.getIndex() > events.size()) { + throw std::runtime_error("Index out of bounds"); + } + + for (int i = begin.getIndex(); i < end.getIndex(); ++i) { + events[i].wait(); + } + } + +}; + +inline void syclEventDestroy(SyclEvent const& event) { + EventManager::getInstance().eventDestroy(); +} + +inline void syclEventRecord(SyclEvent &event) { + EventManager::getInstance().startRecording(event); +} + +inline void syclEventSynchronize(SyclEvent const& begin, SyclEvent const& end) { + EventManager::getInstance().wait(begin, end); +} + +inline void syclEventElapsedTime(float* time, SyclEvent const& begin, SyclEvent const& end) { +#if defined(CUTLASS_SYCLCOMPAT_PROFILING_ENABLED) + *time = EventManager::getInstance().getEventElapsedTimeMs(begin, end); +#endif +}