diff --git a/include/cutlass/detail/helper_macros.hpp b/include/cutlass/detail/helper_macros.hpp index 96d259d5eb..211f3ebbd6 100644 --- a/include/cutlass/detail/helper_macros.hpp +++ b/include/cutlass/detail/helper_macros.hpp @@ -62,7 +62,7 @@ #if defined(CUTLASS_ENABLE_SYCL) #define CUTLASS_HOST -#define CUTLASS_GLOBAL +#define CUTLASS_GLOBAL __attribute__((always_inline)) inline #define CUTLASS_SHARED #else #define CUTLASS_HOST __host__ diff --git a/include/cutlass/device_kernel.h b/include/cutlass/device_kernel.h index cca398ea8f..0fb6f6e5cd 100644 --- a/include/cutlass/device_kernel.h +++ b/include/cutlass/device_kernel.h @@ -108,11 +108,10 @@ void Kernel2(typename Operator::Params params) { /// Generic CUTLASS kernel template. template +CUTLASS_GLOBAL #if defined(CUTLASS_ENABLE_SYCL) -__attribute__((always_inline)) inline void -device_kernel(typename Operator::Params const& params, sycl::local_ptr smem) { +void device_kernel(typename Operator::Params const& params, sycl::local_ptr smem) { #else -CUTLASS_GLOBAL #ifdef __CUDACC__ // Enclosing this in __CUDACC__ suppresses MSVC warnings. __launch_bounds__(Operator::MaxThreadsPerBlock, Operator::MinBlocksPerMultiprocessor)