diff --git a/include/cutlass/detail/helper_macros.hpp b/include/cutlass/detail/helper_macros.hpp
index 96d259d5eb..211f3ebbd6 100644
--- a/include/cutlass/detail/helper_macros.hpp
+++ b/include/cutlass/detail/helper_macros.hpp
@@ -62,7 +62,7 @@
 
 #if defined(CUTLASS_ENABLE_SYCL)
 #define CUTLASS_HOST
-#define CUTLASS_GLOBAL
+#define CUTLASS_GLOBAL __attribute__((always_inline)) inline
 #define CUTLASS_SHARED
 #else
 #define CUTLASS_HOST __host__
diff --git a/include/cutlass/device_kernel.h b/include/cutlass/device_kernel.h
index cca398ea8f..0fb6f6e5cd 100644
--- a/include/cutlass/device_kernel.h
+++ b/include/cutlass/device_kernel.h
@@ -108,11 +108,10 @@ void Kernel2(typename Operator::Params params) {
 
 /// Generic CUTLASS kernel template.
 template <typename Operator>
+CUTLASS_GLOBAL
 #if defined(CUTLASS_ENABLE_SYCL)
-__attribute__((always_inline)) inline void 
-device_kernel(typename Operator::Params const& params, sycl::local_ptr<char> smem) {
+void device_kernel(typename Operator::Params const& params, sycl::local_ptr<char> smem) {
 #else
-CUTLASS_GLOBAL
 #ifdef __CUDACC__
 // Enclosing this in __CUDACC__ suppresses MSVC warnings.
 __launch_bounds__(Operator::MaxThreadsPerBlock, Operator::MinBlocksPerMultiprocessor)