diff --git a/cmake/options.cmake b/cmake/options.cmake
index cd1c8be3c56..0520f931b7e 100644
--- a/cmake/options.cmake
+++ b/cmake/options.cmake
@@ -146,6 +146,7 @@ set(DNNL_ENABLE_PRIMITIVE_GPU_ISA "ALL" CACHE STRING
     at build time. Regardless of value chosen, reference OpenCL-based
     implementations will always be available. Valid values:
     - ALL (the default). Includes all ISA to be enabled.
+    - NONE. Includes no ISAs, just the generic kernels.
     - <ISA_NAME>;<ISA_NAME>;... Includes only selected ISA to be enabled.
       Possible values are: GEN9, GEN11, XELP, XEHP, XEHPG, XEHPC, XE2.")
 
@@ -281,13 +282,13 @@ endif()
 
 set(DNNL_GPU_VENDOR "NONE" CACHE STRING
     "When DNNL_GPU_RUNTIME is not NONE DNNL_GPU_VENDOR specifies target GPU
-    vendor for GPU engines. Can be INTEL (default), NVIDIA or AMD.")
+    vendor for GPU engines. Can be INTEL (default), GENERIC, NVIDIA or AMD.")
 
 if(NOT DNNL_GPU_RUNTIME STREQUAL "NONE" AND DNNL_GPU_VENDOR STREQUAL "NONE")
     set(DNNL_GPU_VENDOR "INTEL")
 endif()
 
-if(NOT "${DNNL_GPU_VENDOR}" MATCHES "^(NONE|INTEL|NVIDIA|AMD)$")
+if(NOT "${DNNL_GPU_VENDOR}" MATCHES "^(NONE|GENERIC|INTEL|NVIDIA|AMD)$")
     message(FATAL_ERROR "Unsupported GPU vendor: ${DNNL_GPU_VENDOR}")
 endif()
 
@@ -327,6 +328,12 @@ else()
     set(DNNL_WITH_SYCL false)
 endif()
 
+if(DNNL_GPU_RUNTIME STREQUAL "OCL") # ... OR DNNL_CPU_RUNTIME STREQUAL "OCL")
+    set(DNNL_WITH_OCL true)
+else()
+    set(DNNL_WITH_OCL false)
+endif()
+
 # =============
 # Miscellaneous
 # =============
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index d286751d2ee..5c50e64d20b 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -125,6 +125,10 @@ endif()
 
 add_subdirectory(common)
 
+if(DNNL_WITH_SYCL OR DNNL_GPU_RUNTIME STREQUAL "OCL")
+    add_subdirectory(xpu)
+endif()
+
 if(NOT DNNL_CPU_RUNTIME STREQUAL "NONE")
     add_subdirectory(cpu)
 endif()
@@ -133,10 +137,6 @@ if(NOT DNNL_GPU_RUNTIME STREQUAL "NONE")
     add_subdirectory(gpu)
 endif()
 
-if(DNNL_WITH_SYCL OR DNNL_GPU_RUNTIME STREQUAL "OCL")
-    add_subdirectory(xpu)
-endif()
-
 if(DNNL_WITH_SYCL)
     # Enable linking SYCL kernels.
     if(DNNL_SYCL_CUDA)
diff --git a/src/gpu/CMakeLists.txt b/src/gpu/CMakeLists.txt
index f49a9bd2d63..17dd337a46c 100644
--- a/src/gpu/CMakeLists.txt
+++ b/src/gpu/CMakeLists.txt
@@ -26,12 +26,14 @@ set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS
 
 add_subdirectory(generic)
 
-if(DNNL_GPU_VENDOR STREQUAL "INTEL")
-    add_definitions_with_host_compiler(-DNGEN_CPP11)
-    add_definitions_with_host_compiler(-DNGEN_SAFE)
-    add_definitions_with_host_compiler(-DNGEN_NEO_INTERFACE)
-    add_definitions_with_host_compiler(-DNGEN_NO_OP_NAMES)
-    add_definitions_with_host_compiler(-DNGEN_WINDOWS_COMPAT)
+if(DNNL_WITH_OCL)
+    if(DNNL_GPU_VENDOR STREQUAL "INTEL")
+	add_definitions_with_host_compiler(-DNGEN_CPP11)
+	add_definitions_with_host_compiler(-DNGEN_SAFE)
+	add_definitions_with_host_compiler(-DNGEN_NEO_INTERFACE)
+	add_definitions_with_host_compiler(-DNGEN_NO_OP_NAMES)
+	add_definitions_with_host_compiler(-DNGEN_WINDOWS_COMPAT)
+    endif()
     add_subdirectory(intel)
 endif()
 
diff --git a/src/gpu/gpu_batch_normalization_list.cpp b/src/gpu/gpu_batch_normalization_list.cpp
index e37b4edd65a..6cc312bbd2f 100644
--- a/src/gpu/gpu_batch_normalization_list.cpp
+++ b/src/gpu/gpu_batch_normalization_list.cpp
@@ -16,13 +16,14 @@
 
 #include "gpu/gpu_impl_list.hpp"
 
-#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
-#include "gpu/intel/ocl/bnorm/gen9_batch_normalization.hpp"
-#include "gpu/intel/ocl/bnorm/nhwc_batch_normalization.hpp"
 #include "gpu/intel/ocl/bnorm/ref_batch_normalization.hpp"
 #include "gpu/intel/ocl/bnorm/reusable_bnorm.hpp"
 #include "gpu/intel/ocl/bnorm/simple_bnorm.hpp"
 
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
+#include "gpu/intel/ocl/bnorm/gen9_batch_normalization.hpp"
+#include "gpu/intel/ocl/bnorm/nhwc_batch_normalization.hpp"
+
 #ifdef DNNL_DEV_MODE
 #include "gpu/intel/ocl/bnorm/nhwc_reusable.hpp"
 #endif
@@ -52,9 +53,9 @@ const std::map<pk_impl_key_t, std::vector<impl_list_item_t>>
         GPU_INSTANCE_INTEL_DEVMODE(intel::ocl::nhwc_reusable_batch_normalization_fwd_t)
         GPU_INSTANCE_INTEL(intel::ocl::nhwc_batch_normalization_fwd_t)
         GPU_INSTANCE_INTEL(intel::ocl::gen9_batch_normalization_fwd_t)
-        GPU_INSTANCE_INTEL(intel::ocl::simple_batch_normalization_fwd_t)
-        GPU_INSTANCE_INTEL(intel::ocl::reusable_batch_normalization_fwd_t)
-        GPU_INSTANCE_INTEL(intel::ocl::ref_batch_normalization_fwd_t)
+        GPU_INSTANCE_GENERIC(intel::ocl::simple_batch_normalization_fwd_t)
+        GPU_INSTANCE_GENERIC(intel::ocl::reusable_batch_normalization_fwd_t)
+        GPU_INSTANCE_GENERIC(intel::ocl::ref_batch_normalization_fwd_t)
         GPU_INSTANCE_NVIDIA(nvidia::cudnn_batch_normalization_fwd_t)
         GPU_INSTANCE_AMD(amd::miopen_batch_normalization_fwd_t)
         GPU_INSTANCE_GENERIC_SYCL(generic::sycl::ref_batch_normalization_fwd_t)
@@ -64,9 +65,9 @@ const std::map<pk_impl_key_t, std::vector<impl_list_item_t>>
         GPU_INSTANCE_INTEL_DEVMODE(intel::ocl::nhwc_reusable_batch_normalization_bwd_t)
         GPU_INSTANCE_INTEL(intel::ocl::nhwc_batch_normalization_bwd_t)
         GPU_INSTANCE_INTEL(intel::ocl::gen9_batch_normalization_bwd_t)
-        GPU_INSTANCE_INTEL(intel::ocl::simple_batch_normalization_bwd_t)
-        GPU_INSTANCE_INTEL(intel::ocl::reusable_batch_normalization_bwd_t)
-        GPU_INSTANCE_INTEL(intel::ocl::ref_batch_normalization_bwd_t)
+        GPU_INSTANCE_GENERIC(intel::ocl::simple_batch_normalization_bwd_t)
+        GPU_INSTANCE_GENERIC(intel::ocl::reusable_batch_normalization_bwd_t)
+        GPU_INSTANCE_GENERIC(intel::ocl::ref_batch_normalization_bwd_t)
         GPU_INSTANCE_NVIDIA(nvidia::cudnn_batch_normalization_bwd_t)
         GPU_INSTANCE_AMD(amd::miopen_batch_normalization_bwd_t)
         GPU_INSTANCE_GENERIC_SYCL(generic::sycl::ref_batch_normalization_bwd_t)
diff --git a/src/gpu/gpu_binary_list.cpp b/src/gpu/gpu_binary_list.cpp
index 965b9fc51fe..1ce42013c40 100644
--- a/src/gpu/gpu_binary_list.cpp
+++ b/src/gpu/gpu_binary_list.cpp
@@ -15,11 +15,11 @@
 *******************************************************************************/
 
 #include "gpu/gpu_impl_list.hpp"
+#include "gpu/intel/ocl/multi_po_reorder_binary.hpp"
+#include "gpu/intel/ocl/simple_binary.hpp"
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
 #include "gpu/intel/ocl/gen9_binary.hpp"
-#include "gpu/intel/ocl/multi_po_reorder_binary.hpp"
-#include "gpu/intel/ocl/simple_binary.hpp"
 #endif
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
@@ -39,9 +39,9 @@ namespace {
 
 // clang-format off
 constexpr impl_list_item_t impl_list[] = REG_BINARY_P({
-        GPU_INSTANCE_INTEL(intel::ocl::multi_po_reorder_binary)
+        GPU_INSTANCE_GENERIC(intel::ocl::multi_po_reorder_binary)
         GPU_INSTANCE_INTEL(intel::ocl::gen9_binary_t)
-        GPU_INSTANCE_INTEL(intel::ocl::simple_binary_t)
+        GPU_INSTANCE_GENERIC(intel::ocl::simple_binary_t)
         GPU_INSTANCE_NVIDIA(nvidia::cudnn_binary_t)
         GPU_INSTANCE_AMD(amd::miopen_binary_t)
         GPU_INSTANCE_GENERIC_SYCL(generic::sycl::ref_binary_t)
diff --git a/src/gpu/gpu_concat_list.cpp b/src/gpu/gpu_concat_list.cpp
index 24105ee7b7f..110229743a8 100644
--- a/src/gpu/gpu_concat_list.cpp
+++ b/src/gpu/gpu_concat_list.cpp
@@ -17,12 +17,12 @@
 #include "gpu/gpu_impl_list.hpp"
 
 #include "gpu/generic/ref_concat.hpp"
-
-#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
-#include "gpu/intel/ocl/gen9_concat.hpp"
 #include "gpu/intel/ocl/multi_concat.hpp"
 #include "gpu/intel/ocl/reusable_simple_concat.hpp"
 #include "gpu/intel/ocl/simple_concat.hpp"
+
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
+#include "gpu/intel/ocl/gen9_concat.hpp"
 #endif
 
 namespace dnnl {
@@ -33,10 +33,10 @@ namespace {
 
 // clang-format off
 constexpr impl_list_item_t impl_list[] = REG_CONCAT_P({
-        GPU_CONCAT_INSTANCE_INTEL(intel::ocl::reusable_simple_concat_t)
-        GPU_CONCAT_INSTANCE_INTEL(intel::ocl::simple_concat_t)
+        GPU_CONCAT_INSTANCE_GENERIC(intel::ocl::reusable_simple_concat_t)
+        GPU_CONCAT_INSTANCE_GENERIC(intel::ocl::simple_concat_t)
         GPU_CONCAT_INSTANCE_INTEL(intel::ocl::gen9_concat_t)
-        GPU_CONCAT_INSTANCE_INTEL(intel::ocl::multi_concat_t)
+        GPU_CONCAT_INSTANCE_GENERIC(intel::ocl::multi_concat_t)
         GPU_CONCAT_INSTANCE_GENERIC(generic::ref_concat_t)
         nullptr,
 });
diff --git a/src/gpu/gpu_convolution_list.cpp b/src/gpu/gpu_convolution_list.cpp
index 88d5969683f..77d3d082701 100644
--- a/src/gpu/gpu_convolution_list.cpp
+++ b/src/gpu/gpu_convolution_list.cpp
@@ -15,12 +15,12 @@
 *******************************************************************************/
 
 #include "gpu/gpu_impl_list.hpp"
+#include "gpu/intel/ocl/ref_convolution.hpp"
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
 #include "gpu/intel/jit/binary_format.hpp"
 #include "gpu/intel/jit/conv/gen_convolution.hpp"
 #include "gpu/intel/ocl/gen9_wino_convolution.hpp"
-#include "gpu/intel/ocl/ref_convolution.hpp"
 
 #ifdef DNNL_DEV_MODE
 #include "gpu/intel/jit/v2/conv/gen_convolution.hpp"
@@ -50,7 +50,7 @@ const std::map<pk_impl_key_t, std::vector<impl_list_item_t>>
         GPU_INSTANCE_INTEL_DEVMODE(intel::jit::v2::conv::gen_convolution_fwd_t)
         GPU_INSTANCE_INTEL(intel::jit::gen_convolution_fwd_t)
         GPU_INSTANCE_INTEL(intel::ocl::gen9_wino_convolution_fwd_t)
-        GPU_INSTANCE_INTEL_REF(intel::ocl::ref_convolution_fwd_t)
+        GPU_INSTANCE_REF(intel::ocl::ref_convolution_fwd_t)
         GPU_INSTANCE_NVIDIA(nvidia::cudnn_convolution_fwd_t)
         GPU_INSTANCE_AMD(amd::miopen_convolution_fwd_t)
         nullptr,
@@ -58,7 +58,7 @@ const std::map<pk_impl_key_t, std::vector<impl_list_item_t>>
     {{backward_data}, REG_BWD_D_PK({
         GPU_INSTANCE_INTEL_DEVMODE(intel::jit::v2::conv::gen_convolution_bwd_data_t)
         GPU_INSTANCE_INTEL(intel::jit::gen_convolution_bwd_data_t)
-        GPU_INSTANCE_INTEL_REF(intel::ocl::ref_convolution_bwd_data_t)
+        GPU_INSTANCE_REF(intel::ocl::ref_convolution_bwd_data_t)
         GPU_INSTANCE_NVIDIA(nvidia::cudnn_convolution_bwd_data_t)
         GPU_INSTANCE_AMD(amd::miopen_convolution_bwd_data_t)
         nullptr,
@@ -66,7 +66,7 @@ const std::map<pk_impl_key_t, std::vector<impl_list_item_t>>
     {{backward_weights}, REG_BWD_PK({
         GPU_INSTANCE_INTEL_DEVMODE(intel::jit::v2::conv::gen_convolution_bwd_weights_t)
         GPU_INSTANCE_INTEL(intel::jit::gen_convolution_bwd_weights_t)
-        GPU_INSTANCE_INTEL_REF(intel::ocl::ref_convolution_bwd_weights_t)
+        GPU_INSTANCE_REF(intel::ocl::ref_convolution_bwd_weights_t)
         GPU_INSTANCE_NVIDIA(nvidia::cudnn_convolution_bwd_weights_t)
         GPU_INSTANCE_AMD(amd::miopen_convolution_bwd_weights_t)
         nullptr,
diff --git a/src/gpu/gpu_deconvolution_list.cpp b/src/gpu/gpu_deconvolution_list.cpp
index 5e8f07a8c5d..54352acec0e 100644
--- a/src/gpu/gpu_deconvolution_list.cpp
+++ b/src/gpu/gpu_deconvolution_list.cpp
@@ -15,10 +15,7 @@
 *******************************************************************************/
 
 #include "gpu/gpu_impl_list.hpp"
-
-#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
 #include "gpu/intel/ocl/convolution_deconvolution.hpp"
-#endif
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
 #include "gpu/nvidia/cudnn_deconvolution.hpp"
@@ -39,14 +36,14 @@ using namespace dnnl::impl::prop_kind;
 const std::map<pk_impl_key_t, std::vector<impl_list_item_t>>
         impl_list_map REG_DECONV_P({
     {{forward}, {
-        GPU_INSTANCE_INTEL(intel::ocl::convolution_deconvolution_fwd_t)
+        GPU_INSTANCE_GENERIC(intel::ocl::convolution_deconvolution_fwd_t)
         GPU_INSTANCE_NVIDIA(nvidia::cudnn_deconvolution_fwd_t)
         GPU_INSTANCE_AMD(amd::miopen_deconvolution_fwd_t)
         nullptr,
     }},
     {{backward}, REG_BWD_PK({
-        GPU_INSTANCE_INTEL(intel::ocl::convolution_deconvolution_bwd_data_t)
-        GPU_INSTANCE_INTEL(intel::ocl::convolution_deconvolution_bwd_weights_t)
+        GPU_INSTANCE_GENERIC(intel::ocl::convolution_deconvolution_bwd_data_t)
+        GPU_INSTANCE_GENERIC(intel::ocl::convolution_deconvolution_bwd_weights_t)
         GPU_INSTANCE_NVIDIA(nvidia::cudnn_deconvolution_bwd_data_t)
         GPU_INSTANCE_NVIDIA(nvidia::cudnn_deconvolution_bwd_weights_t)
         GPU_INSTANCE_AMD(amd::miopen_deconvolution_bwd_data_t)
diff --git a/src/gpu/gpu_eltwise_list.cpp b/src/gpu/gpu_eltwise_list.cpp
index 9250e1c43a6..30cfc62af0f 100644
--- a/src/gpu/gpu_eltwise_list.cpp
+++ b/src/gpu/gpu_eltwise_list.cpp
@@ -15,10 +15,10 @@
 *******************************************************************************/
 
 #include "gpu/gpu_impl_list.hpp"
+#include "gpu/intel/ocl/ref_eltwise.hpp"
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
 #include "gpu/intel/ocl/gen9_eltwise.hpp"
-#include "gpu/intel/ocl/ref_eltwise.hpp"
 #endif
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
@@ -42,7 +42,7 @@ const std::map<pk_impl_key_t, std::vector<impl_list_item_t>>
         impl_list_map REG_ELTWISE_P({
     {{forward}, {
         GPU_INSTANCE_INTEL(intel::ocl::gen9_eltwise_fwd_t)
-        GPU_INSTANCE_INTEL(intel::ocl::ref_eltwise_fwd_t)
+        GPU_INSTANCE_GENERIC(intel::ocl::ref_eltwise_fwd_t)
         GPU_INSTANCE_NVIDIA(nvidia::cudnn_eltwise_fwd_t)
         GPU_INSTANCE_AMD(amd::miopen_eltwise_fwd_t)
         GPU_INSTANCE_GENERIC_SYCL(generic::sycl::ref_sycl_eltwise_fwd_t)
@@ -50,7 +50,7 @@ const std::map<pk_impl_key_t, std::vector<impl_list_item_t>>
     }},
     {{backward}, REG_BWD_PK({
         GPU_INSTANCE_INTEL(intel::ocl::gen9_eltwise_bwd_t)
-        GPU_INSTANCE_INTEL(intel::ocl::ref_eltwise_bwd_t)
+        GPU_INSTANCE_GENERIC(intel::ocl::ref_eltwise_bwd_t)
         GPU_INSTANCE_NVIDIA(nvidia::cudnn_eltwise_bwd_t)
         GPU_INSTANCE_AMD(amd::miopen_eltwise_bwd_t)
         GPU_INSTANCE_GENERIC_SYCL(generic::sycl::ref_sycl_eltwise_bwd_t)
diff --git a/src/gpu/gpu_gemm_list.cpp b/src/gpu/gpu_gemm_list.cpp
index 08358253269..2a3ea0a368f 100644
--- a/src/gpu/gpu_gemm_list.cpp
+++ b/src/gpu/gpu_gemm_list.cpp
@@ -17,14 +17,14 @@
 #include "common/compiler_workarounds.hpp"
 
 #include "gpu/gpu_impl_list.hpp"
+#include "gpu/intel/ocl/gemm/gemm_with_post_ops.hpp"
+#include "gpu/intel/ocl/gemm/ref_gemm.hpp"
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
 #include "gpu/intel/jit/binary_format.hpp"
 
 #include "gpu/intel/jit/gemm/gen_gemm.hpp"
 #include "gpu/intel/jit/gemm/xe_hp_systolic_gemm.hpp"
-#include "gpu/intel/ocl/gemm/gemm_with_post_ops.hpp"
-#include "gpu/intel/ocl/gemm/ref_gemm.hpp"
 
 #ifdef DNNL_DEV_MODE
 #include "gpu/intel/ocl/gemm/conv_gemm.hpp"
@@ -44,7 +44,7 @@ constexpr impl_list_item_t impl_list[] = {
         GPU_INSTANCE_INTEL(intel::jit::xe_hp_systolic_gemm_t)
         GPU_INSTANCE_INTEL(intel::ocl::gemm_with_post_ops_t)
         GPU_INSTANCE_INTEL(intel::jit::gen_gemm_t)
-        GPU_INSTANCE_INTEL_REF(intel::ocl::ref_gemm_t)
+        GPU_INSTANCE_REF(intel::ocl::ref_gemm_t)
         nullptr,
 };
 // clang-format on
diff --git a/src/gpu/gpu_group_normalization_list.cpp b/src/gpu/gpu_group_normalization_list.cpp
index 65e3c4c6e2e..1b0bdafb136 100644
--- a/src/gpu/gpu_group_normalization_list.cpp
+++ b/src/gpu/gpu_group_normalization_list.cpp
@@ -15,10 +15,7 @@
 *******************************************************************************/
 
 #include "gpu/gpu_impl_list.hpp"
-
-#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
 #include "gpu/intel/ocl/ref_group_normalization.hpp"
-#endif
 
 namespace dnnl {
 namespace impl {
@@ -31,12 +28,12 @@ using namespace dnnl::impl::prop_kind;
 const std::map<pk_impl_key_t, std::vector<impl_list_item_t>>
 impl_list_map REG_GNORM_P({
     {{forward}, {
-        GPU_INSTANCE_INTEL(intel::ocl::ref_group_normalization_fwd_t)
+        GPU_INSTANCE_GENERIC(intel::ocl::ref_group_normalization_fwd_t)
         nullptr,
         }
     },
     {{backward}, REG_BWD_PK({
-        GPU_INSTANCE_INTEL(intel::ocl::ref_group_normalization_bwd_t)
+        GPU_INSTANCE_GENERIC(intel::ocl::ref_group_normalization_bwd_t)
         nullptr,
         })
     },
diff --git a/src/gpu/gpu_impl_list.hpp b/src/gpu/gpu_impl_list.hpp
index 992ac436428..b7180cb30a9 100644
--- a/src/gpu/gpu_impl_list.hpp
+++ b/src/gpu/gpu_impl_list.hpp
@@ -150,10 +150,9 @@ namespace gpu {
 
 // Instance macros that are enabled only when REF is disabled
 #ifdef DNNL_DISABLE_GPU_REF_KERNELS
-#define GPU_INSTANCE_INTEL_REF(...)
+#define GPU_INSTANCE_REF(...)
 #else
-#define GPU_INSTANCE_INTEL_REF(...) \
-    DNNL_GPU_INTEL_ONLY(GPU_INSTANCE(__VA_ARGS__))
+#define GPU_INSTANCE_REF(...) GPU_INSTANCE(__VA_ARGS__)
 #endif
 
 #define DECLARE_IMPL_LIST(kind) \
diff --git a/src/gpu/gpu_inner_product_list.cpp b/src/gpu/gpu_inner_product_list.cpp
index b13f990a9a5..f32d521c56d 100644
--- a/src/gpu/gpu_inner_product_list.cpp
+++ b/src/gpu/gpu_inner_product_list.cpp
@@ -15,12 +15,12 @@
 *******************************************************************************/
 
 #include "gpu/gpu_impl_list.hpp"
+#include "gpu/intel/ocl/convolution_inner_product.hpp"
+#include "gpu/intel/ocl/ref_inner_product.hpp"
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
-#include "gpu/intel/ocl/convolution_inner_product.hpp"
 #include "gpu/intel/ocl/gemm_inner_product.hpp"
 #include "gpu/intel/ocl/gemm_post_ops_inner_product.hpp"
-#include "gpu/intel/ocl/ref_inner_product.hpp"
 #endif
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
@@ -45,7 +45,7 @@ const std::map<pk_impl_key_t, std::vector<impl_list_item_t>>
     {{forward}, {
         GPU_INSTANCE_INTEL(intel::ocl::gemm_inner_product_fwd_t)
         GPU_INSTANCE_INTEL(intel::ocl::convolution_inner_product_fwd_t)
-        GPU_INSTANCE_INTEL_REF(intel::ocl::ref_inner_product_fwd_t)
+        GPU_INSTANCE_REF(intel::ocl::ref_inner_product_fwd_t)
         GPU_INSTANCE_NVIDIA(nvidia::cudnn_gemm_inner_product_fwd_t)
         GPU_INSTANCE_NVIDIA(nvidia::cudnn_conv_inner_product_fwd_t)
         GPU_INSTANCE_AMD(amd::miopen_gemm_inner_product_fwd_t)
@@ -54,8 +54,8 @@ const std::map<pk_impl_key_t, std::vector<impl_list_item_t>>
     {{backward}, REG_BWD_PK({
         GPU_INSTANCE_INTEL(intel::ocl::gemm_inner_product_bwd_data_t)
         GPU_INSTANCE_INTEL(intel::ocl::gemm_inner_product_bwd_weights_t)
-        GPU_INSTANCE_INTEL_REF(intel::ocl::ref_inner_product_bwd_data_t)
-        GPU_INSTANCE_INTEL_REF(intel::ocl::ref_inner_product_bwd_weights_t)
+        GPU_INSTANCE_REF(intel::ocl::ref_inner_product_bwd_data_t)
+        GPU_INSTANCE_REF(intel::ocl::ref_inner_product_bwd_weights_t)
         GPU_INSTANCE_NVIDIA(nvidia::cudnn_gemm_inner_product_bwd_data_t)
         GPU_INSTANCE_NVIDIA(nvidia::cudnn_gemm_inner_product_bwd_weights_t)
         GPU_INSTANCE_NVIDIA(nvidia::cudnn_conv_inner_product_bwd_data_t)
diff --git a/src/gpu/gpu_layer_normalization_list.cpp b/src/gpu/gpu_layer_normalization_list.cpp
index 70fa92bd7e8..3e427eab763 100644
--- a/src/gpu/gpu_layer_normalization_list.cpp
+++ b/src/gpu/gpu_layer_normalization_list.cpp
@@ -16,12 +16,10 @@
 
 #include "gpu/gpu_impl_list.hpp"
 
-#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
 #include "gpu/intel/ocl/ref_layer_normalization.hpp"
 #include "gpu/intel/ocl/reusable_lnorm.hpp"
 #include "gpu/intel/ocl/reusable_vectorized_lnorm.hpp"
 #include "gpu/intel/ocl/vectorized_lnorm.hpp"
-#endif
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
 #include "gpu/generic/sycl/ref_layer_normalizations.hpp"
@@ -38,17 +36,17 @@ using namespace dnnl::impl::prop_kind;
 const std::map<pk_impl_key_t, std::vector<impl_list_item_t>>
         impl_list_map REG_LNORM_P({
     {{forward}, {
-        GPU_INSTANCE_INTEL(intel::ocl::reusable_vectorized_layer_normalization_fwd_t)
-        GPU_INSTANCE_INTEL(intel::ocl::vectorized_lnorm_fwd_t)
-        GPU_INSTANCE_INTEL(intel::ocl::ref_layer_normalization_fwd_t)
-        GPU_INSTANCE_INTEL(intel::ocl::reusable_layer_normalization_fwd_t)
+        GPU_INSTANCE_GENERIC(intel::ocl::reusable_vectorized_layer_normalization_fwd_t)
+        GPU_INSTANCE_GENERIC(intel::ocl::vectorized_lnorm_fwd_t)
+        GPU_INSTANCE_GENERIC(intel::ocl::ref_layer_normalization_fwd_t)
+        GPU_INSTANCE_GENERIC(intel::ocl::reusable_layer_normalization_fwd_t)
         GPU_INSTANCE_GENERIC_SYCL(generic::sycl::ref_layer_normalization_fwd_t)
         nullptr,
     }},
     {{backward}, REG_BWD_PK({
-        GPU_INSTANCE_INTEL(intel::ocl::vectorized_lnorm_bwd_t)
-        GPU_INSTANCE_INTEL(intel::ocl::ref_layer_normalization_bwd_t)
-        GPU_INSTANCE_INTEL(intel::ocl::reusable_layer_normalization_bwd_t)
+        GPU_INSTANCE_GENERIC(intel::ocl::vectorized_lnorm_bwd_t)
+        GPU_INSTANCE_GENERIC(intel::ocl::ref_layer_normalization_bwd_t)
+        GPU_INSTANCE_GENERIC(intel::ocl::reusable_layer_normalization_bwd_t)
         GPU_INSTANCE_GENERIC_SYCL(generic::sycl::ref_layer_normalization_bwd_t)
         nullptr,
     })},
diff --git a/src/gpu/gpu_lrn_list.cpp b/src/gpu/gpu_lrn_list.cpp
index 40a6dd688c7..8d339037238 100644
--- a/src/gpu/gpu_lrn_list.cpp
+++ b/src/gpu/gpu_lrn_list.cpp
@@ -16,9 +16,7 @@
 
 #include "gpu/gpu_impl_list.hpp"
 
-#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
 #include "gpu/intel/ocl/ref_lrn.hpp"
-#endif
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
 #include "gpu/generic/sycl/ref_lrn.hpp"
@@ -40,14 +38,14 @@ using namespace dnnl::impl::prop_kind;
 const std::map<pk_impl_key_t, std::vector<impl_list_item_t>>
         impl_list_map REG_LRN_P({
     {{forward}, {
-        GPU_INSTANCE_INTEL(intel::ocl::ref_lrn_fwd_t)
+        GPU_INSTANCE_GENERIC(intel::ocl::ref_lrn_fwd_t)
         GPU_INSTANCE_NVIDIA(nvidia::cudnn_lrn_fwd_t)
         GPU_INSTANCE_AMD(amd::miopen_lrn_fwd_t)
         GPU_INSTANCE_GENERIC_SYCL(generic::sycl::ref_sycl_lrn_fwd_t)
         nullptr,
     }},
     {{backward}, REG_BWD_PK({
-        GPU_INSTANCE_INTEL(intel::ocl::ref_lrn_bwd_t)
+        GPU_INSTANCE_GENERIC(intel::ocl::ref_lrn_bwd_t)
         GPU_INSTANCE_NVIDIA(nvidia::cudnn_lrn_bwd_t)
         GPU_INSTANCE_AMD(amd::miopen_lrn_bwd_t)
         GPU_INSTANCE_GENERIC_SYCL(generic::sycl::ref_sycl_lrn_bwd_t)
diff --git a/src/gpu/gpu_matmul_list.cpp b/src/gpu/gpu_matmul_list.cpp
index 940ce994d49..e586489b4af 100644
--- a/src/gpu/gpu_matmul_list.cpp
+++ b/src/gpu/gpu_matmul_list.cpp
@@ -15,10 +15,10 @@
 *******************************************************************************/
 
 #include "gpu/gpu_impl_list.hpp"
+#include "gpu/intel/ocl/ref_matmul.hpp"
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
 #include "gpu/intel/ocl/gemm_matmul.hpp"
-#include "gpu/intel/ocl/ref_matmul.hpp"
 #endif
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
@@ -38,7 +38,7 @@ namespace {
 // clang-format off
 constexpr impl_list_item_t impl_list[] = REG_MATMUL_P({
         GPU_INSTANCE_INTEL(intel::ocl::gemm_matmul_t)
-        GPU_INSTANCE_INTEL_REF(intel::ocl::ref_matmul_t)
+        GPU_INSTANCE_REF(intel::ocl::ref_matmul_t)
         GPU_INSTANCE_NVIDIA(nvidia::cudnn_matmul_t)
         GPU_INSTANCE_AMD(amd::miopen_matmul_t)
         nullptr,
diff --git a/src/gpu/gpu_pooling_list.cpp b/src/gpu/gpu_pooling_list.cpp
index 12da2459cc6..afcbb4d26db 100644
--- a/src/gpu/gpu_pooling_list.cpp
+++ b/src/gpu/gpu_pooling_list.cpp
@@ -17,12 +17,12 @@
 #include "common/compiler_workarounds.hpp"
 
 #include "gpu/gpu_impl_list.hpp"
+#include "gpu/intel/ocl/ref_pooling.hpp"
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
 #include "gpu/intel/jit/pooling/gen_pooling.hpp"
 #include "gpu/intel/ocl/gen9_global_pooling.hpp"
 #include "gpu/intel/ocl/gen9_pooling.hpp"
-#include "gpu/intel/ocl/ref_pooling.hpp"
 #endif
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
@@ -48,7 +48,7 @@ const std::map<pk_impl_key_t, std::vector<impl_list_item_t>>
         GPU_INSTANCE_INTEL(intel::jit::gen_pooling_fwd_t)
         GPU_INSTANCE_INTEL(intel::ocl::gen9_global_pooling_fwd_t)
         GPU_INSTANCE_INTEL(intel::ocl::gen9_pooling_fwd_t)
-        GPU_INSTANCE_INTEL(intel::ocl::ref_pooling_fwd_t)
+        GPU_INSTANCE_GENERIC(intel::ocl::ref_pooling_fwd_t)
         GPU_INSTANCE_NVIDIA(nvidia::cudnn_pooling_fwd_t)
         GPU_INSTANCE_AMD(amd::miopen_pooling_fwd_t)
         GPU_INSTANCE_GENERIC_SYCL(generic::sycl::ref_pooling_fwd_t)
@@ -57,7 +57,7 @@ const std::map<pk_impl_key_t, std::vector<impl_list_item_t>>
     {{backward}, REG_BWD_PK({
         GPU_INSTANCE_INTEL(intel::ocl::gen9_global_pooling_bwd_t)
         GPU_INSTANCE_INTEL(intel::ocl::gen9_pooling_bwd_t)
-        GPU_INSTANCE_INTEL(intel::ocl::ref_pooling_bwd_t)
+        GPU_INSTANCE_GENERIC(intel::ocl::ref_pooling_bwd_t)
         GPU_INSTANCE_NVIDIA(nvidia::cudnn_pooling_bwd_t)
         GPU_INSTANCE_AMD(amd::miopen_pooling_bwd_t)
         GPU_INSTANCE_GENERIC_SYCL(generic::sycl::ref_pooling_bwd_t)
diff --git a/src/gpu/gpu_prelu_list.cpp b/src/gpu/gpu_prelu_list.cpp
index a649197a6d2..6961aece191 100644
--- a/src/gpu/gpu_prelu_list.cpp
+++ b/src/gpu/gpu_prelu_list.cpp
@@ -18,9 +18,7 @@
 
 #include "gpu/gpu_impl_list.hpp"
 
-#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
 #include "gpu/intel/ocl/ref_prelu.hpp"
-#endif
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
 #include "gpu/generic/sycl/ref_prelu.hpp"
@@ -37,12 +35,12 @@ using namespace dnnl::impl::prop_kind;
 const std::map<pk_impl_key_t, std::vector<impl_list_item_t>>
         impl_list_map REG_PRELU_P({
     {{forward}, {
-        GPU_INSTANCE_INTEL(intel::ocl::ref_prelu_fwd_t)
+        GPU_INSTANCE_GENERIC(intel::ocl::ref_prelu_fwd_t)
         GPU_INSTANCE_GENERIC_SYCL(generic::sycl::ref_prelu_fwd_t)
         nullptr,
     }},
     {{backward}, REG_BWD_PK({
-        GPU_INSTANCE_INTEL(intel::ocl::ref_prelu_bwd_t)
+        GPU_INSTANCE_GENERIC(intel::ocl::ref_prelu_bwd_t)
         GPU_INSTANCE_GENERIC_SYCL(generic::sycl::ref_prelu_bwd_t)
         nullptr,
     })},
diff --git a/src/gpu/gpu_reduction_list.cpp b/src/gpu/gpu_reduction_list.cpp
index b29c238e04a..85f8a956fb3 100644
--- a/src/gpu/gpu_reduction_list.cpp
+++ b/src/gpu/gpu_reduction_list.cpp
@@ -16,12 +16,13 @@
 
 #include "gpu/gpu_impl_list.hpp"
 
-#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
 #include "gpu/intel/ocl/reduction/atomic_reduction.hpp"
 #include "gpu/intel/ocl/reduction/combined_reduction.hpp"
 #include "gpu/intel/ocl/reduction/ref_reduction.hpp"
 #include "gpu/intel/ocl/reduction/reusable_ref_reduction.hpp"
 
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
+
 #ifdef DNNL_DEV_MODE
 #include "gpu/intel/jit/jit_reduction.hpp"
 #endif
@@ -45,10 +46,10 @@ namespace {
 // clang-format off
 constexpr impl_list_item_t impl_list[] = REG_REDUCTION_P({
         GPU_INSTANCE_INTEL_DEVMODE(intel::jit::jit_reduction_t)
-        GPU_INSTANCE_INTEL(intel::ocl::atomic_reduction_t)
-        GPU_INSTANCE_INTEL(intel::ocl::combined_reduction_t)
-        GPU_INSTANCE_INTEL(intel::ocl::ref_reduction_t)
-        GPU_INSTANCE_INTEL(intel::ocl::reusable_ref_reduction_t)
+        GPU_INSTANCE_GENERIC(intel::ocl::atomic_reduction_t)
+        GPU_INSTANCE_GENERIC(intel::ocl::combined_reduction_t)
+        GPU_INSTANCE_GENERIC(intel::ocl::ref_reduction_t)
+        GPU_INSTANCE_GENERIC(intel::ocl::reusable_ref_reduction_t)
         GPU_INSTANCE_NVIDIA(nvidia::cudnn_reduction_t)
         GPU_INSTANCE_AMD(amd::miopen_reduction_t)
         nullptr,
diff --git a/src/gpu/gpu_reorder_list.cpp b/src/gpu/gpu_reorder_list.cpp
index be52bad91f5..8dff445259d 100644
--- a/src/gpu/gpu_reorder_list.cpp
+++ b/src/gpu/gpu_reorder_list.cpp
@@ -18,13 +18,13 @@
 
 #include "gpu/generic/cross_engine_reorder.hpp"
 #include "gpu/generic/direct_copy.hpp"
-
-#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
-#include "gpu/intel/jit/reorder/gen_reorder.hpp"
 #include "gpu/intel/ocl/custom_reorder.hpp"
 #include "gpu/intel/ocl/generic_reorder.hpp"
 #include "gpu/intel/ocl/ref_reorder.hpp"
 #include "gpu/intel/ocl/rnn/rnn_reorders.hpp"
+
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
+#include "gpu/intel/jit/reorder/gen_reorder.hpp"
 #endif
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
@@ -46,12 +46,12 @@ using namespace dnnl::impl::data_type;
 
 // clang-format off
 constexpr impl_list_item_t impl_list[] = REG_REORDER_P({
-        GPU_REORDER_INSTANCE_INTEL(intel::ocl::rnn_weights_reorder_t::pd_t)
+        GPU_REORDER_INSTANCE_GENERIC(intel::ocl::rnn_weights_reorder_t::pd_t)
         GPU_REORDER_INSTANCE_GENERIC(generic::direct_copy_t::pd_t)
         GPU_REORDER_INSTANCE_INTEL(intel::jit::gen_reorder_t::pd_t)
-        GPU_REORDER_INSTANCE_INTEL(intel::ocl::custom_reorder_t::pd_t) // for specific tensor shapes
-        GPU_REORDER_INSTANCE_INTEL(intel::ocl::generic_reorder_t::pd_t)// fast and quite generic
-        GPU_REORDER_INSTANCE_INTEL(intel::ocl::ref_reorder_t::pd_t)    // slow but fits every use case
+        GPU_REORDER_INSTANCE_GENERIC(intel::ocl::custom_reorder_t::pd_t) // for specific tensor shapes
+        GPU_REORDER_INSTANCE_GENERIC(intel::ocl::generic_reorder_t::pd_t)// fast and quite generic
+        GPU_REORDER_INSTANCE_GENERIC(intel::ocl::ref_reorder_t::pd_t)    // slow but fits every use case
         GPU_REORDER_INSTANCE_NVIDIA(nvidia::cudnn_reorder_t::pd_t)
         GPU_REORDER_INSTANCE_AMD(amd::miopen_reorder_t::pd_t)
         GPU_REORDER_INSTANCE_GENERIC_SYCL(generic::sycl::ref_reorder_t::pd_t)
diff --git a/src/gpu/gpu_resampling_list.cpp b/src/gpu/gpu_resampling_list.cpp
index e9a623c0438..d33696c4ce8 100644
--- a/src/gpu/gpu_resampling_list.cpp
+++ b/src/gpu/gpu_resampling_list.cpp
@@ -16,10 +16,8 @@
 
 #include "gpu/gpu_impl_list.hpp"
 
-#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
 #include "gpu/intel/ocl/ref_resampling.hpp"
 #include "gpu/intel/ocl/vectorized_resampling.hpp"
-#endif
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
 #include "gpu/generic/sycl/ref_resampling.hpp"
@@ -37,14 +35,14 @@ using namespace dnnl::impl::prop_kind;
 const std::map<pk_impl_key_t, std::vector<impl_list_item_t>>
         impl_list_map REG_RESAMPLING_P({
     {{forward}, {
-        GPU_INSTANCE_INTEL(intel::ocl::ref_resampling_fwd_t)
+        GPU_INSTANCE_GENERIC(intel::ocl::ref_resampling_fwd_t)
         GPU_INSTANCE_NVIDIA(nvidia::cudnn_resampling_fwd_t)
         GPU_INSTANCE_GENERIC_SYCL(generic::sycl::ref_resampling_fwd_t)
         nullptr,
     }},
     {{backward}, REG_BWD_PK({
-        GPU_INSTANCE_INTEL(intel::ocl::vectorized_resampling_bwd_t)
-        GPU_INSTANCE_INTEL(intel::ocl::ref_resampling_bwd_t)
+        GPU_INSTANCE_GENERIC(intel::ocl::vectorized_resampling_bwd_t)
+        GPU_INSTANCE_GENERIC(intel::ocl::ref_resampling_bwd_t)
         GPU_INSTANCE_NVIDIA(nvidia::cudnn_resampling_bwd_t)
         GPU_INSTANCE_GENERIC_SYCL(generic::sycl::ref_resampling_bwd_t)
         nullptr,
diff --git a/src/gpu/gpu_rnn_list.cpp b/src/gpu/gpu_rnn_list.cpp
index 7e9526abf61..0bf74d7a245 100644
--- a/src/gpu/gpu_rnn_list.cpp
+++ b/src/gpu/gpu_rnn_list.cpp
@@ -16,9 +16,7 @@
 
 #include "gpu/gpu_impl_list.hpp"
 
-#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
 #include "gpu/intel/ocl/rnn/rnn_grid.hpp"
-#endif
 
 namespace dnnl {
 namespace impl {
@@ -31,11 +29,11 @@ using namespace dnnl::impl::prop_kind;
 const std::map<pk_impl_key_t, std::vector<impl_list_item_t>>
         impl_list_map REG_RNN_P({
     {{forward}, {
-        GPU_INSTANCE_INTEL(intel::ocl::simple_rnn_fwd_t)
+        GPU_INSTANCE_GENERIC(intel::ocl::simple_rnn_fwd_t)
         nullptr,
     }},
     {{backward}, REG_BWD_PK({
-        GPU_INSTANCE_INTEL(intel::ocl::simple_rnn_bwd_t)
+        GPU_INSTANCE_GENERIC(intel::ocl::simple_rnn_bwd_t)
         nullptr,
     })},
 });
diff --git a/src/gpu/gpu_sdpa_list.cpp b/src/gpu/gpu_sdpa_list.cpp
index 1c9230f95c1..a101c190c8e 100644
--- a/src/gpu/gpu_sdpa_list.cpp
+++ b/src/gpu/gpu_sdpa_list.cpp
@@ -30,7 +30,7 @@ namespace {
 // clang-format off
 constexpr impl_list_item_t impl_list[] = REG_SDPA_P({
         GPU_INSTANCE_INTEL(intel::ocl::micro_sdpa_t)
-        GPU_INSTANCE_INTEL_DEVMODE(intel::ocl::ref_sdpa_t)
+        GPU_INSTANCE_GENERIC(intel::ocl::ref_sdpa_t)
         nullptr,
 });
 // clang-format on
diff --git a/src/gpu/gpu_shuffle_list.cpp b/src/gpu/gpu_shuffle_list.cpp
index fe3ef9c2e9c..4f867b987ab 100644
--- a/src/gpu/gpu_shuffle_list.cpp
+++ b/src/gpu/gpu_shuffle_list.cpp
@@ -16,10 +16,8 @@
 
 #include "gpu/gpu_impl_list.hpp"
 
-#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
 #include "gpu/intel/ocl/ref_shuffle.hpp"
 #include "gpu/intel/ocl/shuffle_by_reorder.hpp"
-#endif
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
 #include "gpu/generic/sycl/ref_shuffle.hpp"
@@ -33,8 +31,8 @@ namespace {
 
 // clang-format off
 constexpr impl_list_item_t impl_list[] = REG_SHUFFLE_P({
-        GPU_INSTANCE_INTEL(intel::ocl::shuffle_by_reorder_t)
-        GPU_INSTANCE_INTEL(intel::ocl::ref_shuffle_t)
+        GPU_INSTANCE_GENERIC(intel::ocl::shuffle_by_reorder_t)
+        GPU_INSTANCE_GENERIC(intel::ocl::ref_shuffle_t)
         GPU_INSTANCE_GENERIC_SYCL(generic::sycl::ref_shuffle_t)
         nullptr,
 });
diff --git a/src/gpu/gpu_softmax_list.cpp b/src/gpu/gpu_softmax_list.cpp
index fbbd595b941..b28d57c1268 100644
--- a/src/gpu/gpu_softmax_list.cpp
+++ b/src/gpu/gpu_softmax_list.cpp
@@ -15,11 +15,11 @@
 *******************************************************************************/
 
 #include "gpu/gpu_impl_list.hpp"
+#include "gpu/intel/ocl/reusable_softmax.hpp"
+#include "gpu/intel/ocl/simple_softmax.hpp"
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
 #include "gpu/intel/ocl/gen9_softmax.hpp"
-#include "gpu/intel/ocl/reusable_softmax.hpp"
-#include "gpu/intel/ocl/simple_softmax.hpp"
 #endif
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
@@ -43,8 +43,8 @@ const std::map<pk_impl_key_t, std::vector<impl_list_item_t>>
         impl_list_map REG_SOFTMAX_P({
     {{forward}, {
         GPU_INSTANCE_INTEL(intel::ocl::gen9_softmax_fwd_t)
-        GPU_INSTANCE_INTEL(intel::ocl::simple_softmax_fwd_t)
-        GPU_INSTANCE_INTEL(intel::ocl::reusable_softmax_fwd_t)
+        GPU_INSTANCE_GENERIC(intel::ocl::simple_softmax_fwd_t)
+        GPU_INSTANCE_GENERIC(intel::ocl::reusable_softmax_fwd_t)
         GPU_INSTANCE_NVIDIA(nvidia::cudnn_softmax_fwd_t)
         GPU_INSTANCE_AMD(amd::miopen_softmax_fwd_t)
         GPU_INSTANCE_GENERIC_SYCL(generic::sycl::ref_sycl_softmax_fwd_t)
@@ -52,7 +52,7 @@ const std::map<pk_impl_key_t, std::vector<impl_list_item_t>>
     }},
     {{backward}, REG_BWD_PK({
         GPU_INSTANCE_INTEL(intel::ocl::gen9_softmax_bwd_t)
-        GPU_INSTANCE_INTEL(intel::ocl::simple_softmax_bwd_t)
+        GPU_INSTANCE_GENERIC(intel::ocl::simple_softmax_bwd_t)
         GPU_INSTANCE_NVIDIA(nvidia::cudnn_softmax_bwd_t)
         GPU_INSTANCE_AMD(amd::miopen_softmax_bwd_t)
         GPU_INSTANCE_GENERIC_SYCL(generic::sycl::ref_sycl_softmax_bwd_t)
diff --git a/src/gpu/gpu_sum_list.cpp b/src/gpu/gpu_sum_list.cpp
index acf21625e7c..af05ca44afe 100644
--- a/src/gpu/gpu_sum_list.cpp
+++ b/src/gpu/gpu_sum_list.cpp
@@ -20,13 +20,13 @@
 #include "gpu/gpu_sum_pd.hpp"
 
 #include "gpu/generic/ref_sum.hpp"
+#include "gpu/intel/ocl/many_inputs_sum.hpp"
+#include "gpu/intel/ocl/multi_po_reorder_sum.hpp"
+#include "gpu/intel/ocl/simple_sum.hpp"
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
 #include "gpu/intel/jit/gen9_simple_sum.hpp"
 #include "gpu/intel/ocl/gen9_sum.hpp"
-#include "gpu/intel/ocl/many_inputs_sum.hpp"
-#include "gpu/intel/ocl/multi_po_reorder_sum.hpp"
-#include "gpu/intel/ocl/simple_sum.hpp"
 #endif
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
@@ -44,10 +44,10 @@ namespace {
 
 // clang-format off
 constexpr impl_list_item_t impl_list[] = REG_SUM_P({
-        GPU_SUM_INSTANCE_INTEL(intel::ocl::multi_po_reorder_sum)
+        GPU_SUM_INSTANCE_GENERIC(intel::ocl::multi_po_reorder_sum)
         GPU_SUM_INSTANCE_INTEL(intel::ocl::gen9_sum_t)
-        GPU_SUM_INSTANCE_INTEL(intel::ocl::many_inputs_sum_t)
-        GPU_SUM_INSTANCE_INTEL(intel::ocl::simple_sum_t<data_type::f32>)
+        GPU_SUM_INSTANCE_GENERIC(intel::ocl::many_inputs_sum_t)
+        GPU_SUM_INSTANCE_GENERIC(intel::ocl::simple_sum_t<data_type::f32>)
         GPU_SUM_INSTANCE_NVIDIA(nvidia::cudnn_ref_sum_t)
         GPU_SUM_INSTANCE_GENERIC(generic::ref_sum_t)
         nullptr,
diff --git a/src/gpu/gpu_zero_pad_list.cpp b/src/gpu/gpu_zero_pad_list.cpp
index 4c17827ce72..501d92bfd91 100644
--- a/src/gpu/gpu_zero_pad_list.cpp
+++ b/src/gpu/gpu_zero_pad_list.cpp
@@ -15,10 +15,7 @@
 *******************************************************************************/
 
 #include "gpu/gpu_impl_list.hpp"
-
-#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
 #include "gpu/intel/ocl/simple_zero_pad.hpp"
-#endif
 
 namespace dnnl {
 namespace impl {
@@ -28,7 +25,7 @@ namespace {
 
 // clang-format off
 constexpr impl_list_item_t impl_list[] = {
-        GPU_INSTANCE_INTEL(intel::ocl::simple_zero_pad_t)
+        GPU_INSTANCE_GENERIC(intel::ocl::simple_zero_pad_t)
         nullptr,
 };
 // clang-format on
diff --git a/src/gpu/intel/CMakeLists.txt b/src/gpu/intel/CMakeLists.txt
index 994bfb9af39..b3d4b538db9 100644
--- a/src/gpu/intel/CMakeLists.txt
+++ b/src/gpu/intel/CMakeLists.txt
@@ -19,16 +19,20 @@ file(GLOB SOURCES
     ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
     )
 
-add_definitions_with_host_compiler(-DNGEN_CPP11)
-add_definitions_with_host_compiler(-DNGEN_SAFE)
-add_definitions_with_host_compiler(-DNGEN_NEO_INTERFACE)
-add_definitions_with_host_compiler(-DNGEN_NO_OP_NAMES)
-add_definitions_with_host_compiler(-DNGEN_WINDOWS_COMPAT)
-
 add_subdirectory(compute)
-add_subdirectory(microkernels)
-add_subdirectory(jit)
-add_subdirectory(ocl)
+if(DNNL_GPU_VENDOR STREQUAL "INTEL")
+    add_definitions_with_host_compiler(-DNGEN_CPP11)
+    add_definitions_with_host_compiler(-DNGEN_SAFE)
+    add_definitions_with_host_compiler(-DNGEN_NEO_INTERFACE)
+    add_definitions_with_host_compiler(-DNGEN_NO_OP_NAMES)
+    add_definitions_with_host_compiler(-DNGEN_WINDOWS_COMPAT)
+    add_subdirectory(microkernels)
+    add_subdirectory(jit)
+endif()
+
+if(DNNL_WITH_OCL)
+    add_subdirectory(ocl)
+endif()
 
 if(DNNL_WITH_SYCL)
     add_subdirectory(sycl)
diff --git a/src/gpu/intel/compute/device_info.cpp b/src/gpu/intel/compute/device_info.cpp
index 237dded47b3..52296eef40f 100644
--- a/src/gpu/intel/compute/device_info.cpp
+++ b/src/gpu/intel/compute/device_info.cpp
@@ -20,7 +20,9 @@
 
 #include "common/type_helpers.hpp"
 #include "gpu/intel/compute/device_info.hpp"
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
 #include "gpu/intel/jit/utils/ngen_type_bridge.hpp"
+#endif
 #include "gpu/intel/utils.hpp"
 
 #ifdef DNNL_WITH_SYCL
@@ -129,8 +131,12 @@ int device_info_t::max_subgroup_size(gpu_arch_t gpu_arch) {
 }
 
 int device_info_t::grf_size(gpu_arch_t gpu_arch) {
+    int size = 0;
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
     ngen::HW hw = jit::convert_dnnl_arch_to_ngen(gpu_arch);
-    return ngen::GRF::bytes(hw);
+    size = ngen::GRF::bytes(hw);
+#endif
+    return size;
 }
 
 int device_info_t::min_subgroup_size() const {
diff --git a/src/gpu/intel/ocl/CMakeLists.txt b/src/gpu/intel/ocl/CMakeLists.txt
index 237eb8ba01e..b4cfda5cd9b 100644
--- a/src/gpu/intel/ocl/CMakeLists.txt
+++ b/src/gpu/intel/ocl/CMakeLists.txt
@@ -26,6 +26,11 @@ include("${PROJECT_SOURCE_DIR}/cmake/gen_gpu_kernel_list.cmake")
 file(GLOB_RECURSE CL_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/*.cl)
 file(GLOB_RECURSE CL_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/*.h)
 
+if(NOT DNNL_GPU_VENDOR STREQUAL "INTEL")
+	list(REMOVE_ITEM SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/micro_sdpa.cpp")
+	list(REMOVE_ITEM SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/ocl_gpu_hw_info.cpp")
+endif()
+
 set(kernel_list_templ "${PROJECT_SOURCE_DIR}/src/gpu/intel/ocl/ocl_kernel_list.cpp.in")
 set(kernel_list_src "${PROJECT_BINARY_DIR}/src/gpu/intel/ocl/ocl_kernel_list.cpp")
 
diff --git a/src/gpu/intel/ocl/ocl_gpu_device_info.cpp b/src/gpu/intel/ocl/ocl_gpu_device_info.cpp
index f79047823d7..2241faaa0fd 100644
--- a/src/gpu/intel/ocl/ocl_gpu_device_info.cpp
+++ b/src/gpu/intel/ocl/ocl_gpu_device_info.cpp
@@ -42,9 +42,11 @@ status_t ocl_gpu_device_info_t::init_arch(impl::engine_t *engine) {
             = clCreateContext(nullptr, 1, &device, nullptr, nullptr, &err);
     OCL_CHECK(err);
 
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
     init_gpu_hw_info(engine, device, context, ip_version_, gpu_arch_,
             gpu_product_family_, stepping_id_, native_extensions_,
             mayiuse_systolic_, mayiuse_ngen_kernels_);
+#endif
 
     err = clReleaseContext(context);
     OCL_CHECK(err);
diff --git a/src/xpu/CMakeLists.txt b/src/xpu/CMakeLists.txt
index 8cf11d5c8d2..f4ab64cd3fe 100644
--- a/src/xpu/CMakeLists.txt
+++ b/src/xpu/CMakeLists.txt
@@ -25,7 +25,7 @@ if(DNNL_WITH_SYCL)
     add_subdirectory(sycl)
 endif()
 
-if(DNNL_GPU_VENDOR STREQUAL "INTEL")
+if(DNNL_WITH_OCL)
     add_subdirectory(ocl)
 endif()
 
diff --git a/src/xpu/ocl/usm_utils.cpp b/src/xpu/ocl/usm_utils.cpp
index 06b267a0604..13baac86ee3 100644
--- a/src/xpu/ocl/usm_utils.cpp
+++ b/src/xpu/ocl/usm_utils.cpp
@@ -15,8 +15,6 @@
 *******************************************************************************/
 
 #include "xpu/ocl/usm_utils.hpp"
-
-#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
 #include "gpu/intel/ocl/usm_utils.hpp"
 
 #define HANDLE_USM_CALL_V(e, ...) \
@@ -27,10 +25,6 @@
     assert(e->kind() == engine_kind::gpu); \
     return gpu::intel::ocl::usm::__VA_ARGS__
 
-#else
-#error "Unsupported vendor"
-#endif
-
 namespace dnnl {
 namespace impl {
 namespace xpu {