Start enabling the "generic" OpenCL vendor #2019
base: main
```diff
@@ -146,6 +146,7 @@ set(DNNL_ENABLE_PRIMITIVE_GPU_ISA "ALL" CACHE STRING
     at build time. Regardless of value chosen, reference OpenCL-based
     implementations will always be available. Valid values:
     - ALL (the default). Includes all ISA to be enabled.
+    - NONE. Includes no ISAs, just the generic kernels.
     - <ISA_NAME>;<ISA_NAME>;... Includes only selected ISA to be enabled.
     Possible values are: GEN9, GEN11, XELP, XEHP, XEHPG, XEHPC, XE2.")
```
```diff
@@ -281,13 +282,13 @@ endif()

 set(DNNL_GPU_VENDOR "NONE" CACHE STRING
     "When DNNL_GPU_RUNTIME is not NONE DNNL_GPU_VENDOR specifies target GPU
-    vendor for GPU engines. Can be INTEL (default), NVIDIA or AMD.")
+    vendor for GPU engines. Can be INTEL (default), GENERIC, NVIDIA or AMD.")

 if(NOT DNNL_GPU_RUNTIME STREQUAL "NONE" AND DNNL_GPU_VENDOR STREQUAL "NONE")
```
Review comment (random spot to continue the discussion from #2044):
This isn't quite accurate: the list is for potential primitive implementations. As things currently stand, each implementation contains its own dispatcher, which can be used to implement more logic, including calling different OpenCL kernels or even different primitive implementations.

We have not seen that need in practice. Generally, performance differences like this end up being handled by the dispatching logic within the primitive implementation.

In general, most of the attributes we rely on enable generic programming in C: things like function overloading and automatic type inference. This was chosen for better type safety, as the alternative is using macros everywhere.
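Presumably this refers to mechanisms like the Clang `overloadable` attribute available in OpenCL C. A minimal hypothetical sketch (not taken from oneDNN's kernels; `load_val` is an invented helper) of how such attributes give C-style kernels overloading without macros:

```c
// Hypothetical OpenCL C sketch: the `overloadable` attribute lets the
// compiler pick the right variant from argument types, rather than a
// macro selecting the implementation.
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

__attribute__((overloadable)) float load_val(__global const float *p) {
    return *p;
}
__attribute__((overloadable)) float load_val(__global const half *p) {
    return convert_float(*p); // promoted to float on load
}

__kernel void scale_f32(__global const float *src, __global float *dst, float alpha) {
    size_t i = get_global_id(0);
    dst[i] = alpha * load_val(src + i); // resolves to the float overload
}

__kernel void scale_f16(__global const half *src, __global float *dst, float alpha) {
    size_t i = get_global_id(0);
    dst[i] = alpha * load_val(src + i); // resolves to the half overload
}
```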
There are a lot of details to this question, and I will attempt to give a good summary here. To begin with, we currently use two dispatching models within primitive implementations. The first is based on hard-coded heuristics (example). The second uses a performance model and an implementation list, where the best-scoring implementation under the model is used. All of the OpenCL kernels currently rely on the hard-coded heuristics.

The biggest issue is that these dispatchers are fragile: any change (be it kernel, compiler, runtime, etc.) can cause the heuristics/model to become inaccurate. We have considered adding runtime tuning to avoid this, for example #1764, but concluded it is either too computationally expensive or requires a new API that cannot be used by customers. For the model-based implementations, we generally need a tool to fit the model to observed performance, as quickly and accurately predicting performance from the hardware SKU is virtually impossible. As such, updates are relatively easy to handle: we just need to retrain the model (although this does induce code churn on the kernel/model database). Hard-coded heuristics, on the other hand, do not scale and require developer intervention.

Coming back to your goal of sharing OpenCL implementations: to make such sharing feasible, we would need dispatching models that can be easily regenerated after implementation changes. This requires one of the following: switching the current OpenCL kernels to performance-based models, inventing a method to fit the (currently hard-coded) heuristics (at which point we are effectively using machine learning to optimize machine learning), or introducing a new methodology. In addition, all known methods likely require performance information from the target devices, yet model information is unavailable in the generic context, so a way to handle that is also required, be it a default model or somehow training a custom one. I expect this to be a lot of work; just forking the existing implementations would be easier. So I guess this reduces to the question of whether the improved code sharing is worth the effort.
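To make the two models concrete, here is a small illustrative C sketch; all names and types are invented, not oneDNN internals. The heuristic path returns a kernel directly from a hard-coded threshold, while the model-based path scores every candidate in an implementation list and takes the best:

```c
/* Illustrative sketch of the two dispatching models described above. */
#include <stddef.h>

typedef struct {
    size_t m, n, k; /* problem shape */
} desc_t;

typedef void (*kernel_fn)(const desc_t *);

/* Model 1: hard-coded heuristic. Cheap, but any kernel/compiler/runtime
 * change can silently invalidate the threshold. */
kernel_fn dispatch_heuristic(const desc_t *d, kernel_fn small_impl,
        kernel_fn large_impl) {
    return (d->m * d->n < 16384) ? small_impl : large_impl;
}

/* Model 2: score each candidate with a fitted performance model and pick
 * the best; retraining the model replaces hand-tuning when things change. */
typedef struct {
    kernel_fn impl;
    double (*predicted_perf)(const desc_t *); /* higher is better */
} candidate_t;

kernel_fn dispatch_model(const desc_t *d, const candidate_t *list, size_t count) {
    kernel_fn best = NULL;
    double best_score = -1.0;
    for (size_t i = 0; i < count; ++i) {
        double s = list[i].predicted_perf(d);
        if (s > best_score) {
            best_score = s;
            best = list[i].impl;
        }
    }
    return best;
}
```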
Reply:

Thank you for the background, that's very helpful and certainly gives me some things to ponder. But one brief thing I do want to touch on:

My sense is that there are likely multiple layers at which sharing might make sense. To draw an analogy from experience in Mesa, some classes of hardware commonly lack the same API features, so (for example) once one driver emulates geometry shaders via compute shaders, that emulation tends to get reused on other hardware with the same limitation. I think there is already some reusability in the parameterization of the existing CL kernels for, e.g., GRF size, which would map reasonably to other hardware, and there is likely some level of reuse that makes sense for automatic dispatch tuning, even if that is presently an open-ended research project.

But at a more basic level, the acts of enumerating devices, creating contexts and queues, fencing resources, and dispatching work do not fundamentally change across OpenCL device types. And, more importantly, OpenCL already provides interop APIs for managing those operations across devices and platforms. At least that much code sharing seems self-evidently worth the effort: even if different sets of CL devices end up with widely divergent kernels and schedulers, they will at least be able to communicate with each other.
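As a sketch of that vendor-agnostic baseline, using only the standard OpenCL host API (error handling abbreviated), platform/device enumeration and context/queue creation are the same calls no matter whose implementation is underneath:

```c
#include <CL/cl.h>
#include <stdio.h>

int main(void) {
    cl_platform_id plats[8];
    cl_uint nplat = 0;
    clGetPlatformIDs(8, plats, &nplat);
    if (nplat > 8) nplat = 8; /* nplat reports the total available */

    for (cl_uint i = 0; i < nplat; ++i) {
        cl_device_id dev;
        cl_uint ndev = 0;
        if (clGetDeviceIDs(plats[i], CL_DEVICE_TYPE_GPU, 1, &dev, &ndev)
                        != CL_SUCCESS
                || ndev == 0)
            continue;

        char name[256] = {0};
        clGetDeviceInfo(dev, CL_DEVICE_NAME, sizeof(name), name, NULL);
        printf("GPU device: %s\n", name);

        cl_int err = CL_SUCCESS;
        cl_context ctx = clCreateContext(NULL, 1, &dev, NULL, NULL, &err);
        cl_command_queue queue =
                clCreateCommandQueueWithProperties(ctx, dev, NULL, &err);

        /* ... build programs and enqueue kernels here ... */

        clReleaseCommandQueue(queue);
        clReleaseContext(ctx);
    }
    return 0;
}
```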
```diff
     set(DNNL_GPU_VENDOR "INTEL")
 endif()

-if(NOT "${DNNL_GPU_VENDOR}" MATCHES "^(NONE|INTEL|NVIDIA|AMD)$")
+if(NOT "${DNNL_GPU_VENDOR}" MATCHES "^(NONE|GENERIC|INTEL|NVIDIA|AMD)$")
     message(FATAL_ERROR "Unsupported GPU vendor: ${DNNL_GPU_VENDOR}")
 endif()
```
```diff
@@ -327,6 +328,12 @@ else()
     set(DNNL_WITH_SYCL false)
 endif()

+if(DNNL_GPU_RUNTIME STREQUAL "OCL") # ... OR DNNL_CPU_RUNTIME STREQUAL "OCL")
+    set(DNNL_WITH_OCL true)
+else()
+    set(DNNL_WITH_OCL false)
+endif()
+
 # =============
 # Miscellaneous
 # =============
```
Review comment (random spot):
Thank you for taking the effort to enable a generic vendor for the OpenCL runtime. When it comes to external contributions that impact some aspects of the library's architecture, a formal proposal (RFC) is required. Please make yourself familiar with the contribution guidelines and the RFC process.

While going through the changes in this pull request I found a few things that have to be addressed; your proposal is expected to cover them:
- The generic kernels should reside in `gpu/generic/ocl`: move them there and enable them for the Intel and Generic vendors.
- When `DNNL_GPU_VENDOR` is `GENERIC` and `DNNL_GPU_RUNTIME` is `OCL` there are no engines that could be used, therefore a new engine for the generic vendor and OpenCL runtime has to be introduced (see the sketch after this list).
- `gpu/intel` assumes that `DNNL_GPU_VENDOR` is `INTEL`. When `DNNL_GPU_VENDOR` is `INTEL` and `DNNL_GPU_RUNTIME` is `SYCL` the `xpu/ocl` code must be enabled (the new condition that you introduced seems to break the rule).
- The `DNNL_ENABLE_PRIMITIVE_GPU_ISA` option should be enabled (and defined) only for the Intel vendor and should be ignored otherwise.

As for enabling OpenCL kernels for CPU, we don't have any plans to do that and the library doesn't have any architecture to do that. If you are interested in that you may come up with an architecture design and publish a proposal.
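For context on the engine bullet above, a sketch of how user code ultimately reaches such an engine through oneDNN's existing public C API; the review's point is that a GENERIC+OCL build currently has no engine implementation behind this call:

```c
#include <dnnl.h>

int main(void) {
    dnnl_engine_t engine;
    /* Existing public entry point; a GENERIC+OCL build needs a GPU
     * engine implementation for this to succeed. */
    if (dnnl_engine_create(&engine, dnnl_gpu, /*index=*/0) != dnnl_success)
        return 1; /* no usable GPU engine in this build */

    dnnl_stream_t stream;
    dnnl_stream_create(&stream, engine, dnnl_stream_default_flags);

    /* ... create and execute primitives on (engine, stream) ... */

    dnnl_stream_destroy(stream);
    dnnl_engine_destroy(engine);
    return 0;
}
```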
And the last thing: we plan to enable the generic vendor for the SYCL runtime in the coming months. As an option, you could wait to see what that design looks like to get a better understanding of what it should look like for OpenCL.