From 7e08e76214d65434f0eadb26913da5858030a414 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Wed, 5 Jun 2024 12:39:55 -0700 Subject: [PATCH 01/10] Resolve register spills in single-wg radix_sort Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h index b46fb50c831..0054078b73c 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h @@ -806,8 +806,10 @@ __parallel_radix_sort(oneapi::dpl::__internal::__device_backend_tag, _ExecutionP else if (__n <= 8192 && __wg_size * 8 <= __max_wg_size) __event = __subgroup_radix_sort<_RadixSortKernel, __wg_size * 8, 16, __radix_bits, __is_ascending>{}( __exec.queue(), ::std::forward<_Range>(__in_rng), __proj); - else if (__n <= 16384 && __wg_size * 8 <= __max_wg_size) - __event = __subgroup_radix_sort<_RadixSortKernel, __wg_size * 8, 32, __radix_bits, __is_ascending>{}( + // SIMD16 subgroups are needed to avoid register spillage. In practice, SIMD16 is only available on iGPU + // hardware which can be detected by a SPIR-V target check. + else if ((__n <= 16384 && __wg_size * 8 <= __max_wg_size) && oneapi::dpl::__internal::__is_spirv_target_v) + __event = __subgroup_radix_sort<_RadixSortKernel, __wg_size * 16, 16, __radix_bits, __is_ascending>{}( __exec.queue(), ::std::forward<_Range>(__in_rng), __proj); else { From 4dd65206f3e3859aaaafdb7ad197cb62bcb9776c Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 11 Jun 2024 06:39:52 -0700 Subject: [PATCH 02/10] Switch to a runtime check for sub-group size in problematic case Signed-off-by: Matthew Michel --- .../hetero/dpcpp/parallel_backend_sycl_radix_sort.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h index 0054078b73c..00e43749e98 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h @@ -778,6 +778,8 @@ __parallel_radix_sort(oneapi::dpl::__internal::__device_backend_tag, _ExecutionP //TODO: 1.to reduce number of the kernels; 2.to define work group size in runtime, depending on number of elements constexpr auto __wg_size = 64; + auto __subgroup_sizes = __exec.queue().get_device().template get_info(); + bool __dev_has_sg16 = std::find(__subgroup_sizes.begin(), __subgroup_sizes.end(), static_cast(16)) != __subgroup_sizes.end(); //TODO: with _RadixSortKernel also the following a couple of compile time constants is used for unique kernel name using _RadixSortKernel = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; @@ -806,10 +808,10 @@ __parallel_radix_sort(oneapi::dpl::__internal::__device_backend_tag, _ExecutionP else if (__n <= 8192 && __wg_size * 8 <= __max_wg_size) __event = __subgroup_radix_sort<_RadixSortKernel, __wg_size * 8, 16, __radix_bits, __is_ascending>{}( __exec.queue(), ::std::forward<_Range>(__in_rng), __proj); - // SIMD16 subgroups are needed to avoid register spillage. In practice, SIMD16 is only available on iGPU - // hardware which can be detected by a SPIR-V target check. - else if ((__n <= 16384 && __wg_size * 8 <= __max_wg_size) && oneapi::dpl::__internal::__is_spirv_target_v) - __event = __subgroup_radix_sort<_RadixSortKernel, __wg_size * 16, 16, __radix_bits, __is_ascending>{}( + // Size 16 subgroups are needed to avoid register spillage on most hardware. Skip this branch if not supported + // to avoid runtime exceptions. + else if (__n <= 16384 && __wg_size * 8 <= __max_wg_size && __dev_has_sg16) + __event = __subgroup_radix_sort<_RadixSortKernel, __wg_size * 8, 32, __radix_bits, __is_ascending>{}( __exec.queue(), ::std::forward<_Range>(__in_rng), __proj); else { From b0fe6c3cec8db94f37b4d177d1e3da6b175a8968 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 11 Jun 2024 06:43:06 -0700 Subject: [PATCH 03/10] Add missing include Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h index fbf80582d43..54c61bfb2c3 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h @@ -17,6 +17,7 @@ #define _ONEDPL_parallel_backend_sycl_radix_sort_one_wg_H #include "sycl_traits.h" //SYCL traits specialization for some oneDPL types. +#include //The file is an internal file and the code of that file is included by a major file into the following namespaces: //namespace oneapi From dc9ec48f9bfb503c8019263c810f62a1bfa65873 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 11 Jun 2024 06:44:28 -0700 Subject: [PATCH 04/10] Correct include location Signed-off-by: Matthew Michel --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h | 1 + .../pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h index 00e43749e98..be1c7a9b818 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h @@ -20,6 +20,7 @@ #include #include #include +#include #include "sycl_defs.h" #include "parallel_backend_sycl_utils.h" diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h index 54c61bfb2c3..fbf80582d43 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h @@ -17,7 +17,6 @@ #define _ONEDPL_parallel_backend_sycl_radix_sort_one_wg_H #include "sycl_traits.h" //SYCL traits specialization for some oneDPL types. -#include //The file is an internal file and the code of that file is included by a major file into the following namespaces: //namespace oneapi From d10fa173850431483efa80097e5334a4295c780f Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 11 Jun 2024 08:49:55 -0500 Subject: [PATCH 05/10] Clang format Signed-off-by: Matthew Michel --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h index be1c7a9b818..b19f0003103 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h @@ -780,7 +780,8 @@ __parallel_radix_sort(oneapi::dpl::__internal::__device_backend_tag, _ExecutionP //TODO: 1.to reduce number of the kernels; 2.to define work group size in runtime, depending on number of elements constexpr auto __wg_size = 64; auto __subgroup_sizes = __exec.queue().get_device().template get_info(); - bool __dev_has_sg16 = std::find(__subgroup_sizes.begin(), __subgroup_sizes.end(), static_cast(16)) != __subgroup_sizes.end(); + bool __dev_has_sg16 = std::find(__subgroup_sizes.begin(), __subgroup_sizes.end(), static_cast(16)) != + __subgroup_sizes.end(); //TODO: with _RadixSortKernel also the following a couple of compile time constants is used for unique kernel name using _RadixSortKernel = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; From bf9dc3258889557267bf5dbd865321109749ea8a Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Mon, 17 Jun 2024 12:34:24 -0500 Subject: [PATCH 06/10] Add clarifying comment on sub-group size check Signed-off-by: Matthew Michel --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h index b19f0003103..f067e858406 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h @@ -810,8 +810,9 @@ __parallel_radix_sort(oneapi::dpl::__internal::__device_backend_tag, _ExecutionP else if (__n <= 8192 && __wg_size * 8 <= __max_wg_size) __event = __subgroup_radix_sort<_RadixSortKernel, __wg_size * 8, 16, __radix_bits, __is_ascending>{}( __exec.queue(), ::std::forward<_Range>(__in_rng), __proj); - // Size 16 subgroups are needed to avoid register spillage on most hardware. Skip this branch if not supported - // to avoid runtime exceptions. + // In __subgroup_radix_sort, we request a sub-group size via _ONEDPL_SYCL_REQD_SUB_GROUP_SIZE_IF_SUPPORTED + // based upon the iters per item. For the below case, register spills that result in runtime exceptions have + // been observed on accelerators that do not support the requested sub-group size of 16. else if (__n <= 16384 && __wg_size * 8 <= __max_wg_size && __dev_has_sg16) __event = __subgroup_radix_sort<_RadixSortKernel, __wg_size * 8, 32, __radix_bits, __is_ascending>{}( __exec.queue(), ::std::forward<_Range>(__in_rng), __proj); From 34e633a2e0108afb490f7b66c2a4604758591c0c Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Mon, 17 Jun 2024 14:52:43 -0700 Subject: [PATCH 07/10] Add second check to avoid register spills Signed-off-by: Matthew Michel --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h index f067e858406..8e27b8d47e8 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h @@ -807,7 +807,7 @@ __parallel_radix_sort(oneapi::dpl::__internal::__device_backend_tag, _ExecutionP else if (__n <= 4096 && __wg_size * 4 <= __max_wg_size) __event = __subgroup_radix_sort<_RadixSortKernel, __wg_size * 4, 16, __radix_bits, __is_ascending>{}( __exec.queue(), ::std::forward<_Range>(__in_rng), __proj); - else if (__n <= 8192 && __wg_size * 8 <= __max_wg_size) + else if (__n <= 8192 && __wg_size * 8 <= __max_wg_size && __dev_has_sg16) __event = __subgroup_radix_sort<_RadixSortKernel, __wg_size * 8, 16, __radix_bits, __is_ascending>{}( __exec.queue(), ::std::forward<_Range>(__in_rng), __proj); // In __subgroup_radix_sort, we request a sub-group size via _ONEDPL_SYCL_REQD_SUB_GROUP_SIZE_IF_SUPPORTED From c72cb6e8819e2d734ccb93063ec1c21e954c84c5 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Mon, 17 Jun 2024 14:58:02 -0700 Subject: [PATCH 08/10] Move comment to before the first subgroup size check Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h index 8e27b8d47e8..32597de4673 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h @@ -807,12 +807,12 @@ __parallel_radix_sort(oneapi::dpl::__internal::__device_backend_tag, _ExecutionP else if (__n <= 4096 && __wg_size * 4 <= __max_wg_size) __event = __subgroup_radix_sort<_RadixSortKernel, __wg_size * 4, 16, __radix_bits, __is_ascending>{}( __exec.queue(), ::std::forward<_Range>(__in_rng), __proj); - else if (__n <= 8192 && __wg_size * 8 <= __max_wg_size && __dev_has_sg16) - __event = __subgroup_radix_sort<_RadixSortKernel, __wg_size * 8, 16, __radix_bits, __is_ascending>{}( - __exec.queue(), ::std::forward<_Range>(__in_rng), __proj); // In __subgroup_radix_sort, we request a sub-group size via _ONEDPL_SYCL_REQD_SUB_GROUP_SIZE_IF_SUPPORTED // based upon the iters per item. For the below case, register spills that result in runtime exceptions have // been observed on accelerators that do not support the requested sub-group size of 16. + else if (__n <= 8192 && __wg_size * 8 <= __max_wg_size && __dev_has_sg16) + __event = __subgroup_radix_sort<_RadixSortKernel, __wg_size * 8, 16, __radix_bits, __is_ascending>{}( + __exec.queue(), ::std::forward<_Range>(__in_rng), __proj); else if (__n <= 16384 && __wg_size * 8 <= __max_wg_size && __dev_has_sg16) __event = __subgroup_radix_sort<_RadixSortKernel, __wg_size * 8, 32, __radix_bits, __is_ascending>{}( __exec.queue(), ::std::forward<_Range>(__in_rng), __proj); From cee1a50caf67d5a299f09dc76aa3785a8d117603 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 18 Jun 2024 08:59:42 -0500 Subject: [PATCH 09/10] Make variables const where appropriate Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h index 32597de4673..03568c60230 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h @@ -779,9 +779,9 @@ __parallel_radix_sort(oneapi::dpl::__internal::__device_backend_tag, _ExecutionP //TODO: 1.to reduce number of the kernels; 2.to define work group size in runtime, depending on number of elements constexpr auto __wg_size = 64; - auto __subgroup_sizes = __exec.queue().get_device().template get_info(); - bool __dev_has_sg16 = std::find(__subgroup_sizes.begin(), __subgroup_sizes.end(), static_cast(16)) != - __subgroup_sizes.end(); + const auto __subgroup_sizes = __exec.queue().get_device().template get_info(); + const bool __dev_has_sg16 = std::find(__subgroup_sizes.begin(), __subgroup_sizes.end(), + static_cast(16)) != __subgroup_sizes.end(); //TODO: with _RadixSortKernel also the following a couple of compile time constants is used for unique kernel name using _RadixSortKernel = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; From a489b24e2d13fa481135e61e3d0bc764522159a6 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 18 Jun 2024 09:39:57 -0500 Subject: [PATCH 10/10] Clarify comment on sub-group sizes and register spills Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h index 03568c60230..b625d3ffd18 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h @@ -808,8 +808,10 @@ __parallel_radix_sort(oneapi::dpl::__internal::__device_backend_tag, _ExecutionP __event = __subgroup_radix_sort<_RadixSortKernel, __wg_size * 4, 16, __radix_bits, __is_ascending>{}( __exec.queue(), ::std::forward<_Range>(__in_rng), __proj); // In __subgroup_radix_sort, we request a sub-group size via _ONEDPL_SYCL_REQD_SUB_GROUP_SIZE_IF_SUPPORTED - // based upon the iters per item. For the below case, register spills that result in runtime exceptions have - // been observed on accelerators that do not support the requested sub-group size of 16. + // based upon the iters per item. For the below cases, register spills that result in runtime exceptions have + // been observed on accelerators that do not support the requested sub-group size of 16. For the above cases + // that request but may not receive a sub-group size of 16, inputs are small enough to avoid register + // spills on assessed hardware. else if (__n <= 8192 && __wg_size * 8 <= __max_wg_size && __dev_has_sg16) __event = __subgroup_radix_sort<_RadixSortKernel, __wg_size * 8, 16, __radix_bits, __is_ascending>{}( __exec.queue(), ::std::forward<_Range>(__in_rng), __proj);