From b35f42ac6d71c416379a34abf941f42f77d15752 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 6 Aug 2024 13:36:58 +0200 Subject: [PATCH 01/40] Implementation of __parallel_find_or_nd_range_tuner Signed-off-by: Sergey Kopienko --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 54 +++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 0e03341ec02..bb5d08a9080 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1134,6 +1134,60 @@ struct __early_exit_find_or // parallel_find_or - sync pattern //------------------------------------------------------------------------ +template +struct __parallel_find_or_nd_range_tuner +{ + // Calculate the number of work groups. + template + std::size_t + operator()(_ExecutionPolicy&& /*__exec*/, const std::size_t /*__rng_n*/, std::size_t __n_groups, + std::size_t /*__wgroup_size*/) const + { + return __n_groups; + } +}; + +using __parallel_find_or_nd_range_tuner_none = __parallel_find_or_nd_range_tuner; + +// No tuning for FPGA_EMU because we are not going to tune here the performance for FPGA emulation. +#if !_ONEDPL_FPGA_EMU +template <> +struct __parallel_find_or_nd_range_tuner +{ + static constexpr std::size_t __base_rng_n = 4096; + + // Calculate the number of work groups. + template + std::size_t + operator()(_ExecutionPolicy&& /*__exec*/, const std::size_t __rng_n, std::size_t __n_groups, + std::size_t __wgroup_size) const + { + assert(__rng_n > 0); + + if (__n_groups > 1 && __rng_n >= __base_rng_n) + { + // Empirically found formula for typical devices. + const auto __rng_x = __rng_n / __base_rng_n; + const auto __required_iters_per_work_item = std::max(std::sqrt(__rng_x), 1.); + + auto __iters_per_work_item = + oneapi::dpl::__internal::__dpl_ceiling_div(__rng_n, __n_groups * __wgroup_size); + + // We halve the number of work-groups until the number of iterations per work-item + // is greater than or equal to the desired number of iterations per work-item. + while (__iters_per_work_item < __required_iters_per_work_item && __n_groups > 1) + { + __n_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__n_groups, 2); + __iters_per_work_item = + oneapi::dpl::__internal::__dpl_ceiling_div(__rng_n, __n_groups * __wgroup_size); + } + } + + return __n_groups; + } +}; +#endif // !_ONEDPL_FPGA_EMU + // Base pattern for __parallel_or and __parallel_find. The execution depends on tag type _BrickTag. template From 387dedd4c8084340becd37d6c1b6774c95439179 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 6 Aug 2024 13:37:28 +0200 Subject: [PATCH 02/40] Using _GroupsTuner in __parallel_find_or Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index bb5d08a9080..8b479ce5233 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1310,12 +1310,12 @@ __parallel_find_or_impl_multiple_wgs(oneapi::dpl::__internal::__device_backend_t } // Base pattern for __parallel_or and __parallel_find. The execution depends on tag type _BrickTag. -template +template ::std::conditional_t< ::std::is_same_v<_BrickTag, __parallel_or_tag>, bool, oneapi::dpl::__internal::__difference_t::type>> __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Brick __f, - _BrickTag __brick_tag, _Ranges&&... __rngs) + _BrickTag __brick_tag, const _GroupsTuner& __n_groups_tuner, _Ranges&&... __rngs) { using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; using _FindOrKernelOneWG = @@ -1344,6 +1344,9 @@ __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPoli __n_groups = 1; } + // Tune __n_groups count + __n_groups = __n_groups_tuner(__exec, __rng_n, __n_groups, __wgroup_size); + _PRINT_INFO_IN_DEBUG_MODE(__exec, __wgroup_size, __max_cu); using _AtomicType = typename _BrickTag::_AtomicType; From f70dca7543f5ef98c42b327fab25c154130ad162 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 6 Aug 2024 15:00:01 +0200 Subject: [PATCH 03/40] include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h - __pattern_adjacent_find : __parallel_find_or_nd_range_tuner Signed-off-by: Sergey Kopienko --- include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h index 3bdc187ce7f..bceb4341074 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h @@ -563,6 +563,7 @@ __pattern_adjacent_find(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _I bool result = __par_backend_hetero::__parallel_find_or( _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), _Predicate{adjacent_find_fn<_BinaryPredicate>{__predicate}}, __par_backend_hetero::__parallel_or_tag{}, + __par_backend_hetero::__parallel_find_or_nd_range_tuner<_BackendTag>{}, oneapi::dpl::__ranges::make_zip_view(__buf1.all_view(), __buf2.all_view())); // inverted conditional because of From 245d9e5b2f96a6e7587f18dc1a4715359a917eaa Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 6 Aug 2024 13:40:35 +0200 Subject: [PATCH 04/40] include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h - __pattern_any_of : __parallel_find_or_nd_range_tuner_none --- include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h index bceb4341074..899f37ae3a5 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h @@ -652,7 +652,9 @@ __pattern_any_of(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterator _BackendTag{}, __par_backend_hetero::make_wrapped_policy<__par_backend_hetero::__or_policy_wrapper>( ::std::forward<_ExecutionPolicy>(__exec)), - _Predicate{__pred}, __par_backend_hetero::__parallel_or_tag{}, __buf.all_view()); + _Predicate{__pred}, __par_backend_hetero::__parallel_or_tag{}, + __par_backend_hetero::__parallel_find_or_nd_range_tuner_none{}, + __buf.all_view()); } //------------------------------------------------------------------------ From ecda02b936282606fdb9de2006f31e9afab8c57c Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 6 Aug 2024 15:01:35 +0200 Subject: [PATCH 05/40] include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h - __pattern_equal : __parallel_find_or_nd_range_tuner_none Signed-off-by: Sergey Kopienko --- include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h index 899f37ae3a5..0bffef63473 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h @@ -681,6 +681,7 @@ __pattern_equal(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterator1 return !__par_backend_hetero::__parallel_find_or( _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), _Predicate{equal_predicate<_Pred>{__pred}}, __par_backend_hetero::__parallel_or_tag{}, + __par_backend_hetero::__parallel_find_or_nd_range_tuner_none{}, oneapi::dpl::__ranges::make_zip_view(__buf1.all_view(), __buf2.all_view())); } From deef5313caeb258ea986178a24536aec7a9e516a Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 6 Aug 2024 13:44:12 +0200 Subject: [PATCH 06/40] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h - __parallel_or : __parallel_find_or_nd_range_tuner Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 8b479ce5233..723d2fdd1a9 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1404,7 +1404,9 @@ __parallel_or(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _Exec return oneapi::dpl::__par_backend_hetero::__parallel_find_or( __backend_tag, __par_backend_hetero::make_wrapped_policy<__or_policy_wrapper>(::std::forward<_ExecutionPolicy>(__exec)), __f, - __parallel_or_tag{}, __buf.all_view(), __s_buf.all_view()); + __parallel_or_tag{}, + __par_backend_hetero::__parallel_find_or_nd_range_tuner{}, + __buf.all_view(), __s_buf.all_view()); } // Special overload for single sequence cases. @@ -1421,7 +1423,9 @@ __parallel_or(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _Exec return oneapi::dpl::__par_backend_hetero::__parallel_find_or( __backend_tag, __par_backend_hetero::make_wrapped_policy<__or_policy_wrapper>(::std::forward<_ExecutionPolicy>(__exec)), __f, - __parallel_or_tag{}, __buf.all_view()); + __parallel_or_tag{}, + __par_backend_hetero::__parallel_find_or_nd_range_tuner{}, + __buf.all_view()); } //------------------------------------------------------------------------ From 37b7d08221e0423899a060b40334e88afb3535bb Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 6 Aug 2024 13:45:20 +0200 Subject: [PATCH 07/40] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h - __parallel_find : __parallel_find_or_nd_range_tuner Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 723d2fdd1a9..e3d80133b94 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1453,7 +1453,9 @@ __parallel_find(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _Ex __backend_tag, __par_backend_hetero::make_wrapped_policy<__find_policy_wrapper>( ::std::forward<_ExecutionPolicy>(__exec)), - __f, _TagType{}, __buf.all_view(), __s_buf.all_view()); + __f, _TagType{}, + __par_backend_hetero::__parallel_find_or_nd_range_tuner{}, + __buf.all_view(), __s_buf.all_view()); } // Special overload for single sequence cases. @@ -1473,7 +1475,9 @@ __parallel_find(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _Ex __backend_tag, __par_backend_hetero::make_wrapped_policy<__find_policy_wrapper>( ::std::forward<_ExecutionPolicy>(__exec)), - __f, _TagType{}, __buf.all_view()); + __f, _TagType{}, + __par_backend_hetero::__parallel_find_or_nd_range_tuner{}, + __buf.all_view()); } //------------------------------------------------------------------------ From b675f23fd4a0f43b82be76c41fab4bb8f751e970 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 6 Aug 2024 15:00:29 +0200 Subject: [PATCH 08/40] include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h - __pattern_adjacent_find : __parallel_find_or_nd_range_tuner Signed-off-by: Sergey Kopienko --- include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index dd4dd25c7e3..9e7a861aa9c 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -299,6 +299,7 @@ __pattern_adjacent_find(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _R auto result = oneapi::dpl::__par_backend_hetero::__parallel_find_or( _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), _Predicate{adjacent_find_fn<_BinaryPredicate>{__predicate}}, _TagType{}, + oneapi::dpl::__par_backend_hetero::__parallel_find_or_nd_range_tuner<_BackendTag>{}, oneapi::dpl::__ranges::zip_view(__rng1, __rng2)); // inverted conditional because of From 4a1282c3cee6f6623d2ee1fde4f66b19ec06e86a Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 6 Aug 2024 13:43:08 +0200 Subject: [PATCH 09/40] include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h - __pattern_any_of : __parallel_find_or_nd_range_tuner Signed-off-by: Sergey Kopienko --- include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index 9e7a861aa9c..346798e3f96 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -199,7 +199,9 @@ __pattern_any_of(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range&& _BackendTag{}, __par_backend_hetero::make_wrapped_policy( ::std::forward<_ExecutionPolicy>(__exec)), - _Predicate{__pred}, oneapi::dpl::__par_backend_hetero::__parallel_or_tag{}, ::std::forward<_Range>(__rng)); + _Predicate{__pred}, oneapi::dpl::__par_backend_hetero::__parallel_or_tag{}, + oneapi::dpl::__par_backend_hetero::__parallel_find_or_nd_range_tuner_none{}, + ::std::forward<_Range>(__rng)); } //------------------------------------------------------------------------ From c2ab99e97d7ea1868b7a3d6c884441bb519d0881 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 6 Aug 2024 15:02:04 +0200 Subject: [PATCH 10/40] include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h - __pattern_equal : __parallel_find_or_nd_range_tuner_none Signed-off-by: Sergey Kopienko --- include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index 346798e3f96..d92bc9974ff 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -104,6 +104,7 @@ __pattern_equal(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range1&& return !oneapi::dpl::__par_backend_hetero::__parallel_find_or( _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), _Predicate{equal_predicate<_Pred>{__pred}}, oneapi::dpl::__par_backend_hetero::__parallel_or_tag{}, + oneapi::dpl::__par_backend_hetero::__parallel_find_or_nd_range_tuner_none{}, oneapi::dpl::__ranges::zip_view(::std::forward<_Range1>(__rng1), ::std::forward<_Range2>(__rng2))); } From 00e2a7b342f0d59f99c809567e6accacd4d2190c Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 6 Aug 2024 13:42:09 +0200 Subject: [PATCH 11/40] include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h - __pattern_find_if : __parallel_find_or_nd_range_tuner Signed-off-by: Sergey Kopienko --- include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index d92bc9974ff..2f768216264 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -127,7 +127,9 @@ __pattern_find_if(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range&& _BackendTag{}, __par_backend_hetero::make_wrapped_policy<__par_backend_hetero::__find_policy_wrapper>( ::std::forward<_ExecutionPolicy>(__exec)), - _Predicate{__pred}, _TagType{}, ::std::forward<_Range>(__rng)); + _Predicate{__pred}, _TagType{}, + oneapi::dpl::__par_backend_hetero::__parallel_find_or_nd_range_tuner<_BackendTag>{}, + ::std::forward<_Range>(__rng)); } //------------------------------------------------------------------------ From 6ee3176eedadb1163b7aa4ed2b9245e57512f93f Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 6 Aug 2024 13:42:51 +0200 Subject: [PATCH 12/40] include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h - __pattern_find_first_of : __parallel_find_or_nd_range_tuner --- include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index 2f768216264..91aa27e5528 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -183,7 +183,9 @@ __pattern_find_first_of(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _R _BackendTag{}, __par_backend_hetero::make_wrapped_policy<__par_backend_hetero::__find_policy_wrapper>( ::std::forward<_ExecutionPolicy>(__exec)), - _Predicate{__pred}, _TagType{}, ::std::forward<_Range1>(__rng1), ::std::forward<_Range2>(__rng2)); + _Predicate{__pred}, _TagType{}, + oneapi::dpl::__par_backend_hetero::__parallel_find_or_nd_range_tuner<_BackendTag>{}, + ::std::forward<_Range1>(__rng1), ::std::forward<_Range2>(__rng2)); } //------------------------------------------------------------------------ From 9222e4d38d49c5f626d1a4382bde8272737e5fdd Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 6 Aug 2024 13:42:29 +0200 Subject: [PATCH 13/40] include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h - __pattern_find_end : __parallel_find_or_nd_range_tuner Signed-off-by: Sergey Kopienko --- include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index 91aa27e5528..88d976ff417 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -159,7 +159,9 @@ __pattern_find_end(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _ _BackendTag{}, __par_backend_hetero::make_wrapped_policy<__par_backend_hetero::__find_policy_wrapper>( ::std::forward<_ExecutionPolicy>(__exec)), - _Predicate{__pred}, _TagType{}, ::std::forward<_Range1>(__rng1), ::std::forward<_Range2>(__rng2)); + _Predicate{__pred}, _TagType{}, + oneapi::dpl::__par_backend_hetero::__parallel_find_or_nd_range_tuner<_BackendTag>{}, + ::std::forward<_Range1>(__rng1), ::std::forward<_Range2>(__rng2)); } //------------------------------------------------------------------------ From c29ac42800ed1128e3decc3d64581f4edfb45261 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 6 Aug 2024 13:43:39 +0200 Subject: [PATCH 14/40] include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h - __pattern_search : __parallel_find_or_nd_range_tuner Signed-off-by: Sergey Kopienko --- include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index 88d976ff417..a3e4196b7da 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -246,7 +246,9 @@ __pattern_search(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _Ra _BackendTag{}, oneapi::dpl::__par_backend_hetero::make_wrapped_policy< oneapi::dpl::__par_backend_hetero::__find_policy_wrapper>(::std::forward<_ExecutionPolicy>(__exec)), - _Predicate{__pred}, _TagType{}, ::std::forward<_Range1>(__rng1), ::std::forward<_Range2>(__rng2)); + _Predicate{__pred}, _TagType{}, + oneapi::dpl::__par_backend_hetero::__parallel_find_or_nd_range_tuner<_BackendTag>{}, + ::std::forward<_Range1>(__rng1), ::std::forward<_Range2>(__rng2)); } //------------------------------------------------------------------------ From 8a16fad145b06e9b423c1f3cd08aadb0b0d12581 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 8 Aug 2024 11:24:44 +0200 Subject: [PATCH 15/40] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h - fix error in __print_device_debug_info : policy has been passed by value Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index de7e8fa1796..4606709bd45 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -298,7 +298,7 @@ class __kernel_compiler template inline void // Passing policy by value should be enough for debugging -__print_device_debug_info(_Policy __policy, size_t __wg_size = 0, size_t __max_cu = 0) +__print_device_debug_info(const _Policy& __policy, size_t __wg_size = 0, size_t __max_cu = 0) { ::std::cout << "Device info" << ::std::endl; ::std::cout << " > device name: " << oneapi::dpl::__internal::__device_info(__policy) << ::std::endl; @@ -309,7 +309,8 @@ __print_device_debug_info(_Policy __policy, size_t __wg_size = 0, size_t __max_c } #else template -inline void __print_device_debug_info(_Policy, size_t = 0, size_t = 0) +inline void +__print_device_debug_info(const _Policy& __policy, size_t = 0, size_t = 0) { } #endif From 7d0f39ed0ab6c8e476ac5a7df579dd2b8ac7b362 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 8 Aug 2024 11:29:23 +0200 Subject: [PATCH 16/40] Fix review comment Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/algorithm_impl_hetero.h | 4 +- .../hetero/algorithm_ranges_impl_hetero.h | 4 +- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 99 ++++++++++--------- 3 files changed, 59 insertions(+), 48 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h index 0bffef63473..06293309d9a 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h @@ -653,7 +653,7 @@ __pattern_any_of(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterator __par_backend_hetero::make_wrapped_policy<__par_backend_hetero::__or_policy_wrapper>( ::std::forward<_ExecutionPolicy>(__exec)), _Predicate{__pred}, __par_backend_hetero::__parallel_or_tag{}, - __par_backend_hetero::__parallel_find_or_nd_range_tuner_none{}, + __par_backend_hetero::__parallel_find_or_nd_range_tuner_common{}, __buf.all_view()); } @@ -681,7 +681,7 @@ __pattern_equal(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterator1 return !__par_backend_hetero::__parallel_find_or( _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), _Predicate{equal_predicate<_Pred>{__pred}}, __par_backend_hetero::__parallel_or_tag{}, - __par_backend_hetero::__parallel_find_or_nd_range_tuner_none{}, + __par_backend_hetero::__parallel_find_or_nd_range_tuner_common{}, oneapi::dpl::__ranges::make_zip_view(__buf1.all_view(), __buf2.all_view())); } diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index a3e4196b7da..399cc888ea9 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -104,7 +104,7 @@ __pattern_equal(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range1&& return !oneapi::dpl::__par_backend_hetero::__parallel_find_or( _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), _Predicate{equal_predicate<_Pred>{__pred}}, oneapi::dpl::__par_backend_hetero::__parallel_or_tag{}, - oneapi::dpl::__par_backend_hetero::__parallel_find_or_nd_range_tuner_none{}, + oneapi::dpl::__par_backend_hetero::__parallel_find_or_nd_range_tuner_common{}, oneapi::dpl::__ranges::zip_view(::std::forward<_Range1>(__rng1), ::std::forward<_Range2>(__rng2))); } @@ -207,7 +207,7 @@ __pattern_any_of(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range&& __par_backend_hetero::make_wrapped_policy( ::std::forward<_ExecutionPolicy>(__exec)), _Predicate{__pred}, oneapi::dpl::__par_backend_hetero::__parallel_or_tag{}, - oneapi::dpl::__par_backend_hetero::__parallel_find_or_nd_range_tuner_none{}, + oneapi::dpl::__par_backend_hetero::__parallel_find_or_nd_range_tuner_common{}, ::std::forward<_Range>(__rng)); } diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index e3d80133b94..52b86cbf41a 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1137,17 +1137,34 @@ struct __early_exit_find_or template struct __parallel_find_or_nd_range_tuner { - // Calculate the number of work groups. + // Tune the amount of work-groups and work-group size template - std::size_t - operator()(_ExecutionPolicy&& /*__exec*/, const std::size_t /*__rng_n*/, std::size_t __n_groups, - std::size_t /*__wgroup_size*/) const + std::tuple + operator()(const _ExecutionPolicy& __exec, const std::size_t __rng_n) const { - return __n_groups; + // TODO: find a way to generalize getting of reliable work-group size + // Limit the work-group size to prevent large sizes on CPUs. Empirically found value. + // This value exceeds the current practical limit for GPUs, but may need to be re-evaluated in the future. + std::size_t __wgroup_size = oneapi::dpl::__internal::__max_work_group_size(__exec, (std::size_t)4096); + + const auto __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec); + std::size_t __n_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__rng_n, __wgroup_size); + __n_groups = ::std::min(__n_groups, decltype(__n_groups)(__max_cu)); + + // Pass all small data into single WG implementation + constexpr std::size_t __max_iters_per_work_item = 32; + if (__rng_n <= __wgroup_size * __max_iters_per_work_item) + { + __n_groups = 1; + } + + _PRINT_INFO_IN_DEBUG_MODE(__exec, __wgroup_size, __max_cu); + + return {__n_groups, __wgroup_size}; } }; -using __parallel_find_or_nd_range_tuner_none = __parallel_find_or_nd_range_tuner; +using __parallel_find_or_nd_range_tuner_common = __parallel_find_or_nd_range_tuner; // No tuning for FPGA_EMU because we are not going to tune here the performance for FPGA emulation. #if !_ONEDPL_FPGA_EMU @@ -1156,34 +1173,44 @@ struct __parallel_find_or_nd_range_tuner - std::size_t - operator()(_ExecutionPolicy&& /*__exec*/, const std::size_t __rng_n, std::size_t __n_groups, - std::size_t __wgroup_size) const + std::tuple + operator()(const _ExecutionPolicy& __exec, const std::size_t __rng_n) const { - assert(__rng_n > 0); + // Call common tuning function to get the work-group size + auto __nd_range_params = __parallel_find_or_nd_range_tuner_common{}(__exec, __rng_n); - if (__n_groups > 1 && __rng_n >= __base_rng_n) + if (__rng_n >= __base_rng_n) { - // Empirically found formula for typical devices. - const auto __rng_x = __rng_n / __base_rng_n; - const auto __required_iters_per_work_item = std::max(std::sqrt(__rng_x), 1.); + auto __n_groups = std::get<0>(__nd_range_params); + if (__n_groups > 1) + { + auto __wgroup_size = std::get<1>(__nd_range_params); - auto __iters_per_work_item = - oneapi::dpl::__internal::__dpl_ceiling_div(__rng_n, __n_groups * __wgroup_size); + // Empirically found formula for typical devices. + const auto __rng_x = __rng_n / __base_rng_n; + const auto __required_iters_per_work_item = std::max(std::sqrt(__rng_x), 1.); - // We halve the number of work-groups until the number of iterations per work-item - // is greater than or equal to the desired number of iterations per work-item. - while (__iters_per_work_item < __required_iters_per_work_item && __n_groups > 1) - { - __n_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__n_groups, 2); - __iters_per_work_item = + auto __iters_per_work_item = oneapi::dpl::__internal::__dpl_ceiling_div(__rng_n, __n_groups * __wgroup_size); + + // We halve the number of work-groups until the number of iterations per work-item + // is greater than or equal to the desired number of iterations per work-item. + while (__iters_per_work_item < __required_iters_per_work_item && __n_groups > 1) + { + __n_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__n_groups, 2); + __iters_per_work_item = + oneapi::dpl::__internal::__dpl_ceiling_div(__rng_n, __n_groups * __wgroup_size); + } + + _PRINT_INFO_IN_DEBUG_MODE(__exec, __wgroup_size); + + __nd_range_params = {__n_groups, __wgroup_size}; } } - return __n_groups; + return __nd_range_params; } }; #endif // !_ONEDPL_FPGA_EMU @@ -1328,26 +1355,10 @@ __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPoli auto __rng_n = oneapi::dpl::__ranges::__get_first_range_size(__rngs...); assert(__rng_n > 0); - // TODO: find a way to generalize getting of reliable work-group size - // Limit the work-group size to prevent large sizes on CPUs. Empirically found value. - // This value exceeds the current practical limit for GPUs, but may need to be re-evaluated in the future. - std::size_t __wgroup_size = oneapi::dpl::__internal::__max_work_group_size(__exec, (std::size_t)4096); - - const auto __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec); - auto __n_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__rng_n, __wgroup_size); - __n_groups = ::std::min(__n_groups, decltype(__n_groups)(__max_cu)); - - // Pass all small data into single WG implementation - constexpr std::size_t __max_iters_per_work_item = 32; - if (__rng_n <= __wgroup_size * __max_iters_per_work_item) - { - __n_groups = 1; - } - - // Tune __n_groups count - __n_groups = __n_groups_tuner(__exec, __rng_n, __n_groups, __wgroup_size); - - _PRINT_INFO_IN_DEBUG_MODE(__exec, __wgroup_size, __max_cu); + // Evaluate the amount of work-groups and work-group size + const auto __nd_range_params = __n_groups_tuner(__exec, __rng_n); + const auto __n_groups = std::get<0>(__nd_range_params); + const auto __wgroup_size = std::get<1>(__nd_range_params); using _AtomicType = typename _BrickTag::_AtomicType; const _AtomicType __init_value = _BrickTag::__init_value(__rng_n); From 7f8453af1c37297799ffc1d363de3c1dafb6fff5 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 8 Aug 2024 11:32:59 +0200 Subject: [PATCH 17/40] Restore usage of the common tuner in all algorithms Signed-off-by: Sergey Kopienko --- include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h | 4 ++-- .../oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h | 4 ++-- include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 5 +++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h index 06293309d9a..f708afb3c48 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h @@ -653,7 +653,7 @@ __pattern_any_of(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterator __par_backend_hetero::make_wrapped_policy<__par_backend_hetero::__or_policy_wrapper>( ::std::forward<_ExecutionPolicy>(__exec)), _Predicate{__pred}, __par_backend_hetero::__parallel_or_tag{}, - __par_backend_hetero::__parallel_find_or_nd_range_tuner_common{}, + __par_backend_hetero::__parallel_find_or_nd_range_tuner<_BackendTag>{}, __buf.all_view()); } @@ -681,7 +681,7 @@ __pattern_equal(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterator1 return !__par_backend_hetero::__parallel_find_or( _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), _Predicate{equal_predicate<_Pred>{__pred}}, __par_backend_hetero::__parallel_or_tag{}, - __par_backend_hetero::__parallel_find_or_nd_range_tuner_common{}, + __par_backend_hetero::__parallel_find_or_nd_range_tuner<_BackendTag>{}, oneapi::dpl::__ranges::make_zip_view(__buf1.all_view(), __buf2.all_view())); } diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index 399cc888ea9..c54bb25790e 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -104,7 +104,7 @@ __pattern_equal(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range1&& return !oneapi::dpl::__par_backend_hetero::__parallel_find_or( _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), _Predicate{equal_predicate<_Pred>{__pred}}, oneapi::dpl::__par_backend_hetero::__parallel_or_tag{}, - oneapi::dpl::__par_backend_hetero::__parallel_find_or_nd_range_tuner_common{}, + oneapi::dpl::__par_backend_hetero::__parallel_find_or_nd_range_tuner<_BackendTag>{}, oneapi::dpl::__ranges::zip_view(::std::forward<_Range1>(__rng1), ::std::forward<_Range2>(__rng2))); } @@ -207,7 +207,7 @@ __pattern_any_of(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range&& __par_backend_hetero::make_wrapped_policy( ::std::forward<_ExecutionPolicy>(__exec)), _Predicate{__pred}, oneapi::dpl::__par_backend_hetero::__parallel_or_tag{}, - oneapi::dpl::__par_backend_hetero::__parallel_find_or_nd_range_tuner_common{}, + oneapi::dpl::__par_backend_hetero::__parallel_find_or_nd_range_tuner<_BackendTag>{}, ::std::forward<_Range>(__rng)); } diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 52b86cbf41a..fef6b73ec27 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1164,8 +1164,6 @@ struct __parallel_find_or_nd_range_tuner } }; -using __parallel_find_or_nd_range_tuner_common = __parallel_find_or_nd_range_tuner; - // No tuning for FPGA_EMU because we are not going to tune here the performance for FPGA emulation. #if !_ONEDPL_FPGA_EMU template <> @@ -1178,6 +1176,9 @@ struct __parallel_find_or_nd_range_tuner operator()(const _ExecutionPolicy& __exec, const std::size_t __rng_n) const { + // Define common tuner type + using __parallel_find_or_nd_range_tuner_common = __parallel_find_or_nd_range_tuner; + // Call common tuning function to get the work-group size auto __nd_range_params = __parallel_find_or_nd_range_tuner_common{}(__exec, __rng_n); From dcc066ad768691f1fe8a31c57dddcad43543c615 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 8 Aug 2024 12:16:25 +0200 Subject: [PATCH 18/40] Tune from 65'536 size and more Signed-off-by: Sergey Kopienko --- include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index fef6b73ec27..7793be486e1 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1169,6 +1169,7 @@ struct __parallel_find_or_nd_range_tuner template <> struct __parallel_find_or_nd_range_tuner { + static constexpr std::size_t __min_tune_rng_n = 65'536; static constexpr std::size_t __base_rng_n = 4096; // Tune the amount of work-groups and work-group size @@ -1182,7 +1183,7 @@ struct __parallel_find_or_nd_range_tuner= __base_rng_n) + if (__rng_n >= __min_tune_rng_n) { auto __n_groups = std::get<0>(__nd_range_params); if (__n_groups > 1) From 5128952a307853afee38f2988faa4205a22d14e5 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 8 Aug 2024 12:20:52 +0200 Subject: [PATCH 19/40] Remove tuner from params Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/algorithm_impl_hetero.h | 6 +---- .../hetero/algorithm_ranges_impl_hetero.h | 22 ++++-------------- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 23 +++++++------------ 3 files changed, 14 insertions(+), 37 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h index f708afb3c48..3bdc187ce7f 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h @@ -563,7 +563,6 @@ __pattern_adjacent_find(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _I bool result = __par_backend_hetero::__parallel_find_or( _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), _Predicate{adjacent_find_fn<_BinaryPredicate>{__predicate}}, __par_backend_hetero::__parallel_or_tag{}, - __par_backend_hetero::__parallel_find_or_nd_range_tuner<_BackendTag>{}, oneapi::dpl::__ranges::make_zip_view(__buf1.all_view(), __buf2.all_view())); // inverted conditional because of @@ -652,9 +651,7 @@ __pattern_any_of(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterator _BackendTag{}, __par_backend_hetero::make_wrapped_policy<__par_backend_hetero::__or_policy_wrapper>( ::std::forward<_ExecutionPolicy>(__exec)), - _Predicate{__pred}, __par_backend_hetero::__parallel_or_tag{}, - __par_backend_hetero::__parallel_find_or_nd_range_tuner<_BackendTag>{}, - __buf.all_view()); + _Predicate{__pred}, __par_backend_hetero::__parallel_or_tag{}, __buf.all_view()); } //------------------------------------------------------------------------ @@ -681,7 +678,6 @@ __pattern_equal(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterator1 return !__par_backend_hetero::__parallel_find_or( _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), _Predicate{equal_predicate<_Pred>{__pred}}, __par_backend_hetero::__parallel_or_tag{}, - __par_backend_hetero::__parallel_find_or_nd_range_tuner<_BackendTag>{}, oneapi::dpl::__ranges::make_zip_view(__buf1.all_view(), __buf2.all_view())); } diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index c54bb25790e..dd4dd25c7e3 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -104,7 +104,6 @@ __pattern_equal(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range1&& return !oneapi::dpl::__par_backend_hetero::__parallel_find_or( _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), _Predicate{equal_predicate<_Pred>{__pred}}, oneapi::dpl::__par_backend_hetero::__parallel_or_tag{}, - oneapi::dpl::__par_backend_hetero::__parallel_find_or_nd_range_tuner<_BackendTag>{}, oneapi::dpl::__ranges::zip_view(::std::forward<_Range1>(__rng1), ::std::forward<_Range2>(__rng2))); } @@ -127,9 +126,7 @@ __pattern_find_if(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range&& _BackendTag{}, __par_backend_hetero::make_wrapped_policy<__par_backend_hetero::__find_policy_wrapper>( ::std::forward<_ExecutionPolicy>(__exec)), - _Predicate{__pred}, _TagType{}, - oneapi::dpl::__par_backend_hetero::__parallel_find_or_nd_range_tuner<_BackendTag>{}, - ::std::forward<_Range>(__rng)); + _Predicate{__pred}, _TagType{}, ::std::forward<_Range>(__rng)); } //------------------------------------------------------------------------ @@ -159,9 +156,7 @@ __pattern_find_end(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _ _BackendTag{}, __par_backend_hetero::make_wrapped_policy<__par_backend_hetero::__find_policy_wrapper>( ::std::forward<_ExecutionPolicy>(__exec)), - _Predicate{__pred}, _TagType{}, - oneapi::dpl::__par_backend_hetero::__parallel_find_or_nd_range_tuner<_BackendTag>{}, - ::std::forward<_Range1>(__rng1), ::std::forward<_Range2>(__rng2)); + _Predicate{__pred}, _TagType{}, ::std::forward<_Range1>(__rng1), ::std::forward<_Range2>(__rng2)); } //------------------------------------------------------------------------ @@ -185,9 +180,7 @@ __pattern_find_first_of(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _R _BackendTag{}, __par_backend_hetero::make_wrapped_policy<__par_backend_hetero::__find_policy_wrapper>( ::std::forward<_ExecutionPolicy>(__exec)), - _Predicate{__pred}, _TagType{}, - oneapi::dpl::__par_backend_hetero::__parallel_find_or_nd_range_tuner<_BackendTag>{}, - ::std::forward<_Range1>(__rng1), ::std::forward<_Range2>(__rng2)); + _Predicate{__pred}, _TagType{}, ::std::forward<_Range1>(__rng1), ::std::forward<_Range2>(__rng2)); } //------------------------------------------------------------------------ @@ -206,9 +199,7 @@ __pattern_any_of(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range&& _BackendTag{}, __par_backend_hetero::make_wrapped_policy( ::std::forward<_ExecutionPolicy>(__exec)), - _Predicate{__pred}, oneapi::dpl::__par_backend_hetero::__parallel_or_tag{}, - oneapi::dpl::__par_backend_hetero::__parallel_find_or_nd_range_tuner<_BackendTag>{}, - ::std::forward<_Range>(__rng)); + _Predicate{__pred}, oneapi::dpl::__par_backend_hetero::__parallel_or_tag{}, ::std::forward<_Range>(__rng)); } //------------------------------------------------------------------------ @@ -246,9 +237,7 @@ __pattern_search(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _Ra _BackendTag{}, oneapi::dpl::__par_backend_hetero::make_wrapped_policy< oneapi::dpl::__par_backend_hetero::__find_policy_wrapper>(::std::forward<_ExecutionPolicy>(__exec)), - _Predicate{__pred}, _TagType{}, - oneapi::dpl::__par_backend_hetero::__parallel_find_or_nd_range_tuner<_BackendTag>{}, - ::std::forward<_Range1>(__rng1), ::std::forward<_Range2>(__rng2)); + _Predicate{__pred}, _TagType{}, ::std::forward<_Range1>(__rng1), ::std::forward<_Range2>(__rng2)); } //------------------------------------------------------------------------ @@ -310,7 +299,6 @@ __pattern_adjacent_find(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _R auto result = oneapi::dpl::__par_backend_hetero::__parallel_find_or( _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), _Predicate{adjacent_find_fn<_BinaryPredicate>{__predicate}}, _TagType{}, - oneapi::dpl::__par_backend_hetero::__parallel_find_or_nd_range_tuner<_BackendTag>{}, oneapi::dpl::__ranges::zip_view(__rng1, __rng2)); // inverted conditional because of diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 7793be486e1..ab967c697c2 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1339,12 +1339,12 @@ __parallel_find_or_impl_multiple_wgs(oneapi::dpl::__internal::__device_backend_t } // Base pattern for __parallel_or and __parallel_find. The execution depends on tag type _BrickTag. -template +template ::std::conditional_t< ::std::is_same_v<_BrickTag, __parallel_or_tag>, bool, oneapi::dpl::__internal::__difference_t::type>> __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Brick __f, - _BrickTag __brick_tag, const _GroupsTuner& __n_groups_tuner, _Ranges&&... __rngs) + _BrickTag __brick_tag, _Ranges&&... __rngs) { using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; using _FindOrKernelOneWG = @@ -1358,7 +1358,8 @@ __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPoli assert(__rng_n > 0); // Evaluate the amount of work-groups and work-group size - const auto __nd_range_params = __n_groups_tuner(__exec, __rng_n); + const auto __nd_range_params = + __parallel_find_or_nd_range_tuner{}(__exec, __rng_n); const auto __n_groups = std::get<0>(__nd_range_params); const auto __wgroup_size = std::get<1>(__nd_range_params); @@ -1417,9 +1418,7 @@ __parallel_or(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _Exec return oneapi::dpl::__par_backend_hetero::__parallel_find_or( __backend_tag, __par_backend_hetero::make_wrapped_policy<__or_policy_wrapper>(::std::forward<_ExecutionPolicy>(__exec)), __f, - __parallel_or_tag{}, - __par_backend_hetero::__parallel_find_or_nd_range_tuner{}, - __buf.all_view(), __s_buf.all_view()); + __parallel_or_tag{}, __buf.all_view(), __s_buf.all_view()); } // Special overload for single sequence cases. @@ -1436,9 +1435,7 @@ __parallel_or(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _Exec return oneapi::dpl::__par_backend_hetero::__parallel_find_or( __backend_tag, __par_backend_hetero::make_wrapped_policy<__or_policy_wrapper>(::std::forward<_ExecutionPolicy>(__exec)), __f, - __parallel_or_tag{}, - __par_backend_hetero::__parallel_find_or_nd_range_tuner{}, - __buf.all_view()); + __parallel_or_tag{}, __buf.all_view()); } //------------------------------------------------------------------------ @@ -1466,9 +1463,7 @@ __parallel_find(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _Ex __backend_tag, __par_backend_hetero::make_wrapped_policy<__find_policy_wrapper>( ::std::forward<_ExecutionPolicy>(__exec)), - __f, _TagType{}, - __par_backend_hetero::__parallel_find_or_nd_range_tuner{}, - __buf.all_view(), __s_buf.all_view()); + __f, _TagType{}, __buf.all_view(), __s_buf.all_view()); } // Special overload for single sequence cases. @@ -1488,9 +1483,7 @@ __parallel_find(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _Ex __backend_tag, __par_backend_hetero::make_wrapped_policy<__find_policy_wrapper>( ::std::forward<_ExecutionPolicy>(__exec)), - __f, _TagType{}, - __par_backend_hetero::__parallel_find_or_nd_range_tuner{}, - __buf.all_view()); + __f, _TagType{}, __buf.all_view()); } //------------------------------------------------------------------------ From 369b979c09001d3ab260058948fa3607271bed77 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 8 Aug 2024 12:28:57 +0200 Subject: [PATCH 20/40] Move _PRINT_INFO_IN_DEBUG_MODE into __parallel_find_or from __parallel_find_or_nd_range_tuner Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index ab967c697c2..9b7f50de7fc 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1158,8 +1158,6 @@ struct __parallel_find_or_nd_range_tuner __n_groups = 1; } - _PRINT_INFO_IN_DEBUG_MODE(__exec, __wgroup_size, __max_cu); - return {__n_groups, __wgroup_size}; } }; @@ -1206,8 +1204,6 @@ struct __parallel_find_or_nd_range_tuner(__nd_range_params); const auto __wgroup_size = std::get<1>(__nd_range_params); + _PRINT_INFO_IN_DEBUG_MODE(__exec, __wgroup_size); + using _AtomicType = typename _BrickTag::_AtomicType; const _AtomicType __init_value = _BrickTag::__init_value(__rng_n); const auto __pred = oneapi::dpl::__par_backend_hetero::__early_exit_find_or<_ExecutionPolicy, _Brick>{__f}; From 2d20b7505797b1a6335efacba5becb902faa37d2 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 8 Aug 2024 13:39:34 +0200 Subject: [PATCH 21/40] Remove __min_tune_rng_n Signed-off-by: Sergey Kopienko --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 38 +++++++++---------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 9b7f50de7fc..8ff64a9683f 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1167,7 +1167,6 @@ struct __parallel_find_or_nd_range_tuner template <> struct __parallel_find_or_nd_range_tuner { - static constexpr std::size_t __min_tune_rng_n = 65'536; static constexpr std::size_t __base_rng_n = 4096; // Tune the amount of work-groups and work-group size @@ -1181,31 +1180,28 @@ struct __parallel_find_or_nd_range_tuner= __min_tune_rng_n) + auto __n_groups = std::get<0>(__nd_range_params); + if (__n_groups > 1) { - auto __n_groups = std::get<0>(__nd_range_params); - if (__n_groups > 1) - { - auto __wgroup_size = std::get<1>(__nd_range_params); + auto __wgroup_size = std::get<1>(__nd_range_params); - // Empirically found formula for typical devices. - const auto __rng_x = __rng_n / __base_rng_n; - const auto __required_iters_per_work_item = std::max(std::sqrt(__rng_x), 1.); + // Empirically found formula for typical devices. + const auto __rng_x = __rng_n / __base_rng_n; + const auto __required_iters_per_work_item = std::max(std::sqrt(__rng_x), 1.); - auto __iters_per_work_item = - oneapi::dpl::__internal::__dpl_ceiling_div(__rng_n, __n_groups * __wgroup_size); + auto __iters_per_work_item = + oneapi::dpl::__internal::__dpl_ceiling_div(__rng_n, __n_groups * __wgroup_size); - // We halve the number of work-groups until the number of iterations per work-item - // is greater than or equal to the desired number of iterations per work-item. - while (__iters_per_work_item < __required_iters_per_work_item && __n_groups > 1) - { - __n_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__n_groups, 2); - __iters_per_work_item = - oneapi::dpl::__internal::__dpl_ceiling_div(__rng_n, __n_groups * __wgroup_size); - } - - __nd_range_params = {__n_groups, __wgroup_size}; + // We halve the number of work-groups until the number of iterations per work-item + // is greater than or equal to the desired number of iterations per work-item. + while (__iters_per_work_item < __required_iters_per_work_item && __n_groups > 1) + { + __n_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__n_groups, 2); + __iters_per_work_item = + oneapi::dpl::__internal::__dpl_ceiling_div(__rng_n, __n_groups * __wgroup_size); } + + __nd_range_params = {__n_groups, __wgroup_size}; } return __nd_range_params; From b11f794466fe90a4e0e1109a75af11a887a8dd64 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 8 Aug 2024 13:45:25 +0200 Subject: [PATCH 22/40] Apply GitHUB clang format Signed-off-by: Sergey Kopienko --- include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 8ff64a9683f..ae9b7c93e28 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1197,8 +1197,7 @@ struct __parallel_find_or_nd_range_tuner 1) { __n_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__n_groups, 2); - __iters_per_work_item = - oneapi::dpl::__internal::__dpl_ceiling_div(__rng_n, __n_groups * __wgroup_size); + __iters_per_work_item = oneapi::dpl::__internal::__dpl_ceiling_div(__rng_n, __n_groups * __wgroup_size); } __nd_range_params = {__n_groups, __wgroup_size}; From e8f59e8e72f393b125f2494fe3bfcf2375751606 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 8 Aug 2024 18:10:36 +0200 Subject: [PATCH 23/40] Fix performance degradation for 64K of source data: tune only when __iters_per_work_item > 1 Signed-off-by: Sergey Kopienko --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 25 +++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index ae9b7c93e28..f5f796948ab 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1185,19 +1185,24 @@ struct __parallel_find_or_nd_range_tuner(__nd_range_params); - // Empirically found formula for typical devices. - const auto __rng_x = __rng_n / __base_rng_n; - const auto __required_iters_per_work_item = std::max(std::sqrt(__rng_x), 1.); - auto __iters_per_work_item = oneapi::dpl::__internal::__dpl_ceiling_div(__rng_n, __n_groups * __wgroup_size); - - // We halve the number of work-groups until the number of iterations per work-item - // is greater than or equal to the desired number of iterations per work-item. - while (__iters_per_work_item < __required_iters_per_work_item && __n_groups > 1) + + // If our work capacity is not enough to process all data in one iteration, will tune the number of work-groups + if (__iters_per_work_item > 1) { - __n_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__n_groups, 2); - __iters_per_work_item = oneapi::dpl::__internal::__dpl_ceiling_div(__rng_n, __n_groups * __wgroup_size); + // Empirically found formula for typical devices. + const auto __rng_x = __rng_n / __base_rng_n; + const auto __required_iters_per_work_item = std::max(std::sqrt(__rng_x), 1.); + + + // We halve the number of work-groups until the number of iterations per work-item + // is greater than or equal to the desired number of iterations per work-item. + while (__iters_per_work_item < __required_iters_per_work_item && __n_groups > 1) + { + __n_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__n_groups, 2); + __iters_per_work_item = oneapi::dpl::__internal::__dpl_ceiling_div(__rng_n, __n_groups * __wgroup_size); + } } __nd_range_params = {__n_groups, __wgroup_size}; From 9dd761c4f879a9be1080a7c127fbc9aa822eb3dd Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 9 Aug 2024 14:51:13 +0200 Subject: [PATCH 24/40] Apply GitHUB clang format Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index f5f796948ab..4913717452e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1187,7 +1187,7 @@ struct __parallel_find_or_nd_range_tuner 1) { @@ -1195,13 +1195,13 @@ struct __parallel_find_or_nd_range_tuner 1) { __n_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__n_groups, 2); - __iters_per_work_item = oneapi::dpl::__internal::__dpl_ceiling_div(__rng_n, __n_groups * __wgroup_size); + __iters_per_work_item = + oneapi::dpl::__internal::__dpl_ceiling_div(__rng_n, __n_groups * __wgroup_size); } } From 67e59d47efe23185d4c9e4f597b7ecff8faa1992 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 9 Aug 2024 14:54:09 +0200 Subject: [PATCH 25/40] Update include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h Co-authored-by: Alexey Kukanov --- include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 4913717452e..54bf20956d7 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1354,10 +1354,8 @@ __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPoli assert(__rng_n > 0); // Evaluate the amount of work-groups and work-group size - const auto __nd_range_params = + const auto [__n_groups, __wgroup_size] = __parallel_find_or_nd_range_tuner{}(__exec, __rng_n); - const auto __n_groups = std::get<0>(__nd_range_params); - const auto __wgroup_size = std::get<1>(__nd_range_params); _PRINT_INFO_IN_DEBUG_MODE(__exec, __wgroup_size); From f2332af2171e4157aca5b7ebde78b8ded4c33a37 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 9 Aug 2024 14:59:11 +0200 Subject: [PATCH 26/40] Using structure binding in __parallel_find_or_nd_range_tuner::operator() Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 54bf20956d7..ddc5d1c22f8 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1178,13 +1178,10 @@ struct __parallel_find_or_nd_range_tuner; // Call common tuning function to get the work-group size - auto __nd_range_params = __parallel_find_or_nd_range_tuner_common{}(__exec, __rng_n); + auto [__n_groups, __wgroup_size] = __parallel_find_or_nd_range_tuner_common{}(__exec, __rng_n); - auto __n_groups = std::get<0>(__nd_range_params); if (__n_groups > 1) { - auto __wgroup_size = std::get<1>(__nd_range_params); - auto __iters_per_work_item = oneapi::dpl::__internal::__dpl_ceiling_div(__rng_n, __n_groups * __wgroup_size); @@ -1204,11 +1201,9 @@ struct __parallel_find_or_nd_range_tuner Date: Fri, 9 Aug 2024 15:00:18 +0200 Subject: [PATCH 27/40] Fix review comment: remove __parallel_find_or_nd_range_tuner_common Signed-off-by: Sergey Kopienko --- include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index ddc5d1c22f8..812419c7463 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1174,11 +1174,8 @@ struct __parallel_find_or_nd_range_tuner operator()(const _ExecutionPolicy& __exec, const std::size_t __rng_n) const { - // Define common tuner type - using __parallel_find_or_nd_range_tuner_common = __parallel_find_or_nd_range_tuner; - // Call common tuning function to get the work-group size - auto [__n_groups, __wgroup_size] = __parallel_find_or_nd_range_tuner_common{}(__exec, __rng_n); + auto [__n_groups, __wgroup_size] = __parallel_find_or_nd_range_tuner{}(__exec, __rng_n); if (__n_groups > 1) { From 001a75fea6d52674a92296a4df3c6725a798f0e4 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 9 Aug 2024 15:05:42 +0200 Subject: [PATCH 28/40] Update include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h Co-authored-by: Alexey Kukanov --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 812419c7463..9e047863b82 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1145,17 +1145,15 @@ struct __parallel_find_or_nd_range_tuner // TODO: find a way to generalize getting of reliable work-group size // Limit the work-group size to prevent large sizes on CPUs. Empirically found value. // This value exceeds the current practical limit for GPUs, but may need to be re-evaluated in the future. - std::size_t __wgroup_size = oneapi::dpl::__internal::__max_work_group_size(__exec, (std::size_t)4096); - - const auto __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec); - std::size_t __n_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__rng_n, __wgroup_size); - __n_groups = ::std::min(__n_groups, decltype(__n_groups)(__max_cu)); - - // Pass all small data into single WG implementation - constexpr std::size_t __max_iters_per_work_item = 32; - if (__rng_n <= __wgroup_size * __max_iters_per_work_item) + namespace __util = oneapi::dpl::__internal; + const std::size_t __wgroup_size = __util::__max_work_group_size(__exec, (std::size_t)4096); + std::size_t __n_groups = 1; + // If no more than 32 data elements per work item, a single work group will be used + if (__rng_n > __wgroup_size * 32) { - __n_groups = 1; + // Compute the number of groups and limit by the number of compute units + __n_groups = std::min(__util::__dpl_ceiling_div(__rng_n, __wgroup_size), + __util::__max_compute_units(__exec))); } return {__n_groups, __wgroup_size}; From 58bb63286e7a8c104d9775d6cb2947a40023165c Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 9 Aug 2024 15:10:52 +0200 Subject: [PATCH 29/40] Remove alias and extra closing bracket Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 9e047863b82..78f077a7c6f 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1145,15 +1145,14 @@ struct __parallel_find_or_nd_range_tuner // TODO: find a way to generalize getting of reliable work-group size // Limit the work-group size to prevent large sizes on CPUs. Empirically found value. // This value exceeds the current practical limit for GPUs, but may need to be re-evaluated in the future. - namespace __util = oneapi::dpl::__internal; - const std::size_t __wgroup_size = __util::__max_work_group_size(__exec, (std::size_t)4096); + const std::size_t __wgroup_size = oneapi::dpl::__internal::__max_work_group_size(__exec, (std::size_t)4096); std::size_t __n_groups = 1; // If no more than 32 data elements per work item, a single work group will be used if (__rng_n > __wgroup_size * 32) { // Compute the number of groups and limit by the number of compute units - __n_groups = std::min(__util::__dpl_ceiling_div(__rng_n, __wgroup_size), - __util::__max_compute_units(__exec))); + __n_groups = std::min(oneapi::dpl::__internal::__dpl_ceiling_div(__rng_n, __wgroup_size), + oneapi::dpl::__internal::__max_compute_units(__exec)); } return {__n_groups, __wgroup_size}; From d3ba89bc665f97ce6ee484a04718da8b6ddd853a Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 9 Aug 2024 15:11:58 +0200 Subject: [PATCH 30/40] Fix comment Signed-off-by: Sergey Kopienko --- include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 78f077a7c6f..ca847f717ce 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1182,7 +1182,7 @@ struct __parallel_find_or_nd_range_tuner 1) { - // Empirically found formula for typical devices. + // Empirically found formula for GPU devices. const auto __rng_x = __rng_n / __base_rng_n; const auto __required_iters_per_work_item = std::max(std::sqrt(__rng_x), 1.); From dbb77e43987e8cd7620e052ecebb55b2f053927d Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 9 Aug 2024 15:24:44 +0200 Subject: [PATCH 31/40] Fix review comment: remove __base_rng_n Signed-off-by: Sergey Kopienko --- include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index ca847f717ce..28c29acc583 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1164,8 +1164,6 @@ struct __parallel_find_or_nd_range_tuner template <> struct __parallel_find_or_nd_range_tuner { - static constexpr std::size_t __base_rng_n = 4096; - // Tune the amount of work-groups and work-group size template std::tuple @@ -1183,7 +1181,7 @@ struct __parallel_find_or_nd_range_tuner 1) { // Empirically found formula for GPU devices. - const auto __rng_x = __rng_n / __base_rng_n; + const auto __rng_x = __rng_n / 4096; const auto __required_iters_per_work_item = std::max(std::sqrt(__rng_x), 1.); // We halve the number of work-groups until the number of iterations per work-item From 193983b258a253a21eea315eb494768bfd31a049 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 9 Aug 2024 17:07:33 +0200 Subject: [PATCH 32/40] __k = std::pow(2, std::ceil(std::log2(__k)));Fix review comment: remove loop in __parallel_find_or_nd_range_tuner::operator() --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 28c29acc583..3fbdc9debf5 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1180,17 +1180,21 @@ struct __parallel_find_or_nd_range_tuner 1) { + auto __current_iters_per_work_item = + oneapi::dpl::__internal::__dpl_ceiling_div(__rng_n, __n_groups * __wgroup_size); + // Empirically found formula for GPU devices. const auto __rng_x = __rng_n / 4096; - const auto __required_iters_per_work_item = std::max(std::sqrt(__rng_x), 1.); + const auto __desired_iters_per_work_item = std::max(std::sqrt(__rng_x), 1.); - // We halve the number of work-groups until the number of iterations per work-item - // is greater than or equal to the desired number of iterations per work-item. - while (__iters_per_work_item < __required_iters_per_work_item && __n_groups > 1) + if (__current_iters_per_work_item < __desired_iters_per_work_item) { - __n_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__n_groups, 2); - __iters_per_work_item = - oneapi::dpl::__internal::__dpl_ceiling_div(__rng_n, __n_groups * __wgroup_size); + auto __k = __desired_iters_per_work_item / __current_iters_per_work_item; + __k = std::pow(2, std::ceil(std::log2(__k))); + __n_groups = (std::size_t)std::ceil(__n_groups / __k); + + assert(oneapi::dpl::__internal::__dpl_ceiling_div(__rng_n, __n_groups * __wgroup_size) <= + __desired_iters_per_work_item); } } } From b2af0e8dc1dfdd6376f19e6be561bfb9f7527b0b Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 9 Aug 2024 18:04:49 +0200 Subject: [PATCH 33/40] Fix review comments: __current_iters_per_work_item is not required Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 3fbdc9debf5..bc106b4315e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1180,16 +1180,13 @@ struct __parallel_find_or_nd_range_tuner 1) { - auto __current_iters_per_work_item = - oneapi::dpl::__internal::__dpl_ceiling_div(__rng_n, __n_groups * __wgroup_size); - // Empirically found formula for GPU devices. const auto __rng_x = __rng_n / 4096; const auto __desired_iters_per_work_item = std::max(std::sqrt(__rng_x), 1.); - if (__current_iters_per_work_item < __desired_iters_per_work_item) + if (__iters_per_work_item < __desired_iters_per_work_item) { - auto __k = __desired_iters_per_work_item / __current_iters_per_work_item; + auto __k = __desired_iters_per_work_item / __iters_per_work_item; __k = std::pow(2, std::ceil(std::log2(__k))); __n_groups = (std::size_t)std::ceil(__n_groups / __k); From 4922f4644a54accdfdef359cc27bcd7ec9ba370f Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 9 Aug 2024 18:10:24 +0200 Subject: [PATCH 34/40] Fix review comments: __current_iters_per_work_item is not required Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index bc106b4315e..d985cbbfda1 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1186,9 +1186,9 @@ struct __parallel_find_or_nd_range_tuner Date: Fri, 9 Aug 2024 18:54:41 +0200 Subject: [PATCH 35/40] Fix review comment Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index d985cbbfda1..2798aa010bf 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1186,12 +1186,11 @@ struct __parallel_find_or_nd_range_tuner Date: Fri, 9 Aug 2024 19:07:42 +0200 Subject: [PATCH 36/40] Fix review comment Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 2798aa010bf..9f0c04f5654 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1182,12 +1182,14 @@ struct __parallel_find_or_nd_range_tuner Date: Mon, 12 Aug 2024 09:38:18 +0200 Subject: [PATCH 37/40] Fix review comment: // TODO : need to re-evaluate this formula. Signed-off-by: Sergey Kopienko --- include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 9f0c04f5654..d5218e70885 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1181,6 +1181,7 @@ struct __parallel_find_or_nd_range_tuner 1) { // Empirically found formula for GPU devices. + // TODO : need to re-evaluate this formula. const auto __rng_x = __rng_n / 4096; const float __desired_iters_per_work_item = std::max(std::sqrt(__rng_x), 1.); From e48be0083e5b469fa5922fed4dabec32c98b0633 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 12 Aug 2024 09:59:18 +0200 Subject: [PATCH 38/40] Fix review comment: use float type instead auto Signed-off-by: Sergey Kopienko --- include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index d5218e70885..ac2b0808374 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1182,8 +1182,8 @@ struct __parallel_find_or_nd_range_tuner Date: Mon, 12 Aug 2024 10:03:30 +0200 Subject: [PATCH 39/40] Add const to std::size_t __k Signed-off-by: Sergey Kopienko --- include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index ac2b0808374..d3c6e61184b 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1190,7 +1190,7 @@ struct __parallel_find_or_nd_range_tuner Date: Mon, 12 Aug 2024 14:02:16 +0200 Subject: [PATCH 40/40] Update include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h Co-authored-by: Alexey Kukanov --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index d3c6e61184b..64fb643377c 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1187,13 +1187,13 @@ struct __parallel_find_or_nd_range_tuner