From c1da480968510da79e1cc0f4510bafef8872ffd6 Mon Sep 17 00:00:00 2001
From: Sergey Kopienko
Date: Fri, 20 Dec 2024 12:24:37 +0100
Subject: [PATCH 1/6] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
 - remove usage of __kernel_name_generator as not required

Signed-off-by: Sergey Kopienko
---
 .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 216 ++++++++++--------
 1 file changed, 116 insertions(+), 100 deletions(-)

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
index 96d63e33aee..1366417c967 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -1837,100 +1837,46 @@ struct __parallel_find_or_nd_range_tuner
-__FoundStateType
-__parallel_find_or_impl_one_wg(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec,
-                               _BrickTag __brick_tag, const std::size_t __rng_n, const std::size_t __wgroup_size,
-                               const __FoundStateType __init_value, _Predicate __pred, _Ranges&&... __rngs)
-{
-    using __result_and_scratch_storage_t = __result_and_scratch_storage<_ExecutionPolicy, __FoundStateType>;
-    __result_and_scratch_storage_t __result_storage{__exec, 1, 0};
-
-    // Calculate the number of elements to be processed by each work-item.
-    const auto __iters_per_work_item = oneapi::dpl::__internal::__dpl_ceiling_div(__rng_n, __wgroup_size);
-
-    // main parallel_for
-    auto __event = __exec.queue().submit([&](sycl::handler& __cgh) {
-        oneapi::dpl::__ranges::__require_access(__cgh, __rngs...);
-        auto __result_acc =
-            __result_storage.template __get_result_acc(__cgh, __dpl_sycl::__no_init{});
-
-        __cgh.parallel_for(
-            sycl::nd_range(sycl::range(__wgroup_size), sycl::range(__wgroup_size)),
-            [=](sycl::nd_item __item_id) {
-                auto __local_idx = __item_id.get_local_id(0);
-
-                // 1. Set initial value to local found state
-                __FoundStateType __found_local = __init_value;
-
-                // 2. Find any element that satisfies pred
-                //  - after this call __found_local may still have initial value:
-                //    1) if no element satisfies pred;
-                //    2) early exit from sub-group occurred: in this case the state of __found_local will updated in the next group operation (3)
-                __pred(__item_id, __rng_n, __iters_per_work_item, __wgroup_size, __found_local, __brick_tag, __rngs...);
-
-                // 3. Reduce over group: find __dpl_sycl::__minimum (for the __parallel_find_forward_tag),
-                //    find __dpl_sycl::__maximum (for the __parallel_find_backward_tag)
-                //    or update state with __dpl_sycl::__any_of_group (for the __parallel_or_tag)
-                //    inside all our group items
-                if constexpr (__or_tag_check)
-                    __found_local = __dpl_sycl::__any_of_group(__item_id.get_group(), __found_local);
-                else
-                    __found_local = __dpl_sycl::__reduce_over_group(__item_id.get_group(), __found_local,
-                                                                    typename _BrickTag::_LocalResultsReduceOp{});
-
-                // Set local found state value value to global state to have correct result
-                if (__local_idx == 0)
-                {
-                    __result_and_scratch_storage_t::__get_usm_or_buffer_accessor_ptr(__result_acc)[0] = __found_local;
-                }
-            });
-    });
-
-    // Wait and return result
-    return __result_storage.__wait_and_get_value(__event);
-}
+template
+struct __parallel_find_or_impl_one_wg;
 
 // Base pattern for __parallel_or and __parallel_find. The execution depends on tag type _BrickTag.
-template
-_AtomicType
-__parallel_find_or_impl_multiple_wgs(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec,
-                                     _BrickTag __brick_tag, const std::size_t __rng_n, const std::size_t __n_groups,
-                                     const std::size_t __wgroup_size, const _AtomicType __init_value, _Predicate __pred,
-                                     _Ranges&&... __rngs)
-{
-    auto __result = __init_value;
-
-    // Calculate the number of elements to be processed by each work-item.
-    const auto __iters_per_work_item = oneapi::dpl::__internal::__dpl_ceiling_div(__rng_n, __n_groups * __wgroup_size);
-
-    // scope is to copy data back to __result after destruction of temporary sycl:buffer
+template
+struct __parallel_find_or_impl_one_wg<__internal::__optional_kernel_name, __or_tag_check>
+{
+    template
+    __FoundStateType
+    operator()(_ExecutionPolicy&& __exec, _BrickTag __brick_tag, const std::size_t __rng_n,
+               const std::size_t __wgroup_size, const __FoundStateType __init_value, _Predicate __pred,
+               _Ranges&&... __rngs)
     {
-        sycl::buffer<_AtomicType, 1> __result_sycl_buf(&__result, 1); // temporary storage for global atomic
+        using __result_and_scratch_storage_t = __result_and_scratch_storage<_ExecutionPolicy, __FoundStateType>;
+        __result_and_scratch_storage_t __result_storage{__exec, 1, 0};
+
+        // Calculate the number of elements to be processed by each work-item.
+        const auto __iters_per_work_item = oneapi::dpl::__internal::__dpl_ceiling_div(__rng_n, __wgroup_size);
 
         // main parallel_for
-        __exec.queue().submit([&](sycl::handler& __cgh) {
+        auto __event = __exec.queue().submit([&](sycl::handler& __cgh) {
             oneapi::dpl::__ranges::__require_access(__cgh, __rngs...);
-            auto __result_sycl_buf_acc = __result_sycl_buf.template get_access(__cgh);
+            auto __result_acc =
+                __result_storage.template __get_result_acc(__cgh, __dpl_sycl::__no_init{});
 
-            __cgh.parallel_for(
-                sycl::nd_range(sycl::range(__n_groups * __wgroup_size),
-                               sycl::range(__wgroup_size)),
+            __cgh.parallel_for(
+                sycl::nd_range(sycl::range(__wgroup_size), sycl::range(__wgroup_size)),
                 [=](sycl::nd_item __item_id) {
                     auto __local_idx = __item_id.get_local_id(0);
 
                     // 1. Set initial value to local found state
-                    _AtomicType __found_local = __init_value;
+                    __FoundStateType __found_local = __init_value;
 
                     // 2. Find any element that satisfies pred
                     //  - after this call __found_local may still have initial value:
                     //    1) if no element satisfies pred;
                     //    2) early exit from sub-group occurred: in this case the state of __found_local will updated in the next group operation (3)
-                    __pred(__item_id, __rng_n, __iters_per_work_item, __n_groups * __wgroup_size, __found_local,
-                           __brick_tag, __rngs...);
+                    __pred(__item_id, __rng_n, __iters_per_work_item, __wgroup_size, __found_local, __brick_tag,
+                           __rngs...);
 
                     // 3. Reduce over group: find __dpl_sycl::__minimum (for the __parallel_find_forward_tag),
                     //    find __dpl_sycl::__maximum (for the __parallel_find_backward_tag)
@@ -1942,22 +1888,92 @@ __parallel_find_or_impl_multiple_wgs(oneapi::dpl::__internal::__device_backend_t
                         __found_local = __dpl_sycl::__any_of_group(__item_id.get_group(), __found_local);
                     else
                         __found_local = __dpl_sycl::__reduce_over_group(__item_id.get_group(), __found_local,
                                                                         typename _BrickTag::_LocalResultsReduceOp{});
 
-                    // Set local found state value value to global atomic
-                    if (__local_idx == 0 && __found_local != __init_value)
+                    // Set local found state value value to global state to have correct result
+                    if (__local_idx == 0)
                     {
-                        __dpl_sycl::__atomic_ref<_AtomicType, sycl::access::address_space::global_space> __found(
-                            *__dpl_sycl::__get_accessor_ptr(__result_sycl_buf_acc));
-
-                        // Update global (for all groups) atomic state with the found index
-                        _BrickTag::__save_state_to_atomic(__found, __found_local);
+                        __result_and_scratch_storage_t::__get_usm_or_buffer_accessor_ptr(__result_acc)[0] =
+                            __found_local;
                     }
                 });
         });
-        //The end of the scope - a point of synchronization (on temporary sycl buffer destruction)
+
+        // Wait and return result
+        return __result_storage.__wait_and_get_value(__event);
     }
+};
 
-    return __result;
-}
+template
+struct __parallel_find_or_impl_multiple_wgs;
+
+// Base pattern for __parallel_or and __parallel_find. The execution depends on tag type _BrickTag.
+template
+struct __parallel_find_or_impl_multiple_wgs<__internal::__optional_kernel_name, __or_tag_check>
+{
+    template
+    _AtomicType
+    operator()(_ExecutionPolicy&& __exec, _BrickTag __brick_tag,
+               const std::size_t __rng_n, const std::size_t __n_groups, const std::size_t __wgroup_size,
+               const _AtomicType __init_value, _Predicate __pred, _Ranges&&... __rngs)
+    {
+        auto __result = __init_value;
+
+        // Calculate the number of elements to be processed by each work-item.
+        const auto __iters_per_work_item =
+            oneapi::dpl::__internal::__dpl_ceiling_div(__rng_n, __n_groups * __wgroup_size);
+
+        // scope is to copy data back to __result after destruction of temporary sycl:buffer
+        {
+            sycl::buffer<_AtomicType, 1> __result_sycl_buf(&__result, 1); // temporary storage for global atomic
+
+            // main parallel_for
+            __exec.queue().submit([&](sycl::handler& __cgh) {
+                oneapi::dpl::__ranges::__require_access(__cgh, __rngs...);
+                auto __result_sycl_buf_acc = __result_sycl_buf.template get_access(__cgh);
+
+                __cgh.parallel_for(
+                    sycl::nd_range(sycl::range(__n_groups * __wgroup_size),
+                                   sycl::range(__wgroup_size)),
+                    [=](sycl::nd_item __item_id) {
+                        auto __local_idx = __item_id.get_local_id(0);
+
+                        // 1. Set initial value to local found state
+                        _AtomicType __found_local = __init_value;
+
+                        // 2. Find any element that satisfies pred
+                        //  - after this call __found_local may still have initial value:
+                        //    1) if no element satisfies pred;
+                        //    2) early exit from sub-group occurred: in this case the state of __found_local will updated in the next group operation (3)
+                        __pred(__item_id, __rng_n, __iters_per_work_item, __n_groups * __wgroup_size, __found_local,
+                               __brick_tag, __rngs...);
+
+                        // 3. Reduce over group: find __dpl_sycl::__minimum (for the __parallel_find_forward_tag),
+                        //    find __dpl_sycl::__maximum (for the __parallel_find_backward_tag)
+                        //    or update state with __dpl_sycl::__any_of_group (for the __parallel_or_tag)
+                        //    inside all our group items
+                        if constexpr (__or_tag_check)
+                            __found_local = __dpl_sycl::__any_of_group(__item_id.get_group(), __found_local);
+                        else
+                            __found_local = __dpl_sycl::__reduce_over_group(
+                                __item_id.get_group(), __found_local, typename _BrickTag::_LocalResultsReduceOp{});
+
+                        // Set local found state value value to global atomic
+                        if (__local_idx == 0 && __found_local != __init_value)
+                        {
+                            __dpl_sycl::__atomic_ref<_AtomicType, sycl::access::address_space::global_space> __found(
+                                *__dpl_sycl::__get_accessor_ptr(__result_sycl_buf_acc));
+
+                            // Update global (for all groups) atomic state with the found index
+                            _BrickTag::__save_state_to_atomic(__found, __found_local);
+                        }
+                    });
+            });
+            //The end of the scope - a point of synchronization (on temporary sycl buffer destruction)
+        }
+
+        return __result;
+    }
+};
 
 // Base pattern for __parallel_or and __parallel_find. The execution depends on tag type _BrickTag.
 template
@@ -1968,12 +1984,6 @@ __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPoli
                    _BrickTag __brick_tag, _Ranges&&... __rngs)
 {
     using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>;
-    using _FindOrKernelOneWG =
-        oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator<__find_or_kernel_one_wg, _CustomName,
-                                                                               _Brick, _BrickTag, _Ranges...>;
-    using _FindOrKernel =
-        oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator<__find_or_kernel, _CustomName, _Brick,
-                                                                               _BrickTag, _Ranges...>;
 
     auto __rng_n = oneapi::dpl::__ranges::__get_first_range_size(__rngs...);
     assert(__rng_n > 0);
@@ -1996,20 +2006,26 @@ __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPoli
         // We shouldn't have any restrictions for _AtomicType type here
         // because we have a single work-group and we don't need to use atomics for inter-work-group communication.
+        using _KernelName =
+            oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__find_or_kernel_one_wg<_CustomName>>;
+
         // Single WG implementation
-        __result = __parallel_find_or_impl_one_wg<_FindOrKernelOneWG, __or_tag_check>(
-            oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec), __brick_tag,
-            __rng_n, __wgroup_size, __init_value, __pred, std::forward<_Ranges>(__rngs)...);
+        __result = __parallel_find_or_impl_one_wg<_KernelName, __or_tag_check>()(
+            std::forward<_ExecutionPolicy>(__exec), __brick_tag, __rng_n, __wgroup_size, __init_value, __pred,
+            std::forward<_Ranges>(__rngs)...);
     }
     else
    {
         assert("This device does not support 64-bit atomics" &&
               (sizeof(_AtomicType) < 8 || __exec.queue().get_device().has(sycl::aspect::atomic64)));
 
+        using _KernelName =
+            oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__find_or_kernel<_CustomName>>;
+
         // Multiple WG implementation
-        __result = __parallel_find_or_impl_multiple_wgs<_FindOrKernel, __or_tag_check>(
-            oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec), __brick_tag,
-            __rng_n, __n_groups, __wgroup_size, __init_value, __pred, std::forward<_Ranges>(__rngs)...);
+        __result = __parallel_find_or_impl_multiple_wgs<_KernelName, __or_tag_check>()(
+            std::forward<_ExecutionPolicy>(__exec), __brick_tag, __rng_n, __n_groups, __wgroup_size, __init_value,
+            __pred, std::forward<_Ranges>(__rngs)...);
     }
 
     if constexpr (__or_tag_check)

From d2bcf2a1ac645d6d7bc9ddbda87b0d553670e428 Mon Sep 17 00:00:00 2001
From: Sergey Kopienko
Date: Mon, 23 Dec 2024 13:31:11 +0100
Subject: [PATCH 2/6] Apply GitHUB clang format

Signed-off-by: Sergey Kopienko
---
 .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
index 72c8230c732..7c29e06d318 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -1844,8 +1844,8 @@ struct __parallel_find_or_impl_one_wg;
 template
 struct __parallel_find_or_impl_one_wg<__internal::__optional_kernel_name, __or_tag_check>
 {
-    template
+    template
     __FoundStateType
     operator()(_ExecutionPolicy&& __exec, _BrickTag __brick_tag, const std::size_t __rng_n,
                const std::size_t __wgroup_size, const __FoundStateType __init_value, _Predicate __pred,
                _Ranges&&... __rngs)
@@ -1912,9 +1912,9 @@ struct __parallel_find_or_impl_multiple_wgs<__internal::__optional_kernel_name
     _AtomicType
-    operator()(_ExecutionPolicy&& __exec, _BrickTag __brick_tag,
-               const std::size_t __rng_n, const std::size_t __n_groups, const std::size_t __wgroup_size,
-               const _AtomicType __init_value, _Predicate __pred, _Ranges&&... __rngs)
+    operator()(_ExecutionPolicy&& __exec, _BrickTag __brick_tag, const std::size_t __rng_n,
+               const std::size_t __n_groups, const std::size_t __wgroup_size, const _AtomicType __init_value,
+               _Predicate __pred, _Ranges&&... __rngs)
     {
         auto __result = __init_value;

From f3b9f48a49b8e67c0c8ad08649327a701ccac461 Mon Sep 17 00:00:00 2001
From: Sergey Kopienko
Date: Mon, 13 Jan 2025 10:06:41 +0100
Subject: [PATCH 3/6] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
 - fix review comment: restore __device_backend_tag in
 __parallel_find_or_impl_one_wg::operator() and
 __parallel_find_or_impl_multiple_wgs::operator()

---
 .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
index c427cd8f9d3..3d17d665363 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -1786,9 +1786,9 @@ struct __parallel_find_or_impl_one_wg<__internal::__optional_kernel_name
     __FoundStateType
-    operator()(_ExecutionPolicy&& __exec, _BrickTag __brick_tag, const std::size_t __rng_n,
-               const std::size_t __wgroup_size, const __FoundStateType __init_value, _Predicate __pred,
-               _Ranges&&... __rngs)
+    operator()(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _BrickTag __brick_tag,
+               const std::size_t __rng_n, const std::size_t __wgroup_size, const __FoundStateType __init_value,
+               _Predicate __pred, _Ranges&&... __rngs)
     {
         using __result_and_scratch_storage_t = __result_and_scratch_storage<_ExecutionPolicy, __FoundStateType>;
         __result_and_scratch_storage_t __result_storage{__exec, 1, 0};
@@ -1851,9 +1851,9 @@ struct __parallel_find_or_impl_multiple_wgs<__internal::__optional_kernel_name
     _AtomicType
-    operator()(_ExecutionPolicy&& __exec, _BrickTag __brick_tag, const std::size_t __rng_n,
-               const std::size_t __n_groups, const std::size_t __wgroup_size, const _AtomicType __init_value,
-               _Predicate __pred, _Ranges&&... __rngs)
+    operator()(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _BrickTag __brick_tag,
+               const std::size_t __rng_n, const std::size_t __n_groups, const std::size_t __wgroup_size,
+               const _AtomicType __init_value, _Predicate __pred, _Ranges&&... __rngs)
     {
         auto __result = __init_value;
@@ -1950,8 +1950,8 @@ __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPoli
         // Single WG implementation
         __result = __parallel_find_or_impl_one_wg<_KernelName, __or_tag_check>()(
-            std::forward<_ExecutionPolicy>(__exec), __brick_tag, __rng_n, __wgroup_size, __init_value, __pred,
-            std::forward<_Ranges>(__rngs)...);
+            oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec), __brick_tag,
+            __rng_n, __wgroup_size, __init_value, __pred, std::forward<_Ranges>(__rngs)...);
     }
     else
     {
@@ -1963,8 +1963,8 @@ __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPoli
         // Multiple WG implementation
         __result = __parallel_find_or_impl_multiple_wgs<_KernelName, __or_tag_check>()(
-            std::forward<_ExecutionPolicy>(__exec), __brick_tag, __rng_n, __n_groups, __wgroup_size, __init_value,
-            __pred, std::forward<_Ranges>(__rngs)...);
+            oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec), __brick_tag,
+            __rng_n, __n_groups, __wgroup_size, __init_value, __pred, std::forward<_Ranges>(__rngs)...);
     }
 
     if constexpr (__or_tag_check)

From 6aef56c4196e404118d5edf5230abdff8e25fbf3 Mon Sep 17 00:00:00 2001
From: Sergey Kopienko
Date: Wed, 15 Jan 2025 10:46:27 +0100
Subject: [PATCH 4/6] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
 - move KernelName template parameter to last position

---
 .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
index 3d17d665363..cc3e9af8d40 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -1776,12 +1776,12 @@ struct __parallel_find_or_nd_range_tuner
-template
+template
 struct __parallel_find_or_impl_one_wg;
 
 // Base pattern for __parallel_or and __parallel_find. The execution depends on tag type _BrickTag.
-template
-struct __parallel_find_or_impl_one_wg<__internal::__optional_kernel_name, __or_tag_check>
+template
+struct __parallel_find_or_impl_one_wg<__or_tag_check, __internal::__optional_kernel_name>
 {
     template
@@ -1841,12 +1841,12 @@ struct __parallel_find_or_impl_one_wg<__internal::__optional_kernel_name
+template
 struct __parallel_find_or_impl_multiple_wgs;
 
 // Base pattern for __parallel_or and __parallel_find. The execution depends on tag type _BrickTag.
-template
-struct __parallel_find_or_impl_multiple_wgs<__internal::__optional_kernel_name, __or_tag_check>
+template
+struct __parallel_find_or_impl_multiple_wgs<__or_tag_check, __internal::__optional_kernel_name>
 {
     template
@@ -1949,7 +1949,7 @@ __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPoli
             oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__find_or_kernel_one_wg<_CustomName>>;
 
         // Single WG implementation
-        __result = __parallel_find_or_impl_one_wg<_KernelName, __or_tag_check>()(
+        __result = __parallel_find_or_impl_one_wg<__or_tag_check, _KernelName>()(
             oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec), __brick_tag,
             __rng_n, __wgroup_size, __init_value, __pred, std::forward<_Ranges>(__rngs)...);
     }
@@ -1962,7 +1962,7 @@ __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPoli
             oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__find_or_kernel<_CustomName>>;
 
         // Multiple WG implementation
-        __result = __parallel_find_or_impl_multiple_wgs<_KernelName, __or_tag_check>()(
+        __result = __parallel_find_or_impl_multiple_wgs<__or_tag_check, _KernelName>()(
            oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec), __brick_tag,
            __rng_n, __n_groups, __wgroup_size, __init_value, __pred, std::forward<_Ranges>(__rngs)...);
     }

From 927b85dd4e04ea57d1241fee41e2a3661eb4e98d Mon Sep 17 00:00:00 2001
From: Sergey Kopienko
Date: Wed, 15 Jan 2025 10:49:09 +0100
Subject: [PATCH 5/6] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
 - fix review comment: rename _KernelName to __find_or_kernel_name

---
 .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
index cc3e9af8d40..73a30aec844 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -1945,11 +1945,11 @@ __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPoli
         // We shouldn't have any restrictions for _AtomicType type here
         // because we have a single work-group and we don't need to use atomics for inter-work-group communication.
-        using _KernelName =
+        using __find_or_kernel_name =
             oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__find_or_kernel_one_wg<_CustomName>>;
 
         // Single WG implementation
-        __result = __parallel_find_or_impl_one_wg<__or_tag_check, _KernelName>()(
+        __result = __parallel_find_or_impl_one_wg<__or_tag_check, __find_or_kernel_name>()(
             oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec), __brick_tag,
             __rng_n, __wgroup_size, __init_value, __pred, std::forward<_Ranges>(__rngs)...);
     }
@@ -1958,11 +1958,11 @@ __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPoli
         assert("This device does not support 64-bit atomics" &&
               (sizeof(_AtomicType) < 8 || __exec.queue().get_device().has(sycl::aspect::atomic64)));
 
-        using _KernelName =
+        using __find_or_kernel_name =
             oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__find_or_kernel<_CustomName>>;
 
         // Multiple WG implementation
-        __result = __parallel_find_or_impl_multiple_wgs<__or_tag_check, _KernelName>()(
+        __result = __parallel_find_or_impl_multiple_wgs<__or_tag_check, __find_or_kernel_name>()(
             oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec), __brick_tag,
             __rng_n, __n_groups, __wgroup_size, __init_value, __pred, std::forward<_Ranges>(__rngs)...);
     }

From aec257ebcfab181966258f0302064c19af41678c Mon Sep 17 00:00:00 2001
From: Sergey Kopienko
Date: Wed, 15 Jan 2025 10:53:45 +0100
Subject: [PATCH 6/6] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
 - fix review comment: rename _KernelName to __find_or_one_wg_kernel_name /
 __find_or_kernel_name

---
 include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
index 73a30aec844..c7b72625315 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -1945,11 +1945,11 @@ __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPoli
         // We shouldn't have any restrictions for _AtomicType type here
         // because we have a single work-group and we don't need to use atomics for inter-work-group communication.
-        using __find_or_kernel_name =
+        using __find_or_one_wg_kernel_name =
             oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__find_or_kernel_one_wg<_CustomName>>;
 
         // Single WG implementation
-        __result = __parallel_find_or_impl_one_wg<__or_tag_check, __find_or_kernel_name>()(
+        __result = __parallel_find_or_impl_one_wg<__or_tag_check, __find_or_one_wg_kernel_name>()(
             oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec), __brick_tag,
             __rng_n, __wgroup_size, __init_value, __pred, std::forward<_Ranges>(__rngs)...);
     }