From 0ad44565715767961447099d22896f15ce7c3088 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 7 Jun 2024 10:54:54 +0200 Subject: [PATCH 01/28] include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h - new predicates __find_if_unary_transform_op, __find_if_binary_reduce_op Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/algorithm_impl_hetero.h | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h index 766b599361f..5c47188cfe3 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h @@ -646,6 +646,37 @@ __pattern_count(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterator // any_of //------------------------------------------------------------------------ +template +struct __find_if_unary_transform_op +{ + _UnaryTransformOp __transform_op; + + template + _Typle + operator()(const Arg& arg) const + { + return {__transform_op(std::get<0>(arg)), std::get<1>(arg)}; + } +}; + +template +struct __find_if_binary_reduce_op +{ + _Typle + operator()(const _Typle& op1, const _Typle& op2) const + { + if (std::get<0>(op1) && std::get<0>(op2)) + { + if constexpr (_IsFirst{}) + return {true, std::min(std::get<1>(op1), std::get<1>(op2))}; + else + return {true, std::max(std::get<1>(op1), std::get<1>(op2))}; + } + + return std::get<0>(op1) ? op1 : op2; + } +}; + template bool __pattern_any_of(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterator __first, _Iterator __last, From b95e518993ac946016e8483925a3219e529df870 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 7 Jun 2024 10:55:47 +0200 Subject: [PATCH 02/28] include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h - implementation of __pattern_any_of on __parallel_transform_reduce Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/algorithm_impl_hetero.h | 41 +++++++++++++++---- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h index 5c47188cfe3..54a9c14c52b 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h @@ -682,19 +682,42 @@ bool __pattern_any_of(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterator __first, _Iterator __last, _Pred __pred) { - if (__first == __last) + using _difference_type = typename ::std::iterator_traits<_Iterator>::difference_type; + + const _difference_type __n = __last - __first; + if (__n == 0) return false; - using _Predicate = oneapi::dpl::unseq_backend::single_match_pred<_ExecutionPolicy, _Pred>; + using _result_type = oneapi::dpl::__internal::tuple; + const auto __init = _result_type{false, __n}; - auto __keep = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator>(); - auto __buf = __keep(__first, __last); + // __counting_iterator_t - iterate position (index) in source data + using __counting_iterator_t = oneapi::dpl::counting_iterator<_difference_type>; - return oneapi::dpl::__par_backend_hetero::__parallel_find_or( - _BackendTag{}, - __par_backend_hetero::make_wrapped_policy<__par_backend_hetero::__or_policy_wrapper>( - ::std::forward<_ExecutionPolicy>(__exec)), - _Predicate{__pred}, __par_backend_hetero::__parallel_or_tag{}, __buf.all_view()); + using _zipped_data_type = typename std::iterator_traits::value_type; + + __find_if_binary_reduce_op<_zipped_data_type, /*_IsFirst*/ std::true_type> __reduce_op; + __find_if_unary_transform_op<_zipped_data_type, _Pred> __transform_op{__pred}; + + using _Functor = unseq_backend::walk_n<_ExecutionPolicy, decltype(__transform_op)>; + using _RepackedTp = __par_backend_hetero::__repacked_tuple_t<_result_type>; + + auto __keep_src_data = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator>(); + auto __buf_src_data = __keep_src_data(__first, __last); + + const __counting_iterator_t __counting_it_first{0}, __counting_it_last{__n}; + auto __keep_counting_it = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, __counting_iterator_t>(); + auto __buf_counting_it = __keep_counting_it(__counting_it_first, __counting_it_last); + + auto res = + oneapi::dpl::__par_backend_hetero::__parallel_transform_reduce<_RepackedTp, std::true_type /*is_commutative*/>( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), + __reduce_op, _Functor{__transform_op}, + unseq_backend::__init_value<_RepackedTp>{__init}, // initial value + oneapi::dpl::__ranges::make_zip_view(__buf_src_data.all_view(), __buf_counting_it.all_view())) + .get(); + + return std::get<0>(res); } //------------------------------------------------------------------------ From f2a0344ff9c67973f550b86839ae2be5c14c3ed1 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 7 Jun 2024 10:56:05 +0200 Subject: [PATCH 03/28] include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h - implementation of __pattern_find_if on __parallel_transform_reduce Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/algorithm_impl_hetero.h | 40 +++++++++++++++---- 1 file changed, 33 insertions(+), 7 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h index 54a9c14c52b..4f693e68494 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h @@ -769,16 +769,42 @@ _Iterator __pattern_find_if(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterator __first, _Iterator __last, _Pred __pred) { - if (__first == __last) + using _difference_type = typename ::std::iterator_traits<_Iterator>::difference_type; + + const _difference_type __n = __last - __first; + if (__n == 0) return __last; - using _Predicate = oneapi::dpl::unseq_backend::single_match_pred<_ExecutionPolicy, _Pred>; + using _result_type = oneapi::dpl::__internal::tuple; + const auto __init = _result_type{false, __n}; - return __par_backend_hetero::__parallel_find( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::read>(__first), - __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::read>(__last), _Predicate{__pred}, - ::std::true_type{}); + // __counting_iterator_t - iterate position (index) in source data + using __counting_iterator_t = oneapi::dpl::counting_iterator<_difference_type>; + + using _zipped_data_type = typename std::iterator_traits::value_type; + + __find_if_binary_reduce_op<_zipped_data_type, /*_IsFirst*/ std::true_type> __reduce_op; + __find_if_unary_transform_op<_zipped_data_type, _Pred> __transform_op{__pred}; + + using _Functor = unseq_backend::walk_n<_ExecutionPolicy, decltype(__transform_op)>; + using _RepackedTp = __par_backend_hetero::__repacked_tuple_t<_result_type>; + + auto __keep_src_data = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator>(); + auto __buf_src_data = __keep_src_data(__first, __last); + + const __counting_iterator_t __counting_it_first{0}, __counting_it_last{__n}; + auto __keep_counting_it = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, __counting_iterator_t>(); + auto __buf_counting_it = __keep_counting_it(__counting_it_first, __counting_it_last); + + auto res = + oneapi::dpl::__par_backend_hetero::__parallel_transform_reduce<_RepackedTp, std::true_type /*is_commutative*/>( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), + __reduce_op, _Functor{__transform_op}, + unseq_backend::__init_value<_RepackedTp>{__init}, // initial value + oneapi::dpl::__ranges::make_zip_view(__buf_src_data.all_view(), __buf_counting_it.all_view())) + .get(); + + return std::get<0>(res) ? __first + std::get<1>(res) : __last; } //------------------------------------------------------------------------ From d1bd11803cdc8e2101abedfc2077afc826e25996 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 10 Jun 2024 12:35:59 +0200 Subject: [PATCH 04/28] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h - remove as unused anymoreinclude/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h - performance optimization of __parallel_find_or + __device_backend_tag for the usage with __parallel_or_tag --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 220 ++++++++++++++---- 1 file changed, 170 insertions(+), 50 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 3ab717db63f..ba49c403c80 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -997,25 +997,27 @@ struct __parallel_find_backward_tag // Tag for __parallel_find_or for or-semantic struct __parallel_or_tag { - class __atomic_compare + using _AtomicType = unsigned int; + + static constexpr _AtomicType __found_state = 1; + static constexpr _AtomicType __not_found_state = 0; + + struct __compare_state { - public: - template bool - operator()(const _LocalAtomic& __found_local, const _GlobalAtomic& __found) const + operator()(const _AtomicType __found_local, const _AtomicType __found) const { - return __found_local == 1 && __found == 0; + return __found_local == __found_state && __found == __not_found_state; } }; - using _AtomicType = int32_t; - using _Compare = __atomic_compare; + using _Compare = __compare_state; // The template parameter is intended to unify __init_value in tags. template constexpr static _AtomicType __init_value(_DiffType) { - return 0; + return __not_found_state; } }; @@ -1028,55 +1030,91 @@ struct __early_exit_find_or { _Pred __pred; + // operator() overload for __parallel_or_tag + template + void + operator()(const _NDItemId __item_id, const _IterSize __n_iter, const _WgSize __wg_size, _Compare __comp, + _FoundLocalState& __found_local, __parallel_or_tag, _Ranges&&... __rngs) const + { + const auto __n = oneapi::dpl::__ranges::__get_first_range_size(__rngs...); + + std::size_t __shift = 16; + const std::size_t __local_idx = __item_id.get_local_id(0); + const std::size_t __group_idx = __item_id.get_group(0); + + // each work_item processes N_ELEMENTS with step SHIFT + const std::size_t __leader = (__local_idx / __shift) * __shift; + const std::size_t __init_index = + __group_idx * __wg_size * __n_iter + __leader * __n_iter + __local_idx % __shift; + + // if our "line" is out of work group size, reduce the line to the number of the rest elements + if (__wg_size - __leader < __shift) + __shift = __wg_size - __leader; + for (_IterSize __i = 0; __i < __n_iter; ++__i) + { + // Point #B1 - not required to have _ShiftedIdxType + + _IterSize __current_iter = __i; + // Point #B2 - not required + + // Point #B3 - rewritten + const auto __shifted_idx = __init_index + __current_iter * __shift; + // Point #B4 - rewritten + if (__shifted_idx < __n && __pred(__shifted_idx, __rngs...)) + { + __found_local = __parallel_or_tag::__found_state; + + // Doesn't make sense to continue if we found the element + break; + } + } + } + + // operator() overload for __parallel_find_forward_tag and for __parallel_find_backward_tag template void operator()(const _NDItemId __item_id, const _IterSize __n_iter, const _WgSize __wg_size, _Compare __comp, _LocalAtomic& __found_local, _BrickTag, _Ranges&&... __rngs) const { - using __par_backend_hetero::__parallel_or_tag; - using _OrTagType = ::std::is_same<_BrickTag, __par_backend_hetero::__parallel_or_tag>; - using _BackwardTagType = ::std::is_same; + using _BackwardTagType = std::is_same; - auto __n = oneapi::dpl::__ranges::__get_first_range_size(__rngs...); + const auto __n = oneapi::dpl::__ranges::__get_first_range_size(__rngs...); - ::std::size_t __shift = 16; - ::std::size_t __local_idx = __item_id.get_local_id(0); - ::std::size_t __group_idx = __item_id.get_group(0); + std::size_t __shift = 16; + const std::size_t __local_idx = __item_id.get_local_id(0); + const std::size_t __group_idx = __item_id.get_group(0); // each work_item processes N_ELEMENTS with step SHIFT - ::std::size_t __leader = (__local_idx / __shift) * __shift; - ::std::size_t __init_index = __group_idx * __wg_size * __n_iter + __leader * __n_iter + __local_idx % __shift; + const std::size_t __leader = (__local_idx / __shift) * __shift; + const std::size_t __init_index = + __group_idx * __wg_size * __n_iter + __leader * __n_iter + __local_idx % __shift; // if our "line" is out of work group size, reduce the line to the number of the rest elements if (__wg_size - __leader < __shift) __shift = __wg_size - __leader; for (_IterSize __i = 0; __i < __n_iter; ++__i) { + // Point #B1 //in case of find-semantic __shifted_idx must be the same type as the atomic for a correct comparison - using _ShiftedIdxType = ::std::conditional_t<_OrTagType::value, decltype(__init_index + __i * __shift), - decltype(__found_local.load())>; + using _ShiftedIdxType = decltype(__found_local.load()); _IterSize __current_iter = __i; + // Point #B2 if constexpr (_BackwardTagType::value) __current_iter = __n_iter - 1 - __i; - _ShiftedIdxType __shifted_idx = __init_index + __current_iter * __shift; + // Point #B3 + const _ShiftedIdxType __shifted_idx = __init_index + __current_iter * __shift; + // Point #B4 // TODO:[Performance] the issue with atomic load (in comparison with __shifted_idx for early exit) // should be investigated later, with other HW if (__shifted_idx < __n && __pred(__shifted_idx, __rngs...)) { - if constexpr (_OrTagType::value) + for (auto __old = __found_local.load(); __comp(__shifted_idx, __old); __old = __found_local.load()) { - __found_local.store(1); - break; - } - else - { - for (auto __old = __found_local.load(); __comp(__shifted_idx, __old); __old = __found_local.load()) - { - __found_local.compare_exchange_strong(__old, __shifted_idx); - } + __found_local.compare_exchange_strong(__old, __shifted_idx); } } } @@ -1087,11 +1125,95 @@ struct __early_exit_find_or // parallel_find_or - sync pattern //------------------------------------------------------------------------ -// Base pattern for __parallel_or and __parallel_find. The execution depends on tag type _BrickTag. -template -::std::conditional_t< - ::std::is_same_v<_BrickTag, __parallel_or_tag>, bool, - oneapi::dpl::__internal::__difference_t::type>> +// Specialization for __parallel_or_tag +template +bool +__parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Brick __f, + __parallel_or_tag /*__brick_tag*/, _Ranges&&... __rngs) +{ + using _BrickTag = __parallel_or_tag; + + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + using _AtomicType = typename _BrickTag::_AtomicType; + using _FindOrKernel = + oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator<__find_or_kernel, _CustomName, _Brick, + _BrickTag, _Ranges...>; + + auto __rng_n = oneapi::dpl::__ranges::__get_first_range_size(__rngs...); + assert(__rng_n > 0); + + // TODO: find a way to generalize getting of reliable work-group size + auto __wgroup_size = oneapi::dpl::__internal::__max_work_group_size(__exec); +#if _ONEDPL_COMPILE_KERNEL + auto __kernel = __internal::__kernel_compiler<_FindOrKernel>::__compile(__exec); + __wgroup_size = ::std::min(__wgroup_size, oneapi::dpl::__internal::__kernel_work_group_size(__exec, __kernel)); +#endif + auto __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec); + + auto __n_groups = (__rng_n - 1) / __wgroup_size + 1; + // TODO: try to change __n_groups with another formula for more perfect load balancing + __n_groups = ::std::min(__n_groups, decltype(__n_groups)(__max_cu)); + + auto __n_iter = (__rng_n - 1) / (__n_groups * __wgroup_size) + 1; + + _PRINT_INFO_IN_DEBUG_MODE(__exec, __wgroup_size, __max_cu); + + const _AtomicType __init_value = _BrickTag::__init_value(__rng_n); + _AtomicType __result = __init_value; + + const auto __pred = oneapi::dpl::__par_backend_hetero::__early_exit_find_or<_ExecutionPolicy, _Brick>{__f}; + + // scope is to copy data back to __result after destruction of temporary sycl:buffer + { + auto __temp = sycl::buffer<_AtomicType, 1>(&__result, 1); // temporary storage for global atomic + + // main parallel_for + __exec.queue().submit([&](sycl::handler& __cgh) { + oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); + auto __temp_acc = __temp.template get_access(__cgh); + +#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT + __cgh.use_kernel_bundle(__kernel.get_kernel_bundle()); +#endif + __cgh.parallel_for<_FindOrKernel>( +#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT + __kernel, +#endif + sycl::nd_range(sycl::range(__n_groups * __wgroup_size), + sycl::range(__wgroup_size)), + [=](sycl::nd_item __item_id) { + + __dpl_sycl::__atomic_ref<_AtomicType, sycl::access::address_space::global_space> __found( + *__dpl_sycl::__get_accessor_ptr(__temp_acc)); + // Point #A1 - not required + + // Point #A2 - rewritten + _AtomicType __found_local = __init_value; + // Point #A2.1 - not required + + // Point #A3 - rewritten + constexpr auto __comp = typename _BrickTag::_Compare{}; + __pred(__item_id, __n_iter, __wgroup_size, __comp, __found_local, __parallel_or_tag{}, __rngs...); + // Point #A3.1 - not required + + // Point #A4 - rewritten + // Set found state result to global atomic + if (__found_local != __init_value) + { + __found.fetch_or(__found_local); + } + }); + }); + //The end of the scope - a point of synchronization (on temporary sycl buffer destruction) + } + + return __result != __init_value; +} + +// Specialization for __parallel_find_forward_tag, __parallel_find_backward_tag +template +oneapi::dpl::__internal::__difference_t::type> __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Brick __f, _BrickTag __brick_tag, _Ranges&&... __rngs) { @@ -1101,7 +1223,7 @@ __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPoli oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator<__find_or_kernel, _CustomName, _Brick, _BrickTag, _Ranges...>; - constexpr bool __or_tag_check = ::std::is_same_v<_BrickTag, __parallel_or_tag>; + static_assert(!std::is_same_v<_BrickTag, __parallel_or_tag>); auto __rng_n = oneapi::dpl::__ranges::__get_first_range_size(__rngs...); assert(__rng_n > 0); @@ -1124,7 +1246,7 @@ __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPoli _AtomicType __init_value = _BrickTag::__init_value(__rng_n); auto __result = __init_value; - auto __pred = oneapi::dpl::__par_backend_hetero::__early_exit_find_or<_ExecutionPolicy, _Brick>{__f}; + const auto __pred = oneapi::dpl::__par_backend_hetero::__early_exit_find_or<_ExecutionPolicy, _Brick>{__f}; // scope is to copy data back to __result after destruction of temporary sycl:buffer { @@ -1147,35 +1269,36 @@ __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPoli sycl::nd_range(sycl::range(__n_groups * __wgroup_size), sycl::range(__wgroup_size)), [=](sycl::nd_item __item_id) { - auto __local_idx = __item_id.get_local_id(0); + const auto __local_idx = __item_id.get_local_id(0); __dpl_sycl::__atomic_ref<_AtomicType, sycl::access::address_space::global_space> __found( *__dpl_sycl::__get_accessor_ptr(__temp_acc)); + // Point #A1 __dpl_sycl::__atomic_ref<_AtomicType, sycl::access::address_space::local_space> __found_local( *__dpl_sycl::__get_accessor_ptr(__temp_local)); + // Point #A2 // 1. Set initial value to local atomic if (__local_idx == 0) __found_local.store(__init_value); + // Point #A2.1 __dpl_sycl::__group_barrier(__item_id); + // Point #A3 // 2. Find any element that satisfies pred and set local atomic value to global atomic constexpr auto __comp = typename _BrickTag::_Compare{}; __pred(__item_id, __n_iter, __wgroup_size, __comp, __found_local, __brick_tag, __rngs...); + // Point #A3.1 __dpl_sycl::__group_barrier(__item_id); + // Point #A4 // Set local atomic value to global atomic if (__local_idx == 0 && __comp(__found_local.load(), __found.load())) { - if constexpr (__or_tag_check) - __found.store(1); - else + for (auto __old = __found.load(); __comp(__found_local.load(), __old); + __old = __found.load()) { - for (auto __old = __found.load(); __comp(__found_local.load(), __old); - __old = __found.load()) - { - __found.compare_exchange_strong(__old, __found_local.load()); - } + __found.compare_exchange_strong(__old, __found_local.load()); } } }); @@ -1183,10 +1306,7 @@ __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPoli //The end of the scope - a point of synchronization (on temporary sycl buffer destruction) } - if constexpr (__or_tag_check) - return __result; - else - return __result != __init_value ? __result : __rng_n; + return __result != __init_value ? __result : __rng_n; } //------------------------------------------------------------------------ From e958b109f792a71c8968aa1442747522ad8cf15a Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 7 Jun 2024 16:34:52 +0200 Subject: [PATCH 05/28] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h - remove extra call of __comp(__found_local.load(), __found.load()) Signed-off-by: Sergey Kopienko --- include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index ba49c403c80..d94fcaef954 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1293,7 +1293,7 @@ __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPoli // Point #A4 // Set local atomic value to global atomic - if (__local_idx == 0 && __comp(__found_local.load(), __found.load())) + if (__local_idx == 0) { for (auto __old = __found.load(); __comp(__found_local.load(), __old); __old = __found.load()) From 77642abbfdeaaeef9d7cfcbafcdeacd46d87bdaa Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 7 Jun 2024 17:17:24 +0200 Subject: [PATCH 06/28] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h - remove extra auto keyword Signed-off-by: Sergey Kopienko --- include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index d94fcaef954..0e4cbf35b8c 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1165,7 +1165,7 @@ __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPoli // scope is to copy data back to __result after destruction of temporary sycl:buffer { - auto __temp = sycl::buffer<_AtomicType, 1>(&__result, 1); // temporary storage for global atomic + sycl::buffer<_AtomicType, 1> __temp(&__result, 1); // temporary storage for global atomic // main parallel_for __exec.queue().submit([&](sycl::handler& __cgh) { @@ -1250,7 +1250,7 @@ __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPoli // scope is to copy data back to __result after destruction of temporary sycl:buffer { - auto __temp = sycl::buffer<_AtomicType, 1>(&__result, 1); // temporary storage for global atomic + sycl::buffer<_AtomicType, 1> __temp(&__result, 1); // temporary storage for global atomic // main parallel_for __exec.queue().submit([&](sycl::handler& __cgh) { From 86c9ac0e6d74e73f9cff07d10be0f46f17ad2288 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 10 Jun 2024 14:10:06 +0200 Subject: [PATCH 07/28] Apply GitHUB clang fromat Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/algorithm_impl_hetero.h | 24 +++++++++++-------- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 7 ++---- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h index 4f693e68494..ef37e27b376 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h @@ -694,7 +694,8 @@ __pattern_any_of(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterator // __counting_iterator_t - iterate position (index) in source data using __counting_iterator_t = oneapi::dpl::counting_iterator<_difference_type>; - using _zipped_data_type = typename std::iterator_traits::value_type; + using _zipped_data_type = typename std::iterator_traits::value_type; __find_if_binary_reduce_op<_zipped_data_type, /*_IsFirst*/ std::true_type> __reduce_op; __find_if_unary_transform_op<_zipped_data_type, _Pred> __transform_op{__pred}; @@ -702,17 +703,18 @@ __pattern_any_of(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterator using _Functor = unseq_backend::walk_n<_ExecutionPolicy, decltype(__transform_op)>; using _RepackedTp = __par_backend_hetero::__repacked_tuple_t<_result_type>; - auto __keep_src_data = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator>(); + auto __keep_src_data = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator>(); auto __buf_src_data = __keep_src_data(__first, __last); const __counting_iterator_t __counting_it_first{0}, __counting_it_last{__n}; - auto __keep_counting_it = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, __counting_iterator_t>(); + auto __keep_counting_it = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, __counting_iterator_t>(); auto __buf_counting_it = __keep_counting_it(__counting_it_first, __counting_it_last); auto res = oneapi::dpl::__par_backend_hetero::__parallel_transform_reduce<_RepackedTp, std::true_type /*is_commutative*/>( - _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), - __reduce_op, _Functor{__transform_op}, + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), __reduce_op, _Functor{__transform_op}, unseq_backend::__init_value<_RepackedTp>{__init}, // initial value oneapi::dpl::__ranges::make_zip_view(__buf_src_data.all_view(), __buf_counting_it.all_view())) .get(); @@ -781,7 +783,8 @@ __pattern_find_if(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterato // __counting_iterator_t - iterate position (index) in source data using __counting_iterator_t = oneapi::dpl::counting_iterator<_difference_type>; - using _zipped_data_type = typename std::iterator_traits::value_type; + using _zipped_data_type = typename std::iterator_traits::value_type; __find_if_binary_reduce_op<_zipped_data_type, /*_IsFirst*/ std::true_type> __reduce_op; __find_if_unary_transform_op<_zipped_data_type, _Pred> __transform_op{__pred}; @@ -789,17 +792,18 @@ __pattern_find_if(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterato using _Functor = unseq_backend::walk_n<_ExecutionPolicy, decltype(__transform_op)>; using _RepackedTp = __par_backend_hetero::__repacked_tuple_t<_result_type>; - auto __keep_src_data = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator>(); + auto __keep_src_data = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator>(); auto __buf_src_data = __keep_src_data(__first, __last); const __counting_iterator_t __counting_it_first{0}, __counting_it_last{__n}; - auto __keep_counting_it = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, __counting_iterator_t>(); + auto __keep_counting_it = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, __counting_iterator_t>(); auto __buf_counting_it = __keep_counting_it(__counting_it_first, __counting_it_last); auto res = oneapi::dpl::__par_backend_hetero::__parallel_transform_reduce<_RepackedTp, std::true_type /*is_commutative*/>( - _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), - __reduce_op, _Functor{__transform_op}, + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), __reduce_op, _Functor{__transform_op}, unseq_backend::__init_value<_RepackedTp>{__init}, // initial value oneapi::dpl::__ranges::make_zip_view(__buf_src_data.all_view(), __buf_counting_it.all_view())) .get(); diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 0e4cbf35b8c..af959b633e4 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1182,7 +1182,6 @@ __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPoli sycl::nd_range(sycl::range(__n_groups * __wgroup_size), sycl::range(__wgroup_size)), [=](sycl::nd_item __item_id) { - __dpl_sycl::__atomic_ref<_AtomicType, sycl::access::address_space::global_space> __found( *__dpl_sycl::__get_accessor_ptr(__temp_acc)); // Point #A1 - not required @@ -1211,8 +1210,7 @@ __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPoli } // Specialization for __parallel_find_forward_tag, __parallel_find_backward_tag -template +template oneapi::dpl::__internal::__difference_t::type> __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Brick __f, _BrickTag __brick_tag, _Ranges&&... __rngs) @@ -1295,8 +1293,7 @@ __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPoli // Set local atomic value to global atomic if (__local_idx == 0) { - for (auto __old = __found.load(); __comp(__found_local.load(), __old); - __old = __found.load()) + for (auto __old = __found.load(); __comp(__found_local.load(), __old); __old = __found.load()) { __found.compare_exchange_strong(__old, __found_local.load()); } From e96f6ca430fa6477dbe5783edb833dea7ac28887 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 10 Jun 2024 15:26:29 +0200 Subject: [PATCH 08/28] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h - fix self review comment: let's use __brick_tag instead of __parallel_or_tag{} Signed-off-by: Sergey Kopienko --- include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index af959b633e4..35f52a86566 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1129,7 +1129,7 @@ struct __early_exit_find_or template bool __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Brick __f, - __parallel_or_tag /*__brick_tag*/, _Ranges&&... __rngs) + __parallel_or_tag __brick_tag, _Ranges&&... __rngs) { using _BrickTag = __parallel_or_tag; @@ -1192,7 +1192,7 @@ __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPoli // Point #A3 - rewritten constexpr auto __comp = typename _BrickTag::_Compare{}; - __pred(__item_id, __n_iter, __wgroup_size, __comp, __found_local, __parallel_or_tag{}, __rngs...); + __pred(__item_id, __n_iter, __wgroup_size, __comp, __found_local, __brick_tag, __rngs...); // Point #A3.1 - not required // Point #A4 - rewritten From a707cb1553f6c69ad439e1ab89aae28dc0cac099 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 10 Jun 2024 17:42:50 +0200 Subject: [PATCH 09/28] include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h - fix review comment Signed-off-by: Sergey Kopienko --- include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h index ef37e27b376..96acf42a99e 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h @@ -646,24 +646,24 @@ __pattern_count(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterator // any_of //------------------------------------------------------------------------ -template +template struct __find_if_unary_transform_op { _UnaryTransformOp __transform_op; template - _Typle + _Tuple operator()(const Arg& arg) const { return {__transform_op(std::get<0>(arg)), std::get<1>(arg)}; } }; -template +template struct __find_if_binary_reduce_op { - _Typle - operator()(const _Typle& op1, const _Typle& op2) const + _Tuple + operator()(const _Tuple& op1, const _Tuple& op2) const { if (std::get<0>(op1) && std::get<0>(op2)) { From b078d7f59976fd894ef64bd092921f2600c3e0b6 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 10 Jun 2024 17:51:21 +0200 Subject: [PATCH 10/28] include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h - fix error in comments: __typle_type -> __tuple_type Signed-off-by: Sergey Kopienko --- include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h index 2a6145bf182..dce24b51706 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h @@ -609,12 +609,12 @@ struct __copy_by_mask // operator to ::std::tuple, but for some reason this doesn't work(conversion from // ::std::tuple to ::std::tuple fails). What does work is the explicit cast below: // for internal::tuple we define a field that provides a corresponding ::std::tuple - // with matching types. We get this type(see __typle_type definition above) and use it + // with matching types. We get this type(see __tuple_type definition above) and use it // for static cast to explicitly convert internal::tuple -> ::std::tuple. // Now we have the following assignment ::std::tuple = ::std::tuple which works as expected. // NOTE: we only need this explicit conversion when we have internal::tuple and // ::std::tuple as operands, in all the other cases this is not necessary and no conversion - // is performed(i.e. __typle_type is the same type as its operand). + // is performed(i.e. __tuple_type is the same type as its operand). __assigner(static_cast<__tuple_type>(get<0>(__in_acc[__item_idx])), __out_acc[__out_idx]); } } From d04610b983a5607afbdccc0a785854f085e46496 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 10 Jun 2024 18:07:24 +0200 Subject: [PATCH 11/28] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h - fix review comment: remove local variable _IterSize __current_iter Signed-off-by: Sergey Kopienko --- include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 35f52a86566..55ba500b532 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1055,11 +1055,10 @@ struct __early_exit_find_or { // Point #B1 - not required to have _ShiftedIdxType - _IterSize __current_iter = __i; // Point #B2 - not required // Point #B3 - rewritten - const auto __shifted_idx = __init_index + __current_iter * __shift; + const auto __shifted_idx = __init_index + __i * __shift; // Point #B4 - rewritten if (__shifted_idx < __n && __pred(__shifted_idx, __rngs...)) { From d9e7b4d062f8b2fb201009172d925b99f7ec508e Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 11 Jun 2024 10:37:53 +0200 Subject: [PATCH 12/28] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h - remove extra auto keyword Signed-off-by: Sergey Kopienko --- include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 55ba500b532..130edf8744c 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1160,7 +1160,7 @@ __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPoli const _AtomicType __init_value = _BrickTag::__init_value(__rng_n); _AtomicType __result = __init_value; - const auto __pred = oneapi::dpl::__par_backend_hetero::__early_exit_find_or<_ExecutionPolicy, _Brick>{__f}; + const oneapi::dpl::__par_backend_hetero::__early_exit_find_or<_ExecutionPolicy, _Brick> __pred{__f}; // scope is to copy data back to __result after destruction of temporary sycl:buffer { @@ -1243,7 +1243,7 @@ __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPoli _AtomicType __init_value = _BrickTag::__init_value(__rng_n); auto __result = __init_value; - const auto __pred = oneapi::dpl::__par_backend_hetero::__early_exit_find_or<_ExecutionPolicy, _Brick>{__f}; + const oneapi::dpl::__par_backend_hetero::__early_exit_find_or<_ExecutionPolicy, _Brick> __pred{__f}; // scope is to copy data back to __result after destruction of temporary sycl:buffer { From 91f3de3365ac4ae0527109bd4336bb61a89eb836 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 11 Jun 2024 17:22:31 +0200 Subject: [PATCH 13/28] Revert "include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h - fix review comment" This reverts commit b2e73dfb85a5c5c5cfeeb65e018c0fea8267ad1f. --- include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h index 96acf42a99e..ef37e27b376 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h @@ -646,24 +646,24 @@ __pattern_count(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterator // any_of //------------------------------------------------------------------------ -template +template struct __find_if_unary_transform_op { _UnaryTransformOp __transform_op; template - _Tuple + _Typle operator()(const Arg& arg) const { return {__transform_op(std::get<0>(arg)), std::get<1>(arg)}; } }; -template +template struct __find_if_binary_reduce_op { - _Tuple - operator()(const _Tuple& op1, const _Tuple& op2) const + _Typle + operator()(const _Typle& op1, const _Typle& op2) const { if (std::get<0>(op1) && std::get<0>(op2)) { From d0dbb936c10d3a5068498964cc2d8291b8708a63 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 11 Jun 2024 17:23:04 +0200 Subject: [PATCH 14/28] Revert "Apply GitHUB clang fromat" This reverts commit 8c6e80f3347ceead41d4ee033b8e01fb4eb63a75. --- .../dpl/pstl/hetero/algorithm_impl_hetero.h | 24 ++++++++----------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h index ef37e27b376..4f693e68494 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h @@ -694,8 +694,7 @@ __pattern_any_of(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterator // __counting_iterator_t - iterate position (index) in source data using __counting_iterator_t = oneapi::dpl::counting_iterator<_difference_type>; - using _zipped_data_type = typename std::iterator_traits::value_type; + using _zipped_data_type = typename std::iterator_traits::value_type; __find_if_binary_reduce_op<_zipped_data_type, /*_IsFirst*/ std::true_type> __reduce_op; __find_if_unary_transform_op<_zipped_data_type, _Pred> __transform_op{__pred}; @@ -703,18 +702,17 @@ __pattern_any_of(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterator using _Functor = unseq_backend::walk_n<_ExecutionPolicy, decltype(__transform_op)>; using _RepackedTp = __par_backend_hetero::__repacked_tuple_t<_result_type>; - auto __keep_src_data = - oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator>(); + auto __keep_src_data = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator>(); auto __buf_src_data = __keep_src_data(__first, __last); const __counting_iterator_t __counting_it_first{0}, __counting_it_last{__n}; - auto __keep_counting_it = - oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, __counting_iterator_t>(); + auto __keep_counting_it = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, __counting_iterator_t>(); auto __buf_counting_it = __keep_counting_it(__counting_it_first, __counting_it_last); auto res = oneapi::dpl::__par_backend_hetero::__parallel_transform_reduce<_RepackedTp, std::true_type /*is_commutative*/>( - _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), __reduce_op, _Functor{__transform_op}, + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), + __reduce_op, _Functor{__transform_op}, unseq_backend::__init_value<_RepackedTp>{__init}, // initial value oneapi::dpl::__ranges::make_zip_view(__buf_src_data.all_view(), __buf_counting_it.all_view())) .get(); @@ -783,8 +781,7 @@ __pattern_find_if(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterato // __counting_iterator_t - iterate position (index) in source data using __counting_iterator_t = oneapi::dpl::counting_iterator<_difference_type>; - using _zipped_data_type = typename std::iterator_traits::value_type; + using _zipped_data_type = typename std::iterator_traits::value_type; __find_if_binary_reduce_op<_zipped_data_type, /*_IsFirst*/ std::true_type> __reduce_op; __find_if_unary_transform_op<_zipped_data_type, _Pred> __transform_op{__pred}; @@ -792,18 +789,17 @@ __pattern_find_if(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterato using _Functor = unseq_backend::walk_n<_ExecutionPolicy, decltype(__transform_op)>; using _RepackedTp = __par_backend_hetero::__repacked_tuple_t<_result_type>; - auto __keep_src_data = - oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator>(); + auto __keep_src_data = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator>(); auto __buf_src_data = __keep_src_data(__first, __last); const __counting_iterator_t __counting_it_first{0}, __counting_it_last{__n}; - auto __keep_counting_it = - oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, __counting_iterator_t>(); + auto __keep_counting_it = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, __counting_iterator_t>(); auto __buf_counting_it = __keep_counting_it(__counting_it_first, __counting_it_last); auto res = oneapi::dpl::__par_backend_hetero::__parallel_transform_reduce<_RepackedTp, std::true_type /*is_commutative*/>( - _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), __reduce_op, _Functor{__transform_op}, + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), + __reduce_op, _Functor{__transform_op}, unseq_backend::__init_value<_RepackedTp>{__init}, // initial value oneapi::dpl::__ranges::make_zip_view(__buf_src_data.all_view(), __buf_counting_it.all_view())) .get(); From dec2a04f2d1e6bab3d6b6df6285a7ef4e39da0fc Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 11 Jun 2024 17:23:16 +0200 Subject: [PATCH 15/28] Revert "include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h - implementation of __pattern_find_if on __parallel_transform_reduce" This reverts commit f97df31ed2f401c83eb2c0e1a42cdb8b3c3dde4a. --- .../dpl/pstl/hetero/algorithm_impl_hetero.h | 40 ++++--------------- 1 file changed, 7 insertions(+), 33 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h index 4f693e68494..54a9c14c52b 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h @@ -769,42 +769,16 @@ _Iterator __pattern_find_if(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterator __first, _Iterator __last, _Pred __pred) { - using _difference_type = typename ::std::iterator_traits<_Iterator>::difference_type; - - const _difference_type __n = __last - __first; - if (__n == 0) + if (__first == __last) return __last; - using _result_type = oneapi::dpl::__internal::tuple; - const auto __init = _result_type{false, __n}; - - // __counting_iterator_t - iterate position (index) in source data - using __counting_iterator_t = oneapi::dpl::counting_iterator<_difference_type>; + using _Predicate = oneapi::dpl::unseq_backend::single_match_pred<_ExecutionPolicy, _Pred>; - using _zipped_data_type = typename std::iterator_traits::value_type; - - __find_if_binary_reduce_op<_zipped_data_type, /*_IsFirst*/ std::true_type> __reduce_op; - __find_if_unary_transform_op<_zipped_data_type, _Pred> __transform_op{__pred}; - - using _Functor = unseq_backend::walk_n<_ExecutionPolicy, decltype(__transform_op)>; - using _RepackedTp = __par_backend_hetero::__repacked_tuple_t<_result_type>; - - auto __keep_src_data = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator>(); - auto __buf_src_data = __keep_src_data(__first, __last); - - const __counting_iterator_t __counting_it_first{0}, __counting_it_last{__n}; - auto __keep_counting_it = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, __counting_iterator_t>(); - auto __buf_counting_it = __keep_counting_it(__counting_it_first, __counting_it_last); - - auto res = - oneapi::dpl::__par_backend_hetero::__parallel_transform_reduce<_RepackedTp, std::true_type /*is_commutative*/>( - _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), - __reduce_op, _Functor{__transform_op}, - unseq_backend::__init_value<_RepackedTp>{__init}, // initial value - oneapi::dpl::__ranges::make_zip_view(__buf_src_data.all_view(), __buf_counting_it.all_view())) - .get(); - - return std::get<0>(res) ? __first + std::get<1>(res) : __last; + return __par_backend_hetero::__parallel_find( + _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), + __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::read>(__first), + __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::read>(__last), _Predicate{__pred}, + ::std::true_type{}); } //------------------------------------------------------------------------ From c50af52d2101f29771b262b7654f1c0acac7ea92 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 11 Jun 2024 17:23:21 +0200 Subject: [PATCH 16/28] Revert "include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h - implementation of __pattern_any_of on __parallel_transform_reduce" This reverts commit d6598660a6f9ce5908bf935a8cb060cd5225ced4. --- .../dpl/pstl/hetero/algorithm_impl_hetero.h | 41 ++++--------------- 1 file changed, 9 insertions(+), 32 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h index 54a9c14c52b..5c47188cfe3 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h @@ -682,42 +682,19 @@ bool __pattern_any_of(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterator __first, _Iterator __last, _Pred __pred) { - using _difference_type = typename ::std::iterator_traits<_Iterator>::difference_type; - - const _difference_type __n = __last - __first; - if (__n == 0) + if (__first == __last) return false; - using _result_type = oneapi::dpl::__internal::tuple; - const auto __init = _result_type{false, __n}; - - // __counting_iterator_t - iterate position (index) in source data - using __counting_iterator_t = oneapi::dpl::counting_iterator<_difference_type>; - - using _zipped_data_type = typename std::iterator_traits::value_type; - - __find_if_binary_reduce_op<_zipped_data_type, /*_IsFirst*/ std::true_type> __reduce_op; - __find_if_unary_transform_op<_zipped_data_type, _Pred> __transform_op{__pred}; - - using _Functor = unseq_backend::walk_n<_ExecutionPolicy, decltype(__transform_op)>; - using _RepackedTp = __par_backend_hetero::__repacked_tuple_t<_result_type>; - - auto __keep_src_data = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator>(); - auto __buf_src_data = __keep_src_data(__first, __last); - - const __counting_iterator_t __counting_it_first{0}, __counting_it_last{__n}; - auto __keep_counting_it = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, __counting_iterator_t>(); - auto __buf_counting_it = __keep_counting_it(__counting_it_first, __counting_it_last); + using _Predicate = oneapi::dpl::unseq_backend::single_match_pred<_ExecutionPolicy, _Pred>; - auto res = - oneapi::dpl::__par_backend_hetero::__parallel_transform_reduce<_RepackedTp, std::true_type /*is_commutative*/>( - _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), - __reduce_op, _Functor{__transform_op}, - unseq_backend::__init_value<_RepackedTp>{__init}, // initial value - oneapi::dpl::__ranges::make_zip_view(__buf_src_data.all_view(), __buf_counting_it.all_view())) - .get(); + auto __keep = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator>(); + auto __buf = __keep(__first, __last); - return std::get<0>(res); + return oneapi::dpl::__par_backend_hetero::__parallel_find_or( + _BackendTag{}, + __par_backend_hetero::make_wrapped_policy<__par_backend_hetero::__or_policy_wrapper>( + ::std::forward<_ExecutionPolicy>(__exec)), + _Predicate{__pred}, __par_backend_hetero::__parallel_or_tag{}, __buf.all_view()); } //------------------------------------------------------------------------ From d8511057d5a59ef49c31f8e5649102feceb027e2 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 11 Jun 2024 17:23:26 +0200 Subject: [PATCH 17/28] Revert "include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h - new predicates __find_if_unary_transform_op, __find_if_binary_reduce_op" This reverts commit 2d8971435519778bda1969fef1f98ddfc5edc56d. --- .../dpl/pstl/hetero/algorithm_impl_hetero.h | 31 ------------------- 1 file changed, 31 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h index 5c47188cfe3..766b599361f 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h @@ -646,37 +646,6 @@ __pattern_count(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterator // any_of //------------------------------------------------------------------------ -template -struct __find_if_unary_transform_op -{ - _UnaryTransformOp __transform_op; - - template - _Typle - operator()(const Arg& arg) const - { - return {__transform_op(std::get<0>(arg)), std::get<1>(arg)}; - } -}; - -template -struct __find_if_binary_reduce_op -{ - _Typle - operator()(const _Typle& op1, const _Typle& op2) const - { - if (std::get<0>(op1) && std::get<0>(op2)) - { - if constexpr (_IsFirst{}) - return {true, std::min(std::get<1>(op1), std::get<1>(op2))}; - else - return {true, std::max(std::get<1>(op1), std::get<1>(op2))}; - } - - return std::get<0>(op1) ? op1 : op2; - } -}; - template bool __pattern_any_of(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterator __first, _Iterator __last, From c07477875e02d84271a3ab255501449fdf1d2447 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 11 Jun 2024 17:23:55 +0200 Subject: [PATCH 18/28] Revert "include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h - fix error in comments: __typle_type -> __tuple_type" This reverts commit 67eedc13a8e491cf553fdd90dffac4a790ad65e5. --- include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h index dce24b51706..2a6145bf182 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h @@ -609,12 +609,12 @@ struct __copy_by_mask // operator to ::std::tuple, but for some reason this doesn't work(conversion from // ::std::tuple to ::std::tuple fails). What does work is the explicit cast below: // for internal::tuple we define a field that provides a corresponding ::std::tuple - // with matching types. We get this type(see __tuple_type definition above) and use it + // with matching types. We get this type(see __typle_type definition above) and use it // for static cast to explicitly convert internal::tuple -> ::std::tuple. // Now we have the following assignment ::std::tuple = ::std::tuple which works as expected. // NOTE: we only need this explicit conversion when we have internal::tuple and // ::std::tuple as operands, in all the other cases this is not necessary and no conversion - // is performed(i.e. __tuple_type is the same type as its operand). + // is performed(i.e. __typle_type is the same type as its operand). __assigner(static_cast<__tuple_type>(get<0>(__in_acc[__item_idx])), __out_acc[__out_idx]); } } From b92fce341a342ae0e60b274b58cac46ace607a59 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 13 Jun 2024 11:05:29 +0200 Subject: [PATCH 19/28] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h - fix review comment: the __parallel_or_tag overload takes __comp as a parameter but discards it without use. Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 130edf8744c..2765189d4ea 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1002,16 +1002,7 @@ struct __parallel_or_tag static constexpr _AtomicType __found_state = 1; static constexpr _AtomicType __not_found_state = 0; - struct __compare_state - { - bool - operator()(const _AtomicType __found_local, const _AtomicType __found) const - { - return __found_local == __found_state && __found == __not_found_state; - } - }; - - using _Compare = __compare_state; + struct _Compare{}; // The template parameter is intended to unify __init_value in tags. template @@ -1034,7 +1025,7 @@ struct __early_exit_find_or template void - operator()(const _NDItemId __item_id, const _IterSize __n_iter, const _WgSize __wg_size, _Compare __comp, + operator()(const _NDItemId __item_id, const _IterSize __n_iter, const _WgSize __wg_size, _Compare, _FoundLocalState& __found_local, __parallel_or_tag, _Ranges&&... __rngs) const { const auto __n = oneapi::dpl::__ranges::__get_first_range_size(__rngs...); From 7ea5ba64381ebb5072676888624ab7a774fcac19 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 13 Jun 2024 14:35:33 +0200 Subject: [PATCH 20/28] Fix review comment: rename __parallel_find_or to __parallel_find_first and __parallel_find_any Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/algorithm_impl_hetero.h | 17 +- .../hetero/algorithm_ranges_impl_hetero.h | 72 ++++--- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 183 +++++++++--------- .../dpl/pstl/hetero/dpcpp/sycl_traits.h | 14 +- 4 files changed, 158 insertions(+), 128 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h index 766b599361f..15d5232cee8 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h @@ -572,9 +572,9 @@ __pattern_adjacent_find(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _I // TODO: in case of conflicting names // __par_backend_hetero::make_wrapped_policy<__par_backend_hetero::__or_policy_wrapper>() - bool result = __par_backend_hetero::__parallel_find_or( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - _Predicate{adjacent_find_fn<_BinaryPredicate>{__predicate}}, __par_backend_hetero::__parallel_or_tag{}, + bool result = __par_backend_hetero::__parallel_find_any( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), + _Predicate{adjacent_find_fn<_BinaryPredicate>{__predicate}}, oneapi::dpl::__ranges::make_zip_view(__buf1.all_view(), __buf2.all_view())); // inverted conditional because of @@ -659,11 +659,11 @@ __pattern_any_of(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterator auto __keep = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator>(); auto __buf = __keep(__first, __last); - return oneapi::dpl::__par_backend_hetero::__parallel_find_or( + return oneapi::dpl::__par_backend_hetero::__parallel_find_any( _BackendTag{}, __par_backend_hetero::make_wrapped_policy<__par_backend_hetero::__or_policy_wrapper>( - ::std::forward<_ExecutionPolicy>(__exec)), - _Predicate{__pred}, __par_backend_hetero::__parallel_or_tag{}, __buf.all_view()); + std::forward<_ExecutionPolicy>(__exec)), + _Predicate{__pred}, __buf.all_view()); } //------------------------------------------------------------------------ @@ -687,9 +687,8 @@ __pattern_equal(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterator1 // TODO: in case of conflicting names // __par_backend_hetero::make_wrapped_policy<__par_backend_hetero::__or_policy_wrapper>() - return !__par_backend_hetero::__parallel_find_or( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), _Predicate{equal_predicate<_Pred>{__pred}}, - __par_backend_hetero::__parallel_or_tag{}, + return !__par_backend_hetero::__parallel_find_any( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), _Predicate{equal_predicate<_Pred>{__pred}}, oneapi::dpl::__ranges::make_zip_view(__buf1.all_view(), __buf2.all_view())); } diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index 33587374b3b..fcae7b50ff6 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -105,9 +105,8 @@ __pattern_equal(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range1&& // TODO: in case of conflicting names // __par_backend_hetero::make_wrapped_policy<__par_backend_hetero::__or_policy_wrapper>() - return !oneapi::dpl::__par_backend_hetero::__parallel_find_or( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), _Predicate{equal_predicate<_Pred>{__pred}}, - oneapi::dpl::__par_backend_hetero::__parallel_or_tag{}, + return !oneapi::dpl::__par_backend_hetero::__parallel_find_any( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), _Predicate{equal_predicate<_Pred>{__pred}}, oneapi::dpl::__ranges::zip_view(::std::forward<_Range1>(__rng1), ::std::forward<_Range2>(__rng2))); } @@ -126,10 +125,10 @@ __pattern_find_if(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range&& using _Predicate = oneapi::dpl::unseq_backend::single_match_pred<_ExecutionPolicy, _Pred>; using _TagType = oneapi::dpl::__par_backend_hetero::__parallel_find_forward_tag<_Range>; - return oneapi::dpl::__par_backend_hetero::__parallel_find_or( + return oneapi::dpl::__par_backend_hetero::__parallel_find_first( _BackendTag{}, __par_backend_hetero::make_wrapped_policy<__par_backend_hetero::__find_policy_wrapper>( - ::std::forward<_ExecutionPolicy>(__exec)), + std::forward<_ExecutionPolicy>(__exec)), _Predicate{__pred}, _TagType{}, ::std::forward<_Range>(__rng)); } @@ -156,10 +155,10 @@ __pattern_find_end(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _ using _Predicate = unseq_backend::multiple_match_pred<_ExecutionPolicy, _Pred>; using _TagType = __par_backend_hetero::__parallel_find_backward_tag<_Range1>; - return oneapi::dpl::__par_backend_hetero::__parallel_find_or( + return oneapi::dpl::__par_backend_hetero::__parallel_find_first( _BackendTag{}, __par_backend_hetero::make_wrapped_policy<__par_backend_hetero::__find_policy_wrapper>( - ::std::forward<_ExecutionPolicy>(__exec)), + std::forward<_ExecutionPolicy>(__exec)), _Predicate{__pred}, _TagType{}, ::std::forward<_Range1>(__rng1), ::std::forward<_Range2>(__rng2)); } @@ -180,11 +179,11 @@ __pattern_find_first_of(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _R using _TagType = oneapi::dpl::__par_backend_hetero::__parallel_find_forward_tag<_Range1>; //TODO: To check whether it makes sense to iterate over the second sequence in case of __rng1.size() < __rng2.size() - return oneapi::dpl::__par_backend_hetero::__parallel_find_or( + return oneapi::dpl::__par_backend_hetero::__parallel_find_first( _BackendTag{}, __par_backend_hetero::make_wrapped_policy<__par_backend_hetero::__find_policy_wrapper>( - ::std::forward<_ExecutionPolicy>(__exec)), - _Predicate{__pred}, _TagType{}, ::std::forward<_Range1>(__rng1), ::std::forward<_Range2>(__rng2)); + std::forward<_ExecutionPolicy>(__exec)), + _Predicate{__pred}, _TagType{}, std::forward<_Range1>(__rng1), ::std::forward<_Range2>(__rng2)); } //------------------------------------------------------------------------ @@ -199,11 +198,11 @@ __pattern_any_of(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range&& return false; using _Predicate = oneapi::dpl::unseq_backend::single_match_pred<_ExecutionPolicy, _Pred>; - return oneapi::dpl::__par_backend_hetero::__parallel_find_or( + return oneapi::dpl::__par_backend_hetero::__parallel_find_any( _BackendTag{}, __par_backend_hetero::make_wrapped_policy( - ::std::forward<_ExecutionPolicy>(__exec)), - _Predicate{__pred}, oneapi::dpl::__par_backend_hetero::__parallel_or_tag{}, ::std::forward<_Range>(__rng)); + std::forward<_ExecutionPolicy>(__exec)), + _Predicate{__pred}, std::forward<_Range>(__rng)); } //------------------------------------------------------------------------ @@ -237,11 +236,11 @@ __pattern_search(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _Ra using _Predicate = unseq_backend::multiple_match_pred<_ExecutionPolicy, _Pred>; using _TagType = oneapi::dpl::__par_backend_hetero::__parallel_find_forward_tag<_Range1>; - return oneapi::dpl::__par_backend_hetero::__parallel_find_or( + return oneapi::dpl::__par_backend_hetero::__parallel_find_first( _BackendTag{}, oneapi::dpl::__par_backend_hetero::make_wrapped_policy< - oneapi::dpl::__par_backend_hetero::__find_policy_wrapper>(::std::forward<_ExecutionPolicy>(__exec)), - _Predicate{__pred}, _TagType{}, ::std::forward<_Range1>(__rng1), ::std::forward<_Range2>(__rng2)); + oneapi::dpl::__par_backend_hetero::__find_policy_wrapper>(std::forward<_ExecutionPolicy>(__exec)), + _Predicate{__pred}, _TagType{}, std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2)); } //------------------------------------------------------------------------ @@ -292,22 +291,37 @@ __pattern_adjacent_find(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _R using _Predicate = oneapi::dpl::unseq_backend::single_match_pred<_ExecutionPolicy, adjacent_find_fn<_BinaryPredicate>>; - using _TagType = ::std::conditional_t<__is__or_semantic(), oneapi::dpl::__par_backend_hetero::__parallel_or_tag, - oneapi::dpl::__par_backend_hetero::__parallel_find_forward_tag<_Range>>; - auto __rng1 = __rng | oneapi::dpl::experimental::ranges::views::take(__rng.size() - 1); auto __rng2 = __rng | oneapi::dpl::experimental::ranges::views::drop(1); - // TODO: in case of conflicting names - // __par_backend_hetero::make_wrapped_policy<__par_backend_hetero::__or_policy_wrapper>() - auto result = oneapi::dpl::__par_backend_hetero::__parallel_find_or( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - _Predicate{adjacent_find_fn<_BinaryPredicate>{__predicate}}, _TagType{}, - oneapi::dpl::__ranges::zip_view(__rng1, __rng2)); - - // inverted conditional because of - // reorder_predicate in glue_algorithm_impl.h - return return_value(result, __rng.size(), __is__or_semantic); + if constexpr (__is__or_semantic()) + { + // TODO: in case of conflicting names + // __par_backend_hetero::make_wrapped_policy<__par_backend_hetero::__or_policy_wrapper>() + auto result = oneapi::dpl::__par_backend_hetero::__parallel_find_any( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), + _Predicate{adjacent_find_fn<_BinaryPredicate>{__predicate}}, + oneapi::dpl::__ranges::zip_view(__rng1, __rng2)); + + // inverted conditional because of + // reorder_predicate in glue_algorithm_impl.h + return return_value(result, __rng.size(), __is__or_semantic); + } + else + { + using _TagType = oneapi::dpl::__par_backend_hetero::__parallel_find_forward_tag<_Range>; + + // TODO: in case of conflicting names + // __par_backend_hetero::make_wrapped_policy<__par_backend_hetero::__or_policy_wrapper>() + auto result = oneapi::dpl::__par_backend_hetero::__parallel_find_first( + _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), + _Predicate{adjacent_find_fn<_BinaryPredicate>{__predicate}}, _TagType{}, + oneapi::dpl::__ranges::zip_view(__rng1, __rng2)); + + // inverted conditional because of + // reorder_predicate in glue_algorithm_impl.h + return return_value(result, __rng.size(), __is__or_semantic); + } } template diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 2765189d4ea..ef0c911b146 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -194,7 +194,10 @@ template class __scan_group_kernel; template -class __find_or_kernel; +class __find_any_kernel; + +template +class __find_first_kernel; template class __scan_propagate_kernel; @@ -954,7 +957,7 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, // find_or tags //------------------------------------------------------------------------ -// Tag for __parallel_find_or to find the first element that satisfies predicate +// Tag for __parallel_find_first to find the first element that satisfies predicate template struct __parallel_find_forward_tag { @@ -975,7 +978,7 @@ struct __parallel_find_forward_tag } }; -// Tag for __parallel_find_or to find the last element that satisfies predicate +// Tag for __parallel_find_first to find the last element that satisfies predicate template struct __parallel_find_backward_tag { @@ -994,7 +997,7 @@ struct __parallel_find_backward_tag } }; -// Tag for __parallel_find_or for or-semantic +// Tag for __parallel_find_any for or-semantic struct __parallel_or_tag { using _AtomicType = unsigned int; @@ -1002,8 +1005,6 @@ struct __parallel_or_tag static constexpr _AtomicType __found_state = 1; static constexpr _AtomicType __not_found_state = 0; - struct _Compare{}; - // The template parameter is intended to unify __init_value in tags. template constexpr static _AtomicType __init_value(_DiffType) @@ -1017,16 +1018,14 @@ struct __parallel_or_tag //------------------------------------------------------------------------ template -struct __early_exit_find_or +struct __early_exit_find_any { _Pred __pred; - // operator() overload for __parallel_or_tag - template + template void - operator()(const _NDItemId __item_id, const _IterSize __n_iter, const _WgSize __wg_size, _Compare, - _FoundLocalState& __found_local, __parallel_or_tag, _Ranges&&... __rngs) const + operator()(const _NDItemId __item_id, const _IterSize __n_iter, const _WgSize __wg_size, + _FoundLocalState& __found_local, _Ranges&&... __rngs) const { const auto __n = oneapi::dpl::__ranges::__get_first_range_size(__rngs...); @@ -1060,74 +1059,22 @@ struct __early_exit_find_or } } } - - // operator() overload for __parallel_find_forward_tag and for __parallel_find_backward_tag - template - void - operator()(const _NDItemId __item_id, const _IterSize __n_iter, const _WgSize __wg_size, _Compare __comp, - _LocalAtomic& __found_local, _BrickTag, _Ranges&&... __rngs) const - { - using _BackwardTagType = std::is_same; - - const auto __n = oneapi::dpl::__ranges::__get_first_range_size(__rngs...); - - std::size_t __shift = 16; - const std::size_t __local_idx = __item_id.get_local_id(0); - const std::size_t __group_idx = __item_id.get_group(0); - - // each work_item processes N_ELEMENTS with step SHIFT - const std::size_t __leader = (__local_idx / __shift) * __shift; - const std::size_t __init_index = - __group_idx * __wg_size * __n_iter + __leader * __n_iter + __local_idx % __shift; - - // if our "line" is out of work group size, reduce the line to the number of the rest elements - if (__wg_size - __leader < __shift) - __shift = __wg_size - __leader; - for (_IterSize __i = 0; __i < __n_iter; ++__i) - { - // Point #B1 - //in case of find-semantic __shifted_idx must be the same type as the atomic for a correct comparison - using _ShiftedIdxType = decltype(__found_local.load()); - - _IterSize __current_iter = __i; - // Point #B2 - if constexpr (_BackwardTagType::value) - __current_iter = __n_iter - 1 - __i; - - // Point #B3 - const _ShiftedIdxType __shifted_idx = __init_index + __current_iter * __shift; - // Point #B4 - // TODO:[Performance] the issue with atomic load (in comparison with __shifted_idx for early exit) - // should be investigated later, with other HW - if (__shifted_idx < __n && __pred(__shifted_idx, __rngs...)) - { - for (auto __old = __found_local.load(); __comp(__shifted_idx, __old); __old = __found_local.load()) - { - __found_local.compare_exchange_strong(__old, __shifted_idx); - } - } - } - } }; //------------------------------------------------------------------------ // parallel_find_or - sync pattern //------------------------------------------------------------------------ -// Specialization for __parallel_or_tag template bool -__parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Brick __f, - __parallel_or_tag __brick_tag, _Ranges&&... __rngs) +__parallel_find_any(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Brick __f, + _Ranges&&... __rngs) { - using _BrickTag = __parallel_or_tag; - using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; - using _AtomicType = typename _BrickTag::_AtomicType; + using _AtomicType = typename __parallel_or_tag::_AtomicType; using _FindOrKernel = - oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator<__find_or_kernel, _CustomName, _Brick, - _BrickTag, _Ranges...>; + oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator<__find_any_kernel, _CustomName, _Brick, + _Ranges...>; auto __rng_n = oneapi::dpl::__ranges::__get_first_range_size(__rngs...); assert(__rng_n > 0); @@ -1148,10 +1095,10 @@ __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPoli _PRINT_INFO_IN_DEBUG_MODE(__exec, __wgroup_size, __max_cu); - const _AtomicType __init_value = _BrickTag::__init_value(__rng_n); + const _AtomicType __init_value = __parallel_or_tag::__init_value(__rng_n); _AtomicType __result = __init_value; - const oneapi::dpl::__par_backend_hetero::__early_exit_find_or<_ExecutionPolicy, _Brick> __pred{__f}; + const oneapi::dpl::__par_backend_hetero::__early_exit_find_any<_ExecutionPolicy, _Brick> __pred{__f}; // scope is to copy data back to __result after destruction of temporary sycl:buffer { @@ -1181,8 +1128,7 @@ __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPoli // Point #A2.1 - not required // Point #A3 - rewritten - constexpr auto __comp = typename _BrickTag::_Compare{}; - __pred(__item_id, __n_iter, __wgroup_size, __comp, __found_local, __brick_tag, __rngs...); + __pred(__item_id, __n_iter, __wgroup_size, __found_local, __rngs...); // Point #A3.1 - not required // Point #A4 - rewritten @@ -1199,16 +1145,77 @@ __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPoli return __result != __init_value; } -// Specialization for __parallel_find_forward_tag, __parallel_find_backward_tag +//------------------------------------------------------------------------ +// early_exit (find_entry) +//------------------------------------------------------------------------ + +template +struct __early_exit_find_first +{ + _Pred __pred; + + template + void + operator()(const _NDItemId __item_id, const _IterSize __n_iter, const _WgSize __wg_size, _Compare __comp, + _LocalAtomic& __found_local, _BrickTag, _Ranges&&... __rngs) const + { + using _BackwardTagType = std::is_same; + + const auto __n = oneapi::dpl::__ranges::__get_first_range_size(__rngs...); + + std::size_t __shift = 16; + const std::size_t __local_idx = __item_id.get_local_id(0); + const std::size_t __group_idx = __item_id.get_group(0); + + // each work_item processes N_ELEMENTS with step SHIFT + const std::size_t __leader = (__local_idx / __shift) * __shift; + const std::size_t __init_index = + __group_idx * __wg_size * __n_iter + __leader * __n_iter + __local_idx % __shift; + + // if our "line" is out of work group size, reduce the line to the number of the rest elements + if (__wg_size - __leader < __shift) + __shift = __wg_size - __leader; + for (_IterSize __i = 0; __i < __n_iter; ++__i) + { + // Point #B1 + //in case of find-semantic __shifted_idx must be the same type as the atomic for a correct comparison + using _ShiftedIdxType = decltype(__found_local.load()); + + _IterSize __current_iter = __i; + // Point #B2 + if constexpr (_BackwardTagType::value) + __current_iter = __n_iter - 1 - __i; + + // Point #B3 + const _ShiftedIdxType __shifted_idx = __init_index + __current_iter * __shift; + // Point #B4 + // TODO:[Performance] the issue with atomic load (in comparison with __shifted_idx for early exit) + // should be investigated later, with other HW + if (__shifted_idx < __n && __pred(__shifted_idx, __rngs...)) + { + for (auto __old = __found_local.load(); __comp(__shifted_idx, __old); __old = __found_local.load()) + { + __found_local.compare_exchange_strong(__old, __shifted_idx); + } + } + } + } +}; + +//------------------------------------------------------------------------ +// parallel_find_entry - sync pattern +//------------------------------------------------------------------------ + template oneapi::dpl::__internal::__difference_t::type> -__parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Brick __f, - _BrickTag __brick_tag, _Ranges&&... __rngs) +__parallel_find_first(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Brick __f, + _BrickTag __brick_tag, _Ranges&&... __rngs) { using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; using _AtomicType = typename _BrickTag::_AtomicType; using _FindOrKernel = - oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator<__find_or_kernel, _CustomName, _Brick, + oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator<__find_first_kernel, _CustomName, _Brick, _BrickTag, _Ranges...>; static_assert(!std::is_same_v<_BrickTag, __parallel_or_tag>); @@ -1234,7 +1241,7 @@ __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPoli _AtomicType __init_value = _BrickTag::__init_value(__rng_n); auto __result = __init_value; - const oneapi::dpl::__par_backend_hetero::__early_exit_find_or<_ExecutionPolicy, _Brick> __pred{__f}; + const oneapi::dpl::__par_backend_hetero::__early_exit_find_first<_ExecutionPolicy, _Brick> __pred{__f}; // scope is to copy data back to __result after destruction of temporary sycl:buffer { @@ -1315,10 +1322,10 @@ __parallel_or(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _Exec auto __s_keep = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator2>(); auto __s_buf = __s_keep(__s_first, __s_last); - return oneapi::dpl::__par_backend_hetero::__parallel_find_or( + return oneapi::dpl::__par_backend_hetero::__parallel_find_any( __backend_tag, __par_backend_hetero::make_wrapped_policy<__or_policy_wrapper>(::std::forward<_ExecutionPolicy>(__exec)), __f, - __parallel_or_tag{}, __buf.all_view(), __s_buf.all_view()); + __buf.all_view(), __s_buf.all_view()); } // Special overload for single sequence cases. @@ -1332,10 +1339,10 @@ __parallel_or(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _Exec auto __keep = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator>(); auto __buf = __keep(__first, __last); - return oneapi::dpl::__par_backend_hetero::__parallel_find_or( + return oneapi::dpl::__par_backend_hetero::__parallel_find_any( __backend_tag, __par_backend_hetero::make_wrapped_policy<__or_policy_wrapper>(::std::forward<_ExecutionPolicy>(__exec)), __f, - __parallel_or_tag{}, __buf.all_view()); + __buf.all_view()); } //------------------------------------------------------------------------ @@ -1357,9 +1364,9 @@ __parallel_find(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _Ex auto __s_keep = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator2>(); auto __s_buf = __s_keep(__s_first, __s_last); - using _TagType = ::std::conditional_t<_IsFirst::value, __parallel_find_forward_tag, - __parallel_find_backward_tag>; - return __first + oneapi::dpl::__par_backend_hetero::__parallel_find_or( + using _TagType = std::conditional_t<_IsFirst::value, __parallel_find_forward_tag, + __parallel_find_backward_tag>; + return __first + oneapi::dpl::__par_backend_hetero::__parallel_find_first( __backend_tag, __par_backend_hetero::make_wrapped_policy<__find_policy_wrapper>( ::std::forward<_ExecutionPolicy>(__exec)), @@ -1377,9 +1384,9 @@ __parallel_find(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _Ex auto __keep = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator>(); auto __buf = __keep(__first, __last); - using _TagType = ::std::conditional_t<_IsFirst::value, __parallel_find_forward_tag, - __parallel_find_backward_tag>; - return __first + oneapi::dpl::__par_backend_hetero::__parallel_find_or( + using _TagType = std::conditional_t<_IsFirst::value, __parallel_find_forward_tag, + __parallel_find_backward_tag>; + return __first + oneapi::dpl::__par_backend_hetero::__parallel_find_first( __backend_tag, __par_backend_hetero::make_wrapped_policy<__find_policy_wrapper>( ::std::forward<_ExecutionPolicy>(__exec)), diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h index 2144c454864..5dd22ee9d4e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h @@ -234,12 +234,22 @@ namespace oneapi::dpl::__par_backend_hetero { template -struct __early_exit_find_or; +struct __early_exit_find_first; + +template +struct __early_exit_find_any; } // namespace oneapi::dpl::__par_backend_hetero template -struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__early_exit_find_or, +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__early_exit_find_first, + _ExecutionPolicy, _Pred)> + : oneapi::dpl::__internal::__are_all_device_copyable<_Pred> +{ +}; + +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__early_exit_find_any, _ExecutionPolicy, _Pred)> : oneapi::dpl::__internal::__are_all_device_copyable<_Pred> { From b9c26a82c3539036296b19787d9db882d2491132 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 13 Jun 2024 14:47:40 +0200 Subject: [PATCH 21/28] Fix review comment: rename __parallel_find_or to __parallel_find_first and __parallel_find_any Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index ef0c911b146..2dc64a1b324 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1072,7 +1072,7 @@ __parallel_find_any(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPol { using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; using _AtomicType = typename __parallel_or_tag::_AtomicType; - using _FindOrKernel = + using _FindAnyKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator<__find_any_kernel, _CustomName, _Brick, _Ranges...>; @@ -1082,7 +1082,7 @@ __parallel_find_any(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPol // TODO: find a way to generalize getting of reliable work-group size auto __wgroup_size = oneapi::dpl::__internal::__max_work_group_size(__exec); #if _ONEDPL_COMPILE_KERNEL - auto __kernel = __internal::__kernel_compiler<_FindOrKernel>::__compile(__exec); + auto __kernel = __internal::__kernel_compiler<_FindAnyKernel>::__compile(__exec); __wgroup_size = ::std::min(__wgroup_size, oneapi::dpl::__internal::__kernel_work_group_size(__exec, __kernel)); #endif auto __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec); @@ -1112,7 +1112,7 @@ __parallel_find_any(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPol #if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT __cgh.use_kernel_bundle(__kernel.get_kernel_bundle()); #endif - __cgh.parallel_for<_FindOrKernel>( + __cgh.parallel_for<_FindAnyKernel>( #if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT __kernel, #endif @@ -1214,7 +1214,7 @@ __parallel_find_first(oneapi::dpl::__internal::__device_backend_tag, _ExecutionP { using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; using _AtomicType = typename _BrickTag::_AtomicType; - using _FindOrKernel = + using _FindFirstKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator<__find_first_kernel, _CustomName, _Brick, _BrickTag, _Ranges...>; @@ -1225,7 +1225,7 @@ __parallel_find_first(oneapi::dpl::__internal::__device_backend_tag, _ExecutionP // TODO: find a way to generalize getting of reliable work-group size auto __wgroup_size = oneapi::dpl::__internal::__max_work_group_size(__exec); #if _ONEDPL_COMPILE_KERNEL - auto __kernel = __internal::__kernel_compiler<_FindOrKernel>::__compile(__exec); + auto __kernel = __internal::__kernel_compiler<_FindFirstKernel>::__compile(__exec); __wgroup_size = ::std::min(__wgroup_size, oneapi::dpl::__internal::__kernel_work_group_size(__exec, __kernel)); #endif auto __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec); @@ -1257,7 +1257,7 @@ __parallel_find_first(oneapi::dpl::__internal::__device_backend_tag, _ExecutionP #if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT __cgh.use_kernel_bundle(__kernel.get_kernel_bundle()); #endif - __cgh.parallel_for<_FindOrKernel>( + __cgh.parallel_for<_FindFirstKernel>( #if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT __kernel, #endif From 225d5f656aeeedbfd834d02cd0472fa4b2b6d942 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 13 Jun 2024 15:14:14 +0200 Subject: [PATCH 22/28] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h - remove extra comments "Point #..." Signed-off-by: Sergey Kopienko --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 24 ++----------------- 1 file changed, 2 insertions(+), 22 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 2dc64a1b324..07bbb7dca69 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1043,13 +1043,8 @@ struct __early_exit_find_any __shift = __wg_size - __leader; for (_IterSize __i = 0; __i < __n_iter; ++__i) { - // Point #B1 - not required to have _ShiftedIdxType - - // Point #B2 - not required - - // Point #B3 - rewritten const auto __shifted_idx = __init_index + __i * __shift; - // Point #B4 - rewritten + if (__shifted_idx < __n && __pred(__shifted_idx, __rngs...)) { __found_local = __parallel_or_tag::__found_state; @@ -1121,17 +1116,11 @@ __parallel_find_any(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPol [=](sycl::nd_item __item_id) { __dpl_sycl::__atomic_ref<_AtomicType, sycl::access::address_space::global_space> __found( *__dpl_sycl::__get_accessor_ptr(__temp_acc)); - // Point #A1 - not required - // Point #A2 - rewritten _AtomicType __found_local = __init_value; - // Point #A2.1 - not required - // Point #A3 - rewritten __pred(__item_id, __n_iter, __wgroup_size, __found_local, __rngs...); - // Point #A3.1 - not required - // Point #A4 - rewritten // Set found state result to global atomic if (__found_local != __init_value) { @@ -1178,18 +1167,15 @@ struct __early_exit_find_first __shift = __wg_size - __leader; for (_IterSize __i = 0; __i < __n_iter; ++__i) { - // Point #B1 //in case of find-semantic __shifted_idx must be the same type as the atomic for a correct comparison using _ShiftedIdxType = decltype(__found_local.load()); _IterSize __current_iter = __i; - // Point #B2 if constexpr (_BackwardTagType::value) __current_iter = __n_iter - 1 - __i; - // Point #B3 const _ShiftedIdxType __shifted_idx = __init_index + __current_iter * __shift; - // Point #B4 + // TODO:[Performance] the issue with atomic load (in comparison with __shifted_idx for early exit) // should be investigated later, with other HW if (__shifted_idx < __n && __pred(__shifted_idx, __rngs...)) @@ -1268,25 +1254,19 @@ __parallel_find_first(oneapi::dpl::__internal::__device_backend_tag, _ExecutionP __dpl_sycl::__atomic_ref<_AtomicType, sycl::access::address_space::global_space> __found( *__dpl_sycl::__get_accessor_ptr(__temp_acc)); - // Point #A1 __dpl_sycl::__atomic_ref<_AtomicType, sycl::access::address_space::local_space> __found_local( *__dpl_sycl::__get_accessor_ptr(__temp_local)); - // Point #A2 // 1. Set initial value to local atomic if (__local_idx == 0) __found_local.store(__init_value); - // Point #A2.1 __dpl_sycl::__group_barrier(__item_id); - // Point #A3 // 2. Find any element that satisfies pred and set local atomic value to global atomic constexpr auto __comp = typename _BrickTag::_Compare{}; __pred(__item_id, __n_iter, __wgroup_size, __comp, __found_local, __brick_tag, __rngs...); - // Point #A3.1 __dpl_sycl::__group_barrier(__item_id); - // Point #A4 // Set local atomic value to global atomic if (__local_idx == 0) { From d20a4cd89c57f12be981fb39050040b81e426c0c Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 13 Jun 2024 15:16:09 +0200 Subject: [PATCH 23/28] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h - restore store call instead of fetch_or Signed-off-by: Sergey Kopienko --- include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 07bbb7dca69..ebfacb297f7 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1124,7 +1124,7 @@ __parallel_find_any(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPol // Set found state result to global atomic if (__found_local != __init_value) { - __found.fetch_or(__found_local); + __found.store(__found_local); } }); }); From 9a69cc16ac34a2cc0be6223c604772f70aa70a06 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 13 Jun 2024 15:59:55 +0200 Subject: [PATCH 24/28] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h - fix review comment: do not use type name "_AtomicType" for local state variable Signed-off-by: Sergey Kopienko --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 39 +++++++------------ 1 file changed, 15 insertions(+), 24 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index ebfacb297f7..74b31973fcf 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1000,17 +1000,12 @@ struct __parallel_find_backward_tag // Tag for __parallel_find_any for or-semantic struct __parallel_or_tag { - using _AtomicType = unsigned int; + using _LocalStatusType = unsigned int; - static constexpr _AtomicType __found_state = 1; - static constexpr _AtomicType __not_found_state = 0; + static constexpr _LocalStatusType __found_state = 1; + static constexpr _LocalStatusType __not_found_state = 0; - // The template parameter is intended to unify __init_value in tags. - template - constexpr static _AtomicType __init_value(_DiffType) - { - return __not_found_state; - } + using _AtomicType = _LocalStatusType; }; //------------------------------------------------------------------------ @@ -1022,10 +1017,9 @@ struct __early_exit_find_any { _Pred __pred; - template - void - operator()(const _NDItemId __item_id, const _IterSize __n_iter, const _WgSize __wg_size, - _FoundLocalState& __found_local, _Ranges&&... __rngs) const + template + __parallel_or_tag::_LocalStatusType + operator()(const _NDItemId __item_id, const _IterSize __n_iter, const _WgSize __wg_size, _Ranges&&... __rngs) const { const auto __n = oneapi::dpl::__ranges::__get_first_range_size(__rngs...); @@ -1047,12 +1041,11 @@ struct __early_exit_find_any if (__shifted_idx < __n && __pred(__shifted_idx, __rngs...)) { - __found_local = __parallel_or_tag::__found_state; - - // Doesn't make sense to continue if we found the element - break; + return __parallel_or_tag::__found_state; } } + + return __parallel_or_tag::__not_found_state; } }; @@ -1090,8 +1083,7 @@ __parallel_find_any(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPol _PRINT_INFO_IN_DEBUG_MODE(__exec, __wgroup_size, __max_cu); - const _AtomicType __init_value = __parallel_or_tag::__init_value(__rng_n); - _AtomicType __result = __init_value; + _AtomicType __result = __parallel_or_tag::__not_found_state; const oneapi::dpl::__par_backend_hetero::__early_exit_find_any<_ExecutionPolicy, _Brick> __pred{__f}; @@ -1117,12 +1109,11 @@ __parallel_find_any(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPol __dpl_sycl::__atomic_ref<_AtomicType, sycl::access::address_space::global_space> __found( *__dpl_sycl::__get_accessor_ptr(__temp_acc)); - _AtomicType __found_local = __init_value; - - __pred(__item_id, __n_iter, __wgroup_size, __found_local, __rngs...); + const __parallel_or_tag::_LocalStatusType __found_local = + __pred(__item_id, __n_iter, __wgroup_size, __rngs...); // Set found state result to global atomic - if (__found_local != __init_value) + if (__found_local == __parallel_or_tag::__found_state) { __found.store(__found_local); } @@ -1131,7 +1122,7 @@ __parallel_find_any(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPol //The end of the scope - a point of synchronization (on temporary sycl buffer destruction) } - return __result != __init_value; + return __result == __parallel_or_tag::__found_state; } //------------------------------------------------------------------------ From 30fb5a6db9b5368a41b49388b0be571a0f5efa08 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 14 Jun 2024 11:24:18 +0200 Subject: [PATCH 25/28] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h - simplify __parallel_or_tag Signed-off-by: Sergey Kopienko (cherry picked from commit f9948b378f7465d9cc42a3bd1dad6fda29ca2788) --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 24 +++++++------------ 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 74b31973fcf..a8b60849745 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1000,12 +1000,7 @@ struct __parallel_find_backward_tag // Tag for __parallel_find_any for or-semantic struct __parallel_or_tag { - using _LocalStatusType = unsigned int; - - static constexpr _LocalStatusType __found_state = 1; - static constexpr _LocalStatusType __not_found_state = 0; - - using _AtomicType = _LocalStatusType; + using _AtomicType = unsigned int; }; //------------------------------------------------------------------------ @@ -1018,7 +1013,7 @@ struct __early_exit_find_any _Pred __pred; template - __parallel_or_tag::_LocalStatusType + bool operator()(const _NDItemId __item_id, const _IterSize __n_iter, const _WgSize __wg_size, _Ranges&&... __rngs) const { const auto __n = oneapi::dpl::__ranges::__get_first_range_size(__rngs...); @@ -1041,11 +1036,11 @@ struct __early_exit_find_any if (__shifted_idx < __n && __pred(__shifted_idx, __rngs...)) { - return __parallel_or_tag::__found_state; + return true; } } - return __parallel_or_tag::__not_found_state; + return false; } }; @@ -1083,7 +1078,7 @@ __parallel_find_any(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPol _PRINT_INFO_IN_DEBUG_MODE(__exec, __wgroup_size, __max_cu); - _AtomicType __result = __parallel_or_tag::__not_found_state; + _AtomicType __result = 0; const oneapi::dpl::__par_backend_hetero::__early_exit_find_any<_ExecutionPolicy, _Brick> __pred{__f}; @@ -1109,20 +1104,17 @@ __parallel_find_any(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPol __dpl_sycl::__atomic_ref<_AtomicType, sycl::access::address_space::global_space> __found( *__dpl_sycl::__get_accessor_ptr(__temp_acc)); - const __parallel_or_tag::_LocalStatusType __found_local = - __pred(__item_id, __n_iter, __wgroup_size, __rngs...); - // Set found state result to global atomic - if (__found_local == __parallel_or_tag::__found_state) + if (__pred(__item_id, __n_iter, __wgroup_size, __rngs...)) { - __found.store(__found_local); + __found.store(1); } }); }); //The end of the scope - a point of synchronization (on temporary sycl buffer destruction) } - return __result == __parallel_or_tag::__found_state; + return __result != 0; } //------------------------------------------------------------------------ From aa9400798a87905d5a7065ae55d39612b0077f22 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 14 Jun 2024 11:25:22 +0200 Subject: [PATCH 26/28] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h - rename some local variables inside __parallel_find_any Signed-off-by: Sergey Kopienko (cherry picked from commit 315b09168392529e1dd646daecaf6d00f1ccb62e) --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index a8b60849745..e8097548507 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1084,12 +1084,12 @@ __parallel_find_any(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPol // scope is to copy data back to __result after destruction of temporary sycl:buffer { - sycl::buffer<_AtomicType, 1> __temp(&__result, 1); // temporary storage for global atomic + sycl::buffer<_AtomicType, 1> __result_buf(&__result, 1); // temporary storage for global atomic // main parallel_for __exec.queue().submit([&](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); - auto __temp_acc = __temp.template get_access(__cgh); + auto __result_buf_acc = __result_buf.template get_access(__cgh); #if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT __cgh.use_kernel_bundle(__kernel.get_kernel_bundle()); @@ -1102,7 +1102,7 @@ __parallel_find_any(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPol sycl::range(__wgroup_size)), [=](sycl::nd_item __item_id) { __dpl_sycl::__atomic_ref<_AtomicType, sycl::access::address_space::global_space> __found( - *__dpl_sycl::__get_accessor_ptr(__temp_acc)); + *__dpl_sycl::__get_accessor_ptr(__result_buf_acc)); // Set found state result to global atomic if (__pred(__item_id, __n_iter, __wgroup_size, __rngs...)) From 63eb820834469f9c2786d3b66ea7d3e318fe2789 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 14 Jun 2024 11:26:15 +0200 Subject: [PATCH 27/28] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h - rewrite __parallel_find_any on parallel_for_work_group + parallel_for_work_item Signed-off-by: Sergey Kopienko (cherry picked from commit a7c749ff93dab8cef1b128670c08cf1434f8119c) --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 42 +++++++++++-------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index e8097548507..9bf46be386e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1012,15 +1012,14 @@ struct __early_exit_find_any { _Pred __pred; - template + template bool - operator()(const _NDItemId __item_id, const _IterSize __n_iter, const _WgSize __wg_size, _Ranges&&... __rngs) const + operator()(const _GroupID __group_idx, const _ItemID __local_idx, const _IterSize __n_iter, const _WgSize __wg_size, + _Ranges&&... __rngs) const { const auto __n = oneapi::dpl::__ranges::__get_first_range_size(__rngs...); std::size_t __shift = 16; - const std::size_t __local_idx = __item_id.get_local_id(0); - const std::size_t __group_idx = __item_id.get_group(0); // each work_item processes N_ELEMENTS with step SHIFT const std::size_t __leader = (__local_idx / __shift) * __shift; @@ -1091,22 +1090,29 @@ __parallel_find_any(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPol oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); auto __result_buf_acc = __result_buf.template get_access(__cgh); -#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT - __cgh.use_kernel_bundle(__kernel.get_kernel_bundle()); -#endif - __cgh.parallel_for<_FindAnyKernel>( -#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT - __kernel, -#endif - sycl::nd_range(sycl::range(__n_groups * __wgroup_size), - sycl::range(__wgroup_size)), - [=](sycl::nd_item __item_id) { - __dpl_sycl::__atomic_ref<_AtomicType, sycl::access::address_space::global_space> __found( - *__dpl_sycl::__get_accessor_ptr(__result_buf_acc)); + __cgh.parallel_for_work_group<_FindAnyKernel>( + sycl::range(__n_groups), // Number of work groups + sycl::range(__wgroup_size), // The size of each work group + [=](sycl::group __group) { + + bool __found_in_any_item_inside_group = false; + + const std::size_t __group_idx = __group.get_group_id(0); + + // process all work-items in our group + __group.parallel_for_work_item([&](sycl::h_item __item) { - // Set found state result to global atomic - if (__pred(__item_id, __n_iter, __wgroup_size, __rngs...)) + const std::size_t __local_idx = __item.get_local_id(0); + + if (__pred(__group_idx, __local_idx, __n_iter, __wgroup_size, __rngs...)) + __found_in_any_item_inside_group = true; + }); + + if (__found_in_any_item_inside_group) { + __dpl_sycl::__atomic_ref<_AtomicType, sycl::access::address_space::global_space> __found( + *__dpl_sycl::__get_accessor_ptr(__result_buf_acc)); + __found.store(1); } }); From d2581b21fb3c0840348a17df3b4f588bfdfb559e Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 14 Jun 2024 12:50:05 +0200 Subject: [PATCH 28/28] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h - run pred only when __found_in_any_item_inside_group is false Signed-off-by: Sergey Kopienko --- include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 9bf46be386e..33e7c141d9f 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1104,8 +1104,11 @@ __parallel_find_any(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPol const std::size_t __local_idx = __item.get_local_id(0); - if (__pred(__group_idx, __local_idx, __n_iter, __wgroup_size, __rngs...)) + if (!__found_in_any_item_inside_group && + __pred(__group_idx, __local_idx, __n_iter, __wgroup_size, __rngs...)) + { __found_in_any_item_inside_group = true; + } }); if (__found_in_any_item_inside_group)