From 872ffb15e5ec24b20be25b6958ea1c2959c3fdd9 Mon Sep 17 00:00:00 2001 From: MikeDvorskiy Date: Wed, 9 Oct 2024 18:56:47 +0200 Subject: [PATCH 01/10] [oneDPL][ranges] + support sized output range for serial __pattern_copy_if_ranges --- .../oneapi/dpl/pstl/algorithm_ranges_impl.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/include/oneapi/dpl/pstl/algorithm_ranges_impl.h b/include/oneapi/dpl/pstl/algorithm_ranges_impl.h index 55d29a56be8..6f792f172bb 100644 --- a/include/oneapi/dpl/pstl/algorithm_ranges_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_ranges_impl.h @@ -435,7 +435,25 @@ auto __pattern_copy_if_ranges(__serial_tag, _ExecutionPolicy&& __exec, _InRange&& __in_r, _OutRange&& __out_r, _Pred __pred, _Proj __proj) { +#if 1 + using __return_type = std::ranges::copy_if_result, + std::ranges::borrowed_iterator_t<_OutRange>>; + + auto __it_in = std::ranges::begin(__in_r); + auto __it_out = std::ranges::begin(__out_r); + for(; __it_in != std::ranges::end(__in_r) && __it_out != std::ranges::end(__out_r); ++__it_in) + { + if (std::invoke(__pred, std::invoke(__proj, *__it_in))) + { + *__it_out = *__it_in; + ++__it_out; + } + } + + return __return_type{__it_in, __it_out}; +#else return std::ranges::copy_if(std::forward<_InRange>(__in_r), std::ranges::begin(__out_r), __pred, __proj); +#endif } //--------------------------------------------------------------------------------------------------------------------- From 946846853a580006fda05608b347b94ea332b965 Mon Sep 17 00:00:00 2001 From: MikeDvorskiy Date: Tue, 15 Oct 2024 12:09:25 +0200 Subject: [PATCH 02/10] [oneDPL][ranges] + __simd_copy_if_sized_out --- include/oneapi/dpl/pstl/unseq_backend_simd.h | 46 ++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/include/oneapi/dpl/pstl/unseq_backend_simd.h b/include/oneapi/dpl/pstl/unseq_backend_simd.h index 7e454c80268..7b760ccc67b 100644 --- a/include/oneapi/dpl/pstl/unseq_backend_simd.h +++ b/include/oneapi/dpl/pstl/unseq_backend_simd.h @@ -302,6 +302,52 @@ __simd_copy_if(_InputIterator __first, _DifferenceType __n, _OutputIterator __re return __result + __cnt; } +//const _Size __block_size = __lane_size / sizeof(_Tp); +template +std::pair<_DiffTypeIn, _OutputIt> +__simd_copy_if(_InputIt __first, _DiffTypeIn __n, _OutputIt __result, _DiffTypeOut __m, _UnaryPredicate __pred) noexcept +{ + _DiffTypeIn __i = 0; + _DiffTypeOut __cnt = 0; + if(__m >= __n) + { + _ONEDPL_PRAGMA_SIMD + for (__i = 0; __i < __n; ++__i) + { + _ONEDPL_PRAGMA_SIMD_ORDERED_MONOTONIC(__cnt : 1) + if (__pred(__first[__i])) + { + __result[__cnt] = __first[__i]; + ++__cnt; + } + } + } + else // __m < __n + { + _ONEDPL_PRAGMA_SIMD + for (__i = 0; __i < __m; ++__i) + { + _ONEDPL_PRAGMA_SIMD_ORDERED_MONOTONIC(__cnt : 1) + if (__pred(__first[__i])) + { + __result[__cnt] = __first[__i]; + ++__cnt; + } + } + + //process the remaining (__n - __m) elements + for (__i = __m; __i < __n && __cnt < __m; ++__i) + { + if (__pred(__first[__i])) + { + __result[__cnt] = __first[__i]; + ++__cnt; + } + } + } + return {__i, __cnt}; +} + template _DifferenceType __simd_calc_mask_2(_InputIterator __first, _DifferenceType __n, bool* __mask, _BinaryPredicate __pred) noexcept From b374660c21265fa361248aaf6fe63c7082ae4fd7 Mon Sep 17 00:00:00 2001 From: MikeDvorskiy Date: Mon, 14 Oct 2024 12:59:51 +0200 Subject: [PATCH 03/10] [oneDPL][ranges] + __simd_count for sized output; + _ONEDPL_PRAGMA_SIMD_EARLYEXIT_REDUCTION --- include/oneapi/dpl/pstl/onedpl_config.h | 2 ++ include/oneapi/dpl/pstl/unseq_backend_simd.h | 17 +++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/include/oneapi/dpl/pstl/onedpl_config.h b/include/oneapi/dpl/pstl/onedpl_config.h index 05b91087078..668f596bb78 100644 --- a/include/oneapi/dpl/pstl/onedpl_config.h +++ b/include/oneapi/dpl/pstl/onedpl_config.h @@ -162,8 +162,10 @@ # define _ONEDPL_PRAGMA_SIMD_EARLYEXIT _PSTL_PRAGMA_SIMD_EARLYEXIT #elif _ONEDPL_EARLYEXIT_PRESENT # define _ONEDPL_PRAGMA_SIMD_EARLYEXIT _ONEDPL_PRAGMA(omp simd early_exit) +# define _ONEDPL_PRAGMA_SIMD_EARLYEXIT_REDUCTION(PRM) _ONEDPL_PRAGMA(omp simd early_exit reduction(PRM)) #else # define _ONEDPL_PRAGMA_SIMD_EARLYEXIT +# define _ONEDPL_PRAGMA_SIMD_EARLYEXIT_REDUCTION(PRM) #endif #define _ONEDPL_MONOTONIC_PRESENT (__INTEL_COMPILER >= 1800) diff --git a/include/oneapi/dpl/pstl/unseq_backend_simd.h b/include/oneapi/dpl/pstl/unseq_backend_simd.h index 7b760ccc67b..909b48c419b 100644 --- a/include/oneapi/dpl/pstl/unseq_backend_simd.h +++ b/include/oneapi/dpl/pstl/unseq_backend_simd.h @@ -248,6 +248,23 @@ __simd_count(_Index __index, _DifferenceType __n, _Pred __pred) noexcept return __count; } +template +_DifferenceType +__simd_count(_Index __index, _DifferenceType __n, _Bound __m, _Pred __pred) noexcept +{ + _DifferenceType __count = 0; + _ONEDPL_PRAGMA_SIMD_EARLYEXIT_REDUCTION(+ : __count) + for (_DifferenceType __i = 0; __i < __n; ++__i) + { + if(__count >= __m) + break; + if (__pred(*(__index + __i))) + ++__count; + } + + return __count; +} + template _OutputIterator __simd_unique_copy(_InputIterator __first, _DifferenceType __n, _OutputIterator __result, From f4bb7fed089573438795843ebe2c41bb23b2e2cb Mon Sep 17 00:00:00 2001 From: MikeDvorskiy Date: Wed, 16 Oct 2024 12:20:46 +0200 Subject: [PATCH 04/10] [oneDPL][ranges] + __brick_calc_mask_1 with output size bound check --- include/oneapi/dpl/pstl/algorithm_impl.h | 31 +++++++++++++++++ include/oneapi/dpl/pstl/unseq_backend_simd.h | 35 ++++++++++---------- 2 files changed, 49 insertions(+), 17 deletions(-) diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index ae9094f721a..d08ae53a521 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -1233,6 +1233,28 @@ __brick_calc_mask_1(_ForwardIterator __first, _ForwardIterator __last, bool* __r return ::std::make_pair(__count_true, __size - __count_true); } +template +::std::pair<_DifferenceType, _DifferenceType> +__brick_calc_mask_1(_ForwardIterator __first, _ForwardIterator __last, _Bound __m, bool* __restrict __mask, _UnaryPredicate __pred, + /*vector=*/::std::false_type) noexcept +{ + auto __count_true = _DifferenceType(0); + auto __size = __last - __first; + + static_assert(__is_random_access_iterator_v<_ForwardIterator>, + "Pattern-brick error. Should be a random access iterator."); + + for (; __first != __last && __count_true < __m; ++__first, ++__mask) + { + *__mask = __pred(*__first); + if (*__mask) + { + ++__count_true; + } + } + return ::std::make_pair(__count_true, __size - __count_true); +} + template ::std::pair<_DifferenceType, _DifferenceType> __brick_calc_mask_1(_RandomAccessIterator __first, _RandomAccessIterator __last, bool* __mask, _UnaryPredicate __pred, @@ -1242,6 +1264,15 @@ __brick_calc_mask_1(_RandomAccessIterator __first, _RandomAccessIterator __last, return ::std::make_pair(__result, (__last - __first) - __result); } +template +::std::pair<_DifferenceType, _DifferenceType> +__brick_calc_mask_1(_RandomAccessIterator __first, _RandomAccessIterator __last, _Bound __m, bool* __mask, _UnaryPredicate __pred, + /*vector=*/::std::true_type) noexcept +{ + auto __result = __unseq_backend::__simd_calc_mask_1(__first, __last - __first, __m, __mask, __pred); + return ::std::make_pair(__result, (__last - __first) - __result); +} + template void __brick_copy_by_mask(_ForwardIterator __first, _ForwardIterator __last, _OutputIterator __result, bool* __mask, diff --git a/include/oneapi/dpl/pstl/unseq_backend_simd.h b/include/oneapi/dpl/pstl/unseq_backend_simd.h index 909b48c419b..9650ed8b85d 100644 --- a/include/oneapi/dpl/pstl/unseq_backend_simd.h +++ b/include/oneapi/dpl/pstl/unseq_backend_simd.h @@ -248,23 +248,6 @@ __simd_count(_Index __index, _DifferenceType __n, _Pred __pred) noexcept return __count; } -template -_DifferenceType -__simd_count(_Index __index, _DifferenceType __n, _Bound __m, _Pred __pred) noexcept -{ - _DifferenceType __count = 0; - _ONEDPL_PRAGMA_SIMD_EARLYEXIT_REDUCTION(+ : __count) - for (_DifferenceType __i = 0; __i < __n; ++__i) - { - if(__count >= __m) - break; - if (__pred(*(__index + __i))) - ++__count; - } - - return __count; -} - template _OutputIterator __simd_unique_copy(_InputIterator __first, _DifferenceType __n, _OutputIterator __result, @@ -395,6 +378,24 @@ __simd_calc_mask_1(_InputIterator __first, _DifferenceType __n, bool* __mask, _U return __count; } +template +_DifferenceType +__simd_calc_mask_1(_InputIterator __first, _DifferenceType __n, _Bound __m, bool* __mask, _UnaryPredicate __pred) noexcept +{ + _DifferenceType __count = 0; + + _ONEDPL_PRAGMA_SIMD_EARLYEXIT_REDUCTION(+ : __count) + for (_DifferenceType __i = 0; __i < __n; ++__i) + { + if(__count >= __m) + break; + + __mask[__i] = __pred(__first[__i]); + __count += __mask[__i]; + } + return __count; +} + template void __simd_copy_by_mask(_InputIterator __first, _DifferenceType __n, _OutputIterator __result, bool* __mask, From a1ce833c03d636d52fb57b1a436e1152cb0e2e27 Mon Sep 17 00:00:00 2001 From: MikeDvorskiy Date: Mon, 18 Nov 2024 17:36:31 +0100 Subject: [PATCH 05/10] [oneDPL][ranges][copy_if] + support sized output range (TBB backend) --- include/oneapi/dpl/pstl/algorithm_impl.h | 126 +++++++++++++++++- .../oneapi/dpl/pstl/algorithm_ranges_impl.h | 6 +- .../oneapi/dpl/pstl/parallel_backend_tbb.h | 47 ++++--- include/oneapi/dpl/pstl/unseq_backend_simd.h | 60 ++++++++- 4 files changed, 210 insertions(+), 29 deletions(-) diff --git a/include/oneapi/dpl/pstl/algorithm_impl.h b/include/oneapi/dpl/pstl/algorithm_impl.h index d08ae53a521..8bd2a1a3644 100644 --- a/include/oneapi/dpl/pstl/algorithm_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_impl.h @@ -1197,6 +1197,25 @@ __brick_copy_if(_ForwardIterator __first, _ForwardIterator __last, _OutputIterat return ::std::copy_if(__first, __last, __result, __pred); } +template +std::pair<_ForwardIterator, _OutputIterator> +__brick_copy_if(_ForwardIterator __first, _ForwardIterator __last, _OutputIterator __result, + typename std::iterator_traits<_OutputIterator>::difference_type __m, _UnaryPredicate __pred, + /*vector=*/::std::false_type) noexcept +{ + for(; __first != __last && __m > 0; ++__first) + { + const auto& __v = *__first; + if(__pred(__v)) + { + *__result = __v; + ++__result; + --__m; + } + } + return {__first, __result}; +} + template _RandomAccessIterator2 __brick_copy_if(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _RandomAccessIterator2 __result, @@ -1206,7 +1225,20 @@ __brick_copy_if(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _ #if (_PSTL_MONOTONIC_PRESENT || _ONEDPL_MONOTONIC_PRESENT) return __unseq_backend::__simd_copy_if(__first, __last - __first, __result, __pred); #else - return ::std::copy_if(__first, __last, __result, __pred); + return __brick_copy_if(__first, __last, __result, __pred, std::false_type{}); +#endif +} + +template +std::pair<_RandomAccessIterator1, _RandomAccessIterator2> +__brick_copy_if(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _RandomAccessIterator2 __result, + typename std::iterator_traits<_RandomAccessIterator2>::difference_type __m, _UnaryPredicate __pred, + /*vector=*/::std::true_type) noexcept +{ +#if (_PSTL_MONOTONIC_PRESENT || _ONEDPL_MONOTONIC_PRESENT) + return __unseq_backend::__simd_copy_if(__first, __last - __first, __result, __m, __pred); +#else + return __brick_copy_if(__first, __last, __result, __m, __pred, std::false_type{}); #endif } @@ -1234,7 +1266,7 @@ __brick_calc_mask_1(_ForwardIterator __first, _ForwardIterator __last, bool* __r } template -::std::pair<_DifferenceType, _DifferenceType> +std::pair<_DifferenceType, _ForwardIterator> __brick_calc_mask_1(_ForwardIterator __first, _ForwardIterator __last, _Bound __m, bool* __restrict __mask, _UnaryPredicate __pred, /*vector=*/::std::false_type) noexcept { @@ -1252,7 +1284,7 @@ __brick_calc_mask_1(_ForwardIterator __first, _ForwardIterator __last, _Bound __ ++__count_true; } } - return ::std::make_pair(__count_true, __size - __count_true); + return {__count_true, __first}; } template @@ -1265,12 +1297,12 @@ __brick_calc_mask_1(_RandomAccessIterator __first, _RandomAccessIterator __last, } template -::std::pair<_DifferenceType, _DifferenceType> +std::pair<_DifferenceType, _RandomAccessIterator> __brick_calc_mask_1(_RandomAccessIterator __first, _RandomAccessIterator __last, _Bound __m, bool* __mask, _UnaryPredicate __pred, /*vector=*/::std::true_type) noexcept { auto __result = __unseq_backend::__simd_calc_mask_1(__first, __last - __first, __m, __mask, __pred); - return ::std::make_pair(__result, (__last - __first) - __result); + return {__result.first, __first + __result.second}; } template @@ -1288,6 +1320,23 @@ __brick_copy_by_mask(_ForwardIterator __first, _ForwardIterator __last, _OutputI } } +template +std::pair<_ForwardIterator, _OutputIterator> +__brick_copy_by_mask(_ForwardIterator __first, _ForwardIterator __last, _OutputIterator __result, _Bound __m, bool* __mask, + _Assigner __assigner, /*vector=*/::std::false_type) noexcept +{ + for (; __first != __last && __m > 0; ++__first, ++__mask) + { + if (*__mask) + { + __assigner(__first, __result); + ++__result; + --__m; + } + } + return {__first, __result}; +} + template void __brick_copy_by_mask(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _RandomAccessIterator2 __result, @@ -1300,6 +1349,18 @@ __brick_copy_by_mask(_RandomAccessIterator1 __first, _RandomAccessIterator1 __la #endif } +template +auto +__brick_copy_by_mask(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _RandomAccessIterator2 __result, + _Bound __m, bool* __restrict __mask, _Assigner __assigner, /*vector=*/::std::true_type) noexcept +{ +#if (_PSTL_MONOTONIC_PRESENT || _ONEDPL_MONOTONIC_PRESENT) + return __unseq_backend::__simd_copy_by_mask(__first, __last - __first, __result, __m, __mask, __assigner); +#else + return __internal::__brick_copy_by_mask(__first, __last, __result, __m, __mask, __assigner, ::std::false_type()); +#endif +} + template void __brick_partition_by_mask(_ForwardIterator __first, _ForwardIterator __last, _OutputIterator1 __out_true, @@ -1343,6 +1404,16 @@ __pattern_copy_if(_Tag, _ExecutionPolicy&&, _ForwardIterator __first, _ForwardIt return __internal::__brick_copy_if(__first, __last, __result, __pred, typename _Tag::__is_vector{}); } +template +std::pair<_ForwardIterator, _OutputIterator> +__pattern_copy_if(_Tag, _ExecutionPolicy&&, _ForwardIterator __first, _ForwardIterator __last, _OutputIterator __result, + _UnaryPredicate __pred, typename std::iterator_traits<_OutputIterator>::difference_type __n_out) noexcept +{ + static_assert(__is_serial_tag_v<_Tag> || __is_parallel_forward_tag_v<_Tag>); + + return __internal::__brick_copy_if(__first, __last, __result, __n_out, __pred, typename _Tag::__is_vector{}); +} + template _RandomAccessIterator2 @@ -1353,6 +1424,7 @@ __pattern_copy_if(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _RandomA typedef typename ::std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType; const _DifferenceType __n = __last - __first; + if (_DifferenceType(1) < __n) { __par_backend::__buffer<_ExecutionPolicy, bool> __mask_buf(__exec, __n); @@ -1380,6 +1452,50 @@ __pattern_copy_if(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _RandomA return __internal::__brick_copy_if(__first, __last, __result, __pred, _IsVector{}); } +template +std::pair<_RandomAccessIterator1, _RandomAccessIterator2> +__pattern_copy_if(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _RandomAccessIterator1 __first, + _RandomAccessIterator1 __last, _RandomAccessIterator2 __result, _UnaryPredicate __pred, + typename std::iterator_traits<_RandomAccessIterator2>::difference_type __n_out) +{ + using __backend_tag = typename __parallel_tag<_IsVector>::__backend_tag; + + typedef typename ::std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType; + const _DifferenceType __n = __last - __first; + + if(__n_out < 0) + __n_out = __n; + + if (_DifferenceType(1) < __n) + { + __par_backend::__buffer<_ExecutionPolicy, bool> __mask_buf(__exec, __n); + return __internal::__except_handler([&__exec, __n, __first, __result, __pred, &__mask_buf, __n_out]() { + bool* __mask = __mask_buf.get(); + _DifferenceType __res_in{}, __res_out{}; + __par_backend::__parallel_strict_scan( + __backend_tag{}, ::std::forward<_ExecutionPolicy>(__exec), __n, _DifferenceType(0), + [=](_DifferenceType __i, _DifferenceType __len) { // Reduce + return __internal::__brick_calc_mask_1<_DifferenceType>(__first + __i, __first + (__i + __len), + __n_out, __mask + __i, __pred, _IsVector{}) + .first; + }, + ::std::plus<_DifferenceType>(), // Combine + [=](_DifferenceType __i, _DifferenceType __len, _DifferenceType __initial, _DifferenceType __len_out) { // Scan + auto res = __internal::__brick_copy_by_mask( + __first + __i, __first + (__i + __len), __result + __initial, __len_out, __mask + __i, + [](_RandomAccessIterator1 __x, _RandomAccessIterator2 __z) { *__z = *__x; }, _IsVector{}); + return std::make_pair(res.first - (__first + __i), res.second - (__result + __initial)); + }, + [&__res_in, &__res_out](auto __total_in, auto __total_out) { __res_in = __total_in; __res_out = __total_out; }, + __n_out); + return std::make_pair(__first + __res_in, __result + __res_out); + }); + } + // trivial sequence - use serial algorithm + return __internal::__brick_copy_if(__first, __last, __result, __n_out, __pred, _IsVector{}); +} + //------------------------------------------------------------------------ // count //------------------------------------------------------------------------ diff --git a/include/oneapi/dpl/pstl/algorithm_ranges_impl.h b/include/oneapi/dpl/pstl/algorithm_ranges_impl.h index 6f792f172bb..c7cb7fa3a41 100644 --- a/include/oneapi/dpl/pstl/algorithm_ranges_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_ranges_impl.h @@ -420,14 +420,14 @@ __pattern_copy_if_ranges(_Tag __tag, _ExecutionPolicy&& __exec, _InRange&& __in_ auto __pred_1 = [__pred, __proj](auto&& __val) { return std::invoke(__pred, std::invoke(__proj, std::forward(__val)));}; - auto __res_idx = oneapi::dpl::__internal::__pattern_copy_if(__tag, std::forward<_ExecutionPolicy>(__exec), + auto __res = oneapi::dpl::__internal::__pattern_copy_if(__tag, std::forward<_ExecutionPolicy>(__exec), std::ranges::begin(__in_r), std::ranges::begin(__in_r) + std::ranges::size(__in_r), - std::ranges::begin(__out_r), __pred_1) - std::ranges::begin(__out_r); + std::ranges::begin(__out_r), __pred_1, std::ranges::size(__out_r)); using __return_type = std::ranges::copy_if_result, std::ranges::borrowed_iterator_t<_OutRange>>; - return __return_type{std::ranges::begin(__in_r) + std::ranges::size(__in_r), std::ranges::begin(__out_r) + __res_idx}; + return __return_type{__res.first, __res.second}; } template diff --git a/include/oneapi/dpl/pstl/parallel_backend_tbb.h b/include/oneapi/dpl/pstl/parallel_backend_tbb.h index 59821e98156..748c4ec614b 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_tbb.h +++ b/include/oneapi/dpl/pstl/parallel_backend_tbb.h @@ -316,25 +316,34 @@ __upsweep(_Index __i, _Index __m, _Index __tilesize, _Tp* __r, _Index __lastsize } } -template -void +template +std::pair<_Index, _Index> __downsweep(_Index __i, _Index __m, _Index __tilesize, _Tp* __r, _Index __lastsize, _Tp __initial, _Cp __combine, - _Sp __scan) + _Sp __scan, _OutBound __n_out) { + std::pair<_Index, _Index> __res{}; if (__m == 1) - __scan(__i * __tilesize, __lastsize, __initial); + { + if(__initial < __n_out) + __res = __scan(__i * __tilesize, __lastsize, __initial, __n_out - __initial); + } else { const _Index __k = __split(__m); + auto __start = __combine(__initial, __r[__k - 1]); + + std::pair<_Index, _Index> __res_1{}, __res_2{}; tbb::parallel_invoke( - [=] { __tbb_backend::__downsweep(__i, __k, __tilesize, __r, __tilesize, __initial, __combine, __scan); }, + [=, &__res_1] { __res_1 = __tbb_backend::__downsweep(__i, __k, __tilesize, __r, __tilesize, __initial, __combine, __scan, __n_out); }, // Assumes that __combine never throws. //TODO: Consider adding a requirement for user functors to be constant. - [=, &__combine] { - __tbb_backend::__downsweep(__i + __k, __m - __k, __tilesize, __r + __k, __lastsize, - __combine(__initial, __r[__k - 1]), __combine, __scan); + [=, &__combine, &__res_2] { + __res_2 = __tbb_backend::__downsweep(__i + __k, __m - __k, __tilesize, __r + __k, __lastsize, + __start, __combine, __scan, __n_out); }); + __res = std::make_pair(__res_1.first + __res_2.first, __res_1.second + __res_2.second); } + return __res; } // Adapted from Intel(R) Cilk(TM) version from cilkpub. @@ -354,8 +363,10 @@ __downsweep(_Index __i, _Index __m, _Index __tilesize, _Tp* __r, _Index __lastsi template void __parallel_strict_scan(oneapi::dpl::__internal::__tbb_backend_tag, _ExecutionPolicy&& __exec, _Index __n, _Tp __initial, - _Rp __reduce, _Cp __combine, _Sp __scan, _Ap __apex) + _Rp __reduce, _Cp __combine, _Sp __scan, _Ap __apex, int __n_out = -1) { + if(__n_out < 0) + __n_out = __n; tbb::this_task_arena::isolate([=, &__combine]() { if (__n > 1) { @@ -376,18 +387,22 @@ __parallel_strict_scan(oneapi::dpl::__internal::__tbb_backend_tag, _ExecutionPol _Tp __t = __r[__k - 1]; while ((__k &= __k - 1)) __t = __combine(__r[__k - 1], __t); - __apex(__combine(__initial, __t)); - __tbb_backend::__downsweep(_Index(0), _Index(__m + 1), __tilesize, __r, __n - __m * __tilesize, __initial, - __combine, __scan); + + auto __res = __tbb_backend::__downsweep(_Index(0), _Index(__m + 1), __tilesize, __r, __n - __m * __tilesize, __initial, + __combine, __scan, __n_out); + __apex(__res.first, __res.second); return; } // Fewer than 2 elements in sequence, or out of memory. Handle has single block. _Tp __sum = __initial; - if (__n) + if (__n && __n_out > 0) __sum = __combine(__sum, __reduce(_Index(0), __n)); - __apex(__sum); - if (__n) - __scan(_Index(0), __n, __initial); + //__apex(__sum); + if (__n && __n_out > 0) + { + auto __res = __scan(_Index(0), __n, __initial, __n_out); + __apex(__res.first, __res.second); + } }); } diff --git a/include/oneapi/dpl/pstl/unseq_backend_simd.h b/include/oneapi/dpl/pstl/unseq_backend_simd.h index 9650ed8b85d..c5b1db856a8 100644 --- a/include/oneapi/dpl/pstl/unseq_backend_simd.h +++ b/include/oneapi/dpl/pstl/unseq_backend_simd.h @@ -304,7 +304,7 @@ __simd_copy_if(_InputIterator __first, _DifferenceType __n, _OutputIterator __re //const _Size __block_size = __lane_size / sizeof(_Tp); template -std::pair<_DiffTypeIn, _OutputIt> +std::pair<_InputIt, _OutputIt> __simd_copy_if(_InputIt __first, _DiffTypeIn __n, _OutputIt __result, _DiffTypeOut __m, _UnaryPredicate __pred) noexcept { _DiffTypeIn __i = 0; @@ -345,7 +345,7 @@ __simd_copy_if(_InputIt __first, _DiffTypeIn __n, _OutputIt __result, _DiffTypeO } } } - return {__i, __cnt}; + return {__first + __i, __result + __cnt}; } template @@ -379,13 +379,14 @@ __simd_calc_mask_1(_InputIterator __first, _DifferenceType __n, bool* __mask, _U } template -_DifferenceType +std::pair<_DifferenceType, _DifferenceType> __simd_calc_mask_1(_InputIterator __first, _DifferenceType __n, _Bound __m, bool* __mask, _UnaryPredicate __pred) noexcept { _DifferenceType __count = 0; + _DifferenceType __i = 0; _ONEDPL_PRAGMA_SIMD_EARLYEXIT_REDUCTION(+ : __count) - for (_DifferenceType __i = 0; __i < __n; ++__i) + for (__i = 0; __i < __n; ++__i) { if(__count >= __m) break; @@ -393,7 +394,7 @@ __simd_calc_mask_1(_InputIterator __first, _DifferenceType __n, _Bound __m, bool __mask[__i] = __pred(__first[__i]); __count += __mask[__i]; } - return __count; + return {__count, __i}; } template @@ -416,6 +417,55 @@ __simd_copy_by_mask(_InputIterator __first, _DifferenceType __n, _OutputIterator } } +template +void +__simd_copy_by_mask(_InputIterator __first, _DifferenceType __n, _OutputIterator __result, _Bound __m, bool* __mask, + _Assigner __assigner) noexcept +{ + _DifferenceType __cnt = 0; + _DifferenceType __i = 0; + if(__m >= __n) + { + _ONEDPL_PRAGMA_SIMD + for (__i = 0; __i < __n; ++__i) + { + if (__mask[__i]) + { + _ONEDPL_PRAGMA_SIMD_ORDERED_MONOTONIC(__cnt : 1) + { + __assigner(__first + __i, __result + __cnt); + ++__cnt; + } + } + } + } + else // __m < __n + { + _ONEDPL_PRAGMA_SIMD + for (__i = 0; __i < __m; ++__i) + { + if (__mask[__i]) + { + _ONEDPL_PRAGMA_SIMD_ORDERED_MONOTONIC(__cnt : 1) + { + __assigner(__first + __i, __result + __cnt); + ++__cnt; + } + } + } + + //process the remaining (__n - __m) elements + for (__i = __m; __i < __n && __cnt < __m; ++__i) + { + if (__mask[__i]) + { + __assigner(__first + __i, __result + __cnt); + ++__cnt; + } + } + } +} + template void __simd_partition_by_mask(_InputIterator __first, _DifferenceType __n, _OutputIterator1 __out_true, From f391e4a53d730ef3114b0f47abc3d73fc3dd59f6 Mon Sep 17 00:00:00 2001 From: MikeDvorskiy Date: Tue, 19 Nov 2024 18:27:08 +0100 Subject: [PATCH 06/10] [oneDPL][ranges][copy_if] removed unused code --- include/oneapi/dpl/pstl/algorithm_ranges_impl.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/include/oneapi/dpl/pstl/algorithm_ranges_impl.h b/include/oneapi/dpl/pstl/algorithm_ranges_impl.h index c7cb7fa3a41..540e58ecd55 100644 --- a/include/oneapi/dpl/pstl/algorithm_ranges_impl.h +++ b/include/oneapi/dpl/pstl/algorithm_ranges_impl.h @@ -435,7 +435,6 @@ auto __pattern_copy_if_ranges(__serial_tag, _ExecutionPolicy&& __exec, _InRange&& __in_r, _OutRange&& __out_r, _Pred __pred, _Proj __proj) { -#if 1 using __return_type = std::ranges::copy_if_result, std::ranges::borrowed_iterator_t<_OutRange>>; @@ -451,9 +450,6 @@ __pattern_copy_if_ranges(__serial_tag, _ExecutionPo } return __return_type{__it_in, __it_out}; -#else - return std::ranges::copy_if(std::forward<_InRange>(__in_r), std::ranges::begin(__out_r), __pred, __proj); -#endif } //--------------------------------------------------------------------------------------------------------------------- From 025a734de0c9de4b626c1c5c1e2c06e760d4184b Mon Sep 17 00:00:00 2001 From: MikeDvorskiy Date: Wed, 20 Nov 2024 12:00:17 +0100 Subject: [PATCH 07/10] [oneDPL][ranges][copy_if] + support sized output range (openMP and serial backend) --- include/oneapi/dpl/pstl/omp/parallel_scan.h | 96 +++++++++++++++++++ .../oneapi/dpl/pstl/parallel_backend_serial.h | 17 ++++ .../oneapi/dpl/pstl/parallel_backend_tbb.h | 74 ++++++++++++-- 3 files changed, 180 insertions(+), 7 deletions(-) diff --git a/include/oneapi/dpl/pstl/omp/parallel_scan.h b/include/oneapi/dpl/pstl/omp/parallel_scan.h index c3bc022cb2e..d4abe26aff0 100644 --- a/include/oneapi/dpl/pstl/omp/parallel_scan.h +++ b/include/oneapi/dpl/pstl/omp/parallel_scan.h @@ -79,6 +79,37 @@ __downsweep(_Index __i, _Index __m, _Index __tilesize, _Tp* __r, _Index __lastsi } } +template +std::pair<_Index, _Index> +__downsweep(_Index __i, _Index __m, _Index __tilesize, _Tp* __r, _Index __lastsize, _Tp __initial, _Cp __combine, + _Sp __scan, _Index __n_out) +{ + std::pair<_Index, _Index> __res{}; + if (__m == 1) + { + if(__initial < __n_out) + __scan(__i * __tilesize, __lastsize, __initial, __n_out - __initial); + } + else + { + const _Index __k = __split(__m); + std::pair<_Index, _Index> __res_1{}, __res_2{}; + oneapi::dpl::__omp_backend::__parallel_invoke_body( + [=, &__res_1] { + __res_1 = oneapi::dpl::__omp_backend::__downsweep(__i, __k, __tilesize, __r, __tilesize, __initial, __combine, + __scan, __n_out); + }, + // Assumes that __combine never throws. + // TODO: Consider adding a requirement for user functors to be constant. + [=, &__combine, &__res_2] { + __res_2 = oneapi::dpl::__omp_backend::__downsweep(__i + __k, __m - __k, __tilesize, __r + __k, __lastsize, + __combine(__initial, __r[__k - 1]), __combine, __scan, __n_out); + }); + __res = std::make_pair(__res_1.first + __res_2.first, __res_1.second + __res_2.second); + } + return __res; +} + template void @@ -107,6 +138,35 @@ __parallel_strict_scan_body(_ExecutionPolicy&& __exec, _Index __n, _Tp __initial __initial, __combine, __scan); } + +template +void +__parallel_strict_scan_body(_ExecutionPolicy&& __exec, _Index __n, _Tp __initial, _Rp __reduce, _Cp __combine, + _Sp __scan, _Ap __apex, _Index __n_out) +{ + _Index __p = omp_get_num_threads(); + const _Index __slack = 4; + _Index __tilesize = (__n - 1) / (__slack * __p) + 1; + _Index __m = (__n - 1) / __tilesize; + __buffer<_ExecutionPolicy, _Tp> __buf(::std::forward<_ExecutionPolicy>(__exec), __m + 1); + _Tp* __r = __buf.get(); + + oneapi::dpl::__omp_backend::__upsweep(_Index(0), _Index(__m + 1), __tilesize, __r, __n - __m * __tilesize, __reduce, + __combine); + + std::size_t __k = __m + 1; + _Tp __t = __r[__k - 1]; + while ((__k &= __k - 1)) + { + __t = __combine(__r[__k - 1], __t); + } + + auto __res = oneapi::dpl::__omp_backend::__downsweep(_Index(0), _Index(__m + 1), __tilesize, __r, __n - __m * __tilesize, + __initial, __combine, __scan, __n_out); + __apex(__res.first, __res.second); +} + template void __parallel_strict_scan(oneapi::dpl::__internal::__omp_backend_tag, _ExecutionPolicy&& __exec, _Index __n, _Tp __initial, @@ -143,6 +203,42 @@ __parallel_strict_scan(oneapi::dpl::__internal::__omp_backend_tag, _ExecutionPol } } +template +void +__parallel_strict_scan(oneapi::dpl::__internal::__omp_backend_tag, _ExecutionPolicy&& __exec, _Index __n, _Tp __initial, + _Rp __reduce, _Cp __combine, _Sp __scan, _Ap __apex, _Index __n_out) +{ + if(__n_out == 0) + return; + else if(__n_out < 0) + __n_out = __n; + + if (__n <= __default_chunk_size) + { + if (__n) + { + auto __res = __scan(_Index(0), __n, __initial, __n_out); + __apex(__res.first, __res.second); + } + return; + } + + if (omp_in_parallel()) + { + oneapi::dpl::__omp_backend::__parallel_strict_scan_body(::std::forward<_ExecutionPolicy>(__exec), __n, + __initial, __reduce, __combine, __scan, __apex, __n_out); + } + else + { + _PSTL_PRAGMA(omp parallel) + _PSTL_PRAGMA(omp single nowait) + { + oneapi::dpl::__omp_backend::__parallel_strict_scan_body(::std::forward<_ExecutionPolicy>(__exec), __n, + __initial, __reduce, __combine, __scan, __apex, __n_out); + } + } +} + } // namespace __omp_backend } // namespace dpl } // namespace oneapi diff --git a/include/oneapi/dpl/pstl/parallel_backend_serial.h b/include/oneapi/dpl/pstl/parallel_backend_serial.h index 6acd4b617f9..50ab0a2cd01 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_serial.h +++ b/include/oneapi/dpl/pstl/parallel_backend_serial.h @@ -86,6 +86,23 @@ __parallel_strict_scan(oneapi::dpl::__internal::__serial_backend_tag, _Execution __scan(_Index(0), __n, __initial); } +template +void +__parallel_strict_scan(oneapi::dpl::__internal::__serial_backend_tag, _ExecutionPolicy&&, _Index __n, _Tp __initial, + _Rp __reduce, _Cp __combine, _Sp __scan, _Ap __apex, _Index __n_out) +{ + if(__n_out == 0) + return; + else if(__n_out < 0) + __n_out = __n; + + if (__n) + { + auto __res = __scan(_Index(0), __n, __initial, __n_out); + __apex(__res.first, __res.second); + } +} + template _Tp __parallel_transform_scan(oneapi::dpl::__internal::__serial_backend_tag, _ExecutionPolicy&&, _Index __n, _UnaryOp, diff --git a/include/oneapi/dpl/pstl/parallel_backend_tbb.h b/include/oneapi/dpl/pstl/parallel_backend_tbb.h index 748c4ec614b..e23b166bd6b 100644 --- a/include/oneapi/dpl/pstl/parallel_backend_tbb.h +++ b/include/oneapi/dpl/pstl/parallel_backend_tbb.h @@ -316,6 +316,27 @@ __upsweep(_Index __i, _Index __m, _Index __tilesize, _Tp* __r, _Index __lastsize } } +template +void +__downsweep(_Index __i, _Index __m, _Index __tilesize, _Tp* __r, _Index __lastsize, _Tp __initial, _Cp __combine, + _Sp __scan) +{ + if (__m == 1) + __scan(__i * __tilesize, __lastsize, __initial); + else + { + const _Index __k = __split(__m); + tbb::parallel_invoke( + [=] { __tbb_backend::__downsweep(__i, __k, __tilesize, __r, __tilesize, __initial, __combine, __scan); }, + // Assumes that __combine never throws. + //TODO: Consider adding a requirement for user functors to be constant. + [=, &__combine] { + __tbb_backend::__downsweep(__i + __k, __m - __k, __tilesize, __r + __k, __lastsize, + __combine(__initial, __r[__k - 1]), __combine, __scan); + }); + } +} + template std::pair<_Index, _Index> __downsweep(_Index __i, _Index __m, _Index __tilesize, _Tp* __r, _Index __lastsize, _Tp __initial, _Cp __combine, @@ -353,6 +374,7 @@ __downsweep(_Index __i, _Index __m, _Index __tilesize, _Tp* __r, _Index __lastsi // combine(s1,s2) -> s -- return merged sum // apex(s) -- do any processing necessary between reduce and scan. // scan(i,len,initial) -- perform scan over i:len starting with initial. +// [n_out -- limit for output range] // The initial range 0:n is partitioned into consecutive subranges. // reduce and scan are each called exactly once per subrange. // Thus callers can rely upon side effects in reduce. @@ -363,9 +385,51 @@ __downsweep(_Index __i, _Index __m, _Index __tilesize, _Tp* __r, _Index __lastsi template void __parallel_strict_scan(oneapi::dpl::__internal::__tbb_backend_tag, _ExecutionPolicy&& __exec, _Index __n, _Tp __initial, - _Rp __reduce, _Cp __combine, _Sp __scan, _Ap __apex, int __n_out = -1) + _Rp __reduce, _Cp __combine, _Sp __scan, _Ap __apex) +{ + tbb::this_task_arena::isolate([=, &__combine]() { + if (__n > 1) + { + _Index __p = tbb::this_task_arena::max_concurrency(); + const _Index __slack = 4; + _Index __tilesize = (__n - 1) / (__slack * __p) + 1; + _Index __m = (__n - 1) / __tilesize; + __tbb_backend::__buffer<_ExecutionPolicy, _Tp> __buf(__exec, __m + 1); + _Tp* __r = __buf.get(); + __tbb_backend::__upsweep(_Index(0), _Index(__m + 1), __tilesize, __r, __n - __m * __tilesize, __reduce, + __combine); + + // When __apex is a no-op and __combine has no side effects, a good optimizer + // should be able to eliminate all code between here and __apex. + // Alternatively, provide a default value for __apex that can be + // recognized by metaprogramming that conditionlly executes the following. + size_t __k = __m + 1; + _Tp __t = __r[__k - 1]; + while ((__k &= __k - 1)) + __t = __combine(__r[__k - 1], __t); + __apex(__combine(__initial, __t)); + __tbb_backend::__downsweep(_Index(0), _Index(__m + 1), __tilesize, __r, __n - __m * __tilesize, __initial, + __combine, __scan); + return; + } + // Fewer than 2 elements in sequence, or out of memory. Handle has single block. + _Tp __sum = __initial; + if (__n) + __sum = __combine(__sum, __reduce(_Index(0), __n)); + __apex(__sum); + if (__n) + __scan(_Index(0), __n, __initial); + }); +} + +template +void +__parallel_strict_scan(oneapi::dpl::__internal::__tbb_backend_tag, _ExecutionPolicy&& __exec, _Index __n, _Tp __initial, + _Rp __reduce, _Cp __combine, _Sp __scan, _Ap __apex, _Index __n_out) { - if(__n_out < 0) + if(__n_out == 0) + return; + else if(__n_out < 0) __n_out = __n; tbb::this_task_arena::isolate([=, &__combine]() { if (__n > 1) @@ -394,11 +458,7 @@ __parallel_strict_scan(oneapi::dpl::__internal::__tbb_backend_tag, _ExecutionPol return; } // Fewer than 2 elements in sequence, or out of memory. Handle has single block. - _Tp __sum = __initial; - if (__n && __n_out > 0) - __sum = __combine(__sum, __reduce(_Index(0), __n)); - //__apex(__sum); - if (__n && __n_out > 0) + if (__n) { auto __res = __scan(_Index(0), __n, __initial, __n_out); __apex(__res.first, __res.second); From 78e5794010d5a21506d363662b35e493daceb0fb Mon Sep 17 00:00:00 2001 From: MikeDvorskiy Date: Fri, 29 Nov 2024 19:14:43 +0100 Subject: [PATCH 08/10] [oneDPL][ranges][test][copy_if] support size limit for output in a test checker --- .../ranges/std_ranges_copy_if.pass.cpp | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/test/parallel_api/ranges/std_ranges_copy_if.pass.cpp b/test/parallel_api/ranges/std_ranges_copy_if.pass.cpp index 842d9038b52..d343d0d6ac3 100644 --- a/test/parallel_api/ranges/std_ranges_copy_if.pass.cpp +++ b/test/parallel_api/ranges/std_ranges_copy_if.pass.cpp @@ -23,17 +23,25 @@ main() namespace dpl_ranges = oneapi::dpl::ranges; auto copy_if_checker = [](std::ranges::random_access_range auto&& r_in, - std::ranges::random_access_range auto&& r_out, auto&&... args) + std::ranges::random_access_range auto&& r_out, auto pred, auto proj) { - auto res = std::ranges::copy_if(std::forward(r_in), std::ranges::begin(r_out), - std::forward(args)...); - using ret_type = std::ranges::copy_if_result, std::ranges::borrowed_iterator_t>; - return ret_type{res.in, res.out}; + + auto it_in = std::ranges::begin(r_in); + auto it_out = std::ranges::begin(r_out); + for(; it_in != std::ranges::end(r_in) && it_out != std::ranges::end(r_out); ++it_in) + { + if (std::invoke(pred, std::invoke(proj, *it_in))) + { + *it_out = *it_in; + ++it_out; + } + } + return ret_type{it_in, it_out}; }; - test_range_algo<0, int, data_in_out>{big_sz}(dpl_ranges::copy_if, copy_if_checker, pred); + test_range_algo<0, int, data_in_out>{big_sz}(dpl_ranges::copy_if, copy_if_checker, pred, std::identity{}); test_range_algo<1, int, data_in_out>{}(dpl_ranges::copy_if, copy_if_checker, pred, proj); test_range_algo<2, P2, data_in_out>{}(dpl_ranges::copy_if, copy_if_checker, pred, &P2::x); test_range_algo<3, P2, data_in_out>{}(dpl_ranges::copy_if, copy_if_checker, pred, &P2::proj); From 3e86e512693b664403670d98f0516f8090a74010 Mon Sep 17 00:00:00 2001 From: MikeDvorskiy Date: Fri, 13 Dec 2024 15:41:17 +0100 Subject: [PATCH 09/10] [oneDPL][ranges] + support sized output range for copy_if; dpcpp backend --- .../dpl/pstl/hetero/algorithm_impl_hetero.h | 4 +- .../hetero/algorithm_ranges_impl_hetero.h | 21 ++-- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 34 +++++-- .../dpcpp/parallel_backend_sycl_utils.h | 96 ++++++++++++++----- .../pstl/hetero/dpcpp/unseq_backend_sycl.h | 3 +- 5 files changed, 113 insertions(+), 45 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h index 65bf99c8777..29277b8d7db 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h @@ -901,15 +901,13 @@ __pattern_copy_if(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterato if (__first == __last) return __result_first; - _It1DifferenceType __n = __last - __first; - auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator1>(); auto __buf1 = __keep1(__first, __last); auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _Iterator2>(); auto __buf2 = __keep2(__result_first, __result_first + __n); auto __res = __par_backend_hetero::__parallel_copy_if(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - __buf1.all_view(), __buf2.all_view(), __n, __pred); + __buf1.all_view(), __buf2.all_view(), __pred); ::std::size_t __num_copied = __res.get(); //is a blocking call return __result_first + __num_copied; diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index da7820b91a2..93ee410c36a 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -536,19 +536,22 @@ __pattern_count_if(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _ template -oneapi::dpl::__internal::__difference_t<_Range2> +std::pair, oneapi::dpl::__internal::__difference_t<_Range2>> __pattern_copy_if(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Predicate __pred, _Assign __assign) { - oneapi::dpl::__internal::__difference_t<_Range2> __n = __rng1.size(); - if (__n == 0) - return 0; + using _Index = oneapi::dpl::__internal::__difference_t<_Range2>; + _Index __n = __rng1.size(); + if (__n == 0 || __rng2.empty()) + return {0, 0}; - auto __res = oneapi::dpl::__par_backend_hetero::__parallel_copy_if( + auto __res = oneapi::dpl::__par_backend_hetero::__parallel_copy_if_out_lim( _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), - std::forward<_Range2>(__rng2), __n, __pred, __assign); + std::forward<_Range2>(__rng2), __pred, __assign).get(); - return __res.get(); //is a blocking call + std::array<_Index, _2> __idx; + __res.get_values(__idx); //a blocking call + return {__idx[0], __idx[1]; } #if _ONEDPL_CPP20_RANGES_PRESENT @@ -561,7 +564,7 @@ __pattern_copy_if_ranges(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __e auto __pred_1 = [__pred, __proj](auto&& __val) { return std::invoke(__pred, std::invoke(__proj, std::forward(__val)));}; - auto __res_idx = oneapi::dpl::__internal::__ranges::__pattern_copy_if(__tag, + auto __res = oneapi::dpl::__internal::__ranges::__pattern_copy_if(__tag, std::forward<_ExecutionPolicy>(__exec), oneapi::dpl::__ranges::views::all_read(__in_r), oneapi::dpl::__ranges::views::all_write(__out_r), __pred_1, oneapi::dpl::__internal::__pstl_assign()); @@ -569,7 +572,7 @@ __pattern_copy_if_ranges(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __e using __return_t = std::ranges::copy_if_result, std::ranges::borrowed_iterator_t<_OutRange>>; - return __return_t{std::ranges::begin(__in_r) + std::ranges::size(__in_r), std::ranges::begin(__out_r) + __res_idx}; + return __return_t{std::ranges::begin(__in_r) + __res.first, std::ranges::begin(__out_r) + __res.second}; } #endif //_ONEDPL_CPP20_RANGES_PRESENT diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index c7b72625315..995b74bfd78 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1228,11 +1228,11 @@ __parallel_reduce_then_scan_copy(oneapi::dpl::__internal::__device_backend_tag _ /*_Inclusive=*/std::true_type{}, __is_unique_pattern); } -template auto __parallel_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, - _InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _CreateMaskOp __create_mask_op, + _InRng&& __in_rng, _OutRng&& __out_rng, _CreateMaskOp __create_mask_op, _CopyByMaskOp __copy_by_mask_op) { using _ReduceOp = std::plus<_Size>; @@ -1248,7 +1248,7 @@ __parallel_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag _MaskAssigner __add_mask_op; // temporary buffer to store boolean mask - oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, int32_t> __mask_buf(__exec, __n); + oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, int32_t> __mask_buf(__exec, __in_rng.size()); return __parallel_transform_scan_base( __backend_tag, std::forward<_ExecutionPolicy>(__exec), @@ -1299,7 +1299,7 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_t using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign, /*inclusive*/ std::true_type, 1>; return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), - std::forward<_Range2>(__result), __n, + std::forward<_Range2>(__result), _CreateOp{oneapi::dpl::__internal::__not_pred<_BinaryPredicate>{__pred}}, _CopyOp{_ReduceOp{}, _Assign{}}); } @@ -1360,16 +1360,34 @@ __parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backen using _CopyOp = unseq_backend::__partition_by_mask<_ReduceOp, /*inclusive*/ std::true_type>; return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), - std::forward<_Range2>(__result), __n, _CreateOp{__pred}, _CopyOp{_ReduceOp{}}); + std::forward<_Range2>(__result), _CreateOp{__pred}, _CopyOp{_ReduceOp{}}); } } -template +auto +__parallel_copy_if_out_lim(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, + _InRng&& __in_rng, _OutRng&& __out_rng, _Pred __pred, _Assign __assign = _Assign{}) +{ + using _ReduceOp = std::plus<_Size>; + using _CreateOp = unseq_backend::__create_mask<_Pred, _Size>; + using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign, + /*inclusive*/ std::true_type, 1>; + + return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), + std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), + _CreateOp{__pred}, _CopyOp{_ReduceOp{}, __assign}); +} + +template auto __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, - _InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _Pred __pred, _Assign __assign = _Assign{}) + _InRng&& __in_rng, _OutRng&& __out_rng, _Pred __pred, _Assign __assign = _Assign{}) { + auto __n = __in_rng.size(); + using _Size = decltype(__n); using _SingleGroupInvoker = __invoke_single_group_copy_if<_Size>; // Next power of 2 greater than or equal to __n @@ -1413,7 +1431,7 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, /*inclusive*/ std::true_type, 1>; return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), - std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, + std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), _CreateOp{__pred}, _CopyOp{_ReduceOp{}, __assign}); } } diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index 3d9d923d80d..1b487cf90c6 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -665,6 +665,7 @@ struct __result_and_scratch_storage : __result_and_scratch_storage_base _T __get_value(size_t idx = 0) const { + assert(__result_n > 0); assert(idx < __result_n); if (__use_USM_host && __supports_USM_device) { @@ -682,6 +683,26 @@ struct __result_and_scratch_storage : __result_and_scratch_storage_base } } + template + void get_values(std::array<_T, _N>& __arr) + { + assert(__result_n > 0); + assert(_N == __result_n); + if (__use_USM_host && __supports_USM_device) + { + std::copy_n(__result_buf.get(), __result_n, __arr.begin()); + } + else if (__supports_USM_device) + { + __exec.queue().memcpy(__arr.begin(), __scratch_buf.get() + __scratch_n, __result_n * sizeof(_T)).wait(); + } + else + { + auto _acc_h = __sycl_buf->get_host_access(sycl::read_only); + std::copy_n(_acc_h.begin() + __scratch_n, __result_n, __arr.begin()); + } + } + template _T __wait_and_get_value(_Event&& __event, size_t idx = 0) const @@ -691,6 +712,49 @@ struct __result_and_scratch_storage : __result_and_scratch_storage_base return __get_value(idx); } + + template + void + __wait_and_get_value(_Event&& __event, std::array<_T, _N>& __arr) const + { + if (is_USM()) + __event.wait_and_throw(); + + return get_values(__arr); + } +}; + +// The type specifies the polymorphic behaviour for different value types via the overloads +struct __wait_and_get_value +{ + template + constexpr auto + operator()(auto&& /*__event*/, const sycl::buffer<_T>& __buf) + { + return __buf.get_host_access(sycl::read_only)[0]; + } + + template + constexpr auto + operator()(auto&& __event, const __result_and_scratch_storage<_ExecutionPolicy, _T>& __storage) + { + return __storage.__wait_and_get_value(__event); + } + + template + constexpr void + operator()(auto&& __event, const __result_and_scratch_storage<_ExecutionPolicy, _T>& __storage, std::array<_T, _N>& __arr) + { + return __storage.__wait_and_get_value(__event, __arr); + } + + template + constexpr auto + operator()(auto&& __event, const _T& __val) + { + __event.wait_and_throw(); + return __val; + } }; // Tag __async_mode describe a pattern call mode which should be executed asynchronously @@ -714,29 +778,6 @@ class __future : private std::tuple<_Args...> { _Event __my_event; - template - constexpr auto - __wait_and_get_value(const sycl::buffer<_T>& __buf) - { - //according to a contract, returned value is one-element sycl::buffer - return __buf.get_host_access(sycl::read_only)[0]; - } - - template - constexpr auto - __wait_and_get_value(const __result_and_scratch_storage<_ExecutionPolicy, _T>& __storage) - { - return __storage.__wait_and_get_value(__my_event); - } - - template - constexpr auto - __wait_and_get_value(const _T& __val) - { - wait(); - return __val; - } - public: __future(_Event __e, _Args... __args) : std::tuple<_Args...>(__args...), __my_event(__e) {} __future(_Event __e, std::tuple<_Args...> __t) : std::tuple<_Args...>(__t), __my_event(__e) {} @@ -770,13 +811,20 @@ class __future : private std::tuple<_Args...> #endif } + template + std::enable_if_t 0> + get_values(std::array<_T, _N>& __arr) + { + __wait_and_get_value{}(event(), __val, __arr); + } + auto get() { if constexpr (sizeof...(_Args) > 0) { auto& __val = std::get<0>(*this); - return __wait_and_get_value(__val); + return __wait_and_get_value{}(event(), __val); } else wait(); diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h index e0b57260ee0..aabd0fcac27 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h @@ -624,7 +624,8 @@ struct __copy_by_mask // NOTE: we only need this explicit conversion when we have internal::tuple and // ::std::tuple as operands, in all the other cases this is not necessary and no conversion // is performed(i.e. __typle_type is the same type as its operand). - __assigner(static_cast<__tuple_type>(get<0>(__in_acc[__item_idx])), __out_acc[__out_idx]); + if(__out_idx < __out_acc.size()) + __assigner(static_cast<__tuple_type>(get<0>(__in_acc[__item_idx])), __out_acc[__out_idx]); } if (__item_idx == 0) { From 4eddf48c2fe764992d5010b8a7e13e8c5df182aa Mon Sep 17 00:00:00 2001 From: MikeDvorskiy Date: Mon, 16 Dec 2024 14:56:08 +0100 Subject: [PATCH 10/10] [oneDPL][ranges] + support sized output range for copy_if; dpcpp backend, part 2 --- .../dpl/pstl/hetero/algorithm_impl_hetero.h | 1 + .../pstl/hetero/algorithm_ranges_impl_hetero.h | 8 ++++---- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 4 +++- .../hetero/dpcpp/parallel_backend_sycl_utils.h | 16 +++++++++------- .../dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h | 13 ++++++++++++- 5 files changed, 29 insertions(+), 13 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h index 29277b8d7db..a609867dee8 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h @@ -901,6 +901,7 @@ __pattern_copy_if(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterato if (__first == __last) return __result_first; + auto __n = __last - __first; auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator1>(); auto __buf1 = __keep1(__first, __last); auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _Iterator2>(); diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index 93ee410c36a..b03b73c78e7 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -540,18 +540,18 @@ std::pair, oneapi::dpl::__inter __pattern_copy_if(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Predicate __pred, _Assign __assign) { - using _Index = oneapi::dpl::__internal::__difference_t<_Range2>; + using _Index = std::size_t; //TODO _Index __n = __rng1.size(); if (__n == 0 || __rng2.empty()) return {0, 0}; auto __res = oneapi::dpl::__par_backend_hetero::__parallel_copy_if_out_lim( _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), - std::forward<_Range2>(__rng2), __pred, __assign).get(); + std::forward<_Range2>(__rng2), __pred, __assign); - std::array<_Index, _2> __idx; + std::array<_Index, 2> __idx; __res.get_values(__idx); //a blocking call - return {__idx[0], __idx[1]; + return {__idx[1], __idx[0]}; //__parallel_copy_if_out_lim returns {last index in output, last index in input} } #if _ONEDPL_CPP20_RANGES_PRESENT diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 995b74bfd78..6f72994eca1 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -316,7 +316,7 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name // Storage for the results of scan for each workgroup using __result_and_scratch_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _Type>; - __result_and_scratch_storage_t __result_and_scratch{__exec, 1, __n_groups + 1}; + __result_and_scratch_storage_t __result_and_scratch{__exec, 2, __n_groups + 1}; _PRINT_INFO_IN_DEBUG_MODE(__exec, __wgroup_size, __max_cu); @@ -1235,6 +1235,7 @@ __parallel_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag _InRng&& __in_rng, _OutRng&& __out_rng, _CreateMaskOp __create_mask_op, _CopyByMaskOp __copy_by_mask_op) { + using _Size = decltype(__out_rng.size()); using _ReduceOp = std::plus<_Size>; using _Assigner = unseq_backend::__scan_assigner; using _NoAssign = unseq_backend::__scan_no_assign; @@ -1370,6 +1371,7 @@ auto __parallel_copy_if_out_lim(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, _InRng&& __in_rng, _OutRng&& __out_rng, _Pred __pred, _Assign __assign = _Assign{}) { + using _Size = decltype(__out_rng.size()); using _ReduceOp = std::plus<_Size>; using _CreateOp = unseq_backend::__create_mask<_Pred, _Size>; using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign, diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index 1b487cf90c6..c36032617fe 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -22,7 +22,7 @@ #include #include -#include "../../iterator_impl.h" +#include "../../iterator_impl.h" #include "sycl_defs.h" #include "execution_sycl_defs.h" @@ -683,8 +683,8 @@ struct __result_and_scratch_storage : __result_and_scratch_storage_base } } - template - void get_values(std::array<_T, _N>& __arr) + template + void get_values(std::array<_T, _N>& __arr) const { assert(__result_n > 0); assert(_N == __result_n); @@ -713,14 +713,14 @@ struct __result_and_scratch_storage : __result_and_scratch_storage_base return __get_value(idx); } - template + template void __wait_and_get_value(_Event&& __event, std::array<_T, _N>& __arr) const { if (is_USM()) __event.wait_and_throw(); - return get_values(__arr); + get_values(__arr); } }; @@ -745,7 +745,7 @@ struct __wait_and_get_value constexpr void operator()(auto&& __event, const __result_and_scratch_storage<_ExecutionPolicy, _T>& __storage, std::array<_T, _N>& __arr) { - return __storage.__wait_and_get_value(__event, __arr); + __storage.__wait_and_get_value(__event, __arr); } template @@ -812,9 +812,11 @@ class __future : private std::tuple<_Args...> } template - std::enable_if_t 0> + void get_values(std::array<_T, _N>& __arr) { + static_assert(sizeof...(_Args) > 0); + auto& __val = std::get<0>(*this); __wait_and_get_value{}(event(), __val, __arr); } diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h index aabd0fcac27..9755c6affc7 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h @@ -625,12 +625,23 @@ struct __copy_by_mask // ::std::tuple as operands, in all the other cases this is not necessary and no conversion // is performed(i.e. __typle_type is the same type as its operand). if(__out_idx < __out_acc.size()) + { __assigner(static_cast<__tuple_type>(get<0>(__in_acc[__item_idx])), __out_acc[__out_idx]); + auto __last_out_idx = __wg_sums_ptr[(__n - 1) / __size_per_wg]; + if(__out_idx + 1 == __last_out_idx) + { + __ret_ptr[0] = __item_idx + 1, __ret_ptr[1] = __last_out_idx; + } + } + else if(__out_idx == __out_acc.size()) + { + __ret_ptr[0] = __item_idx, __ret_ptr[1] = __out_idx; + } } if (__item_idx == 0) { //copy final result to output - *__ret_ptr = __wg_sums_ptr[(__n - 1) / __size_per_wg]; + __ret_ptr[1] = __wg_sums_ptr[(__n - 1) / __size_per_wg]; } } };