From cd2dba5ca68011abe4732dffc15f86a3d605b386 Mon Sep 17 00:00:00 2001 From: Julian Miller Date: Mon, 15 Apr 2024 16:27:54 +0200 Subject: [PATCH] Fix merge issue --- .../dpcpp/parallel_backend_sycl_reduce.h | 25 +++++------ .../pstl/hetero/dpcpp/unseq_backend_sycl.h | 41 ++++++++----------- 2 files changed, 31 insertions(+), 35 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h index bec2fbf5b3b..a4d61d41cb2 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h @@ -75,7 +75,7 @@ __work_group_reduce_kernel(const _NDItemId __item_id, const _Size __n, const _Si } __result; // 1. Initialization (transform part). Fill local memory __transform_pattern(__item_id, __n, __iters_per_work_item, /*global_offset*/ (_Size)0, __is_full, - /*__n_groups*/ (_Size)1, __local_mem, __result, __acc...); + /*__n_groups*/ (_Size)1, __result, __acc...); const _Size __n_items = __transform_pattern.output_size(__n, __group_size, __iters_per_work_item); // 2. Reduce within work group using local memory @@ -107,7 +107,7 @@ __device_reduce_kernel(const _NDItemId __item_id, const _Size __n, const _Size _ } __result; // 1. Initialization (transform part). Fill local memory __transform_pattern(__item_id, __n, __iters_per_work_item, /*global_offset*/ (_Size)0, __is_full, __n_groups, - __local_mem, __result, __acc...); + __result, __acc...); const _Size __n_items = __transform_pattern.output_size(__n, __group_size, __iters_per_work_item); // 2. Reduce within work group using local memory @@ -197,8 +197,8 @@ struct __parallel_transform_reduce_device_kernel_submitter<_Tp, _Commutative, _V auto operator()(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, const _Size __n, const _Size __work_group_size, const _Size __iters_per_work_item, _ReduceOp __reduce_op, - _TransformOp __transform_op, __result_and_scratch_storage<_ExecutionPolicy2, _Tp> __scratch_container, - _Ranges&&... __rngs) const + _TransformOp __transform_op, __result_and_scratch_storage<_ExecutionPolicy2, _Tp> __scratch_container, + _Ranges&&... __rngs) const { auto __transform_pattern = unseq_backend::transform_reduce<_ExecutionPolicy, _ReduceOp, _TransformOp, _Tp, _Commutative, _VecSize>{ @@ -220,7 +220,7 @@ struct __parallel_transform_reduce_device_kernel_submitter<_Tp, _Commutative, _V [=](sycl::nd_item<1> __item_id) { auto __temp_ptr = __temp_acc.__get_pointer(); __device_reduce_kernel<_Tp>(__item_id, __n, __iters_per_work_item, __is_full, __n_groups, - __reduce_pattern, __temp_local, __temp_ptr, __rngs...); + __transform_pattern, __reduce_pattern, __temp_local, __temp_ptr, __rngs...); }); }); @@ -238,6 +238,7 @@ struct __parallel_transform_reduce_work_group_kernel_submitter<_Tp, _Commutative __internal::__optional_kernel_name<_KernelName...>> { template auto operator()(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, sycl::event& __reduce_event, const _Size __n, const _Size __work_group_size, const _Size __iters_per_work_item, _ReduceOp __reduce_op, @@ -264,7 +265,8 @@ struct __parallel_transform_reduce_work_group_kernel_submitter<_Tp, _Commutative auto __temp_ptr = __temp_acc.__get_pointer(); auto __res_ptr = __res_acc.__get_pointer(); __work_group_reduce_kernel<_Tp>(__item_id, __n, __iters_per_work_item, __is_full, - __reduce_pattern, __init, __temp_local, __res_ptr, __temp_ptr); + __transform_pattern, __reduce_pattern, __init, __temp_local, + __res_ptr, __temp_ptr); }); }); @@ -294,15 +296,14 @@ __parallel_transform_reduce_mid_impl(oneapi::dpl::__internal::__device_backend_t sycl::event __reduce_event = __parallel_transform_reduce_device_kernel_submitter<_Tp, _Commutative, _VecSize, _ReduceDeviceKernel>()( - __backend_tag, __exec, __n, __reduce_op, __transform_op, __scratch_container, - ::std::forward<_Ranges>(__rngs)...); + __backend_tag, __exec, __n, __work_group_size, __iters_per_work_item_device_kernel, __reduce_op, + __transform_op, __scratch_container, ::std::forward<_Ranges>(__rngs)...); // __n_groups preliminary results from the device kernel. return __parallel_transform_reduce_work_group_kernel_submitter<_Tp, _Commutative, _VecSize, _ReduceWorkGroupKernel>()( __backend_tag, ::std::forward<_ExecutionPolicy>(__exec), __reduce_event, __n_groups, __work_group_size, - __iters_per_work_item_work_group_kernel, __reduce_op, __init, __temp); - __init, __scratch_container); + __iters_per_work_item_work_group_kernel, __reduce_op, __init, __scratch_container); } // General implementation using a tree reduction @@ -389,13 +390,13 @@ struct __parallel_transform_reduce_impl if (__is_first) { __transform_pattern1(__item_id, __n, __iters_per_work_item, /*global_offset*/ (_Size)0, - __temp_local, __result, __rngs...); + __is_full, __n_groups, __result, __rngs...); __n_items = __transform_pattern1.output_size(__n, __work_group_size, __iters_per_work_item); } else { __transform_pattern2(__item_id, __n, __iters_per_work_item, __offset_2, __is_full, - __result, __temp_ptr); + __n_groups, __result, __temp_ptr); __n_items = __transform_pattern2.output_size(__n, __work_group_size, __iters_per_work_item); } // 2. Reduce within work group using local memory diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h index 8dfa2d75cfc..19438614946 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h @@ -49,7 +49,7 @@ inline constexpr bool __can_use_known_identity = template using __has_known_identity = ::std::conditional_t< __can_use_known_identity<_Tp>, -# if _ONEDPL_LIBSYCL_VERSION >= 50200 +# if _ONEDPL_LIBSYCL_VERSION >= 50200 typename ::std::disjunction< __dpl_sycl::__has_known_identity<_BinaryOp, _Tp>, ::std::conjunction<::std::is_arithmetic<_Tp>, @@ -61,22 +61,22 @@ using __has_known_identity = ::std::conditional_t< ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__minimum>, ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__maximum<_Tp>>, ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__maximum>>>>, -# else //_ONEDPL_LIBSYCL_VERSION >= 50200 +# else //_ONEDPL_LIBSYCL_VERSION >= 50200 typename ::std::conjunction< ::std::is_arithmetic<_Tp>, ::std::disjunction<::std::is_same<::std::decay_t<_BinaryOp>, ::std::plus<_Tp>>, ::std::is_same<::std::decay_t<_BinaryOp>, ::std::plus>, ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__plus<_Tp>>, ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__plus>>>, -# endif //_ONEDPL_LIBSYCL_VERSION >= 50200 +# endif //_ONEDPL_LIBSYCL_VERSION >= 50200 ::std::false_type>; // This is for the case of __can_use_known_identity<_Tp>==false -#else //_USE_GROUP_ALGOS && defined(SYCL_IMPLEMENTATION_INTEL) +# else //_USE_GROUP_ALGOS && defined(SYCL_IMPLEMENTATION_INTEL) template using __has_known_identity = std::false_type; -#endif //_USE_GROUP_ALGOS && defined(SYCL_IMPLEMENTATION_INTEL) +# endif //_USE_GROUP_ALGOS && defined(SYCL_IMPLEMENTATION_INTEL) template struct __known_identity_for_plus @@ -209,15 +209,14 @@ struct transform_reduce _Operation1 __binary_op; _Operation2 __unary_op; - template - inline _Res - vectorized_reduction_first(const _Size __start_idx, const _Acc&... __acc) const + template + inline void + vectorized_reduction_first(const _Size __start_idx, _Res& __res, const _Acc&... __acc) const { - _Res __res = __unary_op(__start_idx, __acc...); + new (&__res.__v) _Tp(std::move(__unary_op(__start_idx, __acc...))); _ONEDPL_PRAGMA_UNROLL for (_Size __i = 1; __i < _VecSize; ++__i) - __res = __binary_op(__res, __unary_op(__start_idx + __i, __acc...)); - return __res; + __res.__v = __binary_op(__res.__v, __unary_op(__start_idx + __i, __acc...)); } template @@ -226,7 +225,7 @@ struct transform_reduce { _ONEDPL_PRAGMA_UNROLL for (_Size __i = 0; __i < _VecSize; ++__i) - __res = __binary_op(__res, __unary_op(__start_idx + __i, __acc...)); + __res.__v = __binary_op(__res.__v, __unary_op(__start_idx + __i, __acc...)); } template @@ -236,21 +235,20 @@ struct transform_reduce { const _Size __no_iters = ::std::min(static_cast<_Size>(__adjusted_n - __start_idx), __max_iters); for (_Size __idx = 0; __idx < __no_iters; ++__idx) - __res = __binary_op(__res, __unary_op(__start_idx + __idx, __acc...)); + __res.__v = __binary_op(__res.__v, __unary_op(__start_idx + __idx, __acc...)); } - template + template inline void operator()(const _NDItemId& __item_id, const _Size& __n, const _Size& __iters_per_work_item, - const _Size& __global_offset, const bool __is_full, const _Size __n_groups, const _AccLocal& __local_mem, + const _Size& __global_offset, const bool __is_full, const _Size __n_groups, _Res& __res, const _Acc&... __acc) const { - using _Res = typename _AccLocal::value_type; const _Size __local_idx = __item_id.get_local_id(0); const _Size __global_idx = __item_id.get_global_id(0); if (__iters_per_work_item == 1) { - __local_mem[__local_idx] = __unary_op(__global_idx, __acc...); + new (&__res.__v) _Tp(std::move(__unary_op(__global_idx, __acc...))); return; } const _Size __local_range = __item_id.get_local_range(0); @@ -285,15 +283,14 @@ struct transform_reduce // _VecSize-wide vectorized path (__iters_per_work_item are multiples of _VecSize) if (__is_full_wg) { - _Res __res = vectorized_reduction_first<_Res>(__adjusted_global_id, __acc...); + vectorized_reduction_first(__adjusted_global_id, __res, __acc...); for (_Size __i = 1; __i < __no_vec_ops; ++__i) vectorized_reduction_remainder(__adjusted_global_id + __i * __stride, __res, __acc...); - __local_mem[__local_idx] = __res; } // At least one vector operation else if (__adjusted_global_id + __vec_size_minus_one < __adjusted_n) { - _Res __res = vectorized_reduction_first<_Res>(__adjusted_global_id, __acc...); + vectorized_reduction_first(__adjusted_global_id, __res, __acc...); for (_Size __i = 1; __i < __no_vec_ops; ++__i) { const _Size __base_idx = __adjusted_global_id + __i * __stride; @@ -306,15 +303,13 @@ struct transform_reduce else break; } - __local_mem[__local_idx] = __res; } // Scalar remainder else if (__adjusted_global_id < __adjusted_n) { - _Res __res = __unary_op(__adjusted_global_id, __acc...); + new (&__res.__v) _Tp(std::move(__unary_op(__adjusted_global_id, __acc...))); scalar_reduction_remainder(static_cast<_Size>(__adjusted_global_id + 1), __adjusted_n, static_cast<_Size>(_VecSize - 2), __res, __acc...); - __local_mem[__local_idx] = __res; } }