From cd2dba5ca68011abe4732dffc15f86a3d605b386 Mon Sep 17 00:00:00 2001
From: Julian Miller <julian.miller@intel.com>
Date: Mon, 15 Apr 2024 16:27:54 +0200
Subject: [PATCH] Fix merge issue

---
 .../dpcpp/parallel_backend_sycl_reduce.h      | 25 +++++------
 .../pstl/hetero/dpcpp/unseq_backend_sycl.h    | 41 ++++++++-----------
 2 files changed, 31 insertions(+), 35 deletions(-)
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h
index bec2fbf5b3b..a4d61d41cb2 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h
@@ -75,7 +75,7 @@ __work_group_reduce_kernel(const _NDItemId __item_id, const _Size __n, const _Si
     } __result;
     // 1. Initialization (transform part). Fill local memory
     __transform_pattern(__item_id, __n, __iters_per_work_item, /*global_offset*/ (_Size)0, __is_full,
-                        /*__n_groups*/ (_Size)1, __local_mem, __result, __acc...);
+                        /*__n_groups*/ (_Size)1, __result, __acc...);
 
     const _Size __n_items = __transform_pattern.output_size(__n, __group_size, __iters_per_work_item);
     // 2. Reduce within work group using local memory
@@ -107,7 +107,7 @@ __device_reduce_kernel(const _NDItemId __item_id, const _Size __n, const _Size _
     } __result;
     // 1. Initialization (transform part). Fill local memory
     __transform_pattern(__item_id, __n, __iters_per_work_item, /*global_offset*/ (_Size)0, __is_full, __n_groups,
-                        __local_mem, __result, __acc...);
+                        __result, __acc...);
 
     const _Size __n_items = __transform_pattern.output_size(__n, __group_size, __iters_per_work_item);
     // 2. Reduce within work group using local memory
@@ -197,8 +197,8 @@ struct __parallel_transform_reduce_device_kernel_submitter<_Tp, _Commutative, _V
     auto
     operator()(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, const _Size __n,
                const _Size __work_group_size, const _Size __iters_per_work_item, _ReduceOp __reduce_op,
-			   _TransformOp __transform_op, __result_and_scratch_storage<_ExecutionPolicy2, _Tp> __scratch_container,
-			   _Ranges&&... __rngs) const
+               _TransformOp __transform_op, __result_and_scratch_storage<_ExecutionPolicy2, _Tp> __scratch_container,
+               _Ranges&&... __rngs) const
     {
         auto __transform_pattern =
             unseq_backend::transform_reduce<_ExecutionPolicy, _ReduceOp, _TransformOp, _Tp, _Commutative, _VecSize>{
@@ -220,7 +220,7 @@ struct __parallel_transform_reduce_device_kernel_submitter<_Tp, _Commutative, _V
                 [=](sycl::nd_item<1> __item_id) {
                     auto __temp_ptr = __temp_acc.__get_pointer();
                     __device_reduce_kernel<_Tp>(__item_id, __n, __iters_per_work_item, __is_full, __n_groups,
-                                                __reduce_pattern, __temp_local, __temp_ptr, __rngs...);
+                                                __transform_pattern, __reduce_pattern, __temp_local, __temp_ptr,
                                                 __rngs...);
                 });
         });
@@ -238,6 +238,7 @@ struct __parallel_transform_reduce_work_group_kernel_submitter<_Tp, _Commutative
                                                                __internal::__optional_kernel_name<_KernelName...>>
 {
     template <typename _ExecutionPolicy, typename _Size, typename _ReduceOp, typename _InitType,
+              typename _ExecutionPolicy2>
     auto
     operator()(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, sycl::event& __reduce_event,
                const _Size __n, const _Size __work_group_size, const _Size __iters_per_work_item, _ReduceOp __reduce_op,
@@ -264,7 +265,8 @@ struct __parallel_transform_reduce_work_group_kernel_submitter<_Tp, _Commutative
                     auto __temp_ptr = __temp_acc.__get_pointer();
                     auto __res_ptr = __res_acc.__get_pointer();
                     __work_group_reduce_kernel<_Tp>(__item_id, __n, __iters_per_work_item, __is_full,
-                                                    __reduce_pattern, __init, __temp_local, __res_ptr, __temp_ptr);
+                                                    __transform_pattern, __reduce_pattern, __init, __temp_local,
+                                                    __res_ptr, __temp_ptr);
                 });
         });
 
@@ -294,15 +296,14 @@ __parallel_transform_reduce_mid_impl(oneapi::dpl::__internal::__device_backend_t
 
     sycl::event __reduce_event =
         __parallel_transform_reduce_device_kernel_submitter<_Tp, _Commutative, _VecSize, _ReduceDeviceKernel>()(
-            __backend_tag, __exec, __n, __reduce_op, __transform_op, __scratch_container,
-            ::std::forward<_Ranges>(__rngs)...);
+            __backend_tag, __exec, __n, __work_group_size, __iters_per_work_item_device_kernel, __reduce_op,
+            __transform_op, __scratch_container, ::std::forward<_Ranges>(__rngs)...);
 
     // __n_groups preliminary results from the device kernel.
     return __parallel_transform_reduce_work_group_kernel_submitter<_Tp, _Commutative, _VecSize,
                                                                    _ReduceWorkGroupKernel>()(
         __backend_tag, ::std::forward<_ExecutionPolicy>(__exec), __reduce_event, __n_groups, __work_group_size,
-        __iters_per_work_item_work_group_kernel, __reduce_op, __init, __temp);
-        __init, __scratch_container);
+        __iters_per_work_item_work_group_kernel, __reduce_op, __init, __scratch_container);
 }
 
 // General implementation using a tree reduction
@@ -389,13 +390,13 @@ struct __parallel_transform_reduce_impl
                         if (__is_first)
                         {
                             __transform_pattern1(__item_id, __n, __iters_per_work_item, /*global_offset*/ (_Size)0,
-                                                 __temp_local, __result, __rngs...);
+                                                 __is_full, __n_groups, __result, __rngs...);
                             __n_items = __transform_pattern1.output_size(__n, __work_group_size, __iters_per_work_item);
                         }
                         else
                         {
                             __transform_pattern2(__item_id, __n, __iters_per_work_item, __offset_2, __is_full,
-                                                 __result, __temp_ptr);
+                                                 __n_groups, __result, __temp_ptr);
                             __n_items = __transform_pattern2.output_size(__n, __work_group_size, __iters_per_work_item);
                         }
                         // 2. Reduce within work group using local memory
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h
index 8dfa2d75cfc..19438614946 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h
@@ -49,7 +49,7 @@ inline constexpr bool __can_use_known_identity =
 template <typename _BinaryOp, typename _Tp>
 using __has_known_identity = ::std::conditional_t<
     __can_use_known_identity<_Tp>,
-#    if _ONEDPL_LIBSYCL_VERSION >= 50200
+#        if _ONEDPL_LIBSYCL_VERSION >= 50200
     typename ::std::disjunction<
         __dpl_sycl::__has_known_identity<_BinaryOp, _Tp>,
         ::std::conjunction<::std::is_arithmetic<_Tp>,
@@ -61,22 +61,22 @@ using __has_known_identity = ::std::conditional_t<
                                               ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__minimum<void>>,
                                               ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__maximum<_Tp>>,
                                               ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__maximum<void>>>>>,
-#    else               //_ONEDPL_LIBSYCL_VERSION >= 50200
+#        else           //_ONEDPL_LIBSYCL_VERSION >= 50200
     typename ::std::conjunction<
         ::std::is_arithmetic<_Tp>,
         ::std::disjunction<::std::is_same<::std::decay_t<_BinaryOp>, ::std::plus<_Tp>>,
                            ::std::is_same<::std::decay_t<_BinaryOp>, ::std::plus<void>>,
                            ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__plus<_Tp>>,
                            ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__plus<void>>>>,
-#    endif              //_ONEDPL_LIBSYCL_VERSION >= 50200
+#        endif          //_ONEDPL_LIBSYCL_VERSION >= 50200
     ::std::false_type>; // This is for the case of __can_use_known_identity<_Tp>==false
 
-#else //_USE_GROUP_ALGOS && defined(SYCL_IMPLEMENTATION_INTEL)
+#    else //_USE_GROUP_ALGOS && defined(SYCL_IMPLEMENTATION_INTEL)
 
 template <typename _BinaryOp, typename _Tp>
 using __has_known_identity = std::false_type;
 
-#endif //_USE_GROUP_ALGOS && defined(SYCL_IMPLEMENTATION_INTEL)
+#    endif //_USE_GROUP_ALGOS && defined(SYCL_IMPLEMENTATION_INTEL)
 
 template <typename _BinaryOp, typename _Tp>
 struct __known_identity_for_plus
@@ -209,15 +209,14 @@ struct transform_reduce
     _Operation1 __binary_op;
     _Operation2 __unary_op;
 
-    template <typename _Res, typename _Size, typename... _Acc>
-    inline _Res
-    vectorized_reduction_first(const _Size __start_idx, const _Acc&... __acc) const
+    template <typename _Size, typename _Res, typename... _Acc>
+    inline void
+    vectorized_reduction_first(const _Size __start_idx, _Res& __res, const _Acc&... __acc) const
     {
-        _Res __res = __unary_op(__start_idx, __acc...);
+        new (&__res.__v) _Tp(std::move(__unary_op(__start_idx, __acc...)));
         _ONEDPL_PRAGMA_UNROLL
         for (_Size __i = 1; __i < _VecSize; ++__i)
-            __res = __binary_op(__res, __unary_op(__start_idx + __i, __acc...));
-        return __res;
+            __res.__v = __binary_op(__res.__v, __unary_op(__start_idx + __i, __acc...));
     }
 
     template <typename _Size, typename _Res, typename... _Acc>
@@ -226,7 +225,7 @@ struct transform_reduce
     {
         _ONEDPL_PRAGMA_UNROLL
         for (_Size __i = 0; __i < _VecSize; ++__i)
-            __res = __binary_op(__res, __unary_op(__start_idx + __i, __acc...));
+            __res.__v = __binary_op(__res.__v, __unary_op(__start_idx + __i, __acc...));
     }
 
     template <typename _Size, typename _Res, typename... _Acc>
@@ -236,21 +235,20 @@ struct transform_reduce
     {
         const _Size __no_iters = ::std::min(static_cast<_Size>(__adjusted_n - __start_idx), __max_iters);
         for (_Size __idx = 0; __idx < __no_iters; ++__idx)
-            __res = __binary_op(__res, __unary_op(__start_idx + __idx, __acc...));
+            __res.__v = __binary_op(__res.__v, __unary_op(__start_idx + __idx, __acc...));
     }
 
-    template <typename _NDItemId, typename _Size, typename _AccLocal, typename... _Acc>
+    template <typename _NDItemId, typename _Size, typename _Res, typename... _Acc>
     inline void
     operator()(const _NDItemId& __item_id, const _Size& __n, const _Size& __iters_per_work_item,
-               const _Size& __global_offset, const bool __is_full, const _Size __n_groups, const _AccLocal& __local_mem,
+               const _Size& __global_offset, const bool __is_full, const _Size __n_groups, _Res& __res,
                const _Acc&... __acc) const
     {
-        using _Res = typename _AccLocal::value_type;
         const _Size __local_idx = __item_id.get_local_id(0);
         const _Size __global_idx = __item_id.get_global_id(0);
         if (__iters_per_work_item == 1)
         {
-            __local_mem[__local_idx] = __unary_op(__global_idx, __acc...);
+            new (&__res.__v) _Tp(std::move(__unary_op(__global_idx, __acc...)));
             return;
         }
         const _Size __local_range = __item_id.get_local_range(0);
@@ -285,15 +283,14 @@ struct transform_reduce
         // _VecSize-wide vectorized path (__iters_per_work_item are multiples of _VecSize)
         if (__is_full_wg)
         {
-            _Res __res = vectorized_reduction_first<_Res>(__adjusted_global_id, __acc...);
+            vectorized_reduction_first(__adjusted_global_id, __res, __acc...);
             for (_Size __i = 1; __i < __no_vec_ops; ++__i)
                 vectorized_reduction_remainder(__adjusted_global_id + __i * __stride, __res, __acc...);
-            __local_mem[__local_idx] = __res;
         }
         // At least one vector operation
         else if (__adjusted_global_id + __vec_size_minus_one < __adjusted_n)
         {
-            _Res __res = vectorized_reduction_first<_Res>(__adjusted_global_id, __acc...);
+            vectorized_reduction_first(__adjusted_global_id, __res, __acc...);
             for (_Size __i = 1; __i < __no_vec_ops; ++__i)
             {
                 const _Size __base_idx = __adjusted_global_id + __i * __stride;
@@ -306,15 +303,13 @@ struct transform_reduce
                 else
                     break;
             }
-            __local_mem[__local_idx] = __res;
         }
         // Scalar remainder
         else if (__adjusted_global_id < __adjusted_n)
         {
-            _Res __res = __unary_op(__adjusted_global_id, __acc...);
+            new (&__res.__v) _Tp(std::move(__unary_op(__adjusted_global_id, __acc...)));
             scalar_reduction_remainder(static_cast<_Size>(__adjusted_global_id + 1), __adjusted_n,
                                        static_cast<_Size>(_VecSize - 2), __res, __acc...);
-            __local_mem[__local_idx] = __res;
         }
     }