Skip to content

Commit

Permalink
Fix merge issue
Browse files Browse the repository at this point in the history
  • Loading branch information
julianmi committed Apr 15, 2024
1 parent fa87dca commit cd2dba5
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 35 deletions.
25 changes: 13 additions & 12 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ __work_group_reduce_kernel(const _NDItemId __item_id, const _Size __n, const _Si
} __result;
// 1. Initialization (transform part). Fill local memory
__transform_pattern(__item_id, __n, __iters_per_work_item, /*global_offset*/ (_Size)0, __is_full,
/*__n_groups*/ (_Size)1, __local_mem, __result, __acc...);
/*__n_groups*/ (_Size)1, __result, __acc...);

const _Size __n_items = __transform_pattern.output_size(__n, __group_size, __iters_per_work_item);
// 2. Reduce within work group using local memory
Expand Down Expand Up @@ -107,7 +107,7 @@ __device_reduce_kernel(const _NDItemId __item_id, const _Size __n, const _Size _
} __result;
// 1. Initialization (transform part). Fill local memory
__transform_pattern(__item_id, __n, __iters_per_work_item, /*global_offset*/ (_Size)0, __is_full, __n_groups,
__local_mem, __result, __acc...);
__result, __acc...);

const _Size __n_items = __transform_pattern.output_size(__n, __group_size, __iters_per_work_item);
// 2. Reduce within work group using local memory
Expand Down Expand Up @@ -197,8 +197,8 @@ struct __parallel_transform_reduce_device_kernel_submitter<_Tp, _Commutative, _V
auto
operator()(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, const _Size __n,
const _Size __work_group_size, const _Size __iters_per_work_item, _ReduceOp __reduce_op,
_TransformOp __transform_op, __result_and_scratch_storage<_ExecutionPolicy2, _Tp> __scratch_container,
_Ranges&&... __rngs) const
_TransformOp __transform_op, __result_and_scratch_storage<_ExecutionPolicy2, _Tp> __scratch_container,
_Ranges&&... __rngs) const
{
auto __transform_pattern =
unseq_backend::transform_reduce<_ExecutionPolicy, _ReduceOp, _TransformOp, _Tp, _Commutative, _VecSize>{
Expand All @@ -220,7 +220,7 @@ struct __parallel_transform_reduce_device_kernel_submitter<_Tp, _Commutative, _V
[=](sycl::nd_item<1> __item_id) {
auto __temp_ptr = __temp_acc.__get_pointer();
__device_reduce_kernel<_Tp>(__item_id, __n, __iters_per_work_item, __is_full, __n_groups,
__reduce_pattern, __temp_local, __temp_ptr, __rngs...);
__transform_pattern, __reduce_pattern, __temp_local, __temp_ptr,
__rngs...);
});
});
Expand All @@ -238,6 +238,7 @@ struct __parallel_transform_reduce_work_group_kernel_submitter<_Tp, _Commutative
__internal::__optional_kernel_name<_KernelName...>>
{
template <typename _ExecutionPolicy, typename _Size, typename _ReduceOp, typename _InitType,
typename _ExecutionPolicy2>
auto
operator()(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, sycl::event& __reduce_event,
const _Size __n, const _Size __work_group_size, const _Size __iters_per_work_item, _ReduceOp __reduce_op,
Expand All @@ -264,7 +265,8 @@ struct __parallel_transform_reduce_work_group_kernel_submitter<_Tp, _Commutative
auto __temp_ptr = __temp_acc.__get_pointer();
auto __res_ptr = __res_acc.__get_pointer();
__work_group_reduce_kernel<_Tp>(__item_id, __n, __iters_per_work_item, __is_full,
__reduce_pattern, __init, __temp_local, __res_ptr, __temp_ptr);
__transform_pattern, __reduce_pattern, __init, __temp_local,
__res_ptr, __temp_ptr);
});
});

Expand Down Expand Up @@ -294,15 +296,14 @@ __parallel_transform_reduce_mid_impl(oneapi::dpl::__internal::__device_backend_t

sycl::event __reduce_event =
__parallel_transform_reduce_device_kernel_submitter<_Tp, _Commutative, _VecSize, _ReduceDeviceKernel>()(
__backend_tag, __exec, __n, __reduce_op, __transform_op, __scratch_container,
::std::forward<_Ranges>(__rngs)...);
__backend_tag, __exec, __n, __work_group_size, __iters_per_work_item_device_kernel, __reduce_op,
__transform_op, __scratch_container, ::std::forward<_Ranges>(__rngs)...);

// __n_groups preliminary results from the device kernel.
return __parallel_transform_reduce_work_group_kernel_submitter<_Tp, _Commutative, _VecSize,
_ReduceWorkGroupKernel>()(
__backend_tag, ::std::forward<_ExecutionPolicy>(__exec), __reduce_event, __n_groups, __work_group_size,
__iters_per_work_item_work_group_kernel, __reduce_op, __init, __temp);
__init, __scratch_container);
__iters_per_work_item_work_group_kernel, __reduce_op, __init, __scratch_container);
}

// General implementation using a tree reduction
Expand Down Expand Up @@ -389,13 +390,13 @@ struct __parallel_transform_reduce_impl
if (__is_first)
{
__transform_pattern1(__item_id, __n, __iters_per_work_item, /*global_offset*/ (_Size)0,
__temp_local, __result, __rngs...);
__is_full, __n_groups, __result, __rngs...);
__n_items = __transform_pattern1.output_size(__n, __work_group_size, __iters_per_work_item);
}
else
{
__transform_pattern2(__item_id, __n, __iters_per_work_item, __offset_2, __is_full,
__result, __temp_ptr);
__n_groups, __result, __temp_ptr);
__n_items = __transform_pattern2.output_size(__n, __work_group_size, __iters_per_work_item);
}
// 2. Reduce within work group using local memory
Expand Down
41 changes: 18 additions & 23 deletions include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ inline constexpr bool __can_use_known_identity =
template <typename _BinaryOp, typename _Tp>
using __has_known_identity = ::std::conditional_t<
__can_use_known_identity<_Tp>,
# if _ONEDPL_LIBSYCL_VERSION >= 50200
# if _ONEDPL_LIBSYCL_VERSION >= 50200
typename ::std::disjunction<
__dpl_sycl::__has_known_identity<_BinaryOp, _Tp>,
::std::conjunction<::std::is_arithmetic<_Tp>,
Expand All @@ -61,22 +61,22 @@ using __has_known_identity = ::std::conditional_t<
::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__minimum<void>>,
::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__maximum<_Tp>>,
::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__maximum<void>>>>>,
# else //_ONEDPL_LIBSYCL_VERSION >= 50200
# else //_ONEDPL_LIBSYCL_VERSION >= 50200
typename ::std::conjunction<
::std::is_arithmetic<_Tp>,
::std::disjunction<::std::is_same<::std::decay_t<_BinaryOp>, ::std::plus<_Tp>>,
::std::is_same<::std::decay_t<_BinaryOp>, ::std::plus<void>>,
::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__plus<_Tp>>,
::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__plus<void>>>>,
# endif //_ONEDPL_LIBSYCL_VERSION >= 50200
# endif //_ONEDPL_LIBSYCL_VERSION >= 50200
::std::false_type>; // This is for the case of __can_use_known_identity<_Tp>==false

#else //_USE_GROUP_ALGOS && defined(SYCL_IMPLEMENTATION_INTEL)
# else //_USE_GROUP_ALGOS && defined(SYCL_IMPLEMENTATION_INTEL)

template <typename _BinaryOp, typename _Tp>
using __has_known_identity = std::false_type;

#endif //_USE_GROUP_ALGOS && defined(SYCL_IMPLEMENTATION_INTEL)
# endif //_USE_GROUP_ALGOS && defined(SYCL_IMPLEMENTATION_INTEL)

template <typename _BinaryOp, typename _Tp>
struct __known_identity_for_plus
Expand Down Expand Up @@ -209,15 +209,14 @@ struct transform_reduce
_Operation1 __binary_op;
_Operation2 __unary_op;

template <typename _Res, typename _Size, typename... _Acc>
inline _Res
vectorized_reduction_first(const _Size __start_idx, const _Acc&... __acc) const
template <typename _Size, typename _Res, typename... _Acc>
inline void
vectorized_reduction_first(const _Size __start_idx, _Res& __res, const _Acc&... __acc) const
{
_Res __res = __unary_op(__start_idx, __acc...);
new (&__res.__v) _Tp(std::move(__unary_op(__start_idx, __acc...)));
_ONEDPL_PRAGMA_UNROLL
for (_Size __i = 1; __i < _VecSize; ++__i)
__res = __binary_op(__res, __unary_op(__start_idx + __i, __acc...));
return __res;
__res.__v = __binary_op(__res.__v, __unary_op(__start_idx + __i, __acc...));
}

template <typename _Size, typename _Res, typename... _Acc>
Expand All @@ -226,7 +225,7 @@ struct transform_reduce
{
_ONEDPL_PRAGMA_UNROLL
for (_Size __i = 0; __i < _VecSize; ++__i)
__res = __binary_op(__res, __unary_op(__start_idx + __i, __acc...));
__res.__v = __binary_op(__res.__v, __unary_op(__start_idx + __i, __acc...));
}

template <typename _Size, typename _Res, typename... _Acc>
Expand All @@ -236,21 +235,20 @@ struct transform_reduce
{
const _Size __no_iters = ::std::min(static_cast<_Size>(__adjusted_n - __start_idx), __max_iters);
for (_Size __idx = 0; __idx < __no_iters; ++__idx)
__res = __binary_op(__res, __unary_op(__start_idx + __idx, __acc...));
__res.__v = __binary_op(__res.__v, __unary_op(__start_idx + __idx, __acc...));
}

template <typename _NDItemId, typename _Size, typename _AccLocal, typename... _Acc>
template <typename _NDItemId, typename _Size, typename _Res, typename... _Acc>
inline void
operator()(const _NDItemId& __item_id, const _Size& __n, const _Size& __iters_per_work_item,
const _Size& __global_offset, const bool __is_full, const _Size __n_groups, const _AccLocal& __local_mem,
const _Size& __global_offset, const bool __is_full, const _Size __n_groups, _Res& __res,
const _Acc&... __acc) const
{
using _Res = typename _AccLocal::value_type;
const _Size __local_idx = __item_id.get_local_id(0);
const _Size __global_idx = __item_id.get_global_id(0);
if (__iters_per_work_item == 1)
{
__local_mem[__local_idx] = __unary_op(__global_idx, __acc...);
new (&__res.__v) _Tp(std::move(__unary_op(__global_idx, __acc...)));
return;
}
const _Size __local_range = __item_id.get_local_range(0);
Expand Down Expand Up @@ -285,15 +283,14 @@ struct transform_reduce
// _VecSize-wide vectorized path (__iters_per_work_item are multiples of _VecSize)
if (__is_full_wg)
{
_Res __res = vectorized_reduction_first<_Res>(__adjusted_global_id, __acc...);
vectorized_reduction_first(__adjusted_global_id, __res, __acc...);
for (_Size __i = 1; __i < __no_vec_ops; ++__i)
vectorized_reduction_remainder(__adjusted_global_id + __i * __stride, __res, __acc...);
__local_mem[__local_idx] = __res;
}
// At least one vector operation
else if (__adjusted_global_id + __vec_size_minus_one < __adjusted_n)
{
_Res __res = vectorized_reduction_first<_Res>(__adjusted_global_id, __acc...);
vectorized_reduction_first(__adjusted_global_id, __res, __acc...);
for (_Size __i = 1; __i < __no_vec_ops; ++__i)
{
const _Size __base_idx = __adjusted_global_id + __i * __stride;
Expand All @@ -306,15 +303,13 @@ struct transform_reduce
else
break;
}
__local_mem[__local_idx] = __res;
}
// Scalar remainder
else if (__adjusted_global_id < __adjusted_n)
{
_Res __res = __unary_op(__adjusted_global_id, __acc...);
new (&__res.__v) _Tp(std::move(__unary_op(__adjusted_global_id, __acc...)));
scalar_reduction_remainder(static_cast<_Size>(__adjusted_global_id + 1), __adjusted_n,
static_cast<_Size>(_VecSize - 2), __res, __acc...);
__local_mem[__local_idx] = __res;
}
}

Expand Down

0 comments on commit cd2dba5

Please sign in to comment.