Skip to content

Commit

Permalink
Address review feedback
Browse files Browse the repository at this point in the history
  • Loading branch information
julianmi committed Apr 22, 2024
1 parent e6983fc commit afd0d90
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 25 deletions.
28 changes: 13 additions & 15 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,8 @@ class __reduce_kernel;
// Adjust number of sequential operations per work-item based on the vector size. Single elements are kept to
// improve performance of small arrays or remainder loops.
template <std::uint8_t _VecSize, typename _Size>
_Size
__adjust_iters_per_work_item(_Size __iters_per_work_item)
auto
__adjust_iters_per_work_item(_Size __iters_per_work_item) -> _Size
{
if (__iters_per_work_item > 1)
return ((__iters_per_work_item + _VecSize - 1) / _VecSize) * _VecSize;
Expand Down Expand Up @@ -340,8 +340,8 @@ struct __parallel_transform_reduce_impl

#if _ONEDPL_COMPILE_KERNEL
auto __kernel = __internal::__kernel_compiler<_ReduceKernel>::__compile(__exec);
__work_group_size = std::min(
__work_group_size, static_cast<_Size>(oneapi::dpl::__internal::__kernel_work_group_size(__exec, __kernel)));
_Size __adjusted_work_group_size = oneapi::dpl::__internal::__kernel_work_group_size(__exec, __kernel);
__work_group_size = std::min(__work_group_size, __adjusted_work_group_size);
#endif

const _Size __size_per_work_group =
Expand Down Expand Up @@ -479,8 +479,8 @@ __parallel_transform_reduce(oneapi::dpl::__internal::__device_backend_tag __back
// We can use 16-bit addressing since we have at most __max_work_group_size * __max_iters_per_work_item elements.
if (__n <= __max_elements_per_wg)
{
const std::uint16_t __n_short = static_cast<std::uint16_t>(__n);
const std::uint16_t __work_group_size_short = static_cast<std::uint16_t>(__work_group_size);
const auto __n_short = static_cast<std::uint16_t>(__n);
const auto __work_group_size_short = static_cast<std::uint16_t>(__work_group_size);
std::uint16_t __iters_per_work_item = oneapi::dpl::__internal::__dpl_ceiling_div(__n_short, __work_group_size);
__iters_per_work_item = __adjust_iters_per_work_item<__vector_size>(__iters_per_work_item);
return __parallel_transform_reduce_small_impl<_Tp, _Commutative, __vector_size>(
Expand All @@ -492,10 +492,10 @@ __parallel_transform_reduce(oneapi::dpl::__internal::__device_backend_tag __back
// Second step reduces __work_group_size * __iters_per_work_item_work_group_kernel elements.
// We can use 32-bit addressing since we have at most (__max_work_group_size * __max_iters_per_work_item) ^ 2
// elements.
else if (__n <= __max_elements_per_wg * __max_elements_per_wg)
if (__n <= __max_elements_per_wg * __max_elements_per_wg)
{
const std::uint32_t __n_short = static_cast<std::uint32_t>(__n);
const std::uint32_t __work_group_size_short = static_cast<std::uint32_t>(__work_group_size);
const auto __n_short = static_cast<std::uint32_t>(__n);
const auto __work_group_size_short = static_cast<std::uint32_t>(__work_group_size);
// Fully-utilize the device by running a work-group per compute unit.
        // Launch a factor more work-groups than compute units to fully utilize the device and hide latencies.
const std::uint32_t __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec);
Expand All @@ -522,12 +522,10 @@ __parallel_transform_reduce(oneapi::dpl::__internal::__device_backend_tag __back
__init, std::forward<_Ranges>(__rngs)...);
}
    // Otherwise use a recursive tree reduction with __iters_per_work_item set to __max_iters_per_work_item.
else
{
return __parallel_transform_reduce_impl<_Tp, _Commutative, __vector_size>::submit(
__backend_tag, std::forward<_ExecutionPolicy>(__exec), __n, static_cast<_Size>(__work_group_size),
__max_iters_per_work_item, __reduce_op, __transform_op, __init, std::forward<_Ranges>(__rngs)...);
}
const auto __work_group_size_long = static_cast<_Size>(__work_group_size);
return __parallel_transform_reduce_impl<_Tp, _Commutative, __vector_size>::submit(
__backend_tag, std::forward<_ExecutionPolicy>(__exec), __n, __work_group_size_long, __max_iters_per_work_item,
__reduce_op, __transform_op, __init, std::forward<_Ranges>(__rngs)...);
}

} // namespace __par_backend_hetero
Expand Down
26 changes: 16 additions & 10 deletions include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,8 @@ struct transform_reduce
scalar_reduction_remainder(const _Size __start_idx, const _Size __adjusted_n, const _Size __max_iters, _Res& __res,
const _Acc&... __acc) const
{
const _Size __no_iters = std::min(static_cast<_Size>(__adjusted_n - __start_idx), __max_iters);
const _Size __remainder = __adjusted_n - __start_idx;
const _Size __no_iters = std::min(__remainder, __max_iters);
for (_Size __idx = 0; __idx < __no_iters; ++__idx)
__res.__v = __binary_op(__res.__v, __unary_op(__start_idx + __idx, __acc...));
}
Expand All @@ -244,7 +245,6 @@ struct transform_reduce
const _Size& __global_offset, const bool __is_full, const _Size __n_groups, _Res& __res,
const _Acc&... __acc) const
{
const _Size __local_idx = __item_id.get_local_id(0);
const _Size __global_idx = __item_id.get_global_id(0);
if (__iters_per_work_item == 1)
{
Expand All @@ -254,15 +254,16 @@ struct transform_reduce
const _Size __local_range = __item_id.get_local_range(0);
const _Size __no_vec_ops = __iters_per_work_item / _VecSize;
const _Size __adjusted_n = __global_offset + __n;
constexpr _Size __vec_size_minus_one = static_cast<_Size>(_VecSize - 1);
constexpr _Size __vec_size_minus_one = _VecSize - 1;

_Size __stride = _VecSize; // sequential loads with _VecSize-wide vectors
_Size __adjusted_global_id = __global_offset;
if constexpr (_Commutative{})
{
__stride *= __local_range; // coalesced loads with _VecSize-wide vectors
__adjusted_global_id +=
__item_id.get_group_linear_id() * __local_range * __iters_per_work_item + __local_idx * _VecSize;
_Size __local_idx = __item_id.get_local_id(0);
_Size __group_idx = __item_id.get_group_linear_id();
__adjusted_global_id += __group_idx * __local_range * __iters_per_work_item + __local_idx * _VecSize;
}
else
__adjusted_global_id += __iters_per_work_item * __global_idx;
Expand All @@ -275,7 +276,9 @@ struct transform_reduce
const bool __is_multi_group = __n_groups > 1;
if (__is_multi_group)
{
const bool __is_last_wg = static_cast<_Size>(__item_id.get_group(0)) == __n_groups - (_Size)1;
_Size __group_idx = __item_id.get_group(0);
_Size __n_groups_minus_one = __n_groups - 1;
const bool __is_last_wg = __group_idx == __n_groups_minus_one;
if (!__is_last_wg)
__is_full_wg = true;
}
Expand Down Expand Up @@ -306,14 +309,16 @@ struct transform_reduce
else if (__adjusted_global_id < __adjusted_n)
{
new (&__res.__v) _Tp(__unary_op(__adjusted_global_id, __acc...));
scalar_reduction_remainder(static_cast<_Size>(__adjusted_global_id + 1), __adjusted_n,
static_cast<_Size>(_VecSize - 2), __res, __acc...);
const _Size __adjusted_global_id_plus_one = __adjusted_global_id + 1;
constexpr _Size __vec_size_minus_two = _VecSize - 2;
scalar_reduction_remainder(__adjusted_global_id_plus_one, __adjusted_n, __vec_size_minus_two, __res,
__acc...);
}
}

template <typename _Size>
_Size
output_size(const _Size& __n, const _Size& __work_group_size, const _Size& __iters_per_work_item) const
output_size(const _Size __n, const _Size __work_group_size, const _Size __iters_per_work_item) const
{
if (__iters_per_work_item == 1)
return __n;
Expand All @@ -324,7 +329,8 @@ struct transform_reduce
_Size __last_wg_remainder = __n % __items_per_work_group;
// Adjust remainder and wg size for vector size
_Size __last_wg_vec = oneapi::dpl::__internal::__dpl_ceiling_div(__last_wg_remainder, _VecSize);
_Size __last_wg_contrib = std::min(__last_wg_vec, static_cast<_Size>(__work_group_size * _VecSize));
_Size __wg_vec_size = __work_group_size * _VecSize;
_Size __last_wg_contrib = std::min(__last_wg_vec, __wg_vec_size);
return __full_group_contrib + __last_wg_contrib;
}
return oneapi::dpl::__internal::__dpl_ceiling_div(__n, __iters_per_work_item);
Expand Down

0 comments on commit afd0d90

Please sign in to comment.