
Commit

Fix CPU backend issue
julianmi committed Apr 4, 2024
1 parent 9d0cf02 commit 76097ed
Showing 2 changed files with 56 additions and 42 deletions.
46 changes: 24 additions & 22 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h
@@ -316,8 +316,8 @@ struct __parallel_transform_reduce_impl

#if _ONEDPL_COMPILE_KERNEL
auto __kernel = __internal::__kernel_compiler<_ReduceKernel>::__compile(__exec);
__work_group_size =
::std::min(__work_group_size, (_Size)oneapi::dpl::__internal::__kernel_work_group_size(__exec, __kernel));
__work_group_size = ::std::min(
__work_group_size, static_cast<_Size>(oneapi::dpl::__internal::__kernel_work_group_size(__exec, __kernel)));
#endif

const _Size __size_per_work_group =
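Note on the hunk above: it reformats the clamp of the work-group size to the maximum supported by the pre-compiled reduction kernel, replacing the C-style (_Size) cast with static_cast. The same per-kernel limit is available through standard SYCL 2020 queries; the standalone sketch below (an assumed helper name and structure, not oneDPL code) shows the underlying idea.

// Hedged sketch (not oneDPL internals): clamp a desired work-group size to what a
// compiled kernel actually supports on the target device, using SYCL 2020 queries.
#include <algorithm>
#include <cstddef>
#include <sycl/sycl.hpp>

std::size_t clamp_to_kernel_limit(const sycl::queue& q, const sycl::kernel& k, std::size_t desired)
{
    const sycl::device dev = q.get_device();
    // Maximum work-group size this specific kernel can be launched with on this device.
    const std::size_t kernel_limit = k.get_info<sycl::info::kernel_device_specific::work_group_size>(dev);
    return std::min(desired, kernel_limit);
}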
@@ -428,69 +428,71 @@ __parallel_transform_reduce(oneapi::dpl::__internal::__device_backend_tag __back

// Empirically found tuning parameters for typical devices.
constexpr _Size __max_iters_per_work_item = 32;
constexpr _Size __max_work_group_size = 256;
constexpr ::std::size_t __max_work_group_size = 256;
constexpr ::std::uint8_t __vector_size = 4;
constexpr ::std::uint32_t __oversubscription = 2;

// Get the work group size adjusted to the local memory limit.
// Pessimistically double the memory requirement to take into account memory used by compiled kernel.
// TODO: find a way to generalize getting of reliable work-group size.
_Size __work_group_size = oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, sizeof(_Tp) * 2);
::std::size_t __work_group_size =
oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, static_cast<::std::size_t>(sizeof(_Tp) * 2));

// Limit work-group size to __max_work_group_size for performance on GPUs. Empirically tested.
__work_group_size = ::std::min(__work_group_size, __max_work_group_size);

// Enable __vector_size-wide vectorization.
_Size __iters_per_work_item = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __work_group_size);
__adjust_iters_per_work_item<__vector_size>(__iters_per_work_item);
const _Size __max_elements_per_wg = __work_group_size * __max_iters_per_work_item;

// Use single work group implementation if less than __max_iters_per_work_item elements per work-group.
// We can use 16-bit addressing since we have at most __max_work_group_size * __max_iters_per_work_item elements.
if (__iters_per_work_item <= __max_iters_per_work_item)
if (__n <= __max_elements_per_wg)
{
const ::std::uint16_t __n_short = static_cast<::std::uint16_t>(__n);
const ::std::uint16_t __work_group_size_short = static_cast<::std::uint16_t>(__work_group_size);
::std::uint16_t __iters_per_work_item =
oneapi::dpl::__internal::__dpl_ceiling_div(__n_short, __work_group_size);
__adjust_iters_per_work_item<__vector_size>(__iters_per_work_item);
return __parallel_transform_reduce_small_impl<_Tp, _Commutative, __vector_size>(
__backend_tag, ::std::forward<_ExecutionPolicy>(__exec), (::std::uint16_t)__n,
(::std::uint16_t)__work_group_size, (::std::uint16_t)__iters_per_work_item, __reduce_op, __transform_op,
__init, ::std::forward<_Ranges>(__rngs)...);
__backend_tag, ::std::forward<_ExecutionPolicy>(__exec), __n_short, __work_group_size_short,
__iters_per_work_item, __reduce_op, __transform_op, __init, ::std::forward<_Ranges>(__rngs)...);
}
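The 16-bit addressing mentioned in the comment above follows from the tuning constants: a single work-group handles at most __max_work_group_size * __max_iters_per_work_item = 256 * 32 = 8192 elements, well below the 65535 limit of ::std::uint16_t. A compile-time restatement of that bound (illustrative only, not part of the commit):

// Illustrative restatement: the largest problem size that can take the
// single-work-group path still fits comfortably in 16-bit indices.
#include <cstddef>
#include <cstdint>
#include <limits>

constexpr std::size_t max_work_group_size = 256;    // empirical cap used by the dispatcher
constexpr std::size_t max_iters_per_work_item = 32; // empirical cap used by the dispatcher
constexpr std::size_t max_elements_per_wg = max_work_group_size * max_iters_per_work_item; // 8192

static_assert(max_elements_per_wg <= std::numeric_limits<std::uint16_t>::max(),
              "uint16_t addressing is sufficient for the single-work-group kernel");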
// Use two-step tree reduction.
// First step reduces __work_group_size * __iters_per_work_item_device_kernel elements.
// Second step reduces __work_group_size * __iters_per_work_item_work_group_kernel elements.
// We can use 32-bit addressing since we have at most (__max_work_group_size * __max_iters_per_work_item) ^ 2
// elements.
else if (__iters_per_work_item <= __max_elements_per_wg * __max_elements_per_wg)
else if (__n <= __max_elements_per_wg * __max_elements_per_wg)
{
const ::std::uint32_t __n_short = static_cast<::std::uint32_t>(__n);
const ::std::uint32_t __work_group_size_short = static_cast<::std::uint32_t>(__work_group_size);
// Fully-utilize the device by running a work-group per compute unit.
// Add a factor more work-groups than compute units to fully utilizes the device and hide latencies.
const ::std::uint32_t __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec);
::std::uint32_t __n_groups = __max_cu * __oversubscription;
::std::uint32_t __iters_per_work_item_device_kernel =
oneapi::dpl::__internal::__dpl_ceiling_div(__n, __n_groups * __work_group_size);
oneapi::dpl::__internal::__dpl_ceiling_div(__n_short, __n_groups * __work_group_size_short);
__adjust_iters_per_work_item<__vector_size>(__iters_per_work_item_device_kernel);

// Lower the number of iterations to not exceed the empirically found limit.
// This increases the number of work-groups up to the limit of work-group size times __max_iters_per_work_item.
if (__iters_per_work_item_device_kernel > __max_iters_per_work_item)
{
__iters_per_work_item_device_kernel = __max_iters_per_work_item;
__n_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __max_elements_per_wg);
__n_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__n_short, __max_elements_per_wg);
}
::std::uint32_t __iters_per_work_item_work_group_kernel =
oneapi::dpl::__internal::__dpl_ceiling_div(__n_groups, __work_group_size);
oneapi::dpl::__internal::__dpl_ceiling_div(__n_groups, __work_group_size_short);
__adjust_iters_per_work_item<__vector_size>(__iters_per_work_item_work_group_kernel);
return __parallel_transform_reduce_mid_impl<_Tp, _Commutative, __vector_size>(
__backend_tag, ::std::forward<_ExecutionPolicy>(__exec), (::std::uint32_t)__n,
(::std::uint32_t)__work_group_size, (::std::uint32_t)__iters_per_work_item_device_kernel,
(::std::uint32_t)__iters_per_work_item_work_group_kernel, __reduce_op, __transform_op, __init,
::std::forward<_Ranges>(__rngs)...);
__backend_tag, ::std::forward<_ExecutionPolicy>(__exec), __n_short, __work_group_size_short,
__iters_per_work_item_device_kernel, __iters_per_work_item_work_group_kernel, __reduce_op, __transform_op,
__init, ::std::forward<_Ranges>(__rngs)...);
}
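For the two-step branch the same arithmetic bounds the problem size by (256 * 32)^2 = 67,108,864 elements, which fits easily in 32-bit indices, so __n and both iteration counts can be narrowed to ::std::uint32_t. The sketch below restates the sizing logic with plain names; it omits the __adjust_iters_per_work_item rounding to vector-width multiples, and the compute-unit count and oversubscription factor are example values rather than device queries.

// Illustrative sizing for the two-step path: the first kernel reduces
// work_group_size * iters_device elements per group, the second kernel reduces
// the per-group partial results in a single work-group.
#include <cstdint>

constexpr std::uint32_t ceil_div(std::uint32_t a, std::uint32_t b) { return (a + b - 1) / b; }

struct TwoStepSizing
{
    std::uint32_t n_groups;
    std::uint32_t iters_device;     // iterations per work-item in the device kernel
    std::uint32_t iters_work_group; // iterations per work-item in the final work-group kernel
};

constexpr TwoStepSizing size_two_step(std::uint32_t n, std::uint32_t work_group_size,
                                      std::uint32_t compute_units, std::uint32_t oversubscription,
                                      std::uint32_t max_iters_per_work_item)
{
    std::uint32_t n_groups = compute_units * oversubscription;
    std::uint32_t iters_device = ceil_div(n, n_groups * work_group_size);
    if (iters_device > max_iters_per_work_item)
    {
        // Cap the per-item work and let the number of groups grow instead.
        iters_device = max_iters_per_work_item;
        n_groups = ceil_div(n, work_group_size * max_iters_per_work_item);
    }
    const std::uint32_t iters_work_group = ceil_div(n_groups, work_group_size);
    return {n_groups, iters_device, iters_work_group};
}

// Example: n = 1'000'000, 256-wide groups, 32 compute units, oversubscription 2.
// 64 groups would need 62 iterations each, exceeding the cap of 32, so the group count
// grows to ceil(1'000'000 / 8192) = 123 and one final 256-wide group reduces 123 partials.
static_assert(size_two_step(1'000'000, 256, 32, 2, 32).n_groups == 123);
static_assert(size_two_step(1'000'000, 256, 32, 2, 32).iters_device == 32);
static_assert(size_two_step(1'000'000, 256, 32, 2, 32).iters_work_group == 1);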
// Otherwise use a recursive tree reduction with __max_iters_per_work_item __iters_per_work_item.
else
{
return __parallel_transform_reduce_impl<_Tp, _Commutative, __vector_size>::submit(
__backend_tag, ::std::forward<_ExecutionPolicy>(__exec), __n, (_Size)__work_group_size,
(_Size)__max_iters_per_work_item, __reduce_op, __transform_op, __init, ::std::forward<_Ranges>(__rngs)...);
__backend_tag, ::std::forward<_ExecutionPolicy>(__exec), __n, static_cast<_Size>(__work_group_size),
__max_iters_per_work_item, __reduce_op, __transform_op, __init, ::std::forward<_Ranges>(__rngs)...);
}
}
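For context, this dispatcher is typically reached from user code through oneDPL's transform_reduce (or reduce) with a device execution policy. The exact routing into __parallel_transform_reduce is an internal detail, so the following is an assumed usage sketch rather than a guaranteed call path.

// Hedged usage sketch: a transform_reduce over a sycl::buffer with the default
// oneDPL device policy; whether it lands in the small, mid or tree kernel depends
// on the element count and the dispatcher logic shown above.
#include <oneapi/dpl/execution>
#include <oneapi/dpl/algorithm>
#include <oneapi/dpl/numeric>
#include <oneapi/dpl/iterator>
#include <sycl/sycl.hpp>
#include <functional>
#include <iostream>
#include <vector>

int main()
{
    std::vector<float> data(1'000'000, 1.0f);
    sycl::buffer<float> buf(data.data(), sycl::range<1>(data.size()));

    // Sum of squares computed on the device selected by dpcpp_default.
    const float sum_sq = oneapi::dpl::transform_reduce(
        oneapi::dpl::execution::dpcpp_default, oneapi::dpl::begin(buf), oneapi::dpl::end(buf), 0.0f,
        std::plus<float>{}, [](float x) { return x * x; });

    std::cout << "sum of squares: " << sum_sq << "\n";
}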

52 changes: 32 additions & 20 deletions include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h
@@ -61,15 +61,15 @@ using __has_known_identity = ::std::conditional_t<
::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__minimum<void>>,
::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__maximum<_Tp>>,
::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__maximum<void>>>>>,
# else //_ONEDPL_LIBSYCL_VERSION >= 50200
# else //_ONEDPL_LIBSYCL_VERSION >= 50200
typename ::std::conjunction<
::std::is_arithmetic<_Tp>,
::std::disjunction<::std::is_same<::std::decay_t<_BinaryOp>, ::std::plus<_Tp>>,
::std::is_same<::std::decay_t<_BinaryOp>, ::std::plus<void>>,
::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__plus<_Tp>>,
::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__plus<void>>>>,
# endif //_ONEDPL_LIBSYCL_VERSION >= 50200
::std::false_type>; // This is for the case of __can_use_known_identity<_Tp>==false
# endif //_ONEDPL_LIBSYCL_VERSION >= 50200
::std::false_type>; // This is for the case of __can_use_known_identity<_Tp>==false
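The __has_known_identity trait above gates an optimization that is only valid when the reduction operation has a known identity element for the accumulator type (plus, minimum and maximum over arithmetic types on newer SYCL implementations, plus only otherwise). SYCL 2020 exposes the same notion publicly; the sketch below uses the standard sycl::has_known_identity and sycl::known_identity traits as an illustration, not the internal trait itself.

// Hedged illustration with standard SYCL 2020 traits: an operation with a known
// identity lets a reduction be seeded with that identity instead of a first element.
#include <sycl/sycl.hpp>
#include <cstdio>

int main()
{
    static_assert(sycl::has_known_identity_v<sycl::plus<int>, int>);
    static_assert(sycl::known_identity_v<sycl::plus<int>, int> == 0);

    // Minimum/maximum over arithmetic types are also covered by newer implementations,
    // which is what the _ONEDPL_LIBSYCL_VERSION >= 50200 branch above keys on.
    static_assert(sycl::has_known_identity_v<sycl::minimum<int>, int>);

    std::printf("identity of plus<int>: %d\n", sycl::known_identity_v<sycl::plus<int>, int>);
}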

#else //_USE_GROUP_ALGOS && defined(SYCL_IMPLEMENTATION_INTEL)

@@ -236,8 +236,8 @@ struct transform_reduce
const _Acc&... __acc) const
{
using _Res = typename _AccLocal::value_type;
auto __local_idx = __item_id.get_local_id(0);
auto __global_idx = __item_id.get_global_id(0);
const _Size __local_idx = __item_id.get_local_id(0);
const _Size __global_idx = __item_id.get_global_id(0);
if (__iters_per_work_item == 1)
{
__local_mem[__local_idx] = __unary_op(__global_idx, __acc...);
@@ -249,7 +249,7 @@

_Size __stride = _VecSize; // sequential loads with _VecSize-wide vectors
if constexpr (_Commutative{})
__stride = __local_range * _VecSize; // coalesced loads with _VecSize-wide vectors
__stride *= __local_range; // coalesced loads with _VecSize-wide vectors
_Size __adjusted_global_id = __global_offset;
if constexpr (_Commutative{})
{
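The stride selection above chooses between two memory-access patterns: non-commutative operations keep a per-work-item stride of _VecSize so each work-item reduces a contiguous block in order, while commutative operations widen the stride to the whole work-group so that each pass of the group touches one contiguous region (coalesced loads). A host-side sketch of the two index patterns (illustrative names, not the kernel code):

// Which global indices a single work-item touches under each scheme.
#include <cstdint>
#include <cstdio>

constexpr std::uint32_t vec_size = 4;

// Non-commutative: each work-item reduces a contiguous block, preserving operand order.
std::uint32_t sequential_index(std::uint32_t local_id, std::uint32_t iters_per_item,
                               std::uint32_t vec_op, std::uint32_t lane)
{
    return local_id * iters_per_item + vec_op * vec_size + lane;
}

// Commutative: vec_size-wide chunks are interleaved across the work-group, so each pass
// of the whole group reads one contiguous region (coalesced loads).
std::uint32_t coalesced_index(std::uint32_t local_id, std::uint32_t local_range,
                              std::uint32_t vec_op, std::uint32_t lane)
{
    return vec_op * (local_range * vec_size) + local_id * vec_size + lane;
}

int main()
{
    // Work-item 1 of a 4-wide group, 8 elements per item (2 vector ops of width 4).
    for (std::uint32_t op = 0; op < 2; ++op)
        for (std::uint32_t lane = 0; lane < vec_size; ++lane)
            std::printf("sequential: %u  coalesced: %u\n",
                        sequential_index(1, 8, op, lane), coalesced_index(1, 4, op, lane));
    // sequential: 8..15 (one contiguous block per work-item)
    // coalesced:  4..7, then 20..23 (the group covers 0..15, then 16..31)
}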
@@ -261,9 +261,19 @@

// If n is not evenly divisible by the number of elements processed per work-group, the last work-group might
// need to process less elements than __iters_per_work_item.
const bool __is_last_wg = (__n_groups > 1) && (static_cast<_Size>(__item_id.get_group(0)) != __n_groups - 1);
bool __is_full_wg = __is_full;
if (!__is_full_wg)
{
const bool __is_multi_group = __n_groups > 1;
if (__is_multi_group)
{
const bool __is_last_wg = static_cast<_Size>(__item_id.get_group(0)) == __n_groups - (_Size)1;
if (!__is_last_wg)
__is_full_wg = true;
}
}
// _VecSize-wide vectorized path (__iters_per_work_item are multiples of _VecSize)
if (__is_full || __is_last_wg)
if (__is_full_wg)
{
_Res __res = vectorized_reduction_first<_Res>(__adjusted_global_id, __acc...);
for (_Size __i = 1; __i < __no_vec_ops; ++__i)
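The hunk above replaces the confusingly named __is_last_wg (which was true for every group except the last of a multi-group launch) with an explicit __is_full_wg predicate: a group may take the fully vectorized path when the whole iteration space is full, or when it is not the last group of a multi-group launch, since only the last group can be partially filled. A constexpr restatement of that predicate (illustrative, not the kernel code):

#include <cstdint>

constexpr bool is_full_work_group(bool is_full, std::uint32_t group_id, std::uint32_t n_groups)
{
    if (is_full)
        return true;
    const bool is_multi_group = n_groups > 1;
    const bool is_last_group = (group_id == n_groups - 1);
    return is_multi_group && !is_last_group;
}

// With 4 groups and a ragged tail (is_full == false): groups 0-2 take the vectorized
// full path, only group 3 falls through to the bounds-checked remainder handling.
static_assert(is_full_work_group(false, 0, 4));
static_assert(is_full_work_group(false, 2, 4));
static_assert(!is_full_work_group(false, 3, 4));
static_assert(!is_full_work_group(false, 0, 1)); // a single, partially filled group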
@@ -272,22 +282,23 @@
return;
}
// At least one vector operation
if (__adjusted_global_id + _VecSize - 1 < __adjusted_n)
constexpr _Size __vec_size_minus_one = static_cast<_Size>(_VecSize - 1);
if (__adjusted_global_id + __vec_size_minus_one < __adjusted_n)
{
_Res __res = vectorized_reduction_first<_Res>(__adjusted_global_id, __acc...);
for (_Size __i = 1; __i < __no_vec_ops; ++__i)
{
if (__adjusted_global_id + __i * __stride + _VecSize - 1 < __adjusted_n)
vectorized_reduction_remainder(__adjusted_global_id + __i * __stride, __res, __acc...);
else if (__adjusted_global_id + __i * __stride < __adjusted_n)
const _Size __base_idx = __adjusted_global_id + __i * __stride;
if (__base_idx + __vec_size_minus_one < __adjusted_n)
vectorized_reduction_remainder(__base_idx, __res, __acc...);
else if (__base_idx < __adjusted_n)
{
for (_Size __idx = 0; __idx < _VecSize - 1; ++__idx)
for (_Size __idx = 0; __idx < __vec_size_minus_one; ++__idx)
{
if (__adjusted_global_id + __i * __stride + __idx < __adjusted_n)
__res =
__binary_op(__res, __unary_op(__adjusted_global_id + __i * __stride + __idx, __acc...));
if (__base_idx + __idx >= __adjusted_n)
break;
__res = __binary_op(__res, __unary_op(__base_idx + __idx, __acc...));
}
break;
}
else
break;
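The restructured loop above hoists the per-iteration base index into __base_idx and keeps the same tail policy: consume whole _VecSize-wide chunks while a full chunk fits below __adjusted_n, then finish the ragged tail element by element and stop. A plain C++ sketch of that pattern with host-side stand-ins for the kernel helpers (names and the element-valued unary op are simplifications):

#include <cstdint>
#include <cstdio>
#include <vector>

template <typename T, typename BinaryOp, typename UnaryOp>
T strided_tail_reduce(const std::vector<T>& data, std::uint32_t base, std::uint32_t stride,
                      std::uint32_t vec_size, std::uint32_t no_vec_ops, BinaryOp binary_op,
                      UnaryOp unary_op, T init)
{
    const std::uint32_t n = static_cast<std::uint32_t>(data.size());
    T res = init;
    for (std::uint32_t i = 0; i < no_vec_ops; ++i)
    {
        const std::uint32_t base_idx = base + i * stride;
        if (base_idx + vec_size - 1 < n)
        {
            // Whole chunk in bounds: this is where the kernel issues a vectorized load.
            for (std::uint32_t lane = 0; lane < vec_size; ++lane)
                res = binary_op(res, unary_op(data[base_idx + lane]));
        }
        else
        {
            // Ragged tail: at most vec_size - 1 in-bounds elements remain, then stop.
            for (std::uint32_t lane = 0; lane < vec_size && base_idx + lane < n; ++lane)
                res = binary_op(res, unary_op(data[base_idx + lane]));
            break;
        }
    }
    return res;
}

int main()
{
    const std::vector<int> v = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
    // Base 4, stride 4, vectors of 4, up to 2 vector ops: 4..7 is a full chunk,
    // 8..9 is the ragged tail.
    const int sum = strided_tail_reduce(v, 4u, 4u, 4u, 2u,
                                        [](int a, int b) { return a + b; },
                                        [](int x) { return x; }, 0);
    std::printf("partial sum = %d\n", sum); // 5 + 6 + 7 + 8 + 9 + 10 = 45
}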
@@ -299,11 +310,12 @@
if (__adjusted_global_id < __adjusted_n)
{
_Res __res = __unary_op(__adjusted_global_id, __acc...);
for (_Size __i = 1; __i < _VecSize - 1; ++__i)
for (_Size __i = 1; __i < __vec_size_minus_one; ++__i)
{
if (__adjusted_global_id + __i >= __adjusted_n)
const _Size __base_idx = __adjusted_global_id + __i;
if (__base_idx >= __adjusted_n)
break;
__res = __binary_op(__res, __unary_op(__adjusted_global_id + __i, __acc...));
__res = __binary_op(__res, __unary_op(__base_idx, __acc...));
}
__local_mem[__local_idx] = __res;
return;
