
Commit

Remove move statement and ::std
julianmi committed Apr 18, 2024
1 parent be6bcd1 commit e6983fc
Showing 2 changed files with 49 additions and 50 deletions.
81 changes: 40 additions & 41 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h
@@ -49,7 +49,7 @@ class __reduce_kernel;

// Adjust number of sequential operations per work-item based on the vector size. Single elements are kept to
// improve performance of small arrays or remainder loops.
- template <::std::uint8_t _VecSize, typename _Size>
+ template <std::uint8_t _VecSize, typename _Size>
_Size
__adjust_iters_per_work_item(_Size __iters_per_work_item)
{
@@ -124,10 +124,10 @@ __device_reduce_kernel(const _NDItemId __item_id, const _Size __n, const _Size _

// Parallel_transform_reduce for a small arrays using a single work group.
// Transforms and reduces __work_group_size * __iters_per_work_item elements.
- template <typename _Tp, typename _Commutative, ::std::uint8_t _VecSize, typename _KernelName>
+ template <typename _Tp, typename _Commutative, std::uint8_t _VecSize, typename _KernelName>
struct __parallel_transform_reduce_small_submitter;

- template <typename _Tp, typename _Commutative, ::std::uint8_t _VecSize, typename... _Name>
+ template <typename _Tp, typename _Commutative, std::uint8_t _VecSize, typename... _Name>
struct __parallel_transform_reduce_small_submitter<_Tp, _Commutative, _VecSize,
__internal::__optional_kernel_name<_Name...>>
{
@@ -149,7 +149,7 @@ struct __parallel_transform_reduce_small_submitter<_Tp, _Commutative, _VecSize,
sycl::event __reduce_event = __exec.queue().submit([&, __n](sycl::handler& __cgh) {
oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); // get an access to data under SYCL buffer
auto __res_acc = __scratch_container.__get_result_acc(__cgh);
- ::std::size_t __local_mem_size = __reduce_pattern.local_mem_req(__work_group_size);
+ std::size_t __local_mem_size = __reduce_pattern.local_mem_req(__work_group_size);
__dpl_sycl::__local_accessor<_Tp> __temp_local(sycl::range<1>(__local_mem_size), __cgh);
__cgh.parallel_for<_Name...>(
sycl::nd_range<1>(sycl::range<1>(__work_group_size), sycl::range<1>(__work_group_size)),
@@ -167,7 +167,7 @@ struct __parallel_transform_reduce_small_submitter<_Tp, _Commutative, _VecSize,
}
}; // struct __parallel_transform_reduce_small_submitter

- template <typename _Tp, typename _Commutative, ::std::uint8_t _VecSize, typename _ExecutionPolicy, typename _Size,
+ template <typename _Tp, typename _Commutative, std::uint8_t _VecSize, typename _ExecutionPolicy, typename _Size,
typename _ReduceOp, typename _TransformOp, typename _InitType, typename... _Ranges>
auto
__parallel_transform_reduce_small_impl(oneapi::dpl::__internal::__device_backend_tag __backend_tag,
@@ -180,17 +180,17 @@ __parallel_transform_reduce_small_impl(oneapi::dpl::__internal::__device_backend
oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__reduce_small_kernel<_CustomName>>;

return __parallel_transform_reduce_small_submitter<_Tp, _Commutative, _VecSize, _ReduceKernel>()(
- __backend_tag, ::std::forward<_ExecutionPolicy>(__exec), __n, __work_group_size, __iters_per_work_item,
- __reduce_op, __transform_op, __init, ::std::forward<_Ranges>(__rngs)...);
+ __backend_tag, std::forward<_ExecutionPolicy>(__exec), __n, __work_group_size, __iters_per_work_item,
+ __reduce_op, __transform_op, __init, std::forward<_Ranges>(__rngs)...);
}

// Submits the first kernel of the parallel_transform_reduce for mid-sized arrays.
// Uses multiple work groups that each reduce __work_group_size * __iters_per_work_item items and store the preliminary
// results in __temp.
- template <typename _Tp, typename _Commutative, ::std::uint8_t _VecSize, typename _KernelName>
+ template <typename _Tp, typename _Commutative, std::uint8_t _VecSize, typename _KernelName>
struct __parallel_transform_reduce_device_kernel_submitter;

- template <typename _Tp, typename _Commutative, ::std::uint8_t _VecSize, typename... _KernelName>
+ template <typename _Tp, typename _Commutative, std::uint8_t _VecSize, typename... _KernelName>
struct __parallel_transform_reduce_device_kernel_submitter<_Tp, _Commutative, _VecSize,
__internal::__optional_kernel_name<_KernelName...>>
{
@@ -214,7 +214,7 @@ struct __parallel_transform_reduce_device_kernel_submitter<_Tp, _Commutative, _V

return __exec.queue().submit([&, __n](sycl::handler& __cgh) {
oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); // get an access to data under SYCL buffer
- ::std::size_t __local_mem_size = __reduce_pattern.local_mem_req(__work_group_size);
+ std::size_t __local_mem_size = __reduce_pattern.local_mem_req(__work_group_size);
__dpl_sycl::__local_accessor<_Tp> __temp_local(sycl::range<1>(__local_mem_size), __cgh);
auto __temp_acc = __scratch_container.__get_scratch_acc(__cgh);
__cgh.parallel_for<_KernelName...>(
@@ -234,10 +234,10 @@ struct __parallel_transform_reduce_device_kernel_submitter<_Tp, _Commutative, _V
// Submits the second kernel of the parallel_transform_reduce for mid-sized arrays.
// Uses a single work groups to reduce __n preliminary results stored in __temp and returns a future object with the
// result buffer.
- template <typename _Tp, typename _Commutative, ::std::uint8_t _VecSize, typename _KernelName>
+ template <typename _Tp, typename _Commutative, std::uint8_t _VecSize, typename _KernelName>
struct __parallel_transform_reduce_work_group_kernel_submitter;

- template <typename _Tp, typename _Commutative, ::std::uint8_t _VecSize, typename... _KernelName>
+ template <typename _Tp, typename _Commutative, std::uint8_t _VecSize, typename... _KernelName>
struct __parallel_transform_reduce_work_group_kernel_submitter<_Tp, _Commutative, _VecSize,
__internal::__optional_kernel_name<_KernelName...>>
{
@@ -282,7 +282,7 @@ struct __parallel_transform_reduce_work_group_kernel_submitter<_Tp, _Commutative
}
}; // struct __parallel_transform_reduce_work_group_kernel_submitter

- template <typename _Tp, typename _Commutative, ::std::uint8_t _VecSize, typename _ExecutionPolicy, typename _Size,
+ template <typename _Tp, typename _Commutative, std::uint8_t _VecSize, typename _ExecutionPolicy, typename _Size,
typename _ReduceOp, typename _TransformOp, typename _InitType, typename... _Ranges>
auto
__parallel_transform_reduce_mid_impl(oneapi::dpl::__internal::__device_backend_tag __backend_tag,
@@ -305,17 +305,17 @@ __parallel_transform_reduce_mid_impl(oneapi::dpl::__internal::__device_backend_t
sycl::event __reduce_event =
__parallel_transform_reduce_device_kernel_submitter<_Tp, _Commutative, _VecSize, _ReduceDeviceKernel>()(
__backend_tag, __exec, __n, __work_group_size, __iters_per_work_item_device_kernel, __reduce_op,
- __transform_op, __scratch_container, ::std::forward<_Ranges>(__rngs)...);
+ __transform_op, __scratch_container, std::forward<_Ranges>(__rngs)...);

// __n_groups preliminary results from the device kernel.
return __parallel_transform_reduce_work_group_kernel_submitter<_Tp, _Commutative, _VecSize,
_ReduceWorkGroupKernel>()(
- __backend_tag, ::std::forward<_ExecutionPolicy>(__exec), __reduce_event, __n_groups, __work_group_size,
+ __backend_tag, std::forward<_ExecutionPolicy>(__exec), __reduce_event, __n_groups, __work_group_size,
__iters_per_work_item_work_group_kernel, __reduce_op, __init, __scratch_container);
}

// General implementation using a tree reduction
- template <typename _Tp, typename _Commutative, ::std::uint8_t _VecSize>
+ template <typename _Tp, typename _Commutative, std::uint8_t _VecSize>
struct __parallel_transform_reduce_impl
{
template <typename _ExecutionPolicy, typename _Size, typename _ReduceOp, typename _TransformOp, typename _InitType,
@@ -340,7 +340,7 @@ struct __parallel_transform_reduce_impl

#if _ONEDPL_COMPILE_KERNEL
auto __kernel = __internal::__kernel_compiler<_ReduceKernel>::__compile(__exec);
- __work_group_size = ::std::min(
+ __work_group_size = std::min(
__work_group_size, static_cast<_Size>(oneapi::dpl::__internal::__kernel_work_group_size(__exec, __kernel)));
#endif

@@ -371,7 +371,7 @@ struct __parallel_transform_reduce_impl

// get an access to data under SYCL buffer
oneapi::dpl::__ranges::__require_access(__cgh, __rngs...);
- ::std::size_t __local_mem_size = __reduce_pattern.local_mem_req(__work_group_size);
+ std::size_t __local_mem_size = __reduce_pattern.local_mem_req(__work_group_size);
__dpl_sycl::__local_accessor<_Tp> __temp_local(sycl::range<1>(__local_mem_size), __cgh);
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT
__cgh.use_kernel_bundle(__kernel.get_kernel_bundle());
@@ -428,7 +428,7 @@ struct __parallel_transform_reduce_impl
});
});
__is_first = false;
- ::std::swap(__offset_1, __offset_2);
+ std::swap(__offset_1, __offset_2);
__n = __n_groups;
__n_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __size_per_work_group);
} while (__n > 1);
@@ -461,32 +461,31 @@ __parallel_transform_reduce(oneapi::dpl::__internal::__device_backend_tag __back

// Empirically found tuning parameters for typical devices.
constexpr _Size __max_iters_per_work_item = 32;
- constexpr ::std::size_t __max_work_group_size = 256;
- constexpr ::std::uint8_t __vector_size = 4;
- constexpr ::std::uint32_t __oversubscription = 2;
+ constexpr std::size_t __max_work_group_size = 256;
+ constexpr std::uint8_t __vector_size = 4;
+ constexpr std::uint32_t __oversubscription = 2;

// Get the work group size adjusted to the local memory limit.
// Pessimistically double the memory requirement to take into account memory used by compiled kernel.
// TODO: find a way to generalize getting of reliable work-group size.
- ::std::size_t __work_group_size =
- oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, static_cast<::std::size_t>(sizeof(_Tp) * 2));
+ std::size_t __work_group_size =
+ oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, static_cast<std::size_t>(sizeof(_Tp) * 2));

// Limit work-group size to __max_work_group_size for performance on GPUs. Empirically tested.
- __work_group_size = ::std::min(__work_group_size, __max_work_group_size);
+ __work_group_size = std::min(__work_group_size, __max_work_group_size);
const _Size __max_elements_per_wg = __work_group_size * __max_iters_per_work_item;

// Use single work group implementation if less than __max_iters_per_work_item elements per work-group.
// We can use 16-bit addressing since we have at most __max_work_group_size * __max_iters_per_work_item elements.
if (__n <= __max_elements_per_wg)
{
- const ::std::uint16_t __n_short = static_cast<::std::uint16_t>(__n);
- const ::std::uint16_t __work_group_size_short = static_cast<::std::uint16_t>(__work_group_size);
- ::std::uint16_t __iters_per_work_item =
- oneapi::dpl::__internal::__dpl_ceiling_div(__n_short, __work_group_size);
+ const std::uint16_t __n_short = static_cast<std::uint16_t>(__n);
+ const std::uint16_t __work_group_size_short = static_cast<std::uint16_t>(__work_group_size);
+ std::uint16_t __iters_per_work_item = oneapi::dpl::__internal::__dpl_ceiling_div(__n_short, __work_group_size);
__iters_per_work_item = __adjust_iters_per_work_item<__vector_size>(__iters_per_work_item);
return __parallel_transform_reduce_small_impl<_Tp, _Commutative, __vector_size>(
- __backend_tag, ::std::forward<_ExecutionPolicy>(__exec), __n_short, __work_group_size_short,
- __iters_per_work_item, __reduce_op, __transform_op, __init, ::std::forward<_Ranges>(__rngs)...);
+ __backend_tag, std::forward<_ExecutionPolicy>(__exec), __n_short, __work_group_size_short,
+ __iters_per_work_item, __reduce_op, __transform_op, __init, std::forward<_Ranges>(__rngs)...);
}
// Use two-step tree reduction.
// First step reduces __work_group_size * __iters_per_work_item_device_kernel elements.
@@ -495,13 +494,13 @@ __parallel_transform_reduce(oneapi::dpl::__internal::__device_backend_tag __back
// elements.
else if (__n <= __max_elements_per_wg * __max_elements_per_wg)
{
- const ::std::uint32_t __n_short = static_cast<::std::uint32_t>(__n);
- const ::std::uint32_t __work_group_size_short = static_cast<::std::uint32_t>(__work_group_size);
+ const std::uint32_t __n_short = static_cast<std::uint32_t>(__n);
+ const std::uint32_t __work_group_size_short = static_cast<std::uint32_t>(__work_group_size);
// Fully-utilize the device by running a work-group per compute unit.
// Add a factor more work-groups than compute units to fully utilizes the device and hide latencies.
- const ::std::uint32_t __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec);
- ::std::uint32_t __n_groups = __max_cu * __oversubscription;
- ::std::uint32_t __iters_per_work_item_device_kernel =
+ const std::uint32_t __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec);
+ std::uint32_t __n_groups = __max_cu * __oversubscription;
+ std::uint32_t __iters_per_work_item_device_kernel =
oneapi::dpl::__internal::__dpl_ceiling_div(__n_short, __n_groups * __work_group_size_short);
__iters_per_work_item_device_kernel =
__adjust_iters_per_work_item<__vector_size>(__iters_per_work_item_device_kernel);
@@ -513,21 +512,21 @@ __parallel_transform_reduce(oneapi::dpl::__internal::__device_backend_tag __back
__iters_per_work_item_device_kernel = __max_iters_per_work_item;
__n_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__n_short, __max_elements_per_wg);
}
- ::std::uint32_t __iters_per_work_item_work_group_kernel =
+ std::uint32_t __iters_per_work_item_work_group_kernel =
oneapi::dpl::__internal::__dpl_ceiling_div(__n_groups, __work_group_size_short);
__iters_per_work_item_work_group_kernel =
__adjust_iters_per_work_item<__vector_size>(__iters_per_work_item_work_group_kernel);
return __parallel_transform_reduce_mid_impl<_Tp, _Commutative, __vector_size>(
- __backend_tag, ::std::forward<_ExecutionPolicy>(__exec), __n_short, __work_group_size_short,
+ __backend_tag, std::forward<_ExecutionPolicy>(__exec), __n_short, __work_group_size_short,
__iters_per_work_item_device_kernel, __iters_per_work_item_work_group_kernel, __reduce_op, __transform_op,
- __init, ::std::forward<_Ranges>(__rngs)...);
+ __init, std::forward<_Ranges>(__rngs)...);
}
// Otherwise use a recursive tree reduction with __max_iters_per_work_item __iters_per_work_item.
else
{
return __parallel_transform_reduce_impl<_Tp, _Commutative, __vector_size>::submit(
- __backend_tag, ::std::forward<_ExecutionPolicy>(__exec), __n, static_cast<_Size>(__work_group_size),
- __max_iters_per_work_item, __reduce_op, __transform_op, __init, ::std::forward<_Ranges>(__rngs)...);
+ __backend_tag, std::forward<_ExecutionPolicy>(__exec), __n, static_cast<_Size>(__work_group_size),
+ __max_iters_per_work_item, __reduce_op, __transform_op, __init, std::forward<_Ranges>(__rngs)...);
}
}

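For orientation, the dispatch in __parallel_transform_reduce above picks one of three code paths purely from the input size: arrays with at most __work_group_size * __max_iters_per_work_item elements take the single-work-group path (where 16-bit indexing suffices), arrays up to the square of that bound take the two-kernel path (32-bit indexing), and anything larger falls back to the recursive tree reduction. The standalone sketch below mirrors only that selection logic; the enum, function, and parameter names are illustrative and not part of oneDPL, while the numeric constants are the tuning values visible in the diff.

#include <algorithm>
#include <cstddef>
#include <iostream>

// Illustrative stand-in for the three code paths chosen by __parallel_transform_reduce.
enum class reduce_path { single_work_group, two_kernel, recursive_tree };

reduce_path
choose_reduce_path(std::size_t n, std::size_t slm_adjusted_work_group_size)
{
    constexpr std::size_t max_iters_per_work_item = 32; // tuning value from the diff
    constexpr std::size_t max_work_group_size = 256;    // tuning value from the diff

    // Clamp the SLM-adjusted work-group size, as the library does for GPU performance.
    const std::size_t work_group_size = std::min(slm_adjusted_work_group_size, max_work_group_size);
    const std::size_t max_elements_per_wg = work_group_size * max_iters_per_work_item;

    if (n <= max_elements_per_wg)                       // fits a single work-group
        return reduce_path::single_work_group;
    if (n <= max_elements_per_wg * max_elements_per_wg) // two kernels: device + work-group
        return reduce_path::two_kernel;
    return reduce_path::recursive_tree;                 // general tree reduction
}

int main()
{
    std::cout << static_cast<int>(choose_reduce_path(4096, 256)) << '\n';      // 0: single_work_group
    std::cout << static_cast<int>(choose_reduce_path(1000000, 256)) << '\n';   // 1: two_kernel
    std::cout << static_cast<int>(choose_reduce_path(100000000, 256)) << '\n'; // 2: recursive_tree
}
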
18 changes: 9 additions & 9 deletions include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h
@@ -203,7 +203,7 @@ struct __init_processing
// Load elements consecutively from global memory, transform them, and apply a local reduction. Each local result is
// stored in local memory.
template <typename _ExecutionPolicy, typename _Operation1, typename _Operation2, typename _Tp, typename _Commutative,
- ::std::uint8_t _VecSize>
+ std::uint8_t _VecSize>
struct transform_reduce
{
_Operation1 __binary_op;
Expand All @@ -213,7 +213,7 @@ struct transform_reduce
void
vectorized_reduction_first(const _Size __start_idx, _Res& __res, const _Acc&... __acc) const
{
- new (&__res.__v) _Tp(std::move(__unary_op(__start_idx, __acc...)));
+ new (&__res.__v) _Tp(__unary_op(__start_idx, __acc...));
_ONEDPL_PRAGMA_UNROLL
for (_Size __i = 1; __i < _VecSize; ++__i)
__res.__v = __binary_op(__res.__v, __unary_op(__start_idx + __i, __acc...));
Expand All @@ -233,7 +233,7 @@ struct transform_reduce
scalar_reduction_remainder(const _Size __start_idx, const _Size __adjusted_n, const _Size __max_iters, _Res& __res,
const _Acc&... __acc) const
{
- const _Size __no_iters = ::std::min(static_cast<_Size>(__adjusted_n - __start_idx), __max_iters);
+ const _Size __no_iters = std::min(static_cast<_Size>(__adjusted_n - __start_idx), __max_iters);
for (_Size __idx = 0; __idx < __no_iters; ++__idx)
__res.__v = __binary_op(__res.__v, __unary_op(__start_idx + __idx, __acc...));
}
Expand All @@ -248,7 +248,7 @@ struct transform_reduce
const _Size __global_idx = __item_id.get_global_id(0);
if (__iters_per_work_item == 1)
{
- new (&__res.__v) _Tp(std::move(__unary_op(__global_idx, __acc...)));
+ new (&__res.__v) _Tp(__unary_op(__global_idx, __acc...));
return;
}
const _Size __local_range = __item_id.get_local_range(0);
Expand Down Expand Up @@ -305,7 +305,7 @@ struct transform_reduce
// Scalar remainder
else if (__adjusted_global_id < __adjusted_n)
{
- new (&__res.__v) _Tp(std::move(__unary_op(__adjusted_global_id, __acc...)));
+ new (&__res.__v) _Tp(__unary_op(__adjusted_global_id, __acc...));
scalar_reduction_remainder(static_cast<_Size>(__adjusted_global_id + 1), __adjusted_n,
static_cast<_Size>(_VecSize - 2), __res, __acc...);
}
@@ -324,7 +324,7 @@ struct transform_reduce
_Size __last_wg_remainder = __n % __items_per_work_group;
// Adjust remainder and wg size for vector size
_Size __last_wg_vec = oneapi::dpl::__internal::__dpl_ceiling_div(__last_wg_remainder, _VecSize);
- _Size __last_wg_contrib = ::std::min(__last_wg_vec, static_cast<_Size>(__work_group_size * _VecSize));
+ _Size __last_wg_contrib = std::min(__last_wg_vec, static_cast<_Size>(__work_group_size * _VecSize));
return __full_group_contrib + __last_wg_contrib;
}
return oneapi::dpl::__internal::__dpl_ceiling_div(__n, __iters_per_work_item);
@@ -361,7 +361,7 @@ struct reduce_over_group
auto __group_size = __item_id.get_local_range().size();

__local_mem[__local_idx] = __val;
- for (::std::uint32_t __power_2 = 1; __power_2 < __group_size; __power_2 *= 2)
+ for (std::uint32_t __power_2 = 1; __power_2 < __group_size; __power_2 *= 2)
{
__dpl_sycl::__group_barrier(__item_id);
if ((__local_idx & (2 * __power_2 - 1)) == 0 && __local_idx + __power_2 < __group_size &&
@@ -387,8 +387,8 @@ struct reduce_over_group
__init_processing<_Tp>{}(__init, __result, __bin_op1);
}

- inline ::std::size_t
- local_mem_req(const ::std::uint16_t& __work_group_size) const
+ inline std::size_t
+ local_mem_req(const std::uint16_t& __work_group_size) const
{
if constexpr (__has_known_identity<_BinaryOperation1, _Tp>{})
return 0;
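The removed std::move calls in transform_reduce were wrapped around __unary_op(...), a call that returns its result by value, before handing it to placement new. When that result has the same type as _Tp, passing the prvalue directly lets the object be constructed straight into the destination (guaranteed copy elision since C++17), whereas std::move turns the prvalue into an xvalue, forcing materialization of a temporary plus a move construction. The hypothetical Widget type below (not oneDPL code) makes the difference observable:

#include <iostream>
#include <new>
#include <utility>

// Hypothetical type that reports which constructor runs.
struct Widget
{
    int value;
    Widget(int v) : value(v) { std::cout << "Widget(int)\n"; }
    Widget(Widget&& other) : value(other.value) { std::cout << "Widget(Widget&&)\n"; }
};

// Stands in for a transform like __unary_op: returns a prvalue.
Widget make_widget(int v) { return Widget(v); }

int main()
{
    alignas(Widget) unsigned char storage1[sizeof(Widget)];
    alignas(Widget) unsigned char storage2[sizeof(Widget)];

    // Old pattern: the prvalue is materialized, then move-constructed into the storage.
    Widget* a = new (storage1) Widget(std::move(make_widget(1))); // prints Widget(int), then Widget(Widget&&)

    // New pattern: the prvalue initializes the object in place, no move.
    Widget* b = new (storage2) Widget(make_widget(2));            // prints Widget(int) only

    a->~Widget();
    b->~Widget();
}

Dropping the std::move therefore saves one move construction per initialization, which is relevant when _Tp is a non-trivial type.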
