Commit 4d6565b

Make __data_per_workitem a runtime argument

Signed-off-by: Dmitriy Sobolev <[email protected]>

dmitriy-sobolev committed Aug 12, 2024
1 parent c7101ad commit 4d6565b
Showing 1 changed file with 44 additions and 77 deletions.
@@ -56,13 +56,13 @@ struct __subgroup_bubble_sorter
     }
 };
 
-template <std::uint16_t __data_per_workitem>
 struct __group_merge_path_sorter
 {
     template <typename _StorageAcc, typename _Compare>
     bool
     sort(const sycl::nd_item<1>& __item, const _StorageAcc& __storage_acc, _Compare __comp, std::uint32_t __start,
-         std::uint32_t __end, std::uint32_t __sorted, std::uint32_t __workgroup_size) const
+         std::uint32_t __end, std::uint32_t __sorted, std::uint16_t __data_per_workitem,
+         std::uint32_t __workgroup_size) const
     {
         const std::uint32_t __sorted_final = __data_per_workitem * __workgroup_size;
 
@@ -103,34 +103,34 @@ struct __group_merge_path_sorter
     }
 };
 
-template <std::uint16_t _DataPerWorkitem, typename _Range, typename _Compare>
+template <typename _Range, typename _Compare>
 struct __leaf_sorter
 {
-    static constexpr std::uint16_t __data_per_workitem = _DataPerWorkitem;
-
     using _Tp = oneapi::dpl::__internal::__value_t<_Range>;
     using _Size = oneapi::dpl::__internal::__difference_t<_Range>;
     using _StorageAcc = __dpl_sycl::__local_accessor<_Tp>;
     // TODO: select a better sub-group sorter depending on sort stability,
     // a type (e.g. it can be trivially copied for shuffling within a sub-group)
     using _SubGroupSorter = __subgroup_bubble_sorter;
-    using _GroupSorter = __group_merge_path_sorter<__data_per_workitem>;
+    using _GroupSorter = __group_merge_path_sorter;
 
     static std::uint32_t
-    storage_size(std::uint32_t __future_workgroup_size)
+    storage_size(std::uint16_t __future_data_per_workitem, std::uint32_t __future_workgroup_size)
     {
-        return 2 * __data_per_workitem * __future_workgroup_size;
+        return 2 * __future_data_per_workitem * __future_workgroup_size;
     }
 
     _StorageAcc
     create_storage_accessor(sycl::handler& __cgh)
     {
-        return _StorageAcc(storage_size(__workgroup_size), __cgh);
+        return _StorageAcc(storage_size(__data_per_workitem, __workgroup_size), __cgh);
     }
 
-    __leaf_sorter(const _Range& __rng, _Compare __comp, std::uint32_t __workgroup_size)
-        : __rng(__rng), __comp(__comp), __n(__rng.size()), __workgroup_size(__workgroup_size),
-          __process_size(__data_per_workitem * __workgroup_size), __sub_group_sorter(), __group_sorter()
+    __leaf_sorter(const _Range& __rng, _Compare __comp, std::uint16_t __data_per_workitem,
+                  std::uint32_t __workgroup_size)
+        : __rng(__rng), __comp(__comp), __n(__rng.size()), __data_per_workitem(__data_per_workitem),
+          __workgroup_size(__workgroup_size), __process_size(__data_per_workitem * __workgroup_size),
+          __sub_group_sorter(), __group_sorter()
     {
        assert((__process_size & (__process_size - 1)) == 0 && "Process size must be a power of 2");
    }
@@ -151,7 +151,6 @@ struct __leaf_sorter
 
        // 1. Load
        // TODO: add a specialization for a case __global_value_id < __n condition is true for the whole work-group
-        _ONEDPL_PRAGMA_UNROLL
        for (std::uint16_t __i = 0; __i < __data_per_workitem; ++__i)
        {
            const std::uint32_t __sg_offset = __sg_start + __i * __sg_size;
@@ -176,11 +175,11 @@ struct __leaf_sorter
 
        // 3. Sort on work-group level
        bool __data_in_temp = __group_sorter.sort(__item, __storage_acc, __comp, static_cast<std::uint32_t>(0),
-                                                  __adjusted_process_size, __data_per_workitem, __workgroup_size);
+                                                  __adjusted_process_size, /*sorted per sub-group*/__data_per_workitem,
+                                                  __data_per_workitem, __workgroup_size);
        // barrier is not needed here because of the barrier inside the sort method
 
        // 4. Store
-        _ONEDPL_PRAGMA_UNROLL
        for (std::uint16_t __i = 0; __i < __data_per_workitem; ++__i)
        {
            const std::uint32_t __sg_offset = __sg_start + __i * __sg_size;
@@ -196,6 +195,7 @@ struct __leaf_sorter
    _Range __rng;
    _Compare __comp;
    _Size __n;
+    std::uint16_t __data_per_workitem;
    std::uint32_t __workgroup_size;
    std::uint32_t __process_size;
    _SubGroupSorter __sub_group_sorter;
@@ -317,50 +317,27 @@ class __sort_global_kernel;
 template <typename... _Name>
 class __sort_copy_back_kernel;
 
-template <typename _IndexT>
-struct __parallel_sort_submitter_caller
-{
-    template <typename _ExecutionPolicy, typename _Range, typename _Compare, typename _LeafSorterT>
-    auto
-    operator()(_ExecutionPolicy&& __exec, _Range&& __rng, _Compare __comp, _LeafSorterT& __leaf_sorter)
-    {
-        using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>;
-        using _LeafDPWI = std::integral_constant<std::uint16_t, _LeafSorterT::__data_per_workitem>;
-        // TODO: split the submitter into multiple ones to avoid extra compilation of kernels
-        // - _LeafSortKernel does not need _IndexT
-        // - _GlobalSortKernel does not need _LeafDPWI
-        // - _CopyBackKernel does not need either of them
-        using _LeafSortKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<
-            __sort_leaf_kernel<_CustomName, _IndexT, _LeafDPWI>>;
-        using _GlobalSortKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<
-            __sort_global_kernel<_CustomName, _IndexT, _LeafDPWI>>;
-        using _CopyBackKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<
-            __sort_copy_back_kernel<_CustomName, _IndexT, _LeafDPWI>>;
-
-        return __parallel_sort_submitter<_IndexT, _LeafSortKernel, _GlobalSortKernel, _CopyBackKernel>()(
-            std::forward<_ExecutionPolicy>(__exec), std::forward<_Range>(__rng), __comp, __leaf_sorter);
-    }
-};
-
 template <typename _IndexT, typename _ExecutionPolicy, typename _Range, typename _Compare>
 auto
 __submit_selecting_leaf(_ExecutionPolicy&& __exec, _Range&& __rng, _Compare __comp)
 {
-    // 8 is the maximum reasonable value for bubble sub-group sorter due to algorithm complexity
-    // TODO: reconsider the value if another algorithm is used,
-    // or an internal cap is set (e.g. sorting 2 sequences of 4/8 items each)
-    using _Leaf8 = __leaf_sorter<8, std::decay_t<_Range>, _Compare>;
-    using _Leaf4 = __leaf_sorter<4, std::decay_t<_Range>, _Compare>;
-    // 2 is the smallest reasonable value for merge-path group sorter since it loads 2 values at least
-    using _Leaf2 = __leaf_sorter<2, std::decay_t<_Range>, _Compare>;
-
+    using _Leaf = __leaf_sorter<std::decay_t<_Range>, _Compare>;
    using _Tp = oneapi::dpl::__internal::__value_t<_Range>;
 
+    using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>;
+    // TODO: split the submitter into multiple ones to avoid extra compilation of kernels:
+    // - _LeafSortKernel and _CopyBackKernel do not need _IndexT
+    using _LeafSortKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<
+        __sort_leaf_kernel<_CustomName, _IndexT>>;
+    using _GlobalSortKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<
+        __sort_global_kernel<_CustomName, _IndexT>>;
+    using _CopyBackKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<
+        __sort_copy_back_kernel<_CustomName, _IndexT>>;
+
    const std::size_t __n = __rng.size();
    auto __device = __exec.queue().get_device();
 
-    const std::size_t __max_wg_size = __device.template get_info<sycl::info::device::max_work_group_size>();
-    std::size_t __wg_size = oneapi::dpl::__internal::__dpl_bit_floor(__max_wg_size);
-
    const bool __is_cpu = __device.is_cpu();
    std::uint32_t __max_sg_size{};
@@ -377,39 +354,29 @@ __submit_selecting_leaf(_ExecutionPolicy&& __exec, _Range&& __rng, _Compare __comp)
    // TODO: adjust the saturation point for Intel GPUs:
    // CU number is incorrect for Intel GPUs since it returns the number of VE instead of XC,
    // and max work-group size utilizes only a half of the XC resources for Data Center GPUs
-    const auto __saturation_point = __max_cu * __max_hw_wg_size;
-    const auto __desired_data_per_workitem = __n / __saturation_point;
+    const std::uint32_t __saturation_point = __max_cu * __max_hw_wg_size;
+    const std::uint32_t __desired_data_per_workitem = __n / __saturation_point;
 
+    // 8 is the maximum reasonable value for bubble sub-group sorter due to algorithm complexity
+    // 2 is the smallest reasonable value for merge-path group sorter since it loads 2 values at least
+    // TODO: reconsider the values if other algorithms are used
+    const std::uint16_t __data_per_workitem = __desired_data_per_workitem <= 2 ?
+        2 : std::min<std::uint32_t>(oneapi::dpl::__internal::__dpl_bit_floor(__desired_data_per_workitem), 8);
+
    // Pessimistically double the memory requirement to take into account memory used by compiled kernel.
    // TODO: investigate if the adjustment can be less conservative
    const std::size_t __max_slm_items =
        __device.template get_info<sycl::info::device::local_mem_size>() / (sizeof(_Tp) * 2);
-    auto __caller = __parallel_sort_submitter_caller<_IndexT>();
-    if (__max_slm_items >= _Leaf8::storage_size(__wg_size) && __desired_data_per_workitem >= 8)
-    {
-        _Leaf8 __leaf_sorter(__rng, __comp, __wg_size);
-        return __caller(std::forward<_ExecutionPolicy>(__exec), __rng, __comp, __leaf_sorter);
-    }
-    else if (__max_slm_items >= _Leaf4::storage_size(__wg_size) && __desired_data_per_workitem >= 4)
-    {
-        _Leaf4 __leaf_sorter(__rng, __comp, __wg_size);
-        return __caller(std::forward<_ExecutionPolicy>(__exec), __rng, __comp, __leaf_sorter);
-    }
-    else if (__max_slm_items >= _Leaf2::storage_size(__wg_size) && __desired_data_per_workitem >= 2)
-    {
-        _Leaf2 __leaf_sorter(__rng, __comp, __wg_size);
-        return __caller(std::forward<_ExecutionPolicy>(__exec), __rng, __comp, __leaf_sorter);
-    }
-    else
-    {
-        std::size_t __slm_max_wg_size = __max_slm_items / _Leaf2::storage_size(1);
-        // __n is taken as is because of the bit floor and processing 2 items per work-item
-        // hence the processed size always fits a single work-group if __n is chosen
-        __wg_size = std::min<std::size_t>({__wg_size, __slm_max_wg_size, __n});
-        __wg_size = oneapi::dpl::__internal::__dpl_bit_floor(__wg_size);
-        _Leaf2 __leaf_sorter(__rng, __comp, __wg_size);
-        return __caller(std::forward<_ExecutionPolicy>(__exec), __rng, __comp, __leaf_sorter);
-    }
-
+    const std::size_t __max_slm_wg_size = __max_slm_items / _Leaf::storage_size(__data_per_workitem, 1);
+    // __n is taken as is because of the bit floor and processing at least 2 items per work-item
+    // hence the processed size always fits a single work-group if __n is chosen
+    std::size_t __wg_size = std::min<std::size_t>({__max_hw_wg_size, __max_slm_wg_size, __n});
+    __wg_size = oneapi::dpl::__internal::__dpl_bit_floor(__wg_size);
+
+    _Leaf __leaf(__rng, __comp, __data_per_workitem, __wg_size);
+    return __parallel_sort_submitter<_IndexT, _LeafSortKernel, _GlobalSortKernel, _CopyBackKernel>()(
+        std::forward<_ExecutionPolicy>(__exec), std::forward<_Range>(__rng), __comp, __leaf);
 };
 
 template <typename _ExecutionPolicy, typename _Range, typename _Compare>
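Note on the shape of the change: __data_per_workitem stops being a template parameter of __leaf_sorter and of the kernel name types, so the three per-value kernel instantiations (_Leaf8/_Leaf4/_Leaf2) collapse into one. The stripped-down sketch below illustrates that pattern outside oneDPL; every name in it is hypothetical.

// Hypothetical, stripped-down illustration of the refactoring pattern (not
// oneDPL code): a tuning knob moves from a template parameter to a runtime
// constructor argument, so one instantiation covers every value of the knob.
#include <cstdint>
#include <iostream>

// Before: every DataPerWorkitem value is a distinct type, and any kernel
// named after it must be instantiated and compiled once per value.
template <std::uint16_t DataPerWorkitem>
struct LeafSorterCompileTime
{
    static constexpr std::uint16_t data_per_workitem = DataPerWorkitem;
};

// After: a single type; the kernel reads the knob at run time.
struct LeafSorterRuntime
{
    explicit LeafSorterRuntime(std::uint16_t dpw) : data_per_workitem(dpw) {}
    std::uint16_t data_per_workitem;
};

int main()
{
    LeafSorterCompileTime<8> ct; // fixed per instantiation: 2, 4 and 8 each need one
    LeafSorterRuntime rt(8);     // chosen at run time by the selection heuristic
    std::cout << ct.data_per_workitem << ' ' << rt.data_per_workitem << '\n';
}

The trade-off is that loops bounded by __data_per_workitem no longer have a compile-time trip count, which is presumably why the _ONEDPL_PRAGMA_UNROLL hints on the load and store loops were dropped.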
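The selection logic in __submit_selecting_leaf can be checked with plain arithmetic. Below is a minimal standalone sketch under assumed device numbers: max_cu, max_hw_wg_size, and local_mem_size are made-up stand-ins for the corresponding sycl::info::device queries (max_compute_units, max_work_group_size, local_mem_size), and C++20 std::bit_floor plays the role of __dpl_bit_floor.

// A minimal standalone sketch of the tuning heuristic, assuming made-up device
// numbers; real code obtains them from sycl::info::device queries and uses
// oneapi::dpl::__internal::__dpl_bit_floor instead of std::bit_floor (C++20).
#include <algorithm>
#include <bit>
#include <cstddef>
#include <cstdint>
#include <iostream>

int main()
{
    const std::uint32_t max_cu = 32;          // hypothetical compute-unit count
    const std::size_t max_hw_wg_size = 512;   // hypothetical max work-group size
    const std::size_t local_mem_size = 65536; // hypothetical SLM capacity, bytes
    const std::size_t sizeof_value = 4;       // element size, e.g. std::int32_t
    const std::size_t n = 1 << 20;            // input size

    // Work per item needed to keep every compute unit busy.
    const std::uint32_t saturation_point = max_cu * max_hw_wg_size;
    const std::uint32_t desired_dpw = n / saturation_point;

    // Clamp to [2, 8] as in the diff: 2 is the minimum the merge-path group
    // sorter can load, 8 the practical cap for the bubble sub-group sorter.
    const std::uint16_t data_per_workitem =
        desired_dpw <= 2 ? 2 : std::min<std::uint32_t>(std::bit_floor(desired_dpw), 8);

    // Halve the SLM budget to leave room for memory used by the compiled kernel,
    // then bound the work-group size by storage_size(dpw, 1) = 2 * dpw items.
    const std::size_t max_slm_items = local_mem_size / (sizeof_value * 2);
    const std::size_t max_slm_wg_size = max_slm_items / (2 * data_per_workitem);

    // Final work-group size: a power of 2 no larger than hardware, SLM, or n allow.
    std::size_t wg_size = std::min<std::size_t>({max_hw_wg_size, max_slm_wg_size, n});
    wg_size = std::bit_floor(wg_size);

    std::cout << "data_per_workitem = " << data_per_workitem
              << ", wg_size = " << wg_size << '\n'; // prints 8 and 512
}

With these assumed numbers the heuristic picks 8 items per work-item and a work-group of 512, so each leaf sorts 8 * 512 = 4096 elements, and the double-buffered SLM footprint of 2 * 8 * 512 = 8192 items exactly fits the halved 8192-item budget.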
