Commit 4d6565b

Make __data_per_workitem a runtime argument

Signed-off-by: Dmitriy Sobolev <[email protected]>

dmitriy-sobolev committed Aug 12, 2024
1 parent c7101ad commit 4d6565b
Showing 1 changed file with 44 additions and 77 deletions.
@@ -56,13 +56,13 @@ struct __subgroup_bubble_sorter
     }
 };
 
-template <std::uint16_t __data_per_workitem>
 struct __group_merge_path_sorter
 {
     template <typename _StorageAcc, typename _Compare>
     bool
     sort(const sycl::nd_item<1>& __item, const _StorageAcc& __storage_acc, _Compare __comp, std::uint32_t __start,
-         std::uint32_t __end, std::uint32_t __sorted, std::uint32_t __workgroup_size) const
+         std::uint32_t __end, std::uint32_t __sorted, std::uint16_t __data_per_workitem,
+         std::uint32_t __workgroup_size) const
     {
         const std::uint32_t __sorted_final = __data_per_workitem * __workgroup_size;
 
@@ -103,34 +103,34 @@ struct __group_merge_path_sorter
     }
 };
 
-template <std::uint16_t _DataPerWorkitem, typename _Range, typename _Compare>
+template <typename _Range, typename _Compare>
 struct __leaf_sorter
 {
-    static constexpr std::uint16_t __data_per_workitem = _DataPerWorkitem;
-
     using _Tp = oneapi::dpl::__internal::__value_t<_Range>;
     using _Size = oneapi::dpl::__internal::__difference_t<_Range>;
     using _StorageAcc = __dpl_sycl::__local_accessor<_Tp>;
     // TODO: select a better sub-group sorter depending on sort stability,
     // a type (e.g. it can be trivially copied for shuffling within a sub-group)
     using _SubGroupSorter = __subgroup_bubble_sorter;
-    using _GroupSorter = __group_merge_path_sorter<__data_per_workitem>;
+    using _GroupSorter = __group_merge_path_sorter;
 
     static std::uint32_t
-    storage_size(std::uint32_t __future_workgroup_size)
+    storage_size(std::uint16_t __future_data_per_workitem, std::uint32_t __future_workgroup_size)
     {
-        return 2 * __data_per_workitem * __future_workgroup_size;
+        return 2 * __future_data_per_workitem * __future_workgroup_size;
     }
 
     _StorageAcc
     create_storage_accessor(sycl::handler& __cgh)
     {
-        return _StorageAcc(storage_size(__workgroup_size), __cgh);
+        return _StorageAcc(storage_size(__data_per_workitem, __workgroup_size), __cgh);
     }
 
-    __leaf_sorter(const _Range& __rng, _Compare __comp, std::uint32_t __workgroup_size)
-        : __rng(__rng), __comp(__comp), __n(__rng.size()), __workgroup_size(__workgroup_size),
-          __process_size(__data_per_workitem * __workgroup_size), __sub_group_sorter(), __group_sorter()
+    __leaf_sorter(const _Range& __rng, _Compare __comp, std::uint16_t __data_per_workitem,
+                  std::uint32_t __workgroup_size)
+        : __rng(__rng), __comp(__comp), __n(__rng.size()), __data_per_workitem(__data_per_workitem),
+          __workgroup_size(__workgroup_size), __process_size(__data_per_workitem * __workgroup_size),
+          __sub_group_sorter(), __group_sorter()
     {
        assert((__process_size & (__process_size - 1)) == 0 && "Process size must be a power of 2");
    }
@@ -151,7 +151,6 @@ struct __leaf_sorter
 
        // 1. Load
        // TODO: add a specialization for a case __global_value_id < __n condition is true for the whole work-group
-        _ONEDPL_PRAGMA_UNROLL
        for (std::uint16_t __i = 0; __i < __data_per_workitem; ++__i)
        {
            const std::uint32_t __sg_offset = __sg_start + __i * __sg_size;
@@ -176,11 +175,11 @@ struct __leaf_sorter
 
        // 3. Sort on work-group level
        bool __data_in_temp = __group_sorter.sort(__item, __storage_acc, __comp, static_cast<std::uint32_t>(0),
-                                                  __adjusted_process_size, __data_per_workitem, __workgroup_size);
+                                                  __adjusted_process_size, /*sorted per sub-group*/__data_per_workitem,
+                                                  __data_per_workitem, __workgroup_size);
        // barrier is not needed here because of the barrier inside the sort method
 
        // 4. Store
-        _ONEDPL_PRAGMA_UNROLL
        for (std::uint16_t __i = 0; __i < __data_per_workitem; ++__i)
        {
            const std::uint32_t __sg_offset = __sg_start + __i * __sg_size;
@@ -196,6 +195,7 @@ struct __leaf_sorter
    _Range __rng;
    _Compare __comp;
    _Size __n;
+    std::uint16_t __data_per_workitem;
    std::uint32_t __workgroup_size;
    std::uint32_t __process_size;
    _SubGroupSorter __sub_group_sorter;
@@ -317,50 +317,27 @@ class __sort_global_kernel;
 template <typename... _Name>
 class __sort_copy_back_kernel;
 
-template <typename _IndexT>
-struct __parallel_sort_submitter_caller
-{
-    template <typename _ExecutionPolicy, typename _Range, typename _Compare, typename _LeafSorterT>
-    auto
-    operator()(_ExecutionPolicy&& __exec, _Range&& __rng, _Compare __comp, _LeafSorterT& __leaf_sorter)
-    {
-        using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>;
-        using _LeafDPWI = std::integral_constant<std::uint16_t, _LeafSorterT::__data_per_workitem>;
-        // TODO: split the submitter into multiple ones to avoid extra compilation of kernels
-        // - _LeafSortKernel does not need _IndexT
-        // - _GlobalSortKernel does not need _LeafDPWI
-        // - _CopyBackKernel does not need either of them
-        using _LeafSortKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<
-            __sort_leaf_kernel<_CustomName, _IndexT, _LeafDPWI>>;
-        using _GlobalSortKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<
-            __sort_global_kernel<_CustomName, _IndexT, _LeafDPWI>>;
-        using _CopyBackKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<
-            __sort_copy_back_kernel<_CustomName, _IndexT, _LeafDPWI>>;
-
-        return __parallel_sort_submitter<_IndexT, _LeafSortKernel, _GlobalSortKernel, _CopyBackKernel>()(
-            std::forward<_ExecutionPolicy>(__exec), std::forward<_Range>(__rng), __comp, __leaf_sorter);
-    }
-};
-
 template <typename _IndexT, typename _ExecutionPolicy, typename _Range, typename _Compare>
 auto
 __submit_selecting_leaf(_ExecutionPolicy&& __exec, _Range&& __rng, _Compare __comp)
 {
-    // 8 is the maximum reasonable value for bubble sub-group sorter due to algorithm complexity
-    // TODO: reconsider the value if another algorithm is used,
-    // or an internal cap is set (e.g. sorting 2 sequences of 4/8 items each)
-    using _Leaf8 = __leaf_sorter<8, std::decay_t<_Range>, _Compare>;
-    using _Leaf4 = __leaf_sorter<4, std::decay_t<_Range>, _Compare>;
-    // 2 is the smallest reasonable value for merge-path group sorter since it loads 2 values at least
-    using _Leaf2 = __leaf_sorter<2, std::decay_t<_Range>, _Compare>;
-
+    using _Leaf = __leaf_sorter<std::decay_t<_Range>, _Compare>;
    using _Tp = oneapi::dpl::__internal::__value_t<_Range>;
 
+    using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>;
+    // TODO: split the submitter into multiple ones to avoid extra compilation of kernels:
+    // - _LeafSortKernel and _CopyBackKernel do not need _IndexT
+    using _LeafSortKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<
+        __sort_leaf_kernel<_CustomName, _IndexT>>;
+    using _GlobalSortKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<
+        __sort_global_kernel<_CustomName, _IndexT>>;
+    using _CopyBackKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<
+        __sort_copy_back_kernel<_CustomName, _IndexT>>;
+
    const std::size_t __n = __rng.size();
    auto __device = __exec.queue().get_device();
 
-    const std::size_t __max_wg_size = __device.template get_info<sycl::info::device::max_work_group_size>();
-    std::size_t __wg_size = oneapi::dpl::__internal::__dpl_bit_floor(__max_wg_size);
-
    const bool __is_cpu = __device.is_cpu();
    std::uint32_t __max_sg_size{};
@@ -377,39 +354,29 @@ __submit_selecting_leaf(_ExecutionPolicy&& __exec, _Range&& __rng, _Compare __comp)
    // TODO: adjust the saturation point for Intel GPUs:
    // CU number is incorrect for Intel GPUs since it returns the number of VE instead of XC,
    // and max work-group size utilizes only a half of the XC resources for Data Center GPUs
-    const auto __saturation_point = __max_cu * __max_hw_wg_size;
-    const auto __desired_data_per_workitem = __n / __saturation_point;
+    const std::uint32_t __saturation_point = __max_cu * __max_hw_wg_size;
+    const std::uint32_t __desired_data_per_workitem = __n / __saturation_point;
 
+    // 8 is the maximum reasonable value for bubble sub-group sorter due to algorithm complexity
+    // 2 is the smallest reasonable value for merge-path group sorter since it loads 2 values at least
+    // TODO: reconsider the values if other algorithms are used
+    const std::uint16_t __data_per_workitem = __desired_data_per_workitem <= 2 ?
+        2 : std::min<std::uint32_t>(oneapi::dpl::__internal::__dpl_bit_floor(__desired_data_per_workitem), 8);
+
    // Pessimistically double the memory requirement to take into account memory used by compiled kernel.
    // TODO: investigate if the adjustment can be less conservative
    const std::size_t __max_slm_items =
        __device.template get_info<sycl::info::device::local_mem_size>() / (sizeof(_Tp) * 2);
-    auto __caller = __parallel_sort_submitter_caller<_IndexT>();
-    if (__max_slm_items >= _Leaf8::storage_size(__wg_size) && __desired_data_per_workitem >= 8)
-    {
-        _Leaf8 __leaf_sorter(__rng, __comp, __wg_size);
-        return __caller(std::forward<_ExecutionPolicy>(__exec), __rng, __comp, __leaf_sorter);
-    }
-    else if (__max_slm_items >= _Leaf4::storage_size(__wg_size) && __desired_data_per_workitem >= 4)
-    {
-        _Leaf4 __leaf_sorter(__rng, __comp, __wg_size);
-        return __caller(std::forward<_ExecutionPolicy>(__exec), __rng, __comp, __leaf_sorter);
-    }
-    else if (__max_slm_items >= _Leaf2::storage_size(__wg_size) && __desired_data_per_workitem >= 2)
-    {
-        _Leaf2 __leaf_sorter(__rng, __comp, __wg_size);
-        return __caller(std::forward<_ExecutionPolicy>(__exec), __rng, __comp, __leaf_sorter);
-    }
-    else
-    {
-        std::size_t __slm_max_wg_size = __max_slm_items / _Leaf2::storage_size(1);
-        // __n is taken as is because of the bit floor and processing 2 items per work-item
-        // hence the processed size always fits a single work-group if __n is chosen
-        __wg_size = std::min<std::size_t>({__wg_size, __slm_max_wg_size, __n});
-        __wg_size = oneapi::dpl::__internal::__dpl_bit_floor(__wg_size);
-        _Leaf2 __leaf_sorter(__rng, __comp, __wg_size);
-        return __caller(std::forward<_ExecutionPolicy>(__exec), __rng, __comp, __leaf_sorter);
-    }
-
+    const std::size_t __max_slm_wg_size = __max_slm_items / _Leaf::storage_size(__data_per_workitem, 1);
+    // __n is taken as is because of the bit floor and processing at least 2 items per work-item
+    // hence the processed size always fits a single work-group if __n is chosen
+    std::size_t __wg_size = std::min<std::size_t>({__max_hw_wg_size, __max_slm_wg_size, __n});
+    __wg_size = oneapi::dpl::__internal::__dpl_bit_floor(__wg_size);
+
+    _Leaf __leaf(__rng, __comp, __data_per_workitem, __wg_size);
+    return __parallel_sort_submitter<_IndexT, _LeafSortKernel, _GlobalSortKernel, _CopyBackKernel>()(
+        std::forward<_ExecutionPolicy>(__exec), std::forward<_Range>(__rng), __comp, __leaf);
 };
 
 template <typename _ExecutionPolicy, typename _Range, typename _Compare>
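Note on the shape of the change: __data_per_workitem stops being a template parameter of __leaf_sorter and of the kernel name types, so the three per-value kernel instantiations (_Leaf8/_Leaf4/_Leaf2) collapse into one. The stripped-down sketch below illustrates that pattern outside oneDPL; every name in it is hypothetical.

// Hypothetical, stripped-down illustration of the refactoring pattern (not
// oneDPL code): a tuning knob moves from a template parameter to a runtime
// constructor argument, so one instantiation covers every value of the knob.
#include <cstdint>
#include <iostream>

// Before: every DataPerWorkitem value is a distinct type, and any kernel
// named after it must be instantiated and compiled once per value.
template <std::uint16_t DataPerWorkitem>
struct LeafSorterCompileTime
{
    static constexpr std::uint16_t data_per_workitem = DataPerWorkitem;
};

// After: a single type; the kernel reads the knob at run time.
struct LeafSorterRuntime
{
    explicit LeafSorterRuntime(std::uint16_t dpw) : data_per_workitem(dpw) {}
    std::uint16_t data_per_workitem;
};

int main()
{
    LeafSorterCompileTime<8> ct; // fixed per instantiation: 2, 4 and 8 each need one
    LeafSorterRuntime rt(8);     // chosen at run time by the selection heuristic
    std::cout << ct.data_per_workitem << ' ' << rt.data_per_workitem << '\n';
}

The trade-off is that loops bounded by __data_per_workitem no longer have a compile-time trip count, which is presumably why the _ONEDPL_PRAGMA_UNROLL hints on the load and store loops were dropped.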
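The selection logic in __submit_selecting_leaf can be checked with plain arithmetic. Below is a minimal standalone sketch under assumed device numbers: max_cu, max_hw_wg_size, and local_mem_size are made-up stand-ins for the corresponding sycl::info::device queries (max_compute_units, max_work_group_size, local_mem_size), and C++20 std::bit_floor plays the role of __dpl_bit_floor.

// A minimal standalone sketch of the tuning heuristic, assuming made-up device
// numbers; real code obtains them from sycl::info::device queries and uses
// oneapi::dpl::__internal::__dpl_bit_floor instead of std::bit_floor (C++20).
#include <algorithm>
#include <bit>
#include <cstddef>
#include <cstdint>
#include <iostream>

int main()
{
    const std::uint32_t max_cu = 32;          // hypothetical compute-unit count
    const std::size_t max_hw_wg_size = 512;   // hypothetical max work-group size
    const std::size_t local_mem_size = 65536; // hypothetical SLM capacity, bytes
    const std::size_t sizeof_value = 4;       // element size, e.g. std::int32_t
    const std::size_t n = 1 << 20;            // input size

    // Work per item needed to keep every compute unit busy.
    const std::uint32_t saturation_point = max_cu * max_hw_wg_size;
    const std::uint32_t desired_dpw = n / saturation_point;

    // Clamp to [2, 8] as in the diff: 2 is the minimum the merge-path group
    // sorter can load, 8 the practical cap for the bubble sub-group sorter.
    const std::uint16_t data_per_workitem =
        desired_dpw <= 2 ? 2 : std::min<std::uint32_t>(std::bit_floor(desired_dpw), 8);

    // Halve the SLM budget to leave room for memory used by the compiled kernel,
    // then bound the work-group size by storage_size(dpw, 1) = 2 * dpw items.
    const std::size_t max_slm_items = local_mem_size / (sizeof_value * 2);
    const std::size_t max_slm_wg_size = max_slm_items / (2 * data_per_workitem);

    // Final work-group size: a power of 2 no larger than hardware, SLM, or n allow.
    std::size_t wg_size = std::min<std::size_t>({max_hw_wg_size, max_slm_wg_size, n});
    wg_size = std::bit_floor(wg_size);

    std::cout << "data_per_workitem = " << data_per_workitem
              << ", wg_size = " << wg_size << '\n'; // prints 8 and 512
}

With these assumed numbers the heuristic picks 8 items per work-item and a work-group of 512, so each leaf sorts 8 * 512 = 4096 elements, and the double-buffered SLM footprint of 2 * 8 * 512 = 8192 items exactly fits the halved 8192-item budget.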
