Skip to content

Commit

Permalink
Make sycl feature selection logic more scalable
Browse files Browse the repository at this point in the history
  • Loading branch information
dmitriy-sobolev committed Dec 9, 2024
1 parent 24571ce commit f660bfe
Show file tree
Hide file tree
Showing 10 changed files with 147 additions and 128 deletions.
16 changes: 8 additions & 8 deletions include/oneapi/dpl/internal/reduce_by_segment_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -281,12 +281,12 @@ __sycl_reduce_by_segment(__internal::__hetero_tag<_BackendTag>, _ExecutionPolicy
auto __seg_end_identification = __exec.queue().submit([&](sycl::handler& __cgh) {
oneapi::dpl::__ranges::__require_access(__cgh, __keys);
auto __seg_ends_acc = __seg_ends.template get_access<sycl::access_mode::write>(__cgh);
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT
__cgh.use_kernel_bundle(__seg_reduce_count_kernel.get_kernel_bundle());
#endif
__cgh.parallel_for<_SegReduceCountKernel>(
sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT
__seg_reduce_count_kernel,
#endif
sycl::nd_item<1> __item) {
Expand Down Expand Up @@ -319,11 +319,11 @@ __sycl_reduce_by_segment(__internal::__hetero_tag<_BackendTag>, _ExecutionPolicy
__cgh.depends_on(__seg_end_identification);
auto __seg_ends_acc = __seg_ends.template get_access<sycl::access_mode::read>(__cgh);
auto __seg_ends_scan_acc = __seg_ends_scanned.template get_access<sycl::access_mode::read_write>(__cgh);
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT
__cgh.use_kernel_bundle(__seg_reduce_offset_kernel.get_kernel_bundle());
#endif
__cgh.parallel_for<_SegReduceOffsetKernel>(
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT
__seg_reduce_offset_kernel,
#endif
sycl::nd_range<1>{__wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) {
Expand All @@ -342,11 +342,11 @@ __sycl_reduce_by_segment(__internal::__hetero_tag<_BackendTag>, _ExecutionPolicy
auto __partials_acc = __partials.template get_access<sycl::access_mode::read_write>(__cgh);
auto __seg_ends_scan_acc = __seg_ends_scanned.template get_access<sycl::access_mode::read>(__cgh);
__dpl_sycl::__local_accessor<__val_type> __loc_acc(2 * __wgroup_size, __cgh);
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT
__cgh.use_kernel_bundle(__seg_reduce_wg_kernel.get_kernel_bundle());
#endif
__cgh.parallel_for<_SegReduceWgKernel>(
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT
__seg_reduce_wg_kernel,
#endif
sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) {
Expand Down Expand Up @@ -465,11 +465,11 @@ __sycl_reduce_by_segment(__internal::__hetero_tag<_BackendTag>, _ExecutionPolicy
__dpl_sycl::__local_accessor<__diff_type> __loc_seg_ends_acc(__wgroup_size, __cgh);

__cgh.depends_on(__wg_reduce);
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT
__cgh.use_kernel_bundle(__seg_reduce_prefix_kernel.get_kernel_bundle());
#endif
__cgh.parallel_for<_SegReducePrefixKernel>(
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT
__seg_reduce_prefix_kernel,
#endif
sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) {
Expand Down
8 changes: 4 additions & 4 deletions include/oneapi/dpl/internal/scan_by_segment_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -164,11 +164,11 @@ struct __sycl_scan_by_segment_impl

__dpl_sycl::__local_accessor<__val_type> __loc_acc(2 * __wgroup_size, __cgh);

#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT
__cgh.use_kernel_bundle(__seg_scan_wg_kernel.get_kernel_bundle());
#endif
__cgh.parallel_for<_SegScanWgKernel>(
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT
__seg_scan_wg_kernel,
#endif
sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) {
Expand Down Expand Up @@ -268,11 +268,11 @@ struct __sycl_scan_by_segment_impl
__dpl_sycl::__local_accessor<__val_type> __loc_partials_acc(__wgroup_size, __cgh);

__dpl_sycl::__local_accessor<bool> __loc_seg_ends_acc(__wgroup_size, __cgh);
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT
__cgh.use_kernel_bundle(__seg_scan_prefix_kernel.get_kernel_bundle());
#endif
__cgh.parallel_for<_SegScanPrefixKernel>(
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT
__seg_scan_prefix_kernel,
#endif
sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) {
Expand Down
8 changes: 4 additions & 4 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
Original file line number Diff line number Diff line change
Expand Up @@ -324,11 +324,11 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name
auto __temp_acc = __result_and_scratch.template __get_scratch_acc<sycl::access_mode::write>(
__cgh, __dpl_sycl::__no_init{});
__dpl_sycl::__local_accessor<_Type> __local_acc(__wgroup_size, __cgh);
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT
__cgh.use_kernel_bundle(__kernel_1.get_kernel_bundle());
#endif
__cgh.parallel_for<_LocalScanKernel>(
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT
__kernel_1,
#endif
sycl::nd_range<1>(__n_groups * __wgroup_size, __wgroup_size), [=](sycl::nd_item<1> __item) {
Expand All @@ -345,11 +345,11 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name
__cgh.depends_on(__submit_event);
auto __temp_acc = __result_and_scratch.template __get_scratch_acc<sycl::access_mode::read_write>(__cgh);
__dpl_sycl::__local_accessor<_Type> __local_acc(__wgroup_size, __cgh);
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT
__cgh.use_kernel_bundle(__kernel_2.get_kernel_bundle());
#endif
__cgh.parallel_for<_GroupScanKernel>(
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT
__kernel_2,
#endif
// TODO: try to balance work between several workgroups instead of one
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@

#include "sycl_traits.h" //SYCL traits specialization for some oneDPL types.

// Feature-test macros evaluated by the preprocessor conditionals further down in this file.
// Evaluates to true when _ONEDPL_LIBSYCL_VERSION is 0 or is at least 50700.
// NOTE(review): a zero _ONEDPL_LIBSYCL_VERSION presumably denotes a SYCL library other than libsycl - confirm.
#define _ONEDPL_SYCL2020_SUBGROUP_BARRIER_PRESENT (_ONEDPL_LIBSYCL_VERSION == 0 || _ONEDPL_LIBSYCL_VERSION >= 50700)
// Evaluates to true only when _ONEDPL_LIBSYCL_VERSION is at least 50700 (a zero version does not enable it).
#define _ONEDPL_LIBSYCL_SUB_GROUP_MASK_PRESENT (_ONEDPL_LIBSYCL_VERSION >= 50700)

#define _ONEDPL_RADIX_WORKLOAD_TUNING 1
//To achieve better performance, the number of segments and the work-group size are varied depending on the number of elements:
//1. 32K...512K - number of segments is increased up to 8 times
Expand Down Expand Up @@ -198,11 +201,11 @@ __radix_sort_count_submit(_ExecutionPolicy&& __exec, ::std::size_t __segments, :
oneapi::dpl::__ranges::__require_access(__hdl, __val_rng, __count_rng);
// an accessor per work-group with value counters from each work-item
auto __count_lacc = __dpl_sycl::__local_accessor<_CountT>(__wg_size * __radix_states, __hdl);
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT
__hdl.use_kernel_bundle(__kernel.get_kernel_bundle());
#endif
__hdl.parallel_for<_KernelName>(
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT
__kernel,
#endif
sycl::nd_range<1>(__segments * __wg_size, __wg_size), [=](sycl::nd_item<1> __self_item) {
Expand Down Expand Up @@ -299,11 +302,11 @@ __radix_sort_scan_submit(_ExecutionPolicy&& __exec, ::std::size_t __scan_wg_size
__hdl.depends_on(__dependency_event);
// access the counters for all work groups
oneapi::dpl::__ranges::__require_access(__hdl, __count_rng);
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT
__hdl.use_kernel_bundle(__kernel.get_kernel_bundle());
#endif
__hdl.parallel_for<_KernelName>(
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT
__kernel,
#endif
sycl::nd_range<1>(__radix_states * __scan_wg_size, __scan_wg_size), [=](sycl::nd_item<1> __self_item) {
Expand Down Expand Up @@ -346,8 +349,6 @@ enum class __peer_prefix_algo
template <std::uint32_t __radix_states, typename _OffsetT, __peer_prefix_algo _Algo>
struct __peer_prefix_helper;

#define _ONEDPL_SYCL2020_SUBGROUP_BARRIER_PRESENT (_ONEDPL_GENERIC_SYCL_LIBRARY || _ONEDPL_LIBSYCL_VERSION >= 50700)

#if _ONEDPL_SYCL2020_SUBGROUP_BARRIER_PRESENT
template <std::uint32_t __radix_states, typename _OffsetT>
struct __peer_prefix_helper<__radix_states, _OffsetT, __peer_prefix_algo::atomic_fetch_or>
Expand Down Expand Up @@ -546,11 +547,11 @@ __radix_sort_reorder_submit(_ExecutionPolicy&& __exec, ::std::size_t __segments,

typename _PeerHelper::_TempStorageT __peer_temp(1, __hdl);

#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT
__hdl.use_kernel_bundle(__kernel.get_kernel_bundle());
#endif
__hdl.parallel_for<_KernelName>(
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT
__kernel,
#endif
//Each SYCL work group processes one data segment.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -370,11 +370,11 @@ struct __parallel_transform_reduce_impl
oneapi::dpl::__ranges::__require_access(__cgh, __rngs...);
std::size_t __local_mem_size = __reduce_pattern.local_mem_req(__work_group_size);
__dpl_sycl::__local_accessor<_Tp> __temp_local(sycl::range<1>(__local_mem_size), __cgh);
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT
__cgh.use_kernel_bundle(__kernel.get_kernel_bundle());
#endif
__cgh.parallel_for<_ReduceKernel>(
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT
__kernel,
#endif
sycl::nd_range<1>(sycl::range<1>(__n_groups * __work_group_size),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ __supports_sub_group_size(const _ExecutionPolicy& __exec, std::size_t __target_s

// 20201214 value corresponds to Intel(R) oneAPI C++ Compiler Classic 2021.1.2 Patch release
#define _ONEDPL_SYCL2020_KERNEL_DEVICE_API_PRESENT \
(_ONEDPL_GENERIC_SYCL_LIBRARY || __SYCL_COMPILER_VERSION > 20201214 || _ONEDPL_LIBSYCL_VERSION >= 50700)
(__SYCL_COMPILER_VERSION > 20201214 || (_ONEDPL_LIBSYCL_VERSION != 0 && _ONEDPL_LIBSYCL_VERSION >= 50700))

template <typename _ExecutionPolicy>
::std::size_t
Expand Down Expand Up @@ -268,7 +268,7 @@ class __kernel_compiler
static_assert(__kernel_count > 0, "At least one kernel name should be provided");

public:
#if _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
#if !_ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT
template <typename _Exec>
static auto
__compile(_Exec&& __exec)
Expand Down Expand Up @@ -543,7 +543,7 @@ struct __result_and_scratch_storage
inline bool
__use_USM_host_allocations(sycl::queue __queue)
{
#if _ONEDPL_SYCL_UNIFIED_USM_BUFFER_PRESENT
#if !_ONEDPL_SYCL2020_UNIFIED_USM_BUFFER_ABSENT
auto __device = __queue.get_device();
if (!__device.is_gpu())
return false;
Expand All @@ -560,7 +560,7 @@ struct __result_and_scratch_storage
inline bool
__use_USM_allocations(sycl::queue __queue)
{
#if _ONEDPL_SYCL_UNIFIED_USM_BUFFER_PRESENT
#if !_ONEDPL_SYCL2020_UNIFIED_USM_BUFFER_ABSENT
return __queue.get_device().has(sycl::aspect::usm_device_allocations);
#else
return false;
Expand Down Expand Up @@ -613,7 +613,7 @@ struct __result_and_scratch_storage
static auto
__get_usm_or_buffer_accessor_ptr(const _Acc& __acc, std::size_t __scratch_n = 0)
{
#if _ONEDPL_SYCL_UNIFIED_USM_BUFFER_PRESENT
#if !_ONEDPL_SYCL2020_UNIFIED_USM_BUFFER_ABSENT
return __acc.__get_pointer();
#else
return &__acc[__scratch_n];
Expand All @@ -624,7 +624,7 @@ struct __result_and_scratch_storage
auto
__get_result_acc(sycl::handler& __cgh, const sycl::property_list& __prop_list = {}) const
{
#if _ONEDPL_SYCL_UNIFIED_USM_BUFFER_PRESENT
#if !_ONEDPL_SYCL2020_UNIFIED_USM_BUFFER_ABSENT
if (__use_USM_host && __supports_USM_device)
return __usm_or_buffer_accessor<__accessor_t<_AccessMode>>(__cgh, __result_buf.get(), __prop_list);
else if (__supports_USM_device)
Expand All @@ -640,7 +640,7 @@ struct __result_and_scratch_storage
auto
__get_scratch_acc(sycl::handler& __cgh, const sycl::property_list& __prop_list = {}) const
{
#if _ONEDPL_SYCL_UNIFIED_USM_BUFFER_PRESENT
#if !_ONEDPL_SYCL2020_UNIFIED_USM_BUFFER_ABSENT
if (__use_USM_host || __supports_USM_device)
return __usm_or_buffer_accessor<__accessor_t<_AccessMode>>(__cgh, __scratch_buf.get(), __prop_list);
return __usm_or_buffer_accessor<__accessor_t<_AccessMode>>(__cgh, __sycl_buf.get(), __prop_list);
Expand Down
Loading

0 comments on commit f660bfe

Please sign in to comment.