diff --git a/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_submitters.h b/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_submitters.h
index 4d7b81e6a2e..4fc274f2445 100644
--- a/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_submitters.h
+++ b/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_submitters.h
@@ -27,7 +27,7 @@ namespace oneapi::dpl::experimental::kt::gpu::esimd::__impl
 {
 //------------------------------------------------------------------------
-// Please see the comment for __parallel_for_submitter for optional kernel name explanation
+// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation
 //------------------------------------------------------------------------
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
 template <typename... _Name>
 class __scan_copy_single_wg_kernel;
 
+template <typename... _Name>
+class __parallel_for_small_kernel;
+
+template <typename... _Name>
+class __parallel_for_large_kernel;
+
 //------------------------------------------------------------------------
 // parallel_for - async pattern
 //------------------------------------------------------------------------
@@ -222,10 +228,10 @@ class __scan_copy_single_wg_kernel;
 // as the parameter pack that can be empty (for unnamed kernels) or contain exactly one
 // type (for explicitly specified name by the user)
 template <typename _KernelName>
-struct __parallel_for_submitter;
+struct __parallel_for_small_submitter;
 
 template <typename... _Name>
-struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>>
+struct __parallel_for_small_submitter<__internal::__optional_kernel_name<_Name...>>
 {
     template <typename _ExecutionPolicy, typename _Fp, typename _Index, typename... _Ranges>
     auto
@@ -246,6 +252,91 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>>
     }
 };
 
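The large submitter added just below sizes its per-item work from the smallest scalar type among the flattened range value types, targeting __bytes_per_work_item (16) bytes of each range per work item. A standalone sketch of that arithmetic, with a stand-in for oneapi::dpl::__internal::__dpl_ceiling_div (everything in this snippet is illustrative, not part of the patch):

#include <cstddef>
#include <cstdint>

// Stand-in for oneapi::dpl::__internal::__dpl_ceiling_div.
constexpr std::size_t ceiling_div(std::size_t n, std::size_t d) { return (n + d - 1) / d; }

constexpr std::size_t bytes_per_work_item = 16;

// 8-byte elements (double): 2 iterations; 4-byte (std::int32_t): 4; 1-byte (char): 16.
static_assert(ceiling_div(bytes_per_work_item, sizeof(double)) == 2);
static_assert(ceiling_div(bytes_per_work_item, sizeof(std::int32_t)) == 4);
static_assert(ceiling_div(bytes_per_work_item, sizeof(char)) == 16);
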
+template <typename _KernelName, typename... _Ranges>
+struct __parallel_for_large_submitter;
+
+template <typename... _Name, typename... _Ranges>
+struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name...>, _Ranges...>
+{
+    // Flatten the ranges' value types into a single std::tuple, as the value types are likely coming from
+    // separate ranges in a zip iterator.
+    using _FlattenedRangesTuple = typename oneapi::dpl::__internal::__flatten_std_or_internal_tuple<
+        std::tuple<oneapi::dpl::__internal::__value_t<_Ranges>...>>::type;
+    static constexpr std::size_t __min_type_size =
+        oneapi::dpl::__internal::__min_tuple_type_size_v<_FlattenedRangesTuple>;
+    // __iters_per_work_item is set to 1, 2, 4, 8, or 16 depending on the smallest type in the
+    // flattened ranges. This allows us to launch enough work per item to saturate the device's memory
+    // bandwidth. This heuristic errs on the side of launching more work per item than what is needed to
+    // achieve full bandwidth utilization. 16 bytes per range per work item has been found to be a good
+    // value across the different for-based algorithms.
+    static constexpr std::uint8_t __bytes_per_work_item = 16;
+    static constexpr std::uint8_t __iters_per_work_item =
+        oneapi::dpl::__internal::__dpl_ceiling_div(__bytes_per_work_item, __min_type_size);
+    // Limit the work-group size to 512, which has empirically yielded the best results across different architectures.
+    static constexpr std::uint16_t __max_work_group_size = 512;
+
+    // Once there is enough work to launch a work-group on each compute unit with our chosen __iters_per_work_item,
+    // we should start using this code path.
+    template <typename _ExecutionPolicy>
+    static std::size_t
+    __estimate_best_start_size(const _ExecutionPolicy& __exec)
+    {
+        const std::size_t __work_group_size =
+            oneapi::dpl::__internal::__max_work_group_size(__exec, __max_work_group_size);
+        const std::uint32_t __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec);
+        return __work_group_size * __iters_per_work_item * __max_cu;
+    }
+
+    template <typename _ExecutionPolicy, typename _Fp, typename _Index>
+    auto
+    operator()(_ExecutionPolicy&& __exec, _Fp __brick, _Index __count, _Ranges&&... __rngs) const
+    {
+        assert(oneapi::dpl::__ranges::__get_first_range_size(__rngs...) > 0);
+        _PRINT_INFO_IN_DEBUG_MODE(__exec);
+        auto __event = __exec.queue().submit([&__rngs..., &__brick, &__exec, __count](sycl::handler& __cgh) {
+            // Get access to the data in the SYCL buffers:
+            oneapi::dpl::__ranges::__require_access(__cgh, __rngs...);
+            const std::size_t __work_group_size =
+                oneapi::dpl::__internal::__max_work_group_size(__exec, __max_work_group_size);
+            const std::size_t __num_groups =
+                oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * __iters_per_work_item));
+            const std::size_t __num_items = __num_groups * __work_group_size;
+            __cgh.parallel_for<_Name...>(
+                sycl::nd_range(sycl::range<1>(__num_items), sycl::range<1>(__work_group_size)),
+                [=](sycl::nd_item<1> __item) {
+                    // TODO: Investigate adding a vectorized path similar to reduce.
+                    // Initial investigation showed benefits for in-place for-based algorithms (e.g. std::for_each) but
+                    // performance regressions for out-of-place (e.g. std::copy) where the compiler was unable to
+                    // vectorize our code. Vectorization may also improve performance of for-algorithms over small data
+                    // types.
+                    auto [__idx, __stride, __is_full] =
+                        __stride_recommender(__item, __count, __iters_per_work_item, __work_group_size);
+                    if (__is_full)
+                    {
+                        _ONEDPL_PRAGMA_UNROLL
+                        for (std::uint8_t __i = 0; __i < __iters_per_work_item; ++__i)
+                        {
+                            __brick(__idx, __rngs...);
+                            __idx += __stride;
+                        }
+                    }
+                    // If the group is not full, take this branch only if there is work to process.
+                    else if (__idx < __count)
+                    {
+                        const std::uint8_t __adjusted_iters_per_work_item =
+                            oneapi::dpl::__internal::__dpl_ceiling_div(__count - __idx, __stride);
+                        for (std::uint8_t __i = 0; __i < __adjusted_iters_per_work_item; ++__i)
+                        {
+                            __brick(__idx, __rngs...);
+                            __idx += __stride;
+                        }
+                    }
+                });
+        });
+        return __future(__event);
+    }
+};
+
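For the dispatch that follows, __estimate_best_start_size gives the element count at which every compute unit can be handed a full work-group at the chosen __iters_per_work_item. With hypothetical device numbers (a 512-item work-group, 64 compute units, 4-byte elements and therefore 4 iterations per item; the real values come from device queries at run time), the crossover works out as:

#include <cstddef>

// All values below are assumed for illustration only.
constexpr std::size_t work_group_size = 512;   // capped by __max_work_group_size
constexpr std::size_t iters_per_work_item = 4; // 16 bytes / 4-byte elements
constexpr std::size_t max_compute_units = 64;

// Inputs below ~131K elements keep using the small submitter on such a device.
static_assert(work_group_size * iters_per_work_item * max_compute_units == 131072);
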
 //General version of parallel_for, one additional parameter - __count of iterations of loop __cgh.parallel_for,
 //for some algorithms happens that size of processing range is n, but amount of iterations is n/2.
 template <typename _ExecutionPolicy, typename _Fp, typename _Index, typename... _Ranges>
@@ -254,17 +345,32 @@ __parallel_for(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Fp __brick, _Index __count,
                _Ranges&&... __rngs)
 {
     using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>;
-    using _ForKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<_CustomName>;
-
-    return __parallel_for_submitter<_ForKernel>()(::std::forward<_ExecutionPolicy>(__exec), __brick, __count,
-                                                  ::std::forward<_Ranges>(__rngs)...);
+    using _ForKernelSmall =
+        oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__parallel_for_small_kernel<_CustomName>>;
+    using _ForKernelLarge =
+        oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__parallel_for_large_kernel<_CustomName>>;
+
+    using __small_submitter = __parallel_for_small_submitter<_ForKernelSmall>;
+    using __large_submitter = __parallel_for_large_submitter<_ForKernelLarge, _Ranges...>;
+    // Compile two kernels: one for small-to-medium inputs and a second for large. This avoids runtime checks within a
+    // single kernel that worsen performance for small cases.
+    if (__count < __large_submitter::__estimate_best_start_size(__exec))
+    {
+        return __small_submitter()(std::forward<_ExecutionPolicy>(__exec), __brick, __count,
+                                   std::forward<_Ranges>(__rngs)...);
+    }
+    else
+    {
+        return __large_submitter()(std::forward<_ExecutionPolicy>(__exec), __brick, __count,
+                                   std::forward<_Ranges>(__rngs)...);
+    }
 }
 
 //------------------------------------------------------------------------
 // parallel_transform_scan - async pattern
 //------------------------------------------------------------------------
 
-// Please see the comment for __parallel_for_submitter for optional kernel name explanation
+// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation
 template <typename _CustomName, typename _PropagateScanName>
 struct __parallel_scan_submitter;
 
@@ -1796,7 +1902,7 @@ struct __partial_merge_kernel
     }
 };
 
-// Please see the comment for __parallel_for_submitter for optional kernel name explanation
+// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation
 template
 struct __parallel_partial_sort_submitter;
 
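Nothing changes at the API surface: any for-based algorithm over a device policy picks its submitter through this dispatch. A minimal host-side sketch (queue, type, and size are arbitrary; this is not test code from the patch) — a 2^20-element oneapi::dpl::for_each would land in the large submitter on a device whose estimated crossover is below that, while a few thousand elements would stay on the small one:

#include <oneapi/dpl/execution>
#include <oneapi/dpl/algorithm>
#include <oneapi/dpl/iterator>
#include <sycl/sycl.hpp>
#include <vector>

int main()
{
    sycl::queue q;
    std::vector<int> v(1 << 20, 1);
    {
        sycl::buffer<int> buf(v.data(), v.size());
        // for_each is a for-based algorithm: it routes through __parallel_for.
        oneapi::dpl::for_each(oneapi::dpl::execution::make_device_policy(q),
                              oneapi::dpl::begin(buf), oneapi::dpl::end(buf),
                              [](int& x) { x += 1; });
    }
}
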
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h
index 7baee78b1b1..3be82fdc623 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h
@@ -48,7 +48,7 @@ namespace __par_backend_hetero
 //General version of parallel_for, one additional parameter - __count of iterations of loop __cgh.parallel_for,
 //for some algorithms happens that size of processing range is n, but amount of iterations is n/2.
-// Please see the comment for __parallel_for_submitter for optional kernel name explanation
+// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation
 template
 struct __parallel_for_fpga_submitter;
 
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h
index 9e63f6e1236..70b8a81e275 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h
@@ -140,7 +140,7 @@ __serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, _Index1
     }
 }
 
-// Please see the comment for __parallel_for_submitter for optional kernel name explanation
+// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation
 template
 struct __parallel_merge_submitter;
 
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h
index 19a4f25b889..2950d5f9800 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h
@@ -202,7 +202,7 @@ struct __leaf_sorter
     _GroupSorter __group_sorter;
 };
 
-// Please see the comment for __parallel_for_submitter for optional kernel name explanation
+// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation
 template
 struct __parallel_sort_submitter;
 
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h
index ca776e94dce..9f2e2ef824b 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h
@@ -111,7 +111,7 @@ __device_reduce_kernel(const _NDItemId __item_id, const _Size __n, const _Size _
 //------------------------------------------------------------------------
 // parallel_transform_reduce - async patterns
-// Please see the comment for __parallel_for_submitter for optional kernel name explanation
+// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation
 //------------------------------------------------------------------------
 
 // Parallel_transform_reduce for a small arrays using a single work group.
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
index 9bd195a80a9..2f84b560472 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
@@ -823,6 +823,39 @@ class __static_monotonic_dispatcher<::std::integer_sequence<::std::uint16_t, _X,
     }
 };
 
+// Utility to recommend a stride for the best-performing memory access pattern, based on empirical testing on
+// different devices. This utility can only be called from the device.
+//
+// SPIR-V compilation targets show the best performance with a stride of the sub-group size.
+// Other compilation targets perform best with a work-group size stride.
+inline std::tuple<std::size_t, std::size_t, bool>
+__stride_recommender(const sycl::nd_item<1>& __item, std::size_t __count, std::size_t __iters_per_work_item,
+                     std::size_t __work_group_size)
+{
+    if constexpr (oneapi::dpl::__internal::__is_spirv_target_v)
+    {
+        const __dpl_sycl::__sub_group __sub_group = __item.get_sub_group();
+        const std::uint32_t __sub_group_size = __sub_group.get_local_linear_range();
+        const std::uint32_t __sub_group_id = __sub_group.get_group_linear_id();
+        const std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id();
+        const std::size_t __work_group_id = __item.get_group().get_group_linear_id();
+
+        const std::size_t __sub_group_start_idx =
+            __iters_per_work_item * (__work_group_id * __work_group_size + __sub_group_size * __sub_group_id);
+        const bool __is_full_sub_group = __sub_group_start_idx + __iters_per_work_item * __sub_group_size <= __count;
+        const std::size_t __work_item_idx = __sub_group_start_idx + __sub_group_local_id;
+        return std::make_tuple(__work_item_idx, __sub_group_size, __is_full_sub_group);
+    }
+    else
+    {
+        const std::size_t __work_group_start_idx =
+            __item.get_group().get_group_linear_id() * __work_group_size * __iters_per_work_item;
+        const std::size_t __work_item_idx = __work_group_start_idx + __item.get_local_linear_id();
+        const bool __is_full_work_group = __work_group_start_idx + __iters_per_work_item * __work_group_size <= __count;
+        return std::make_tuple(__work_item_idx, __work_group_size, __is_full_work_group);
+    }
+}
+
 } // namespace __par_backend_hetero
 } // namespace dpl
 } // namespace oneapi
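On SPIR-V targets the recommended stride is the sub-group size, so consecutive work items touch consecutive addresses on every iteration (coalesced loads). A host-side emulation of the SPIR-V branch with assumed launch parameters (4 iterations per item, sub-group size 32, work-group size 512 — all hypothetical) shows the pattern for the first sub-group:

#include <cstddef>
#include <iostream>

int main()
{
    const std::size_t iters = 4, sg_size = 32, wg_id = 0, wg_size = 512, sg_id = 0;
    const std::size_t sg_start = iters * (wg_id * wg_size + sg_size * sg_id);
    for (std::size_t lane = 0; lane < 4; ++lane) // first four lanes only
    {
        std::size_t idx = sg_start + lane;
        for (std::size_t i = 0; i < iters; ++i, idx += sg_size)
            std::cout << idx << ' '; // lane 0: 0 32 64 96; lane 1: 1 33 65 97; ...
        std::cout << '\n';
    }
}

The non-SPIR-V branch is the same computation with the whole work-group as the unit: the stride is __work_group_size and the base index is the work-group id times __work_group_size * __iters_per_work_item.
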
diff --git a/include/oneapi/dpl/pstl/tuple_impl.h b/include/oneapi/dpl/pstl/tuple_impl.h
index 239734d4861..c758a4a3f1b 100644
--- a/include/oneapi/dpl/pstl/tuple_impl.h
+++ b/include/oneapi/dpl/pstl/tuple_impl.h
@@ -793,6 +793,25 @@ struct __decay_with_tuple_specialization<::std::tuple<_Args...>>
 template <typename... _Args>
 using __decay_with_tuple_specialization_t = typename __decay_with_tuple_specialization<_Args...>::type;
 
+// Flatten nested std::tuple or oneapi::dpl::__internal::tuple types into a single std::tuple.
+template <typename _T>
+struct __flatten_std_or_internal_tuple
+{
+    using type = std::tuple<_T>;
+};
+
+template <typename... _Ts>
+struct __flatten_std_or_internal_tuple<std::tuple<_Ts...>>
+{
+    using type = decltype(std::tuple_cat(std::declval<typename __flatten_std_or_internal_tuple<_Ts>::type>()...));
+};
+
+template <typename... _Ts>
+struct __flatten_std_or_internal_tuple<oneapi::dpl::__internal::tuple<_Ts...>>
+{
+    using type = decltype(std::tuple_cat(std::declval<typename __flatten_std_or_internal_tuple<_Ts>::type>()...));
+};
+
 } // namespace __internal
 } // namespace dpl
 } // namespace oneapi
diff --git a/include/oneapi/dpl/pstl/utils.h b/include/oneapi/dpl/pstl/utils.h
index df2b7416e58..546fbd997f9 100644
--- a/include/oneapi/dpl/pstl/utils.h
+++ b/include/oneapi/dpl/pstl/utils.h
@@ -783,6 +783,29 @@ union __lazy_ctor_storage
     }
 };
 
+// Utility that returns the smallest type size in a tuple.
+template <typename _Tuple>
+class __min_tuple_type_size;
+
+template <typename _T>
+class __min_tuple_type_size<std::tuple<_T>>
+{
+  public:
+    static constexpr std::size_t value = sizeof(_T);
+};
+
+template <typename _T, typename... _Ts>
+class __min_tuple_type_size<std::tuple<_T, _Ts...>>
+{
+    static constexpr std::size_t __min_type_value_ts = __min_tuple_type_size<std::tuple<_Ts...>>::value;
+
+  public:
+    static constexpr std::size_t value = std::min(sizeof(_T), __min_type_value_ts);
+};
+
+template <typename _Tuple>
+inline constexpr std::size_t __min_tuple_type_size_v = __min_tuple_type_size<_Tuple>::value;
+
 } // namespace __internal
 } // namespace dpl
 } // namespace oneapi
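Taken together, the two utilities reduce a zip iterator's nested value type to its smallest scalar, which then drives __iters_per_work_item. A compile-time sketch with arbitrarily chosen types; it assumes the utilities above are visible in the translation unit (e.g. via <oneapi/dpl/algorithm>, which pulls in tuple_impl.h and utils.h — an assumption, since these are internal headers):

#include <oneapi/dpl/algorithm> // assumed to expose the internal utilities above
#include <cstdint>
#include <tuple>
#include <type_traits>

using _Flat = oneapi::dpl::__internal::__flatten_std_or_internal_tuple<
    std::tuple<std::uint8_t, std::tuple<float, double>>>::type;
static_assert(std::is_same_v<_Flat, std::tuple<std::uint8_t, float, double>>);

// The smallest scalar (std::uint8_t) gives __iters_per_work_item = ceil(16 / 1) = 16.
static_assert(oneapi::dpl::__internal::__min_tuple_type_size_v<_Flat> == 1);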