Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tune the amount of groups in __parallel_find_or pattern #1723

Merged
merged 40 commits into from
Aug 12, 2024
Merged
Show file tree
Hide file tree
Changes from 23 commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
b35f42a
Implementation of __parallel_find_or_nd_range_tuner
SergeyKopienko Aug 6, 2024
387dedd
Using _GroupsTuner in __parallel_find_or
SergeyKopienko Aug 6, 2024
f70dca7
include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h - __pattern_ad…
SergeyKopienko Aug 6, 2024
245d9e5
include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h - __pattern_an…
SergeyKopienko Aug 6, 2024
ecda02b
include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h - __pattern_eq…
SergeyKopienko Aug 6, 2024
deef531
include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h - __para…
SergeyKopienko Aug 6, 2024
37b7d08
include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h - __para…
SergeyKopienko Aug 6, 2024
b675f23
include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h - __pat…
SergeyKopienko Aug 6, 2024
4a1282c
include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h - __pat…
SergeyKopienko Aug 6, 2024
c2ab99e
include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h - __pat…
SergeyKopienko Aug 6, 2024
00e2a7b
include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h - __pat…
SergeyKopienko Aug 6, 2024
6ee3176
include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h - __pat…
SergeyKopienko Aug 6, 2024
9222e4d
include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h - __pat…
SergeyKopienko Aug 6, 2024
c29ac42
include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h - __pat…
SergeyKopienko Aug 6, 2024
8a16fad
include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h - …
SergeyKopienko Aug 8, 2024
7d0f39e
Fix review comment
SergeyKopienko Aug 8, 2024
7f8453a
Restore usage of the common tuner in all algorithms
SergeyKopienko Aug 8, 2024
dcc066a
Tune from 65'536 size and more
SergeyKopienko Aug 8, 2024
5128952
Remove tuner from params
SergeyKopienko Aug 8, 2024
369b979
Move _PRINT_INFO_IN_DEBUG_MODE into __parallel_find_or from __paralle…
SergeyKopienko Aug 8, 2024
2d20b75
Remove __min_tune_rng_n
SergeyKopienko Aug 8, 2024
b11f794
Apply GitHub clang format
SergeyKopienko Aug 8, 2024
e8f59e8
Fix performance degradation for 64K of source data: tune only when __…
SergeyKopienko Aug 8, 2024
9dd761c
Apply GitHub clang format
SergeyKopienko Aug 9, 2024
67e59d4
Update include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
SergeyKopienko Aug 9, 2024
f2332af
Using structure binding in __parallel_find_or_nd_range_tuner<oneapi::…
SergeyKopienko Aug 9, 2024
5eb4720
Fix review comment: remove __parallel_find_or_nd_range_tuner_common
SergeyKopienko Aug 9, 2024
001a75f
Update include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
SergeyKopienko Aug 9, 2024
58bb632
Remove alias and extra closing bracket
SergeyKopienko Aug 9, 2024
d3ba89b
Fix comment
SergeyKopienko Aug 9, 2024
dbb77e4
Fix review comment: remove __base_rng_n
SergeyKopienko Aug 9, 2024
193983b
__k = std::pow(2, std::ceil(std::log2(__k)));Fix review comment: remo…
SergeyKopienko Aug 9, 2024
b2af0e8
Fix review comments: __current_iters_per_work_item is not required
SergeyKopienko Aug 9, 2024
4922f46
Fix review comments: __current_iters_per_work_item is not required
SergeyKopienko Aug 9, 2024
0f9a60d
Fix review comment
SergeyKopienko Aug 9, 2024
258f043
Fix review comment
SergeyKopienko Aug 9, 2024
adff34e
Fix review comment: // TODO : need to re-evaluate this formula.
SergeyKopienko Aug 12, 2024
e48be00
Fix review comment: use float type instead auto
SergeyKopienko Aug 12, 2024
99fed47
Add const to std::size_t __k
SergeyKopienko Aug 12, 2024
65ce105
Update include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
SergeyKopienko Aug 12, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 85 additions & 16 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
Original file line number Diff line number Diff line change
Expand Up @@ -1134,6 +1134,85 @@ struct __early_exit_find_or
// parallel_find_or - sync pattern
//------------------------------------------------------------------------

template <typename Tag>
struct __parallel_find_or_nd_range_tuner
{
    // Tune the amount of work-groups and work-group size for __parallel_find_or.
    //
    // Parameters:
    //   __exec  - execution policy; used only to query device limits.
    //   __rng_n - size of the (first) input range; must be > 0.
    // Returns a tuple {__n_groups, __wgroup_size}.
    template <typename _ExecutionPolicy>
    std::tuple<std::size_t, std::size_t>
    operator()(const _ExecutionPolicy& __exec, const std::size_t __rng_n) const
    {
        // TODO: find a way to generalize getting of reliable work-group size
        // Limit the work-group size to prevent large sizes on CPUs. Empirically found value.
        // This value exceeds the current practical limit for GPUs, but may need to be re-evaluated in the future.
        const std::size_t __wgroup_size =
            oneapi::dpl::__internal::__max_work_group_size(__exec, std::size_t{4096});

        // Never launch more work-groups than the data requires, nor more than
        // the device has compute units.
        const auto __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec);
        std::size_t __n_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__rng_n, __wgroup_size);
        __n_groups = std::min(__n_groups, static_cast<std::size_t>(__max_cu));

        // Pass all small data into single WG implementation: a single work-group
        // can cover up to __max_iters_per_work_item elements per work-item.
        constexpr std::size_t __max_iters_per_work_item = 32;
        if (__rng_n <= __wgroup_size * __max_iters_per_work_item)
        {
            __n_groups = 1;
        }

        return {__n_groups, __wgroup_size};
    }
};

// No tuning for FPGA_EMU because we are not going to tune here the performance for FPGA emulation.
#if !_ONEDPL_FPGA_EMU
template <>
struct __parallel_find_or_nd_range_tuner<oneapi::dpl::__internal::__device_backend_tag>
{
static constexpr std::size_t __base_rng_n = 4096;
SergeyKopienko marked this conversation as resolved.
Show resolved Hide resolved

// Tune the amount of work-groups and work-group size
template <typename _ExecutionPolicy>
std::tuple<std::size_t, std::size_t>
operator()(const _ExecutionPolicy& __exec, const std::size_t __rng_n) const
{
// Define common tuner type
using __parallel_find_or_nd_range_tuner_common = __parallel_find_or_nd_range_tuner<int>;

akukanov marked this conversation as resolved.
Show resolved Hide resolved
// Call common tuning function to get the work-group size
auto __nd_range_params = __parallel_find_or_nd_range_tuner_common{}(__exec, __rng_n);

auto __n_groups = std::get<0>(__nd_range_params);
akukanov marked this conversation as resolved.
Show resolved Hide resolved
if (__n_groups > 1)
{
auto __wgroup_size = std::get<1>(__nd_range_params);

auto __iters_per_work_item =
oneapi::dpl::__internal::__dpl_ceiling_div(__rng_n, __n_groups * __wgroup_size);

// If our work capacity is not enough to process all data in one iteration, will tune the number of work-groups
if (__iters_per_work_item > 1)
{
// Empirically found formula for typical devices.
SergeyKopienko marked this conversation as resolved.
Show resolved Hide resolved
const auto __rng_x = __rng_n / __base_rng_n;
const auto __required_iters_per_work_item = std::max(std::sqrt(__rng_x), 1.);


// We halve the number of work-groups until the number of iterations per work-item
// is greater than or equal to the desired number of iterations per work-item.
while (__iters_per_work_item < __required_iters_per_work_item && __n_groups > 1)
{
__n_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__n_groups, 2);
__iters_per_work_item = oneapi::dpl::__internal::__dpl_ceiling_div(__rng_n, __n_groups * __wgroup_size);
}
}

__nd_range_params = {__n_groups, __wgroup_size};
}

return __nd_range_params;
}
};
#endif // !_ONEDPL_FPGA_EMU

// Base pattern for __parallel_or and __parallel_find. The execution depends on tag type _BrickTag.
template <typename KernelName, bool __or_tag_check, typename _ExecutionPolicy, typename _BrickTag,
typename __FoundStateType, typename _Predicate, typename... _Ranges>
Expand Down Expand Up @@ -1274,23 +1353,13 @@ __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPoli
auto __rng_n = oneapi::dpl::__ranges::__get_first_range_size(__rngs...);
assert(__rng_n > 0);

// TODO: find a way to generalize getting of reliable work-group size
// Limit the work-group size to prevent large sizes on CPUs. Empirically found value.
// This value exceeds the current practical limit for GPUs, but may need to be re-evaluated in the future.
std::size_t __wgroup_size = oneapi::dpl::__internal::__max_work_group_size(__exec, (std::size_t)4096);

const auto __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec);
auto __n_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__rng_n, __wgroup_size);
__n_groups = ::std::min(__n_groups, decltype(__n_groups)(__max_cu));

// Pass all small data into single WG implementation
constexpr std::size_t __max_iters_per_work_item = 32;
if (__rng_n <= __wgroup_size * __max_iters_per_work_item)
{
__n_groups = 1;
}
// Evaluate the amount of work-groups and work-group size
const auto __nd_range_params =
__parallel_find_or_nd_range_tuner<oneapi::dpl::__internal::__device_backend_tag>{}(__exec, __rng_n);
const auto __n_groups = std::get<0>(__nd_range_params);
const auto __wgroup_size = std::get<1>(__nd_range_params);
SergeyKopienko marked this conversation as resolved.
Show resolved Hide resolved

_PRINT_INFO_IN_DEBUG_MODE(__exec, __wgroup_size, __max_cu);
_PRINT_INFO_IN_DEBUG_MODE(__exec, __wgroup_size);

using _AtomicType = typename _BrickTag::_AtomicType;
const _AtomicType __init_value = _BrickTag::__init_value(__rng_n);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,7 @@ class __kernel_compiler
template <typename _Policy>
inline void
// Passing policy by value should be enough for debugging
__print_device_debug_info(_Policy __policy, size_t __wg_size = 0, size_t __max_cu = 0)
__print_device_debug_info(const _Policy& __policy, size_t __wg_size = 0, size_t __max_cu = 0)
akukanov marked this conversation as resolved.
Show resolved Hide resolved
{
::std::cout << "Device info" << ::std::endl;
::std::cout << " > device name: " << oneapi::dpl::__internal::__device_info(__policy) << ::std::endl;
Expand All @@ -309,7 +309,8 @@ __print_device_debug_info(_Policy __policy, size_t __wg_size = 0, size_t __max_c
}
#else
template <typename _Policy>
inline void __print_device_debug_info(_Policy, size_t = 0, size_t = 0)
inline void
__print_device_debug_info(const _Policy& __policy, size_t = 0, size_t = 0)
akukanov marked this conversation as resolved.
Show resolved Hide resolved
{
}
#endif
Expand Down
Loading