Improve SYCL backend __parallel_for performance for large input sizes #1870

Open · wants to merge 21 commits into base: main
Changes from 15 commits
127 changes: 119 additions & 8 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -214,6 +214,12 @@ class __scan_single_wg_dynamic_kernel;
template <typename... Name>
class __scan_copy_single_wg_kernel;

template <typename... Name>
class __parallel_for_small_kernel;

template <typename... Name>
class __parallel_for_large_kernel;

//------------------------------------------------------------------------
// parallel_for - async pattern
//------------------------------------------------------------------------
@@ -222,10 +228,10 @@ class __scan_copy_single_wg_kernel;
// as the parameter pack that can be empty (for unnamed kernels) or contain exactly one
// type (for explicitly specified name by the user)
template <typename _KernelName>
-struct __parallel_for_submitter;
+struct __parallel_for_small_submitter;

template <typename... _Name>
-struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>>
+struct __parallel_for_small_submitter<__internal::__optional_kernel_name<_Name...>>
{
template <typename _ExecutionPolicy, typename _Fp, typename _Index, typename... _Ranges>
auto
@@ -246,6 +252,96 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>>
}
};

template <typename _KernelName, typename... _Ranges>
struct __parallel_for_large_submitter;

template <typename... _Name, typename... _Ranges>
struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name...>, _Ranges...>
{
// Flatten the range, as std::tuple value types in the range are likely coming from separate ranges in a zip
// iterator.
using _FlattenedRangesTuple = typename oneapi::dpl::__internal::__flatten_std_or_internal_tuple<
std::tuple<oneapi::dpl::__internal::__value_t<_Ranges>...>>::type;
using _MinValueType = typename oneapi::dpl::__internal::__min_tuple_type<_FlattenedRangesTuple>::type;
// __iters_per_work_item is set to 1, 2, 4, 8, or 16 depending on the smallest type in the
// flattened ranges. This allows us to launch enough work per item to saturate the device's memory
// bandwidth. This heuristic errs on the side of launching more work per item than is needed to
// achieve full bandwidth utilization. 16 bytes per range per work item has been found to be a good
// value across the different for-based algorithms.
static constexpr std::uint8_t __bytes_per_work_item = 16;
static constexpr std::uint8_t __iters_per_work_item =
oneapi::dpl::__internal::__dpl_ceiling_div(__bytes_per_work_item, sizeof(_MinValueType));
// Limit the work-group size to 512, which has empirically yielded the best results across different architectures.
static constexpr std::uint16_t __max_work_group_size = 512;

// Once there is enough work to launch a group on each compute unit with our chosen __iters_per_work_item,
// then we should start using this code path.
template <typename _ExecutionPolicy>
static std::size_t
__estimate_best_start_size(const _ExecutionPolicy& __exec)
{
const std::size_t __work_group_size =
oneapi::dpl::__internal::__max_work_group_size(__exec, __max_work_group_size);
const std::uint32_t __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec);
return __work_group_size * __iters_per_work_item * __max_cu;
}

template <typename _ExecutionPolicy, typename _Fp, typename _Index>
auto
operator()(_ExecutionPolicy&& __exec, _Fp __brick, _Index __count, _Ranges&&... __rngs) const
{
assert(oneapi::dpl::__ranges::__get_first_range_size(__rngs...) > 0);
_PRINT_INFO_IN_DEBUG_MODE(__exec);
auto __event = __exec.queue().submit([&__rngs..., &__brick, &__exec, __count](sycl::handler& __cgh) {
// get access to the data under the SYCL buffer:
oneapi::dpl::__ranges::__require_access(__cgh, __rngs...);
const std::size_t __work_group_size =
oneapi::dpl::__internal::__max_work_group_size(__exec, __max_work_group_size);
const std::size_t __num_groups =
oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * __iters_per_work_item));
const std::size_t __num_items = __num_groups * __work_group_size;
__cgh.parallel_for<_Name...>(
sycl::nd_range(sycl::range<1>(__num_items), sycl::range<1>(__work_group_size)),
[=](sycl::nd_item</*dim=*/1> __ndi) {
// TODO: Investigate adding a vectorized path similar to reduce.
// Initial investigation showed benefits for in-place for-based algorithms (e.g. std::for_each) but
// performance regressions for out-of-place (e.g. std::copy) where the compiler was unable to
// vectorize our code. Vectorization may also improve performance of for-algorithms over small data
// types.
auto [__idx, __group_start_idx, __stride, __is_full] =
__stride_recommender(__ndi, __count, __iters_per_work_item, __work_group_size);
Review comment (Contributor): Maybe it would be better to specify __stride_recommender as a template parameter type, with the ability to change it from the caller side if required?

Author reply (Contributor): In my opinion, leaving it as it currently is would be best, although maybe I should rename the function since "stride" is a loaded term.

We want to enforce a good access pattern. Work-group strides to enable coalescing are a good general choice for devices, and sub-group strides are used in the oneAPI GPU optimization guide and show slightly better performance for SPIR-V compiled targets. I do not see a need to change this, as we should always use the best-performing stride. I could see future improvements modifying __stride_recommender itself, but I do not see a need to accept a templated functor at this point. Do you see a potential use case for this?

if (__is_full)
{
_ONEDPL_PRAGMA_UNROLL
for (std::uint8_t __i = 0; __i < __iters_per_work_item; ++__i)
{
__brick(__idx, __rngs...);
__idx += __stride;
}
}
else
{
// Recompute iters per item and manually unroll last loop iteration to remove most branching.
if (__group_start_idx >= __count)
return;
const std::uint8_t __adjusted_iters_per_work_item =
oneapi::dpl::__internal::__dpl_ceiling_div(__count - __group_start_idx, __stride);
for (std::uint8_t __i = 0; __i < __adjusted_iters_per_work_item - 1; ++__i)
Review comment (Contributor): The same.

Author reply (Contributor): Answered in the other comment regarding uint8_t.

{
__brick(__idx, __rngs...);
__idx += __stride;
}
if (__idx < __count)
{
__brick(__idx, __rngs...);
}
}
Review comment (@SergeyKopienko, Contributor, Sep 24, 2024), suggested change: restructure the non-full branch to guard with if (__group_start_idx < __count) instead of returning early:

// Recompute iters per item and manually unroll last loop iteration to remove most branching.
if (__group_start_idx < __count)
{
const std::uint8_t __adjusted_iters_per_work_item =
oneapi::dpl::__internal::__dpl_ceiling_div(__count - __group_start_idx, __stride);
for (std::uint8_t __i = 0; __i < __adjusted_iters_per_work_item - 1; ++__i)
{
__brick(__idx, __rngs...);
__idx += __stride;
}
if (__idx < __count)
{
__brick(__idx, __rngs...);
}
}

Author reply (Contributor): A similar change has been committed.

});
});
return __future(__event);
}
};
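
For illustration, here is a minimal standalone sketch of the __iters_per_work_item heuristic described in the comments above, using hypothetical value types. The ceiling_div helper below only mirrors __dpl_ceiling_div; it is not the library's implementation, and the type sizes assume a platform where int is 4 bytes.

#include <cstddef>
#include <cstdint>

// Mirrors the PR's __bytes_per_work_item constant.
constexpr std::size_t bytes_per_work_item = 16;

// ceil(a / b) for positive integers, mirroring __dpl_ceiling_div.
constexpr std::size_t ceiling_div(std::size_t a, std::size_t b) { return (a + b - 1) / b; }

// Example: a zip of int (4-byte) and double (8-byte) ranges flattens to
// std::tuple<int, double>; the smallest type is int, so each work item gets
// ceil(16 / 4) = 4 iterations.
static_assert(ceiling_div(bytes_per_work_item, sizeof(int)) == 4, "int-limited zip: 4 iterations");

// A range of 1-byte elements would instead get ceil(16 / 1) = 16 iterations,
// and a range of 16-byte elements would get exactly 1.
static_assert(ceiling_div(bytes_per_work_item, sizeof(std::uint8_t)) == 16, "1-byte type: 16 iterations");

int main() { return 0; }
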

// General version of parallel_for, with one additional parameter: __count, the number of iterations of the loop in __cgh.parallel_for.
// For some algorithms the size of the processing range is n, but the number of iterations is n/2.
template <typename _ExecutionPolicy, typename _Fp, typename _Index, typename... _Ranges>
@@ -254,17 +350,32 @@ __parallel_for(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&&
_Ranges&&... __rngs)
{
using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>;
-using _ForKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<_CustomName>;
-
-return __parallel_for_submitter<_ForKernel>()(::std::forward<_ExecutionPolicy>(__exec), __brick, __count,
-::std::forward<_Ranges>(__rngs)...);
+using _ForKernelSmall =
+oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__parallel_for_small_kernel<_CustomName>>;
+using _ForKernelLarge =
+oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__parallel_for_large_kernel<_CustomName>>;
+
+using __small_submitter = __parallel_for_small_submitter<_ForKernelSmall>;
+using __large_submitter = __parallel_for_large_submitter<_ForKernelLarge, _Ranges...>;
// Compile two kernels: one for small-to-medium inputs and a second for large. This avoids runtime checks within a single
// kernel that worsen performance for small cases.
if (__count < __large_submitter::__estimate_best_start_size(__exec))
Review comment (Contributor): I am not sure that __large_submitter is able to calculate the best size in all cases; it does not depend on the kernel code and its logic at all. So maybe we should have the ability to customize this condition check too?

Reply (Contributor): It's an interesting point. I'm trying to think about how different workloads / algorithms would impact this number...

I think the decision is largely dependent on overhead vs. memory-bandwidth optimization. It's possible that more computation would make this less important, because we would be less reliant on memory bandwidth. However, depending on user-provided callables makes it very difficult for the library to make good decisions, unless there are some APIs that we know are always computationally heavy and use parallel_for internally (I don't know of any).

Another aspect to consider: the larger the minimum type size in the input ranges, the fewer iterations the large submitter runs. At the limit, I imagine there is no advantage to the large submitter when it would only run a single iteration. This is knowable at compile time; perhaps we should detect this case and always choose the small submitter when a single iteration would be used.

Author reply (Contributor): One thing to note is that the performance difference at this "estimated point" is very small between the two versions, so computing an optimal switch point for each algorithm does not bring any significant benefit and would likely be overtuned for a specific case / architecture, so I am in favor of leaving it as it currently is. The improvements are really observed once we scale to large sizes.

With regards to @danhoeflinger's comment, I agree that a user providing a heavily compute-intensive operator might minimize the observed benefit if we become compute bound. I do not think we have encountered such a case yet, and I do not think there is a high risk of performance loss in this scenario, although there may be small differences around this estimated point.

Good point on the case where we load one element per item. I will look into adding this as a compile-time decision.

Reply (Contributor): This makes sense to me. Thanks for the reply.

{
return __small_submitter()(std::forward<_ExecutionPolicy>(__exec), __brick, __count,
std::forward<_Ranges>(__rngs)...);
}
else
{
return __large_submitter()(std::forward<_ExecutionPolicy>(__exec), __brick, __count,
std::forward<_Ranges>(__rngs)...);
}
}
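
As a back-of-the-envelope example of where this dispatch point lands, the snippet below mirrors the __estimate_best_start_size arithmetic; the device numbers are assumptions for illustration, not values queried from SYCL.

#include <cstddef>
#include <iostream>

int main()
{
    // Assumed device properties, for illustration only.
    const std::size_t work_group_size = 512;   // capped by __max_work_group_size
    const std::size_t iters_per_work_item = 4; // e.g. 4-byte elements: ceil(16 / 4)
    const std::size_t max_compute_units = 60;  // hypothetical CU count

    // Mirrors __estimate_best_start_size: enough work to give every compute unit
    // one full work-group at the chosen iterations per work item.
    const std::size_t threshold = work_group_size * iters_per_work_item * max_compute_units;
    std::cout << "switch to the large submitter at __count >= " << threshold << "\n"; // 122880
    return 0;
}
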

//------------------------------------------------------------------------
// parallel_transform_scan - async pattern
//------------------------------------------------------------------------

-// Please see the comment for __parallel_for_submitter for optional kernel name explanation
+// Please see the comment for __parallel_for_small_submitter for optional kernel name explanation
template <typename _CustomName, typename _PropagateScanName>
struct __parallel_scan_submitter;

@@ -1796,7 +1907,7 @@ struct __partial_merge_kernel
}
};

-// Please see the comment for __parallel_for_submitter for optional kernel name explanation
+// Please see the comment for __parallel_for_small_submitter for optional kernel name explanation
template <typename _GlobalSortName, typename _CopyBackName>
struct __parallel_partial_sort_submitter;

34 changes: 34 additions & 0 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
@@ -823,6 +823,40 @@ class __static_monotonic_dispatcher<::std::integer_sequence<::std::uint16_t, _X,
}
};

// Utility to recommend a stride for the best-performing memory access pattern from empirical testing on different
// devices. This utility can only be called from the device.
//
// SPIR-V compilation targets show best performance with a stride of the sub-group size.
// Other compilation targets perform best with a work-group size stride.
template <typename NdItem>
std::tuple<std::size_t, std::size_t, std::size_t, bool>
__stride_recommender(const NdItem& __ndi, std::size_t __count, std::size_t __iters_per_work_item,
std::size_t __work_group_size)
{
if constexpr (oneapi::dpl::__internal::__is_spirv_target_v)
{
const __dpl_sycl::__sub_group __sub_group = __ndi.get_sub_group();
const std::uint32_t __sub_group_size = __sub_group.get_local_linear_range();
const std::uint32_t __sub_group_id = __sub_group.get_group_linear_id();
const std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id();
const std::size_t __work_group_id = __ndi.get_group().get_group_linear_id();

const std::size_t __sub_group_start_idx =
__iters_per_work_item * (__work_group_id * __work_group_size + __sub_group_size * __sub_group_id);
const bool __is_full_sub_group = __sub_group_start_idx + __iters_per_work_item * __sub_group_size <= __count;
const std::size_t __work_item_idx = __sub_group_start_idx + __sub_group_local_id;
return std::make_tuple(__work_item_idx, __sub_group_start_idx, __sub_group_size, __is_full_sub_group);
}
else
{
const std::size_t __work_group_start_idx =
__ndi.get_group().get_group_linear_id() * __work_group_size * __iters_per_work_item;
const std::size_t __work_item_idx = __work_group_start_idx + __ndi.get_local_linear_id();
const bool __is_full_work_group = __work_group_start_idx + __iters_per_work_item * __work_group_size <= __count;
return std::make_tuple(__work_item_idx, __work_group_start_idx, __work_group_size, __is_full_work_group);
}
}

} // namespace __par_backend_hetero
} // namespace dpl
} // namespace oneapi
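
The following host-only sketch reproduces the index arithmetic of both __stride_recommender branches for a toy configuration (work-group size 8, sub-group size 4, 2 iterations per work item; all assumed values), so the two access patterns can be printed and compared without a device.

#include <cstddef>
#include <iostream>

int main()
{
    const std::size_t wg_size = 8, sg_size = 4, iters = 2;

    // Non-SPIR-V branch: work-group stride. Consecutive work items in group 0 touch
    // consecutive elements; each iteration advances by wg_size.
    std::cout << "work-group stride (group 0):\n";
    for (std::size_t local_id = 0; local_id < wg_size; ++local_id)
    {
        std::size_t idx = /*group start*/ 0 + local_id;
        std::cout << "  item " << local_id << ":";
        for (std::size_t i = 0; i < iters; ++i, idx += wg_size)
            std::cout << ' ' << idx;
        std::cout << '\n';
    }

    // SPIR-V branch: sub-group stride. Each sub-group owns a contiguous block of
    // iters * sg_size elements and strides by sg_size within it.
    std::cout << "sub-group stride (group 0):\n";
    for (std::size_t sg_id = 0; sg_id < wg_size / sg_size; ++sg_id)
        for (std::size_t lane = 0; lane < sg_size; ++lane)
        {
            std::size_t idx = iters * (sg_size * sg_id) + lane;
            std::cout << "  sub-group " << sg_id << ", lane " << lane << ":";
            for (std::size_t i = 0; i < iters; ++i, idx += sg_size)
                std::cout << ' ' << idx;
            std::cout << '\n';
        }
    return 0;
}
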
19 changes: 19 additions & 0 deletions include/oneapi/dpl/pstl/tuple_impl.h
@@ -793,6 +793,25 @@ struct __decay_with_tuple_specialization<::std::tuple<_Args...>>
template <typename... _Args>
using __decay_with_tuple_specialization_t = typename __decay_with_tuple_specialization<_Args...>::type;

// Flatten nested std::tuple or oneapi::dpl::__internal::tuple types into a single std::tuple.
template <typename _T>
struct __flatten_std_or_internal_tuple
{
using type = std::tuple<_T>;
};

template <typename... _Ts>
struct __flatten_std_or_internal_tuple<std::tuple<_Ts...>>
{
using type = decltype(std::tuple_cat(std::declval<typename __flatten_std_or_internal_tuple<_Ts>::type>()...));
};

template <typename... _Ts>
struct __flatten_std_or_internal_tuple<oneapi::dpl::__internal::tuple<_Ts...>>
{
using type = decltype(std::tuple_cat(std::declval<typename __flatten_std_or_internal_tuple<_Ts>::type>()...));
};

} // namespace __internal
} // namespace dpl
} // namespace oneapi
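
A quick compile-time check of what the flattening is expected to produce. This is illustrative only: it assumes the internal trait is reachable after including an oneDPL header, which is an implementation detail rather than public API.

#include <tuple>
#include <type_traits>
#include <oneapi/dpl/execution> // assumed to pull in the internal tuple utilities

using nested_t = std::tuple<int, std::tuple<float, std::tuple<char>>, double>;
using flat_t = typename oneapi::dpl::__internal::__flatten_std_or_internal_tuple<nested_t>::type;
static_assert(std::is_same_v<flat_t, std::tuple<int, float, char, double>>,
              "nested tuples are flattened left to right");

int main() { return 0; }
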
20 changes: 20 additions & 0 deletions include/oneapi/dpl/pstl/utils.h
@@ -783,6 +783,26 @@ union __lazy_ctor_storage
}
};

// Utility that returns the smallest type in a tuple.
template <typename _Tuple>
class __min_tuple_type;

template <typename _T>
class __min_tuple_type<std::tuple<_T>>
{
public:
using type = _T;
};

template <typename _T, typename... _Ts>
class __min_tuple_type<std::tuple<_T, _Ts...>>
{
using __min_type_ts = typename __min_tuple_type<std::tuple<_Ts...>>::type;

public:
using type = std::conditional_t<(sizeof(_T) < sizeof(__min_type_ts)), _T, __min_type_ts>;
};

} // namespace __internal
} // namespace dpl
} // namespace oneapi
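
A similar illustrative check for __min_tuple_type, again assuming the internal utility is reachable through an oneDPL header (an implementation detail, not public API).

#include <cstdint>
#include <tuple>
#include <type_traits>
#include <oneapi/dpl/execution> // assumed to pull in the internal utilities

using min_t = typename oneapi::dpl::__internal::__min_tuple_type<
    std::tuple<double, std::uint8_t, int>>::type;
static_assert(std::is_same_v<min_t, std::uint8_t>, "the type with the smallest sizeof is selected");

int main() { return 0; }
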