From cd477a378fa89bb6da81c63e8d3187dc0aa2e5ee Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 14 Nov 2024 12:57:51 +0100 Subject: [PATCH 01/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - re-implement __find_start_point function Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 155 +++++++++++++++--- 1 file changed, 129 insertions(+), 26 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 753e32816a0..bd87d000354 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -32,6 +32,12 @@ namespace dpl namespace __par_backend_hetero { +template +using _split_point_t = std::pair<_Index, _Index>; + +template +constexpr _split_point_t<_Index> __zero_split_point{0, 0}; + //Searching for an intersection of a merge matrix (n1, n2) diagonal with the Merge Path to define sub-ranges //to serial merge. For example, a merge matrix for [0,1,1,2,3] and [0,0,2,3] is shown below: // 0 1 1 2 3 @@ -45,37 +51,134 @@ namespace __par_backend_hetero // | ----> // 3 | 0 0 0 0 0 | template -auto +_split_point_t<_Index> __find_start_point(const _Rng1& __rng1, const _Rng2& __rng2, const _Index __i_elem, const _Index __n1, const _Index __n2, _Compare __comp) { - //searching for the first '1', a lower bound for a diagonal [0, 0,..., 0, 1, 1,.... 
1, 1] - oneapi::dpl::counting_iterator<_Index> __diag_it(0); + const _Index __rng1_from = 0; + const _Index __rng1_to = __n1; + const _Index __rng2_from = 0; + const _Index __rng2_to = __n2; - if (__i_elem < __n2) //a condition to specify upper or lower part of the merge matrix to be processed - { - const _Index __q = __i_elem; //diagonal index - const _Index __n_diag = std::min<_Index>(__q, __n1); //diagonal size - auto __res = - std::lower_bound(__diag_it, __diag_it + __n_diag, 1 /*value to find*/, - [&__rng2, &__rng1, __q, __comp](const auto& __i_diag, const auto& __value) mutable { - const auto __zero_or_one = __comp(__rng2[__q - __i_diag - 1], __rng1[__i_diag]); - return __zero_or_one < __value; - }); - return std::make_pair(*__res, __q - *__res); - } - else + assert(__rng1_from <= __rng1_to); + assert(__rng2_from <= __rng2_to); + + assert(__rng1_to > 0 || __rng2_to > 0); + + if constexpr (!std::is_pointer_v<_Rng1>) + assert(__rng1_to <= __rng1.size()); + if constexpr (!std::is_pointer_v<_Rng2>) + assert(__rng2_to <= __rng2.size()); + + assert(__i_elem >= 0); + + // ----------------------- EXAMPLE ------------------------ + // Let's consider the following input data: + // rng1.size() = 10 + // rng2.size() = 6 + // i_diag = 9 + // Let's define the following ranges for processing: + // rng1: [3, ..., 9) -> __rng1_from = 3, __rng1_to = 9 + // rng2: [1, ..., 4) -> __rng2_from = 1, __rng2_to = 4 + // + // The goal: required to process only X' items of the merge matrix + // as intersection of rng1[3, ..., 9) and rng2[1, ..., 4) + // + // -------------------------------------------------------- + // + // __diag_it_begin(rng1) __diag_it_end(rng1) + // (init state) (dest state) (init state, dest state) + // | | | + // V V V + // + + + + + + + // \ rng1 0 1 2 3 4 5 6 7 8 9 + // rng2 +--------------------------------------+ + // 0 | ^ ^ ^ X | <--- __diag_it_end(rng2) (init state) + // + 1 | <----------------- + + X'2 ^ | <--- __diag_it_end(rng2) (dest state) + // + 2 | 
<----------------- + X'1 | | + // + 3 | <----------------- X'0 | | <--- __diag_it_begin(rng2) (dest state) + // 4 | X ^ | | + // 5 | X | | | <--- __diag_it_begin(rng2) (init state) + // +-------AX-----------+-----------+-----+ + // AX | | + // AX | | + // Run lower_bound:[from = 5, to = 8) + // + // AX - absent items in rng2 + // + // We have three points on diagonal for call comparison: + // X'0 : call __comp(rng1[5], rng2[3]) // 5 + 3 == 9 - 1 == 8 + // X'1 : call __comp(rng1[6], rng2[2]) // 6 + 2 == 9 - 1 == 8 + // X'3 : call __comp(rng1[7], rng2[1]) // 7 + 1 == 9 - 1 == 8 + // - where for every comparing pairs idx(rng1) + idx(rng2) == i_diag - 1 + + //////////////////////////////////////////////////////////////////////////////////// + // Process the corner case: for the first diagonal with the index 0 split point + // is equal to (0, 0) regardless of the size and content of the data. + if (__i_elem > 0) { - const _Index __q = __i_elem - __n2; //diagonal index - const _Index __n_diag = std::min<_Index>(__n1 - __q, __n2); //diagonal size - auto __res = - std::lower_bound(__diag_it, __diag_it + __n_diag, 1 /*value to find*/, - [&__rng2, &__rng1, __n2, __q, __comp](const auto& __i_diag, const auto& __value) mutable { - const auto __zero_or_one = __comp(__rng2[__n2 - __i_diag - 1], __rng1[__q + __i_diag]); - return __zero_or_one < __value; - }); - return std::make_pair(__q + *__res, __n2 - *__res); + //////////////////////////////////////////////////////////////////////////////////// + // Taking into account the specified constraints of the range of processed data + const auto __index_sum = __i_elem - 1; + + using _IndexSigned = std::make_signed_t<_Index>; + + _IndexSigned idx1_from = __rng1_from; + _IndexSigned idx1_to = __rng1_to; + assert(idx1_from <= idx1_to); + + _IndexSigned idx2_from = __index_sum - (__rng1_to - 1); + _IndexSigned idx2_to = __index_sum - __rng1_from + 1; + assert(idx2_from <= idx2_to); + + const _IndexSigned idx2_from_diff = + idx2_from < 
(_IndexSigned)__rng2_from ? (_IndexSigned)__rng2_from - idx2_from : 0; + const _IndexSigned idx2_to_diff = idx2_to > (_IndexSigned)__rng2_to ? idx2_to - (_IndexSigned)__rng2_to : 0; + + idx1_to -= idx2_from_diff; + idx1_from += idx2_to_diff; + + idx2_from = __index_sum - (idx1_to - 1); + idx2_to = __index_sum - idx1_from + 1; + + assert(idx1_from <= idx1_to); + assert(__rng1_from <= idx1_from && idx1_to <= __rng1_to); + + assert(idx2_from <= idx2_to); + assert(__rng2_from <= idx2_from && idx2_to <= __rng2_to); + + //////////////////////////////////////////////////////////////////////////////////// + // Run search of split point on diagonal + + using __it_t = oneapi::dpl::counting_iterator<_Index>; + + __it_t __diag_it_begin(idx1_from); + __it_t __diag_it_end(idx1_to); + + constexpr int kValue = 1; + const __it_t __res = + std::lower_bound(__diag_it_begin, __diag_it_end, kValue, [&](_Index __idx, const auto& __value) { + const auto __rng1_idx = __idx; + const auto __rng2_idx = __index_sum - __idx; + + assert(__rng1_from <= __rng1_idx && __rng1_idx < __rng1_to); + assert(__rng2_from <= __rng2_idx && __rng2_idx < __rng2_to); + assert(__rng1_idx + __rng2_idx == __index_sum); + + const auto __zero_or_one = __comp(__rng2[__rng2_idx], __rng1[__rng1_idx]); + return __zero_or_one < kValue; + }); + + const _split_point_t<_Index> __result = std::make_pair(*__res, __index_sum - *__res + 1); + assert(__result.first + __result.second == __i_elem); + + assert(__rng1_from <= __result.first && __result.first <= __rng1_to); + assert(__rng2_from <= __result.second && __result.second <= __rng2_to); + + return __result; } + + return std::make_pair(0, 0); } // Do serial merge of the data from rng1 (starting from start1) and rng2 (starting from start2) and writing @@ -157,7 +260,7 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_N oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); __cgh.parallel_for<_Name...>(sycl::range(__steps), 
[=](sycl::item __item_id) { const _IdType __i_elem = __item_id.get_linear_id() * __chunk; - const auto __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); + const _split_point_t<_IdType> __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, __comp); }); From c76ad72c59673a0508570255b3e61cfc17ab8b9e Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 14 Nov 2024 12:59:13 +0100 Subject: [PATCH 02/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - rename template params in __parallel_merge_submitter Signed-off-by: Sergey Kopienko --- .../hetero/dpcpp/parallel_backend_sycl_merge.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index bd87d000354..0c0befe175e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -233,11 +233,11 @@ __serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, _Index _ } // Please see the comment for __parallel_for_submitter for optional kernel name explanation -template +template struct __parallel_merge_submitter; -template -struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_Name...>> +template +struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_MergeKernelName...>> { template auto @@ -258,7 +258,8 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_N auto __event = __exec.queue().submit([&](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); - __cgh.parallel_for<_Name...>(sycl::range(__steps), [=](sycl::item __item_id) { + 
__cgh.parallel_for<_MergeKernelName...>( + sycl::range(__steps), [=](sycl::item __item_id) { const _IdType __i_elem = __item_id.get_linear_id() * __chunk; const _split_point_t<_IdType> __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, @@ -283,18 +284,18 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy if (__n <= std::numeric_limits::max()) { using _WiIndex = std::uint32_t; - using _MergeKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __merge_kernel_name<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _MergeKernel>()( + return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), std::forward<_Range3>(__rng3), __comp); } else { using _WiIndex = std::uint64_t; - using _MergeKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __merge_kernel_name<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _MergeKernel>()( + return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), std::forward<_Range3>(__rng3), __comp); } From cdf7d2b94d031db749a22fb79e2ea00faf21b548 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 14 Nov 2024 13:08:44 +0100 Subject: [PATCH 03/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - implementation of __parallel_merge_submitter_large Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 235 ++++++++++++++++++ 1 file changed, 235 
insertions(+) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 0c0befe175e..472a45bd0c3 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -21,6 +21,7 @@ #include // std::uint8_t, ... #include // std::make_pair, std::forward #include // std::min, std::lower_bound +#include // std::tuple #include "sycl_defs.h" #include "parallel_backend_sycl_utils.h" @@ -270,6 +271,240 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_M } }; +template +struct __parallel_merge_submitter_large; + +template +struct __parallel_merge_submitter_large<_IdType, _CustomName, + __internal::__optional_kernel_name<_DiagonalsKernelName...>, + __internal::__optional_kernel_name<_MergeKernelName...>> +{ + // Create local accessors for data cache in SLM: + // - one accessor for the first and for the second ranges if _Range1 and _Range2 has the SAME value types; + // - two accessors for the first and for the second ranges if _Range1 and _Range2 has DIFFERENT value types. 
+ struct __merge_slm_helper + { + template + static std::size_t + get_data_size(_Range1&& __rng1, _Range2&& __rng2) + { + using _Range1ValueType = typename std::iterator_traits::value_type; + using _Range2ValueType = typename std::iterator_traits::value_type; + + return sizeof(_Range1ValueType) + sizeof(_Range2ValueType); + } + + template + static constexpr auto + create_local_accessors(sycl::handler& __cgh, _Range1&& __rng1, _Range2&& __rng2, + std::size_t __slm_cached_data_size) + { + using _Range1ValueType = typename std::iterator_traits::value_type; + using _Range2ValueType = typename std::iterator_traits::value_type; + + if constexpr (std::is_same_v<_Range1ValueType, _Range2ValueType>) + return std::tuple<__dpl_sycl::__local_accessor<_Range1ValueType>>( + __dpl_sycl::__local_accessor<_Range1ValueType>(2 * __slm_cached_data_size, __cgh)); + else + return std::tuple<__dpl_sycl::__local_accessor<_Range1ValueType>, + __dpl_sycl::__local_accessor<_Range2ValueType>>( + __dpl_sycl::__local_accessor<_Range1ValueType>(__slm_cached_data_size, __cgh), + __dpl_sycl::__local_accessor<_Range2ValueType>(__slm_cached_data_size, __cgh)); + } + + template + static auto + get_local_accessor(AccessorsTuple& __acc_tuple, std::size_t __offset = 0) + { + static_assert(std::tuple_size_v == 1 || std::tuple_size_v == 2); + + if constexpr (std::tuple_size_v == 1) + return std::pair(std::get<0>(__acc_tuple), __offset); + + else + return std::pair(std::get(__acc_tuple), 0); + } + }; + + template + auto + operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const + { + const _IdType __n1 = __rng1.size(); + const _IdType __n2 = __rng2.size(); + const _IdType __n = __n1 + __n2; + + assert(__n1 > 0 || __n2 > 0); + + _PRINT_INFO_IN_DEBUG_MODE(__exec); + + // Empirical number of values to process per work-item + const _IdType __chunk = __exec.queue().get_device().is_cpu() ? 
128 : 4; + assert(__chunk > 0); + + // Pessimistically only use half of the memory to take into account memory used by compiled kernel + const std::size_t __max_slm_size_adj = + oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, __merge_slm_helper::get_data_size(__rng1, __rng2)); + + // The amount of data must be a multiple of the chunk size. + const std::size_t __max_source_data_items_fit_into_slm = __max_slm_size_adj - __max_slm_size_adj % __chunk; + assert(__max_source_data_items_fit_into_slm > 0); + assert(__max_source_data_items_fit_into_slm % __chunk == 0); + + // The amount of items in the each work-group is the amount of diagonals processing between two work-groups + 1 (for the left base diagonal in work-group) + const _IdType __items_in_wg_count = __max_source_data_items_fit_into_slm / __chunk; + assert(__items_in_wg_count > 0); + + // The amount of the base diagonals is the amount of the work-groups + // - also it's the distance between two base diagonals is equal to the amount of work-items in each work-group + const _IdType __wg_count = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __max_source_data_items_fit_into_slm); + + // Create storage for save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) + // - in GLOBAL coordinates + using __base_diagonals_sp_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>; + __base_diagonals_sp_storage_t __base_diagonals_sp_global_storage{__exec, 0, __wg_count + 1}; + + // 1. 
Calculate split points on each base diagonal + // - one work-item processing one base diagonal + sycl::event __event = __exec.queue().submit([&](sycl::handler& __cgh) { + + oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); + auto __base_diagonals_sp_global_acc = __base_diagonals_sp_global_storage.template __get_scratch_acc(__cgh, __dpl_sycl::__no_init{}); + + __cgh.parallel_for<_DiagonalsKernelName...>( + sycl::range(__wg_count + 1), [=](sycl::item __item_id) { + + const std::size_t __global_idx = __item_id.get_linear_id(); + + _split_point_t<_IdType>* __base_diagonals_sp_global_ptr = __base_diagonals_sp_storage_t::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); + + // Save top-left split point for first/last base diagonals of merge matrix + // - in GLOBAL coordinates + _split_point_t<_IdType> __sp(__global_idx == 0 ? __zero_split_point<_IdType> : _split_point_t<_IdType>{__n1, __n2}); + + if (0 < __global_idx && __global_idx < __wg_count) + { + const _IdType __i_elem = __global_idx * __items_in_wg_count * __chunk; + + // Save bottom-right split point for current base diagonal of merge matrix + // - in GLOBAL coordinates + __sp = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); + } + + __base_diagonals_sp_global_ptr[__global_idx] = __sp; + }); + }); + + // 2. 
Merge data using split points on each base diagonal + // - one work-item processing one diagonal + // - work-items grouped to process diagonals between two base diagonals (include left base diagonal and exclude right base diagonal) + __event = __exec.queue().submit([&](sycl::handler& __cgh) { + + __cgh.depends_on(__event); + + oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); + auto __base_diagonals_sp_global_acc = __base_diagonals_sp_global_storage.template __get_scratch_acc(__cgh); + + const std::size_t __slm_cached_data_size = __items_in_wg_count * __chunk; + auto local_accessors = __merge_slm_helper::create_local_accessors(__cgh, __rng1, __rng2, __slm_cached_data_size); + + // Run nd_range parallel_for to process all the data + __cgh.parallel_for<_MergeKernelName...>( + sycl::nd_range(__wg_count * __items_in_wg_count, __items_in_wg_count), + [=](sycl::nd_item __nd_item) + { + // Merge matrix diagonal's GLOBAL index + const std::size_t __global_idx = __nd_item.get_global_linear_id(); + + // Merge sub-matrix LOCAL diagonal's index + const std::size_t __local_idx = __nd_item.get_local_id(0); + + // Merge matrix base diagonal's GLOBAL index + const std::size_t __wg_id = __nd_item.get_group_linear_id(); + + auto __base_diagonals_sp_global_ptr = __base_diagonals_sp_storage_t::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); + + // Split points on left anr right base diagonals + // - in GLOBAL coordinates + assert(__wg_id + 1 < __wg_count + 1); + const _split_point_t<_IdType>& __sp_base_left_global = __base_diagonals_sp_global_ptr[__wg_id]; + const _split_point_t<_IdType>& __sp_base_right_global = __base_diagonals_sp_global_ptr[__wg_id + 1]; + + auto [__local_accessor_rng1, offset_to_slm1] = __merge_slm_helper::template get_local_accessor<0>(local_accessors); + auto [__local_accessor_rng2, offset_to_slm2] = __merge_slm_helper::template get_local_accessor<1>(local_accessors, (std::size_t)(__sp_base_right_global.first 
-__sp_base_left_global.first)); + auto __rngs_data_in_slm1 = std::addressof(__local_accessor_rng1[0]) + offset_to_slm1; + auto __rngs_data_in_slm2 = std::addressof(__local_accessor_rng2[0]) + offset_to_slm2; + + // Full amount of work-items may be great then the amount of diagonals in the merge matrix + // so we should skip the redundant work-items + const bool __out_of_data = __global_idx * __chunk >= __n; + if (!__out_of_data) + { + // Load the current part of merging data placed between two base diagonals into SLM + // TODO implement cooperative data load by multiple work-items + assert(__items_in_wg_count > 1); + if (__local_idx == 0) + { + _IdType __slm_idx = 0; + for (_IdType __idx = __sp_base_left_global.first; __idx < __sp_base_right_global.first; ++__idx, ++__slm_idx) + { + assert(__slm_idx < __slm_cached_data_size); + assert(__idx < __n1); + __rngs_data_in_slm1[__slm_idx] = __rng1[__idx]; + } + } + + if (__local_idx == 1 && __items_in_wg_count > 1 || __local_idx == 0) + { + _IdType __slm_idx = 0; + for (_IdType __idx = __sp_base_left_global.second; __idx < __sp_base_right_global.second; ++__idx, ++__slm_idx) + { + assert(__slm_idx < __slm_cached_data_size); + assert(__idx < __n2); + __rngs_data_in_slm2[__slm_idx] = __rng2[__idx]; + } + } + } + + // Wait until all the data is loaded + // - we shouldn't setup this barrier under any conditions!!! + __dpl_sycl::__group_barrier(__nd_item); + + if (!__out_of_data) + { + // We are between two base diagonals and need to find the start points in the merge matrix area, + // limited by split points of the left and right base diagonals. + + // Find split point in LOCAL coordinates + // - top-left split point is (0, 0); + // - bottom-right split point describes the size of current area between two base diagonals. 
+ assert(__sp_base_right_global.first >= __sp_base_left_global.first); + assert(__sp_base_right_global.second >= __sp_base_left_global.second); + const _split_point_t<_IdType> __sp_local = __find_start_point( + __rngs_data_in_slm1, __rngs_data_in_slm2, // SLM cached copy of merging data + (_IdType)(__local_idx * __chunk), // __i_elem in LOCAL coordinates because __rngs_data_in_slm1 and __rngs_data_in_slm2 is work-group SLM cached copy of source data + (_IdType)(__sp_base_right_global.first - __sp_base_left_global.first), // size of rng1 + (_IdType)(__sp_base_right_global.second - __sp_base_left_global.second), // size of rng2 + __comp); + + // Merge data for the current diagonal + // - we should have here __sp_global in GLOBAL coordinates + __serial_merge(__rngs_data_in_slm1, __rngs_data_in_slm2, // SLM cached copy of merging data + __rng3, // Destination range + __sp_local.first, // __start1 in LOCAL coordinates because __rngs_data_in_slm1 is work-group SLM cached copy of source data + __sp_local.second, // __start2 in LOCAL coordinates because __rngs_data_in_slm2 is work-group SLM cached copy of source data + (_IdType)(__global_idx * __chunk), // __start3 in GLOBAL coordinates because __rng3 is not cached at all + __chunk, + __sp_base_right_global.first - __sp_base_left_global.first, // size of __rngs_data_in_slm1 + __sp_base_right_global.second - __sp_base_left_global.second, // size of __rngs_data_in_slm2 + __comp); + } + }); + }); + return __future(__event); + } +}; + template class __merge_kernel_name; From e5ced865f051c6214ed7f1cd4d40a13851b8d580 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 14 Nov 2024 13:04:45 +0100 Subject: [PATCH 04/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - using __parallel_merge_submitter_large in the __parallel_merge Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 60 +++++++++++++++---- 1 file changed, 47 insertions(+), 13 deletions(-) diff --git 
a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 472a45bd0c3..9053310a7ef 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -508,6 +508,12 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, template class __merge_kernel_name; +template +class __diagonals_kernel_name; + +template +class __merge_kernel_name_large; + template auto __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range1&& __rng1, @@ -516,23 +522,51 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; const auto __n = __rng1.size() + __rng2.size(); - if (__n <= std::numeric_limits::max()) + if (__n < 4 * 1'048'576) { - using _WiIndex = std::uint32_t; - using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); + if (__n <= std::numeric_limits::max()) + { + using _WiIndex = std::uint32_t; + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name<_CustomName, _WiIndex>>; + return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); + } + else + { + using _WiIndex = std::uint64_t; + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name<_CustomName, _WiIndex>>; + return 
__parallel_merge_submitter<_WiIndex, _MergeKernelName>()( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); + } } else { - using _WiIndex = std::uint64_t; - using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); + if (__n <= std::numeric_limits::max()) + { + using _WiIndex = std::uint32_t; + using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __diagonals_kernel_name<_CustomName, _WiIndex>>; + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name_large<_CustomName, _WiIndex>>; + return __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); + } + else + { + using _WiIndex = std::uint64_t; + using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __diagonals_kernel_name<_CustomName, _WiIndex>>; + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name_large<_CustomName, _WiIndex>>; + return __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); + } } } From 99dfb4aba50f67d4f57285ba7d456ff2df57c585 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 15 Nov 2024 12:30:21 +0100 
Subject: [PATCH 05/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - removed redundant comment Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 9053310a7ef..56044f9646e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -476,7 +476,6 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // limited by split points of the left and right base diagonals. // Find split point in LOCAL coordinates - // - top-left split point is (0, 0); // - bottom-right split point describes the size of current area between two base diagonals. assert(__sp_base_right_global.first >= __sp_base_left_global.first); assert(__sp_base_right_global.second >= __sp_base_left_global.second); @@ -495,9 +494,9 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __sp_local.second, // __start2 in LOCAL coordinates because __rngs_data_in_slm2 is work-group SLM cached copy of source data (_IdType)(__global_idx * __chunk), // __start3 in GLOBAL coordinates because __rng3 is not cached at all __chunk, - __sp_base_right_global.first - __sp_base_left_global.first, // size of __rngs_data_in_slm1 - __sp_base_right_global.second - __sp_base_left_global.second, // size of __rngs_data_in_slm2 - __comp); + __sp_base_right_global.first - __sp_base_left_global.first, // size of __rngs_data_in_slm1 + __sp_base_right_global.second - __sp_base_left_global.second, // size of __rngs_data_in_slm2 + __comp); } }); }); From e79d00c97e217682b77e0b48f8e40035a058275a Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 15 Nov 2024 12:32:03 +0100 Subject: [PATCH 06/80] 
include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - small data types should be acceptable too Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 56044f9646e..cec67084da5 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -343,8 +343,10 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, assert(__chunk > 0); // Pessimistically only use half of the memory to take into account memory used by compiled kernel - const std::size_t __max_slm_size_adj = - oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, __merge_slm_helper::get_data_size(__rng1, __rng2)); + const std::size_t __max_slm_size_adj = + std::max((std::size_t)__chunk, + std::min((std::size_t)__n, + oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, __merge_slm_helper::get_data_size(__rng1, __rng2)))); // The amount of data must be a multiple of the chunk size. 
const std::size_t __max_source_data_items_fit_into_slm = __max_slm_size_adj - __max_slm_size_adj % __chunk; From 30b70b3d700039b2882a6c05f9edd7ebbb742fff Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 15 Nov 2024 12:32:51 +0100 Subject: [PATCH 07/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - define __base_diagonals_sp_global_ptr outside of parallel_for Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index cec67084da5..ca7813dc3f7 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -406,6 +406,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); auto __base_diagonals_sp_global_acc = __base_diagonals_sp_global_storage.template __get_scratch_acc(__cgh); + auto __base_diagonals_sp_global_ptr = __base_diagonals_sp_storage_t::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); const std::size_t __slm_cached_data_size = __items_in_wg_count * __chunk; auto local_accessors = __merge_slm_helper::create_local_accessors(__cgh, __rng1, __rng2, __slm_cached_data_size); @@ -424,8 +425,6 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // Merge matrix base diagonal's GLOBAL index const std::size_t __wg_id = __nd_item.get_group_linear_id(); - auto __base_diagonals_sp_global_ptr = __base_diagonals_sp_storage_t::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); - // Split points on left anr right base diagonals // - in GLOBAL coordinates assert(__wg_id + 1 < __wg_count + 1); From e9222aa0584534c80e3af583821f5cbc2e5943b9 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko 
Date: Fri, 15 Nov 2024 12:36:37 +0100 Subject: [PATCH 08/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - calculate and use cached data-size for work-group Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index ca7813dc3f7..621d84f05f3 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -429,7 +429,14 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // - in GLOBAL coordinates assert(__wg_id + 1 < __wg_count + 1); const _split_point_t<_IdType>& __sp_base_left_global = __base_diagonals_sp_global_ptr[__wg_id]; - const _split_point_t<_IdType>& __sp_base_right_global = __base_diagonals_sp_global_ptr[__wg_id + 1]; + const _split_point_t<_IdType>& __sp_base_right_global = __base_diagonals_sp_global_ptr[__wg_id + 1]; + + assert(__sp_base_right_global.first >= __sp_base_left_global.first); + assert(__sp_base_right_global.second >= __sp_base_left_global.second); + + const _IdType __wg_data_size_rng1 = __sp_base_right_global.first - __sp_base_left_global.first; + const _IdType __wg_data_size_rng2 = __sp_base_right_global.second - __sp_base_left_global.second; + auto [__local_accessor_rng1, offset_to_slm1] = __merge_slm_helper::template get_local_accessor<0>(local_accessors); auto [__local_accessor_rng2, offset_to_slm2] = __merge_slm_helper::template get_local_accessor<1>(local_accessors, (std::size_t)(__sp_base_right_global.first -__sp_base_left_global.first)); @@ -478,13 +485,11 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // Find split point in LOCAL coordinates // - bottom-right split point describes the size of current area between two base diagonals. 
- assert(__sp_base_right_global.first >= __sp_base_left_global.first); - assert(__sp_base_right_global.second >= __sp_base_left_global.second); const _split_point_t<_IdType> __sp_local = __find_start_point( __rngs_data_in_slm1, __rngs_data_in_slm2, // SLM cached copy of merging data (_IdType)(__local_idx * __chunk), // __i_elem in LOCAL coordinates because __rngs_data_in_slm1 and __rngs_data_in_slm2 is work-group SLM cached copy of source data - (_IdType)(__sp_base_right_global.first - __sp_base_left_global.first), // size of rng1 - (_IdType)(__sp_base_right_global.second - __sp_base_left_global.second), // size of rng2 + __wg_data_size_rng1, // size of rng1 + __wg_data_size_rng2, // size of rng2 __comp); // Merge data for the current diagonal @@ -495,8 +500,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __sp_local.second, // __start2 in LOCAL coordinates because __rngs_data_in_slm2 is work-group SLM cached copy of source data (_IdType)(__global_idx * __chunk), // __start3 in GLOBAL coordinates because __rng3 is not cached at all __chunk, - __sp_base_right_global.first - __sp_base_left_global.first, // size of __rngs_data_in_slm1 - __sp_base_right_global.second - __sp_base_left_global.second, // size of __rngs_data_in_slm2 + __wg_data_size_rng1, // size of __rngs_data_in_slm1 + __wg_data_size_rng2, // size of __rngs_data_in_slm2 __comp); } }); From ed1a1b20bc99b7d3171296913382279f1b71c2ee Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 15 Nov 2024 12:38:05 +0100 Subject: [PATCH 09/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - rename some local variables Signed-off-by: Sergey Kopienko --- .../pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 621d84f05f3..90837008415 100644 --- 
a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -409,7 +409,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, auto __base_diagonals_sp_global_ptr = __base_diagonals_sp_storage_t::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); const std::size_t __slm_cached_data_size = __items_in_wg_count * __chunk; - auto local_accessors = __merge_slm_helper::create_local_accessors(__cgh, __rng1, __rng2, __slm_cached_data_size); + auto loc_acc_pack = __merge_slm_helper::create_local_accessors(__cgh, __rng1, __rng2, __slm_cached_data_size); // Run nd_range parallel_for to process all the data __cgh.parallel_for<_MergeKernelName...>( @@ -437,11 +437,10 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const _IdType __wg_data_size_rng1 = __sp_base_right_global.first - __sp_base_left_global.first; const _IdType __wg_data_size_rng2 = __sp_base_right_global.second - __sp_base_left_global.second; - - auto [__local_accessor_rng1, offset_to_slm1] = __merge_slm_helper::template get_local_accessor<0>(local_accessors); - auto [__local_accessor_rng2, offset_to_slm2] = __merge_slm_helper::template get_local_accessor<1>(local_accessors, (std::size_t)(__sp_base_right_global.first -__sp_base_left_global.first)); - auto __rngs_data_in_slm1 = std::addressof(__local_accessor_rng1[0]) + offset_to_slm1; - auto __rngs_data_in_slm2 = std::addressof(__local_accessor_rng2[0]) + offset_to_slm2; + auto [__loc_acc_rng1, offset_to_slm1] = __merge_slm_helper::template get_local_accessor<0>(loc_acc_pack); + auto [__loc_acc_rng2, offset_to_slm2] = __merge_slm_helper::template get_local_accessor<1>(loc_acc_pack, __wg_data_size_rng1); + auto __rngs_data_in_slm1 = std::addressof(__loc_acc_rng1[0]) + offset_to_slm1; + auto __rngs_data_in_slm2 = std::addressof(__loc_acc_rng2[0]) + offset_to_slm2; // Full amount of work-items may be great then the amount of diagonals in the 
merge matrix // so we should skip the redundant work-items From 744bcdb64c64ba15041c63f044424d6353cd2b84 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 15 Nov 2024 12:39:45 +0100 Subject: [PATCH 10/80] @@@ include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - debug code Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 139 +++++++++++++++--- 1 file changed, 117 insertions(+), 22 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 90837008415..8c6163cfe9b 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -26,6 +26,8 @@ #include "sycl_defs.h" #include "parallel_backend_sycl_utils.h" +#define USE_DEBUG_CODE_IN_MERGE_SUBMITTER_LARGE 0 + namespace oneapi { namespace dpl @@ -274,6 +276,34 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_M template struct __parallel_merge_submitter_large; +#if USE_DEBUG_CODE_IN_MERGE_SUBMITTER_LARGE +// TODO remove debug code +template +void +load_data(std::size_t __n1, std::size_t __n2, std::size_t __wg_id, std::size_t __rng_no, std::size_t __local_idx, _RngTo& __rng_to, std::size_t __idx_to, const _RngFrom& __rng_from, std::size_t __idx_from, + _IdType __wg_data_size_rng, + _IdType __items_in_wg_count, + std::size_t __max_wi_amount_for_data_loading, + const std::size_t __loading_data_per_wi, + const _split_point_t<_IdType> __sp_base_left_global, + const _split_point_t<_IdType> __sp_base_right_global) +{ + __rng_to[__idx_to] = __rng_from[__idx_from]; +} + +// TODO remove debug code +template +void +dump_split_point(_IdType __idx, const _split_point_t<_IdType> __sp) +{ + auto first = __sp.first; + auto second = __sp.second; + + first = first; + second = second; +} +#endif + template struct __parallel_merge_submitter_large<_IdType, 
_CustomName, __internal::__optional_kernel_name<_DiagonalsKernelName...>, @@ -412,6 +442,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, auto loc_acc_pack = __merge_slm_helper::create_local_accessors(__cgh, __rng1, __rng2, __slm_cached_data_size); // Run nd_range parallel_for to process all the data + // - each work-group caching source data in SLM and processing diagonals between two base diagonals; + // - each work-item processing one diagonal. __cgh.parallel_for<_MergeKernelName...>( sycl::nd_range(__wg_count * __items_in_wg_count, __items_in_wg_count), [=](sycl::nd_item __nd_item) @@ -425,50 +457,112 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // Merge matrix base diagonal's GLOBAL index const std::size_t __wg_id = __nd_item.get_group_linear_id(); +#if USE_DEBUG_CODE_IN_MERGE_SUBMITTER_LARGE + // TODO remove debug code: dump split points + { + if (__wg_id == 0 && __local_idx == 0) + for (_IdType i = 0; i < __wg_count + 1; ++i) + dump_split_point(i, __base_diagonals_sp_global_ptr[i]); + __dpl_sycl::__group_barrier(__nd_item); + } +#endif + // Split points on left anr right base diagonals // - in GLOBAL coordinates assert(__wg_id + 1 < __wg_count + 1); const _split_point_t<_IdType>& __sp_base_left_global = __base_diagonals_sp_global_ptr[__wg_id]; - const _split_point_t<_IdType>& __sp_base_right_global = __base_diagonals_sp_global_ptr[__wg_id + 1]; + const _split_point_t<_IdType>& __sp_base_right_global = __base_diagonals_sp_global_ptr[__wg_id + 1]; assert(__sp_base_right_global.first >= __sp_base_left_global.first); assert(__sp_base_right_global.second >= __sp_base_left_global.second); const _IdType __wg_data_size_rng1 = __sp_base_right_global.first - __sp_base_left_global.first; const _IdType __wg_data_size_rng2 = __sp_base_right_global.second - __sp_base_left_global.second; - + auto [__loc_acc_rng1, offset_to_slm1] = __merge_slm_helper::template get_local_accessor<0>(loc_acc_pack); auto [__loc_acc_rng2, offset_to_slm2] = 
__merge_slm_helper::template get_local_accessor<1>(loc_acc_pack, __wg_data_size_rng1); auto __rngs_data_in_slm1 = std::addressof(__loc_acc_rng1[0]) + offset_to_slm1; auto __rngs_data_in_slm2 = std::addressof(__loc_acc_rng2[0]) + offset_to_slm2; - // Full amount of work-items may be great then the amount of diagonals in the merge matrix - // so we should skip the redundant work-items - const bool __out_of_data = __global_idx * __chunk >= __n; - if (!__out_of_data) + constexpr std::size_t __max_wi_amount_for_data_loading = 16; + + if (__local_idx < __max_wi_amount_for_data_loading) { + //////////////////////////////////////////////////////////////////////////////////////// // Load the current part of merging data placed between two base diagonals into SLM - // TODO implement cooperative data load by multiple work-items - assert(__items_in_wg_count > 1); - if (__local_idx == 0) + + // https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2023-0/shared-local-memory.html + // SLM: 64 bytes x 16 banks (granularity: 4 bytes / 32 bits) + // the goal - each WI should write into separate bank + // -> load from max 16 work-items (defined at __max_wi_amount_for_data_loading) + // -> it is necessary to ensure sequential writing to adjacent addresses of SLM memory + + //////////////////////////////////////////////////////////////////////////////////////// + // Cooperative data load from __rng1 to __rngs_data_in_slm1 + if (__wg_data_size_rng1 > 0) { - _IdType __slm_idx = 0; - for (_IdType __idx = __sp_base_left_global.first; __idx < __sp_base_right_global.first; ++__idx, ++__slm_idx) + // Calculate the size of the current part of merging data per work-item + const std::size_t __loading_data_per_wi = oneapi::dpl::__internal::__dpl_ceiling_div(__wg_data_size_rng1, std::min((std::size_t)__items_in_wg_count, __max_wi_amount_for_data_loading)); + + // Calculate the range of SLM indexes of loading data + const std::size_t __slm_idx_begin = __local_idx * 
__loading_data_per_wi; + const std::size_t __slm_idx_end = __slm_idx_begin + __loading_data_per_wi; + + for (std::size_t __slm_idx = __slm_idx_begin; __slm_idx < __slm_idx_end; ++__slm_idx) { - assert(__slm_idx < __slm_cached_data_size); - assert(__idx < __n1); - __rngs_data_in_slm1[__slm_idx] = __rng1[__idx]; + const _IdType __rng_idx = __sp_base_left_global.first + __slm_idx; + if (__rng_idx < __sp_base_right_global.first) + { + assert(__slm_idx < __wg_data_size_rng1); + assert(__rng_idx < __n1); +#if !USE_DEBUG_CODE_IN_MERGE_SUBMITTER_LARGE + __rngs_data_in_slm1[__slm_idx] = __rng1[__rng_idx]; +#else + load_data(__n1, __n2, __wg_id, 1, __local_idx, __rngs_data_in_slm1, __slm_idx, __rng1, __rng_idx, + __wg_data_size_rng2, + __items_in_wg_count, + __max_wi_amount_for_data_loading, + __loading_data_per_wi, + __sp_base_left_global, + __sp_base_right_global); +#endif + } } } - if (__local_idx == 1 && __items_in_wg_count > 1 || __local_idx == 0) + //////////////////////////////////////////////////////////////////////////////////////// + // Cooperative data load from __rng2 to __rngs_data_in_slm2 + if (__wg_data_size_rng2 > 0) { - _IdType __slm_idx = 0; - for (_IdType __idx = __sp_base_left_global.second; __idx < __sp_base_right_global.second; ++__idx, ++__slm_idx) + // __loading_data_per_wi = 3, __sp_base_left_global = (521, 247), __sp_base_right_global = (521, 260) + // -> __wg_data_size_rng2 = 260 - 247 = 13 + // -> __loading_data_per_wi = __dpl_ceiling_div(13, 6) = 3 + // Calculate the size of the current part of merging data per work-item + const std::size_t __loading_data_per_wi = oneapi::dpl::__internal::__dpl_ceiling_div(__wg_data_size_rng2, std::min((std::size_t)__items_in_wg_count, __max_wi_amount_for_data_loading)); + + // Calculate the range of SLM indexes of loading data + const std::size_t __slm_idx_begin = __local_idx * __loading_data_per_wi; + const std::size_t __slm_idx_end = __slm_idx_begin + __loading_data_per_wi; + + for (std::size_t __slm_idx = 
__slm_idx_begin; __slm_idx < __slm_idx_end; ++__slm_idx) { - assert(__slm_idx < __slm_cached_data_size); - assert(__idx < __n2); - __rngs_data_in_slm2[__slm_idx] = __rng2[__idx]; + const _IdType __rng_idx = __sp_base_left_global.second + __slm_idx; + if (__rng_idx < __sp_base_right_global.second) + { + assert(__slm_idx < __wg_data_size_rng2); + assert(__rng_idx < __n2); +#if !USE_DEBUG_CODE_IN_MERGE_SUBMITTER_LARGE + __rngs_data_in_slm2[__slm_idx] = __rng2[__rng_idx]; +#else + load_data(__n1, __n2, __wg_id, 2, __local_idx, __rngs_data_in_slm2, __slm_idx, __rng2, __rng_idx, + __wg_data_size_rng2, + __items_in_wg_count, + __max_wi_amount_for_data_loading, + __loading_data_per_wi, + __sp_base_left_global, + __sp_base_right_global); +#endif + } } } } @@ -477,7 +571,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // - we shouldn't setup this barrier under any conditions!!! __dpl_sycl::__group_barrier(__nd_item); - if (!__out_of_data) + // Current diagonal inside of the merge matrix? + if (__global_idx * __chunk < __n) { // We are between two base diagonals and need to find the start points in the merge matrix area, // limited by split points of the left and right base diagonals. 
@@ -526,7 +621,7 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; const auto __n = __rng1.size() + __rng2.size(); - if (__n < 4 * 1'048'576) + if (false) //if (__n < 4 * 1'048'576) { if (__n <= std::numeric_limits::max()) { From 2239811b71eb2dd8b72625a4ffa5943d1792f601 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 15 Nov 2024 22:04:10 +0100 Subject: [PATCH 11/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix review comment: let's use __parallel_merge_submitter with std::uint32_t data type only Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 41 ++++++++----------- 1 file changed, 16 insertions(+), 25 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 8c6163cfe9b..fb4e83f48e0 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -384,12 +384,12 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, assert(__max_source_data_items_fit_into_slm % __chunk == 0); // The amount of items in the each work-group is the amount of diagonals processing between two work-groups + 1 (for the left base diagonal in work-group) - const _IdType __items_in_wg_count = __max_source_data_items_fit_into_slm / __chunk; + const std::size_t __items_in_wg_count = __max_source_data_items_fit_into_slm / __chunk; assert(__items_in_wg_count > 0); // The amount of the base diagonals is the amount of the work-groups // - also it's the distance between two base diagonals is equal to the amount of work-items in each work-group - const _IdType __wg_count = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __max_source_data_items_fit_into_slm); + const std::size_t __wg_count = 
oneapi::dpl::__internal::__dpl_ceiling_div(__n, __max_source_data_items_fit_into_slm); // Create storage for save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) // - in GLOBAL coordinates @@ -461,7 +461,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // TODO remove debug code: dump split points { if (__wg_id == 0 && __local_idx == 0) - for (_IdType i = 0; i < __wg_count + 1; ++i) + for (std::size_t i = 0; i < __wg_count + 1; ++i) dump_split_point(i, __base_diagonals_sp_global_ptr[i]); __dpl_sycl::__group_barrier(__nd_item); } @@ -510,7 +510,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, for (std::size_t __slm_idx = __slm_idx_begin; __slm_idx < __slm_idx_end; ++__slm_idx) { - const _IdType __rng_idx = __sp_base_left_global.first + __slm_idx; + const std::size_t __rng_idx = __sp_base_left_global.first + __slm_idx; if (__rng_idx < __sp_base_right_global.first) { assert(__slm_idx < __wg_data_size_rng1); @@ -546,7 +546,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, for (std::size_t __slm_idx = __slm_idx_begin; __slm_idx < __slm_idx_end; ++__slm_idx) { - const _IdType __rng_idx = __sp_base_left_global.second + __slm_idx; + const std::size_t __rng_idx = __sp_base_left_global.second + __slm_idx; if (__rng_idx < __sp_base_right_global.second) { assert(__slm_idx < __wg_data_size_rng2); @@ -620,27 +620,18 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy { using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; - const auto __n = __rng1.size() + __rng2.size(); - if (false) //if (__n < 4 * 1'048'576) + constexpr std::size_t __starting_size_limit_for_large_submitter = 4 * 1'048'576; // 4 Mb + + const std::size_t __n = __rng1.size() + __rng2.size(); + if (__n < __starting_size_limit_for_large_submitter) { - if (__n <= std::numeric_limits::max()) - { - using _WiIndex = std::uint32_t; - using _MergeKernelName = 
oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); - } - else - { - using _WiIndex = std::uint64_t; - using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); - } + static_assert(__starting_size_limit_for_large_submitter < std::numeric_limits::max()); + + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name<_CustomName>>; + return __parallel_merge_submitter()( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); } else { From 4039c85168fd463c5ec71e06c7239c6a603df26a Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 15 Nov 2024 13:17:05 +0100 Subject: [PATCH 12/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - load source data into SLM by all available work-items in the group Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 107 +++++++----------- 1 file changed, 43 insertions(+), 64 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index fb4e83f48e0..a99f99d4fa2 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -283,7 +283,6 @@ void load_data(std::size_t __n1, std::size_t 
__n2, std::size_t __wg_id, std::size_t __rng_no, std::size_t __local_idx, _RngTo& __rng_to, std::size_t __idx_to, const _RngFrom& __rng_from, std::size_t __idx_from, _IdType __wg_data_size_rng, _IdType __items_in_wg_count, - std::size_t __max_wi_amount_for_data_loading, const std::size_t __loading_data_per_wi, const _split_point_t<_IdType> __sp_base_left_global, const _split_point_t<_IdType> __sp_base_right_global) @@ -484,85 +483,65 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, auto __rngs_data_in_slm1 = std::addressof(__loc_acc_rng1[0]) + offset_to_slm1; auto __rngs_data_in_slm2 = std::addressof(__loc_acc_rng2[0]) + offset_to_slm2; - constexpr std::size_t __max_wi_amount_for_data_loading = 16; - - if (__local_idx < __max_wi_amount_for_data_loading) + //////////////////////////////////////////////////////////////////////////////////////// + // Cooperative data load from __rng1 to __rngs_data_in_slm1 + if (__wg_data_size_rng1 > 0) { - //////////////////////////////////////////////////////////////////////////////////////// - // Load the current part of merging data placed between two base diagonals into SLM - - // https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2023-0/shared-local-memory.html - // SLM: 64 bytes x 16 banks (granularity: 4 bytes / 32 bits) - // the goal - each WI should write into separate bank - // -> load from max 16 work-items (defined at __max_wi_amount_for_data_loading) - // -> it is necessary to ensure sequential writing to adjacent addresses of SLM memory - - //////////////////////////////////////////////////////////////////////////////////////// - // Cooperative data load from __rng1 to __rngs_data_in_slm1 - if (__wg_data_size_rng1 > 0) - { - // Calculate the size of the current part of merging data per work-item - const std::size_t __loading_data_per_wi = oneapi::dpl::__internal::__dpl_ceiling_div(__wg_data_size_rng1, std::min((std::size_t)__items_in_wg_count, __max_wi_amount_for_data_loading)); + 
// Calculate the size of the current part of merging data per work-item + const std::size_t __loading_data_per_wi = oneapi::dpl::__internal::__dpl_ceiling_div(__wg_data_size_rng1, __items_in_wg_count); - // Calculate the range of SLM indexes of loading data - const std::size_t __slm_idx_begin = __local_idx * __loading_data_per_wi; - const std::size_t __slm_idx_end = __slm_idx_begin + __loading_data_per_wi; + // Calculate the range of SLM indexes of loading data + const std::size_t __slm_idx_begin = __local_idx * __loading_data_per_wi; + const std::size_t __slm_idx_end = __slm_idx_begin + __loading_data_per_wi; - for (std::size_t __slm_idx = __slm_idx_begin; __slm_idx < __slm_idx_end; ++__slm_idx) + for (std::size_t __slm_idx = __slm_idx_begin; __slm_idx < __slm_idx_end; ++__slm_idx) + { + const std::size_t __rng_idx = __sp_base_left_global.first + __slm_idx; + if (__rng_idx < __sp_base_right_global.first) { - const std::size_t __rng_idx = __sp_base_left_global.first + __slm_idx; - if (__rng_idx < __sp_base_right_global.first) - { - assert(__slm_idx < __wg_data_size_rng1); - assert(__rng_idx < __n1); + assert(__slm_idx < __wg_data_size_rng1); + assert(__rng_idx < __n1); #if !USE_DEBUG_CODE_IN_MERGE_SUBMITTER_LARGE - __rngs_data_in_slm1[__slm_idx] = __rng1[__rng_idx]; + __rngs_data_in_slm1[__slm_idx] = __rng1[__rng_idx]; #else - load_data(__n1, __n2, __wg_id, 1, __local_idx, __rngs_data_in_slm1, __slm_idx, __rng1, __rng_idx, - __wg_data_size_rng2, - __items_in_wg_count, - __max_wi_amount_for_data_loading, - __loading_data_per_wi, - __sp_base_left_global, - __sp_base_right_global); + load_data(__n1, __n2, __wg_id, 1, __local_idx, __rngs_data_in_slm1, __slm_idx, __rng1, __rng_idx, + __wg_data_size_rng2, + __items_in_wg_count, + __loading_data_per_wi, + __sp_base_left_global, + __sp_base_right_global); #endif - } } } + } - //////////////////////////////////////////////////////////////////////////////////////// - // Cooperative data load from __rng2 to 
__rngs_data_in_slm2 - if (__wg_data_size_rng2 > 0) - { - // __loading_data_per_wi = 3, __sp_base_left_global = (521, 247), __sp_base_right_global = (521, 260) - // -> __wg_data_size_rng2 = 260 - 247 = 13 - // -> __loading_data_per_wi = __dpl_ceiling_div(13, 6) = 3 - // Calculate the size of the current part of merging data per work-item - const std::size_t __loading_data_per_wi = oneapi::dpl::__internal::__dpl_ceiling_div(__wg_data_size_rng2, std::min((std::size_t)__items_in_wg_count, __max_wi_amount_for_data_loading)); + //////////////////////////////////////////////////////////////////////////////////////// + // Cooperative data load from __rng2 to __rngs_data_in_slm2 + if (__wg_data_size_rng2 > 0) + { + const std::size_t __loading_data_per_wi = oneapi::dpl::__internal::__dpl_ceiling_div(__wg_data_size_rng2, __items_in_wg_count); - // Calculate the range of SLM indexes of loading data - const std::size_t __slm_idx_begin = __local_idx * __loading_data_per_wi; - const std::size_t __slm_idx_end = __slm_idx_begin + __loading_data_per_wi; + // Calculate the range of SLM indexes of loading data + const std::size_t __slm_idx_begin = __local_idx * __loading_data_per_wi; + const std::size_t __slm_idx_end = __slm_idx_begin + __loading_data_per_wi; - for (std::size_t __slm_idx = __slm_idx_begin; __slm_idx < __slm_idx_end; ++__slm_idx) + for (std::size_t __slm_idx = __slm_idx_begin; __slm_idx < __slm_idx_end; ++__slm_idx) + { + const std::size_t __rng_idx = __sp_base_left_global.second + __slm_idx; + if (__rng_idx < __sp_base_right_global.second) { - const std::size_t __rng_idx = __sp_base_left_global.second + __slm_idx; - if (__rng_idx < __sp_base_right_global.second) - { - assert(__slm_idx < __wg_data_size_rng2); - assert(__rng_idx < __n2); + assert(__slm_idx < __wg_data_size_rng2); + assert(__rng_idx < __n2); #if !USE_DEBUG_CODE_IN_MERGE_SUBMITTER_LARGE - __rngs_data_in_slm2[__slm_idx] = __rng2[__rng_idx]; + __rngs_data_in_slm2[__slm_idx] = __rng2[__rng_idx]; #else - 
load_data(__n1, __n2, __wg_id, 2, __local_idx, __rngs_data_in_slm2, __slm_idx, __rng2, __rng_idx, - __wg_data_size_rng2, - __items_in_wg_count, - __max_wi_amount_for_data_loading, - __loading_data_per_wi, - __sp_base_left_global, - __sp_base_right_global); + load_data(__n1, __n2, __wg_id, 2, __local_idx, __rngs_data_in_slm2, __slm_idx, __rng2, __rng_idx, + __wg_data_size_rng2, + __items_in_wg_count, + __loading_data_per_wi, + __sp_base_left_global, + __sp_base_right_global); #endif - } } } } From 50f04459f2afc99c7f66acc03a7df4eca95c13ee Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 15 Nov 2024 14:57:27 +0100 Subject: [PATCH 13/80] extract function load_data_into_slm to load source data into SLM --- .../dpcpp/parallel_backend_sycl_merge.h | 112 ++++++++---------- 1 file changed, 48 insertions(+), 64 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index a99f99d4fa2..dff041bbd74 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -181,7 +181,7 @@ __find_start_point(const _Rng1& __rng1, const _Rng2& __rng2, const _Index __i_el return __result; } - return std::make_pair(0, 0); + return __zero_split_point<_Index>; } // Do serial merge of the data from rng1 (starting from start1) and rng2 (starting from start2) and writing @@ -355,6 +355,50 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, } }; + template + static void + load_data_into_slm(_Range&& __rng, _DataType* __slm, + std::size_t __sp_base_left_global_from, std::size_t __sp_base_left_global_to, + std::size_t __items_in_wg_count, std::size_t __local_idx) + { + const std::size_t __wg_data_size_rng = __sp_base_left_global_to - __sp_base_left_global_from; + if (__wg_data_size_rng > 0) + { + // Calculate the size of the current part of merging data per work-item + const 
std::size_t __loading_data_per_wi = oneapi::dpl::__internal::__dpl_ceiling_div(__rng.size(), __items_in_wg_count); + assert(__loading_data_per_wi > 0); + + if (__loading_data_per_wi > 1) + { + const auto __slm_idx_begin = __local_idx * __loading_data_per_wi; + const auto __slm_idx_end = __slm_idx_begin + __loading_data_per_wi; + + for (std::size_t __slm_idx = __slm_idx_begin; __slm_idx < __slm_idx_end; ++__slm_idx) + { + const std::size_t __rng_idx = __sp_base_left_global_from + __slm_idx; + if (__rng_idx < __sp_base_left_global_to) + { + assert(__slm_idx < __wg_data_size_rng); + assert(__rng_idx < __rng.size()); + __slm[__slm_idx] = __rng[__rng_idx]; + } + } + } + else + { + assert(__loading_data_per_wi == 1); + + const std::size_t __rng_idx = __sp_base_left_global_from + __local_idx; + if (__rng_idx < __sp_base_left_global_to) + { + assert(__local_idx < __wg_data_size_rng); + assert(__rng_idx < __rng.size()); + __slm[__local_idx] = __rng[__rng_idx]; + } + } + } + } + template auto operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const @@ -483,71 +527,11 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, auto __rngs_data_in_slm1 = std::addressof(__loc_acc_rng1[0]) + offset_to_slm1; auto __rngs_data_in_slm2 = std::addressof(__loc_acc_rng2[0]) + offset_to_slm2; - //////////////////////////////////////////////////////////////////////////////////////// - // Cooperative data load from __rng1 to __rngs_data_in_slm1 - if (__wg_data_size_rng1 > 0) - { - // Calculate the size of the current part of merging data per work-item - const std::size_t __loading_data_per_wi = oneapi::dpl::__internal::__dpl_ceiling_div(__wg_data_size_rng1, __items_in_wg_count); - - // Calculate the range of SLM indexes of loading data - const std::size_t __slm_idx_begin = __local_idx * __loading_data_per_wi; - const std::size_t __slm_idx_end = __slm_idx_begin + __loading_data_per_wi; - - for (std::size_t __slm_idx = 
__slm_idx_begin; __slm_idx < __slm_idx_end; ++__slm_idx) - { - const std::size_t __rng_idx = __sp_base_left_global.first + __slm_idx; - if (__rng_idx < __sp_base_right_global.first) - { - assert(__slm_idx < __wg_data_size_rng1); - assert(__rng_idx < __n1); -#if !USE_DEBUG_CODE_IN_MERGE_SUBMITTER_LARGE - __rngs_data_in_slm1[__slm_idx] = __rng1[__rng_idx]; -#else - load_data(__n1, __n2, __wg_id, 1, __local_idx, __rngs_data_in_slm1, __slm_idx, __rng1, __rng_idx, - __wg_data_size_rng2, - __items_in_wg_count, - __loading_data_per_wi, - __sp_base_left_global, - __sp_base_right_global); -#endif - } - } - } - - //////////////////////////////////////////////////////////////////////////////////////// - // Cooperative data load from __rng2 to __rngs_data_in_slm2 - if (__wg_data_size_rng2 > 0) - { - const std::size_t __loading_data_per_wi = oneapi::dpl::__internal::__dpl_ceiling_div(__wg_data_size_rng2, __items_in_wg_count); - - // Calculate the range of SLM indexes of loading data - const std::size_t __slm_idx_begin = __local_idx * __loading_data_per_wi; - const std::size_t __slm_idx_end = __slm_idx_begin + __loading_data_per_wi; - - for (std::size_t __slm_idx = __slm_idx_begin; __slm_idx < __slm_idx_end; ++__slm_idx) - { - const std::size_t __rng_idx = __sp_base_left_global.second + __slm_idx; - if (__rng_idx < __sp_base_right_global.second) - { - assert(__slm_idx < __wg_data_size_rng2); - assert(__rng_idx < __n2); -#if !USE_DEBUG_CODE_IN_MERGE_SUBMITTER_LARGE - __rngs_data_in_slm2[__slm_idx] = __rng2[__rng_idx]; -#else - load_data(__n1, __n2, __wg_id, 2, __local_idx, __rngs_data_in_slm2, __slm_idx, __rng2, __rng_idx, - __wg_data_size_rng2, - __items_in_wg_count, - __loading_data_per_wi, - __sp_base_left_global, - __sp_base_right_global); -#endif - } - } - } + // Cooperative data load from __rng1 to __rngs_data_in_slm1, from __rng2 to __rngs_data_in_slm2 + load_data_into_slm(__rng1, __rngs_data_in_slm1, __sp_base_left_global.first, __sp_base_right_global.first, 
__items_in_wg_count, __local_idx); + load_data_into_slm(__rng2, __rngs_data_in_slm2, __sp_base_left_global.second, __sp_base_right_global.second, __items_in_wg_count, __local_idx); // Wait until all the data is loaded - // - we shouldn't setup this barrier under any conditions!!! __dpl_sycl::__group_barrier(__nd_item); // Current diagonal inside of the merge matrix? From 3a0a8625926e48a52ba02e54b07316da10db854b Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 15 Nov 2024 15:35:29 +0100 Subject: [PATCH 14/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - remove debug code Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 39 ------------------- 1 file changed, 39 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index dff041bbd74..fdeac2846e4 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -26,8 +26,6 @@ #include "sycl_defs.h" #include "parallel_backend_sycl_utils.h" -#define USE_DEBUG_CODE_IN_MERGE_SUBMITTER_LARGE 0 - namespace oneapi { namespace dpl @@ -276,33 +274,6 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_M template struct __parallel_merge_submitter_large; -#if USE_DEBUG_CODE_IN_MERGE_SUBMITTER_LARGE -// TODO remove debug code -template -void -load_data(std::size_t __n1, std::size_t __n2, std::size_t __wg_id, std::size_t __rng_no, std::size_t __local_idx, _RngTo& __rng_to, std::size_t __idx_to, const _RngFrom& __rng_from, std::size_t __idx_from, - _IdType __wg_data_size_rng, - _IdType __items_in_wg_count, - const std::size_t __loading_data_per_wi, - const _split_point_t<_IdType> __sp_base_left_global, - const _split_point_t<_IdType> __sp_base_right_global) -{ - __rng_to[__idx_to] = __rng_from[__idx_from]; -} - -// TODO remove debug code -template 
-void -dump_split_point(_IdType __idx, const _split_point_t<_IdType> __sp) -{ - auto first = __sp.first; - auto second = __sp.second; - - first = first; - second = second; -} -#endif - template struct __parallel_merge_submitter_large<_IdType, _CustomName, __internal::__optional_kernel_name<_DiagonalsKernelName...>, @@ -500,16 +471,6 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // Merge matrix base diagonal's GLOBAL index const std::size_t __wg_id = __nd_item.get_group_linear_id(); -#if USE_DEBUG_CODE_IN_MERGE_SUBMITTER_LARGE - // TODO remove debug code: dump split points - { - if (__wg_id == 0 && __local_idx == 0) - for (std::size_t i = 0; i < __wg_count + 1; ++i) - dump_split_point(i, __base_diagonals_sp_global_ptr[i]); - __dpl_sycl::__group_barrier(__nd_item); - } -#endif - // Split points on left anr right base diagonals // - in GLOBAL coordinates assert(__wg_id + 1 < __wg_count + 1); From ab38d96d687bf7b3ed6d357bbc9034bf9660427a Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 15 Nov 2024 15:38:34 +0100 Subject: [PATCH 15/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - rename some variables Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 47 +++++++++---------- 1 file changed, 21 insertions(+), 26 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index fdeac2846e4..baf701bffaa 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -330,7 +330,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, static void load_data_into_slm(_Range&& __rng, _DataType* __slm, std::size_t __sp_base_left_global_from, std::size_t __sp_base_left_global_to, - std::size_t __items_in_wg_count, std::size_t __local_idx) + std::size_t __items_in_wg_count, std::size_t __local_id) { 
const std::size_t __wg_data_size_rng = __sp_base_left_global_to - __sp_base_left_global_from; if (__wg_data_size_rng > 0) @@ -341,7 +341,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, if (__loading_data_per_wi > 1) { - const auto __slm_idx_begin = __local_idx * __loading_data_per_wi; + const auto __slm_idx_begin = __local_id * __loading_data_per_wi; const auto __slm_idx_end = __slm_idx_begin + __loading_data_per_wi; for (std::size_t __slm_idx = __slm_idx_begin; __slm_idx < __slm_idx_end; ++__slm_idx) @@ -359,12 +359,12 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, { assert(__loading_data_per_wi == 1); - const std::size_t __rng_idx = __sp_base_left_global_from + __local_idx; + const std::size_t __rng_idx = __sp_base_left_global_from + __local_id; if (__rng_idx < __sp_base_left_global_to) { - assert(__local_idx < __wg_data_size_rng); + assert(__local_id < __wg_data_size_rng); assert(__rng_idx < __rng.size()); - __slm[__local_idx] = __rng[__rng_idx]; + __slm[__local_id] = __rng[__rng_idx]; } } } @@ -420,24 +420,24 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __cgh.parallel_for<_DiagonalsKernelName...>( sycl::range(__wg_count + 1), [=](sycl::item __item_id) { - const std::size_t __global_idx = __item_id.get_linear_id(); + const std::size_t __linear_id = __item_id.get_linear_id(); _split_point_t<_IdType>* __base_diagonals_sp_global_ptr = __base_diagonals_sp_storage_t::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); // Save top-left split point for first/last base diagonals of merge matrix // - in GLOBAL coordinates - _split_point_t<_IdType> __sp(__global_idx == 0 ? __zero_split_point<_IdType> : _split_point_t<_IdType>{__n1, __n2}); + _split_point_t<_IdType> __sp(__linear_id == 0 ? 
__zero_split_point : _split_point_t{__n1, __n2}); - if (0 < __global_idx && __global_idx < __wg_count) + if (0 < __linear_id && __linear_id < __wg_count) { - const _IdType __i_elem = __global_idx * __items_in_wg_count * __chunk; + const _IdType __i_elem = __linear_id * __items_in_wg_count * __chunk; // Save bottom-right split point for current base diagonal of merge matrix // - in GLOBAL coordinates __sp = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); } - __base_diagonals_sp_global_ptr[__global_idx] = __sp; + __base_diagonals_sp_global_ptr[__linear_id] = __sp; }); }); @@ -462,20 +462,15 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, sycl::nd_range(__wg_count * __items_in_wg_count, __items_in_wg_count), [=](sycl::nd_item __nd_item) { - // Merge matrix diagonal's GLOBAL index - const std::size_t __global_idx = __nd_item.get_global_linear_id(); - - // Merge sub-matrix LOCAL diagonal's index - const std::size_t __local_idx = __nd_item.get_local_id(0); - - // Merge matrix base diagonal's GLOBAL index - const std::size_t __wg_id = __nd_item.get_group_linear_id(); + const std::size_t __global_linear_id = __nd_item.get_global_linear_id(); // Merge matrix diagonal's GLOBAL index + const std::size_t __local_id = __nd_item.get_local_id(0); // Merge sub-matrix LOCAL diagonal's index + const std::size_t __group_linear_id = __nd_item.get_group_linear_id(); // Merge matrix base diagonal's GLOBAL index // Split points on left anr right base diagonals // - in GLOBAL coordinates - assert(__wg_id + 1 < __wg_count + 1); - const _split_point_t<_IdType>& __sp_base_left_global = __base_diagonals_sp_global_ptr[__wg_id]; - const _split_point_t<_IdType>& __sp_base_right_global = __base_diagonals_sp_global_ptr[__wg_id + 1]; + assert(__group_linear_id + 1 < __wg_count + 1); + const _split_point_t& __sp_base_left_global = __base_diagonals_sp_global_ptr[__group_linear_id]; + const _split_point_t& __sp_base_right_global = 
__base_diagonals_sp_global_ptr[__group_linear_id + 1]; assert(__sp_base_right_global.first >= __sp_base_left_global.first); assert(__sp_base_right_global.second >= __sp_base_left_global.second); @@ -489,14 +484,14 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, auto __rngs_data_in_slm2 = std::addressof(__loc_acc_rng2[0]) + offset_to_slm2; // Cooperative data load from __rng1 to __rngs_data_in_slm1, from __rng2 to __rngs_data_in_slm2 - load_data_into_slm(__rng1, __rngs_data_in_slm1, __sp_base_left_global.first, __sp_base_right_global.first, __items_in_wg_count, __local_idx); - load_data_into_slm(__rng2, __rngs_data_in_slm2, __sp_base_left_global.second, __sp_base_right_global.second, __items_in_wg_count, __local_idx); + load_data_into_slm(__rng1, __rngs_data_in_slm1, __sp_base_left_global.first, __sp_base_right_global.first, __items_in_wg_count, __local_id); + load_data_into_slm(__rng2, __rngs_data_in_slm2, __sp_base_left_global.second, __sp_base_right_global.second, __items_in_wg_count, __local_id); // Wait until all the data is loaded __dpl_sycl::__group_barrier(__nd_item); // Current diagonal inside of the merge matrix? - if (__global_idx * __chunk < __n) + if (__global_linear_id * __chunk < __n) { // We are between two base diagonals and need to find the start points in the merge matrix area, // limited by split points of the left and right base diagonals. @@ -505,7 +500,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // - bottom-right split point describes the size of current area between two base diagonals. 
const _split_point_t<_IdType> __sp_local = __find_start_point( __rngs_data_in_slm1, __rngs_data_in_slm2, // SLM cached copy of merging data - (_IdType)(__local_idx * __chunk), // __i_elem in LOCAL coordinates because __rngs_data_in_slm1 and __rngs_data_in_slm2 is work-group SLM cached copy of source data + (_IdType)(__local_id * __chunk), // __i_elem in LOCAL coordinates because __rngs_data_in_slm1 and __rngs_data_in_slm2 is work-group SLM cached copy of source data __wg_data_size_rng1, // size of rng1 __wg_data_size_rng2, // size of rng2 __comp); @@ -516,7 +511,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __rng3, // Destination range __sp_local.first, // __start1 in LOCAL coordinates because __rngs_data_in_slm1 is work-group SLM cached copy of source data __sp_local.second, // __start2 in LOCAL coordinates because __rngs_data_in_slm2 is work-group SLM cached copy of source data - (_IdType)(__global_idx * __chunk), // __start3 in GLOBAL coordinates because __rng3 is not cached at all + (_IdType)(__global_linear_id * __chunk), // __start3 in GLOBAL coordinates because __rng3 is not cached at all __chunk, __wg_data_size_rng1, // size of __rngs_data_in_slm1 __wg_data_size_rng2, // size of __rngs_data_in_slm2 From cbbfb06e4c79421b3b52a60e67be61b120ddb330 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 15 Nov 2024 15:39:44 +0100 Subject: [PATCH 16/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - removed redundand comment Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index baf701bffaa..cfd3f54d4da 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -493,9 +493,6 @@ struct 
__parallel_merge_submitter_large<_IdType, _CustomName, // Current diagonal inside of the merge matrix? if (__global_linear_id * __chunk < __n) { - // We are between two base diagonals and need to find the start points in the merge matrix area, - // limited by split points of the left and right base diagonals. - // Find split point in LOCAL coordinates // - bottom-right split point describes the size of current area between two base diagonals. const _split_point_t<_IdType> __sp_local = __find_start_point( From f73cf27c84b754252d2b50428c3b4717f8e43ad0 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 15 Nov 2024 15:40:33 +0100 Subject: [PATCH 17/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - removed redundand assert Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index cfd3f54d4da..e7ac839bb76 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -468,7 +468,6 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // Split points on left anr right base diagonals // - in GLOBAL coordinates - assert(__group_linear_id + 1 < __wg_count + 1); const _split_point_t& __sp_base_left_global = __base_diagonals_sp_global_ptr[__group_linear_id]; const _split_point_t& __sp_base_right_global = __base_diagonals_sp_global_ptr[__group_linear_id + 1]; From 75f6e4adbc79a7690d1b4bae4a58890eba6d2ac2 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 15 Nov 2024 15:43:15 +0100 Subject: [PATCH 18/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix unused variable Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 9 +-------- 1 file changed, 1 
insertion(+), 8 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index e7ac839bb76..8e7387ec442 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -427,15 +427,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // Save top-left split point for first/last base diagonals of merge matrix // - in GLOBAL coordinates _split_point_t<_IdType> __sp(__linear_id == 0 ? __zero_split_point : _split_point_t{__n1, __n2}); - if (0 < __linear_id && __linear_id < __wg_count) - { - const _IdType __i_elem = __linear_id * __items_in_wg_count * __chunk; - - // Save bottom-right split point for current base diagonal of merge matrix - // - in GLOBAL coordinates - __sp = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); - } + __sp = __find_start_point(__rng1, __rng2, (_IdType)(__linear_id * __items_in_wg_count * __chunk), __n1, __n2, __comp); __base_diagonals_sp_global_ptr[__linear_id] = __sp; }); From 15f52916ffad1043e5eda8ea8cde1bf80c67a110 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 15 Nov 2024 15:52:42 +0100 Subject: [PATCH 19/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - rename some variables Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 38 +++++++++---------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 8e7387ec442..970f28f8584 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -467,17 +467,17 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, assert(__sp_base_right_global.first >= 
__sp_base_left_global.first); assert(__sp_base_right_global.second >= __sp_base_left_global.second); - const _IdType __wg_data_size_rng1 = __sp_base_right_global.first - __sp_base_left_global.first; - const _IdType __wg_data_size_rng2 = __sp_base_right_global.second - __sp_base_left_global.second; + const _IdType __rng1_wg_data_size = __sp_base_right_global.first - __sp_base_left_global.first; + const _IdType __rng2_wg_data_size = __sp_base_right_global.second - __sp_base_left_global.second; - auto [__loc_acc_rng1, offset_to_slm1] = __merge_slm_helper::template get_local_accessor<0>(loc_acc_pack); - auto [__loc_acc_rng2, offset_to_slm2] = __merge_slm_helper::template get_local_accessor<1>(loc_acc_pack, __wg_data_size_rng1); - auto __rngs_data_in_slm1 = std::addressof(__loc_acc_rng1[0]) + offset_to_slm1; - auto __rngs_data_in_slm2 = std::addressof(__loc_acc_rng2[0]) + offset_to_slm2; + auto [__rng1_loc_acc, offset_to_slm1] = __merge_slm_helper::template get_local_accessor<0>(loc_acc_pack); + auto [__rng2_loc_acc, offset_to_slm2] = __merge_slm_helper::template get_local_accessor<1>(loc_acc_pack, __rng1_wg_data_size); + auto __rng1_cache_slm = std::addressof(__rng1_loc_acc[0]) + offset_to_slm1; + auto __rng2_cache_slm = std::addressof(__rng2_loc_acc[0]) + offset_to_slm2; - // Cooperative data load from __rng1 to __rngs_data_in_slm1, from __rng2 to __rngs_data_in_slm2 - load_data_into_slm(__rng1, __rngs_data_in_slm1, __sp_base_left_global.first, __sp_base_right_global.first, __items_in_wg_count, __local_id); - load_data_into_slm(__rng2, __rngs_data_in_slm2, __sp_base_left_global.second, __sp_base_right_global.second, __items_in_wg_count, __local_id); + // Cooperative data load from __rng1 to __rng1_cache_slm, from __rng2 to __rng1_cache_slm + load_data_into_slm(__rng1, __rng1_cache_slm, __sp_base_left_global.first, __sp_base_right_global.first, __items_in_wg_count, __local_id); + load_data_into_slm(__rng2, __rng2_cache_slm, __sp_base_left_global.second, 
__sp_base_right_global.second, __items_in_wg_count, __local_id); // Wait until all the data is loaded __dpl_sycl::__group_barrier(__nd_item); @@ -488,22 +488,20 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // Find split point in LOCAL coordinates // - bottom-right split point describes the size of current area between two base diagonals. const _split_point_t<_IdType> __sp_local = __find_start_point( - __rngs_data_in_slm1, __rngs_data_in_slm2, // SLM cached copy of merging data - (_IdType)(__local_id * __chunk), // __i_elem in LOCAL coordinates because __rngs_data_in_slm1 and __rngs_data_in_slm2 is work-group SLM cached copy of source data - __wg_data_size_rng1, // size of rng1 - __wg_data_size_rng2, // size of rng2 + __rng1_cache_slm, __rng2_cache_slm, // SLM cached copy of merging data + (_IdType)(__local_id * __chunk), // __i_elem in LOCAL coordinates because __rng1_cache_slm and __rng1_cache_slm is work-group SLM cached copy of source data + __rng1_wg_data_size, __rng2_wg_data_size, // size of rng1 and rng2 __comp); // Merge data for the current diagonal // - we should have here __sp_global in GLOBAL coordinates - __serial_merge(__rngs_data_in_slm1, __rngs_data_in_slm2, // SLM cached copy of merging data - __rng3, // Destination range - __sp_local.first, // __start1 in LOCAL coordinates because __rngs_data_in_slm1 is work-group SLM cached copy of source data - __sp_local.second, // __start2 in LOCAL coordinates because __rngs_data_in_slm2 is work-group SLM cached copy of source data - (_IdType)(__global_linear_id * __chunk), // __start3 in GLOBAL coordinates because __rng3 is not cached at all + __serial_merge(__rng1_cache_slm, __rng2_cache_slm, // SLM cached copy of merging data + __rng3, // Destination range + __sp_local.first, // __start1 in LOCAL coordinates because __rng1_cache_slm is work-group SLM cached copy of source data + __sp_local.second, // __start2 in LOCAL coordinates because __rng1_cache_slm is work-group SLM cached copy of 
source data + (_IdType)(__global_linear_id * __chunk), // __start3 in GLOBAL coordinates because __rng3 is not cached at all __chunk, - __wg_data_size_rng1, // size of __rngs_data_in_slm1 - __wg_data_size_rng2, // size of __rngs_data_in_slm2 + __rng1_wg_data_size, __rng2_wg_data_size, // size of rng1 and rng2 __comp); } }); From 68f3d251ee4e223bf9510c39e599f5ff6402587a Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 15 Nov 2024 18:29:51 +0100 Subject: [PATCH 20/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - declare load_data_into_slm as inline Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 970f28f8584..8910aec321d 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -327,7 +327,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, }; template - static void + inline static void load_data_into_slm(_Range&& __rng, _DataType* __slm, std::size_t __sp_base_left_global_from, std::size_t __sp_base_left_global_to, std::size_t __items_in_wg_count, std::size_t __local_id) From 07d7143b579edf0a5558cb22ff402e7dce7b6274 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 15 Nov 2024 18:39:32 +0100 Subject: [PATCH 21/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - removed redundand assert Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 8910aec321d..49e0df27fcb 100644 --- 
a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -357,8 +357,6 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, } else { - assert(__loading_data_per_wi == 1); - const std::size_t __rng_idx = __sp_base_left_global_from + __local_id; if (__rng_idx < __sp_base_left_global_to) { From 6c852bfbdfa021f960a7d5b9f810fc7eced25697 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 15 Nov 2024 22:43:38 +0100 Subject: [PATCH 22/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - additional comments for load_data_into_slm Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 58 ++++++++++++++----- 1 file changed, 43 insertions(+), 15 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 49e0df27fcb..0108ad232f2 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -329,14 +329,42 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, template inline static void load_data_into_slm(_Range&& __rng, _DataType* __slm, - std::size_t __sp_base_left_global_from, std::size_t __sp_base_left_global_to, - std::size_t __items_in_wg_count, std::size_t __local_id) + std::size_t __idx_global_begin, std::size_t __idx_global_end, + std::size_t __wi_in_one_wg, std::size_t __local_id) { - const std::size_t __wg_data_size_rng = __sp_base_left_global_to - __sp_base_left_global_from; + // How we load data: + /* + +-------------------+--------------------------------------------------------------+------------+ + | Source data index | Work-items in one work-group | SLM index | + +-------------------+--------------------------------------------------------------+------------+ + | | wi(0) | wi(1) | wi(2) | wi(3) | ... 
| wi(__wi_in_one_wg - 1) | | <--- __local_id: in which work-item we are + +-------------------+-------+-------+-------+-------+-----+------------------------+------------+ + | rng[0] | | | | | | | | + | rng[1] | + | | | | | | slm[0] | <--- __idx_global_begin + | rng[2] | | + | | | | | slm[1] | + | rng[3] | | | + | | | | slm[2] | + | rng[4] | | | | + | | | slm[3] | + | ..... | | | | | +++ | | ... | + | rng[M + 1] | | | | | | + | slm[M] | + | rng[M + 2] | + | | | | | | slm[M + 1] | + | rng[M + 3] | | + | | | | | slm[M + 2] | + | rng[M + 4] | | | + | | | | slm[M + 3] | + | rng[M + 5] | | | | - | | | | <--- __idx_global_end + | ..... | | | | | --- | | | + | rng[M + M + 1] | | | | | | - | | + +-------------------+--------------------------------------------------------------+------------+ + ^ + | + __local_id + + "+" - load one source data item ito SLM + */ + + const std::size_t __wg_data_size_rng = __idx_global_end - __idx_global_begin; if (__wg_data_size_rng > 0) { // Calculate the size of the current part of merging data per work-item - const std::size_t __loading_data_per_wi = oneapi::dpl::__internal::__dpl_ceiling_div(__rng.size(), __items_in_wg_count); + const std::size_t __loading_data_per_wi = oneapi::dpl::__internal::__dpl_ceiling_div(__rng.size(), __wi_in_one_wg); assert(__loading_data_per_wi > 0); if (__loading_data_per_wi > 1) @@ -346,8 +374,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, for (std::size_t __slm_idx = __slm_idx_begin; __slm_idx < __slm_idx_end; ++__slm_idx) { - const std::size_t __rng_idx = __sp_base_left_global_from + __slm_idx; - if (__rng_idx < __sp_base_left_global_to) + const std::size_t __rng_idx = __idx_global_begin + __slm_idx; + if (__rng_idx < __idx_global_end) { assert(__slm_idx < __wg_data_size_rng); assert(__rng_idx < __rng.size()); @@ -357,8 +385,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, } else { - const std::size_t __rng_idx = __sp_base_left_global_from + __local_id; - if (__rng_idx < 
__sp_base_left_global_to) + const std::size_t __rng_idx = __idx_global_begin + __local_id; + if (__rng_idx < __idx_global_end) { assert(__local_id < __wg_data_size_rng); assert(__rng_idx < __rng.size()); @@ -396,8 +424,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, assert(__max_source_data_items_fit_into_slm % __chunk == 0); // The amount of items in the each work-group is the amount of diagonals processing between two work-groups + 1 (for the left base diagonal in work-group) - const std::size_t __items_in_wg_count = __max_source_data_items_fit_into_slm / __chunk; - assert(__items_in_wg_count > 0); + const std::size_t __wi_in_one_wg = __max_source_data_items_fit_into_slm / __chunk; + assert(__wi_in_one_wg > 0); // The amount of the base diagonals is the amount of the work-groups // - also it's the distance between two base diagonals is equal to the amount of work-items in each work-group @@ -426,7 +454,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // - in GLOBAL coordinates _split_point_t<_IdType> __sp(__linear_id == 0 ? 
__zero_split_point : _split_point_t{__n1, __n2}); if (0 < __linear_id && __linear_id < __wg_count) - __sp = __find_start_point(__rng1, __rng2, (_IdType)(__linear_id * __items_in_wg_count * __chunk), __n1, __n2, __comp); + __sp = __find_start_point(__rng1, __rng2, (_IdType)(__linear_id * __wi_in_one_wg * __chunk), __n1, __n2, __comp); __base_diagonals_sp_global_ptr[__linear_id] = __sp; }); @@ -443,14 +471,14 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, auto __base_diagonals_sp_global_acc = __base_diagonals_sp_global_storage.template __get_scratch_acc(__cgh); auto __base_diagonals_sp_global_ptr = __base_diagonals_sp_storage_t::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); - const std::size_t __slm_cached_data_size = __items_in_wg_count * __chunk; + const std::size_t __slm_cached_data_size = __wi_in_one_wg * __chunk; auto loc_acc_pack = __merge_slm_helper::create_local_accessors(__cgh, __rng1, __rng2, __slm_cached_data_size); // Run nd_range parallel_for to process all the data // - each work-group caching source data in SLM and processing diagonals between two base diagonals; // - each work-item processing one diagonal. 
__cgh.parallel_for<_MergeKernelName...>( - sycl::nd_range(__wg_count * __items_in_wg_count, __items_in_wg_count), + sycl::nd_range(__wg_count * __wi_in_one_wg, __wi_in_one_wg), [=](sycl::nd_item __nd_item) { const std::size_t __global_linear_id = __nd_item.get_global_linear_id(); // Merge matrix diagonal's GLOBAL index @@ -474,8 +502,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, auto __rng2_cache_slm = std::addressof(__rng2_loc_acc[0]) + offset_to_slm2; // Cooperative data load from __rng1 to __rng1_cache_slm, from __rng2 to __rng1_cache_slm - load_data_into_slm(__rng1, __rng1_cache_slm, __sp_base_left_global.first, __sp_base_right_global.first, __items_in_wg_count, __local_id); - load_data_into_slm(__rng2, __rng2_cache_slm, __sp_base_left_global.second, __sp_base_right_global.second, __items_in_wg_count, __local_id); + load_data_into_slm(__rng1, __rng1_cache_slm, __sp_base_left_global.first, __sp_base_right_global.first, __wi_in_one_wg, __local_id); + load_data_into_slm(__rng2, __rng2_cache_slm, __sp_base_left_global.second, __sp_base_right_global.second, __wi_in_one_wg, __local_id); // Wait until all the data is loaded __dpl_sycl::__group_barrier(__nd_item); From 64d856db7faeab523c69a2bda1ef9c6fb2b2520f Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 15 Nov 2024 23:09:34 +0100 Subject: [PATCH 23/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - rename some local variables and params Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 38 +++++++++++-------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 0108ad232f2..80fe834ee2d 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -314,15 +314,15 @@ struct 
__parallel_merge_submitter_large<_IdType, _CustomName, template static auto - get_local_accessor(AccessorsTuple& __acc_tuple, std::size_t __offset = 0) + get_local_accessor(AccessorsTuple& __loc_acc_pack, std::size_t __offset = 0) { static_assert(std::tuple_size_v == 1 || std::tuple_size_v == 2); if constexpr (std::tuple_size_v == 1) - return std::pair(std::get<0>(__acc_tuple), __offset); + return std::pair(std::get<0>(__loc_acc_pack), __offset); else - return std::pair(std::get(__acc_tuple), 0); + return std::pair(std::get(__loc_acc_pack), 0); } }; @@ -340,15 +340,23 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, | | wi(0) | wi(1) | wi(2) | wi(3) | ... | wi(__wi_in_one_wg - 1) | | <--- __local_id: in which work-item we are +-------------------+-------+-------+-------+-------+-----+------------------------+------------+ | rng[0] | | | | | | | | - | rng[1] | + | | | | | | slm[0] | <--- __idx_global_begin - | rng[2] | | + | | | | | slm[1] | - | rng[3] | | | + | | | | slm[2] | - | rng[4] | | | | + | | | slm[3] | + | rng[1] | + | | | | | | slm[0] | <--- __idx_global_begin \ + | rng[2] | + | | | | | | slm[1] | | SLM bank: write into one SLM bank from one work-item + | rng[3] | + | | | | | | slm[2] | / + | rng[4] | | + | | | | | slm[3] | + | rng[5] | | + | | | | | slm[3] | + | rng[6] | | + | | | | | slm[3] | + | rng[7] | | | + | | | | slm[3] | + | rng[8] | | | + | | | | slm[3] | + | rng[9] | | | + | | | | slm[3] | + | rng[10] | | | | + | | | slm[3] | + | rng[11] | | | | + | | | slm[3] | + | rng[12] | | | | + | | | slm[3] | | ..... | | | | | +++ | | ... | | rng[M + 1] | | | | | | + | slm[M] | - | rng[M + 2] | + | | | | | | slm[M + 1] | - | rng[M + 3] | | + | | | | | slm[M + 2] | - | rng[M + 4] | | | + | | | | slm[M + 3] | + | rng[M + 2] | | | | | | + | slm[M + 1] | + | rng[M + 3] | | | | | | + | slm[M + 2] | + | rng[M + 4] | | | | | | | slm[M + 3] | | rng[M + 5] | | | | - | | | | <--- __idx_global_end | ..... 
| | | | | --- | | | | rng[M + M + 1] | | | | | | - | | @@ -472,7 +480,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, auto __base_diagonals_sp_global_ptr = __base_diagonals_sp_storage_t::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); const std::size_t __slm_cached_data_size = __wi_in_one_wg * __chunk; - auto loc_acc_pack = __merge_slm_helper::create_local_accessors(__cgh, __rng1, __rng2, __slm_cached_data_size); + auto __loc_acc_pack = __merge_slm_helper::create_local_accessors(__cgh, __rng1, __rng2, __slm_cached_data_size); // Run nd_range parallel_for to process all the data // - each work-group caching source data in SLM and processing diagonals between two base diagonals; @@ -496,10 +504,10 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const _IdType __rng1_wg_data_size = __sp_base_right_global.first - __sp_base_left_global.first; const _IdType __rng2_wg_data_size = __sp_base_right_global.second - __sp_base_left_global.second; - auto [__rng1_loc_acc, offset_to_slm1] = __merge_slm_helper::template get_local_accessor<0>(loc_acc_pack); - auto [__rng2_loc_acc, offset_to_slm2] = __merge_slm_helper::template get_local_accessor<1>(loc_acc_pack, __rng1_wg_data_size); - auto __rng1_cache_slm = std::addressof(__rng1_loc_acc[0]) + offset_to_slm1; - auto __rng2_cache_slm = std::addressof(__rng2_loc_acc[0]) + offset_to_slm2; + auto [__rng1_loc_acc, __offset_to_slm1] = __merge_slm_helper::template get_local_accessor<0>(__loc_acc_pack); + auto [__rng2_loc_acc, __offset_to_slm2] = __merge_slm_helper::template get_local_accessor<1>(__loc_acc_pack, __rng1_wg_data_size); + auto __rng1_cache_slm = std::addressof(__rng1_loc_acc[0]) + __offset_to_slm1; + auto __rng2_cache_slm = std::addressof(__rng2_loc_acc[0]) + __offset_to_slm2; // Cooperative data load from __rng1 to __rng1_cache_slm, from __rng2 to __rng1_cache_slm load_data_into_slm(__rng1, __rng1_cache_slm, __sp_base_left_global.first, __sp_base_right_global.first, 
__wi_in_one_wg, __local_id); From 3d233dd334e7a5356727cf8500897741f86f5ba1 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 18 Nov 2024 12:03:05 +0100 Subject: [PATCH 24/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - rewrite the data loading into SLM cache #1 Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 77 ++++++++++++++++++- 1 file changed, 74 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 80fe834ee2d..1b48594332f 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -328,7 +328,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, template inline static void - load_data_into_slm(_Range&& __rng, _DataType* __slm, + load_data_into_slm_impl(_Range&& __rng, _DataType* __slm, std::size_t __idx_global_begin, std::size_t __idx_global_end, std::size_t __wi_in_one_wg, std::size_t __local_id) { @@ -404,6 +404,76 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, } } + template + static std::size_t + __calc_wi_amount_for_data_reading(const std::size_t __wi_in_one_wg, const std::size_t __reading_data) + { + //const std::size_t __required_reading_data_per_wi = __slm_bank_size / sizeof(_RangeValueType); + + std::size_t __wi_for_data_reading = 0; + if (__reading_data > 0) + { + const std::size_t __required_reading_data_per_wi = oneapi::dpl::__internal::__dpl_ceiling_div(__slm_bank_size, sizeof(_RangeValueType)); + + __wi_for_data_reading = std::min(__wi_in_one_wg, oneapi::dpl::__internal::__dpl_ceiling_div(__reading_data, __required_reading_data_per_wi)); + } + + return __wi_for_data_reading; + } + + template + static void + load_data_into_slm(_Range&& __rng1, _DataType* __slm1, const std::size_t __idx_global_begin1, const std::size_t 
__idx_global_end1, + _Range&& __rng2, _DataType* __slm2, const std::size_t __idx_global_begin2, const std::size_t __idx_global_end2, + const std::size_t __wi_in_one_wg, const std::size_t __local_id) + { + // TODO what size of SLM bank we have now? + constexpr std::size_t __slm_bank_size = 1024; + +#if 0 + auto __n1 = __rng1.size(); + auto __n2 = __rng2.size(); + + if (__n1 == 521 && __n2 == 260) + { + __n1 = __n1; + __n2 = __n2; + } +#endif + + using _Range1ValueType = typename std::iterator_traits::value_type; + using _Range2ValueType = typename std::iterator_traits::value_type; + + // Calculate how many work-items should read the part of __rng1 and __rng2 into SLM cache + const std::size_t __wi_for_data_reading1 = __calc_wi_amount_for_data_reading<__slm_bank_size, _Range1ValueType>(__wi_in_one_wg, __idx_global_end1 - __idx_global_begin1); + const std::size_t __wi_for_data_reading2 = __calc_wi_amount_for_data_reading<__slm_bank_size, _Range2ValueType>(__wi_in_one_wg, __idx_global_end2 - __idx_global_begin2); + +#if 0 + const std::size_t __wi_for_data_reading1_128 = __calc_wi_amount_for_data_reading<128, _Range1ValueType>(__wi_in_one_wg, __idx_global_end1 - __idx_global_begin1); + const std::size_t __wi_for_data_reading2_128 = __calc_wi_amount_for_data_reading<128, _Range2ValueType>(__wi_in_one_wg, __idx_global_end2 - __idx_global_begin2); +#endif + + // Now arrange the reading by work-items + if (__wi_in_one_wg >= __wi_for_data_reading1 + __wi_for_data_reading2) + { + if (__local_id < __wi_for_data_reading1) + { + load_data_into_slm_impl(__rng1, __slm1, __idx_global_begin1, __idx_global_end1, __wi_for_data_reading1, __local_id); + } + else if (__local_id < __wi_for_data_reading1 + __wi_for_data_reading2) + { + // When we reading data from parallel-working work-items, we should reduce the local id of current work-item + // because we calculate readed data size based on this value. 
+ load_data_into_slm_impl(__rng2, __slm2, __idx_global_begin2, __idx_global_end2, __wi_for_data_reading2, __local_id - __wi_for_data_reading1); + } + } + else if (__local_id < __wi_for_data_reading1 + __wi_for_data_reading2) + { + load_data_into_slm_impl(__rng1, __slm1, __idx_global_begin1, __idx_global_end1, __wi_for_data_reading1, __local_id); + load_data_into_slm_impl(__rng2, __slm2, __idx_global_begin2, __idx_global_end2, __wi_for_data_reading2, __local_id); + } + } + template auto operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const @@ -510,8 +580,9 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, auto __rng2_cache_slm = std::addressof(__rng2_loc_acc[0]) + __offset_to_slm2; // Cooperative data load from __rng1 to __rng1_cache_slm, from __rng2 to __rng1_cache_slm - load_data_into_slm(__rng1, __rng1_cache_slm, __sp_base_left_global.first, __sp_base_right_global.first, __wi_in_one_wg, __local_id); - load_data_into_slm(__rng2, __rng2_cache_slm, __sp_base_left_global.second, __sp_base_right_global.second, __wi_in_one_wg, __local_id); + load_data_into_slm(__rng1, __rng1_cache_slm, __sp_base_left_global.first, __sp_base_right_global.first, + __rng2, __rng2_cache_slm, __sp_base_left_global.second, __sp_base_right_global.second, + __wi_in_one_wg, __local_id); // Wait until all the data is loaded __dpl_sycl::__group_barrier(__nd_item); From 6a66b1b04d691a165e483c9b9eee0ceee32fc664 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 18 Nov 2024 12:29:45 +0100 Subject: [PATCH 25/80] @@@ include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - always use two separate SLM cache Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 
1b48594332f..3ab7fb14991 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -302,7 +302,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, using _Range1ValueType = typename std::iterator_traits::value_type; using _Range2ValueType = typename std::iterator_traits::value_type; - if constexpr (std::is_same_v<_Range1ValueType, _Range2ValueType>) + if constexpr (false && std::is_same_v<_Range1ValueType, _Range2ValueType>) return std::tuple<__dpl_sycl::__local_accessor<_Range1ValueType>>( __dpl_sycl::__local_accessor<_Range1ValueType>(2 * __slm_cached_data_size, __cgh)); else From 62bf5ce39aa93301b2c3f5bba54fc50fed7c9ebd Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 18 Nov 2024 12:36:36 +0100 Subject: [PATCH 26/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - use large submitter after 16M items Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 3ab7fb14991..e8bbdb45b84 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -631,7 +631,7 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy { using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; - constexpr std::size_t __starting_size_limit_for_large_submitter = 4 * 1'048'576; // 4 Mb + constexpr std::size_t __starting_size_limit_for_large_submitter = 16 * 1'048'576; // 4 Mb const std::size_t __n = __rng1.size() + __rng2.size(); if (__n < __starting_size_limit_for_large_submitter) From bf5b8ce08694c9f7132ad654216f8457690dd1b4 Mon Sep 17 00:00:00 2001 From: 
Sergey Kopienko Date: Mon, 18 Nov 2024 12:47:37 +0100 Subject: [PATCH 27/80] @@@ include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - using __parallel_merge_submitter_large for all data sizes Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index e8bbdb45b84..c9bde2e42e6 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -634,7 +634,7 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy constexpr std::size_t __starting_size_limit_for_large_submitter = 16 * 1'048'576; // 4 Mb const std::size_t __n = __rng1.size() + __rng2.size(); - if (__n < __starting_size_limit_for_large_submitter) + if (false) //if (__n < __starting_size_limit_for_large_submitter) { static_assert(__starting_size_limit_for_large_submitter < std::numeric_limits::max()); From 435259c31c14da2befe324a4564de0cc5bab17da Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 18 Nov 2024 13:17:07 +0100 Subject: [PATCH 28/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - avoid barrier if we have more then one work-item in each work-group Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index c9bde2e42e6..7b01be0f06b 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -584,8 +584,9 @@ struct __parallel_merge_submitter_large<_IdType, 
_CustomName, __rng2, __rng2_cache_slm, __sp_base_left_global.second, __sp_base_right_global.second, __wi_in_one_wg, __local_id); - // Wait until all the data is loaded - __dpl_sycl::__group_barrier(__nd_item); + // Wait until all the data is loaded (if we have more then one item in work-group + if (__wi_in_one_wg > 1) + __dpl_sycl::__group_barrier(__nd_item); // Current diagonal inside of the merge matrix? if (__global_linear_id * __chunk < __n) From 809c0735b6c30281578a6e2a7e651331b0c95adf Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 18 Nov 2024 13:26:17 +0100 Subject: [PATCH 29/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - avoid any action in the __parallel_merge_submitter_large::operator() if we haven't any data to process Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 52 ++++++++++++------- 1 file changed, 34 insertions(+), 18 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 7b01be0f06b..4768cd553fc 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -478,6 +478,9 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, auto operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const { + using _Range1ValueType = typename std::iterator_traits::value_type; + using _Range2ValueType = typename std::iterator_traits::value_type; + const _IdType __n1 = __rng1.size(); const _IdType __n2 = __rng2.size(); const _IdType __n = __n1 + __n2; @@ -563,33 +566,46 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const std::size_t __local_id = __nd_item.get_local_id(0); // Merge sub-matrix LOCAL diagonal's index const std::size_t __group_linear_id = __nd_item.get_group_linear_id(); // Merge matrix base diagonal's 
GLOBAL index - // Split points on left anr right base diagonals - // - in GLOBAL coordinates - const _split_point_t& __sp_base_left_global = __base_diagonals_sp_global_ptr[__group_linear_id]; - const _split_point_t& __sp_base_right_global = __base_diagonals_sp_global_ptr[__group_linear_id + 1]; - - assert(__sp_base_right_global.first >= __sp_base_left_global.first); - assert(__sp_base_right_global.second >= __sp_base_left_global.second); + _IdType __rng1_wg_data_size = 0; + _IdType __rng2_wg_data_size = 0; - const _IdType __rng1_wg_data_size = __sp_base_right_global.first - __sp_base_left_global.first; - const _IdType __rng2_wg_data_size = __sp_base_right_global.second - __sp_base_left_global.second; + _Range1ValueType* __rng1_cache_slm = nullptr; + _Range1ValueType* __rng2_cache_slm = nullptr; - auto [__rng1_loc_acc, __offset_to_slm1] = __merge_slm_helper::template get_local_accessor<0>(__loc_acc_pack); - auto [__rng2_loc_acc, __offset_to_slm2] = __merge_slm_helper::template get_local_accessor<1>(__loc_acc_pack, __rng1_wg_data_size); - auto __rng1_cache_slm = std::addressof(__rng1_loc_acc[0]) + __offset_to_slm1; - auto __rng2_cache_slm = std::addressof(__rng2_loc_acc[0]) + __offset_to_slm2; + // Current diagonal inside of the merge matrix? + const bool __have_data = __global_linear_id * __chunk < __n; - // Cooperative data load from __rng1 to __rng1_cache_slm, from __rng2 to __rng1_cache_slm - load_data_into_slm(__rng1, __rng1_cache_slm, __sp_base_left_global.first, __sp_base_right_global.first, - __rng2, __rng2_cache_slm, __sp_base_left_global.second, __sp_base_right_global.second, - __wi_in_one_wg, __local_id); + // Current diagonal inside of the merge matrix? 
+ if (__have_data) + { + // Split points on left and right base diagonals + // - in GLOBAL coordinates + const _split_point_t& __sp_base_left_global = __base_diagonals_sp_global_ptr[__group_linear_id]; + const _split_point_t& __sp_base_right_global = __base_diagonals_sp_global_ptr[__group_linear_id + 1]; + + assert(__sp_base_right_global.first >= __sp_base_left_global.first); + assert(__sp_base_right_global.second >= __sp_base_left_global.second); + + __rng1_wg_data_size = __sp_base_right_global.first - __sp_base_left_global.first; + __rng2_wg_data_size = __sp_base_right_global.second - __sp_base_left_global.second; + + auto [__rng1_loc_acc, __offset_to_slm1] = __merge_slm_helper::template get_local_accessor<0>(__loc_acc_pack); + auto [__rng2_loc_acc, __offset_to_slm2] = __merge_slm_helper::template get_local_accessor<1>(__loc_acc_pack, __rng1_wg_data_size); + __rng1_cache_slm = std::addressof(__rng1_loc_acc[0]) + __offset_to_slm1; + __rng2_cache_slm = std::addressof(__rng2_loc_acc[0]) + __offset_to_slm2; + + // Cooperative data load from __rng1 to __rng1_cache_slm, from __rng2 to __rng1_cache_slm + load_data_into_slm(__rng1, __rng1_cache_slm, __sp_base_left_global.first, __sp_base_right_global.first, + __rng2, __rng2_cache_slm, __sp_base_left_global.second, __sp_base_right_global.second, + __wi_in_one_wg, __local_id); + } // Wait until all the data is loaded (if we have more then one item in work-group if (__wi_in_one_wg > 1) __dpl_sycl::__group_barrier(__nd_item); // Current diagonal inside of the merge matrix? - if (__global_linear_id * __chunk < __n) + if (__have_data) { // Find split point in LOCAL coordinates // - bottom-right split point describes the size of current area between two base diagonals.
From 021dbb858f053fa919e8245dca05d5d4babc340b Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 18 Nov 2024 13:49:06 +0100 Subject: [PATCH 30/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - remove inline on load_data_into_slm_impl Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 4768cd553fc..78fc978e5d9 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -327,7 +327,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, }; template - inline static void + static void load_data_into_slm_impl(_Range&& __rng, _DataType* __slm, std::size_t __idx_global_begin, std::size_t __idx_global_end, std::size_t __wi_in_one_wg, std::size_t __local_id) From 2fa02678e8bae2ff3fb195c9afab41147c4471a8 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 18 Nov 2024 14:35:32 +0100 Subject: [PATCH 31/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - remove extra condition checks and asserts from load_data_into_slm_impl Signed-off-by: Sergey Kopienko --- .../hetero/dpcpp/parallel_backend_sycl_merge.h | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 78fc978e5d9..46aa6907085 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -380,26 +380,17 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const auto __slm_idx_begin = __local_id * __loading_data_per_wi; const auto 
__slm_idx_end = __slm_idx_begin + __loading_data_per_wi; - for (std::size_t __slm_idx = __slm_idx_begin; __slm_idx < __slm_idx_end; ++__slm_idx) - { - const std::size_t __rng_idx = __idx_global_begin + __slm_idx; - if (__rng_idx < __idx_global_end) - { - assert(__slm_idx < __wg_data_size_rng); - assert(__rng_idx < __rng.size()); + std::size_t __slm_idx = __slm_idx_begin; + std::size_t __rng_idx = __idx_global_begin + __slm_idx; + + for (; __slm_idx < __slm_idx_end && __rng_idx < __idx_global_end; ++__slm_idx, ++__rng_idx) __slm[__slm_idx] = __rng[__rng_idx]; - } - } } else { const std::size_t __rng_idx = __idx_global_begin + __local_id; if (__rng_idx < __idx_global_end) - { - assert(__local_id < __wg_data_size_rng); - assert(__rng_idx < __rng.size()); __slm[__local_id] = __rng[__rng_idx]; - } } } } From 3f95ff12c6badbdce632c74302b03b23a5d05ec1 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 18 Nov 2024 14:36:37 +0100 Subject: [PATCH 32/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - add _ONEDPL_PRAGMA_UNROLL into load_data_into_slm_impl Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 46aa6907085..e0839d38756 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -383,6 +383,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, std::size_t __slm_idx = __slm_idx_begin; std::size_t __rng_idx = __idx_global_begin + __slm_idx; + _ONEDPL_PRAGMA_UNROLL for (; __slm_idx < __slm_idx_end && __rng_idx < __idx_global_end; ++__slm_idx, ++__rng_idx) __slm[__slm_idx] = __rng[__rng_idx]; } From 0caf24c83f6afed95f0443efc707783d918ab86a Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 18 
Nov 2024 14:37:03 +0100 Subject: [PATCH 33/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - rewrite the data loading into SLM cache #1 Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index e0839d38756..dda8755e774 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -385,7 +385,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, _ONEDPL_PRAGMA_UNROLL for (; __slm_idx < __slm_idx_end && __rng_idx < __idx_global_end; ++__slm_idx, ++__rng_idx) - __slm[__slm_idx] = __rng[__rng_idx]; + __slm[__slm_idx] = __rng[__rng_idx]; } else { @@ -400,8 +400,6 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, static std::size_t __calc_wi_amount_for_data_reading(const std::size_t __wi_in_one_wg, const std::size_t __reading_data) { - //const std::size_t __required_reading_data_per_wi = __slm_bank_size / sizeof(_RangeValueType); - std::size_t __wi_for_data_reading = 0; if (__reading_data > 0) { From b12eada173a923b36c1bcdfe58630fbe99aa84c7 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 18 Nov 2024 15:16:13 +0100 Subject: [PATCH 34/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - define __slm_bank_size as 64 in load_data_into_slm Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index dda8755e774..66d497837e9 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ 
b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -418,7 +418,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const std::size_t __wi_in_one_wg, const std::size_t __local_id) { // TODO what size of SLM bank we have now? - constexpr std::size_t __slm_bank_size = 1024; + constexpr std::size_t __slm_bank_size = 64; // = 1024; #if 0 auto __n1 = __rng1.size(); From c2c66acc4ad891e18d9ffde96ce046d9e0bf815e Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 18 Nov 2024 15:17:42 +0100 Subject: [PATCH 35/80] Revert "@@@ include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - always use two separate SLM cache" This reverts commit 6a66b1b04d691a165e483c9b9eee0ceee32fc664. --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 66d497837e9..ee5084998ba 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -302,7 +302,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, using _Range1ValueType = typename std::iterator_traits::value_type; using _Range2ValueType = typename std::iterator_traits::value_type; - if constexpr (false && std::is_same_v<_Range1ValueType, _Range2ValueType>) + if constexpr (std::is_same_v<_Range1ValueType, _Range2ValueType>) return std::tuple<__dpl_sycl::__local_accessor<_Range1ValueType>>( __dpl_sycl::__local_accessor<_Range1ValueType>(2 * __slm_cached_data_size, __cgh)); else From f55c36fe8910450c623a6751bc853423cf912b82 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 18 Nov 2024 15:59:28 +0100 Subject: [PATCH 36/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - remove debug code Signed-off-by: Sergey Kopienko --- 
.../hetero/dpcpp/parallel_backend_sycl_merge.h | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index ee5084998ba..55cac57bfe9 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -420,17 +420,6 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // TODO what size of SLM bank we have now? constexpr std::size_t __slm_bank_size = 64; // = 1024; -#if 0 - auto __n1 = __rng1.size(); - auto __n2 = __rng2.size(); - - if (__n1 == 521 && __n2 == 260) - { - __n1 = __n1; - __n2 = __n2; - } -#endif - using _Range1ValueType = typename std::iterator_traits::value_type; using _Range2ValueType = typename std::iterator_traits::value_type; @@ -438,11 +427,6 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const std::size_t __wi_for_data_reading1 = __calc_wi_amount_for_data_reading<__slm_bank_size, _Range1ValueType>(__wi_in_one_wg, __idx_global_end1 - __idx_global_begin1); const std::size_t __wi_for_data_reading2 = __calc_wi_amount_for_data_reading<__slm_bank_size, _Range2ValueType>(__wi_in_one_wg, __idx_global_end2 - __idx_global_begin2); -#if 0 - const std::size_t __wi_for_data_reading1_128 = __calc_wi_amount_for_data_reading<128, _Range1ValueType>(__wi_in_one_wg, __idx_global_end1 - __idx_global_begin1); - const std::size_t __wi_for_data_reading2_128 = __calc_wi_amount_for_data_reading<128, _Range2ValueType>(__wi_in_one_wg, __idx_global_end2 - __idx_global_begin2); -#endif - // Now arrange the reading by work-items if (__wi_in_one_wg >= __wi_for_data_reading1 + __wi_for_data_reading2) { From 1b5f0a7b9bc4f401155d4a51ed5db4bbdc4f982f Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 18 Nov 2024 16:03:29 +0100 Subject: [PATCH 37/80] 
include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - increate chunk size on GPU to 8 Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 55cac57bfe9..454ebfddee3 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -464,7 +464,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, _PRINT_INFO_IN_DEBUG_MODE(__exec); // Empirical number of values to process per work-item - const _IdType __chunk = __exec.queue().get_device().is_cpu() ? 128 : 4; + const _IdType __chunk = __exec.queue().get_device().is_cpu() ? 128 : 8; assert(__chunk > 0); // Pessimistically only use half of the memory to take into account memory used by compiled kernel From 55169cc3fd4ad703ab15fd90fc8e7d7701c1f65e Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 18 Nov 2024 16:56:40 +0100 Subject: [PATCH 38/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - declare __rng1_from and __rng2_from as constexpr in __find_start_point Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 454ebfddee3..a0b1138229f 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -56,9 +56,10 @@ _split_point_t<_Index> __find_start_point(const _Rng1& __rng1, const _Rng2& __rng2, const _Index __i_elem, const _Index __n1, const _Index __n2, _Compare __comp) { - const _Index 
__rng1_from = 0; + constexpr _Index __rng1_from = 0; + constexpr _Index __rng2_from = 0; + const _Index __rng1_to = __n1; - const _Index __rng2_from = 0; const _Index __rng2_to = __n2; assert(__rng1_from <= __rng1_to); From 9fac5b97f0065a0781b621906e87ac2b44ba359b Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 18 Nov 2024 17:32:44 +0100 Subject: [PATCH 39/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix spell-check error Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index a0b1138229f..5d9fbd43dfd 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -438,7 +438,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, else if (__local_id < __wi_for_data_reading1 + __wi_for_data_reading2) { // When we reading data from parallel-working work-items, we should reduce the local id of current work-item - // because we calculate readed data size based on this value. + // because we calculate read data size based on this value.
load_data_into_slm_impl(__rng2, __slm2, __idx_global_begin2, __idx_global_end2, __wi_for_data_reading2, __local_id - __wi_for_data_reading1); } } From a3284b3c31ba4726671c90d3c499aaaf35b14623 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 18 Nov 2024 17:46:15 +0100 Subject: [PATCH 40/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix review comment Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 5d9fbd43dfd..fd1f1be3118 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -575,9 +575,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __wi_in_one_wg, __local_id); } - // Wait until all the data is loaded (if we have more then one item in work-group - if (__wi_in_one_wg > 1) - __dpl_sycl::__group_barrier(__nd_item); + // Wait until all the data is loaded + __dpl_sycl::__group_barrier(__nd_item); // Current diagonal inside of the merge matrix? 
if (__have_data) From 167490fd3cbc55c3daf72620ced80085b0056e92 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 18 Nov 2024 17:51:46 +0100 Subject: [PATCH 41/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix data types in load_data_into_slm Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index fd1f1be3118..2eac65dc93e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -412,10 +412,10 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, return __wi_for_data_reading; } - template + template static void - load_data_into_slm(_Range&& __rng1, _DataType* __slm1, const std::size_t __idx_global_begin1, const std::size_t __idx_global_end1, - _Range&& __rng2, _DataType* __slm2, const std::size_t __idx_global_begin2, const std::size_t __idx_global_end2, + load_data_into_slm(_Range&& __rng1, _DataType1* __slm1, const std::size_t __idx_global_begin1, const std::size_t __idx_global_end1, + _Range&& __rng2, _DataType2* __slm2, const std::size_t __idx_global_begin2, const std::size_t __idx_global_end2, const std::size_t __wi_in_one_wg, const std::size_t __local_id) { // TODO what size of SLM bank we have now? 
From c72c92d1c5961beb36fe0d1700b5f0a9a3695239 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 18 Nov 2024 18:14:11 +0100 Subject: [PATCH 42/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix comments in include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h Signed-off-by: Sergey Kopienko --- .../hetero/dpcpp/parallel_backend_sycl_merge.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 2eac65dc93e..5980a1082a9 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -345,14 +345,14 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, | rng[2] | + | | | | | | slm[1] | | SLM bank: write into one SLM bank from one work-item | rng[3] | + | | | | | | slm[2] | / | rng[4] | | + | | | | | slm[3] | - | rng[5] | | + | | | | | slm[3] | - | rng[6] | | + | | | | | slm[3] | - | rng[7] | | | + | | | | slm[3] | - | rng[8] | | | + | | | | slm[3] | - | rng[9] | | | + | | | | slm[3] | - | rng[10] | | | | + | | | slm[3] | - | rng[11] | | | | + | | | slm[3] | - | rng[12] | | | | + | | | slm[3] | + | rng[5] | | + | | | | | slm[4] | + | rng[6] | | + | | | | | slm[5] | + | rng[7] | | | + | | | | slm[6] | + | rng[8] | | | + | | | | slm[7] | + | rng[9] | | | + | | | | slm[8] | + | rng[10] | | | | + | | | slm[9] | + | rng[11] | | | | + | | | slm[10] | + | rng[12] | | | | + | | | slm[11] | | ..... | | | | | +++ | | ... 
| | rng[M + 1] | | | | | | + | slm[M] | | rng[M + 2] | | | | | | + | slm[M + 1] | From ba224e05f8a69a9a171d29caf1c3a762cd640bad Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 18 Nov 2024 20:07:01 +0100 Subject: [PATCH 43/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix types in load_data_into_slm Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 5980a1082a9..3a751fbad5b 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -412,10 +412,10 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, return __wi_for_data_reading; } - template + template static void - load_data_into_slm(_Range&& __rng1, _DataType1* __slm1, const std::size_t __idx_global_begin1, const std::size_t __idx_global_end1, - _Range&& __rng2, _DataType2* __slm2, const std::size_t __idx_global_begin2, const std::size_t __idx_global_end2, + load_data_into_slm(_Range1&& __rng1, _DataType1* __slm1, const std::size_t __idx_global_begin1, const std::size_t __idx_global_end1, + _Range2&& __rng2, _DataType2* __slm2, const std::size_t __idx_global_begin2, const std::size_t __idx_global_end2, const std::size_t __wi_in_one_wg, const std::size_t __local_id) { // TODO what size of SLM bank we have now? 
From 8bd40faa35e01cc602509860aca49fb1e755587f Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 18 Nov 2024 20:07:27 +0100 Subject: [PATCH 44/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix an error: not all source data loaded into SLM cache Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 36 ++++++++++--------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 3a751fbad5b..a145a89a590 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -330,8 +330,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, template static void load_data_into_slm_impl(_Range&& __rng, _DataType* __slm, - std::size_t __idx_global_begin, std::size_t __idx_global_end, - std::size_t __wi_in_one_wg, std::size_t __local_id) + std::size_t __idx_global_begin, std::size_t __idx_global_end, + std::size_t __wi_in_one_wg, std::size_t __local_id) { // How we load data: /* @@ -547,29 +547,31 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, _Range1ValueType* __rng1_cache_slm = nullptr; _Range1ValueType* __rng2_cache_slm = nullptr; - // Current diagonal inside of the merge matrix? - const bool __have_data = __global_linear_id * __chunk < __n; + // Split points on left anr right base diagonals + // - in GLOBAL coordinates + const _split_point_t& __sp_base_left_global = __base_diagonals_sp_global_ptr[__group_linear_id]; + const _split_point_t& __sp_base_right_global = __base_diagonals_sp_global_ptr[__group_linear_id + 1]; - // Current diagonal inside of the merge matrix? 
- if (__have_data) - { - // Split points on left anr right base diagonals - // - in GLOBAL coordinates - const _split_point_t& __sp_base_left_global = __base_diagonals_sp_global_ptr[__group_linear_id]; - const _split_point_t& __sp_base_right_global = __base_diagonals_sp_global_ptr[__group_linear_id + 1]; + assert(__sp_base_right_global.first >= __sp_base_left_global.first); + assert(__sp_base_right_global.second >= __sp_base_left_global.second); - assert(__sp_base_right_global.first >= __sp_base_left_global.first); - assert(__sp_base_right_global.second >= __sp_base_left_global.second); + __rng1_wg_data_size = __sp_base_right_global.first - __sp_base_left_global.first; + __rng2_wg_data_size = __sp_base_right_global.second - __sp_base_left_global.second; - __rng1_wg_data_size = __sp_base_right_global.first - __sp_base_left_global.first; - __rng2_wg_data_size = __sp_base_right_global.second - __sp_base_left_global.second; + const bool __need_load_data = __rng1_wg_data_size > 0 || __rng2_wg_data_size > 0; + const bool __need_merge_data = __global_linear_id * __chunk < __n; + if (__need_load_data || __need_merge_data) + { auto [__rng1_loc_acc, __offset_to_slm1] = __merge_slm_helper::template get_local_accessor<0>(__loc_acc_pack); auto [__rng2_loc_acc, __offset_to_slm2] = __merge_slm_helper::template get_local_accessor<1>(__loc_acc_pack, __rng1_wg_data_size); __rng1_cache_slm = std::addressof(__rng1_loc_acc[0]) + __offset_to_slm1; __rng2_cache_slm = std::addressof(__rng2_loc_acc[0]) + __offset_to_slm2; + } - // Cooperative data load from __rng1 to __rng1_cache_slm, from __rng2 to __rng1_cache_slm + // Cooperative data load from __rng1 to __rng1_cache_slm, from __rng2 to __rng1_cache_slm + if (__need_load_data) + { load_data_into_slm(__rng1, __rng1_cache_slm, __sp_base_left_global.first, __sp_base_right_global.first, __rng2, __rng2_cache_slm, __sp_base_left_global.second, __sp_base_right_global.second, __wi_in_one_wg, __local_id); @@ -579,7 +581,7 @@ struct 
__parallel_merge_submitter_large<_IdType, _CustomName, __dpl_sycl::__group_barrier(__nd_item); // Current diagonal inside of the merge matrix? - if (__have_data) + if (__need_merge_data) { // Find split point in LOCAL coordinates // - bottom-right split point describes the size of current area between two base diagonals. From f93fade7409bf6fd5a2659f621206e21013a4ce2 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 18 Nov 2024 20:18:13 +0100 Subject: [PATCH 45/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix review comment Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 23 +++++++++++++++---- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index a145a89a590..2a1816b7c27 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -631,11 +631,24 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy { static_assert(__starting_size_limit_for_large_submitter < std::numeric_limits::max()); - using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name<_CustomName>>; - return __parallel_merge_submitter()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); + if (__n <= std::numeric_limits::max()) + { + using _WiIndex = std::uint16_t; + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name<_CustomName, _WiIndex>>; + return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); + } 
+ else + { + using _WiIndex = std::uint32_t; + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name<_CustomName, _WiIndex>>; + return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); + } } else { From eab6cee3972389eb2291dbe3349cbbb39cb606f1 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 19 Nov 2024 09:42:45 +0100 Subject: [PATCH 46/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix review comment Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 24 +++++-------------- 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 2a1816b7c27..8929f14e6b7 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -631,24 +631,12 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy { static_assert(__starting_size_limit_for_large_submitter < std::numeric_limits::max()); - if (__n <= std::numeric_limits::max()) - { - using _WiIndex = std::uint16_t; - using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); - } - else - { - using _WiIndex = std::uint32_t; - using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name<_CustomName, _WiIndex>>; - return 
__parallel_merge_submitter<_WiIndex, _MergeKernelName>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); - } + using _WiIndex = std::uint32_t; + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name<_CustomName, _WiIndex>>; + return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); } else { From f3f8468a5fd394a98760e640fcde2606103d82e2 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 19 Nov 2024 09:45:51 +0100 Subject: [PATCH 47/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - load __parallel_merge_submitter if we merge different merge types Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 8929f14e6b7..cffe1b87528 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -455,6 +455,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, { using _Range1ValueType = typename std::iterator_traits::value_type; using _Range2ValueType = typename std::iterator_traits::value_type; + static_assert(std::is_same_v<_Range1ValueType, _Range2ValueType>, "In this implementation we can merge only data of the same type"); const _IdType __n1 = __rng1.size(); const _IdType __n2 = __rng2.size(); @@ -626,8 +627,13 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy constexpr std::size_t __starting_size_limit_for_large_submitter = 16 * 1'048'576; // 4 
Mb + using _Range1ValueType = typename std::iterator_traits::value_type; + using _Range2ValueType = typename std::iterator_traits::value_type; + + constexpr bool __same_merge_types = std::is_same_v<_Range1ValueType, _Range2ValueType>; + const std::size_t __n = __rng1.size() + __rng2.size(); - if (false) //if (__n < __starting_size_limit_for_large_submitter) + if (false) //if (__n < __starting_size_limit_for_large_submitter || !__same_merge_types) { static_assert(__starting_size_limit_for_large_submitter < std::numeric_limits::max()); From 4bbeb508412e180d5881f1065a842b829b7105c7 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 19 Nov 2024 09:55:09 +0100 Subject: [PATCH 48/80] remove usage of __merge_slm_helper Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 76 +++---------------- 1 file changed, 10 insertions(+), 66 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index cffe1b87528..b77337e01aa 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -280,53 +280,6 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __internal::__optional_kernel_name<_DiagonalsKernelName...>, __internal::__optional_kernel_name<_MergeKernelName...>> { - // Create local accessors for data cache in SLM: - // - one accessor for the first and for the second ranges if _Range1 and _Range2 has the SAME value types; - // - two accessors for the first and for the second ranges if _Range1 and _Range2 has DIFFERENT value types. 
- struct __merge_slm_helper - { - template - static std::size_t - get_data_size(_Range1&& __rng1, _Range2&& __rng2) - { - using _Range1ValueType = typename std::iterator_traits::value_type; - using _Range2ValueType = typename std::iterator_traits::value_type; - - return sizeof(_Range1ValueType) + sizeof(_Range2ValueType); - } - - template - static constexpr auto - create_local_accessors(sycl::handler& __cgh, _Range1&& __rng1, _Range2&& __rng2, - std::size_t __slm_cached_data_size) - { - using _Range1ValueType = typename std::iterator_traits::value_type; - using _Range2ValueType = typename std::iterator_traits::value_type; - - if constexpr (std::is_same_v<_Range1ValueType, _Range2ValueType>) - return std::tuple<__dpl_sycl::__local_accessor<_Range1ValueType>>( - __dpl_sycl::__local_accessor<_Range1ValueType>(2 * __slm_cached_data_size, __cgh)); - else - return std::tuple<__dpl_sycl::__local_accessor<_Range1ValueType>, - __dpl_sycl::__local_accessor<_Range2ValueType>>( - __dpl_sycl::__local_accessor<_Range1ValueType>(__slm_cached_data_size, __cgh), - __dpl_sycl::__local_accessor<_Range2ValueType>(__slm_cached_data_size, __cgh)); - } - - template - static auto - get_local_accessor(AccessorsTuple& __loc_acc_pack, std::size_t __offset = 0) - { - static_assert(std::tuple_size_v == 1 || std::tuple_size_v == 2); - - if constexpr (std::tuple_size_v == 1) - return std::pair(std::get<0>(__loc_acc_pack), __offset); - - else - return std::pair(std::get(__loc_acc_pack), 0); - } - }; - template static void load_data_into_slm_impl(_Range&& __rng, _DataType* __slm, @@ -457,6 +410,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, using _Range2ValueType = typename std::iterator_traits::value_type; static_assert(std::is_same_v<_Range1ValueType, _Range2ValueType>, "In this implementation we can merge only data of the same type"); + using _RangeValueType = _Range1ValueType; + const _IdType __n1 = __rng1.size(); const _IdType __n2 = __rng2.size(); const _IdType __n = __n1 
+ __n2; @@ -472,8 +427,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // Pessimistically only use half of the memory to take into account memory used by compiled kernel const std::size_t __max_slm_size_adj = std::max((std::size_t)__chunk, - std::min((std::size_t)__n, - oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, __merge_slm_helper::get_data_size(__rng1, __rng2)))); + std::min((std::size_t)__n, oneapi::dpl::__internal::__slm_adjusted_work_group_size( + __exec, 2 * sizeof(_RangeValueType)))); // The amount of data must be a multiple of the chunk size. const std::size_t __max_source_data_items_fit_into_slm = __max_slm_size_adj - __max_slm_size_adj % __chunk; @@ -529,7 +484,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, auto __base_diagonals_sp_global_ptr = __base_diagonals_sp_storage_t::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); const std::size_t __slm_cached_data_size = __wi_in_one_wg * __chunk; - auto __loc_acc_pack = __merge_slm_helper::create_local_accessors(__cgh, __rng1, __rng2, __slm_cached_data_size); + __dpl_sycl::__local_accessor<_RangeValueType> __loc_acc(2 * __slm_cached_data_size, __cgh); // Run nd_range parallel_for to process all the data // - each work-group caching source data in SLM and processing diagonals between two base diagonals; @@ -542,12 +497,6 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const std::size_t __local_id = __nd_item.get_local_id(0); // Merge sub-matrix LOCAL diagonal's index const std::size_t __group_linear_id = __nd_item.get_group_linear_id(); // Merge matrix base diagonal's GLOBAL index - _IdType __rng1_wg_data_size = 0; - _IdType __rng2_wg_data_size = 0; - - _Range1ValueType* __rng1_cache_slm = nullptr; - _Range1ValueType* __rng2_cache_slm = nullptr; - // Split points on left anr right base diagonals // - in GLOBAL coordinates const _split_point_t& __sp_base_left_global = __base_diagonals_sp_global_ptr[__group_linear_id]; @@ 
-556,20 +505,15 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, assert(__sp_base_right_global.first >= __sp_base_left_global.first); assert(__sp_base_right_global.second >= __sp_base_left_global.second); - __rng1_wg_data_size = __sp_base_right_global.first - __sp_base_left_global.first; - __rng2_wg_data_size = __sp_base_right_global.second - __sp_base_left_global.second; + _IdType __rng1_wg_data_size = __sp_base_right_global.first - __sp_base_left_global.first; + _IdType __rng2_wg_data_size = __sp_base_right_global.second - __sp_base_left_global.second; + + _RangeValueType* __rng1_cache_slm = std::addressof(__loc_acc[0]); + _RangeValueType* __rng2_cache_slm = std::addressof(__loc_acc[0]) + __rng1_wg_data_size; const bool __need_load_data = __rng1_wg_data_size > 0 || __rng2_wg_data_size > 0; const bool __need_merge_data = __global_linear_id * __chunk < __n; - if (__need_load_data || __need_merge_data) - { - auto [__rng1_loc_acc, __offset_to_slm1] = __merge_slm_helper::template get_local_accessor<0>(__loc_acc_pack); - auto [__rng2_loc_acc, __offset_to_slm2] = __merge_slm_helper::template get_local_accessor<1>(__loc_acc_pack, __rng1_wg_data_size); - __rng1_cache_slm = std::addressof(__rng1_loc_acc[0]) + __offset_to_slm1; - __rng2_cache_slm = std::addressof(__rng2_loc_acc[0]) + __offset_to_slm2; - } - // Cooperative data load from __rng1 to __rng1_cache_slm, from __rng2 to __rng1_cache_slm if (__need_load_data) { From 0e5c0d2c1f7877836a9b20aca3f86d7da4413c4d Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 19 Nov 2024 09:58:49 +0100 Subject: [PATCH 49/80] load_data_into_slm now working only with the same data types too Signed-off-by: Sergey Kopienko --- .../pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 
b77337e01aa..a5a1b429569 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -365,10 +365,10 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, return __wi_for_data_reading; } - template + template static void - load_data_into_slm(_Range1&& __rng1, _DataType1* __slm1, const std::size_t __idx_global_begin1, const std::size_t __idx_global_end1, - _Range2&& __rng2, _DataType2* __slm2, const std::size_t __idx_global_begin2, const std::size_t __idx_global_end2, + load_data_into_slm(_Range&& __rng1, _DataType* __slm1, const std::size_t __idx_global_begin1, const std::size_t __idx_global_end1, + _Range&& __rng2, _DataType* __slm2, const std::size_t __idx_global_begin2, const std::size_t __idx_global_end2, const std::size_t __wi_in_one_wg, const std::size_t __local_id) { // TODO what size of SLM bank we have now? @@ -376,10 +376,13 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, using _Range1ValueType = typename std::iterator_traits::value_type; using _Range2ValueType = typename std::iterator_traits::value_type; + static_assert(std::is_same_v<_Range1ValueType, _Range2ValueType>, "In this implementation we can merge only data of the same type"); + + using _RangeValueType = _Range1ValueType; // Calculate how many work-items should read the part of __rng1 and __rng2 into SLM cache - const std::size_t __wi_for_data_reading1 = __calc_wi_amount_for_data_reading<__slm_bank_size, _Range1ValueType>(__wi_in_one_wg, __idx_global_end1 - __idx_global_begin1); - const std::size_t __wi_for_data_reading2 = __calc_wi_amount_for_data_reading<__slm_bank_size, _Range2ValueType>(__wi_in_one_wg, __idx_global_end2 - __idx_global_begin2); + const std::size_t __wi_for_data_reading1 = __calc_wi_amount_for_data_reading<__slm_bank_size, _RangeValueType>(__wi_in_one_wg, __idx_global_end1 - __idx_global_begin1); + const std::size_t __wi_for_data_reading2 = 
__calc_wi_amount_for_data_reading<__slm_bank_size, _RangeValueType>(__wi_in_one_wg, __idx_global_end2 - __idx_global_begin2); // Now arrange the reading by work-items if (__wi_in_one_wg >= __wi_for_data_reading1 + __wi_for_data_reading2) From 501e58e4b389747d00517679eadd793d797937c0 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 19 Nov 2024 10:04:15 +0100 Subject: [PATCH 50/80] remove __calc_wi_amount_for_data_reading function and it's usage Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 20 +++---------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index a5a1b429569..f6d2296da9e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -350,21 +350,6 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, } } - template - static std::size_t - __calc_wi_amount_for_data_reading(const std::size_t __wi_in_one_wg, const std::size_t __reading_data) - { - std::size_t __wi_for_data_reading = 0; - if (__reading_data > 0) - { - const std::size_t __required_reading_data_per_wi = oneapi::dpl::__internal::__dpl_ceiling_div(__slm_bank_size, sizeof(_RangeValueType)); - - __wi_for_data_reading = std::min(__wi_in_one_wg, oneapi::dpl::__internal::__dpl_ceiling_div(__reading_data, __required_reading_data_per_wi)); - } - - return __wi_for_data_reading; - } - template static void load_data_into_slm(_Range&& __rng1, _DataType* __slm1, const std::size_t __idx_global_begin1, const std::size_t __idx_global_end1, @@ -381,8 +366,9 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, using _RangeValueType = _Range1ValueType; // Calculate how many work-items should read the part of __rng1 and __rng2 into SLM cache - const std::size_t __wi_for_data_reading1 = 
__calc_wi_amount_for_data_reading<__slm_bank_size, _RangeValueType>(__wi_in_one_wg, __idx_global_end1 - __idx_global_begin1); - const std::size_t __wi_for_data_reading2 = __calc_wi_amount_for_data_reading<__slm_bank_size, _RangeValueType>(__wi_in_one_wg, __idx_global_end2 - __idx_global_begin2); + const std::size_t __required_reading_data_per_wi = oneapi::dpl::__internal::__dpl_ceiling_div(__slm_bank_size, sizeof(_RangeValueType)); + const std::size_t __wi_for_data_reading1 = std::min(__wi_in_one_wg, oneapi::dpl::__internal::__dpl_ceiling_div(__idx_global_end1 - __idx_global_begin1, __required_reading_data_per_wi)); + const std::size_t __wi_for_data_reading2 = std::min(__wi_in_one_wg, oneapi::dpl::__internal::__dpl_ceiling_div(__idx_global_end2 - __idx_global_begin2, __required_reading_data_per_wi)); // Now arrange the reading by work-items if (__wi_in_one_wg >= __wi_for_data_reading1 + __wi_for_data_reading2) From ebae70744024dff2ce3432b83033fdbdb07f45b6 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 19 Nov 2024 10:17:16 +0100 Subject: [PATCH 51/80] modify load_data_into_slm Signed-off-by: Sergey Kopienko --- .../hetero/dpcpp/parallel_backend_sycl_merge.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index f6d2296da9e..06093a9f52f 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -365,26 +365,30 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, using _RangeValueType = _Range1ValueType; + const auto __to_read_rng1 = __idx_global_end1 - __idx_global_begin1; + const auto __to_read_rng2 = __idx_global_end2 - __idx_global_begin2; + // Calculate how many work-items should read the part of __rng1 and __rng2 into SLM cache const std::size_t __required_reading_data_per_wi = 
oneapi::dpl::__internal::__dpl_ceiling_div(__slm_bank_size, sizeof(_RangeValueType)); - const std::size_t __wi_for_data_reading1 = std::min(__wi_in_one_wg, oneapi::dpl::__internal::__dpl_ceiling_div(__idx_global_end1 - __idx_global_begin1, __required_reading_data_per_wi)); - const std::size_t __wi_for_data_reading2 = std::min(__wi_in_one_wg, oneapi::dpl::__internal::__dpl_ceiling_div(__idx_global_end2 - __idx_global_begin2, __required_reading_data_per_wi)); + const std::size_t __wi_for_data_reading_all = std::min(__wi_in_one_wg, oneapi::dpl::__internal::__dpl_ceiling_div(__to_read_rng1 + __to_read_rng2, __required_reading_data_per_wi)); + const std::size_t __wi_for_data_reading1 = std::min(__wi_in_one_wg, oneapi::dpl::__internal::__dpl_ceiling_div(__to_read_rng1, __required_reading_data_per_wi)); + const std::size_t __wi_for_data_reading2 = std::min(__wi_in_one_wg, oneapi::dpl::__internal::__dpl_ceiling_div(__to_read_rng2, __required_reading_data_per_wi)); // Now arrange the reading by work-items - if (__wi_in_one_wg >= __wi_for_data_reading1 + __wi_for_data_reading2) + if (__wi_in_one_wg >= __wi_for_data_reading_all) { if (__local_id < __wi_for_data_reading1) { load_data_into_slm_impl(__rng1, __slm1, __idx_global_begin1, __idx_global_end1, __wi_for_data_reading1, __local_id); } - else if (__local_id < __wi_for_data_reading1 + __wi_for_data_reading2) + else if (__local_id < __wi_for_data_reading_all) { // When we reading data from parallel-working work-items, we should reduce the local id of current work-item // because we calculate reeded data size based on this value. 
load_data_into_slm_impl(__rng2, __slm2, __idx_global_begin2, __idx_global_end2, __wi_for_data_reading2, __local_id - __wi_for_data_reading1); } } - else if (__local_id < __wi_for_data_reading1 + __wi_for_data_reading2) + else if (__local_id < __wi_for_data_reading_all) { load_data_into_slm_impl(__rng1, __slm1, __idx_global_begin1, __idx_global_end1, __wi_for_data_reading1, __local_id); load_data_into_slm_impl(__rng2, __slm2, __idx_global_begin2, __idx_global_end2, __wi_for_data_reading2, __local_id); From 6459dac76b68573737289436bff3d22aada2c748 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 19 Nov 2024 12:03:24 +0100 Subject: [PATCH 52/80] @@@ Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 217 ++++++------------ 1 file changed, 72 insertions(+), 145 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 06093a9f52f..0c56f954583 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -39,6 +39,13 @@ using _split_point_t = std::pair<_Index, _Index>; template constexpr _split_point_t<_Index> __zero_split_point{0, 0}; +template +inline _Index __get_index_sum(_Index __idx) +{ + assert(__idx > 0); + return __idx - 1; +} + //Searching for an intersection of a merge matrix (n1, n2) diagonal with the Merge Path to define sub-ranges //to serial merge. 
For example, a merge matrix for [0,1,1,2,3] and [0,0,2,3] is shown below: // 0 1 1 2 3 @@ -121,7 +128,7 @@ __find_start_point(const _Rng1& __rng1, const _Rng2& __rng2, const _Index __i_el { //////////////////////////////////////////////////////////////////////////////////// // Taking into account the specified constraints of the range of processed data - const auto __index_sum = __i_elem - 1; + const auto __index_sum = __get_index_sum(__i_elem); using _IndexSigned = std::make_signed_t<_Index>; @@ -280,125 +287,13 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __internal::__optional_kernel_name<_DiagonalsKernelName...>, __internal::__optional_kernel_name<_MergeKernelName...>> { - template - static void - load_data_into_slm_impl(_Range&& __rng, _DataType* __slm, - std::size_t __idx_global_begin, std::size_t __idx_global_end, - std::size_t __wi_in_one_wg, std::size_t __local_id) - { - // How we load data: - /* - +-------------------+--------------------------------------------------------------+------------+ - | Source data index | Work-items in one work-group | SLM index | - +-------------------+--------------------------------------------------------------+------------+ - | | wi(0) | wi(1) | wi(2) | wi(3) | ... | wi(__wi_in_one_wg - 1) | | <--- __local_id: in which work-item we are - +-------------------+-------+-------+-------+-------+-----+------------------------+------------+ - | rng[0] | | | | | | | | - | rng[1] | + | | | | | | slm[0] | <--- __idx_global_begin \ - | rng[2] | + | | | | | | slm[1] | | SLM bank: write into one SLM bank from one work-item - | rng[3] | + | | | | | | slm[2] | / - | rng[4] | | + | | | | | slm[3] | - | rng[5] | | + | | | | | slm[4] | - | rng[6] | | + | | | | | slm[5] | - | rng[7] | | | + | | | | slm[6] | - | rng[8] | | | + | | | | slm[7] | - | rng[9] | | | + | | | | slm[8] | - | rng[10] | | | | + | | | slm[9] | - | rng[11] | | | | + | | | slm[10] | - | rng[12] | | | | + | | | slm[11] | - | ..... | | | | | +++ | | ... 
| - | rng[M + 1] | | | | | | + | slm[M] | - | rng[M + 2] | | | | | | + | slm[M + 1] | - | rng[M + 3] | | | | | | + | slm[M + 2] | - | rng[M + 4] | | | | | | | slm[M + 3] | - | rng[M + 5] | | | | - | | | | <--- __idx_global_end - | ..... | | | | | --- | | | - | rng[M + M + 1] | | | | | | - | | - +-------------------+--------------------------------------------------------------+------------+ - ^ - | - __local_id - - "+" - load one source data item ito SLM - */ - - const std::size_t __wg_data_size_rng = __idx_global_end - __idx_global_begin; - if (__wg_data_size_rng > 0) - { - // Calculate the size of the current part of merging data per work-item - const std::size_t __loading_data_per_wi = oneapi::dpl::__internal::__dpl_ceiling_div(__rng.size(), __wi_in_one_wg); - assert(__loading_data_per_wi > 0); - - if (__loading_data_per_wi > 1) - { - const auto __slm_idx_begin = __local_id * __loading_data_per_wi; - const auto __slm_idx_end = __slm_idx_begin + __loading_data_per_wi; - - std::size_t __slm_idx = __slm_idx_begin; - std::size_t __rng_idx = __idx_global_begin + __slm_idx; - - _ONEDPL_PRAGMA_UNROLL - for (; __slm_idx < __slm_idx_end && __rng_idx < __idx_global_end; ++__slm_idx, ++__rng_idx) - __slm[__slm_idx] = __rng[__rng_idx]; - } - else - { - const std::size_t __rng_idx = __idx_global_begin + __local_id; - if (__rng_idx < __idx_global_end) - __slm[__local_id] = __rng[__rng_idx]; - } - } - } - - template - static void - load_data_into_slm(_Range&& __rng1, _DataType* __slm1, const std::size_t __idx_global_begin1, const std::size_t __idx_global_end1, - _Range&& __rng2, _DataType* __slm2, const std::size_t __idx_global_begin2, const std::size_t __idx_global_end2, - const std::size_t __wi_in_one_wg, const std::size_t __local_id) - { - // TODO what size of SLM bank we have now? 
- constexpr std::size_t __slm_bank_size = 64; // = 1024; - - using _Range1ValueType = typename std::iterator_traits::value_type; - using _Range2ValueType = typename std::iterator_traits::value_type; - static_assert(std::is_same_v<_Range1ValueType, _Range2ValueType>, "In this implementation we can merge only data of the same type"); - - using _RangeValueType = _Range1ValueType; - - const auto __to_read_rng1 = __idx_global_end1 - __idx_global_begin1; - const auto __to_read_rng2 = __idx_global_end2 - __idx_global_begin2; - - // Calculate how many work-items should read the part of __rng1 and __rng2 into SLM cache - const std::size_t __required_reading_data_per_wi = oneapi::dpl::__internal::__dpl_ceiling_div(__slm_bank_size, sizeof(_RangeValueType)); - const std::size_t __wi_for_data_reading_all = std::min(__wi_in_one_wg, oneapi::dpl::__internal::__dpl_ceiling_div(__to_read_rng1 + __to_read_rng2, __required_reading_data_per_wi)); - const std::size_t __wi_for_data_reading1 = std::min(__wi_in_one_wg, oneapi::dpl::__internal::__dpl_ceiling_div(__to_read_rng1, __required_reading_data_per_wi)); - const std::size_t __wi_for_data_reading2 = std::min(__wi_in_one_wg, oneapi::dpl::__internal::__dpl_ceiling_div(__to_read_rng2, __required_reading_data_per_wi)); - - // Now arrange the reading by work-items - if (__wi_in_one_wg >= __wi_for_data_reading_all) - { - if (__local_id < __wi_for_data_reading1) - { - load_data_into_slm_impl(__rng1, __slm1, __idx_global_begin1, __idx_global_end1, __wi_for_data_reading1, __local_id); - } - else if (__local_id < __wi_for_data_reading_all) - { - // When we reading data from parallel-working work-items, we should reduce the local id of current work-item - // because we calculate reeded data size based on this value. 
- load_data_into_slm_impl(__rng2, __slm2, __idx_global_begin2, __idx_global_end2, __wi_for_data_reading2, __local_id - __wi_for_data_reading1); - } - } - else if (__local_id < __wi_for_data_reading_all) - { - load_data_into_slm_impl(__rng1, __slm1, __idx_global_begin1, __idx_global_end1, __wi_for_data_reading1, __local_id); - load_data_into_slm_impl(__rng2, __slm2, __idx_global_begin2, __idx_global_end2, __wi_for_data_reading2, __local_id); - } - } - template auto operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const { + // TODO what size of SLM bank we have now? + constexpr std::size_t __slm_bank_size = 64; // = 1024; + using _Range1ValueType = typename std::iterator_traits::value_type; using _Range2ValueType = typename std::iterator_traits::value_type; static_assert(std::is_same_v<_Range1ValueType, _Range2ValueType>, "In this implementation we can merge only data of the same type"); @@ -413,10 +308,15 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, _PRINT_INFO_IN_DEBUG_MODE(__exec); + const bool __b_check = __n1 == 521 && __n2 == 260; + // Empirical number of values to process per work-item const _IdType __chunk = __exec.queue().get_device().is_cpu() ? 128 : 8; assert(__chunk > 0); + // The only bank conflicts you need to worry about is in SLM, so I think if your chunk_size * element_size > bank size, then this should be ok. 
+ assert(__chunk * sizeof(_RangeValueType) >= __slm_bank_size); + // Pessimistically only use half of the memory to take into account memory used by compiled kernel const std::size_t __max_slm_size_adj = std::max((std::size_t)__chunk, @@ -490,42 +390,69 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const std::size_t __local_id = __nd_item.get_local_id(0); // Merge sub-matrix LOCAL diagonal's index const std::size_t __group_linear_id = __nd_item.get_group_linear_id(); // Merge matrix base diagonal's GLOBAL index - // Split points on left anr right base diagonals - // - in GLOBAL coordinates - const _split_point_t& __sp_base_left_global = __base_diagonals_sp_global_ptr[__group_linear_id]; - const _split_point_t& __sp_base_right_global = __base_diagonals_sp_global_ptr[__group_linear_id + 1]; - - assert(__sp_base_right_global.first >= __sp_base_left_global.first); - assert(__sp_base_right_global.second >= __sp_base_left_global.second); - - _IdType __rng1_wg_data_size = __sp_base_right_global.first - __sp_base_left_global.first; - _IdType __rng2_wg_data_size = __sp_base_right_global.second - __sp_base_left_global.second; - - _RangeValueType* __rng1_cache_slm = std::addressof(__loc_acc[0]); - _RangeValueType* __rng2_cache_slm = std::addressof(__loc_acc[0]) + __rng1_wg_data_size; - - const bool __need_load_data = __rng1_wg_data_size > 0 || __rng2_wg_data_size > 0; - const bool __need_merge_data = __global_linear_id * __chunk < __n; - - // Cooperative data load from __rng1 to __rng1_cache_slm, from __rng2 to __rng1_cache_slm - if (__need_load_data) - { - load_data_into_slm(__rng1, __rng1_cache_slm, __sp_base_left_global.first, __sp_base_right_global.first, - __rng2, __rng2_cache_slm, __sp_base_left_global.second, __sp_base_right_global.second, - __wi_in_one_wg, __local_id); - } - - // Wait until all the data is loaded - __dpl_sycl::__group_barrier(__nd_item); - // Current diagonal inside of the merge matrix? 
+ const bool __need_merge_data = __global_linear_id * __chunk < __n; if (__need_merge_data) { + // Split points on left anr right base diagonals + // - in GLOBAL coordinates + const _split_point_t& __sp_base_left_global = __base_diagonals_sp_global_ptr[__group_linear_id]; + const _split_point_t& __sp_base_right_global = __base_diagonals_sp_global_ptr[__group_linear_id + 1]; + + assert(__sp_base_right_global.first >= __sp_base_left_global.first); + assert(__sp_base_right_global.second >= __sp_base_left_global.second); + + const _IdType __rng1_wg_data_size = __sp_base_right_global.first - __sp_base_left_global.first; + const _IdType __rng2_wg_data_size = __sp_base_right_global.second - __sp_base_left_global.second; + + _RangeValueType* __rng1_cache_slm = std::addressof(__loc_acc[0]); + _RangeValueType* __rng2_cache_slm = std::addressof(__loc_acc[0]) + __rng1_wg_data_size; + + const bool __need_load_data = __rng1_wg_data_size > 0 || __rng2_wg_data_size > 0; + assert(__need_load_data); + + // Calculate diagonal index + // - in LOCAL coordinates because __rng1_cache_slm and __rng1_cache_slm is work-group SLM cached copy of source data + const _IdType __i_elem = __local_id * __chunk; + if (__i_elem > 0) + { + const auto __index_sum = __get_index_sum(__i_elem); + //assert(__index_sum >= __chunk); + + for (_IdType __idx = __index_sum - __chunk; __idx < __index_sum && __sp_base_left_global.first + __idx < __sp_base_right_global.first; ++__idx) + __rng1_cache_slm[__idx] = __rng1[__sp_base_left_global.first + __idx]; + + for (_IdType __idx = __index_sum - __chunk; __idx < __index_sum && __sp_base_left_global.second + __idx < __sp_base_right_global.second; ++__idx) + __rng2_cache_slm[__idx] = __rng2[__sp_base_left_global.second + __idx]; + } + else + { + assert(__i_elem == 0); + + if (__wi_in_one_wg > 1) + { + if (__rng1_wg_data_size > 0) + __rng1_cache_slm[0] = __rng1[__sp_base_left_global.first]; + + if (__rng2_wg_data_size > 0) + __rng2_cache_slm[0] = 
__rng2[__sp_base_left_global.second]; + } + else + { + assert(__wi_in_one_wg == 1); + for (_IdType __idx = 0; __idx < __rng1_wg_data_size; ++__idx) + __rng1_cache_slm[__idx] = __rng1[__sp_base_left_global.first + __idx]; + + for (_IdType __idx = 0; __idx < __rng2_wg_data_size; ++__idx) + __rng2_cache_slm[__idx] = __rng2[__sp_base_left_global.second + __idx]; + } + } + // Find split point in LOCAL coordinates // - bottom-right split point describes the size of current area between two base diagonals. const _split_point_t<_IdType> __sp_local = __find_start_point( __rng1_cache_slm, __rng2_cache_slm, // SLM cached copy of merging data - (_IdType)(__local_id * __chunk), // __i_elem in LOCAL coordinates because __rng1_cache_slm and __rng1_cache_slm is work-group SLM cached copy of source data + __i_elem, // __i_elem in LOCAL coordinates because __rng1_cache_slm and __rng1_cache_slm is work-group SLM cached copy of source data __rng1_wg_data_size, __rng2_wg_data_size, // size of rng1 and rng2 __comp); From e615d65fbac26cdcb6b33d23bc0305c4bfd4e0eb Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 19 Nov 2024 12:06:55 +0100 Subject: [PATCH 53/80] Revert "@@@" This reverts commit 6459dac76b68573737289436bff3d22aada2c748. 
--- .../dpcpp/parallel_backend_sycl_merge.h | 217 ++++++++++++------ 1 file changed, 145 insertions(+), 72 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 0c56f954583..06093a9f52f 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -39,13 +39,6 @@ using _split_point_t = std::pair<_Index, _Index>; template constexpr _split_point_t<_Index> __zero_split_point{0, 0}; -template -inline _Index __get_index_sum(_Index __idx) -{ - assert(__idx > 0); - return __idx - 1; -} - //Searching for an intersection of a merge matrix (n1, n2) diagonal with the Merge Path to define sub-ranges //to serial merge. For example, a merge matrix for [0,1,1,2,3] and [0,0,2,3] is shown below: // 0 1 1 2 3 @@ -128,7 +121,7 @@ __find_start_point(const _Rng1& __rng1, const _Rng2& __rng2, const _Index __i_el { //////////////////////////////////////////////////////////////////////////////////// // Taking into account the specified constraints of the range of processed data - const auto __index_sum = __get_index_sum(__i_elem); + const auto __index_sum = __i_elem - 1; using _IndexSigned = std::make_signed_t<_Index>; @@ -287,13 +280,125 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __internal::__optional_kernel_name<_DiagonalsKernelName...>, __internal::__optional_kernel_name<_MergeKernelName...>> { + template + static void + load_data_into_slm_impl(_Range&& __rng, _DataType* __slm, + std::size_t __idx_global_begin, std::size_t __idx_global_end, + std::size_t __wi_in_one_wg, std::size_t __local_id) + { + // How we load data: + /* + +-------------------+--------------------------------------------------------------+------------+ + | Source data index | Work-items in one work-group | SLM index | + 
+-------------------+--------------------------------------------------------------+------------+ + | | wi(0) | wi(1) | wi(2) | wi(3) | ... | wi(__wi_in_one_wg - 1) | | <--- __local_id: in which work-item we are + +-------------------+-------+-------+-------+-------+-----+------------------------+------------+ + | rng[0] | | | | | | | | + | rng[1] | + | | | | | | slm[0] | <--- __idx_global_begin \ + | rng[2] | + | | | | | | slm[1] | | SLM bank: write into one SLM bank from one work-item + | rng[3] | + | | | | | | slm[2] | / + | rng[4] | | + | | | | | slm[3] | + | rng[5] | | + | | | | | slm[4] | + | rng[6] | | + | | | | | slm[5] | + | rng[7] | | | + | | | | slm[6] | + | rng[8] | | | + | | | | slm[7] | + | rng[9] | | | + | | | | slm[8] | + | rng[10] | | | | + | | | slm[9] | + | rng[11] | | | | + | | | slm[10] | + | rng[12] | | | | + | | | slm[11] | + | ..... | | | | | +++ | | ... | + | rng[M + 1] | | | | | | + | slm[M] | + | rng[M + 2] | | | | | | + | slm[M + 1] | + | rng[M + 3] | | | | | | + | slm[M + 2] | + | rng[M + 4] | | | | | | | slm[M + 3] | + | rng[M + 5] | | | | - | | | | <--- __idx_global_end + | ..... 
| | | | | --- | | | + | rng[M + M + 1] | | | | | | - | | + +-------------------+--------------------------------------------------------------+------------+ + ^ + | + __local_id + + "+" - load one source data item ito SLM + */ + + const std::size_t __wg_data_size_rng = __idx_global_end - __idx_global_begin; + if (__wg_data_size_rng > 0) + { + // Calculate the size of the current part of merging data per work-item + const std::size_t __loading_data_per_wi = oneapi::dpl::__internal::__dpl_ceiling_div(__rng.size(), __wi_in_one_wg); + assert(__loading_data_per_wi > 0); + + if (__loading_data_per_wi > 1) + { + const auto __slm_idx_begin = __local_id * __loading_data_per_wi; + const auto __slm_idx_end = __slm_idx_begin + __loading_data_per_wi; + + std::size_t __slm_idx = __slm_idx_begin; + std::size_t __rng_idx = __idx_global_begin + __slm_idx; + + _ONEDPL_PRAGMA_UNROLL + for (; __slm_idx < __slm_idx_end && __rng_idx < __idx_global_end; ++__slm_idx, ++__rng_idx) + __slm[__slm_idx] = __rng[__rng_idx]; + } + else + { + const std::size_t __rng_idx = __idx_global_begin + __local_id; + if (__rng_idx < __idx_global_end) + __slm[__local_id] = __rng[__rng_idx]; + } + } + } + + template + static void + load_data_into_slm(_Range&& __rng1, _DataType* __slm1, const std::size_t __idx_global_begin1, const std::size_t __idx_global_end1, + _Range&& __rng2, _DataType* __slm2, const std::size_t __idx_global_begin2, const std::size_t __idx_global_end2, + const std::size_t __wi_in_one_wg, const std::size_t __local_id) + { + // TODO what size of SLM bank we have now? 
+ constexpr std::size_t __slm_bank_size = 64; // = 1024; + + using _Range1ValueType = typename std::iterator_traits::value_type; + using _Range2ValueType = typename std::iterator_traits::value_type; + static_assert(std::is_same_v<_Range1ValueType, _Range2ValueType>, "In this implementation we can merge only data of the same type"); + + using _RangeValueType = _Range1ValueType; + + const auto __to_read_rng1 = __idx_global_end1 - __idx_global_begin1; + const auto __to_read_rng2 = __idx_global_end2 - __idx_global_begin2; + + // Calculate how many work-items should read the part of __rng1 and __rng2 into SLM cache + const std::size_t __required_reading_data_per_wi = oneapi::dpl::__internal::__dpl_ceiling_div(__slm_bank_size, sizeof(_RangeValueType)); + const std::size_t __wi_for_data_reading_all = std::min(__wi_in_one_wg, oneapi::dpl::__internal::__dpl_ceiling_div(__to_read_rng1 + __to_read_rng2, __required_reading_data_per_wi)); + const std::size_t __wi_for_data_reading1 = std::min(__wi_in_one_wg, oneapi::dpl::__internal::__dpl_ceiling_div(__to_read_rng1, __required_reading_data_per_wi)); + const std::size_t __wi_for_data_reading2 = std::min(__wi_in_one_wg, oneapi::dpl::__internal::__dpl_ceiling_div(__to_read_rng2, __required_reading_data_per_wi)); + + // Now arrange the reading by work-items + if (__wi_in_one_wg >= __wi_for_data_reading_all) + { + if (__local_id < __wi_for_data_reading1) + { + load_data_into_slm_impl(__rng1, __slm1, __idx_global_begin1, __idx_global_end1, __wi_for_data_reading1, __local_id); + } + else if (__local_id < __wi_for_data_reading_all) + { + // When we reading data from parallel-working work-items, we should reduce the local id of current work-item + // because we calculate reeded data size based on this value. 
+ load_data_into_slm_impl(__rng2, __slm2, __idx_global_begin2, __idx_global_end2, __wi_for_data_reading2, __local_id - __wi_for_data_reading1); + } + } + else if (__local_id < __wi_for_data_reading_all) + { + load_data_into_slm_impl(__rng1, __slm1, __idx_global_begin1, __idx_global_end1, __wi_for_data_reading1, __local_id); + load_data_into_slm_impl(__rng2, __slm2, __idx_global_begin2, __idx_global_end2, __wi_for_data_reading2, __local_id); + } + } + template auto operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const { - // TODO what size of SLM bank we have now? - constexpr std::size_t __slm_bank_size = 64; // = 1024; - using _Range1ValueType = typename std::iterator_traits::value_type; using _Range2ValueType = typename std::iterator_traits::value_type; static_assert(std::is_same_v<_Range1ValueType, _Range2ValueType>, "In this implementation we can merge only data of the same type"); @@ -308,15 +413,10 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, _PRINT_INFO_IN_DEBUG_MODE(__exec); - const bool __b_check = __n1 == 521 && __n2 == 260; - // Empirical number of values to process per work-item const _IdType __chunk = __exec.queue().get_device().is_cpu() ? 128 : 8; assert(__chunk > 0); - // The only bank conflicts you need to worry about is in SLM, so I think if your chunk_size * element_size > bank size, then this should be ok. 
- assert(__chunk * sizeof(_RangeValueType) >= __slm_bank_size); - // Pessimistically only use half of the memory to take into account memory used by compiled kernel const std::size_t __max_slm_size_adj = std::max((std::size_t)__chunk, @@ -390,69 +490,42 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const std::size_t __local_id = __nd_item.get_local_id(0); // Merge sub-matrix LOCAL diagonal's index const std::size_t __group_linear_id = __nd_item.get_group_linear_id(); // Merge matrix base diagonal's GLOBAL index - // Current diagonal inside of the merge matrix? + // Split points on left anr right base diagonals + // - in GLOBAL coordinates + const _split_point_t& __sp_base_left_global = __base_diagonals_sp_global_ptr[__group_linear_id]; + const _split_point_t& __sp_base_right_global = __base_diagonals_sp_global_ptr[__group_linear_id + 1]; + + assert(__sp_base_right_global.first >= __sp_base_left_global.first); + assert(__sp_base_right_global.second >= __sp_base_left_global.second); + + _IdType __rng1_wg_data_size = __sp_base_right_global.first - __sp_base_left_global.first; + _IdType __rng2_wg_data_size = __sp_base_right_global.second - __sp_base_left_global.second; + + _RangeValueType* __rng1_cache_slm = std::addressof(__loc_acc[0]); + _RangeValueType* __rng2_cache_slm = std::addressof(__loc_acc[0]) + __rng1_wg_data_size; + + const bool __need_load_data = __rng1_wg_data_size > 0 || __rng2_wg_data_size > 0; const bool __need_merge_data = __global_linear_id * __chunk < __n; - if (__need_merge_data) + + // Cooperative data load from __rng1 to __rng1_cache_slm, from __rng2 to __rng1_cache_slm + if (__need_load_data) { - // Split points on left anr right base diagonals - // - in GLOBAL coordinates - const _split_point_t& __sp_base_left_global = __base_diagonals_sp_global_ptr[__group_linear_id]; - const _split_point_t& __sp_base_right_global = __base_diagonals_sp_global_ptr[__group_linear_id + 1]; - - assert(__sp_base_right_global.first >= 
__sp_base_left_global.first); - assert(__sp_base_right_global.second >= __sp_base_left_global.second); - - const _IdType __rng1_wg_data_size = __sp_base_right_global.first - __sp_base_left_global.first; - const _IdType __rng2_wg_data_size = __sp_base_right_global.second - __sp_base_left_global.second; - - _RangeValueType* __rng1_cache_slm = std::addressof(__loc_acc[0]); - _RangeValueType* __rng2_cache_slm = std::addressof(__loc_acc[0]) + __rng1_wg_data_size; - - const bool __need_load_data = __rng1_wg_data_size > 0 || __rng2_wg_data_size > 0; - assert(__need_load_data); - - // Calculate diagonal index - // - in LOCAL coordinates because __rng1_cache_slm and __rng1_cache_slm is work-group SLM cached copy of source data - const _IdType __i_elem = __local_id * __chunk; - if (__i_elem > 0) - { - const auto __index_sum = __get_index_sum(__i_elem); - //assert(__index_sum >= __chunk); - - for (_IdType __idx = __index_sum - __chunk; __idx < __index_sum && __sp_base_left_global.first + __idx < __sp_base_right_global.first; ++__idx) - __rng1_cache_slm[__idx] = __rng1[__sp_base_left_global.first + __idx]; - - for (_IdType __idx = __index_sum - __chunk; __idx < __index_sum && __sp_base_left_global.second + __idx < __sp_base_right_global.second; ++__idx) - __rng2_cache_slm[__idx] = __rng2[__sp_base_left_global.second + __idx]; - } - else - { - assert(__i_elem == 0); - - if (__wi_in_one_wg > 1) - { - if (__rng1_wg_data_size > 0) - __rng1_cache_slm[0] = __rng1[__sp_base_left_global.first]; - - if (__rng2_wg_data_size > 0) - __rng2_cache_slm[0] = __rng2[__sp_base_left_global.second]; - } - else - { - assert(__wi_in_one_wg == 1); - for (_IdType __idx = 0; __idx < __rng1_wg_data_size; ++__idx) - __rng1_cache_slm[__idx] = __rng1[__sp_base_left_global.first + __idx]; - - for (_IdType __idx = 0; __idx < __rng2_wg_data_size; ++__idx) - __rng2_cache_slm[__idx] = __rng2[__sp_base_left_global.second + __idx]; - } - } + load_data_into_slm(__rng1, __rng1_cache_slm, 
__sp_base_left_global.first, __sp_base_right_global.first, + __rng2, __rng2_cache_slm, __sp_base_left_global.second, __sp_base_right_global.second, + __wi_in_one_wg, __local_id); + } + + // Wait until all the data is loaded + __dpl_sycl::__group_barrier(__nd_item); + // Current diagonal inside of the merge matrix? + if (__need_merge_data) + { // Find split point in LOCAL coordinates // - bottom-right split point describes the size of current area between two base diagonals. const _split_point_t<_IdType> __sp_local = __find_start_point( __rng1_cache_slm, __rng2_cache_slm, // SLM cached copy of merging data - __i_elem, // __i_elem in LOCAL coordinates because __rng1_cache_slm and __rng1_cache_slm is work-group SLM cached copy of source data + (_IdType)(__local_id * __chunk), // __i_elem in LOCAL coordinates because __rng1_cache_slm and __rng1_cache_slm is work-group SLM cached copy of source data __rng1_wg_data_size, __rng2_wg_data_size, // size of rng1 and rng2 __comp); From 38ad9f299ff0537c622faf6e2c4528cd1cdecd6a Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 19 Nov 2024 12:16:03 +0100 Subject: [PATCH 54/80] remove load_data_into_slm_impl and etc. 
Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 139 ++---------------- 1 file changed, 12 insertions(+), 127 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 06093a9f52f..efccf26e581 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -280,121 +280,6 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __internal::__optional_kernel_name<_DiagonalsKernelName...>, __internal::__optional_kernel_name<_MergeKernelName...>> { - template - static void - load_data_into_slm_impl(_Range&& __rng, _DataType* __slm, - std::size_t __idx_global_begin, std::size_t __idx_global_end, - std::size_t __wi_in_one_wg, std::size_t __local_id) - { - // How we load data: - /* - +-------------------+--------------------------------------------------------------+------------+ - | Source data index | Work-items in one work-group | SLM index | - +-------------------+--------------------------------------------------------------+------------+ - | | wi(0) | wi(1) | wi(2) | wi(3) | ... | wi(__wi_in_one_wg - 1) | | <--- __local_id: in which work-item we are - +-------------------+-------+-------+-------+-------+-----+------------------------+------------+ - | rng[0] | | | | | | | | - | rng[1] | + | | | | | | slm[0] | <--- __idx_global_begin \ - | rng[2] | + | | | | | | slm[1] | | SLM bank: write into one SLM bank from one work-item - | rng[3] | + | | | | | | slm[2] | / - | rng[4] | | + | | | | | slm[3] | - | rng[5] | | + | | | | | slm[4] | - | rng[6] | | + | | | | | slm[5] | - | rng[7] | | | + | | | | slm[6] | - | rng[8] | | | + | | | | slm[7] | - | rng[9] | | | + | | | | slm[8] | - | rng[10] | | | | + | | | slm[9] | - | rng[11] | | | | + | | | slm[10] | - | rng[12] | | | | + | | | slm[11] | - | ..... | | | | | +++ | | ... 
| - | rng[M + 1] | | | | | | + | slm[M] | - | rng[M + 2] | | | | | | + | slm[M + 1] | - | rng[M + 3] | | | | | | + | slm[M + 2] | - | rng[M + 4] | | | | | | | slm[M + 3] | - | rng[M + 5] | | | | - | | | | <--- __idx_global_end - | ..... | | | | | --- | | | - | rng[M + M + 1] | | | | | | - | | - +-------------------+--------------------------------------------------------------+------------+ - ^ - | - __local_id - - "+" - load one source data item ito SLM - */ - - const std::size_t __wg_data_size_rng = __idx_global_end - __idx_global_begin; - if (__wg_data_size_rng > 0) - { - // Calculate the size of the current part of merging data per work-item - const std::size_t __loading_data_per_wi = oneapi::dpl::__internal::__dpl_ceiling_div(__rng.size(), __wi_in_one_wg); - assert(__loading_data_per_wi > 0); - - if (__loading_data_per_wi > 1) - { - const auto __slm_idx_begin = __local_id * __loading_data_per_wi; - const auto __slm_idx_end = __slm_idx_begin + __loading_data_per_wi; - - std::size_t __slm_idx = __slm_idx_begin; - std::size_t __rng_idx = __idx_global_begin + __slm_idx; - - _ONEDPL_PRAGMA_UNROLL - for (; __slm_idx < __slm_idx_end && __rng_idx < __idx_global_end; ++__slm_idx, ++__rng_idx) - __slm[__slm_idx] = __rng[__rng_idx]; - } - else - { - const std::size_t __rng_idx = __idx_global_begin + __local_id; - if (__rng_idx < __idx_global_end) - __slm[__local_id] = __rng[__rng_idx]; - } - } - } - - template - static void - load_data_into_slm(_Range&& __rng1, _DataType* __slm1, const std::size_t __idx_global_begin1, const std::size_t __idx_global_end1, - _Range&& __rng2, _DataType* __slm2, const std::size_t __idx_global_begin2, const std::size_t __idx_global_end2, - const std::size_t __wi_in_one_wg, const std::size_t __local_id) - { - // TODO what size of SLM bank we have now? 
- constexpr std::size_t __slm_bank_size = 64; // = 1024; - - using _Range1ValueType = typename std::iterator_traits::value_type; - using _Range2ValueType = typename std::iterator_traits::value_type; - static_assert(std::is_same_v<_Range1ValueType, _Range2ValueType>, "In this implementation we can merge only data of the same type"); - - using _RangeValueType = _Range1ValueType; - - const auto __to_read_rng1 = __idx_global_end1 - __idx_global_begin1; - const auto __to_read_rng2 = __idx_global_end2 - __idx_global_begin2; - - // Calculate how many work-items should read the part of __rng1 and __rng2 into SLM cache - const std::size_t __required_reading_data_per_wi = oneapi::dpl::__internal::__dpl_ceiling_div(__slm_bank_size, sizeof(_RangeValueType)); - const std::size_t __wi_for_data_reading_all = std::min(__wi_in_one_wg, oneapi::dpl::__internal::__dpl_ceiling_div(__to_read_rng1 + __to_read_rng2, __required_reading_data_per_wi)); - const std::size_t __wi_for_data_reading1 = std::min(__wi_in_one_wg, oneapi::dpl::__internal::__dpl_ceiling_div(__to_read_rng1, __required_reading_data_per_wi)); - const std::size_t __wi_for_data_reading2 = std::min(__wi_in_one_wg, oneapi::dpl::__internal::__dpl_ceiling_div(__to_read_rng2, __required_reading_data_per_wi)); - - // Now arrange the reading by work-items - if (__wi_in_one_wg >= __wi_for_data_reading_all) - { - if (__local_id < __wi_for_data_reading1) - { - load_data_into_slm_impl(__rng1, __slm1, __idx_global_begin1, __idx_global_end1, __wi_for_data_reading1, __local_id); - } - else if (__local_id < __wi_for_data_reading_all) - { - // When we reading data from parallel-working work-items, we should reduce the local id of current work-item - // because we calculate reeded data size based on this value. 
- load_data_into_slm_impl(__rng2, __slm2, __idx_global_begin2, __idx_global_end2, __wi_for_data_reading2, __local_id - __wi_for_data_reading1); - } - } - else if (__local_id < __wi_for_data_reading_all) - { - load_data_into_slm_impl(__rng1, __slm1, __idx_global_begin1, __idx_global_end1, __wi_for_data_reading1, __local_id); - load_data_into_slm_impl(__rng2, __slm2, __idx_global_begin2, __idx_global_end2, __wi_for_data_reading2, __local_id); - } - } - template auto operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const @@ -498,34 +383,34 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, assert(__sp_base_right_global.first >= __sp_base_left_global.first); assert(__sp_base_right_global.second >= __sp_base_left_global.second); - _IdType __rng1_wg_data_size = __sp_base_right_global.first - __sp_base_left_global.first; - _IdType __rng2_wg_data_size = __sp_base_right_global.second - __sp_base_left_global.second; + const _IdType __rng1_wg_data_size = __sp_base_right_global.first - __sp_base_left_global.first; + const _IdType __rng2_wg_data_size = __sp_base_right_global.second - __sp_base_left_global.second; _RangeValueType* __rng1_cache_slm = std::addressof(__loc_acc[0]); _RangeValueType* __rng2_cache_slm = std::addressof(__loc_acc[0]) + __rng1_wg_data_size; - const bool __need_load_data = __rng1_wg_data_size > 0 || __rng2_wg_data_size > 0; - const bool __need_merge_data = __global_linear_id * __chunk < __n; + // Calculate __i_elem in LOCAL coordinates because __rng1_cache_slm and __rng1_cache_slm is work-group SLM cached copy of source data + const _IdType __i_elem = __local_id * __chunk; // Cooperative data load from __rng1 to __rng1_cache_slm, from __rng2 to __rng1_cache_slm - if (__need_load_data) - { - load_data_into_slm(__rng1, __rng1_cache_slm, __sp_base_left_global.first, __sp_base_right_global.first, - __rng2, __rng2_cache_slm, __sp_base_left_global.second, __sp_base_right_global.second, - 
__wi_in_one_wg, __local_id); - } + _ONEDPL_PRAGMA_UNROLL + for (_IdType __idx = __i_elem; __idx < __i_elem + __chunk && __sp_base_left_global.first + __idx < __sp_base_right_global.first; ++__idx) + __rng1_cache_slm[__idx] = __rng1[__sp_base_left_global.first + __idx]; + _ONEDPL_PRAGMA_UNROLL + for (_IdType __idx = __i_elem; __idx < __i_elem + __chunk && __sp_base_left_global.second + __idx < __sp_base_right_global.second; ++__idx) + __rng2_cache_slm[__idx] = __rng2[__sp_base_left_global.second + __idx]; // Wait until all the data is loaded __dpl_sycl::__group_barrier(__nd_item); // Current diagonal inside of the merge matrix? - if (__need_merge_data) + if (__global_linear_id * __chunk < __n) { // Find split point in LOCAL coordinates // - bottom-right split point describes the size of current area between two base diagonals. const _split_point_t<_IdType> __sp_local = __find_start_point( __rng1_cache_slm, __rng2_cache_slm, // SLM cached copy of merging data - (_IdType)(__local_id * __chunk), // __i_elem in LOCAL coordinates because __rng1_cache_slm and __rng1_cache_slm is work-group SLM cached copy of source data + __i_elem, // __i_elem in LOCAL coordinates because __rng1_cache_slm and __rng1_cache_slm is work-group SLM cached copy of source data __rng1_wg_data_size, __rng2_wg_data_size, // size of rng1 and rng2 __comp); From f9df4d471c93e2689e616e96682b4d9874dae7d1 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 19 Nov 2024 12:28:59 +0100 Subject: [PATCH 55/80] restore __parallel_merge_submitter call Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index efccf26e581..31959de347a 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ 
b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -455,7 +455,7 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy constexpr bool __same_merge_types = std::is_same_v<_Range1ValueType, _Range2ValueType>; const std::size_t __n = __rng1.size() + __rng2.size(); - if (false) //if (__n < __starting_size_limit_for_large_submitter || !__same_merge_types) + if (__n < __starting_size_limit_for_large_submitter || !__same_merge_types) { static_assert(__starting_size_limit_for_large_submitter < std::numeric_limits::max()); From 04feeb975eff0ded56dab64a4deb604f6ad42c59 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 19 Nov 2024 12:30:14 +0100 Subject: [PATCH 56/80] call __parallel_merge_submitter_large for 1Mb of merging data and more Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 31959de347a..1a524f8321e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -447,7 +447,7 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy { using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; - constexpr std::size_t __starting_size_limit_for_large_submitter = 16 * 1'048'576; // 4 Mb + constexpr std::size_t __starting_size_limit_for_large_submitter = 1 * 1'048'576; // 1 Mb using _Range1ValueType = typename std::iterator_traits::value_type; using _Range2ValueType = typename std::iterator_traits::value_type; From eb2de959601f7f81d4f4cf9b7c5129a81869f8d3 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 19 Nov 2024 13:05:34 +0100 Subject: [PATCH 57/80] improvement of for-loop in loading data into SLM cache 
Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 1a524f8321e..2230851f5ce 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -391,13 +391,14 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // Calculate __i_elem in LOCAL coordinates because __rng1_cache_slm and __rng1_cache_slm is work-group SLM cached copy of source data const _IdType __i_elem = __local_id * __chunk; + const _IdType __i_elem_next = (__local_id + 1) * __chunk; // Cooperative data load from __rng1 to __rng1_cache_slm, from __rng2 to __rng1_cache_slm _ONEDPL_PRAGMA_UNROLL - for (_IdType __idx = __i_elem; __idx < __i_elem + __chunk && __sp_base_left_global.first + __idx < __sp_base_right_global.first; ++__idx) + for (_IdType __idx = __i_elem; __idx < __i_elem_next && __idx < __rng1_wg_data_size; ++__idx) __rng1_cache_slm[__idx] = __rng1[__sp_base_left_global.first + __idx]; _ONEDPL_PRAGMA_UNROLL - for (_IdType __idx = __i_elem; __idx < __i_elem + __chunk && __sp_base_left_global.second + __idx < __sp_base_right_global.second; ++__idx) + for (_IdType __idx = __i_elem; __idx < __i_elem_next && __idx < __rng2_wg_data_size; ++__idx) __rng2_cache_slm[__idx] = __rng2[__sp_base_left_global.second + __idx]; // Wait until all the data is loaded From 3ec199d96aa7e0c6e3852ba1e7a51369ff6b9209 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 19 Nov 2024 13:14:22 +0100 Subject: [PATCH 58/80] improvement of for-loop in loading data into SLM cache Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git 
a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 2230851f5ce..310eb058e88 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -394,11 +394,14 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const _IdType __i_elem_next = (__local_id + 1) * __chunk; // Cooperative data load from __rng1 to __rng1_cache_slm, from __rng2 to __rng1_cache_slm + _IdType __idx_end = std::min(__i_elem_next, __rng1_wg_data_size); _ONEDPL_PRAGMA_UNROLL - for (_IdType __idx = __i_elem; __idx < __i_elem_next && __idx < __rng1_wg_data_size; ++__idx) + for (_IdType __idx = __i_elem; __idx < __idx_end; ++__idx) __rng1_cache_slm[__idx] = __rng1[__sp_base_left_global.first + __idx]; + + __idx_end = std::min(__i_elem_next, __rng2_wg_data_size); _ONEDPL_PRAGMA_UNROLL - for (_IdType __idx = __i_elem; __idx < __i_elem_next && __idx < __rng2_wg_data_size; ++__idx) + for (_IdType __idx = __i_elem; __idx < __idx_end; ++__idx) __rng2_cache_slm[__idx] = __rng2[__sp_base_left_global.second + __idx]; // Wait until all the data is loaded From fde1797e45662c2c8c04e5523b8a157d3426ea30 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 19 Nov 2024 13:22:52 +0100 Subject: [PATCH 59/80] Revert "include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - increate chunk size on GPU to 8" This reverts commit 1b5f0a7b9bc4f401155d4a51ed5db4bbdc4f982f. 
--- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 310eb058e88..a55e3ef7636 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -299,7 +299,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, _PRINT_INFO_IN_DEBUG_MODE(__exec); // Empirical number of values to process per work-item - const _IdType __chunk = __exec.queue().get_device().is_cpu() ? 128 : 8; + const _IdType __chunk = __exec.queue().get_device().is_cpu() ? 128 : 4; assert(__chunk > 0); // Pessimistically only use half of the memory to take into account memory used by compiled kernel From 5ba4cd59a093dfa6bd6018d84154937bbb3747f0 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 19 Nov 2024 15:29:00 +0100 Subject: [PATCH 60/80] rewrite cooperative data load into SLM Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 46 ++++++++++++------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index a55e3ef7636..c80ad29ef38 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -385,24 +385,38 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const _IdType __rng1_wg_data_size = __sp_base_right_global.first - __sp_base_left_global.first; const _IdType __rng2_wg_data_size = __sp_base_right_global.second - __sp_base_left_global.second; + const _IdType __rng_wg_data_size = __rng1_wg_data_size + __rng2_wg_data_size; _RangeValueType* __rng1_cache_slm = std::addressof(__loc_acc[0]); 
_RangeValueType* __rng2_cache_slm = std::addressof(__loc_acc[0]) + __rng1_wg_data_size; - // Calculate __i_elem in LOCAL coordinates because __rng1_cache_slm and __rng1_cache_slm is work-group SLM cached copy of source data - const _IdType __i_elem = __local_id * __chunk; - const _IdType __i_elem_next = (__local_id + 1) * __chunk; - - // Cooperative data load from __rng1 to __rng1_cache_slm, from __rng2 to __rng1_cache_slm - _IdType __idx_end = std::min(__i_elem_next, __rng1_wg_data_size); - _ONEDPL_PRAGMA_UNROLL - for (_IdType __idx = __i_elem; __idx < __idx_end; ++__idx) - __rng1_cache_slm[__idx] = __rng1[__sp_base_left_global.first + __idx]; - - __idx_end = std::min(__i_elem_next, __rng2_wg_data_size); - _ONEDPL_PRAGMA_UNROLL - for (_IdType __idx = __i_elem; __idx < __idx_end; ++__idx) - __rng2_cache_slm[__idx] = __rng2[__sp_base_left_global.second + __idx]; + const std::size_t __chunk_of_data_reading = oneapi::dpl::__internal::__dpl_ceiling_div(__rng_wg_data_size, __wi_in_one_wg); + const std::size_t __idx_begin = __local_id * __chunk_of_data_reading; + if (__idx_begin < __rng_wg_data_size) + { + const _IdType __idx_end = std::min(__idx_begin + __chunk_of_data_reading, (std::size_t)__rng_wg_data_size); + + // Cooperative data load from __rng1 to __rng1_cache_slm + if (__idx_begin < __rng1_wg_data_size) + { + const _IdType __idx_begin_rng1 = __idx_begin; + const _IdType __idx_end_rng1 = std::min(__idx_end, __rng1_wg_data_size); + _ONEDPL_PRAGMA_UNROLL + for (_IdType __idx = __idx_begin_rng1; __idx < __idx_end_rng1; ++__idx) + __rng1_cache_slm[__idx] = __rng1[__sp_base_left_global.first + __idx]; + } + + // Cooperative data load from __rng2 to __rng1_cache_slm + if (__idx_end > __rng1_wg_data_size) + { + const _IdType __idx_begin_rng2 = 0; + const _IdType __idx_end_rng2 = __idx_end - __rng1_wg_data_size; + + _ONEDPL_PRAGMA_UNROLL + for (_IdType __idx = __idx_begin_rng2; __idx < __idx_end_rng2; ++__idx) + __rng2_cache_slm[__idx] = 
__rng2[__sp_base_left_global.second + __idx]; + } + } // Wait until all the data is loaded __dpl_sycl::__group_barrier(__nd_item); @@ -414,7 +428,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // - bottom-right split point describes the size of current area between two base diagonals. const _split_point_t<_IdType> __sp_local = __find_start_point( __rng1_cache_slm, __rng2_cache_slm, // SLM cached copy of merging data - __i_elem, // __i_elem in LOCAL coordinates because __rng1_cache_slm and __rng1_cache_slm is work-group SLM cached copy of source data + (_IdType)(__local_id * __chunk), // __i_elem in LOCAL coordinates because __rng1_cache_slm and __rng1_cache_slm is work-group SLM cached copy of source data __rng1_wg_data_size, __rng2_wg_data_size, // size of rng1 and rng2 __comp); @@ -459,7 +473,7 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy constexpr bool __same_merge_types = std::is_same_v<_Range1ValueType, _Range2ValueType>; const std::size_t __n = __rng1.size() + __rng2.size(); - if (__n < __starting_size_limit_for_large_submitter || !__same_merge_types) + if (false)//if (__n < __starting_size_limit_for_large_submitter || !__same_merge_types) { static_assert(__starting_size_limit_for_large_submitter < std::numeric_limits::max()); From 28e2e374f0f879160070c08f5fccee0053aec47b Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 19 Nov 2024 15:55:31 +0100 Subject: [PATCH 61/80] evalueate __chunk_of_data_reading through SLM bank size Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index c80ad29ef38..9969adacd61 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ 
b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -390,7 +390,11 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, _RangeValueType* __rng1_cache_slm = std::addressof(__loc_acc[0]); _RangeValueType* __rng2_cache_slm = std::addressof(__loc_acc[0]) + __rng1_wg_data_size; - const std::size_t __chunk_of_data_reading = oneapi::dpl::__internal::__dpl_ceiling_div(__rng_wg_data_size, __wi_in_one_wg); + constexpr std::size_t __slm_bank_size = 32; + + const std::size_t __chunk_of_data_reading = std::max( + oneapi::dpl::__internal::__dpl_ceiling_div(__rng_wg_data_size, __wi_in_one_wg), + oneapi::dpl::__internal::__dpl_ceiling_div(__slm_bank_size, 2 * sizeof(_RangeValueType))); const std::size_t __idx_begin = __local_id * __chunk_of_data_reading; if (__idx_begin < __rng_wg_data_size) { From 00dcb1db9d357c1ef400ea46bcaab60ce59f7c08 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 19 Nov 2024 17:27:15 +0100 Subject: [PATCH 62/80] Using 2/3 of available SLM Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 9969adacd61..42552c29dbe 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -302,11 +302,11 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const _IdType __chunk = __exec.queue().get_device().is_cpu() ? 
128 : 4; assert(__chunk > 0); - // Pessimistically only use half of the memory to take into account memory used by compiled kernel + // Pessimistically only use 2/3 of the memory to take into account memory used by compiled kernel const std::size_t __max_slm_size_adj = std::max((std::size_t)__chunk, std::min((std::size_t)__n, oneapi::dpl::__internal::__slm_adjusted_work_group_size( - __exec, 2 * sizeof(_RangeValueType)))); + __exec, sizeof(_RangeValueType)))) * 2 / 3; // The amount of data must be a multiple of the chunk size. const std::size_t __max_source_data_items_fit_into_slm = __max_slm_size_adj - __max_slm_size_adj % __chunk; From 144de4a76fb20afb3b5fda148fc9eb2ba72e536e Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 19 Nov 2024 19:02:03 +0100 Subject: [PATCH 63/80] balance data load into SLM cache Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 51 ++++++++++--------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 42552c29dbe..e0a9f78a9d4 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -303,10 +303,9 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, assert(__chunk > 0); // Pessimistically only use 2/3 of the memory to take into account memory used by compiled kernel - const std::size_t __max_slm_size_adj = - std::max((std::size_t)__chunk, - std::min((std::size_t)__n, oneapi::dpl::__internal::__slm_adjusted_work_group_size( - __exec, sizeof(_RangeValueType)))) * 2 / 3; + const auto __slm_adjusted_work_group_size = oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, sizeof(_RangeValueType)); + const auto __slm_adjusted_work_group_size_x_part = __slm_adjusted_work_group_size * 2 / 3; + const std::size_t __max_slm_size_adj = 
__slm_adjusted_work_group_size_x_part; // The amount of data must be a multiple of the chunk size. const std::size_t __max_source_data_items_fit_into_slm = __max_slm_size_adj - __max_slm_size_adj % __chunk; @@ -319,7 +318,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // The amount of the base diagonals is the amount of the work-groups // - also it's the distance between two base diagonals is equal to the amount of work-items in each work-group - const std::size_t __wg_count = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __max_source_data_items_fit_into_slm); + const std::size_t __wg_count = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk * __wi_in_one_wg); // Create storage for save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) // - in GLOBAL coordinates @@ -385,39 +384,43 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const _IdType __rng1_wg_data_size = __sp_base_right_global.first - __sp_base_left_global.first; const _IdType __rng2_wg_data_size = __sp_base_right_global.second - __sp_base_left_global.second; - const _IdType __rng_wg_data_size = __rng1_wg_data_size + __rng2_wg_data_size; _RangeValueType* __rng1_cache_slm = std::addressof(__loc_acc[0]); _RangeValueType* __rng2_cache_slm = std::addressof(__loc_acc[0]) + __rng1_wg_data_size; - constexpr std::size_t __slm_bank_size = 32; + const std::size_t __chunk_of_data_reading = oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size + __rng2_wg_data_size, __wi_in_one_wg); - const std::size_t __chunk_of_data_reading = std::max( - oneapi::dpl::__internal::__dpl_ceiling_div(__rng_wg_data_size, __wi_in_one_wg), - oneapi::dpl::__internal::__dpl_ceiling_div(__slm_bank_size, 2 * sizeof(_RangeValueType))); - const std::size_t __idx_begin = __local_id * __chunk_of_data_reading; - if (__idx_begin < __rng_wg_data_size) + const std::size_t __how_many_wi_reads_rng1 = 
oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size, __chunk_of_data_reading); + const std::size_t __how_many_wi_reads_rng2 = oneapi::dpl::__internal::__dpl_ceiling_div(__rng2_wg_data_size, __chunk_of_data_reading); + + // Calculate the amount of WI for read data from rng1 + if (__local_id < __how_many_wi_reads_rng1) { - const _IdType __idx_end = std::min(__idx_begin + __chunk_of_data_reading, (std::size_t)__rng_wg_data_size); + const std::size_t __idx_begin = __local_id * __chunk_of_data_reading; // Cooperative data load from __rng1 to __rng1_cache_slm if (__idx_begin < __rng1_wg_data_size) { - const _IdType __idx_begin_rng1 = __idx_begin; - const _IdType __idx_end_rng1 = std::min(__idx_end, __rng1_wg_data_size); + const std::size_t __idx_end = std::min(__idx_begin + __chunk_of_data_reading, (std::size_t)__rng1_wg_data_size); + _ONEDPL_PRAGMA_UNROLL - for (_IdType __idx = __idx_begin_rng1; __idx < __idx_end_rng1; ++__idx) + for (_IdType __idx = __idx_begin; __idx < __idx_end; ++__idx) __rng1_cache_slm[__idx] = __rng1[__sp_base_left_global.first + __idx]; } + } - // Cooperative data load from __rng2 to __rng1_cache_slm - if (__idx_end > __rng1_wg_data_size) - { - const _IdType __idx_begin_rng2 = 0; - const _IdType __idx_end_rng2 = __idx_end - __rng1_wg_data_size; + const std::size_t __first_wi_local_id_for_read_rng2 = __wi_in_one_wg - __how_many_wi_reads_rng2 - 1; + if (__local_id >= __first_wi_local_id_for_read_rng2) + { + const std::size_t __idx_begin = (__local_id - __first_wi_local_id_for_read_rng2) * __chunk_of_data_reading; + // Cooperative data load from __rng2 to __rng2_cache_slm + if (__idx_begin < __rng2_wg_data_size) + { + const std::size_t __idx_end = std::min(__idx_begin + __chunk_of_data_reading, (std::size_t)__rng2_wg_data_size); + _ONEDPL_PRAGMA_UNROLL - for (_IdType __idx = __idx_begin_rng2; __idx < __idx_end_rng2; ++__idx) + for (_IdType __idx = __idx_begin; __idx < __idx_end; ++__idx) __rng2_cache_slm[__idx] = 
__rng2[__sp_base_left_global.second + __idx]; } } @@ -477,10 +480,10 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy constexpr bool __same_merge_types = std::is_same_v<_Range1ValueType, _Range2ValueType>; const std::size_t __n = __rng1.size() + __rng2.size(); - if (false)//if (__n < __starting_size_limit_for_large_submitter || !__same_merge_types) + if (__n < __starting_size_limit_for_large_submitter || !__same_merge_types) { static_assert(__starting_size_limit_for_large_submitter < std::numeric_limits::max()); - + using _WiIndex = std::uint32_t; using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __merge_kernel_name<_CustomName, _WiIndex>>; From 1812dbb2de56582968584bd7490046c0bdbc591d Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 19 Nov 2024 19:33:26 +0100 Subject: [PATCH 64/80] balance data load into SLM cache Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index e0a9f78a9d4..7bc99aa6cbd 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -302,6 +302,12 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const _IdType __chunk = __exec.queue().get_device().is_cpu() ? 128 : 4; assert(__chunk > 0); + // Define SLM bank size + constexpr std::size_t __slm_bank_size = 32; // TODO is it correct value? How to get it from hardware? 
+ + // Calculate how many data items we can read into one SLM bank + constexpr std::size_t __data_items_in_slm_bank = std::max((std::size_t)1, __slm_bank_size / sizeof(_RangeValueType)); + // Pessimistically only use 2/3 of the memory to take into account memory used by compiled kernel const auto __slm_adjusted_work_group_size = oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, sizeof(_RangeValueType)); const auto __slm_adjusted_work_group_size_x_part = __slm_adjusted_work_group_size * 2 / 3; @@ -388,7 +394,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, _RangeValueType* __rng1_cache_slm = std::addressof(__loc_acc[0]); _RangeValueType* __rng2_cache_slm = std::addressof(__loc_acc[0]) + __rng1_wg_data_size; - const std::size_t __chunk_of_data_reading = oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size + __rng2_wg_data_size, __wi_in_one_wg); + const std::size_t __chunk_of_data_reading = std::max(__data_items_in_slm_bank, oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size + __rng2_wg_data_size, __wi_in_one_wg)); const std::size_t __how_many_wi_reads_rng1 = oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size, __chunk_of_data_reading); const std::size_t __how_many_wi_reads_rng2 = oneapi::dpl::__internal::__dpl_ceiling_div(__rng2_wg_data_size, __chunk_of_data_reading); From 3fdd67379b8f0e959e2296e3054da37fe4813438 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 19 Nov 2024 19:42:28 +0100 Subject: [PATCH 65/80] Using 4/5 of available SLM Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 7bc99aa6cbd..9cb0a04a392 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ 
b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -310,7 +310,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // Pessimistically only use 2/3 of the memory to take into account memory used by compiled kernel const auto __slm_adjusted_work_group_size = oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, sizeof(_RangeValueType)); - const auto __slm_adjusted_work_group_size_x_part = __slm_adjusted_work_group_size * 2 / 3; + const auto __slm_adjusted_work_group_size_x_part = __slm_adjusted_work_group_size * 4 / 5; const std::size_t __max_slm_size_adj = __slm_adjusted_work_group_size_x_part; // The amount of data must be a multiple of the chunk size. From 0ca542a2db9beb47b55cb80c6087e9c27147aa3b Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 20 Nov 2024 10:37:49 +0100 Subject: [PATCH 66/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix an error in types of SP on base diagonals Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 9cb0a04a392..0cf5bf24ffd 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -382,8 +382,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // Split points on left anr right base diagonals // - in GLOBAL coordinates - const _split_point_t& __sp_base_left_global = __base_diagonals_sp_global_ptr[__group_linear_id]; - const _split_point_t& __sp_base_right_global = __base_diagonals_sp_global_ptr[__group_linear_id + 1]; + const auto& __sp_base_left_global = __base_diagonals_sp_global_ptr[__group_linear_id]; + const auto& __sp_base_right_global = __base_diagonals_sp_global_ptr[__group_linear_id + 
1]; assert(__sp_base_right_global.first >= __sp_base_left_global.first); assert(__sp_base_right_global.second >= __sp_base_left_global.second); From b32c911f5d4d7afc2499c862853fe9e43de79d08 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 20 Nov 2024 10:42:21 +0100 Subject: [PATCH 67/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix types in __parallel_merge_submitter_large::operator() Signed-off-by: Sergey Kopienko --- .../hetero/dpcpp/parallel_backend_sycl_merge.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 0cf5bf24ffd..47e12d20bed 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -394,20 +394,20 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, _RangeValueType* __rng1_cache_slm = std::addressof(__loc_acc[0]); _RangeValueType* __rng2_cache_slm = std::addressof(__loc_acc[0]) + __rng1_wg_data_size; - const std::size_t __chunk_of_data_reading = std::max(__data_items_in_slm_bank, oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size + __rng2_wg_data_size, __wi_in_one_wg)); + const _IdType __chunk_of_data_reading = std::max(__data_items_in_slm_bank, oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size + __rng2_wg_data_size, __wi_in_one_wg)); - const std::size_t __how_many_wi_reads_rng1 = oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size, __chunk_of_data_reading); - const std::size_t __how_many_wi_reads_rng2 = oneapi::dpl::__internal::__dpl_ceiling_div(__rng2_wg_data_size, __chunk_of_data_reading); + const _IdType __how_many_wi_reads_rng1 = oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size, __chunk_of_data_reading); + const _IdType __how_many_wi_reads_rng2 = 
oneapi::dpl::__internal::__dpl_ceiling_div(__rng2_wg_data_size, __chunk_of_data_reading); // Calculate the amount of WI for read data from rng1 if (__local_id < __how_many_wi_reads_rng1) { - const std::size_t __idx_begin = __local_id * __chunk_of_data_reading; + const _IdType __idx_begin = __local_id * __chunk_of_data_reading; // Cooperative data load from __rng1 to __rng1_cache_slm if (__idx_begin < __rng1_wg_data_size) { - const std::size_t __idx_end = std::min(__idx_begin + __chunk_of_data_reading, (std::size_t)__rng1_wg_data_size); + const _IdType __idx_end = std::min(__idx_begin + __chunk_of_data_reading, __rng1_wg_data_size); _ONEDPL_PRAGMA_UNROLL for (_IdType __idx = __idx_begin; __idx < __idx_end; ++__idx) @@ -418,12 +418,12 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const std::size_t __first_wi_local_id_for_read_rng2 = __wi_in_one_wg - __how_many_wi_reads_rng2 - 1; if (__local_id >= __first_wi_local_id_for_read_rng2) { - const std::size_t __idx_begin = (__local_id - __first_wi_local_id_for_read_rng2) * __chunk_of_data_reading; + const _IdType __idx_begin = (__local_id - __first_wi_local_id_for_read_rng2) * __chunk_of_data_reading; // Cooperative data load from __rng2 to __rng2_cache_slm if (__idx_begin < __rng2_wg_data_size) { - const std::size_t __idx_end = std::min(__idx_begin + __chunk_of_data_reading, (std::size_t)__rng2_wg_data_size); + const _IdType __idx_end = std::min(__idx_begin + __chunk_of_data_reading, __rng2_wg_data_size); _ONEDPL_PRAGMA_UNROLL for (_IdType __idx = __idx_begin; __idx < __idx_end; ++__idx) From 7cda3e4291658a083dee9723b603a6ba8e647b62 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 20 Nov 2024 10:56:59 +0100 Subject: [PATCH 68/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix types in __serial_merge Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 47e12d20bed..74606208870 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -194,19 +194,19 @@ __serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, _Index _ { //copying a residual of the second seq const _Index __n = std::min<_Index>(__n2 - __start2, __chunk); - for (std::uint8_t __i = 0; __i < __n; ++__i) + for (_Index __i = 0; __i < __n; ++__i) __rng3[__start3 + __i] = __rng2[__start2 + __i]; } else if (__start2 >= __n2) { //copying a residual of the first seq const _Index __n = std::min<_Index>(__n1 - __start1, __chunk); - for (std::uint8_t __i = 0; __i < __n; ++__i) + for (_Index __i = 0; __i < __n; ++__i) __rng3[__start3 + __i] = __rng1[__start1 + __i]; } else { - for (std::uint8_t __i = 0; __i < __chunk && __start1 < __n1 && __start2 < __n2; ++__i) + for (_Index __i = 0; __i < __chunk && __start1 < __n1 && __start2 < __n2; ++__i) { const auto& __val1 = __rng1[__start1]; const auto& __val2 = __rng2[__start2]; From 253ca8d0cde9029b4b490e30ad6a3e42894bd082 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 20 Nov 2024 11:10:28 +0100 Subject: [PATCH 69/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - remove extra local variable Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 74606208870..e73c5ee5690 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -311,10 +311,9 @@ struct 
__parallel_merge_submitter_large<_IdType, _CustomName, // Pessimistically only use 2/3 of the memory to take into account memory used by compiled kernel const auto __slm_adjusted_work_group_size = oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, sizeof(_RangeValueType)); const auto __slm_adjusted_work_group_size_x_part = __slm_adjusted_work_group_size * 4 / 5; - const std::size_t __max_slm_size_adj = __slm_adjusted_work_group_size_x_part; // The amount of data must be a multiple of the chunk size. - const std::size_t __max_source_data_items_fit_into_slm = __max_slm_size_adj - __max_slm_size_adj % __chunk; + const std::size_t __max_source_data_items_fit_into_slm = __slm_adjusted_work_group_size_x_part - __slm_adjusted_work_group_size_x_part % __chunk; assert(__max_source_data_items_fit_into_slm > 0); assert(__max_source_data_items_fit_into_slm % __chunk == 0); From 952871e1247c9f39132e1f3d435885ed51e1519c Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 20 Nov 2024 16:28:51 +0100 Subject: [PATCH 70/80] @@@ include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - debug code under DUMP_DATA_LOADING Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index e73c5ee5690..a8c5f9045d3 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -26,6 +26,8 @@ #include "sycl_defs.h" #include "parallel_backend_sycl_utils.h" +//#define DUMP_DATA_LOADING 1 + namespace oneapi { namespace dpl @@ -280,6 +282,20 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __internal::__optional_kernel_name<_DiagonalsKernelName...>, __internal::__optional_kernel_name<_MergeKernelName...>> { +#if DUMP_DATA_LOADING + template 
+ static void + __load_item_into_slm(_Range&& __rng, _Index __idx_from, _Data* __slm, _Index __idx_to, std::size_t __range_index, + bool __b_check, std::size_t __group_linear_id, std::size_t __local_id) + { + // BP + // condition: __b_check + // action: __range_index = {__range_index}, __rng[{__idx_from}] -> __slm[{__idx_to}], __group_linear_id = {__group_linear_id}, __local_id = {__local_id} + // action: {__range_index}, {__idx_from}, {__idx_to}, {__group_linear_id}, {__local_id} + __slm[__idx_to] = __rng[__idx_from]; + } +#endif + template auto operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const @@ -294,6 +310,18 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const _IdType __n2 = __rng2.size(); const _IdType __n = __n1 + __n2; +#if DUMP_DATA_LOADING + //const bool __b_check = __n1 == 16144 && __n2 == 8072; + //const bool __b_check = __n1 == 50716 && __n2 == 25358; // __wi_in_one_wg = 51 __wg_count = 12 + const bool __b_check = false; + + if (__b_check) + { + int i = 0; + i = i; + } +#endif + assert(__n1 > 0 || __n2 > 0); _PRINT_INFO_IN_DEBUG_MODE(__exec); @@ -410,7 +438,11 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, _ONEDPL_PRAGMA_UNROLL for (_IdType __idx = __idx_begin; __idx < __idx_end; ++__idx) +#if !DUMP_DATA_LOADING __rng1_cache_slm[__idx] = __rng1[__sp_base_left_global.first + __idx]; +#else + __load_item_into_slm(__rng1, __sp_base_left_global.first + __idx, __rng1_cache_slm, __idx, 1, __b_check, __group_linear_id, __local_id); +#endif } } @@ -426,13 +458,44 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, _ONEDPL_PRAGMA_UNROLL for (_IdType __idx = __idx_begin; __idx < __idx_end; ++__idx) +#if !DUMP_DATA_LOADING __rng2_cache_slm[__idx] = __rng2[__sp_base_left_global.second + __idx]; +#else + __load_item_into_slm(__rng2, __sp_base_left_global.second + __idx, __rng2_cache_slm, __idx, 2, __b_check, __group_linear_id, __local_id); +#endif } } 
// Wait until all the data is loaded __dpl_sycl::__group_barrier(__nd_item); +#if DUMP_DATA_LOADING + if (__local_id == 0) + { + for (auto i = __sp_base_left_global.first; i < __sp_base_right_global.first; ++i) + { + auto _idx_slm = i - __sp_base_left_global.first; + if (__rng1_cache_slm[_idx_slm] != __rng1[i]) + { + auto __group_linear_id_tmp = __group_linear_id; + __group_linear_id_tmp = __group_linear_id_tmp; + assert(false); + } + } + + for (auto i = __sp_base_left_global.second; i < __sp_base_right_global.second; ++i) + { + auto _idx_slm = i - __sp_base_left_global.second; + if (__rng2_cache_slm[_idx_slm] != __rng2[i]) + { + auto __group_linear_id_tmp = __group_linear_id; + __group_linear_id_tmp = __group_linear_id_tmp; + assert(false); + } + } + } +#endif + // Current diagonal inside of the merge matrix? if (__global_linear_id * __chunk < __n) { From b6e1d1c58ca7a67407760a4c6e9a624efacca762 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 20 Nov 2024 16:30:32 +0100 Subject: [PATCH 71/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix an error in data loading Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index a8c5f9045d3..10bc5609cb9 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -446,7 +446,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, } } - const std::size_t __first_wi_local_id_for_read_rng2 = __wi_in_one_wg - __how_many_wi_reads_rng2 - 1; + const std::size_t __first_wi_local_id_for_read_rng2 = __wi_in_one_wg - __how_many_wi_reads_rng2; if (__local_id >= __first_wi_local_id_for_read_rng2) { const _IdType __idx_begin = (__local_id - 
__first_wi_local_id_for_read_rng2) * __chunk_of_data_reading; From ef568f08fbd1413cbfcde25f27c2d1780f8aacb1 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 20 Nov 2024 16:32:24 +0100 Subject: [PATCH 72/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix chunk size on GPU Signed-off-by: Sergey Kopienko --- .../pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 10bc5609cb9..3a2c893bf15 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -326,16 +326,16 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, _PRINT_INFO_IN_DEBUG_MODE(__exec); - // Empirical number of values to process per work-item - const _IdType __chunk = __exec.queue().get_device().is_cpu() ? 128 : 4; - assert(__chunk > 0); - // Define SLM bank size constexpr std::size_t __slm_bank_size = 32; // TODO is it correct value? How to get it from hardware? // Calculate how many data items we can read into one SLM bank constexpr std::size_t __data_items_in_slm_bank = std::max((std::size_t)1, __slm_bank_size / sizeof(_RangeValueType)); + // Empirical number of values to process per work-item + _IdType __chunk = __exec.queue().get_device().is_cpu() ? 
128 : __data_items_in_slm_bank; + assert(__chunk > 0); + // Pessimistically only use 2/3 of the memory to take into account memory used by compiled kernel const auto __slm_adjusted_work_group_size = oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, sizeof(_RangeValueType)); const auto __slm_adjusted_work_group_size_x_part = __slm_adjusted_work_group_size * 4 / 5; @@ -353,6 +353,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // - also it's the distance between two base diagonals is equal to the amount of work-items in each work-group const std::size_t __wg_count = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk * __wi_in_one_wg); + assert(__wg_count * __wi_in_one_wg * __chunk >= __n); + // Create storage for save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) // - in GLOBAL coordinates using __base_diagonals_sp_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>; From 3dced5122178241fbc3c46e16394e5f270ad5c65 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 20 Nov 2024 16:33:57 +0100 Subject: [PATCH 73/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix calculation of available SLM memory amount Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 3a2c893bf15..cc34a8144d8 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -336,17 +336,24 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, _IdType __chunk = __exec.queue().get_device().is_cpu() ? 
128 : __data_items_in_slm_bank; assert(__chunk > 0); - // Pessimistically only use 2/3 of the memory to take into account memory used by compiled kernel - const auto __slm_adjusted_work_group_size = oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, sizeof(_RangeValueType)); - const auto __slm_adjusted_work_group_size_x_part = __slm_adjusted_work_group_size * 4 / 5; + // Get the size of local memory arena in bytes. + const std::size_t __slm_mem_size = __exec.queue().get_device().template get_info(); - // The amount of data must be a multiple of the chunk size. - const std::size_t __max_source_data_items_fit_into_slm = __slm_adjusted_work_group_size_x_part - __slm_adjusted_work_group_size_x_part % __chunk; - assert(__max_source_data_items_fit_into_slm > 0); - assert(__max_source_data_items_fit_into_slm % __chunk == 0); + // Pessimistically only use 4/5 of the memory to take into account memory used by compiled kernel + const std::size_t __slm_mem_size_x_part = __slm_mem_size * 4 / 5; + + // Calculate how many items count we may place into SLM memory + const auto __slm_cached_items_count = __slm_mem_size_x_part / sizeof(_RangeValueType); // The amount of items in the each work-group is the amount of diagonals processing between two work-groups + 1 (for the left base diagonal in work-group) - const std::size_t __wi_in_one_wg = __max_source_data_items_fit_into_slm / __chunk; + std::size_t __wi_in_one_wg = __slm_cached_items_count / __chunk; + const std::size_t __max_wi_in_one_wg = __exec.queue().get_device().template get_info>()[0]; + if (__wi_in_one_wg > __max_wi_in_one_wg) + { + __chunk = oneapi::dpl::__internal::__dpl_ceiling_div(__slm_cached_items_count, __max_wi_in_one_wg); + __wi_in_one_wg = __slm_cached_items_count / __chunk; + assert(__wi_in_one_wg <= __max_wi_in_one_wg); + } assert(__wi_in_one_wg > 0); // The amount of the base diagonals is the amount of the work-groups @@ -396,7 +403,7 @@ struct __parallel_merge_submitter_large<_IdType, 
_CustomName, auto __base_diagonals_sp_global_ptr = __base_diagonals_sp_storage_t::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); const std::size_t __slm_cached_data_size = __wi_in_one_wg * __chunk; - __dpl_sycl::__local_accessor<_RangeValueType> __loc_acc(2 * __slm_cached_data_size, __cgh); + __dpl_sycl::__local_accessor<_RangeValueType> __loc_acc(__slm_cached_data_size, __cgh); // Run nd_range parallel_for to process all the data // - each work-group caching source data in SLM and processing diagonals between two base diagonals; From 39b68e4a89294734e0b89c3cc48092224866fcbf Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 20 Nov 2024 16:35:44 +0100 Subject: [PATCH 74/80] Revert "@@@ include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - debug code under DUMP_DATA_LOADING" This reverts commit 952871e1247c9f39132e1f3d435885ed51e1519c. --- .../dpcpp/parallel_backend_sycl_merge.h | 63 ------------------- 1 file changed, 63 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index cc34a8144d8..cae18d8425a 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -26,8 +26,6 @@ #include "sycl_defs.h" #include "parallel_backend_sycl_utils.h" -//#define DUMP_DATA_LOADING 1 - namespace oneapi { namespace dpl @@ -282,20 +280,6 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __internal::__optional_kernel_name<_DiagonalsKernelName...>, __internal::__optional_kernel_name<_MergeKernelName...>> { -#if DUMP_DATA_LOADING - template - static void - __load_item_into_slm(_Range&& __rng, _Index __idx_from, _Data* __slm, _Index __idx_to, std::size_t __range_index, - bool __b_check, std::size_t __group_linear_id, std::size_t __local_id) - { - // BP - // condition: __b_check - // action: __range_index = {__range_index}, 
__rng[{__idx_from}] -> __slm[{__idx_to}], __group_linear_id = {__group_linear_id}, __local_id = {__local_id} - // action: {__range_index}, {__idx_from}, {__idx_to}, {__group_linear_id}, {__local_id} - __slm[__idx_to] = __rng[__idx_from]; - } -#endif - template auto operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const @@ -310,18 +294,6 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const _IdType __n2 = __rng2.size(); const _IdType __n = __n1 + __n2; -#if DUMP_DATA_LOADING - //const bool __b_check = __n1 == 16144 && __n2 == 8072; - //const bool __b_check = __n1 == 50716 && __n2 == 25358; // __wi_in_one_wg = 51 __wg_count = 12 - const bool __b_check = false; - - if (__b_check) - { - int i = 0; - i = i; - } -#endif - assert(__n1 > 0 || __n2 > 0); _PRINT_INFO_IN_DEBUG_MODE(__exec); @@ -447,11 +419,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, _ONEDPL_PRAGMA_UNROLL for (_IdType __idx = __idx_begin; __idx < __idx_end; ++__idx) -#if !DUMP_DATA_LOADING __rng1_cache_slm[__idx] = __rng1[__sp_base_left_global.first + __idx]; -#else - __load_item_into_slm(__rng1, __sp_base_left_global.first + __idx, __rng1_cache_slm, __idx, 1, __b_check, __group_linear_id, __local_id); -#endif } } @@ -467,44 +435,13 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, _ONEDPL_PRAGMA_UNROLL for (_IdType __idx = __idx_begin; __idx < __idx_end; ++__idx) -#if !DUMP_DATA_LOADING __rng2_cache_slm[__idx] = __rng2[__sp_base_left_global.second + __idx]; -#else - __load_item_into_slm(__rng2, __sp_base_left_global.second + __idx, __rng2_cache_slm, __idx, 2, __b_check, __group_linear_id, __local_id); -#endif } } // Wait until all the data is loaded __dpl_sycl::__group_barrier(__nd_item); -#if DUMP_DATA_LOADING - if (__local_id == 0) - { - for (auto i = __sp_base_left_global.first; i < __sp_base_right_global.first; ++i) - { - auto _idx_slm = i - __sp_base_left_global.first; - if 
(__rng1_cache_slm[_idx_slm] != __rng1[i]) - { - auto __group_linear_id_tmp = __group_linear_id; - __group_linear_id_tmp = __group_linear_id_tmp; - assert(false); - } - } - - for (auto i = __sp_base_left_global.second; i < __sp_base_right_global.second; ++i) - { - auto _idx_slm = i - __sp_base_left_global.second; - if (__rng2_cache_slm[_idx_slm] != __rng2[i]) - { - auto __group_linear_id_tmp = __group_linear_id; - __group_linear_id_tmp = __group_linear_id_tmp; - assert(false); - } - } - } -#endif - // Current diagonal inside of the merge matrix? if (__global_linear_id * __chunk < __n) { From 2da44ff5e64efaaa4fef9a0a1eba2f2e5bc30198 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 20 Nov 2024 18:21:26 +0100 Subject: [PATCH 75/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - another approach to calculate the amount of work-groups and work-items Signed-off-by: Sergey Kopienko --- .../hetero/dpcpp/parallel_backend_sycl_merge.h | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index cae18d8425a..63729d21bfb 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -305,7 +305,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, constexpr std::size_t __data_items_in_slm_bank = std::max((std::size_t)1, __slm_bank_size / sizeof(_RangeValueType)); // Empirical number of values to process per work-item - _IdType __chunk = __exec.queue().get_device().is_cpu() ? 128 : __data_items_in_slm_bank; + const _IdType __chunk = __exec.queue().get_device().is_cpu() ? 128 : __data_items_in_slm_bank; assert(__chunk > 0); // Get the size of local memory arena in bytes. 
@@ -315,17 +315,11 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const std::size_t __slm_mem_size_x_part = __slm_mem_size * 4 / 5; // Calculate how many items count we may place into SLM memory - const auto __slm_cached_items_count = __slm_mem_size_x_part / sizeof(_RangeValueType); + auto __slm_cached_items_count = __slm_mem_size_x_part / sizeof(_RangeValueType); // The amount of items in the each work-group is the amount of diagonals processing between two work-groups + 1 (for the left base diagonal in work-group) - std::size_t __wi_in_one_wg = __slm_cached_items_count / __chunk; - const std::size_t __max_wi_in_one_wg = __exec.queue().get_device().template get_info>()[0]; - if (__wi_in_one_wg > __max_wi_in_one_wg) - { - __chunk = oneapi::dpl::__internal::__dpl_ceiling_div(__slm_cached_items_count, __max_wi_in_one_wg); - __wi_in_one_wg = __slm_cached_items_count / __chunk; - assert(__wi_in_one_wg <= __max_wi_in_one_wg); - } + const std::size_t __max_wg_size = __exec.queue().get_device().template get_info(); + const std::size_t __wi_in_one_wg = std::min(__max_wg_size, __slm_cached_items_count / __chunk); assert(__wi_in_one_wg > 0); // The amount of the base diagonals is the amount of the work-groups From b04b25e156b0e5a8c647345bea353bfad4524f34 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 21 Nov 2024 09:57:02 +0100 Subject: [PATCH 76/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - do not use SLM bank size Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 63729d21bfb..ec32f822e93 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -299,13 +299,13 @@ struct 
__parallel_merge_submitter_large<_IdType, _CustomName, _PRINT_INFO_IN_DEBUG_MODE(__exec); // Define SLM bank size - constexpr std::size_t __slm_bank_size = 32; // TODO is it correct value? How to get it from hardware? + //constexpr std::size_t __slm_bank_size = 32; // TODO is it correct value? How to get it from hardware? // Calculate how many data items we can read into one SLM bank - constexpr std::size_t __data_items_in_slm_bank = std::max((std::size_t)1, __slm_bank_size / sizeof(_RangeValueType)); + //constexpr std::size_t __data_items_in_slm_bank = std::max((std::size_t)1, __slm_bank_size / sizeof(_RangeValueType)); // Empirical number of values to process per work-item - const _IdType __chunk = __exec.queue().get_device().is_cpu() ? 128 : __data_items_in_slm_bank; + const _IdType __chunk = __exec.queue().get_device().is_cpu() ? 128 : 4;// __data_items_in_slm_bank; assert(__chunk > 0); // Get the size of local memory arena in bytes. @@ -396,7 +396,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, _RangeValueType* __rng1_cache_slm = std::addressof(__loc_acc[0]); _RangeValueType* __rng2_cache_slm = std::addressof(__loc_acc[0]) + __rng1_wg_data_size; - const _IdType __chunk_of_data_reading = std::max(__data_items_in_slm_bank, oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size + __rng2_wg_data_size, __wi_in_one_wg)); + const _IdType __chunk_of_data_reading = std::max(__chunk/*__data_items_in_slm_bank*/, (_IdType)oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size + __rng2_wg_data_size, __wi_in_one_wg)); const _IdType __how_many_wi_reads_rng1 = oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size, __chunk_of_data_reading); const _IdType __how_many_wi_reads_rng2 = oneapi::dpl::__internal::__dpl_ceiling_div(__rng2_wg_data_size, __chunk_of_data_reading); From cc5f8f05363849a7c982ce4918ded4d5a8f1ec4a Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Sat, 23 Nov 2024 13:58:37 +0100 Subject: [PATCH 77/80] 
include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - use std::size_t instead of _IdType Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index ec32f822e93..1a5cb8a41d8 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -290,9 +290,9 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, using _RangeValueType = _Range1ValueType; - const _IdType __n1 = __rng1.size(); - const _IdType __n2 = __rng2.size(); - const _IdType __n = __n1 + __n2; + const std::size_t __n1 = __rng1.size(); + const std::size_t __n2 = __rng2.size(); + const std::size_t __n = __n1 + __n2; assert(__n1 > 0 || __n2 > 0); @@ -305,7 +305,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, //constexpr std::size_t __data_items_in_slm_bank = std::max((std::size_t)1, __slm_bank_size / sizeof(_RangeValueType)); // Empirical number of values to process per work-item - const _IdType __chunk = __exec.queue().get_device().is_cpu() ? 128 : 4;// __data_items_in_slm_bank; + const std::size_t __chunk = __exec.queue().get_device().is_cpu() ? 128 : 4; // __data_items_in_slm_bank; assert(__chunk > 0); // Get the size of local memory arena in bytes. 
@@ -390,26 +390,26 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, assert(__sp_base_right_global.first >= __sp_base_left_global.first); assert(__sp_base_right_global.second >= __sp_base_left_global.second); - const _IdType __rng1_wg_data_size = __sp_base_right_global.first - __sp_base_left_global.first; - const _IdType __rng2_wg_data_size = __sp_base_right_global.second - __sp_base_left_global.second; + const std::size_t __rng1_wg_data_size = __sp_base_right_global.first - __sp_base_left_global.first; + const std::size_t __rng2_wg_data_size = __sp_base_right_global.second - __sp_base_left_global.second; _RangeValueType* __rng1_cache_slm = std::addressof(__loc_acc[0]); _RangeValueType* __rng2_cache_slm = std::addressof(__loc_acc[0]) + __rng1_wg_data_size; - const _IdType __chunk_of_data_reading = std::max(__chunk/*__data_items_in_slm_bank*/, (_IdType)oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size + __rng2_wg_data_size, __wi_in_one_wg)); + const std::size_t __chunk_of_data_reading = std::max(__chunk/*__data_items_in_slm_bank*/, (_IdType)oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size + __rng2_wg_data_size, __wi_in_one_wg)); - const _IdType __how_many_wi_reads_rng1 = oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size, __chunk_of_data_reading); - const _IdType __how_many_wi_reads_rng2 = oneapi::dpl::__internal::__dpl_ceiling_div(__rng2_wg_data_size, __chunk_of_data_reading); + const std::size_t __how_many_wi_reads_rng1 = oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size, __chunk_of_data_reading); + const std::size_t __how_many_wi_reads_rng2 = oneapi::dpl::__internal::__dpl_ceiling_div(__rng2_wg_data_size, __chunk_of_data_reading); // Calculate the amount of WI for read data from rng1 if (__local_id < __how_many_wi_reads_rng1) { - const _IdType __idx_begin = __local_id * __chunk_of_data_reading; + const std::size_t __idx_begin = __local_id * __chunk_of_data_reading; // Cooperative data load from 
__rng1 to __rng1_cache_slm if (__idx_begin < __rng1_wg_data_size) { - const _IdType __idx_end = std::min(__idx_begin + __chunk_of_data_reading, __rng1_wg_data_size); + const std::size_t __idx_end = std::min(__idx_begin + __chunk_of_data_reading, __rng1_wg_data_size); _ONEDPL_PRAGMA_UNROLL for (_IdType __idx = __idx_begin; __idx < __idx_end; ++__idx) @@ -420,12 +420,12 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const std::size_t __first_wi_local_id_for_read_rng2 = __wi_in_one_wg - __how_many_wi_reads_rng2; if (__local_id >= __first_wi_local_id_for_read_rng2) { - const _IdType __idx_begin = (__local_id - __first_wi_local_id_for_read_rng2) * __chunk_of_data_reading; + const std::size_t __idx_begin = (__local_id - __first_wi_local_id_for_read_rng2) * __chunk_of_data_reading; // Cooperative data load from __rng2 to __rng2_cache_slm if (__idx_begin < __rng2_wg_data_size) { - const _IdType __idx_end = std::min(__idx_begin + __chunk_of_data_reading, __rng2_wg_data_size); + const std::size_t __idx_end = std::min(__idx_begin + __chunk_of_data_reading, __rng2_wg_data_size); _ONEDPL_PRAGMA_UNROLL for (_IdType __idx = __idx_begin; __idx < __idx_end; ++__idx) From 4ec32e60647a2fd544ee849bc489500f0be0ac0a Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Sat, 23 Nov 2024 14:44:21 +0100 Subject: [PATCH 78/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - fix compile errors Signed-off-by: Sergey Kopienko --- .../hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 0765f8ef7bc..c481a38beda 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -63,7 +63,7 @@ struct __group_merge_path_sorter template 
bool sort(const sycl::nd_item<1>& __item, const _StorageAcc& __storage_acc, _Compare __comp, std::uint32_t __start, - std::uint32_t __end, std::uint32_t __sorted, std::uint16_t __data_per_workitem, + std::uint32_t __end, std::uint32_t __sorted, std::uint32_t __data_per_workitem, std::uint32_t __workgroup_size) const { const std::uint32_t __sorted_final = __data_per_workitem * __workgroup_size; @@ -259,12 +259,12 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name __cgh.parallel_for<_GlobalSortName...>( sycl::range(__steps), [=](sycl::item __item_id) { - const _IndexT __i_elem = __item_id.get_linear_id() * __chunk; - const _IndexT __i_elem_local = __i_elem % (__n_sorted * 2); + const std::uint32_t __i_elem = __item_id.get_linear_id() * __chunk; + const std::uint32_t __i_elem_local = __i_elem % (__n_sorted * 2); - const _IndexT __offset = std::min<_IndexT>(__i_elem - __i_elem_local, __n); - const _IndexT __n1 = std::min<_IndexT>(__offset + __n_sorted, __n) - __offset; - const _IndexT __n2 = std::min<_IndexT>(__offset + __n1 + __n_sorted, __n) - (__offset + __n1); + const std::uint32_t __offset = std::min<_IndexT>(__i_elem - __i_elem_local, __n); + const std::uint32_t __n1 = std::min<_IndexT>(__offset + __n_sorted, __n) - __offset; + const std::uint32_t __n2 = std::min<_IndexT>(__offset + __n1 + __n_sorted, __n) - (__offset + __n1); if (__data_in_temp) { From 1810317eca283196501a8792a3ee5578e9c79474 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Sat, 23 Nov 2024 15:10:48 +0100 Subject: [PATCH 79/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix compile errors Signed-off-by: Sergey Kopienko --- .../pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 1a5cb8a41d8..f5562b67ecd 100644 --- 
a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -351,7 +351,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // - in GLOBAL coordinates _split_point_t<_IdType> __sp(__linear_id == 0 ? __zero_split_point : _split_point_t{__n1, __n2}); if (0 < __linear_id && __linear_id < __wg_count) - __sp = __find_start_point(__rng1, __rng2, (_IdType)(__linear_id * __wi_in_one_wg * __chunk), __n1, __n2, __comp); + __sp = __find_start_point(__rng1, __rng2, __linear_id * __wi_in_one_wg * __chunk, __n1, __n2, __comp); __base_diagonals_sp_global_ptr[__linear_id] = __sp; }); @@ -396,7 +396,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, _RangeValueType* __rng1_cache_slm = std::addressof(__loc_acc[0]); _RangeValueType* __rng2_cache_slm = std::addressof(__loc_acc[0]) + __rng1_wg_data_size; - const std::size_t __chunk_of_data_reading = std::max(__chunk/*__data_items_in_slm_bank*/, (_IdType)oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size + __rng2_wg_data_size, __wi_in_one_wg)); + const std::size_t __chunk_of_data_reading = std::max(__chunk/*__data_items_in_slm_bank*/, oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size + __rng2_wg_data_size, __wi_in_one_wg)); const std::size_t __how_many_wi_reads_rng1 = oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size, __chunk_of_data_reading); const std::size_t __how_many_wi_reads_rng2 = oneapi::dpl::__internal::__dpl_ceiling_div(__rng2_wg_data_size, __chunk_of_data_reading); @@ -443,7 +443,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // - bottom-right split point describes the size of current area between two base diagonals. 
const _split_point_t<_IdType> __sp_local = __find_start_point( __rng1_cache_slm, __rng2_cache_slm, // SLM cached copy of merging data - (_IdType)(__local_id * __chunk), // __i_elem in LOCAL coordinates because __rng1_cache_slm and __rng1_cache_slm is work-group SLM cached copy of source data + __local_id * __chunk, // __i_elem in LOCAL coordinates because __rng1_cache_slm and __rng1_cache_slm is work-group SLM cached copy of source data __rng1_wg_data_size, __rng2_wg_data_size, // size of rng1 and rng2 __comp); @@ -451,9 +451,9 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // - we should have here __sp_global in GLOBAL coordinates __serial_merge(__rng1_cache_slm, __rng2_cache_slm, // SLM cached copy of merging data __rng3, // Destination range - __sp_local.first, // __start1 in LOCAL coordinates because __rng1_cache_slm is work-group SLM cached copy of source data - __sp_local.second, // __start2 in LOCAL coordinates because __rng1_cache_slm is work-group SLM cached copy of source data - (_IdType)(__global_linear_id * __chunk), // __start3 in GLOBAL coordinates because __rng3 is not cached at all + (std::size_t)__sp_local.first, // __start1 in LOCAL coordinates because __rng1_cache_slm is work-group SLM cached copy of source data + (std::size_t)__sp_local.second, // __start2 in LOCAL coordinates because __rng1_cache_slm is work-group SLM cached copy of source data + __global_linear_id * __chunk, // __start3 in GLOBAL coordinates because __rng3 is not cached at all __chunk, __rng1_wg_data_size, __rng2_wg_data_size, // size of rng1 and rng2 __comp); From c8c0a26a82ac7a899f1a57dad5559f44e2057c25 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 26 Nov 2024 19:31:46 +0100 Subject: [PATCH 80/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - using oneapi::dpl::__internal::__value_t to detect range's value types Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 8 ++++---- 1 file 
changed, 4 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index f5562b67ecd..1453a078074 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -284,8 +284,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, auto operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const { - using _Range1ValueType = typename std::iterator_traits::value_type; - using _Range2ValueType = typename std::iterator_traits::value_type; + using _Range1ValueType = oneapi::dpl::__internal::__value_t<_Range1>; + using _Range2ValueType = oneapi::dpl::__internal::__value_t<_Range2>; static_assert(std::is_same_v<_Range1ValueType, _Range2ValueType>, "In this implementation we can merge only data of the same type"); using _RangeValueType = _Range1ValueType; @@ -482,8 +482,8 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy constexpr std::size_t __starting_size_limit_for_large_submitter = 1 * 1'048'576; // 1 Mb - using _Range1ValueType = typename std::iterator_traits::value_type; - using _Range2ValueType = typename std::iterator_traits::value_type; + using _Range1ValueType = oneapi::dpl::__internal::__value_t<_Range1>; + using _Range2ValueType = oneapi::dpl::__internal::__value_t<_Range2>; constexpr bool __same_merge_types = std::is_same_v<_Range1ValueType, _Range2ValueType>;