From cd477a378fa89bb6da81c63e8d3187dc0aa2e5ee Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 14 Nov 2024 12:57:51 +0100 Subject: [PATCH 01/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - re-implement __find_start_point function Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 155 +++++++++++++++--- 1 file changed, 129 insertions(+), 26 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 753e32816a0..bd87d000354 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -32,6 +32,12 @@ namespace dpl namespace __par_backend_hetero { +template +using _split_point_t = std::pair<_Index, _Index>; + +template +constexpr _split_point_t<_Index> __zero_split_point{0, 0}; + //Searching for an intersection of a merge matrix (n1, n2) diagonal with the Merge Path to define sub-ranges //to serial merge. For example, a merge matrix for [0,1,1,2,3] and [0,0,2,3] is shown below: // 0 1 1 2 3 @@ -45,37 +51,134 @@ namespace __par_backend_hetero // | ----> // 3 | 0 0 0 0 0 | template -auto +_split_point_t<_Index> __find_start_point(const _Rng1& __rng1, const _Rng2& __rng2, const _Index __i_elem, const _Index __n1, const _Index __n2, _Compare __comp) { - //searching for the first '1', a lower bound for a diagonal [0, 0,..., 0, 1, 1,.... 
1, 1] - oneapi::dpl::counting_iterator<_Index> __diag_it(0); + const _Index __rng1_from = 0; + const _Index __rng1_to = __n1; + const _Index __rng2_from = 0; + const _Index __rng2_to = __n2; - if (__i_elem < __n2) //a condition to specify upper or lower part of the merge matrix to be processed - { - const _Index __q = __i_elem; //diagonal index - const _Index __n_diag = std::min<_Index>(__q, __n1); //diagonal size - auto __res = - std::lower_bound(__diag_it, __diag_it + __n_diag, 1 /*value to find*/, - [&__rng2, &__rng1, __q, __comp](const auto& __i_diag, const auto& __value) mutable { - const auto __zero_or_one = __comp(__rng2[__q - __i_diag - 1], __rng1[__i_diag]); - return __zero_or_one < __value; - }); - return std::make_pair(*__res, __q - *__res); - } - else + assert(__rng1_from <= __rng1_to); + assert(__rng2_from <= __rng2_to); + + assert(__rng1_to > 0 || __rng2_to > 0); + + if constexpr (!std::is_pointer_v<_Rng1>) + assert(__rng1_to <= __rng1.size()); + if constexpr (!std::is_pointer_v<_Rng2>) + assert(__rng2_to <= __rng2.size()); + + assert(__i_elem >= 0); + + // ----------------------- EXAMPLE ------------------------ + // Let's consider the following input data: + // rng1.size() = 10 + // rng2.size() = 6 + // i_diag = 9 + // Let's define the following ranges for processing: + // rng1: [3, ..., 9) -> __rng1_from = 3, __rng1_to = 9 + // rng2: [1, ..., 4) -> __rng2_from = 1, __rng2_to = 4 + // + // The goal: required to process only X' items of the merge matrix + // as intersection of rng1[3, ..., 9) and rng2[1, ..., 4) + // + // -------------------------------------------------------- + // + // __diag_it_begin(rng1) __diag_it_end(rng1) + // (init state) (dest state) (init state, dest state) + // | | | + // V V V + // + + + + + + + // \ rng1 0 1 2 3 4 5 6 7 8 9 + // rng2 +--------------------------------------+ + // 0 | ^ ^ ^ X | <--- __diag_it_end(rng2) (init state) + // + 1 | <----------------- + + X'2 ^ | <--- __diag_it_end(rng2) (dest state) + // + 2 | 
<----------------- + X'1 | | + // + 3 | <----------------- X'0 | | <--- __diag_it_begin(rng2) (dest state) + // 4 | X ^ | | + // 5 | X | | | <--- __diag_it_begin(rng2) (init state) + // +-------AX-----------+-----------+-----+ + // AX | | + // AX | | + // Run lower_bound:[from = 5, to = 8) + // + // AX - absent items in rng2 + // + // We have three points on diagonal for call comparison: + // X'0 : call __comp(rng1[5], rng2[3]) // 5 + 3 == 9 - 1 == 8 + // X'1 : call __comp(rng1[6], rng2[2]) // 6 + 2 == 9 - 1 == 8 + // X'3 : call __comp(rng1[7], rng2[1]) // 7 + 1 == 9 - 1 == 8 + // - where for every comparing pairs idx(rng1) + idx(rng2) == i_diag - 1 + + //////////////////////////////////////////////////////////////////////////////////// + // Process the corner case: for the first diagonal with the index 0 split point + // is equal to (0, 0) regardless of the size and content of the data. + if (__i_elem > 0) { - const _Index __q = __i_elem - __n2; //diagonal index - const _Index __n_diag = std::min<_Index>(__n1 - __q, __n2); //diagonal size - auto __res = - std::lower_bound(__diag_it, __diag_it + __n_diag, 1 /*value to find*/, - [&__rng2, &__rng1, __n2, __q, __comp](const auto& __i_diag, const auto& __value) mutable { - const auto __zero_or_one = __comp(__rng2[__n2 - __i_diag - 1], __rng1[__q + __i_diag]); - return __zero_or_one < __value; - }); - return std::make_pair(__q + *__res, __n2 - *__res); + //////////////////////////////////////////////////////////////////////////////////// + // Taking into account the specified constraints of the range of processed data + const auto __index_sum = __i_elem - 1; + + using _IndexSigned = std::make_signed_t<_Index>; + + _IndexSigned idx1_from = __rng1_from; + _IndexSigned idx1_to = __rng1_to; + assert(idx1_from <= idx1_to); + + _IndexSigned idx2_from = __index_sum - (__rng1_to - 1); + _IndexSigned idx2_to = __index_sum - __rng1_from + 1; + assert(idx2_from <= idx2_to); + + const _IndexSigned idx2_from_diff = + idx2_from < 
(_IndexSigned)__rng2_from ? (_IndexSigned)__rng2_from - idx2_from : 0; + const _IndexSigned idx2_to_diff = idx2_to > (_IndexSigned)__rng2_to ? idx2_to - (_IndexSigned)__rng2_to : 0; + + idx1_to -= idx2_from_diff; + idx1_from += idx2_to_diff; + + idx2_from = __index_sum - (idx1_to - 1); + idx2_to = __index_sum - idx1_from + 1; + + assert(idx1_from <= idx1_to); + assert(__rng1_from <= idx1_from && idx1_to <= __rng1_to); + + assert(idx2_from <= idx2_to); + assert(__rng2_from <= idx2_from && idx2_to <= __rng2_to); + + //////////////////////////////////////////////////////////////////////////////////// + // Run search of split point on diagonal + + using __it_t = oneapi::dpl::counting_iterator<_Index>; + + __it_t __diag_it_begin(idx1_from); + __it_t __diag_it_end(idx1_to); + + constexpr int kValue = 1; + const __it_t __res = + std::lower_bound(__diag_it_begin, __diag_it_end, kValue, [&](_Index __idx, const auto& __value) { + const auto __rng1_idx = __idx; + const auto __rng2_idx = __index_sum - __idx; + + assert(__rng1_from <= __rng1_idx && __rng1_idx < __rng1_to); + assert(__rng2_from <= __rng2_idx && __rng2_idx < __rng2_to); + assert(__rng1_idx + __rng2_idx == __index_sum); + + const auto __zero_or_one = __comp(__rng2[__rng2_idx], __rng1[__rng1_idx]); + return __zero_or_one < kValue; + }); + + const _split_point_t<_Index> __result = std::make_pair(*__res, __index_sum - *__res + 1); + assert(__result.first + __result.second == __i_elem); + + assert(__rng1_from <= __result.first && __result.first <= __rng1_to); + assert(__rng2_from <= __result.second && __result.second <= __rng2_to); + + return __result; } + + return std::make_pair(0, 0); } // Do serial merge of the data from rng1 (starting from start1) and rng2 (starting from start2) and writing @@ -157,7 +260,7 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_N oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); __cgh.parallel_for<_Name...>(sycl::range(__steps), 
[=](sycl::item __item_id) { const _IdType __i_elem = __item_id.get_linear_id() * __chunk; - const auto __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); + const _split_point_t<_IdType> __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, __comp); }); From c76ad72c59673a0508570255b3e61cfc17ab8b9e Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 14 Nov 2024 12:59:13 +0100 Subject: [PATCH 02/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - rename template params in __parallel_merge_submitter Signed-off-by: Sergey Kopienko --- .../hetero/dpcpp/parallel_backend_sycl_merge.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index bd87d000354..0c0befe175e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -233,11 +233,11 @@ __serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, _Index _ } // Please see the comment for __parallel_for_submitter for optional kernel name explanation -template +template struct __parallel_merge_submitter; -template -struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_Name...>> +template +struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_MergeKernelName...>> { template auto @@ -258,7 +258,8 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_N auto __event = __exec.queue().submit([&](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); - __cgh.parallel_for<_Name...>(sycl::range(__steps), [=](sycl::item __item_id) { + 
__cgh.parallel_for<_MergeKernelName...>( + sycl::range(__steps), [=](sycl::item __item_id) { const _IdType __i_elem = __item_id.get_linear_id() * __chunk; const _split_point_t<_IdType> __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, @@ -283,18 +284,18 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy if (__n <= std::numeric_limits::max()) { using _WiIndex = std::uint32_t; - using _MergeKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __merge_kernel_name<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _MergeKernel>()( + return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), std::forward<_Range3>(__rng3), __comp); } else { using _WiIndex = std::uint64_t; - using _MergeKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __merge_kernel_name<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _MergeKernel>()( + return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), std::forward<_Range3>(__rng3), __comp); } From cdf7d2b94d031db749a22fb79e2ea00faf21b548 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 14 Nov 2024 13:08:44 +0100 Subject: [PATCH 03/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - implementation of __parallel_merge_submitter_large Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 235 ++++++++++++++++++ 1 file changed, 235 
insertions(+) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 0c0befe175e..472a45bd0c3 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -21,6 +21,7 @@ #include // std::uint8_t, ... #include // std::make_pair, std::forward #include // std::min, std::lower_bound +#include // std::tuple #include "sycl_defs.h" #include "parallel_backend_sycl_utils.h" @@ -270,6 +271,240 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_M } }; +template +struct __parallel_merge_submitter_large; + +template +struct __parallel_merge_submitter_large<_IdType, _CustomName, + __internal::__optional_kernel_name<_DiagonalsKernelName...>, + __internal::__optional_kernel_name<_MergeKernelName...>> +{ + // Create local accessors for data cache in SLM: + // - one accessor for the first and for the second ranges if _Range1 and _Range2 has the SAME value types; + // - two accessors for the first and for the second ranges if _Range1 and _Range2 has DIFFERENT value types. 
+ struct __merge_slm_helper + { + template + static std::size_t + get_data_size(_Range1&& __rng1, _Range2&& __rng2) + { + using _Range1ValueType = typename std::iterator_traits::value_type; + using _Range2ValueType = typename std::iterator_traits::value_type; + + return sizeof(_Range1ValueType) + sizeof(_Range2ValueType); + } + + template + static constexpr auto + create_local_accessors(sycl::handler& __cgh, _Range1&& __rng1, _Range2&& __rng2, + std::size_t __slm_cached_data_size) + { + using _Range1ValueType = typename std::iterator_traits::value_type; + using _Range2ValueType = typename std::iterator_traits::value_type; + + if constexpr (std::is_same_v<_Range1ValueType, _Range2ValueType>) + return std::tuple<__dpl_sycl::__local_accessor<_Range1ValueType>>( + __dpl_sycl::__local_accessor<_Range1ValueType>(2 * __slm_cached_data_size, __cgh)); + else + return std::tuple<__dpl_sycl::__local_accessor<_Range1ValueType>, + __dpl_sycl::__local_accessor<_Range2ValueType>>( + __dpl_sycl::__local_accessor<_Range1ValueType>(__slm_cached_data_size, __cgh), + __dpl_sycl::__local_accessor<_Range2ValueType>(__slm_cached_data_size, __cgh)); + } + + template + static auto + get_local_accessor(AccessorsTuple& __acc_tuple, std::size_t __offset = 0) + { + static_assert(std::tuple_size_v == 1 || std::tuple_size_v == 2); + + if constexpr (std::tuple_size_v == 1) + return std::pair(std::get<0>(__acc_tuple), __offset); + + else + return std::pair(std::get(__acc_tuple), 0); + } + }; + + template + auto + operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const + { + const _IdType __n1 = __rng1.size(); + const _IdType __n2 = __rng2.size(); + const _IdType __n = __n1 + __n2; + + assert(__n1 > 0 || __n2 > 0); + + _PRINT_INFO_IN_DEBUG_MODE(__exec); + + // Empirical number of values to process per work-item + const _IdType __chunk = __exec.queue().get_device().is_cpu() ? 
128 : 4; + assert(__chunk > 0); + + // Pessimistically only use half of the memory to take into account memory used by compiled kernel + const std::size_t __max_slm_size_adj = + oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, __merge_slm_helper::get_data_size(__rng1, __rng2)); + + // The amount of data must be a multiple of the chunk size. + const std::size_t __max_source_data_items_fit_into_slm = __max_slm_size_adj - __max_slm_size_adj % __chunk; + assert(__max_source_data_items_fit_into_slm > 0); + assert(__max_source_data_items_fit_into_slm % __chunk == 0); + + // The amount of items in the each work-group is the amount of diagonals processing between two work-groups + 1 (for the left base diagonal in work-group) + const _IdType __items_in_wg_count = __max_source_data_items_fit_into_slm / __chunk; + assert(__items_in_wg_count > 0); + + // The amount of the base diagonals is the amount of the work-groups + // - also it's the distance between two base diagonals is equal to the amount of work-items in each work-group + const _IdType __wg_count = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __max_source_data_items_fit_into_slm); + + // Create storage for save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) + // - in GLOBAL coordinates + using __base_diagonals_sp_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>; + __base_diagonals_sp_storage_t __base_diagonals_sp_global_storage{__exec, 0, __wg_count + 1}; + + // 1. 
Calculate split points on each base diagonal + // - one work-item processing one base diagonal + sycl::event __event = __exec.queue().submit([&](sycl::handler& __cgh) { + + oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); + auto __base_diagonals_sp_global_acc = __base_diagonals_sp_global_storage.template __get_scratch_acc(__cgh, __dpl_sycl::__no_init{}); + + __cgh.parallel_for<_DiagonalsKernelName...>( + sycl::range(__wg_count + 1), [=](sycl::item __item_id) { + + const std::size_t __global_idx = __item_id.get_linear_id(); + + _split_point_t<_IdType>* __base_diagonals_sp_global_ptr = __base_diagonals_sp_storage_t::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); + + // Save top-left split point for first/last base diagonals of merge matrix + // - in GLOBAL coordinates + _split_point_t<_IdType> __sp(__global_idx == 0 ? __zero_split_point<_IdType> : _split_point_t<_IdType>{__n1, __n2}); + + if (0 < __global_idx && __global_idx < __wg_count) + { + const _IdType __i_elem = __global_idx * __items_in_wg_count * __chunk; + + // Save bottom-right split point for current base diagonal of merge matrix + // - in GLOBAL coordinates + __sp = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); + } + + __base_diagonals_sp_global_ptr[__global_idx] = __sp; + }); + }); + + // 2. 
Merge data using split points on each base diagonal + // - one work-item processing one diagonal + // - work-items grouped to process diagonals between two base diagonals (include left base diagonal and exclude right base diagonal) + __event = __exec.queue().submit([&](sycl::handler& __cgh) { + + __cgh.depends_on(__event); + + oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); + auto __base_diagonals_sp_global_acc = __base_diagonals_sp_global_storage.template __get_scratch_acc(__cgh); + + const std::size_t __slm_cached_data_size = __items_in_wg_count * __chunk; + auto local_accessors = __merge_slm_helper::create_local_accessors(__cgh, __rng1, __rng2, __slm_cached_data_size); + + // Run nd_range parallel_for to process all the data + __cgh.parallel_for<_MergeKernelName...>( + sycl::nd_range(__wg_count * __items_in_wg_count, __items_in_wg_count), + [=](sycl::nd_item __nd_item) + { + // Merge matrix diagonal's GLOBAL index + const std::size_t __global_idx = __nd_item.get_global_linear_id(); + + // Merge sub-matrix LOCAL diagonal's index + const std::size_t __local_idx = __nd_item.get_local_id(0); + + // Merge matrix base diagonal's GLOBAL index + const std::size_t __wg_id = __nd_item.get_group_linear_id(); + + auto __base_diagonals_sp_global_ptr = __base_diagonals_sp_storage_t::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); + + // Split points on left anr right base diagonals + // - in GLOBAL coordinates + assert(__wg_id + 1 < __wg_count + 1); + const _split_point_t<_IdType>& __sp_base_left_global = __base_diagonals_sp_global_ptr[__wg_id]; + const _split_point_t<_IdType>& __sp_base_right_global = __base_diagonals_sp_global_ptr[__wg_id + 1]; + + auto [__local_accessor_rng1, offset_to_slm1] = __merge_slm_helper::template get_local_accessor<0>(local_accessors); + auto [__local_accessor_rng2, offset_to_slm2] = __merge_slm_helper::template get_local_accessor<1>(local_accessors, (std::size_t)(__sp_base_right_global.first 
-__sp_base_left_global.first)); + auto __rngs_data_in_slm1 = std::addressof(__local_accessor_rng1[0]) + offset_to_slm1; + auto __rngs_data_in_slm2 = std::addressof(__local_accessor_rng2[0]) + offset_to_slm2; + + // Full amount of work-items may be great then the amount of diagonals in the merge matrix + // so we should skip the redundant work-items + const bool __out_of_data = __global_idx * __chunk >= __n; + if (!__out_of_data) + { + // Load the current part of merging data placed between two base diagonals into SLM + // TODO implement cooperative data load by multiple work-items + assert(__items_in_wg_count > 1); + if (__local_idx == 0) + { + _IdType __slm_idx = 0; + for (_IdType __idx = __sp_base_left_global.first; __idx < __sp_base_right_global.first; ++__idx, ++__slm_idx) + { + assert(__slm_idx < __slm_cached_data_size); + assert(__idx < __n1); + __rngs_data_in_slm1[__slm_idx] = __rng1[__idx]; + } + } + + if (__local_idx == 1 && __items_in_wg_count > 1 || __local_idx == 0) + { + _IdType __slm_idx = 0; + for (_IdType __idx = __sp_base_left_global.second; __idx < __sp_base_right_global.second; ++__idx, ++__slm_idx) + { + assert(__slm_idx < __slm_cached_data_size); + assert(__idx < __n2); + __rngs_data_in_slm2[__slm_idx] = __rng2[__idx]; + } + } + } + + // Wait until all the data is loaded + // - we shouldn't setup this barrier under any conditions!!! + __dpl_sycl::__group_barrier(__nd_item); + + if (!__out_of_data) + { + // We are between two base diagonals and need to find the start points in the merge matrix area, + // limited by split points of the left and right base diagonals. + + // Find split point in LOCAL coordinates + // - top-left split point is (0, 0); + // - bottom-right split point describes the size of current area between two base diagonals. 
+ assert(__sp_base_right_global.first >= __sp_base_left_global.first); + assert(__sp_base_right_global.second >= __sp_base_left_global.second); + const _split_point_t<_IdType> __sp_local = __find_start_point( + __rngs_data_in_slm1, __rngs_data_in_slm2, // SLM cached copy of merging data + (_IdType)(__local_idx * __chunk), // __i_elem in LOCAL coordinates because __rngs_data_in_slm1 and __rngs_data_in_slm2 is work-group SLM cached copy of source data + (_IdType)(__sp_base_right_global.first - __sp_base_left_global.first), // size of rng1 + (_IdType)(__sp_base_right_global.second - __sp_base_left_global.second), // size of rng2 + __comp); + + // Merge data for the current diagonal + // - we should have here __sp_global in GLOBAL coordinates + __serial_merge(__rngs_data_in_slm1, __rngs_data_in_slm2, // SLM cached copy of merging data + __rng3, // Destination range + __sp_local.first, // __start1 in LOCAL coordinates because __rngs_data_in_slm1 is work-group SLM cached copy of source data + __sp_local.second, // __start2 in LOCAL coordinates because __rngs_data_in_slm2 is work-group SLM cached copy of source data + (_IdType)(__global_idx * __chunk), // __start3 in GLOBAL coordinates because __rng3 is not cached at all + __chunk, + __sp_base_right_global.first - __sp_base_left_global.first, // size of __rngs_data_in_slm1 + __sp_base_right_global.second - __sp_base_left_global.second, // size of __rngs_data_in_slm2 + __comp); + } + }); + }); + return __future(__event); + } +}; + template class __merge_kernel_name; From e5ced865f051c6214ed7f1cd4d40a13851b8d580 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 14 Nov 2024 13:04:45 +0100 Subject: [PATCH 04/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - using __parallel_merge_submitter_large in the __parallel_merge Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 60 +++++++++++++++---- 1 file changed, 47 insertions(+), 13 deletions(-) diff --git 
a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 472a45bd0c3..9053310a7ef 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -508,6 +508,12 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, template class __merge_kernel_name; +template +class __diagonals_kernel_name; + +template +class __merge_kernel_name_large; + template auto __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range1&& __rng1, @@ -516,23 +522,51 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; const auto __n = __rng1.size() + __rng2.size(); - if (__n <= std::numeric_limits::max()) + if (__n < 4 * 1'048'576) { - using _WiIndex = std::uint32_t; - using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); + if (__n <= std::numeric_limits::max()) + { + using _WiIndex = std::uint32_t; + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name<_CustomName, _WiIndex>>; + return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); + } + else + { + using _WiIndex = std::uint64_t; + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name<_CustomName, _WiIndex>>; + return 
__parallel_merge_submitter<_WiIndex, _MergeKernelName>()( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); + } } else { - using _WiIndex = std::uint64_t; - using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); + if (__n <= std::numeric_limits::max()) + { + using _WiIndex = std::uint32_t; + using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __diagonals_kernel_name<_CustomName, _WiIndex>>; + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name_large<_CustomName, _WiIndex>>; + return __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); + } + else + { + using _WiIndex = std::uint64_t; + using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __diagonals_kernel_name<_CustomName, _WiIndex>>; + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name_large<_CustomName, _WiIndex>>; + return __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); + } } } From 99dfb4aba50f67d4f57285ba7d456ff2df57c585 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 15 Nov 2024 12:30:21 +0100 
Subject: [PATCH 05/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - removed redundant comment Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 9053310a7ef..56044f9646e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -476,7 +476,6 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // limited by split points of the left and right base diagonals. // Find split point in LOCAL coordinates - // - top-left split point is (0, 0); // - bottom-right split point describes the size of current area between two base diagonals. assert(__sp_base_right_global.first >= __sp_base_left_global.first); assert(__sp_base_right_global.second >= __sp_base_left_global.second); @@ -495,9 +494,9 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __sp_local.second, // __start2 in LOCAL coordinates because __rngs_data_in_slm2 is work-group SLM cached copy of source data (_IdType)(__global_idx * __chunk), // __start3 in GLOBAL coordinates because __rng3 is not cached at all __chunk, - __sp_base_right_global.first - __sp_base_left_global.first, // size of __rngs_data_in_slm1 - __sp_base_right_global.second - __sp_base_left_global.second, // size of __rngs_data_in_slm2 - __comp); + __sp_base_right_global.first - __sp_base_left_global.first, // size of __rngs_data_in_slm1 + __sp_base_right_global.second - __sp_base_left_global.second, // size of __rngs_data_in_slm2 + __comp); } }); }); From e79d00c97e217682b77e0b48f8e40035a058275a Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 15 Nov 2024 12:32:03 +0100 Subject: [PATCH 06/80] 
include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - small data types should be acceptable too Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 56044f9646e..cec67084da5 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -343,8 +343,10 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, assert(__chunk > 0); // Pessimistically only use half of the memory to take into account memory used by compiled kernel - const std::size_t __max_slm_size_adj = - oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, __merge_slm_helper::get_data_size(__rng1, __rng2)); + const std::size_t __max_slm_size_adj = + std::max((std::size_t)__chunk, + std::min((std::size_t)__n, + oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, __merge_slm_helper::get_data_size(__rng1, __rng2)))); // The amount of data must be a multiple of the chunk size. 
const std::size_t __max_source_data_items_fit_into_slm = __max_slm_size_adj - __max_slm_size_adj % __chunk; From 30b70b3d700039b2882a6c05f9edd7ebbb742fff Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 15 Nov 2024 12:32:51 +0100 Subject: [PATCH 07/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - define __base_diagonals_sp_global_ptr outside of parallel_for Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index cec67084da5..ca7813dc3f7 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -406,6 +406,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); auto __base_diagonals_sp_global_acc = __base_diagonals_sp_global_storage.template __get_scratch_acc(__cgh); + auto __base_diagonals_sp_global_ptr = __base_diagonals_sp_storage_t::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); const std::size_t __slm_cached_data_size = __items_in_wg_count * __chunk; auto local_accessors = __merge_slm_helper::create_local_accessors(__cgh, __rng1, __rng2, __slm_cached_data_size); @@ -424,8 +425,6 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // Merge matrix base diagonal's GLOBAL index const std::size_t __wg_id = __nd_item.get_group_linear_id(); - auto __base_diagonals_sp_global_ptr = __base_diagonals_sp_storage_t::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); - // Split points on left anr right base diagonals // - in GLOBAL coordinates assert(__wg_id + 1 < __wg_count + 1); From e9222aa0584534c80e3af583821f5cbc2e5943b9 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko 
Date: Fri, 15 Nov 2024 12:36:37 +0100 Subject: [PATCH 08/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - calculate and use cached data-size for work-group Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index ca7813dc3f7..621d84f05f3 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -429,7 +429,14 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // - in GLOBAL coordinates assert(__wg_id + 1 < __wg_count + 1); const _split_point_t<_IdType>& __sp_base_left_global = __base_diagonals_sp_global_ptr[__wg_id]; - const _split_point_t<_IdType>& __sp_base_right_global = __base_diagonals_sp_global_ptr[__wg_id + 1]; + const _split_point_t<_IdType>& __sp_base_right_global = __base_diagonals_sp_global_ptr[__wg_id + 1]; + + assert(__sp_base_right_global.first >= __sp_base_left_global.first); + assert(__sp_base_right_global.second >= __sp_base_left_global.second); + + const _IdType __wg_data_size_rng1 = __sp_base_right_global.first - __sp_base_left_global.first; + const _IdType __wg_data_size_rng2 = __sp_base_right_global.second - __sp_base_left_global.second; + auto [__local_accessor_rng1, offset_to_slm1] = __merge_slm_helper::template get_local_accessor<0>(local_accessors); auto [__local_accessor_rng2, offset_to_slm2] = __merge_slm_helper::template get_local_accessor<1>(local_accessors, (std::size_t)(__sp_base_right_global.first -__sp_base_left_global.first)); @@ -478,13 +485,11 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // Find split point in LOCAL coordinates // - bottom-right split point describes the size of current area between two base diagonals. 
- assert(__sp_base_right_global.first >= __sp_base_left_global.first); - assert(__sp_base_right_global.second >= __sp_base_left_global.second); const _split_point_t<_IdType> __sp_local = __find_start_point( __rngs_data_in_slm1, __rngs_data_in_slm2, // SLM cached copy of merging data (_IdType)(__local_idx * __chunk), // __i_elem in LOCAL coordinates because __rngs_data_in_slm1 and __rngs_data_in_slm2 is work-group SLM cached copy of source data - (_IdType)(__sp_base_right_global.first - __sp_base_left_global.first), // size of rng1 - (_IdType)(__sp_base_right_global.second - __sp_base_left_global.second), // size of rng2 + __wg_data_size_rng1, // size of rng1 + __wg_data_size_rng2, // size of rng2 __comp); // Merge data for the current diagonal @@ -495,8 +500,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __sp_local.second, // __start2 in LOCAL coordinates because __rngs_data_in_slm2 is work-group SLM cached copy of source data (_IdType)(__global_idx * __chunk), // __start3 in GLOBAL coordinates because __rng3 is not cached at all __chunk, - __sp_base_right_global.first - __sp_base_left_global.first, // size of __rngs_data_in_slm1 - __sp_base_right_global.second - __sp_base_left_global.second, // size of __rngs_data_in_slm2 + __wg_data_size_rng1, // size of __rngs_data_in_slm1 + __wg_data_size_rng2, // size of __rngs_data_in_slm2 __comp); } }); From ed1a1b20bc99b7d3171296913382279f1b71c2ee Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 15 Nov 2024 12:38:05 +0100 Subject: [PATCH 09/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - rename some local variables Signed-off-by: Sergey Kopienko --- .../pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 621d84f05f3..90837008415 100644 --- 
a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -409,7 +409,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, auto __base_diagonals_sp_global_ptr = __base_diagonals_sp_storage_t::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); const std::size_t __slm_cached_data_size = __items_in_wg_count * __chunk; - auto local_accessors = __merge_slm_helper::create_local_accessors(__cgh, __rng1, __rng2, __slm_cached_data_size); + auto loc_acc_pack = __merge_slm_helper::create_local_accessors(__cgh, __rng1, __rng2, __slm_cached_data_size); // Run nd_range parallel_for to process all the data __cgh.parallel_for<_MergeKernelName...>( @@ -437,11 +437,10 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const _IdType __wg_data_size_rng1 = __sp_base_right_global.first - __sp_base_left_global.first; const _IdType __wg_data_size_rng2 = __sp_base_right_global.second - __sp_base_left_global.second; - - auto [__local_accessor_rng1, offset_to_slm1] = __merge_slm_helper::template get_local_accessor<0>(local_accessors); - auto [__local_accessor_rng2, offset_to_slm2] = __merge_slm_helper::template get_local_accessor<1>(local_accessors, (std::size_t)(__sp_base_right_global.first -__sp_base_left_global.first)); - auto __rngs_data_in_slm1 = std::addressof(__local_accessor_rng1[0]) + offset_to_slm1; - auto __rngs_data_in_slm2 = std::addressof(__local_accessor_rng2[0]) + offset_to_slm2; + auto [__loc_acc_rng1, offset_to_slm1] = __merge_slm_helper::template get_local_accessor<0>(loc_acc_pack); + auto [__loc_acc_rng2, offset_to_slm2] = __merge_slm_helper::template get_local_accessor<1>(loc_acc_pack, __wg_data_size_rng1); + auto __rngs_data_in_slm1 = std::addressof(__loc_acc_rng1[0]) + offset_to_slm1; + auto __rngs_data_in_slm2 = std::addressof(__loc_acc_rng2[0]) + offset_to_slm2; // Full amount of work-items may be great then the amount of diagonals in the 
merge matrix // so we should skip the redundant work-items From 744bcdb64c64ba15041c63f044424d6353cd2b84 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 15 Nov 2024 12:39:45 +0100 Subject: [PATCH 10/80] @@@ include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - debug code Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 139 +++++++++++++++--- 1 file changed, 117 insertions(+), 22 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 90837008415..8c6163cfe9b 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -26,6 +26,8 @@ #include "sycl_defs.h" #include "parallel_backend_sycl_utils.h" +#define USE_DEBUG_CODE_IN_MERGE_SUBMITTER_LARGE 0 + namespace oneapi { namespace dpl @@ -274,6 +276,34 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_M template struct __parallel_merge_submitter_large; +#if USE_DEBUG_CODE_IN_MERGE_SUBMITTER_LARGE +// TODO remove debug code +template +void +load_data(std::size_t __n1, std::size_t __n2, std::size_t __wg_id, std::size_t __rng_no, std::size_t __local_idx, _RngTo& __rng_to, std::size_t __idx_to, const _RngFrom& __rng_from, std::size_t __idx_from, + _IdType __wg_data_size_rng, + _IdType __items_in_wg_count, + std::size_t __max_wi_amount_for_data_loading, + const std::size_t __loading_data_per_wi, + const _split_point_t<_IdType> __sp_base_left_global, + const _split_point_t<_IdType> __sp_base_right_global) +{ + __rng_to[__idx_to] = __rng_from[__idx_from]; +} + +// TODO remove debug code +template +void +dump_split_point(_IdType __idx, const _split_point_t<_IdType> __sp) +{ + auto first = __sp.first; + auto second = __sp.second; + + first = first; + second = second; +} +#endif + template struct __parallel_merge_submitter_large<_IdType, 
_CustomName, __internal::__optional_kernel_name<_DiagonalsKernelName...>, @@ -412,6 +442,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, auto loc_acc_pack = __merge_slm_helper::create_local_accessors(__cgh, __rng1, __rng2, __slm_cached_data_size); // Run nd_range parallel_for to process all the data + // - each work-group caching source data in SLM and processing diagonals between two base diagonals; + // - each work-item processing one diagonal. __cgh.parallel_for<_MergeKernelName...>( sycl::nd_range(__wg_count * __items_in_wg_count, __items_in_wg_count), [=](sycl::nd_item __nd_item) @@ -425,50 +457,112 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // Merge matrix base diagonal's GLOBAL index const std::size_t __wg_id = __nd_item.get_group_linear_id(); +#if USE_DEBUG_CODE_IN_MERGE_SUBMITTER_LARGE + // TODO remove debug code: dump split points + { + if (__wg_id == 0 && __local_idx == 0) + for (_IdType i = 0; i < __wg_count + 1; ++i) + dump_split_point(i, __base_diagonals_sp_global_ptr[i]); + __dpl_sycl::__group_barrier(__nd_item); + } +#endif + // Split points on left anr right base diagonals // - in GLOBAL coordinates assert(__wg_id + 1 < __wg_count + 1); const _split_point_t<_IdType>& __sp_base_left_global = __base_diagonals_sp_global_ptr[__wg_id]; - const _split_point_t<_IdType>& __sp_base_right_global = __base_diagonals_sp_global_ptr[__wg_id + 1]; + const _split_point_t<_IdType>& __sp_base_right_global = __base_diagonals_sp_global_ptr[__wg_id + 1]; assert(__sp_base_right_global.first >= __sp_base_left_global.first); assert(__sp_base_right_global.second >= __sp_base_left_global.second); const _IdType __wg_data_size_rng1 = __sp_base_right_global.first - __sp_base_left_global.first; const _IdType __wg_data_size_rng2 = __sp_base_right_global.second - __sp_base_left_global.second; - + auto [__loc_acc_rng1, offset_to_slm1] = __merge_slm_helper::template get_local_accessor<0>(loc_acc_pack); auto [__loc_acc_rng2, offset_to_slm2] = 
__merge_slm_helper::template get_local_accessor<1>(loc_acc_pack, __wg_data_size_rng1); auto __rngs_data_in_slm1 = std::addressof(__loc_acc_rng1[0]) + offset_to_slm1; auto __rngs_data_in_slm2 = std::addressof(__loc_acc_rng2[0]) + offset_to_slm2; - // Full amount of work-items may be great then the amount of diagonals in the merge matrix - // so we should skip the redundant work-items - const bool __out_of_data = __global_idx * __chunk >= __n; - if (!__out_of_data) + constexpr std::size_t __max_wi_amount_for_data_loading = 16; + + if (__local_idx < __max_wi_amount_for_data_loading) { + //////////////////////////////////////////////////////////////////////////////////////// // Load the current part of merging data placed between two base diagonals into SLM - // TODO implement cooperative data load by multiple work-items - assert(__items_in_wg_count > 1); - if (__local_idx == 0) + + // https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2023-0/shared-local-memory.html + // SLM: 64 bytes x 16 banks (granularity: 4 bytes / 32 bits) + // the goal - each WI should write into separate bank + // -> load from max 16 work-items (defined at __max_wi_amount_for_data_loading) + // -> it is necessary to ensure sequential writing to adjacent addresses of SLM memory + + //////////////////////////////////////////////////////////////////////////////////////// + // Cooperative data load from __rng1 to __rngs_data_in_slm1 + if (__wg_data_size_rng1 > 0) { - _IdType __slm_idx = 0; - for (_IdType __idx = __sp_base_left_global.first; __idx < __sp_base_right_global.first; ++__idx, ++__slm_idx) + // Calculate the size of the current part of merging data per work-item + const std::size_t __loading_data_per_wi = oneapi::dpl::__internal::__dpl_ceiling_div(__wg_data_size_rng1, std::min((std::size_t)__items_in_wg_count, __max_wi_amount_for_data_loading)); + + // Calculate the range of SLM indexes of loading data + const std::size_t __slm_idx_begin = __local_idx * 
__loading_data_per_wi; + const std::size_t __slm_idx_end = __slm_idx_begin + __loading_data_per_wi; + + for (std::size_t __slm_idx = __slm_idx_begin; __slm_idx < __slm_idx_end; ++__slm_idx) { - assert(__slm_idx < __slm_cached_data_size); - assert(__idx < __n1); - __rngs_data_in_slm1[__slm_idx] = __rng1[__idx]; + const _IdType __rng_idx = __sp_base_left_global.first + __slm_idx; + if (__rng_idx < __sp_base_right_global.first) + { + assert(__slm_idx < __wg_data_size_rng1); + assert(__rng_idx < __n1); +#if !USE_DEBUG_CODE_IN_MERGE_SUBMITTER_LARGE + __rngs_data_in_slm1[__slm_idx] = __rng1[__rng_idx]; +#else + load_data(__n1, __n2, __wg_id, 1, __local_idx, __rngs_data_in_slm1, __slm_idx, __rng1, __rng_idx, + __wg_data_size_rng2, + __items_in_wg_count, + __max_wi_amount_for_data_loading, + __loading_data_per_wi, + __sp_base_left_global, + __sp_base_right_global); +#endif + } } } - if (__local_idx == 1 && __items_in_wg_count > 1 || __local_idx == 0) + //////////////////////////////////////////////////////////////////////////////////////// + // Cooperative data load from __rng2 to __rngs_data_in_slm2 + if (__wg_data_size_rng2 > 0) { - _IdType __slm_idx = 0; - for (_IdType __idx = __sp_base_left_global.second; __idx < __sp_base_right_global.second; ++__idx, ++__slm_idx) + // __loading_data_per_wi = 3, __sp_base_left_global = (521, 247), __sp_base_right_global = (521, 260) + // -> __wg_data_size_rng2 = 260 - 247 = 13 + // -> __loading_data_per_wi = __dpl_ceiling_div(13, 6) = 3 + // Calculate the size of the current part of merging data per work-item + const std::size_t __loading_data_per_wi = oneapi::dpl::__internal::__dpl_ceiling_div(__wg_data_size_rng2, std::min((std::size_t)__items_in_wg_count, __max_wi_amount_for_data_loading)); + + // Calculate the range of SLM indexes of loading data + const std::size_t __slm_idx_begin = __local_idx * __loading_data_per_wi; + const std::size_t __slm_idx_end = __slm_idx_begin + __loading_data_per_wi; + + for (std::size_t __slm_idx = 
__slm_idx_begin; __slm_idx < __slm_idx_end; ++__slm_idx) { - assert(__slm_idx < __slm_cached_data_size); - assert(__idx < __n2); - __rngs_data_in_slm2[__slm_idx] = __rng2[__idx]; + const _IdType __rng_idx = __sp_base_left_global.second + __slm_idx; + if (__rng_idx < __sp_base_right_global.second) + { + assert(__slm_idx < __wg_data_size_rng2); + assert(__rng_idx < __n2); +#if !USE_DEBUG_CODE_IN_MERGE_SUBMITTER_LARGE + __rngs_data_in_slm2[__slm_idx] = __rng2[__rng_idx]; +#else + load_data(__n1, __n2, __wg_id, 2, __local_idx, __rngs_data_in_slm2, __slm_idx, __rng2, __rng_idx, + __wg_data_size_rng2, + __items_in_wg_count, + __max_wi_amount_for_data_loading, + __loading_data_per_wi, + __sp_base_left_global, + __sp_base_right_global); +#endif + } } } } @@ -477,7 +571,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // - we shouldn't setup this barrier under any conditions!!! __dpl_sycl::__group_barrier(__nd_item); - if (!__out_of_data) + // Current diagonal inside of the merge matrix? + if (__global_idx * __chunk < __n) { // We are between two base diagonals and need to find the start points in the merge matrix area, // limited by split points of the left and right base diagonals. 
@@ -526,7 +621,7 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; const auto __n = __rng1.size() + __rng2.size(); - if (__n < 4 * 1'048'576) + if (false) //if (__n < 4 * 1'048'576) { if (__n <= std::numeric_limits::max()) { From 2239811b71eb2dd8b72625a4ffa5943d1792f601 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 15 Nov 2024 22:04:10 +0100 Subject: [PATCH 11/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix review comment: let's use __parallel_merge_submitter with std::uint32_t data type only Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 41 ++++++++----------- 1 file changed, 16 insertions(+), 25 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 8c6163cfe9b..fb4e83f48e0 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -384,12 +384,12 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, assert(__max_source_data_items_fit_into_slm % __chunk == 0); // The amount of items in the each work-group is the amount of diagonals processing between two work-groups + 1 (for the left base diagonal in work-group) - const _IdType __items_in_wg_count = __max_source_data_items_fit_into_slm / __chunk; + const std::size_t __items_in_wg_count = __max_source_data_items_fit_into_slm / __chunk; assert(__items_in_wg_count > 0); // The amount of the base diagonals is the amount of the work-groups // - also it's the distance between two base diagonals is equal to the amount of work-items in each work-group - const _IdType __wg_count = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __max_source_data_items_fit_into_slm); + const std::size_t __wg_count = 
oneapi::dpl::__internal::__dpl_ceiling_div(__n, __max_source_data_items_fit_into_slm); // Create storage for save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) // - in GLOBAL coordinates @@ -461,7 +461,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // TODO remove debug code: dump split points { if (__wg_id == 0 && __local_idx == 0) - for (_IdType i = 0; i < __wg_count + 1; ++i) + for (std::size_t i = 0; i < __wg_count + 1; ++i) dump_split_point(i, __base_diagonals_sp_global_ptr[i]); __dpl_sycl::__group_barrier(__nd_item); } @@ -510,7 +510,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, for (std::size_t __slm_idx = __slm_idx_begin; __slm_idx < __slm_idx_end; ++__slm_idx) { - const _IdType __rng_idx = __sp_base_left_global.first + __slm_idx; + const std::size_t __rng_idx = __sp_base_left_global.first + __slm_idx; if (__rng_idx < __sp_base_right_global.first) { assert(__slm_idx < __wg_data_size_rng1); @@ -546,7 +546,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, for (std::size_t __slm_idx = __slm_idx_begin; __slm_idx < __slm_idx_end; ++__slm_idx) { - const _IdType __rng_idx = __sp_base_left_global.second + __slm_idx; + const std::size_t __rng_idx = __sp_base_left_global.second + __slm_idx; if (__rng_idx < __sp_base_right_global.second) { assert(__slm_idx < __wg_data_size_rng2); @@ -620,27 +620,18 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy { using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; - const auto __n = __rng1.size() + __rng2.size(); - if (false) //if (__n < 4 * 1'048'576) + constexpr std::size_t __starting_size_limit_for_large_submitter = 4 * 1'048'576; // 4 Mb + + const std::size_t __n = __rng1.size() + __rng2.size(); + if (__n < __starting_size_limit_for_large_submitter) { - if (__n <= std::numeric_limits::max()) - { - using _WiIndex = std::uint32_t; - using _MergeKernelName = 
oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); - } - else - { - using _WiIndex = std::uint64_t; - using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); - } + static_assert(__starting_size_limit_for_large_submitter < std::numeric_limits::max()); + + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name<_CustomName>>; + return __parallel_merge_submitter()( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); } else { From 4039c85168fd463c5ec71e06c7239c6a603df26a Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 15 Nov 2024 13:17:05 +0100 Subject: [PATCH 12/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - load source data into SLM by all available work-items in the group Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 107 +++++++----------- 1 file changed, 43 insertions(+), 64 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index fb4e83f48e0..a99f99d4fa2 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -283,7 +283,6 @@ void load_data(std::size_t __n1, std::size_t 
__n2, std::size_t __wg_id, std::size_t __rng_no, std::size_t __local_idx, _RngTo& __rng_to, std::size_t __idx_to, const _RngFrom& __rng_from, std::size_t __idx_from, _IdType __wg_data_size_rng, _IdType __items_in_wg_count, - std::size_t __max_wi_amount_for_data_loading, const std::size_t __loading_data_per_wi, const _split_point_t<_IdType> __sp_base_left_global, const _split_point_t<_IdType> __sp_base_right_global) @@ -484,85 +483,65 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, auto __rngs_data_in_slm1 = std::addressof(__loc_acc_rng1[0]) + offset_to_slm1; auto __rngs_data_in_slm2 = std::addressof(__loc_acc_rng2[0]) + offset_to_slm2; - constexpr std::size_t __max_wi_amount_for_data_loading = 16; - - if (__local_idx < __max_wi_amount_for_data_loading) + //////////////////////////////////////////////////////////////////////////////////////// + // Cooperative data load from __rng1 to __rngs_data_in_slm1 + if (__wg_data_size_rng1 > 0) { - //////////////////////////////////////////////////////////////////////////////////////// - // Load the current part of merging data placed between two base diagonals into SLM - - // https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2023-0/shared-local-memory.html - // SLM: 64 bytes x 16 banks (granularity: 4 bytes / 32 bits) - // the goal - each WI should write into separate bank - // -> load from max 16 work-items (defined at __max_wi_amount_for_data_loading) - // -> it is necessary to ensure sequential writing to adjacent addresses of SLM memory - - //////////////////////////////////////////////////////////////////////////////////////// - // Cooperative data load from __rng1 to __rngs_data_in_slm1 - if (__wg_data_size_rng1 > 0) - { - // Calculate the size of the current part of merging data per work-item - const std::size_t __loading_data_per_wi = oneapi::dpl::__internal::__dpl_ceiling_div(__wg_data_size_rng1, std::min((std::size_t)__items_in_wg_count, __max_wi_amount_for_data_loading)); + 
// Calculate the size of the current part of merging data per work-item + const std::size_t __loading_data_per_wi = oneapi::dpl::__internal::__dpl_ceiling_div(__wg_data_size_rng1, __items_in_wg_count); - // Calculate the range of SLM indexes of loading data - const std::size_t __slm_idx_begin = __local_idx * __loading_data_per_wi; - const std::size_t __slm_idx_end = __slm_idx_begin + __loading_data_per_wi; + // Calculate the range of SLM indexes of loading data + const std::size_t __slm_idx_begin = __local_idx * __loading_data_per_wi; + const std::size_t __slm_idx_end = __slm_idx_begin + __loading_data_per_wi; - for (std::size_t __slm_idx = __slm_idx_begin; __slm_idx < __slm_idx_end; ++__slm_idx) + for (std::size_t __slm_idx = __slm_idx_begin; __slm_idx < __slm_idx_end; ++__slm_idx) + { + const std::size_t __rng_idx = __sp_base_left_global.first + __slm_idx; + if (__rng_idx < __sp_base_right_global.first) { - const std::size_t __rng_idx = __sp_base_left_global.first + __slm_idx; - if (__rng_idx < __sp_base_right_global.first) - { - assert(__slm_idx < __wg_data_size_rng1); - assert(__rng_idx < __n1); + assert(__slm_idx < __wg_data_size_rng1); + assert(__rng_idx < __n1); #if !USE_DEBUG_CODE_IN_MERGE_SUBMITTER_LARGE - __rngs_data_in_slm1[__slm_idx] = __rng1[__rng_idx]; + __rngs_data_in_slm1[__slm_idx] = __rng1[__rng_idx]; #else - load_data(__n1, __n2, __wg_id, 1, __local_idx, __rngs_data_in_slm1, __slm_idx, __rng1, __rng_idx, - __wg_data_size_rng2, - __items_in_wg_count, - __max_wi_amount_for_data_loading, - __loading_data_per_wi, - __sp_base_left_global, - __sp_base_right_global); + load_data(__n1, __n2, __wg_id, 1, __local_idx, __rngs_data_in_slm1, __slm_idx, __rng1, __rng_idx, + __wg_data_size_rng2, + __items_in_wg_count, + __loading_data_per_wi, + __sp_base_left_global, + __sp_base_right_global); #endif - } } } + } - //////////////////////////////////////////////////////////////////////////////////////// - // Cooperative data load from __rng2 to 
__rngs_data_in_slm2 - if (__wg_data_size_rng2 > 0) - { - // __loading_data_per_wi = 3, __sp_base_left_global = (521, 247), __sp_base_right_global = (521, 260) - // -> __wg_data_size_rng2 = 260 - 247 = 13 - // -> __loading_data_per_wi = __dpl_ceiling_div(13, 6) = 3 - // Calculate the size of the current part of merging data per work-item - const std::size_t __loading_data_per_wi = oneapi::dpl::__internal::__dpl_ceiling_div(__wg_data_size_rng2, std::min((std::size_t)__items_in_wg_count, __max_wi_amount_for_data_loading)); + //////////////////////////////////////////////////////////////////////////////////////// + // Cooperative data load from __rng2 to __rngs_data_in_slm2 + if (__wg_data_size_rng2 > 0) + { + const std::size_t __loading_data_per_wi = oneapi::dpl::__internal::__dpl_ceiling_div(__wg_data_size_rng2, __items_in_wg_count); - // Calculate the range of SLM indexes of loading data - const std::size_t __slm_idx_begin = __local_idx * __loading_data_per_wi; - const std::size_t __slm_idx_end = __slm_idx_begin + __loading_data_per_wi; + // Calculate the range of SLM indexes of loading data + const std::size_t __slm_idx_begin = __local_idx * __loading_data_per_wi; + const std::size_t __slm_idx_end = __slm_idx_begin + __loading_data_per_wi; - for (std::size_t __slm_idx = __slm_idx_begin; __slm_idx < __slm_idx_end; ++__slm_idx) + for (std::size_t __slm_idx = __slm_idx_begin; __slm_idx < __slm_idx_end; ++__slm_idx) + { + const std::size_t __rng_idx = __sp_base_left_global.second + __slm_idx; + if (__rng_idx < __sp_base_right_global.second) { - const std::size_t __rng_idx = __sp_base_left_global.second + __slm_idx; - if (__rng_idx < __sp_base_right_global.second) - { - assert(__slm_idx < __wg_data_size_rng2); - assert(__rng_idx < __n2); + assert(__slm_idx < __wg_data_size_rng2); + assert(__rng_idx < __n2); #if !USE_DEBUG_CODE_IN_MERGE_SUBMITTER_LARGE - __rngs_data_in_slm2[__slm_idx] = __rng2[__rng_idx]; + __rngs_data_in_slm2[__slm_idx] = __rng2[__rng_idx]; #else - 
load_data(__n1, __n2, __wg_id, 2, __local_idx, __rngs_data_in_slm2, __slm_idx, __rng2, __rng_idx, - __wg_data_size_rng2, - __items_in_wg_count, - __max_wi_amount_for_data_loading, - __loading_data_per_wi, - __sp_base_left_global, - __sp_base_right_global); + load_data(__n1, __n2, __wg_id, 2, __local_idx, __rngs_data_in_slm2, __slm_idx, __rng2, __rng_idx, + __wg_data_size_rng2, + __items_in_wg_count, + __loading_data_per_wi, + __sp_base_left_global, + __sp_base_right_global); #endif - } } } } From 50f04459f2afc99c7f66acc03a7df4eca95c13ee Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 15 Nov 2024 14:57:27 +0100 Subject: [PATCH 13/80] extract function load_data_into_slm to load source data into SLM --- .../dpcpp/parallel_backend_sycl_merge.h | 112 ++++++++---------- 1 file changed, 48 insertions(+), 64 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index a99f99d4fa2..dff041bbd74 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -181,7 +181,7 @@ __find_start_point(const _Rng1& __rng1, const _Rng2& __rng2, const _Index __i_el return __result; } - return std::make_pair(0, 0); + return __zero_split_point<_Index>; } // Do serial merge of the data from rng1 (starting from start1) and rng2 (starting from start2) and writing @@ -355,6 +355,50 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, } }; + template + static void + load_data_into_slm(_Range&& __rng, _DataType* __slm, + std::size_t __sp_base_left_global_from, std::size_t __sp_base_left_global_to, + std::size_t __items_in_wg_count, std::size_t __local_idx) + { + const std::size_t __wg_data_size_rng = __sp_base_left_global_to - __sp_base_left_global_from; + if (__wg_data_size_rng > 0) + { + // Calculate the size of the current part of merging data per work-item + const 
std::size_t __loading_data_per_wi = oneapi::dpl::__internal::__dpl_ceiling_div(__rng.size(), __items_in_wg_count); + assert(__loading_data_per_wi > 0); + + if (__loading_data_per_wi > 1) + { + const auto __slm_idx_begin = __local_idx * __loading_data_per_wi; + const auto __slm_idx_end = __slm_idx_begin + __loading_data_per_wi; + + for (std::size_t __slm_idx = __slm_idx_begin; __slm_idx < __slm_idx_end; ++__slm_idx) + { + const std::size_t __rng_idx = __sp_base_left_global_from + __slm_idx; + if (__rng_idx < __sp_base_left_global_to) + { + assert(__slm_idx < __wg_data_size_rng); + assert(__rng_idx < __rng.size()); + __slm[__slm_idx] = __rng[__rng_idx]; + } + } + } + else + { + assert(__loading_data_per_wi == 1); + + const std::size_t __rng_idx = __sp_base_left_global_from + __local_idx; + if (__rng_idx < __sp_base_left_global_to) + { + assert(__local_idx < __wg_data_size_rng); + assert(__rng_idx < __rng.size()); + __slm[__local_idx] = __rng[__rng_idx]; + } + } + } + } + template auto operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const @@ -483,71 +527,11 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, auto __rngs_data_in_slm1 = std::addressof(__loc_acc_rng1[0]) + offset_to_slm1; auto __rngs_data_in_slm2 = std::addressof(__loc_acc_rng2[0]) + offset_to_slm2; - //////////////////////////////////////////////////////////////////////////////////////// - // Cooperative data load from __rng1 to __rngs_data_in_slm1 - if (__wg_data_size_rng1 > 0) - { - // Calculate the size of the current part of merging data per work-item - const std::size_t __loading_data_per_wi = oneapi::dpl::__internal::__dpl_ceiling_div(__wg_data_size_rng1, __items_in_wg_count); - - // Calculate the range of SLM indexes of loading data - const std::size_t __slm_idx_begin = __local_idx * __loading_data_per_wi; - const std::size_t __slm_idx_end = __slm_idx_begin + __loading_data_per_wi; - - for (std::size_t __slm_idx = 
__slm_idx_begin; __slm_idx < __slm_idx_end; ++__slm_idx) - { - const std::size_t __rng_idx = __sp_base_left_global.first + __slm_idx; - if (__rng_idx < __sp_base_right_global.first) - { - assert(__slm_idx < __wg_data_size_rng1); - assert(__rng_idx < __n1); -#if !USE_DEBUG_CODE_IN_MERGE_SUBMITTER_LARGE - __rngs_data_in_slm1[__slm_idx] = __rng1[__rng_idx]; -#else - load_data(__n1, __n2, __wg_id, 1, __local_idx, __rngs_data_in_slm1, __slm_idx, __rng1, __rng_idx, - __wg_data_size_rng2, - __items_in_wg_count, - __loading_data_per_wi, - __sp_base_left_global, - __sp_base_right_global); -#endif - } - } - } - - //////////////////////////////////////////////////////////////////////////////////////// - // Cooperative data load from __rng2 to __rngs_data_in_slm2 - if (__wg_data_size_rng2 > 0) - { - const std::size_t __loading_data_per_wi = oneapi::dpl::__internal::__dpl_ceiling_div(__wg_data_size_rng2, __items_in_wg_count); - - // Calculate the range of SLM indexes of loading data - const std::size_t __slm_idx_begin = __local_idx * __loading_data_per_wi; - const std::size_t __slm_idx_end = __slm_idx_begin + __loading_data_per_wi; - - for (std::size_t __slm_idx = __slm_idx_begin; __slm_idx < __slm_idx_end; ++__slm_idx) - { - const std::size_t __rng_idx = __sp_base_left_global.second + __slm_idx; - if (__rng_idx < __sp_base_right_global.second) - { - assert(__slm_idx < __wg_data_size_rng2); - assert(__rng_idx < __n2); -#if !USE_DEBUG_CODE_IN_MERGE_SUBMITTER_LARGE - __rngs_data_in_slm2[__slm_idx] = __rng2[__rng_idx]; -#else - load_data(__n1, __n2, __wg_id, 2, __local_idx, __rngs_data_in_slm2, __slm_idx, __rng2, __rng_idx, - __wg_data_size_rng2, - __items_in_wg_count, - __loading_data_per_wi, - __sp_base_left_global, - __sp_base_right_global); -#endif - } - } - } + // Cooperative data load from __rng1 to __rngs_data_in_slm1, from __rng2 to __rngs_data_in_slm2 + load_data_into_slm(__rng1, __rngs_data_in_slm1, __sp_base_left_global.first, __sp_base_right_global.first, 
__items_in_wg_count, __local_idx); + load_data_into_slm(__rng2, __rngs_data_in_slm2, __sp_base_left_global.second, __sp_base_right_global.second, __items_in_wg_count, __local_idx); // Wait until all the data is loaded - // - we shouldn't setup this barrier under any conditions!!! __dpl_sycl::__group_barrier(__nd_item); // Current diagonal inside of the merge matrix? From 3a0a8625926e48a52ba02e54b07316da10db854b Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 15 Nov 2024 15:35:29 +0100 Subject: [PATCH 14/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - remove debug code Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 39 ------------------- 1 file changed, 39 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index dff041bbd74..fdeac2846e4 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -26,8 +26,6 @@ #include "sycl_defs.h" #include "parallel_backend_sycl_utils.h" -#define USE_DEBUG_CODE_IN_MERGE_SUBMITTER_LARGE 0 - namespace oneapi { namespace dpl @@ -276,33 +274,6 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_M template struct __parallel_merge_submitter_large; -#if USE_DEBUG_CODE_IN_MERGE_SUBMITTER_LARGE -// TODO remove debug code -template -void -load_data(std::size_t __n1, std::size_t __n2, std::size_t __wg_id, std::size_t __rng_no, std::size_t __local_idx, _RngTo& __rng_to, std::size_t __idx_to, const _RngFrom& __rng_from, std::size_t __idx_from, - _IdType __wg_data_size_rng, - _IdType __items_in_wg_count, - const std::size_t __loading_data_per_wi, - const _split_point_t<_IdType> __sp_base_left_global, - const _split_point_t<_IdType> __sp_base_right_global) -{ - __rng_to[__idx_to] = __rng_from[__idx_from]; -} - -// TODO remove debug code -template 
-void -dump_split_point(_IdType __idx, const _split_point_t<_IdType> __sp) -{ - auto first = __sp.first; - auto second = __sp.second; - - first = first; - second = second; -} -#endif - template struct __parallel_merge_submitter_large<_IdType, _CustomName, __internal::__optional_kernel_name<_DiagonalsKernelName...>, @@ -500,16 +471,6 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // Merge matrix base diagonal's GLOBAL index const std::size_t __wg_id = __nd_item.get_group_linear_id(); -#if USE_DEBUG_CODE_IN_MERGE_SUBMITTER_LARGE - // TODO remove debug code: dump split points - { - if (__wg_id == 0 && __local_idx == 0) - for (std::size_t i = 0; i < __wg_count + 1; ++i) - dump_split_point(i, __base_diagonals_sp_global_ptr[i]); - __dpl_sycl::__group_barrier(__nd_item); - } -#endif - // Split points on left anr right base diagonals // - in GLOBAL coordinates assert(__wg_id + 1 < __wg_count + 1); From ab38d96d687bf7b3ed6d357bbc9034bf9660427a Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 15 Nov 2024 15:38:34 +0100 Subject: [PATCH 15/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - rename some variables Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 47 +++++++++---------- 1 file changed, 21 insertions(+), 26 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index fdeac2846e4..baf701bffaa 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -330,7 +330,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, static void load_data_into_slm(_Range&& __rng, _DataType* __slm, std::size_t __sp_base_left_global_from, std::size_t __sp_base_left_global_to, - std::size_t __items_in_wg_count, std::size_t __local_idx) + std::size_t __items_in_wg_count, std::size_t __local_id) { 
const std::size_t __wg_data_size_rng = __sp_base_left_global_to - __sp_base_left_global_from; if (__wg_data_size_rng > 0) @@ -341,7 +341,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, if (__loading_data_per_wi > 1) { - const auto __slm_idx_begin = __local_idx * __loading_data_per_wi; + const auto __slm_idx_begin = __local_id * __loading_data_per_wi; const auto __slm_idx_end = __slm_idx_begin + __loading_data_per_wi; for (std::size_t __slm_idx = __slm_idx_begin; __slm_idx < __slm_idx_end; ++__slm_idx) @@ -359,12 +359,12 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, { assert(__loading_data_per_wi == 1); - const std::size_t __rng_idx = __sp_base_left_global_from + __local_idx; + const std::size_t __rng_idx = __sp_base_left_global_from + __local_id; if (__rng_idx < __sp_base_left_global_to) { - assert(__local_idx < __wg_data_size_rng); + assert(__local_id < __wg_data_size_rng); assert(__rng_idx < __rng.size()); - __slm[__local_idx] = __rng[__rng_idx]; + __slm[__local_id] = __rng[__rng_idx]; } } } @@ -420,24 +420,24 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __cgh.parallel_for<_DiagonalsKernelName...>( sycl::range(__wg_count + 1), [=](sycl::item __item_id) { - const std::size_t __global_idx = __item_id.get_linear_id(); + const std::size_t __linear_id = __item_id.get_linear_id(); _split_point_t<_IdType>* __base_diagonals_sp_global_ptr = __base_diagonals_sp_storage_t::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); // Save top-left split point for first/last base diagonals of merge matrix // - in GLOBAL coordinates - _split_point_t<_IdType> __sp(__global_idx == 0 ? __zero_split_point<_IdType> : _split_point_t<_IdType>{__n1, __n2}); + _split_point_t<_IdType> __sp(__linear_id == 0 ? 
__zero_split_point : _split_point_t{__n1, __n2}); - if (0 < __global_idx && __global_idx < __wg_count) + if (0 < __linear_id && __linear_id < __wg_count) { - const _IdType __i_elem = __global_idx * __items_in_wg_count * __chunk; + const _IdType __i_elem = __linear_id * __items_in_wg_count * __chunk; // Save bottom-right split point for current base diagonal of merge matrix // - in GLOBAL coordinates __sp = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); } - __base_diagonals_sp_global_ptr[__global_idx] = __sp; + __base_diagonals_sp_global_ptr[__linear_id] = __sp; }); }); @@ -462,20 +462,15 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, sycl::nd_range(__wg_count * __items_in_wg_count, __items_in_wg_count), [=](sycl::nd_item __nd_item) { - // Merge matrix diagonal's GLOBAL index - const std::size_t __global_idx = __nd_item.get_global_linear_id(); - - // Merge sub-matrix LOCAL diagonal's index - const std::size_t __local_idx = __nd_item.get_local_id(0); - - // Merge matrix base diagonal's GLOBAL index - const std::size_t __wg_id = __nd_item.get_group_linear_id(); + const std::size_t __global_linear_id = __nd_item.get_global_linear_id(); // Merge matrix diagonal's GLOBAL index + const std::size_t __local_id = __nd_item.get_local_id(0); // Merge sub-matrix LOCAL diagonal's index + const std::size_t __group_linear_id = __nd_item.get_group_linear_id(); // Merge matrix base diagonal's GLOBAL index // Split points on left anr right base diagonals // - in GLOBAL coordinates - assert(__wg_id + 1 < __wg_count + 1); - const _split_point_t<_IdType>& __sp_base_left_global = __base_diagonals_sp_global_ptr[__wg_id]; - const _split_point_t<_IdType>& __sp_base_right_global = __base_diagonals_sp_global_ptr[__wg_id + 1]; + assert(__group_linear_id + 1 < __wg_count + 1); + const _split_point_t& __sp_base_left_global = __base_diagonals_sp_global_ptr[__group_linear_id]; + const _split_point_t& __sp_base_right_global = 
__base_diagonals_sp_global_ptr[__group_linear_id + 1]; assert(__sp_base_right_global.first >= __sp_base_left_global.first); assert(__sp_base_right_global.second >= __sp_base_left_global.second); @@ -489,14 +484,14 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, auto __rngs_data_in_slm2 = std::addressof(__loc_acc_rng2[0]) + offset_to_slm2; // Cooperative data load from __rng1 to __rngs_data_in_slm1, from __rng2 to __rngs_data_in_slm2 - load_data_into_slm(__rng1, __rngs_data_in_slm1, __sp_base_left_global.first, __sp_base_right_global.first, __items_in_wg_count, __local_idx); - load_data_into_slm(__rng2, __rngs_data_in_slm2, __sp_base_left_global.second, __sp_base_right_global.second, __items_in_wg_count, __local_idx); + load_data_into_slm(__rng1, __rngs_data_in_slm1, __sp_base_left_global.first, __sp_base_right_global.first, __items_in_wg_count, __local_id); + load_data_into_slm(__rng2, __rngs_data_in_slm2, __sp_base_left_global.second, __sp_base_right_global.second, __items_in_wg_count, __local_id); // Wait until all the data is loaded __dpl_sycl::__group_barrier(__nd_item); // Current diagonal inside of the merge matrix? - if (__global_idx * __chunk < __n) + if (__global_linear_id * __chunk < __n) { // We are between two base diagonals and need to find the start points in the merge matrix area, // limited by split points of the left and right base diagonals. @@ -505,7 +500,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // - bottom-right split point describes the size of current area between two base diagonals. 
const _split_point_t<_IdType> __sp_local = __find_start_point( __rngs_data_in_slm1, __rngs_data_in_slm2, // SLM cached copy of merging data - (_IdType)(__local_idx * __chunk), // __i_elem in LOCAL coordinates because __rngs_data_in_slm1 and __rngs_data_in_slm2 is work-group SLM cached copy of source data + (_IdType)(__local_id * __chunk), // __i_elem in LOCAL coordinates because __rngs_data_in_slm1 and __rngs_data_in_slm2 is work-group SLM cached copy of source data __wg_data_size_rng1, // size of rng1 __wg_data_size_rng2, // size of rng2 __comp); @@ -516,7 +511,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __rng3, // Destination range __sp_local.first, // __start1 in LOCAL coordinates because __rngs_data_in_slm1 is work-group SLM cached copy of source data __sp_local.second, // __start2 in LOCAL coordinates because __rngs_data_in_slm2 is work-group SLM cached copy of source data - (_IdType)(__global_idx * __chunk), // __start3 in GLOBAL coordinates because __rng3 is not cached at all + (_IdType)(__global_linear_id * __chunk), // __start3 in GLOBAL coordinates because __rng3 is not cached at all __chunk, __wg_data_size_rng1, // size of __rngs_data_in_slm1 __wg_data_size_rng2, // size of __rngs_data_in_slm2 From cbbfb06e4c79421b3b52a60e67be61b120ddb330 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 15 Nov 2024 15:39:44 +0100 Subject: [PATCH 16/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - removed redundand comment Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index baf701bffaa..cfd3f54d4da 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -493,9 +493,6 @@ struct 
__parallel_merge_submitter_large<_IdType, _CustomName, // Current diagonal inside of the merge matrix? if (__global_linear_id * __chunk < __n) { - // We are between two base diagonals and need to find the start points in the merge matrix area, - // limited by split points of the left and right base diagonals. - // Find split point in LOCAL coordinates // - bottom-right split point describes the size of current area between two base diagonals. const _split_point_t<_IdType> __sp_local = __find_start_point( From f73cf27c84b754252d2b50428c3b4717f8e43ad0 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 15 Nov 2024 15:40:33 +0100 Subject: [PATCH 17/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - removed redundand assert Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index cfd3f54d4da..e7ac839bb76 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -468,7 +468,6 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // Split points on left anr right base diagonals // - in GLOBAL coordinates - assert(__group_linear_id + 1 < __wg_count + 1); const _split_point_t& __sp_base_left_global = __base_diagonals_sp_global_ptr[__group_linear_id]; const _split_point_t& __sp_base_right_global = __base_diagonals_sp_global_ptr[__group_linear_id + 1]; From 75f6e4adbc79a7690d1b4bae4a58890eba6d2ac2 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 15 Nov 2024 15:43:15 +0100 Subject: [PATCH 18/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix unused variable Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 9 +-------- 1 file changed, 1 
insertion(+), 8 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index e7ac839bb76..8e7387ec442 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -427,15 +427,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // Save top-left split point for first/last base diagonals of merge matrix // - in GLOBAL coordinates _split_point_t<_IdType> __sp(__linear_id == 0 ? __zero_split_point : _split_point_t{__n1, __n2}); - if (0 < __linear_id && __linear_id < __wg_count) - { - const _IdType __i_elem = __linear_id * __items_in_wg_count * __chunk; - - // Save bottom-right split point for current base diagonal of merge matrix - // - in GLOBAL coordinates - __sp = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); - } + __sp = __find_start_point(__rng1, __rng2, (_IdType)(__linear_id * __items_in_wg_count * __chunk), __n1, __n2, __comp); __base_diagonals_sp_global_ptr[__linear_id] = __sp; }); From 15f52916ffad1043e5eda8ea8cde1bf80c67a110 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 15 Nov 2024 15:52:42 +0100 Subject: [PATCH 19/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - rename some variables Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 38 +++++++++---------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 8e7387ec442..970f28f8584 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -467,17 +467,17 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, assert(__sp_base_right_global.first >= 
__sp_base_left_global.first); assert(__sp_base_right_global.second >= __sp_base_left_global.second); - const _IdType __wg_data_size_rng1 = __sp_base_right_global.first - __sp_base_left_global.first; - const _IdType __wg_data_size_rng2 = __sp_base_right_global.second - __sp_base_left_global.second; + const _IdType __rng1_wg_data_size = __sp_base_right_global.first - __sp_base_left_global.first; + const _IdType __rng2_wg_data_size = __sp_base_right_global.second - __sp_base_left_global.second; - auto [__loc_acc_rng1, offset_to_slm1] = __merge_slm_helper::template get_local_accessor<0>(loc_acc_pack); - auto [__loc_acc_rng2, offset_to_slm2] = __merge_slm_helper::template get_local_accessor<1>(loc_acc_pack, __wg_data_size_rng1); - auto __rngs_data_in_slm1 = std::addressof(__loc_acc_rng1[0]) + offset_to_slm1; - auto __rngs_data_in_slm2 = std::addressof(__loc_acc_rng2[0]) + offset_to_slm2; + auto [__rng1_loc_acc, offset_to_slm1] = __merge_slm_helper::template get_local_accessor<0>(loc_acc_pack); + auto [__rng2_loc_acc, offset_to_slm2] = __merge_slm_helper::template get_local_accessor<1>(loc_acc_pack, __rng1_wg_data_size); + auto __rng1_cache_slm = std::addressof(__rng1_loc_acc[0]) + offset_to_slm1; + auto __rng2_cache_slm = std::addressof(__rng2_loc_acc[0]) + offset_to_slm2; - // Cooperative data load from __rng1 to __rngs_data_in_slm1, from __rng2 to __rngs_data_in_slm2 - load_data_into_slm(__rng1, __rngs_data_in_slm1, __sp_base_left_global.first, __sp_base_right_global.first, __items_in_wg_count, __local_id); - load_data_into_slm(__rng2, __rngs_data_in_slm2, __sp_base_left_global.second, __sp_base_right_global.second, __items_in_wg_count, __local_id); + // Cooperative data load from __rng1 to __rng1_cache_slm, from __rng2 to __rng1_cache_slm + load_data_into_slm(__rng1, __rng1_cache_slm, __sp_base_left_global.first, __sp_base_right_global.first, __items_in_wg_count, __local_id); + load_data_into_slm(__rng2, __rng2_cache_slm, __sp_base_left_global.second, 
__sp_base_right_global.second, __items_in_wg_count, __local_id); // Wait until all the data is loaded __dpl_sycl::__group_barrier(__nd_item); @@ -488,22 +488,20 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // Find split point in LOCAL coordinates // - bottom-right split point describes the size of current area between two base diagonals. const _split_point_t<_IdType> __sp_local = __find_start_point( - __rngs_data_in_slm1, __rngs_data_in_slm2, // SLM cached copy of merging data - (_IdType)(__local_id * __chunk), // __i_elem in LOCAL coordinates because __rngs_data_in_slm1 and __rngs_data_in_slm2 is work-group SLM cached copy of source data - __wg_data_size_rng1, // size of rng1 - __wg_data_size_rng2, // size of rng2 + __rng1_cache_slm, __rng2_cache_slm, // SLM cached copy of merging data + (_IdType)(__local_id * __chunk), // __i_elem in LOCAL coordinates because __rng1_cache_slm and __rng1_cache_slm is work-group SLM cached copy of source data + __rng1_wg_data_size, __rng2_wg_data_size, // size of rng1 and rng2 __comp); // Merge data for the current diagonal // - we should have here __sp_global in GLOBAL coordinates - __serial_merge(__rngs_data_in_slm1, __rngs_data_in_slm2, // SLM cached copy of merging data - __rng3, // Destination range - __sp_local.first, // __start1 in LOCAL coordinates because __rngs_data_in_slm1 is work-group SLM cached copy of source data - __sp_local.second, // __start2 in LOCAL coordinates because __rngs_data_in_slm2 is work-group SLM cached copy of source data - (_IdType)(__global_linear_id * __chunk), // __start3 in GLOBAL coordinates because __rng3 is not cached at all + __serial_merge(__rng1_cache_slm, __rng2_cache_slm, // SLM cached copy of merging data + __rng3, // Destination range + __sp_local.first, // __start1 in LOCAL coordinates because __rng1_cache_slm is work-group SLM cached copy of source data + __sp_local.second, // __start2 in LOCAL coordinates because __rng1_cache_slm is work-group SLM cached copy of 
source data + (_IdType)(__global_linear_id * __chunk), // __start3 in GLOBAL coordinates because __rng3 is not cached at all __chunk, - __wg_data_size_rng1, // size of __rngs_data_in_slm1 - __wg_data_size_rng2, // size of __rngs_data_in_slm2 + __rng1_wg_data_size, __rng2_wg_data_size, // size of rng1 and rng2 __comp); } }); From 68f3d251ee4e223bf9510c39e599f5ff6402587a Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 15 Nov 2024 18:29:51 +0100 Subject: [PATCH 20/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - declare load_data_into_slm as inline Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 970f28f8584..8910aec321d 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -327,7 +327,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, }; template - static void + inline static void load_data_into_slm(_Range&& __rng, _DataType* __slm, std::size_t __sp_base_left_global_from, std::size_t __sp_base_left_global_to, std::size_t __items_in_wg_count, std::size_t __local_id) From 07d7143b579edf0a5558cb22ff402e7dce7b6274 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 15 Nov 2024 18:39:32 +0100 Subject: [PATCH 21/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - removed redundand assert Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 8910aec321d..49e0df27fcb 100644 --- 
a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -357,8 +357,6 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, } else { - assert(__loading_data_per_wi == 1); - const std::size_t __rng_idx = __sp_base_left_global_from + __local_id; if (__rng_idx < __sp_base_left_global_to) { From 6c852bfbdfa021f960a7d5b9f810fc7eced25697 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 15 Nov 2024 22:43:38 +0100 Subject: [PATCH 22/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - additional comments for load_data_into_slm Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 58 ++++++++++++++----- 1 file changed, 43 insertions(+), 15 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 49e0df27fcb..0108ad232f2 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -329,14 +329,42 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, template inline static void load_data_into_slm(_Range&& __rng, _DataType* __slm, - std::size_t __sp_base_left_global_from, std::size_t __sp_base_left_global_to, - std::size_t __items_in_wg_count, std::size_t __local_id) + std::size_t __idx_global_begin, std::size_t __idx_global_end, + std::size_t __wi_in_one_wg, std::size_t __local_id) { - const std::size_t __wg_data_size_rng = __sp_base_left_global_to - __sp_base_left_global_from; + // How we load data: + /* + +-------------------+--------------------------------------------------------------+------------+ + | Source data index | Work-items in one work-group | SLM index | + +-------------------+--------------------------------------------------------------+------------+ + | | wi(0) | wi(1) | wi(2) | wi(3) | ... 
| wi(__wi_in_one_wg - 1) | | <--- __local_id: in which work-item we are + +-------------------+-------+-------+-------+-------+-----+------------------------+------------+ + | rng[0] | | | | | | | | + | rng[1] | + | | | | | | slm[0] | <--- __idx_global_begin + | rng[2] | | + | | | | | slm[1] | + | rng[3] | | | + | | | | slm[2] | + | rng[4] | | | | + | | | slm[3] | + | ..... | | | | | +++ | | ... | + | rng[M + 1] | | | | | | + | slm[M] | + | rng[M + 2] | + | | | | | | slm[M + 1] | + | rng[M + 3] | | + | | | | | slm[M + 2] | + | rng[M + 4] | | | + | | | | slm[M + 3] | + | rng[M + 5] | | | | - | | | | <--- __idx_global_end + | ..... | | | | | --- | | | + | rng[M + M + 1] | | | | | | - | | + +-------------------+--------------------------------------------------------------+------------+ + ^ + | + __local_id + + "+" - load one source data item ito SLM + */ + + const std::size_t __wg_data_size_rng = __idx_global_end - __idx_global_begin; if (__wg_data_size_rng > 0) { // Calculate the size of the current part of merging data per work-item - const std::size_t __loading_data_per_wi = oneapi::dpl::__internal::__dpl_ceiling_div(__rng.size(), __items_in_wg_count); + const std::size_t __loading_data_per_wi = oneapi::dpl::__internal::__dpl_ceiling_div(__rng.size(), __wi_in_one_wg); assert(__loading_data_per_wi > 0); if (__loading_data_per_wi > 1) @@ -346,8 +374,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, for (std::size_t __slm_idx = __slm_idx_begin; __slm_idx < __slm_idx_end; ++__slm_idx) { - const std::size_t __rng_idx = __sp_base_left_global_from + __slm_idx; - if (__rng_idx < __sp_base_left_global_to) + const std::size_t __rng_idx = __idx_global_begin + __slm_idx; + if (__rng_idx < __idx_global_end) { assert(__slm_idx < __wg_data_size_rng); assert(__rng_idx < __rng.size()); @@ -357,8 +385,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, } else { - const std::size_t __rng_idx = __sp_base_left_global_from + __local_id; - if (__rng_idx < 
__sp_base_left_global_to) + const std::size_t __rng_idx = __idx_global_begin + __local_id; + if (__rng_idx < __idx_global_end) { assert(__local_id < __wg_data_size_rng); assert(__rng_idx < __rng.size()); @@ -396,8 +424,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, assert(__max_source_data_items_fit_into_slm % __chunk == 0); // The amount of items in the each work-group is the amount of diagonals processing between two work-groups + 1 (for the left base diagonal in work-group) - const std::size_t __items_in_wg_count = __max_source_data_items_fit_into_slm / __chunk; - assert(__items_in_wg_count > 0); + const std::size_t __wi_in_one_wg = __max_source_data_items_fit_into_slm / __chunk; + assert(__wi_in_one_wg > 0); // The amount of the base diagonals is the amount of the work-groups // - also it's the distance between two base diagonals is equal to the amount of work-items in each work-group @@ -426,7 +454,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // - in GLOBAL coordinates _split_point_t<_IdType> __sp(__linear_id == 0 ? 
__zero_split_point : _split_point_t{__n1, __n2}); if (0 < __linear_id && __linear_id < __wg_count) - __sp = __find_start_point(__rng1, __rng2, (_IdType)(__linear_id * __items_in_wg_count * __chunk), __n1, __n2, __comp); + __sp = __find_start_point(__rng1, __rng2, (_IdType)(__linear_id * __wi_in_one_wg * __chunk), __n1, __n2, __comp); __base_diagonals_sp_global_ptr[__linear_id] = __sp; }); @@ -443,14 +471,14 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, auto __base_diagonals_sp_global_acc = __base_diagonals_sp_global_storage.template __get_scratch_acc(__cgh); auto __base_diagonals_sp_global_ptr = __base_diagonals_sp_storage_t::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); - const std::size_t __slm_cached_data_size = __items_in_wg_count * __chunk; + const std::size_t __slm_cached_data_size = __wi_in_one_wg * __chunk; auto loc_acc_pack = __merge_slm_helper::create_local_accessors(__cgh, __rng1, __rng2, __slm_cached_data_size); // Run nd_range parallel_for to process all the data // - each work-group caching source data in SLM and processing diagonals between two base diagonals; // - each work-item processing one diagonal. 
__cgh.parallel_for<_MergeKernelName...>( - sycl::nd_range(__wg_count * __items_in_wg_count, __items_in_wg_count), + sycl::nd_range(__wg_count * __wi_in_one_wg, __wi_in_one_wg), [=](sycl::nd_item __nd_item) { const std::size_t __global_linear_id = __nd_item.get_global_linear_id(); // Merge matrix diagonal's GLOBAL index @@ -474,8 +502,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, auto __rng2_cache_slm = std::addressof(__rng2_loc_acc[0]) + offset_to_slm2; // Cooperative data load from __rng1 to __rng1_cache_slm, from __rng2 to __rng1_cache_slm - load_data_into_slm(__rng1, __rng1_cache_slm, __sp_base_left_global.first, __sp_base_right_global.first, __items_in_wg_count, __local_id); - load_data_into_slm(__rng2, __rng2_cache_slm, __sp_base_left_global.second, __sp_base_right_global.second, __items_in_wg_count, __local_id); + load_data_into_slm(__rng1, __rng1_cache_slm, __sp_base_left_global.first, __sp_base_right_global.first, __wi_in_one_wg, __local_id); + load_data_into_slm(__rng2, __rng2_cache_slm, __sp_base_left_global.second, __sp_base_right_global.second, __wi_in_one_wg, __local_id); // Wait until all the data is loaded __dpl_sycl::__group_barrier(__nd_item); From 64d856db7faeab523c69a2bda1ef9c6fb2b2520f Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 15 Nov 2024 23:09:34 +0100 Subject: [PATCH 23/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - rename some local variables and params Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 38 +++++++++++-------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 0108ad232f2..80fe834ee2d 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -314,15 +314,15 @@ struct 
__parallel_merge_submitter_large<_IdType, _CustomName, template static auto - get_local_accessor(AccessorsTuple& __acc_tuple, std::size_t __offset = 0) + get_local_accessor(AccessorsTuple& __loc_acc_pack, std::size_t __offset = 0) { static_assert(std::tuple_size_v == 1 || std::tuple_size_v == 2); if constexpr (std::tuple_size_v == 1) - return std::pair(std::get<0>(__acc_tuple), __offset); + return std::pair(std::get<0>(__loc_acc_pack), __offset); else - return std::pair(std::get(__acc_tuple), 0); + return std::pair(std::get(__loc_acc_pack), 0); } }; @@ -340,15 +340,23 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, | | wi(0) | wi(1) | wi(2) | wi(3) | ... | wi(__wi_in_one_wg - 1) | | <--- __local_id: in which work-item we are +-------------------+-------+-------+-------+-------+-----+------------------------+------------+ | rng[0] | | | | | | | | - | rng[1] | + | | | | | | slm[0] | <--- __idx_global_begin - | rng[2] | | + | | | | | slm[1] | - | rng[3] | | | + | | | | slm[2] | - | rng[4] | | | | + | | | slm[3] | + | rng[1] | + | | | | | | slm[0] | <--- __idx_global_begin \ + | rng[2] | + | | | | | | slm[1] | | SLM bank: write into one SLM bank from one work-item + | rng[3] | + | | | | | | slm[2] | / + | rng[4] | | + | | | | | slm[3] | + | rng[5] | | + | | | | | slm[3] | + | rng[6] | | + | | | | | slm[3] | + | rng[7] | | | + | | | | slm[3] | + | rng[8] | | | + | | | | slm[3] | + | rng[9] | | | + | | | | slm[3] | + | rng[10] | | | | + | | | slm[3] | + | rng[11] | | | | + | | | slm[3] | + | rng[12] | | | | + | | | slm[3] | | ..... | | | | | +++ | | ... | | rng[M + 1] | | | | | | + | slm[M] | - | rng[M + 2] | + | | | | | | slm[M + 1] | - | rng[M + 3] | | + | | | | | slm[M + 2] | - | rng[M + 4] | | | + | | | | slm[M + 3] | + | rng[M + 2] | | | | | | + | slm[M + 1] | + | rng[M + 3] | | | | | | + | slm[M + 2] | + | rng[M + 4] | | | | | | | slm[M + 3] | | rng[M + 5] | | | | - | | | | <--- __idx_global_end | ..... 
| | | | | --- | | | | rng[M + M + 1] | | | | | | - | | @@ -472,7 +480,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, auto __base_diagonals_sp_global_ptr = __base_diagonals_sp_storage_t::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); const std::size_t __slm_cached_data_size = __wi_in_one_wg * __chunk; - auto loc_acc_pack = __merge_slm_helper::create_local_accessors(__cgh, __rng1, __rng2, __slm_cached_data_size); + auto __loc_acc_pack = __merge_slm_helper::create_local_accessors(__cgh, __rng1, __rng2, __slm_cached_data_size); // Run nd_range parallel_for to process all the data // - each work-group caching source data in SLM and processing diagonals between two base diagonals; @@ -496,10 +504,10 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const _IdType __rng1_wg_data_size = __sp_base_right_global.first - __sp_base_left_global.first; const _IdType __rng2_wg_data_size = __sp_base_right_global.second - __sp_base_left_global.second; - auto [__rng1_loc_acc, offset_to_slm1] = __merge_slm_helper::template get_local_accessor<0>(loc_acc_pack); - auto [__rng2_loc_acc, offset_to_slm2] = __merge_slm_helper::template get_local_accessor<1>(loc_acc_pack, __rng1_wg_data_size); - auto __rng1_cache_slm = std::addressof(__rng1_loc_acc[0]) + offset_to_slm1; - auto __rng2_cache_slm = std::addressof(__rng2_loc_acc[0]) + offset_to_slm2; + auto [__rng1_loc_acc, __offset_to_slm1] = __merge_slm_helper::template get_local_accessor<0>(__loc_acc_pack); + auto [__rng2_loc_acc, __offset_to_slm2] = __merge_slm_helper::template get_local_accessor<1>(__loc_acc_pack, __rng1_wg_data_size); + auto __rng1_cache_slm = std::addressof(__rng1_loc_acc[0]) + __offset_to_slm1; + auto __rng2_cache_slm = std::addressof(__rng2_loc_acc[0]) + __offset_to_slm2; // Cooperative data load from __rng1 to __rng1_cache_slm, from __rng2 to __rng1_cache_slm load_data_into_slm(__rng1, __rng1_cache_slm, __sp_base_left_global.first, __sp_base_right_global.first, 
__wi_in_one_wg, __local_id); From 3d233dd334e7a5356727cf8500897741f86f5ba1 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 18 Nov 2024 12:03:05 +0100 Subject: [PATCH 24/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - rewrite the data loading into SLM cache #1 Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 77 ++++++++++++++++++- 1 file changed, 74 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 80fe834ee2d..1b48594332f 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -328,7 +328,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, template inline static void - load_data_into_slm(_Range&& __rng, _DataType* __slm, + load_data_into_slm_impl(_Range&& __rng, _DataType* __slm, std::size_t __idx_global_begin, std::size_t __idx_global_end, std::size_t __wi_in_one_wg, std::size_t __local_id) { @@ -404,6 +404,76 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, } } + template + static std::size_t + __calc_wi_amount_for_data_reading(const std::size_t __wi_in_one_wg, const std::size_t __reading_data) + { + //const std::size_t __required_reading_data_per_wi = __slm_bank_size / sizeof(_RangeValueType); + + std::size_t __wi_for_data_reading = 0; + if (__reading_data > 0) + { + const std::size_t __required_reading_data_per_wi = oneapi::dpl::__internal::__dpl_ceiling_div(__slm_bank_size, sizeof(_RangeValueType)); + + __wi_for_data_reading = std::min(__wi_in_one_wg, oneapi::dpl::__internal::__dpl_ceiling_div(__reading_data, __required_reading_data_per_wi)); + } + + return __wi_for_data_reading; + } + + template + static void + load_data_into_slm(_Range&& __rng1, _DataType* __slm1, const std::size_t __idx_global_begin1, const std::size_t 
__idx_global_end1, + _Range&& __rng2, _DataType* __slm2, const std::size_t __idx_global_begin2, const std::size_t __idx_global_end2, + const std::size_t __wi_in_one_wg, const std::size_t __local_id) + { + // TODO what size of SLM bank we have now? + constexpr std::size_t __slm_bank_size = 1024; + +#if 0 + auto __n1 = __rng1.size(); + auto __n2 = __rng2.size(); + + if (__n1 == 521 && __n2 == 260) + { + __n1 = __n1; + __n2 = __n2; + } +#endif + + using _Range1ValueType = typename std::iterator_traits::value_type; + using _Range2ValueType = typename std::iterator_traits::value_type; + + // Calculate how many work-items should read the part of __rng1 and __rng2 into SLM cache + const std::size_t __wi_for_data_reading1 = __calc_wi_amount_for_data_reading<__slm_bank_size, _Range1ValueType>(__wi_in_one_wg, __idx_global_end1 - __idx_global_begin1); + const std::size_t __wi_for_data_reading2 = __calc_wi_amount_for_data_reading<__slm_bank_size, _Range2ValueType>(__wi_in_one_wg, __idx_global_end2 - __idx_global_begin2); + +#if 0 + const std::size_t __wi_for_data_reading1_128 = __calc_wi_amount_for_data_reading<128, _Range1ValueType>(__wi_in_one_wg, __idx_global_end1 - __idx_global_begin1); + const std::size_t __wi_for_data_reading2_128 = __calc_wi_amount_for_data_reading<128, _Range2ValueType>(__wi_in_one_wg, __idx_global_end2 - __idx_global_begin2); +#endif + + // Now arrange the reading by work-items + if (__wi_in_one_wg >= __wi_for_data_reading1 + __wi_for_data_reading2) + { + if (__local_id < __wi_for_data_reading1) + { + load_data_into_slm_impl(__rng1, __slm1, __idx_global_begin1, __idx_global_end1, __wi_for_data_reading1, __local_id); + } + else if (__local_id < __wi_for_data_reading1 + __wi_for_data_reading2) + { + // When we reading data from parallel-working work-items, we should reduce the local id of current work-item + // because we calculate readed data size based on this value. 
+ load_data_into_slm_impl(__rng2, __slm2, __idx_global_begin2, __idx_global_end2, __wi_for_data_reading2, __local_id - __wi_for_data_reading1); + } + } + else if (__local_id < __wi_for_data_reading1 + __wi_for_data_reading2) + { + load_data_into_slm_impl(__rng1, __slm1, __idx_global_begin1, __idx_global_end1, __wi_for_data_reading1, __local_id); + load_data_into_slm_impl(__rng2, __slm2, __idx_global_begin2, __idx_global_end2, __wi_for_data_reading2, __local_id); + } + } + template auto operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const @@ -510,8 +580,9 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, auto __rng2_cache_slm = std::addressof(__rng2_loc_acc[0]) + __offset_to_slm2; // Cooperative data load from __rng1 to __rng1_cache_slm, from __rng2 to __rng1_cache_slm - load_data_into_slm(__rng1, __rng1_cache_slm, __sp_base_left_global.first, __sp_base_right_global.first, __wi_in_one_wg, __local_id); - load_data_into_slm(__rng2, __rng2_cache_slm, __sp_base_left_global.second, __sp_base_right_global.second, __wi_in_one_wg, __local_id); + load_data_into_slm(__rng1, __rng1_cache_slm, __sp_base_left_global.first, __sp_base_right_global.first, + __rng2, __rng2_cache_slm, __sp_base_left_global.second, __sp_base_right_global.second, + __wi_in_one_wg, __local_id); // Wait until all the data is loaded __dpl_sycl::__group_barrier(__nd_item); From 6a66b1b04d691a165e483c9b9eee0ceee32fc664 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 18 Nov 2024 12:29:45 +0100 Subject: [PATCH 25/80] @@@ include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - always use two separate SLM cache Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 
1b48594332f..3ab7fb14991 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -302,7 +302,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, using _Range1ValueType = typename std::iterator_traits::value_type; using _Range2ValueType = typename std::iterator_traits::value_type; - if constexpr (std::is_same_v<_Range1ValueType, _Range2ValueType>) + if constexpr (false && std::is_same_v<_Range1ValueType, _Range2ValueType>) return std::tuple<__dpl_sycl::__local_accessor<_Range1ValueType>>( __dpl_sycl::__local_accessor<_Range1ValueType>(2 * __slm_cached_data_size, __cgh)); else From 62bf5ce39aa93301b2c3f5bba54fc50fed7c9ebd Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 18 Nov 2024 12:36:36 +0100 Subject: [PATCH 26/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - use large submitter after 16M items Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 3ab7fb14991..e8bbdb45b84 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -631,7 +631,7 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy { using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; - constexpr std::size_t __starting_size_limit_for_large_submitter = 4 * 1'048'576; // 4 Mb + constexpr std::size_t __starting_size_limit_for_large_submitter = 16 * 1'048'576; // 4 Mb const std::size_t __n = __rng1.size() + __rng2.size(); if (__n < __starting_size_limit_for_large_submitter) From bf5b8ce08694c9f7132ad654216f8457690dd1b4 Mon Sep 17 00:00:00 2001 From: 
Sergey Kopienko Date: Mon, 18 Nov 2024 12:47:37 +0100 Subject: [PATCH 27/80] @@@ include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - using __parallel_merge_submitter_large for all data sizes Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index e8bbdb45b84..c9bde2e42e6 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -634,7 +634,7 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy constexpr std::size_t __starting_size_limit_for_large_submitter = 16 * 1'048'576; // 4 Mb const std::size_t __n = __rng1.size() + __rng2.size(); - if (__n < __starting_size_limit_for_large_submitter) + if (false) //if (__n < __starting_size_limit_for_large_submitter) { static_assert(__starting_size_limit_for_large_submitter < std::numeric_limits::max()); From 435259c31c14da2befe324a4564de0cc5bab17da Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 18 Nov 2024 13:17:07 +0100 Subject: [PATCH 28/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - avoid barrier if we have more then one work-item in each work-group Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index c9bde2e42e6..7b01be0f06b 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -584,8 +584,9 @@ struct __parallel_merge_submitter_large<_IdType, 
_CustomName, __rng2, __rng2_cache_slm, __sp_base_left_global.second, __sp_base_right_global.second, __wi_in_one_wg, __local_id); - // Wait until all the data is loaded - __dpl_sycl::__group_barrier(__nd_item); + // Wait until all the data is loaded (if we have more then one item in work-group + if (__wi_in_one_wg > 1) + __dpl_sycl::__group_barrier(__nd_item); // Current diagonal inside of the merge matrix? if (__global_linear_id * __chunk < __n) From 809c0735b6c30281578a6e2a7e651331b0c95adf Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 18 Nov 2024 13:26:17 +0100 Subject: [PATCH 29/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - avoid any action in the __parallel_merge_submitter_large::operator() if we haven't any data to process Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 52 ++++++++++++------- 1 file changed, 34 insertions(+), 18 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 7b01be0f06b..4768cd553fc 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -478,6 +478,9 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, auto operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const { + using _Range1ValueType = typename std::iterator_traits::value_type; + using _Range2ValueType = typename std::iterator_traits::value_type; + const _IdType __n1 = __rng1.size(); const _IdType __n2 = __rng2.size(); const _IdType __n = __n1 + __n2; @@ -563,33 +566,46 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const std::size_t __local_id = __nd_item.get_local_id(0); // Merge sub-matrix LOCAL diagonal's index const std::size_t __group_linear_id = __nd_item.get_group_linear_id(); // Merge matrix base diagonal's 
GLOBAL index - // Split points on left anr right base diagonals - // - in GLOBAL coordinates - const _split_point_t& __sp_base_left_global = __base_diagonals_sp_global_ptr[__group_linear_id]; - const _split_point_t& __sp_base_right_global = __base_diagonals_sp_global_ptr[__group_linear_id + 1]; - - assert(__sp_base_right_global.first >= __sp_base_left_global.first); - assert(__sp_base_right_global.second >= __sp_base_left_global.second); + _IdType __rng1_wg_data_size = 0; + _IdType __rng2_wg_data_size = 0; - const _IdType __rng1_wg_data_size = __sp_base_right_global.first - __sp_base_left_global.first; - const _IdType __rng2_wg_data_size = __sp_base_right_global.second - __sp_base_left_global.second; + _Range1ValueType* __rng1_cache_slm = nullptr; + _Range1ValueType* __rng2_cache_slm = nullptr; - auto [__rng1_loc_acc, __offset_to_slm1] = __merge_slm_helper::template get_local_accessor<0>(__loc_acc_pack); - auto [__rng2_loc_acc, __offset_to_slm2] = __merge_slm_helper::template get_local_accessor<1>(__loc_acc_pack, __rng1_wg_data_size); - auto __rng1_cache_slm = std::addressof(__rng1_loc_acc[0]) + __offset_to_slm1; - auto __rng2_cache_slm = std::addressof(__rng2_loc_acc[0]) + __offset_to_slm2; + // Current diagonal inside of the merge matrix? + const bool __have_data = __global_linear_id * __chunk < __n; - // Cooperative data load from __rng1 to __rng1_cache_slm, from __rng2 to __rng1_cache_slm - load_data_into_slm(__rng1, __rng1_cache_slm, __sp_base_left_global.first, __sp_base_right_global.first, - __rng2, __rng2_cache_slm, __sp_base_left_global.second, __sp_base_right_global.second, - __wi_in_one_wg, __local_id); + // Current diagonal inside of the merge matrix? 
+ if (__have_data) + { + // Split points on left and right base diagonals + // - in GLOBAL coordinates + const _split_point_t& __sp_base_left_global = __base_diagonals_sp_global_ptr[__group_linear_id]; + const _split_point_t& __sp_base_right_global = __base_diagonals_sp_global_ptr[__group_linear_id + 1]; + + assert(__sp_base_right_global.first >= __sp_base_left_global.first); + assert(__sp_base_right_global.second >= __sp_base_left_global.second); + + __rng1_wg_data_size = __sp_base_right_global.first - __sp_base_left_global.first; + __rng2_wg_data_size = __sp_base_right_global.second - __sp_base_left_global.second; + + auto [__rng1_loc_acc, __offset_to_slm1] = __merge_slm_helper::template get_local_accessor<0>(__loc_acc_pack); + auto [__rng2_loc_acc, __offset_to_slm2] = __merge_slm_helper::template get_local_accessor<1>(__loc_acc_pack, __rng1_wg_data_size); + __rng1_cache_slm = std::addressof(__rng1_loc_acc[0]) + __offset_to_slm1; + __rng2_cache_slm = std::addressof(__rng2_loc_acc[0]) + __offset_to_slm2; + + // Cooperative data load from __rng1 to __rng1_cache_slm, from __rng2 to __rng1_cache_slm + load_data_into_slm(__rng1, __rng1_cache_slm, __sp_base_left_global.first, __sp_base_right_global.first, + __rng2, __rng2_cache_slm, __sp_base_left_global.second, __sp_base_right_global.second, + __wi_in_one_wg, __local_id); + } // Wait until all the data is loaded (if we have more then one item in work-group if (__wi_in_one_wg > 1) __dpl_sycl::__group_barrier(__nd_item); // Current diagonal inside of the merge matrix? - if (__global_linear_id * __chunk < __n) + if (__have_data) { // Find split point in LOCAL coordinates // - bottom-right split point describes the size of current area between two base diagonals.
From 021dbb858f053fa919e8245dca05d5d4babc340b Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 18 Nov 2024 13:49:06 +0100 Subject: [PATCH 30/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - remove inline on load_data_into_slm_impl Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 4768cd553fc..78fc978e5d9 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -327,7 +327,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, }; template - inline static void + static void load_data_into_slm_impl(_Range&& __rng, _DataType* __slm, std::size_t __idx_global_begin, std::size_t __idx_global_end, std::size_t __wi_in_one_wg, std::size_t __local_id) From 2fa02678e8bae2ff3fb195c9afab41147c4471a8 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 18 Nov 2024 14:35:32 +0100 Subject: [PATCH 31/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - remove extra condition checks and asserts from load_data_into_slm_impl Signed-off-by: Sergey Kopienko --- .../hetero/dpcpp/parallel_backend_sycl_merge.h | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 78fc978e5d9..46aa6907085 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -380,26 +380,17 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const auto __slm_idx_begin = __local_id * __loading_data_per_wi; const auto 
__slm_idx_end = __slm_idx_begin + __loading_data_per_wi; - for (std::size_t __slm_idx = __slm_idx_begin; __slm_idx < __slm_idx_end; ++__slm_idx) - { - const std::size_t __rng_idx = __idx_global_begin + __slm_idx; - if (__rng_idx < __idx_global_end) - { - assert(__slm_idx < __wg_data_size_rng); - assert(__rng_idx < __rng.size()); + std::size_t __slm_idx = __slm_idx_begin; + std::size_t __rng_idx = __idx_global_begin + __slm_idx; + + for (; __slm_idx < __slm_idx_end && __rng_idx < __idx_global_end; ++__slm_idx, ++__rng_idx) __slm[__slm_idx] = __rng[__rng_idx]; - } - } } else { const std::size_t __rng_idx = __idx_global_begin + __local_id; if (__rng_idx < __idx_global_end) - { - assert(__local_id < __wg_data_size_rng); - assert(__rng_idx < __rng.size()); __slm[__local_id] = __rng[__rng_idx]; - } } } } From 3f95ff12c6badbdce632c74302b03b23a5d05ec1 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 18 Nov 2024 14:36:37 +0100 Subject: [PATCH 32/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - add _ONEDPL_PRAGMA_UNROLL into load_data_into_slm_impl Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 46aa6907085..e0839d38756 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -383,6 +383,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, std::size_t __slm_idx = __slm_idx_begin; std::size_t __rng_idx = __idx_global_begin + __slm_idx; + _ONEDPL_PRAGMA_UNROLL for (; __slm_idx < __slm_idx_end && __rng_idx < __idx_global_end; ++__slm_idx, ++__rng_idx) __slm[__slm_idx] = __rng[__rng_idx]; } From 0caf24c83f6afed95f0443efc707783d918ab86a Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 18 
Nov 2024 14:37:03 +0100 Subject: [PATCH 33/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - rewrite the data loading into SLM cache #1 Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index e0839d38756..dda8755e774 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -385,7 +385,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, _ONEDPL_PRAGMA_UNROLL for (; __slm_idx < __slm_idx_end && __rng_idx < __idx_global_end; ++__slm_idx, ++__rng_idx) - __slm[__slm_idx] = __rng[__rng_idx]; + __slm[__slm_idx] = __rng[__rng_idx]; } else { @@ -400,8 +400,6 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, static std::size_t __calc_wi_amount_for_data_reading(const std::size_t __wi_in_one_wg, const std::size_t __reading_data) { - //const std::size_t __required_reading_data_per_wi = __slm_bank_size / sizeof(_RangeValueType); - std::size_t __wi_for_data_reading = 0; if (__reading_data > 0) { From b12eada173a923b36c1bcdfe58630fbe99aa84c7 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 18 Nov 2024 15:16:13 +0100 Subject: [PATCH 34/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - define __slm_bank_size as 64 in load_data_into_slm Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index dda8755e774..66d497837e9 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ 
b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -418,7 +418,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const std::size_t __wi_in_one_wg, const std::size_t __local_id) { // TODO what size of SLM bank we have now? - constexpr std::size_t __slm_bank_size = 1024; + constexpr std::size_t __slm_bank_size = 64; // = 1024; #if 0 auto __n1 = __rng1.size(); From c2c66acc4ad891e18d9ffde96ce046d9e0bf815e Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 18 Nov 2024 15:17:42 +0100 Subject: [PATCH 35/80] Revert "@@@ include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - always use two separate SLM cache" This reverts commit 6a66b1b04d691a165e483c9b9eee0ceee32fc664. --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 66d497837e9..ee5084998ba 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -302,7 +302,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, using _Range1ValueType = typename std::iterator_traits::value_type; using _Range2ValueType = typename std::iterator_traits::value_type; - if constexpr (false && std::is_same_v<_Range1ValueType, _Range2ValueType>) + if constexpr (std::is_same_v<_Range1ValueType, _Range2ValueType>) return std::tuple<__dpl_sycl::__local_accessor<_Range1ValueType>>( __dpl_sycl::__local_accessor<_Range1ValueType>(2 * __slm_cached_data_size, __cgh)); else From f55c36fe8910450c623a6751bc853423cf912b82 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 18 Nov 2024 15:59:28 +0100 Subject: [PATCH 36/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - remove debug code Signed-off-by: Sergey Kopienko --- 
.../hetero/dpcpp/parallel_backend_sycl_merge.h | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index ee5084998ba..55cac57bfe9 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -420,17 +420,6 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // TODO what size of SLM bank we have now? constexpr std::size_t __slm_bank_size = 64; // = 1024; -#if 0 - auto __n1 = __rng1.size(); - auto __n2 = __rng2.size(); - - if (__n1 == 521 && __n2 == 260) - { - __n1 = __n1; - __n2 = __n2; - } -#endif - using _Range1ValueType = typename std::iterator_traits::value_type; using _Range2ValueType = typename std::iterator_traits::value_type; @@ -438,11 +427,6 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const std::size_t __wi_for_data_reading1 = __calc_wi_amount_for_data_reading<__slm_bank_size, _Range1ValueType>(__wi_in_one_wg, __idx_global_end1 - __idx_global_begin1); const std::size_t __wi_for_data_reading2 = __calc_wi_amount_for_data_reading<__slm_bank_size, _Range2ValueType>(__wi_in_one_wg, __idx_global_end2 - __idx_global_begin2); -#if 0 - const std::size_t __wi_for_data_reading1_128 = __calc_wi_amount_for_data_reading<128, _Range1ValueType>(__wi_in_one_wg, __idx_global_end1 - __idx_global_begin1); - const std::size_t __wi_for_data_reading2_128 = __calc_wi_amount_for_data_reading<128, _Range2ValueType>(__wi_in_one_wg, __idx_global_end2 - __idx_global_begin2); -#endif - // Now arrange the reading by work-items if (__wi_in_one_wg >= __wi_for_data_reading1 + __wi_for_data_reading2) { From 1b5f0a7b9bc4f401155d4a51ed5db4bbdc4f982f Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 18 Nov 2024 16:03:29 +0100 Subject: [PATCH 37/80] 
include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - increate chunk size on GPU to 8 Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 55cac57bfe9..454ebfddee3 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -464,7 +464,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, _PRINT_INFO_IN_DEBUG_MODE(__exec); // Empirical number of values to process per work-item - const _IdType __chunk = __exec.queue().get_device().is_cpu() ? 128 : 4; + const _IdType __chunk = __exec.queue().get_device().is_cpu() ? 128 : 8; assert(__chunk > 0); // Pessimistically only use half of the memory to take into account memory used by compiled kernel From 55169cc3fd4ad703ab15fd90fc8e7d7701c1f65e Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 18 Nov 2024 16:56:40 +0100 Subject: [PATCH 38/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - declare __rng1_from and __rng2_from as constexpr in __find_start_point Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 454ebfddee3..a0b1138229f 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -56,9 +56,10 @@ _split_point_t<_Index> __find_start_point(const _Rng1& __rng1, const _Rng2& __rng2, const _Index __i_elem, const _Index __n1, const _Index __n2, _Compare __comp) { - const _Index 
__rng1_from = 0; + constexpr _Index __rng1_from = 0; + constexpr _Index __rng2_from = 0; + const _Index __rng1_to = __n1; - const _Index __rng2_from = 0; const _Index __rng2_to = __n2; assert(__rng1_from <= __rng1_to); From 9fac5b97f0065a0781b621906e87ac2b44ba359b Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 18 Nov 2024 17:32:44 +0100 Subject: [PATCH 39/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix spell-check error Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index a0b1138229f..5d9fbd43dfd 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -438,7 +438,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, else if (__local_id < __wi_for_data_reading1 + __wi_for_data_reading2) { // When we reading data from parallel-working work-items, we should reduce the local id of current work-item - // because we calculate readed data size based on this value. + // because we calculate read data size based on this value.
load_data_into_slm_impl(__rng2, __slm2, __idx_global_begin2, __idx_global_end2, __wi_for_data_reading2, __local_id - __wi_for_data_reading1); } } From a3284b3c31ba4726671c90d3c499aaaf35b14623 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 18 Nov 2024 17:46:15 +0100 Subject: [PATCH 40/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix review comment Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 5d9fbd43dfd..fd1f1be3118 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -575,9 +575,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __wi_in_one_wg, __local_id); } - // Wait until all the data is loaded (if we have more then one item in work-group - if (__wi_in_one_wg > 1) - __dpl_sycl::__group_barrier(__nd_item); + // Wait until all the data is loaded + __dpl_sycl::__group_barrier(__nd_item); // Current diagonal inside of the merge matrix? 
if (__have_data) From 167490fd3cbc55c3daf72620ced80085b0056e92 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 18 Nov 2024 17:51:46 +0100 Subject: [PATCH 41/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix data types in load_data_into_slm Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index fd1f1be3118..2eac65dc93e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -412,10 +412,10 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, return __wi_for_data_reading; } - template + template static void - load_data_into_slm(_Range&& __rng1, _DataType* __slm1, const std::size_t __idx_global_begin1, const std::size_t __idx_global_end1, - _Range&& __rng2, _DataType* __slm2, const std::size_t __idx_global_begin2, const std::size_t __idx_global_end2, + load_data_into_slm(_Range&& __rng1, _DataType1* __slm1, const std::size_t __idx_global_begin1, const std::size_t __idx_global_end1, + _Range&& __rng2, _DataType2* __slm2, const std::size_t __idx_global_begin2, const std::size_t __idx_global_end2, const std::size_t __wi_in_one_wg, const std::size_t __local_id) { // TODO what size of SLM bank we have now? 
From c72c92d1c5961beb36fe0d1700b5f0a9a3695239 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 18 Nov 2024 18:14:11 +0100 Subject: [PATCH 42/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix comments in include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h Signed-off-by: Sergey Kopienko --- .../hetero/dpcpp/parallel_backend_sycl_merge.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 2eac65dc93e..5980a1082a9 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -345,14 +345,14 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, | rng[2] | + | | | | | | slm[1] | | SLM bank: write into one SLM bank from one work-item | rng[3] | + | | | | | | slm[2] | / | rng[4] | | + | | | | | slm[3] | - | rng[5] | | + | | | | | slm[3] | - | rng[6] | | + | | | | | slm[3] | - | rng[7] | | | + | | | | slm[3] | - | rng[8] | | | + | | | | slm[3] | - | rng[9] | | | + | | | | slm[3] | - | rng[10] | | | | + | | | slm[3] | - | rng[11] | | | | + | | | slm[3] | - | rng[12] | | | | + | | | slm[3] | + | rng[5] | | + | | | | | slm[4] | + | rng[6] | | + | | | | | slm[5] | + | rng[7] | | | + | | | | slm[6] | + | rng[8] | | | + | | | | slm[7] | + | rng[9] | | | + | | | | slm[8] | + | rng[10] | | | | + | | | slm[9] | + | rng[11] | | | | + | | | slm[10] | + | rng[12] | | | | + | | | slm[11] | | ..... | | | | | +++ | | ... 
| | rng[M + 1] | | | | | | + | slm[M] | | rng[M + 2] | | | | | | + | slm[M + 1] | From ba224e05f8a69a9a171d29caf1c3a762cd640bad Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 18 Nov 2024 20:07:01 +0100 Subject: [PATCH 43/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix types in load_data_into_slm Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 5980a1082a9..3a751fbad5b 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -412,10 +412,10 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, return __wi_for_data_reading; } - template + template static void - load_data_into_slm(_Range&& __rng1, _DataType1* __slm1, const std::size_t __idx_global_begin1, const std::size_t __idx_global_end1, - _Range&& __rng2, _DataType2* __slm2, const std::size_t __idx_global_begin2, const std::size_t __idx_global_end2, + load_data_into_slm(_Range1&& __rng1, _DataType1* __slm1, const std::size_t __idx_global_begin1, const std::size_t __idx_global_end1, + _Range2&& __rng2, _DataType2* __slm2, const std::size_t __idx_global_begin2, const std::size_t __idx_global_end2, const std::size_t __wi_in_one_wg, const std::size_t __local_id) { // TODO what size of SLM bank we have now? 
From 8bd40faa35e01cc602509860aca49fb1e755587f Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 18 Nov 2024 20:07:27 +0100 Subject: [PATCH 44/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix an error: not all source data loaded into SLM cache Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 36 ++++++++++--------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 3a751fbad5b..a145a89a590 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -330,8 +330,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, template static void load_data_into_slm_impl(_Range&& __rng, _DataType* __slm, - std::size_t __idx_global_begin, std::size_t __idx_global_end, - std::size_t __wi_in_one_wg, std::size_t __local_id) + std::size_t __idx_global_begin, std::size_t __idx_global_end, + std::size_t __wi_in_one_wg, std::size_t __local_id) { // How we load data: /* @@ -547,29 +547,31 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, _Range1ValueType* __rng1_cache_slm = nullptr; _Range1ValueType* __rng2_cache_slm = nullptr; - // Current diagonal inside of the merge matrix? - const bool __have_data = __global_linear_id * __chunk < __n; + // Split points on left anr right base diagonals + // - in GLOBAL coordinates + const _split_point_t& __sp_base_left_global = __base_diagonals_sp_global_ptr[__group_linear_id]; + const _split_point_t& __sp_base_right_global = __base_diagonals_sp_global_ptr[__group_linear_id + 1]; - // Current diagonal inside of the merge matrix? 
- if (__have_data) - { - // Split points on left anr right base diagonals - // - in GLOBAL coordinates - const _split_point_t& __sp_base_left_global = __base_diagonals_sp_global_ptr[__group_linear_id]; - const _split_point_t& __sp_base_right_global = __base_diagonals_sp_global_ptr[__group_linear_id + 1]; + assert(__sp_base_right_global.first >= __sp_base_left_global.first); + assert(__sp_base_right_global.second >= __sp_base_left_global.second); - assert(__sp_base_right_global.first >= __sp_base_left_global.first); - assert(__sp_base_right_global.second >= __sp_base_left_global.second); + __rng1_wg_data_size = __sp_base_right_global.first - __sp_base_left_global.first; + __rng2_wg_data_size = __sp_base_right_global.second - __sp_base_left_global.second; - __rng1_wg_data_size = __sp_base_right_global.first - __sp_base_left_global.first; - __rng2_wg_data_size = __sp_base_right_global.second - __sp_base_left_global.second; + const bool __need_load_data = __rng1_wg_data_size > 0 || __rng2_wg_data_size > 0; + const bool __need_merge_data = __global_linear_id * __chunk < __n; + if (__need_load_data || __need_merge_data) + { auto [__rng1_loc_acc, __offset_to_slm1] = __merge_slm_helper::template get_local_accessor<0>(__loc_acc_pack); auto [__rng2_loc_acc, __offset_to_slm2] = __merge_slm_helper::template get_local_accessor<1>(__loc_acc_pack, __rng1_wg_data_size); __rng1_cache_slm = std::addressof(__rng1_loc_acc[0]) + __offset_to_slm1; __rng2_cache_slm = std::addressof(__rng2_loc_acc[0]) + __offset_to_slm2; + } - // Cooperative data load from __rng1 to __rng1_cache_slm, from __rng2 to __rng1_cache_slm + // Cooperative data load from __rng1 to __rng1_cache_slm, from __rng2 to __rng1_cache_slm + if (__need_load_data) + { load_data_into_slm(__rng1, __rng1_cache_slm, __sp_base_left_global.first, __sp_base_right_global.first, __rng2, __rng2_cache_slm, __sp_base_left_global.second, __sp_base_right_global.second, __wi_in_one_wg, __local_id); @@ -579,7 +581,7 @@ struct 
__parallel_merge_submitter_large<_IdType, _CustomName, __dpl_sycl::__group_barrier(__nd_item); // Current diagonal inside of the merge matrix? - if (__have_data) + if (__need_merge_data) { // Find split point in LOCAL coordinates // - bottom-right split point describes the size of current area between two base diagonals. From f93fade7409bf6fd5a2659f621206e21013a4ce2 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 18 Nov 2024 20:18:13 +0100 Subject: [PATCH 45/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix review comment Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 23 +++++++++++++++---- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index a145a89a590..2a1816b7c27 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -631,11 +631,24 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy { static_assert(__starting_size_limit_for_large_submitter < std::numeric_limits::max()); - using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name<_CustomName>>; - return __parallel_merge_submitter()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); + if (__n <= std::numeric_limits::max()) + { + using _WiIndex = std::uint16_t; + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name<_CustomName, _WiIndex>>; + return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); + } 
+ else + { + using _WiIndex = std::uint32_t; + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name<_CustomName, _WiIndex>>; + return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); + } } else { From eab6cee3972389eb2291dbe3349cbbb39cb606f1 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 19 Nov 2024 09:42:45 +0100 Subject: [PATCH 46/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix review comment Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 24 +++++-------------- 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 2a1816b7c27..8929f14e6b7 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -631,24 +631,12 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy { static_assert(__starting_size_limit_for_large_submitter < std::numeric_limits::max()); - if (__n <= std::numeric_limits::max()) - { - using _WiIndex = std::uint16_t; - using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); - } - else - { - using _WiIndex = std::uint32_t; - using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name<_CustomName, _WiIndex>>; - return 
__parallel_merge_submitter<_WiIndex, _MergeKernelName>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); - } + using _WiIndex = std::uint32_t; + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name<_CustomName, _WiIndex>>; + return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); } else { From f3f8468a5fd394a98760e640fcde2606103d82e2 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 19 Nov 2024 09:45:51 +0100 Subject: [PATCH 47/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - load __parallel_merge_submitter if we merge different merge types Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 8929f14e6b7..cffe1b87528 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -455,6 +455,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, { using _Range1ValueType = typename std::iterator_traits::value_type; using _Range2ValueType = typename std::iterator_traits::value_type; + static_assert(std::is_same_v<_Range1ValueType, _Range2ValueType>, "In this implementation we can merge only data of the same type"); const _IdType __n1 = __rng1.size(); const _IdType __n2 = __rng2.size(); @@ -626,8 +627,13 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy constexpr std::size_t __starting_size_limit_for_large_submitter = 16 * 1'048'576; // 4 
Mb + using _Range1ValueType = typename std::iterator_traits::value_type; + using _Range2ValueType = typename std::iterator_traits::value_type; + + constexpr bool __same_merge_types = std::is_same_v<_Range1ValueType, _Range2ValueType>; + const std::size_t __n = __rng1.size() + __rng2.size(); - if (false) //if (__n < __starting_size_limit_for_large_submitter) + if (false) //if (__n < __starting_size_limit_for_large_submitter || !__same_merge_types) { static_assert(__starting_size_limit_for_large_submitter < std::numeric_limits::max()); From 4bbeb508412e180d5881f1065a842b829b7105c7 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 19 Nov 2024 09:55:09 +0100 Subject: [PATCH 48/80] remove usage of __merge_slm_helper Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 76 +++---------------- 1 file changed, 10 insertions(+), 66 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index cffe1b87528..b77337e01aa 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -280,53 +280,6 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __internal::__optional_kernel_name<_DiagonalsKernelName...>, __internal::__optional_kernel_name<_MergeKernelName...>> { - // Create local accessors for data cache in SLM: - // - one accessor for the first and for the second ranges if _Range1 and _Range2 has the SAME value types; - // - two accessors for the first and for the second ranges if _Range1 and _Range2 has DIFFERENT value types. 
- struct __merge_slm_helper - { - template - static std::size_t - get_data_size(_Range1&& __rng1, _Range2&& __rng2) - { - using _Range1ValueType = typename std::iterator_traits::value_type; - using _Range2ValueType = typename std::iterator_traits::value_type; - - return sizeof(_Range1ValueType) + sizeof(_Range2ValueType); - } - - template - static constexpr auto - create_local_accessors(sycl::handler& __cgh, _Range1&& __rng1, _Range2&& __rng2, - std::size_t __slm_cached_data_size) - { - using _Range1ValueType = typename std::iterator_traits::value_type; - using _Range2ValueType = typename std::iterator_traits::value_type; - - if constexpr (std::is_same_v<_Range1ValueType, _Range2ValueType>) - return std::tuple<__dpl_sycl::__local_accessor<_Range1ValueType>>( - __dpl_sycl::__local_accessor<_Range1ValueType>(2 * __slm_cached_data_size, __cgh)); - else - return std::tuple<__dpl_sycl::__local_accessor<_Range1ValueType>, - __dpl_sycl::__local_accessor<_Range2ValueType>>( - __dpl_sycl::__local_accessor<_Range1ValueType>(__slm_cached_data_size, __cgh), - __dpl_sycl::__local_accessor<_Range2ValueType>(__slm_cached_data_size, __cgh)); - } - - template - static auto - get_local_accessor(AccessorsTuple& __loc_acc_pack, std::size_t __offset = 0) - { - static_assert(std::tuple_size_v == 1 || std::tuple_size_v == 2); - - if constexpr (std::tuple_size_v == 1) - return std::pair(std::get<0>(__loc_acc_pack), __offset); - - else - return std::pair(std::get(__loc_acc_pack), 0); - } - }; - template static void load_data_into_slm_impl(_Range&& __rng, _DataType* __slm, @@ -457,6 +410,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, using _Range2ValueType = typename std::iterator_traits::value_type; static_assert(std::is_same_v<_Range1ValueType, _Range2ValueType>, "In this implementation we can merge only data of the same type"); + using _RangeValueType = _Range1ValueType; + const _IdType __n1 = __rng1.size(); const _IdType __n2 = __rng2.size(); const _IdType __n = __n1 
+ __n2; @@ -472,8 +427,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // Pessimistically only use half of the memory to take into account memory used by compiled kernel const std::size_t __max_slm_size_adj = std::max((std::size_t)__chunk, - std::min((std::size_t)__n, - oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, __merge_slm_helper::get_data_size(__rng1, __rng2)))); + std::min((std::size_t)__n, oneapi::dpl::__internal::__slm_adjusted_work_group_size( + __exec, 2 * sizeof(_RangeValueType)))); // The amount of data must be a multiple of the chunk size. const std::size_t __max_source_data_items_fit_into_slm = __max_slm_size_adj - __max_slm_size_adj % __chunk; @@ -529,7 +484,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, auto __base_diagonals_sp_global_ptr = __base_diagonals_sp_storage_t::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); const std::size_t __slm_cached_data_size = __wi_in_one_wg * __chunk; - auto __loc_acc_pack = __merge_slm_helper::create_local_accessors(__cgh, __rng1, __rng2, __slm_cached_data_size); + __dpl_sycl::__local_accessor<_RangeValueType> __loc_acc(2 * __slm_cached_data_size, __cgh); // Run nd_range parallel_for to process all the data // - each work-group caching source data in SLM and processing diagonals between two base diagonals; @@ -542,12 +497,6 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const std::size_t __local_id = __nd_item.get_local_id(0); // Merge sub-matrix LOCAL diagonal's index const std::size_t __group_linear_id = __nd_item.get_group_linear_id(); // Merge matrix base diagonal's GLOBAL index - _IdType __rng1_wg_data_size = 0; - _IdType __rng2_wg_data_size = 0; - - _Range1ValueType* __rng1_cache_slm = nullptr; - _Range1ValueType* __rng2_cache_slm = nullptr; - // Split points on left anr right base diagonals // - in GLOBAL coordinates const _split_point_t& __sp_base_left_global = __base_diagonals_sp_global_ptr[__group_linear_id]; @@ 
-556,20 +505,15 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, assert(__sp_base_right_global.first >= __sp_base_left_global.first); assert(__sp_base_right_global.second >= __sp_base_left_global.second); - __rng1_wg_data_size = __sp_base_right_global.first - __sp_base_left_global.first; - __rng2_wg_data_size = __sp_base_right_global.second - __sp_base_left_global.second; + _IdType __rng1_wg_data_size = __sp_base_right_global.first - __sp_base_left_global.first; + _IdType __rng2_wg_data_size = __sp_base_right_global.second - __sp_base_left_global.second; + + _RangeValueType* __rng1_cache_slm = std::addressof(__loc_acc[0]); + _RangeValueType* __rng2_cache_slm = std::addressof(__loc_acc[0]) + __rng1_wg_data_size; const bool __need_load_data = __rng1_wg_data_size > 0 || __rng2_wg_data_size > 0; const bool __need_merge_data = __global_linear_id * __chunk < __n; - if (__need_load_data || __need_merge_data) - { - auto [__rng1_loc_acc, __offset_to_slm1] = __merge_slm_helper::template get_local_accessor<0>(__loc_acc_pack); - auto [__rng2_loc_acc, __offset_to_slm2] = __merge_slm_helper::template get_local_accessor<1>(__loc_acc_pack, __rng1_wg_data_size); - __rng1_cache_slm = std::addressof(__rng1_loc_acc[0]) + __offset_to_slm1; - __rng2_cache_slm = std::addressof(__rng2_loc_acc[0]) + __offset_to_slm2; - } - // Cooperative data load from __rng1 to __rng1_cache_slm, from __rng2 to __rng1_cache_slm if (__need_load_data) { From 0e5c0d2c1f7877836a9b20aca3f86d7da4413c4d Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 19 Nov 2024 09:58:49 +0100 Subject: [PATCH 49/80] load_data_into_slm now working only with the same data types too Signed-off-by: Sergey Kopienko --- .../pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 
b77337e01aa..a5a1b429569 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -365,10 +365,10 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, return __wi_for_data_reading; } - template + template static void - load_data_into_slm(_Range1&& __rng1, _DataType1* __slm1, const std::size_t __idx_global_begin1, const std::size_t __idx_global_end1, - _Range2&& __rng2, _DataType2* __slm2, const std::size_t __idx_global_begin2, const std::size_t __idx_global_end2, + load_data_into_slm(_Range&& __rng1, _DataType* __slm1, const std::size_t __idx_global_begin1, const std::size_t __idx_global_end1, + _Range&& __rng2, _DataType* __slm2, const std::size_t __idx_global_begin2, const std::size_t __idx_global_end2, const std::size_t __wi_in_one_wg, const std::size_t __local_id) { // TODO what size of SLM bank we have now? @@ -376,10 +376,13 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, using _Range1ValueType = typename std::iterator_traits::value_type; using _Range2ValueType = typename std::iterator_traits::value_type; + static_assert(std::is_same_v<_Range1ValueType, _Range2ValueType>, "In this implementation we can merge only data of the same type"); + + using _RangeValueType = _Range1ValueType; // Calculate how many work-items should read the part of __rng1 and __rng2 into SLM cache - const std::size_t __wi_for_data_reading1 = __calc_wi_amount_for_data_reading<__slm_bank_size, _Range1ValueType>(__wi_in_one_wg, __idx_global_end1 - __idx_global_begin1); - const std::size_t __wi_for_data_reading2 = __calc_wi_amount_for_data_reading<__slm_bank_size, _Range2ValueType>(__wi_in_one_wg, __idx_global_end2 - __idx_global_begin2); + const std::size_t __wi_for_data_reading1 = __calc_wi_amount_for_data_reading<__slm_bank_size, _RangeValueType>(__wi_in_one_wg, __idx_global_end1 - __idx_global_begin1); + const std::size_t __wi_for_data_reading2 = 
__calc_wi_amount_for_data_reading<__slm_bank_size, _RangeValueType>(__wi_in_one_wg, __idx_global_end2 - __idx_global_begin2); // Now arrange the reading by work-items if (__wi_in_one_wg >= __wi_for_data_reading1 + __wi_for_data_reading2) From 501e58e4b389747d00517679eadd793d797937c0 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 19 Nov 2024 10:04:15 +0100 Subject: [PATCH 50/80] remove __calc_wi_amount_for_data_reading function and it's usage Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 20 +++---------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index a5a1b429569..f6d2296da9e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -350,21 +350,6 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, } } - template - static std::size_t - __calc_wi_amount_for_data_reading(const std::size_t __wi_in_one_wg, const std::size_t __reading_data) - { - std::size_t __wi_for_data_reading = 0; - if (__reading_data > 0) - { - const std::size_t __required_reading_data_per_wi = oneapi::dpl::__internal::__dpl_ceiling_div(__slm_bank_size, sizeof(_RangeValueType)); - - __wi_for_data_reading = std::min(__wi_in_one_wg, oneapi::dpl::__internal::__dpl_ceiling_div(__reading_data, __required_reading_data_per_wi)); - } - - return __wi_for_data_reading; - } - template static void load_data_into_slm(_Range&& __rng1, _DataType* __slm1, const std::size_t __idx_global_begin1, const std::size_t __idx_global_end1, @@ -381,8 +366,9 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, using _RangeValueType = _Range1ValueType; // Calculate how many work-items should read the part of __rng1 and __rng2 into SLM cache - const std::size_t __wi_for_data_reading1 = 
__calc_wi_amount_for_data_reading<__slm_bank_size, _RangeValueType>(__wi_in_one_wg, __idx_global_end1 - __idx_global_begin1); - const std::size_t __wi_for_data_reading2 = __calc_wi_amount_for_data_reading<__slm_bank_size, _RangeValueType>(__wi_in_one_wg, __idx_global_end2 - __idx_global_begin2); + const std::size_t __required_reading_data_per_wi = oneapi::dpl::__internal::__dpl_ceiling_div(__slm_bank_size, sizeof(_RangeValueType)); + const std::size_t __wi_for_data_reading1 = std::min(__wi_in_one_wg, oneapi::dpl::__internal::__dpl_ceiling_div(__idx_global_end1 - __idx_global_begin1, __required_reading_data_per_wi)); + const std::size_t __wi_for_data_reading2 = std::min(__wi_in_one_wg, oneapi::dpl::__internal::__dpl_ceiling_div(__idx_global_end2 - __idx_global_begin2, __required_reading_data_per_wi)); // Now arrange the reading by work-items if (__wi_in_one_wg >= __wi_for_data_reading1 + __wi_for_data_reading2) From ebae70744024dff2ce3432b83033fdbdb07f45b6 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 19 Nov 2024 10:17:16 +0100 Subject: [PATCH 51/80] modify load_data_into_slm Signed-off-by: Sergey Kopienko --- .../hetero/dpcpp/parallel_backend_sycl_merge.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index f6d2296da9e..06093a9f52f 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -365,26 +365,30 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, using _RangeValueType = _Range1ValueType; + const auto __to_read_rng1 = __idx_global_end1 - __idx_global_begin1; + const auto __to_read_rng2 = __idx_global_end2 - __idx_global_begin2; + // Calculate how many work-items should read the part of __rng1 and __rng2 into SLM cache const std::size_t __required_reading_data_per_wi = 
oneapi::dpl::__internal::__dpl_ceiling_div(__slm_bank_size, sizeof(_RangeValueType)); - const std::size_t __wi_for_data_reading1 = std::min(__wi_in_one_wg, oneapi::dpl::__internal::__dpl_ceiling_div(__idx_global_end1 - __idx_global_begin1, __required_reading_data_per_wi)); - const std::size_t __wi_for_data_reading2 = std::min(__wi_in_one_wg, oneapi::dpl::__internal::__dpl_ceiling_div(__idx_global_end2 - __idx_global_begin2, __required_reading_data_per_wi)); + const std::size_t __wi_for_data_reading_all = std::min(__wi_in_one_wg, oneapi::dpl::__internal::__dpl_ceiling_div(__to_read_rng1 + __to_read_rng2, __required_reading_data_per_wi)); + const std::size_t __wi_for_data_reading1 = std::min(__wi_in_one_wg, oneapi::dpl::__internal::__dpl_ceiling_div(__to_read_rng1, __required_reading_data_per_wi)); + const std::size_t __wi_for_data_reading2 = std::min(__wi_in_one_wg, oneapi::dpl::__internal::__dpl_ceiling_div(__to_read_rng2, __required_reading_data_per_wi)); // Now arrange the reading by work-items - if (__wi_in_one_wg >= __wi_for_data_reading1 + __wi_for_data_reading2) + if (__wi_in_one_wg >= __wi_for_data_reading_all) { if (__local_id < __wi_for_data_reading1) { load_data_into_slm_impl(__rng1, __slm1, __idx_global_begin1, __idx_global_end1, __wi_for_data_reading1, __local_id); } - else if (__local_id < __wi_for_data_reading1 + __wi_for_data_reading2) + else if (__local_id < __wi_for_data_reading_all) { // When we reading data from parallel-working work-items, we should reduce the local id of current work-item // because we calculate reeded data size based on this value. 
load_data_into_slm_impl(__rng2, __slm2, __idx_global_begin2, __idx_global_end2, __wi_for_data_reading2, __local_id - __wi_for_data_reading1); } } - else if (__local_id < __wi_for_data_reading1 + __wi_for_data_reading2) + else if (__local_id < __wi_for_data_reading_all) { load_data_into_slm_impl(__rng1, __slm1, __idx_global_begin1, __idx_global_end1, __wi_for_data_reading1, __local_id); load_data_into_slm_impl(__rng2, __slm2, __idx_global_begin2, __idx_global_end2, __wi_for_data_reading2, __local_id); From 6459dac76b68573737289436bff3d22aada2c748 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 19 Nov 2024 12:03:24 +0100 Subject: [PATCH 52/80] @@@ Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 217 ++++++------------ 1 file changed, 72 insertions(+), 145 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 06093a9f52f..0c56f954583 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -39,6 +39,13 @@ using _split_point_t = std::pair<_Index, _Index>; template constexpr _split_point_t<_Index> __zero_split_point{0, 0}; +template +inline _Index __get_index_sum(_Index __idx) +{ + assert(__idx > 0); + return __idx - 1; +} + //Searching for an intersection of a merge matrix (n1, n2) diagonal with the Merge Path to define sub-ranges //to serial merge. 
For example, a merge matrix for [0,1,1,2,3] and [0,0,2,3] is shown below: // 0 1 1 2 3 @@ -121,7 +128,7 @@ __find_start_point(const _Rng1& __rng1, const _Rng2& __rng2, const _Index __i_el { //////////////////////////////////////////////////////////////////////////////////// // Taking into account the specified constraints of the range of processed data - const auto __index_sum = __i_elem - 1; + const auto __index_sum = __get_index_sum(__i_elem); using _IndexSigned = std::make_signed_t<_Index>; @@ -280,125 +287,13 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __internal::__optional_kernel_name<_DiagonalsKernelName...>, __internal::__optional_kernel_name<_MergeKernelName...>> { - template - static void - load_data_into_slm_impl(_Range&& __rng, _DataType* __slm, - std::size_t __idx_global_begin, std::size_t __idx_global_end, - std::size_t __wi_in_one_wg, std::size_t __local_id) - { - // How we load data: - /* - +-------------------+--------------------------------------------------------------+------------+ - | Source data index | Work-items in one work-group | SLM index | - +-------------------+--------------------------------------------------------------+------------+ - | | wi(0) | wi(1) | wi(2) | wi(3) | ... | wi(__wi_in_one_wg - 1) | | <--- __local_id: in which work-item we are - +-------------------+-------+-------+-------+-------+-----+------------------------+------------+ - | rng[0] | | | | | | | | - | rng[1] | + | | | | | | slm[0] | <--- __idx_global_begin \ - | rng[2] | + | | | | | | slm[1] | | SLM bank: write into one SLM bank from one work-item - | rng[3] | + | | | | | | slm[2] | / - | rng[4] | | + | | | | | slm[3] | - | rng[5] | | + | | | | | slm[4] | - | rng[6] | | + | | | | | slm[5] | - | rng[7] | | | + | | | | slm[6] | - | rng[8] | | | + | | | | slm[7] | - | rng[9] | | | + | | | | slm[8] | - | rng[10] | | | | + | | | slm[9] | - | rng[11] | | | | + | | | slm[10] | - | rng[12] | | | | + | | | slm[11] | - | ..... | | | | | +++ | | ... 
| - | rng[M + 1] | | | | | | + | slm[M] | - | rng[M + 2] | | | | | | + | slm[M + 1] | - | rng[M + 3] | | | | | | + | slm[M + 2] | - | rng[M + 4] | | | | | | | slm[M + 3] | - | rng[M + 5] | | | | - | | | | <--- __idx_global_end - | ..... | | | | | --- | | | - | rng[M + M + 1] | | | | | | - | | - +-------------------+--------------------------------------------------------------+------------+ - ^ - | - __local_id - - "+" - load one source data item ito SLM - */ - - const std::size_t __wg_data_size_rng = __idx_global_end - __idx_global_begin; - if (__wg_data_size_rng > 0) - { - // Calculate the size of the current part of merging data per work-item - const std::size_t __loading_data_per_wi = oneapi::dpl::__internal::__dpl_ceiling_div(__rng.size(), __wi_in_one_wg); - assert(__loading_data_per_wi > 0); - - if (__loading_data_per_wi > 1) - { - const auto __slm_idx_begin = __local_id * __loading_data_per_wi; - const auto __slm_idx_end = __slm_idx_begin + __loading_data_per_wi; - - std::size_t __slm_idx = __slm_idx_begin; - std::size_t __rng_idx = __idx_global_begin + __slm_idx; - - _ONEDPL_PRAGMA_UNROLL - for (; __slm_idx < __slm_idx_end && __rng_idx < __idx_global_end; ++__slm_idx, ++__rng_idx) - __slm[__slm_idx] = __rng[__rng_idx]; - } - else - { - const std::size_t __rng_idx = __idx_global_begin + __local_id; - if (__rng_idx < __idx_global_end) - __slm[__local_id] = __rng[__rng_idx]; - } - } - } - - template - static void - load_data_into_slm(_Range&& __rng1, _DataType* __slm1, const std::size_t __idx_global_begin1, const std::size_t __idx_global_end1, - _Range&& __rng2, _DataType* __slm2, const std::size_t __idx_global_begin2, const std::size_t __idx_global_end2, - const std::size_t __wi_in_one_wg, const std::size_t __local_id) - { - // TODO what size of SLM bank we have now? 
- constexpr std::size_t __slm_bank_size = 64; // = 1024; - - using _Range1ValueType = typename std::iterator_traits::value_type; - using _Range2ValueType = typename std::iterator_traits::value_type; - static_assert(std::is_same_v<_Range1ValueType, _Range2ValueType>, "In this implementation we can merge only data of the same type"); - - using _RangeValueType = _Range1ValueType; - - const auto __to_read_rng1 = __idx_global_end1 - __idx_global_begin1; - const auto __to_read_rng2 = __idx_global_end2 - __idx_global_begin2; - - // Calculate how many work-items should read the part of __rng1 and __rng2 into SLM cache - const std::size_t __required_reading_data_per_wi = oneapi::dpl::__internal::__dpl_ceiling_div(__slm_bank_size, sizeof(_RangeValueType)); - const std::size_t __wi_for_data_reading_all = std::min(__wi_in_one_wg, oneapi::dpl::__internal::__dpl_ceiling_div(__to_read_rng1 + __to_read_rng2, __required_reading_data_per_wi)); - const std::size_t __wi_for_data_reading1 = std::min(__wi_in_one_wg, oneapi::dpl::__internal::__dpl_ceiling_div(__to_read_rng1, __required_reading_data_per_wi)); - const std::size_t __wi_for_data_reading2 = std::min(__wi_in_one_wg, oneapi::dpl::__internal::__dpl_ceiling_div(__to_read_rng2, __required_reading_data_per_wi)); - - // Now arrange the reading by work-items - if (__wi_in_one_wg >= __wi_for_data_reading_all) - { - if (__local_id < __wi_for_data_reading1) - { - load_data_into_slm_impl(__rng1, __slm1, __idx_global_begin1, __idx_global_end1, __wi_for_data_reading1, __local_id); - } - else if (__local_id < __wi_for_data_reading_all) - { - // When we reading data from parallel-working work-items, we should reduce the local id of current work-item - // because we calculate reeded data size based on this value. 
- load_data_into_slm_impl(__rng2, __slm2, __idx_global_begin2, __idx_global_end2, __wi_for_data_reading2, __local_id - __wi_for_data_reading1); - } - } - else if (__local_id < __wi_for_data_reading_all) - { - load_data_into_slm_impl(__rng1, __slm1, __idx_global_begin1, __idx_global_end1, __wi_for_data_reading1, __local_id); - load_data_into_slm_impl(__rng2, __slm2, __idx_global_begin2, __idx_global_end2, __wi_for_data_reading2, __local_id); - } - } - template auto operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const { + // TODO what size of SLM bank we have now? + constexpr std::size_t __slm_bank_size = 64; // = 1024; + using _Range1ValueType = typename std::iterator_traits::value_type; using _Range2ValueType = typename std::iterator_traits::value_type; static_assert(std::is_same_v<_Range1ValueType, _Range2ValueType>, "In this implementation we can merge only data of the same type"); @@ -413,10 +308,15 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, _PRINT_INFO_IN_DEBUG_MODE(__exec); + const bool __b_check = __n1 == 521 && __n2 == 260; + // Empirical number of values to process per work-item const _IdType __chunk = __exec.queue().get_device().is_cpu() ? 128 : 8; assert(__chunk > 0); + // The only bank conflicts you need to worry about is in SLM, so I think if your chunk_size * element_size > bank size, then this should be ok. 
+ assert(__chunk * sizeof(_RangeValueType) >= __slm_bank_size); + // Pessimistically only use half of the memory to take into account memory used by compiled kernel const std::size_t __max_slm_size_adj = std::max((std::size_t)__chunk, @@ -490,42 +390,69 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const std::size_t __local_id = __nd_item.get_local_id(0); // Merge sub-matrix LOCAL diagonal's index const std::size_t __group_linear_id = __nd_item.get_group_linear_id(); // Merge matrix base diagonal's GLOBAL index - // Split points on left anr right base diagonals - // - in GLOBAL coordinates - const _split_point_t& __sp_base_left_global = __base_diagonals_sp_global_ptr[__group_linear_id]; - const _split_point_t& __sp_base_right_global = __base_diagonals_sp_global_ptr[__group_linear_id + 1]; - - assert(__sp_base_right_global.first >= __sp_base_left_global.first); - assert(__sp_base_right_global.second >= __sp_base_left_global.second); - - _IdType __rng1_wg_data_size = __sp_base_right_global.first - __sp_base_left_global.first; - _IdType __rng2_wg_data_size = __sp_base_right_global.second - __sp_base_left_global.second; - - _RangeValueType* __rng1_cache_slm = std::addressof(__loc_acc[0]); - _RangeValueType* __rng2_cache_slm = std::addressof(__loc_acc[0]) + __rng1_wg_data_size; - - const bool __need_load_data = __rng1_wg_data_size > 0 || __rng2_wg_data_size > 0; - const bool __need_merge_data = __global_linear_id * __chunk < __n; - - // Cooperative data load from __rng1 to __rng1_cache_slm, from __rng2 to __rng1_cache_slm - if (__need_load_data) - { - load_data_into_slm(__rng1, __rng1_cache_slm, __sp_base_left_global.first, __sp_base_right_global.first, - __rng2, __rng2_cache_slm, __sp_base_left_global.second, __sp_base_right_global.second, - __wi_in_one_wg, __local_id); - } - - // Wait until all the data is loaded - __dpl_sycl::__group_barrier(__nd_item); - // Current diagonal inside of the merge matrix? 
+ const bool __need_merge_data = __global_linear_id * __chunk < __n; if (__need_merge_data) { + // Split points on left anr right base diagonals + // - in GLOBAL coordinates + const _split_point_t& __sp_base_left_global = __base_diagonals_sp_global_ptr[__group_linear_id]; + const _split_point_t& __sp_base_right_global = __base_diagonals_sp_global_ptr[__group_linear_id + 1]; + + assert(__sp_base_right_global.first >= __sp_base_left_global.first); + assert(__sp_base_right_global.second >= __sp_base_left_global.second); + + const _IdType __rng1_wg_data_size = __sp_base_right_global.first - __sp_base_left_global.first; + const _IdType __rng2_wg_data_size = __sp_base_right_global.second - __sp_base_left_global.second; + + _RangeValueType* __rng1_cache_slm = std::addressof(__loc_acc[0]); + _RangeValueType* __rng2_cache_slm = std::addressof(__loc_acc[0]) + __rng1_wg_data_size; + + const bool __need_load_data = __rng1_wg_data_size > 0 || __rng2_wg_data_size > 0; + assert(__need_load_data); + + // Calculate diagonal index + // - in LOCAL coordinates because __rng1_cache_slm and __rng1_cache_slm is work-group SLM cached copy of source data + const _IdType __i_elem = __local_id * __chunk; + if (__i_elem > 0) + { + const auto __index_sum = __get_index_sum(__i_elem); + //assert(__index_sum >= __chunk); + + for (_IdType __idx = __index_sum - __chunk; __idx < __index_sum && __sp_base_left_global.first + __idx < __sp_base_right_global.first; ++__idx) + __rng1_cache_slm[__idx] = __rng1[__sp_base_left_global.first + __idx]; + + for (_IdType __idx = __index_sum - __chunk; __idx < __index_sum && __sp_base_left_global.second + __idx < __sp_base_right_global.second; ++__idx) + __rng2_cache_slm[__idx] = __rng2[__sp_base_left_global.second + __idx]; + } + else + { + assert(__i_elem == 0); + + if (__wi_in_one_wg > 1) + { + if (__rng1_wg_data_size > 0) + __rng1_cache_slm[0] = __rng1[__sp_base_left_global.first]; + + if (__rng2_wg_data_size > 0) + __rng2_cache_slm[0] = 
__rng2[__sp_base_left_global.second]; + } + else + { + assert(__wi_in_one_wg == 1); + for (_IdType __idx = 0; __idx < __rng1_wg_data_size; ++__idx) + __rng1_cache_slm[__idx] = __rng1[__sp_base_left_global.first + __idx]; + + for (_IdType __idx = 0; __idx < __rng2_wg_data_size; ++__idx) + __rng2_cache_slm[__idx] = __rng2[__sp_base_left_global.second + __idx]; + } + } + // Find split point in LOCAL coordinates // - bottom-right split point describes the size of current area between two base diagonals. const _split_point_t<_IdType> __sp_local = __find_start_point( __rng1_cache_slm, __rng2_cache_slm, // SLM cached copy of merging data - (_IdType)(__local_id * __chunk), // __i_elem in LOCAL coordinates because __rng1_cache_slm and __rng1_cache_slm is work-group SLM cached copy of source data + __i_elem, // __i_elem in LOCAL coordinates because __rng1_cache_slm and __rng1_cache_slm is work-group SLM cached copy of source data __rng1_wg_data_size, __rng2_wg_data_size, // size of rng1 and rng2 __comp); From e615d65fbac26cdcb6b33d23bc0305c4bfd4e0eb Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 19 Nov 2024 12:06:55 +0100 Subject: [PATCH 53/80] Revert "@@@" This reverts commit 6459dac76b68573737289436bff3d22aada2c748. 
--- .../dpcpp/parallel_backend_sycl_merge.h | 217 ++++++++++++------ 1 file changed, 145 insertions(+), 72 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 0c56f954583..06093a9f52f 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -39,13 +39,6 @@ using _split_point_t = std::pair<_Index, _Index>; template constexpr _split_point_t<_Index> __zero_split_point{0, 0}; -template -inline _Index __get_index_sum(_Index __idx) -{ - assert(__idx > 0); - return __idx - 1; -} - //Searching for an intersection of a merge matrix (n1, n2) diagonal with the Merge Path to define sub-ranges //to serial merge. For example, a merge matrix for [0,1,1,2,3] and [0,0,2,3] is shown below: // 0 1 1 2 3 @@ -128,7 +121,7 @@ __find_start_point(const _Rng1& __rng1, const _Rng2& __rng2, const _Index __i_el { //////////////////////////////////////////////////////////////////////////////////// // Taking into account the specified constraints of the range of processed data - const auto __index_sum = __get_index_sum(__i_elem); + const auto __index_sum = __i_elem - 1; using _IndexSigned = std::make_signed_t<_Index>; @@ -287,13 +280,125 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __internal::__optional_kernel_name<_DiagonalsKernelName...>, __internal::__optional_kernel_name<_MergeKernelName...>> { + template + static void + load_data_into_slm_impl(_Range&& __rng, _DataType* __slm, + std::size_t __idx_global_begin, std::size_t __idx_global_end, + std::size_t __wi_in_one_wg, std::size_t __local_id) + { + // How we load data: + /* + +-------------------+--------------------------------------------------------------+------------+ + | Source data index | Work-items in one work-group | SLM index | + 
+-------------------+--------------------------------------------------------------+------------+ + | | wi(0) | wi(1) | wi(2) | wi(3) | ... | wi(__wi_in_one_wg - 1) | | <--- __local_id: in which work-item we are + +-------------------+-------+-------+-------+-------+-----+------------------------+------------+ + | rng[0] | | | | | | | | + | rng[1] | + | | | | | | slm[0] | <--- __idx_global_begin \ + | rng[2] | + | | | | | | slm[1] | | SLM bank: write into one SLM bank from one work-item + | rng[3] | + | | | | | | slm[2] | / + | rng[4] | | + | | | | | slm[3] | + | rng[5] | | + | | | | | slm[4] | + | rng[6] | | + | | | | | slm[5] | + | rng[7] | | | + | | | | slm[6] | + | rng[8] | | | + | | | | slm[7] | + | rng[9] | | | + | | | | slm[8] | + | rng[10] | | | | + | | | slm[9] | + | rng[11] | | | | + | | | slm[10] | + | rng[12] | | | | + | | | slm[11] | + | ..... | | | | | +++ | | ... | + | rng[M + 1] | | | | | | + | slm[M] | + | rng[M + 2] | | | | | | + | slm[M + 1] | + | rng[M + 3] | | | | | | + | slm[M + 2] | + | rng[M + 4] | | | | | | | slm[M + 3] | + | rng[M + 5] | | | | - | | | | <--- __idx_global_end + | ..... 
| | | | | --- | | | + | rng[M + M + 1] | | | | | | - | | + +-------------------+--------------------------------------------------------------+------------+ + ^ + | + __local_id + + "+" - load one source data item ito SLM + */ + + const std::size_t __wg_data_size_rng = __idx_global_end - __idx_global_begin; + if (__wg_data_size_rng > 0) + { + // Calculate the size of the current part of merging data per work-item + const std::size_t __loading_data_per_wi = oneapi::dpl::__internal::__dpl_ceiling_div(__rng.size(), __wi_in_one_wg); + assert(__loading_data_per_wi > 0); + + if (__loading_data_per_wi > 1) + { + const auto __slm_idx_begin = __local_id * __loading_data_per_wi; + const auto __slm_idx_end = __slm_idx_begin + __loading_data_per_wi; + + std::size_t __slm_idx = __slm_idx_begin; + std::size_t __rng_idx = __idx_global_begin + __slm_idx; + + _ONEDPL_PRAGMA_UNROLL + for (; __slm_idx < __slm_idx_end && __rng_idx < __idx_global_end; ++__slm_idx, ++__rng_idx) + __slm[__slm_idx] = __rng[__rng_idx]; + } + else + { + const std::size_t __rng_idx = __idx_global_begin + __local_id; + if (__rng_idx < __idx_global_end) + __slm[__local_id] = __rng[__rng_idx]; + } + } + } + + template + static void + load_data_into_slm(_Range&& __rng1, _DataType* __slm1, const std::size_t __idx_global_begin1, const std::size_t __idx_global_end1, + _Range&& __rng2, _DataType* __slm2, const std::size_t __idx_global_begin2, const std::size_t __idx_global_end2, + const std::size_t __wi_in_one_wg, const std::size_t __local_id) + { + // TODO what size of SLM bank we have now? 
+ constexpr std::size_t __slm_bank_size = 64; // = 1024; + + using _Range1ValueType = typename std::iterator_traits::value_type; + using _Range2ValueType = typename std::iterator_traits::value_type; + static_assert(std::is_same_v<_Range1ValueType, _Range2ValueType>, "In this implementation we can merge only data of the same type"); + + using _RangeValueType = _Range1ValueType; + + const auto __to_read_rng1 = __idx_global_end1 - __idx_global_begin1; + const auto __to_read_rng2 = __idx_global_end2 - __idx_global_begin2; + + // Calculate how many work-items should read the part of __rng1 and __rng2 into SLM cache + const std::size_t __required_reading_data_per_wi = oneapi::dpl::__internal::__dpl_ceiling_div(__slm_bank_size, sizeof(_RangeValueType)); + const std::size_t __wi_for_data_reading_all = std::min(__wi_in_one_wg, oneapi::dpl::__internal::__dpl_ceiling_div(__to_read_rng1 + __to_read_rng2, __required_reading_data_per_wi)); + const std::size_t __wi_for_data_reading1 = std::min(__wi_in_one_wg, oneapi::dpl::__internal::__dpl_ceiling_div(__to_read_rng1, __required_reading_data_per_wi)); + const std::size_t __wi_for_data_reading2 = std::min(__wi_in_one_wg, oneapi::dpl::__internal::__dpl_ceiling_div(__to_read_rng2, __required_reading_data_per_wi)); + + // Now arrange the reading by work-items + if (__wi_in_one_wg >= __wi_for_data_reading_all) + { + if (__local_id < __wi_for_data_reading1) + { + load_data_into_slm_impl(__rng1, __slm1, __idx_global_begin1, __idx_global_end1, __wi_for_data_reading1, __local_id); + } + else if (__local_id < __wi_for_data_reading_all) + { + // When we reading data from parallel-working work-items, we should reduce the local id of current work-item + // because we calculate reeded data size based on this value. 
+ load_data_into_slm_impl(__rng2, __slm2, __idx_global_begin2, __idx_global_end2, __wi_for_data_reading2, __local_id - __wi_for_data_reading1); + } + } + else if (__local_id < __wi_for_data_reading_all) + { + load_data_into_slm_impl(__rng1, __slm1, __idx_global_begin1, __idx_global_end1, __wi_for_data_reading1, __local_id); + load_data_into_slm_impl(__rng2, __slm2, __idx_global_begin2, __idx_global_end2, __wi_for_data_reading2, __local_id); + } + } + template auto operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const { - // TODO what size of SLM bank we have now? - constexpr std::size_t __slm_bank_size = 64; // = 1024; - using _Range1ValueType = typename std::iterator_traits::value_type; using _Range2ValueType = typename std::iterator_traits::value_type; static_assert(std::is_same_v<_Range1ValueType, _Range2ValueType>, "In this implementation we can merge only data of the same type"); @@ -308,15 +413,10 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, _PRINT_INFO_IN_DEBUG_MODE(__exec); - const bool __b_check = __n1 == 521 && __n2 == 260; - // Empirical number of values to process per work-item const _IdType __chunk = __exec.queue().get_device().is_cpu() ? 128 : 8; assert(__chunk > 0); - // The only bank conflicts you need to worry about is in SLM, so I think if your chunk_size * element_size > bank size, then this should be ok. 
- assert(__chunk * sizeof(_RangeValueType) >= __slm_bank_size); - // Pessimistically only use half of the memory to take into account memory used by compiled kernel const std::size_t __max_slm_size_adj = std::max((std::size_t)__chunk, @@ -390,69 +490,42 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const std::size_t __local_id = __nd_item.get_local_id(0); // Merge sub-matrix LOCAL diagonal's index const std::size_t __group_linear_id = __nd_item.get_group_linear_id(); // Merge matrix base diagonal's GLOBAL index - // Current diagonal inside of the merge matrix? + // Split points on left anr right base diagonals + // - in GLOBAL coordinates + const _split_point_t& __sp_base_left_global = __base_diagonals_sp_global_ptr[__group_linear_id]; + const _split_point_t& __sp_base_right_global = __base_diagonals_sp_global_ptr[__group_linear_id + 1]; + + assert(__sp_base_right_global.first >= __sp_base_left_global.first); + assert(__sp_base_right_global.second >= __sp_base_left_global.second); + + _IdType __rng1_wg_data_size = __sp_base_right_global.first - __sp_base_left_global.first; + _IdType __rng2_wg_data_size = __sp_base_right_global.second - __sp_base_left_global.second; + + _RangeValueType* __rng1_cache_slm = std::addressof(__loc_acc[0]); + _RangeValueType* __rng2_cache_slm = std::addressof(__loc_acc[0]) + __rng1_wg_data_size; + + const bool __need_load_data = __rng1_wg_data_size > 0 || __rng2_wg_data_size > 0; const bool __need_merge_data = __global_linear_id * __chunk < __n; - if (__need_merge_data) + + // Cooperative data load from __rng1 to __rng1_cache_slm, from __rng2 to __rng1_cache_slm + if (__need_load_data) { - // Split points on left anr right base diagonals - // - in GLOBAL coordinates - const _split_point_t& __sp_base_left_global = __base_diagonals_sp_global_ptr[__group_linear_id]; - const _split_point_t& __sp_base_right_global = __base_diagonals_sp_global_ptr[__group_linear_id + 1]; - - assert(__sp_base_right_global.first >= 
__sp_base_left_global.first); - assert(__sp_base_right_global.second >= __sp_base_left_global.second); - - const _IdType __rng1_wg_data_size = __sp_base_right_global.first - __sp_base_left_global.first; - const _IdType __rng2_wg_data_size = __sp_base_right_global.second - __sp_base_left_global.second; - - _RangeValueType* __rng1_cache_slm = std::addressof(__loc_acc[0]); - _RangeValueType* __rng2_cache_slm = std::addressof(__loc_acc[0]) + __rng1_wg_data_size; - - const bool __need_load_data = __rng1_wg_data_size > 0 || __rng2_wg_data_size > 0; - assert(__need_load_data); - - // Calculate diagonal index - // - in LOCAL coordinates because __rng1_cache_slm and __rng1_cache_slm is work-group SLM cached copy of source data - const _IdType __i_elem = __local_id * __chunk; - if (__i_elem > 0) - { - const auto __index_sum = __get_index_sum(__i_elem); - //assert(__index_sum >= __chunk); - - for (_IdType __idx = __index_sum - __chunk; __idx < __index_sum && __sp_base_left_global.first + __idx < __sp_base_right_global.first; ++__idx) - __rng1_cache_slm[__idx] = __rng1[__sp_base_left_global.first + __idx]; - - for (_IdType __idx = __index_sum - __chunk; __idx < __index_sum && __sp_base_left_global.second + __idx < __sp_base_right_global.second; ++__idx) - __rng2_cache_slm[__idx] = __rng2[__sp_base_left_global.second + __idx]; - } - else - { - assert(__i_elem == 0); - - if (__wi_in_one_wg > 1) - { - if (__rng1_wg_data_size > 0) - __rng1_cache_slm[0] = __rng1[__sp_base_left_global.first]; - - if (__rng2_wg_data_size > 0) - __rng2_cache_slm[0] = __rng2[__sp_base_left_global.second]; - } - else - { - assert(__wi_in_one_wg == 1); - for (_IdType __idx = 0; __idx < __rng1_wg_data_size; ++__idx) - __rng1_cache_slm[__idx] = __rng1[__sp_base_left_global.first + __idx]; - - for (_IdType __idx = 0; __idx < __rng2_wg_data_size; ++__idx) - __rng2_cache_slm[__idx] = __rng2[__sp_base_left_global.second + __idx]; - } - } + load_data_into_slm(__rng1, __rng1_cache_slm, 
__sp_base_left_global.first, __sp_base_right_global.first, + __rng2, __rng2_cache_slm, __sp_base_left_global.second, __sp_base_right_global.second, + __wi_in_one_wg, __local_id); + } + + // Wait until all the data is loaded + __dpl_sycl::__group_barrier(__nd_item); + // Current diagonal inside of the merge matrix? + if (__need_merge_data) + { // Find split point in LOCAL coordinates // - bottom-right split point describes the size of current area between two base diagonals. const _split_point_t<_IdType> __sp_local = __find_start_point( __rng1_cache_slm, __rng2_cache_slm, // SLM cached copy of merging data - __i_elem, // __i_elem in LOCAL coordinates because __rng1_cache_slm and __rng1_cache_slm is work-group SLM cached copy of source data + (_IdType)(__local_id * __chunk), // __i_elem in LOCAL coordinates because __rng1_cache_slm and __rng1_cache_slm is work-group SLM cached copy of source data __rng1_wg_data_size, __rng2_wg_data_size, // size of rng1 and rng2 __comp); From 38ad9f299ff0537c622faf6e2c4528cd1cdecd6a Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 19 Nov 2024 12:16:03 +0100 Subject: [PATCH 54/80] remove load_data_into_slm_impl and etc. 
Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 139 ++---------------- 1 file changed, 12 insertions(+), 127 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 06093a9f52f..efccf26e581 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -280,121 +280,6 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __internal::__optional_kernel_name<_DiagonalsKernelName...>, __internal::__optional_kernel_name<_MergeKernelName...>> { - template - static void - load_data_into_slm_impl(_Range&& __rng, _DataType* __slm, - std::size_t __idx_global_begin, std::size_t __idx_global_end, - std::size_t __wi_in_one_wg, std::size_t __local_id) - { - // How we load data: - /* - +-------------------+--------------------------------------------------------------+------------+ - | Source data index | Work-items in one work-group | SLM index | - +-------------------+--------------------------------------------------------------+------------+ - | | wi(0) | wi(1) | wi(2) | wi(3) | ... | wi(__wi_in_one_wg - 1) | | <--- __local_id: in which work-item we are - +-------------------+-------+-------+-------+-------+-----+------------------------+------------+ - | rng[0] | | | | | | | | - | rng[1] | + | | | | | | slm[0] | <--- __idx_global_begin \ - | rng[2] | + | | | | | | slm[1] | | SLM bank: write into one SLM bank from one work-item - | rng[3] | + | | | | | | slm[2] | / - | rng[4] | | + | | | | | slm[3] | - | rng[5] | | + | | | | | slm[4] | - | rng[6] | | + | | | | | slm[5] | - | rng[7] | | | + | | | | slm[6] | - | rng[8] | | | + | | | | slm[7] | - | rng[9] | | | + | | | | slm[8] | - | rng[10] | | | | + | | | slm[9] | - | rng[11] | | | | + | | | slm[10] | - | rng[12] | | | | + | | | slm[11] | - | ..... | | | | | +++ | | ... 
| - | rng[M + 1] | | | | | | + | slm[M] | - | rng[M + 2] | | | | | | + | slm[M + 1] | - | rng[M + 3] | | | | | | + | slm[M + 2] | - | rng[M + 4] | | | | | | | slm[M + 3] | - | rng[M + 5] | | | | - | | | | <--- __idx_global_end - | ..... | | | | | --- | | | - | rng[M + M + 1] | | | | | | - | | - +-------------------+--------------------------------------------------------------+------------+ - ^ - | - __local_id - - "+" - load one source data item ito SLM - */ - - const std::size_t __wg_data_size_rng = __idx_global_end - __idx_global_begin; - if (__wg_data_size_rng > 0) - { - // Calculate the size of the current part of merging data per work-item - const std::size_t __loading_data_per_wi = oneapi::dpl::__internal::__dpl_ceiling_div(__rng.size(), __wi_in_one_wg); - assert(__loading_data_per_wi > 0); - - if (__loading_data_per_wi > 1) - { - const auto __slm_idx_begin = __local_id * __loading_data_per_wi; - const auto __slm_idx_end = __slm_idx_begin + __loading_data_per_wi; - - std::size_t __slm_idx = __slm_idx_begin; - std::size_t __rng_idx = __idx_global_begin + __slm_idx; - - _ONEDPL_PRAGMA_UNROLL - for (; __slm_idx < __slm_idx_end && __rng_idx < __idx_global_end; ++__slm_idx, ++__rng_idx) - __slm[__slm_idx] = __rng[__rng_idx]; - } - else - { - const std::size_t __rng_idx = __idx_global_begin + __local_id; - if (__rng_idx < __idx_global_end) - __slm[__local_id] = __rng[__rng_idx]; - } - } - } - - template - static void - load_data_into_slm(_Range&& __rng1, _DataType* __slm1, const std::size_t __idx_global_begin1, const std::size_t __idx_global_end1, - _Range&& __rng2, _DataType* __slm2, const std::size_t __idx_global_begin2, const std::size_t __idx_global_end2, - const std::size_t __wi_in_one_wg, const std::size_t __local_id) - { - // TODO what size of SLM bank we have now? 
- constexpr std::size_t __slm_bank_size = 64; // = 1024; - - using _Range1ValueType = typename std::iterator_traits::value_type; - using _Range2ValueType = typename std::iterator_traits::value_type; - static_assert(std::is_same_v<_Range1ValueType, _Range2ValueType>, "In this implementation we can merge only data of the same type"); - - using _RangeValueType = _Range1ValueType; - - const auto __to_read_rng1 = __idx_global_end1 - __idx_global_begin1; - const auto __to_read_rng2 = __idx_global_end2 - __idx_global_begin2; - - // Calculate how many work-items should read the part of __rng1 and __rng2 into SLM cache - const std::size_t __required_reading_data_per_wi = oneapi::dpl::__internal::__dpl_ceiling_div(__slm_bank_size, sizeof(_RangeValueType)); - const std::size_t __wi_for_data_reading_all = std::min(__wi_in_one_wg, oneapi::dpl::__internal::__dpl_ceiling_div(__to_read_rng1 + __to_read_rng2, __required_reading_data_per_wi)); - const std::size_t __wi_for_data_reading1 = std::min(__wi_in_one_wg, oneapi::dpl::__internal::__dpl_ceiling_div(__to_read_rng1, __required_reading_data_per_wi)); - const std::size_t __wi_for_data_reading2 = std::min(__wi_in_one_wg, oneapi::dpl::__internal::__dpl_ceiling_div(__to_read_rng2, __required_reading_data_per_wi)); - - // Now arrange the reading by work-items - if (__wi_in_one_wg >= __wi_for_data_reading_all) - { - if (__local_id < __wi_for_data_reading1) - { - load_data_into_slm_impl(__rng1, __slm1, __idx_global_begin1, __idx_global_end1, __wi_for_data_reading1, __local_id); - } - else if (__local_id < __wi_for_data_reading_all) - { - // When we reading data from parallel-working work-items, we should reduce the local id of current work-item - // because we calculate reeded data size based on this value. 
- load_data_into_slm_impl(__rng2, __slm2, __idx_global_begin2, __idx_global_end2, __wi_for_data_reading2, __local_id - __wi_for_data_reading1); - } - } - else if (__local_id < __wi_for_data_reading_all) - { - load_data_into_slm_impl(__rng1, __slm1, __idx_global_begin1, __idx_global_end1, __wi_for_data_reading1, __local_id); - load_data_into_slm_impl(__rng2, __slm2, __idx_global_begin2, __idx_global_end2, __wi_for_data_reading2, __local_id); - } - } - template auto operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const @@ -498,34 +383,34 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, assert(__sp_base_right_global.first >= __sp_base_left_global.first); assert(__sp_base_right_global.second >= __sp_base_left_global.second); - _IdType __rng1_wg_data_size = __sp_base_right_global.first - __sp_base_left_global.first; - _IdType __rng2_wg_data_size = __sp_base_right_global.second - __sp_base_left_global.second; + const _IdType __rng1_wg_data_size = __sp_base_right_global.first - __sp_base_left_global.first; + const _IdType __rng2_wg_data_size = __sp_base_right_global.second - __sp_base_left_global.second; _RangeValueType* __rng1_cache_slm = std::addressof(__loc_acc[0]); _RangeValueType* __rng2_cache_slm = std::addressof(__loc_acc[0]) + __rng1_wg_data_size; - const bool __need_load_data = __rng1_wg_data_size > 0 || __rng2_wg_data_size > 0; - const bool __need_merge_data = __global_linear_id * __chunk < __n; + // Calculate __i_elem in LOCAL coordinates because __rng1_cache_slm and __rng1_cache_slm is work-group SLM cached copy of source data + const _IdType __i_elem = __local_id * __chunk; // Cooperative data load from __rng1 to __rng1_cache_slm, from __rng2 to __rng1_cache_slm - if (__need_load_data) - { - load_data_into_slm(__rng1, __rng1_cache_slm, __sp_base_left_global.first, __sp_base_right_global.first, - __rng2, __rng2_cache_slm, __sp_base_left_global.second, __sp_base_right_global.second, - 
__wi_in_one_wg, __local_id); - } + _ONEDPL_PRAGMA_UNROLL + for (_IdType __idx = __i_elem; __idx < __i_elem + __chunk && __sp_base_left_global.first + __idx < __sp_base_right_global.first; ++__idx) + __rng1_cache_slm[__idx] = __rng1[__sp_base_left_global.first + __idx]; + _ONEDPL_PRAGMA_UNROLL + for (_IdType __idx = __i_elem; __idx < __i_elem + __chunk && __sp_base_left_global.second + __idx < __sp_base_right_global.second; ++__idx) + __rng2_cache_slm[__idx] = __rng2[__sp_base_left_global.second + __idx]; // Wait until all the data is loaded __dpl_sycl::__group_barrier(__nd_item); // Current diagonal inside of the merge matrix? - if (__need_merge_data) + if (__global_linear_id * __chunk < __n) { // Find split point in LOCAL coordinates // - bottom-right split point describes the size of current area between two base diagonals. const _split_point_t<_IdType> __sp_local = __find_start_point( __rng1_cache_slm, __rng2_cache_slm, // SLM cached copy of merging data - (_IdType)(__local_id * __chunk), // __i_elem in LOCAL coordinates because __rng1_cache_slm and __rng1_cache_slm is work-group SLM cached copy of source data + __i_elem, // __i_elem in LOCAL coordinates because __rng1_cache_slm and __rng1_cache_slm is work-group SLM cached copy of source data __rng1_wg_data_size, __rng2_wg_data_size, // size of rng1 and rng2 __comp); From f9df4d471c93e2689e616e96682b4d9874dae7d1 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 19 Nov 2024 12:28:59 +0100 Subject: [PATCH 55/80] restore __parallel_merge_submitter call Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index efccf26e581..31959de347a 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ 
b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -455,7 +455,7 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy constexpr bool __same_merge_types = std::is_same_v<_Range1ValueType, _Range2ValueType>; const std::size_t __n = __rng1.size() + __rng2.size(); - if (false) //if (__n < __starting_size_limit_for_large_submitter || !__same_merge_types) + if (__n < __starting_size_limit_for_large_submitter || !__same_merge_types) { static_assert(__starting_size_limit_for_large_submitter < std::numeric_limits::max()); From 04feeb975eff0ded56dab64a4deb604f6ad42c59 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 19 Nov 2024 12:30:14 +0100 Subject: [PATCH 56/80] call __parallel_merge_submitter_large for 1Mb of merging data and more Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 31959de347a..1a524f8321e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -447,7 +447,7 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy { using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; - constexpr std::size_t __starting_size_limit_for_large_submitter = 16 * 1'048'576; // 4 Mb + constexpr std::size_t __starting_size_limit_for_large_submitter = 1 * 1'048'576; // 1 Mb using _Range1ValueType = typename std::iterator_traits::value_type; using _Range2ValueType = typename std::iterator_traits::value_type; From eb2de959601f7f81d4f4cf9b7c5129a81869f8d3 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 19 Nov 2024 13:05:34 +0100 Subject: [PATCH 57/80] improvement of for-loop in loading data into SLM cache 
Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 1a524f8321e..2230851f5ce 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -391,13 +391,14 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // Calculate __i_elem in LOCAL coordinates because __rng1_cache_slm and __rng1_cache_slm is work-group SLM cached copy of source data const _IdType __i_elem = __local_id * __chunk; + const _IdType __i_elem_next = (__local_id + 1) * __chunk; // Cooperative data load from __rng1 to __rng1_cache_slm, from __rng2 to __rng1_cache_slm _ONEDPL_PRAGMA_UNROLL - for (_IdType __idx = __i_elem; __idx < __i_elem + __chunk && __sp_base_left_global.first + __idx < __sp_base_right_global.first; ++__idx) + for (_IdType __idx = __i_elem; __idx < __i_elem_next && __idx < __rng1_wg_data_size; ++__idx) __rng1_cache_slm[__idx] = __rng1[__sp_base_left_global.first + __idx]; _ONEDPL_PRAGMA_UNROLL - for (_IdType __idx = __i_elem; __idx < __i_elem + __chunk && __sp_base_left_global.second + __idx < __sp_base_right_global.second; ++__idx) + for (_IdType __idx = __i_elem; __idx < __i_elem_next && __idx < __rng2_wg_data_size; ++__idx) __rng2_cache_slm[__idx] = __rng2[__sp_base_left_global.second + __idx]; // Wait until all the data is loaded From 3ec199d96aa7e0c6e3852ba1e7a51369ff6b9209 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 19 Nov 2024 13:14:22 +0100 Subject: [PATCH 58/80] improvement of for-loop in loading data into SLM cache Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git 
a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 2230851f5ce..310eb058e88 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -394,11 +394,14 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const _IdType __i_elem_next = (__local_id + 1) * __chunk; // Cooperative data load from __rng1 to __rng1_cache_slm, from __rng2 to __rng1_cache_slm + _IdType __idx_end = std::min(__i_elem_next, __rng1_wg_data_size); _ONEDPL_PRAGMA_UNROLL - for (_IdType __idx = __i_elem; __idx < __i_elem_next && __idx < __rng1_wg_data_size; ++__idx) + for (_IdType __idx = __i_elem; __idx < __idx_end; ++__idx) __rng1_cache_slm[__idx] = __rng1[__sp_base_left_global.first + __idx]; + + __idx_end = std::min(__i_elem_next, __rng2_wg_data_size); _ONEDPL_PRAGMA_UNROLL - for (_IdType __idx = __i_elem; __idx < __i_elem_next && __idx < __rng2_wg_data_size; ++__idx) + for (_IdType __idx = __i_elem; __idx < __idx_end; ++__idx) __rng2_cache_slm[__idx] = __rng2[__sp_base_left_global.second + __idx]; // Wait until all the data is loaded From fde1797e45662c2c8c04e5523b8a157d3426ea30 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 19 Nov 2024 13:22:52 +0100 Subject: [PATCH 59/80] Revert "include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - increate chunk size on GPU to 8" This reverts commit 1b5f0a7b9bc4f401155d4a51ed5db4bbdc4f982f. 
--- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 310eb058e88..a55e3ef7636 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -299,7 +299,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, _PRINT_INFO_IN_DEBUG_MODE(__exec); // Empirical number of values to process per work-item - const _IdType __chunk = __exec.queue().get_device().is_cpu() ? 128 : 8; + const _IdType __chunk = __exec.queue().get_device().is_cpu() ? 128 : 4; assert(__chunk > 0); // Pessimistically only use half of the memory to take into account memory used by compiled kernel From 5ba4cd59a093dfa6bd6018d84154937bbb3747f0 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 19 Nov 2024 15:29:00 +0100 Subject: [PATCH 60/80] rewrite cooperative data load into SLM Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 46 ++++++++++++------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index a55e3ef7636..c80ad29ef38 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -385,24 +385,38 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const _IdType __rng1_wg_data_size = __sp_base_right_global.first - __sp_base_left_global.first; const _IdType __rng2_wg_data_size = __sp_base_right_global.second - __sp_base_left_global.second; + const _IdType __rng_wg_data_size = __rng1_wg_data_size + __rng2_wg_data_size; _RangeValueType* __rng1_cache_slm = std::addressof(__loc_acc[0]); 
_RangeValueType* __rng2_cache_slm = std::addressof(__loc_acc[0]) + __rng1_wg_data_size; - // Calculate __i_elem in LOCAL coordinates because __rng1_cache_slm and __rng1_cache_slm is work-group SLM cached copy of source data - const _IdType __i_elem = __local_id * __chunk; - const _IdType __i_elem_next = (__local_id + 1) * __chunk; - - // Cooperative data load from __rng1 to __rng1_cache_slm, from __rng2 to __rng1_cache_slm - _IdType __idx_end = std::min(__i_elem_next, __rng1_wg_data_size); - _ONEDPL_PRAGMA_UNROLL - for (_IdType __idx = __i_elem; __idx < __idx_end; ++__idx) - __rng1_cache_slm[__idx] = __rng1[__sp_base_left_global.first + __idx]; - - __idx_end = std::min(__i_elem_next, __rng2_wg_data_size); - _ONEDPL_PRAGMA_UNROLL - for (_IdType __idx = __i_elem; __idx < __idx_end; ++__idx) - __rng2_cache_slm[__idx] = __rng2[__sp_base_left_global.second + __idx]; + const std::size_t __chunk_of_data_reading = oneapi::dpl::__internal::__dpl_ceiling_div(__rng_wg_data_size, __wi_in_one_wg); + const std::size_t __idx_begin = __local_id * __chunk_of_data_reading; + if (__idx_begin < __rng_wg_data_size) + { + const _IdType __idx_end = std::min(__idx_begin + __chunk_of_data_reading, (std::size_t)__rng_wg_data_size); + + // Cooperative data load from __rng1 to __rng1_cache_slm + if (__idx_begin < __rng1_wg_data_size) + { + const _IdType __idx_begin_rng1 = __idx_begin; + const _IdType __idx_end_rng1 = std::min(__idx_end, __rng1_wg_data_size); + _ONEDPL_PRAGMA_UNROLL + for (_IdType __idx = __idx_begin_rng1; __idx < __idx_end_rng1; ++__idx) + __rng1_cache_slm[__idx] = __rng1[__sp_base_left_global.first + __idx]; + } + + // Cooperative data load from __rng2 to __rng1_cache_slm + if (__idx_end > __rng1_wg_data_size) + { + const _IdType __idx_begin_rng2 = 0; + const _IdType __idx_end_rng2 = __idx_end - __rng1_wg_data_size; + + _ONEDPL_PRAGMA_UNROLL + for (_IdType __idx = __idx_begin_rng2; __idx < __idx_end_rng2; ++__idx) + __rng2_cache_slm[__idx] = 
__rng2[__sp_base_left_global.second + __idx]; + } + } // Wait until all the data is loaded __dpl_sycl::__group_barrier(__nd_item); @@ -414,7 +428,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // - bottom-right split point describes the size of current area between two base diagonals. const _split_point_t<_IdType> __sp_local = __find_start_point( __rng1_cache_slm, __rng2_cache_slm, // SLM cached copy of merging data - __i_elem, // __i_elem in LOCAL coordinates because __rng1_cache_slm and __rng1_cache_slm is work-group SLM cached copy of source data + (_IdType)(__local_id * __chunk), // __i_elem in LOCAL coordinates because __rng1_cache_slm and __rng1_cache_slm is work-group SLM cached copy of source data __rng1_wg_data_size, __rng2_wg_data_size, // size of rng1 and rng2 __comp); @@ -459,7 +473,7 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy constexpr bool __same_merge_types = std::is_same_v<_Range1ValueType, _Range2ValueType>; const std::size_t __n = __rng1.size() + __rng2.size(); - if (__n < __starting_size_limit_for_large_submitter || !__same_merge_types) + if (false)//if (__n < __starting_size_limit_for_large_submitter || !__same_merge_types) { static_assert(__starting_size_limit_for_large_submitter < std::numeric_limits::max()); From 28e2e374f0f879160070c08f5fccee0053aec47b Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 19 Nov 2024 15:55:31 +0100 Subject: [PATCH 61/80] evalueate __chunk_of_data_reading through SLM bank size Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index c80ad29ef38..9969adacd61 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ 
b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -390,7 +390,11 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, _RangeValueType* __rng1_cache_slm = std::addressof(__loc_acc[0]); _RangeValueType* __rng2_cache_slm = std::addressof(__loc_acc[0]) + __rng1_wg_data_size; - const std::size_t __chunk_of_data_reading = oneapi::dpl::__internal::__dpl_ceiling_div(__rng_wg_data_size, __wi_in_one_wg); + constexpr std::size_t __slm_bank_size = 32; + + const std::size_t __chunk_of_data_reading = std::max( + oneapi::dpl::__internal::__dpl_ceiling_div(__rng_wg_data_size, __wi_in_one_wg), + oneapi::dpl::__internal::__dpl_ceiling_div(__slm_bank_size, 2 * sizeof(_RangeValueType))); const std::size_t __idx_begin = __local_id * __chunk_of_data_reading; if (__idx_begin < __rng_wg_data_size) { From 00dcb1db9d357c1ef400ea46bcaab60ce59f7c08 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 19 Nov 2024 17:27:15 +0100 Subject: [PATCH 62/80] Using 2/3 of available SLM Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 9969adacd61..42552c29dbe 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -302,11 +302,11 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const _IdType __chunk = __exec.queue().get_device().is_cpu() ? 
128 : 4; assert(__chunk > 0); - // Pessimistically only use half of the memory to take into account memory used by compiled kernel + // Pessimistically only use 2/3 of the memory to take into account memory used by compiled kernel const std::size_t __max_slm_size_adj = std::max((std::size_t)__chunk, std::min((std::size_t)__n, oneapi::dpl::__internal::__slm_adjusted_work_group_size( - __exec, 2 * sizeof(_RangeValueType)))); + __exec, sizeof(_RangeValueType)))) * 2 / 3; // The amount of data must be a multiple of the chunk size. const std::size_t __max_source_data_items_fit_into_slm = __max_slm_size_adj - __max_slm_size_adj % __chunk; From 144de4a76fb20afb3b5fda148fc9eb2ba72e536e Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 19 Nov 2024 19:02:03 +0100 Subject: [PATCH 63/80] balance data load into SLM cache Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 51 ++++++++++--------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 42552c29dbe..e0a9f78a9d4 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -303,10 +303,9 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, assert(__chunk > 0); // Pessimistically only use 2/3 of the memory to take into account memory used by compiled kernel - const std::size_t __max_slm_size_adj = - std::max((std::size_t)__chunk, - std::min((std::size_t)__n, oneapi::dpl::__internal::__slm_adjusted_work_group_size( - __exec, sizeof(_RangeValueType)))) * 2 / 3; + const auto __slm_adjusted_work_group_size = oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, sizeof(_RangeValueType)); + const auto __slm_adjusted_work_group_size_x_part = __slm_adjusted_work_group_size * 2 / 3; + const std::size_t __max_slm_size_adj = 
__slm_adjusted_work_group_size_x_part; // The amount of data must be a multiple of the chunk size. const std::size_t __max_source_data_items_fit_into_slm = __max_slm_size_adj - __max_slm_size_adj % __chunk; @@ -319,7 +318,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // The amount of the base diagonals is the amount of the work-groups // - also it's the distance between two base diagonals is equal to the amount of work-items in each work-group - const std::size_t __wg_count = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __max_source_data_items_fit_into_slm); + const std::size_t __wg_count = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk * __wi_in_one_wg); // Create storage for save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) // - in GLOBAL coordinates @@ -385,39 +384,43 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const _IdType __rng1_wg_data_size = __sp_base_right_global.first - __sp_base_left_global.first; const _IdType __rng2_wg_data_size = __sp_base_right_global.second - __sp_base_left_global.second; - const _IdType __rng_wg_data_size = __rng1_wg_data_size + __rng2_wg_data_size; _RangeValueType* __rng1_cache_slm = std::addressof(__loc_acc[0]); _RangeValueType* __rng2_cache_slm = std::addressof(__loc_acc[0]) + __rng1_wg_data_size; - constexpr std::size_t __slm_bank_size = 32; + const std::size_t __chunk_of_data_reading = oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size + __rng2_wg_data_size, __wi_in_one_wg); - const std::size_t __chunk_of_data_reading = std::max( - oneapi::dpl::__internal::__dpl_ceiling_div(__rng_wg_data_size, __wi_in_one_wg), - oneapi::dpl::__internal::__dpl_ceiling_div(__slm_bank_size, 2 * sizeof(_RangeValueType))); - const std::size_t __idx_begin = __local_id * __chunk_of_data_reading; - if (__idx_begin < __rng_wg_data_size) + const std::size_t __how_many_wi_reads_rng1 = 
oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size, __chunk_of_data_reading); + const std::size_t __how_many_wi_reads_rng2 = oneapi::dpl::__internal::__dpl_ceiling_div(__rng2_wg_data_size, __chunk_of_data_reading); + + // Calculate the amount of WI for read data from rng1 + if (__local_id < __how_many_wi_reads_rng1) { - const _IdType __idx_end = std::min(__idx_begin + __chunk_of_data_reading, (std::size_t)__rng_wg_data_size); + const std::size_t __idx_begin = __local_id * __chunk_of_data_reading; // Cooperative data load from __rng1 to __rng1_cache_slm if (__idx_begin < __rng1_wg_data_size) { - const _IdType __idx_begin_rng1 = __idx_begin; - const _IdType __idx_end_rng1 = std::min(__idx_end, __rng1_wg_data_size); + const std::size_t __idx_end = std::min(__idx_begin + __chunk_of_data_reading, (std::size_t)__rng1_wg_data_size); + _ONEDPL_PRAGMA_UNROLL - for (_IdType __idx = __idx_begin_rng1; __idx < __idx_end_rng1; ++__idx) + for (_IdType __idx = __idx_begin; __idx < __idx_end; ++__idx) __rng1_cache_slm[__idx] = __rng1[__sp_base_left_global.first + __idx]; } + } - // Cooperative data load from __rng2 to __rng1_cache_slm - if (__idx_end > __rng1_wg_data_size) - { - const _IdType __idx_begin_rng2 = 0; - const _IdType __idx_end_rng2 = __idx_end - __rng1_wg_data_size; + const std::size_t __first_wi_local_id_for_read_rng2 = __wi_in_one_wg - __how_many_wi_reads_rng2 - 1; + if (__local_id >= __first_wi_local_id_for_read_rng2) + { + const std::size_t __idx_begin = (__local_id - __first_wi_local_id_for_read_rng2) * __chunk_of_data_reading; + // Cooperative data load from __rng2 to __rng2_cache_slm + if (__idx_begin < __rng2_wg_data_size) + { + const std::size_t __idx_end = std::min(__idx_begin + __chunk_of_data_reading, (std::size_t)__rng2_wg_data_size); + _ONEDPL_PRAGMA_UNROLL - for (_IdType __idx = __idx_begin_rng2; __idx < __idx_end_rng2; ++__idx) + for (_IdType __idx = __idx_begin; __idx < __idx_end; ++__idx) __rng2_cache_slm[__idx] = 
__rng2[__sp_base_left_global.second + __idx]; } } @@ -477,10 +480,10 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy constexpr bool __same_merge_types = std::is_same_v<_Range1ValueType, _Range2ValueType>; const std::size_t __n = __rng1.size() + __rng2.size(); - if (false)//if (__n < __starting_size_limit_for_large_submitter || !__same_merge_types) + if (__n < __starting_size_limit_for_large_submitter || !__same_merge_types) { static_assert(__starting_size_limit_for_large_submitter < std::numeric_limits::max()); - + using _WiIndex = std::uint32_t; using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __merge_kernel_name<_CustomName, _WiIndex>>; From 1812dbb2de56582968584bd7490046c0bdbc591d Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 19 Nov 2024 19:33:26 +0100 Subject: [PATCH 64/80] balance data load into SLM cache Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index e0a9f78a9d4..7bc99aa6cbd 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -302,6 +302,12 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const _IdType __chunk = __exec.queue().get_device().is_cpu() ? 128 : 4; assert(__chunk > 0); + // Define SLM bank size + constexpr std::size_t __slm_bank_size = 32; // TODO is it correct value? How to get it from hardware? 
+ + // Calculate how many data items we can read into one SLM bank + constexpr std::size_t __data_items_in_slm_bank = std::max((std::size_t)1, __slm_bank_size / sizeof(_RangeValueType)); + // Pessimistically only use 2/3 of the memory to take into account memory used by compiled kernel const auto __slm_adjusted_work_group_size = oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, sizeof(_RangeValueType)); const auto __slm_adjusted_work_group_size_x_part = __slm_adjusted_work_group_size * 2 / 3; @@ -388,7 +394,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, _RangeValueType* __rng1_cache_slm = std::addressof(__loc_acc[0]); _RangeValueType* __rng2_cache_slm = std::addressof(__loc_acc[0]) + __rng1_wg_data_size; - const std::size_t __chunk_of_data_reading = oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size + __rng2_wg_data_size, __wi_in_one_wg); + const std::size_t __chunk_of_data_reading = std::max(__data_items_in_slm_bank, oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size + __rng2_wg_data_size, __wi_in_one_wg)); const std::size_t __how_many_wi_reads_rng1 = oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size, __chunk_of_data_reading); const std::size_t __how_many_wi_reads_rng2 = oneapi::dpl::__internal::__dpl_ceiling_div(__rng2_wg_data_size, __chunk_of_data_reading); From 3fdd67379b8f0e959e2296e3054da37fe4813438 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 19 Nov 2024 19:42:28 +0100 Subject: [PATCH 65/80] Using 4/5 of available SLM Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 7bc99aa6cbd..9cb0a04a392 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ 
b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -310,7 +310,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // Pessimistically only use 2/3 of the memory to take into account memory used by compiled kernel const auto __slm_adjusted_work_group_size = oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, sizeof(_RangeValueType)); - const auto __slm_adjusted_work_group_size_x_part = __slm_adjusted_work_group_size * 2 / 3; + const auto __slm_adjusted_work_group_size_x_part = __slm_adjusted_work_group_size * 4 / 5; const std::size_t __max_slm_size_adj = __slm_adjusted_work_group_size_x_part; // The amount of data must be a multiple of the chunk size. From 0ca542a2db9beb47b55cb80c6087e9c27147aa3b Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 20 Nov 2024 10:37:49 +0100 Subject: [PATCH 66/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix an error in types of SP on base diagonals Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 9cb0a04a392..0cf5bf24ffd 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -382,8 +382,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // Split points on left anr right base diagonals // - in GLOBAL coordinates - const _split_point_t& __sp_base_left_global = __base_diagonals_sp_global_ptr[__group_linear_id]; - const _split_point_t& __sp_base_right_global = __base_diagonals_sp_global_ptr[__group_linear_id + 1]; + const auto& __sp_base_left_global = __base_diagonals_sp_global_ptr[__group_linear_id]; + const auto& __sp_base_right_global = __base_diagonals_sp_global_ptr[__group_linear_id + 
1]; assert(__sp_base_right_global.first >= __sp_base_left_global.first); assert(__sp_base_right_global.second >= __sp_base_left_global.second); From b32c911f5d4d7afc2499c862853fe9e43de79d08 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 20 Nov 2024 10:42:21 +0100 Subject: [PATCH 67/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix types in __parallel_merge_submitter_large::operator() Signed-off-by: Sergey Kopienko --- .../hetero/dpcpp/parallel_backend_sycl_merge.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 0cf5bf24ffd..47e12d20bed 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -394,20 +394,20 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, _RangeValueType* __rng1_cache_slm = std::addressof(__loc_acc[0]); _RangeValueType* __rng2_cache_slm = std::addressof(__loc_acc[0]) + __rng1_wg_data_size; - const std::size_t __chunk_of_data_reading = std::max(__data_items_in_slm_bank, oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size + __rng2_wg_data_size, __wi_in_one_wg)); + const _IdType __chunk_of_data_reading = std::max(__data_items_in_slm_bank, oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size + __rng2_wg_data_size, __wi_in_one_wg)); - const std::size_t __how_many_wi_reads_rng1 = oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size, __chunk_of_data_reading); - const std::size_t __how_many_wi_reads_rng2 = oneapi::dpl::__internal::__dpl_ceiling_div(__rng2_wg_data_size, __chunk_of_data_reading); + const _IdType __how_many_wi_reads_rng1 = oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size, __chunk_of_data_reading); + const _IdType __how_many_wi_reads_rng2 = 
oneapi::dpl::__internal::__dpl_ceiling_div(__rng2_wg_data_size, __chunk_of_data_reading); // Calculate the amount of WI for read data from rng1 if (__local_id < __how_many_wi_reads_rng1) { - const std::size_t __idx_begin = __local_id * __chunk_of_data_reading; + const _IdType __idx_begin = __local_id * __chunk_of_data_reading; // Cooperative data load from __rng1 to __rng1_cache_slm if (__idx_begin < __rng1_wg_data_size) { - const std::size_t __idx_end = std::min(__idx_begin + __chunk_of_data_reading, (std::size_t)__rng1_wg_data_size); + const _IdType __idx_end = std::min(__idx_begin + __chunk_of_data_reading, __rng1_wg_data_size); _ONEDPL_PRAGMA_UNROLL for (_IdType __idx = __idx_begin; __idx < __idx_end; ++__idx) @@ -418,12 +418,12 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const std::size_t __first_wi_local_id_for_read_rng2 = __wi_in_one_wg - __how_many_wi_reads_rng2 - 1; if (__local_id >= __first_wi_local_id_for_read_rng2) { - const std::size_t __idx_begin = (__local_id - __first_wi_local_id_for_read_rng2) * __chunk_of_data_reading; + const _IdType __idx_begin = (__local_id - __first_wi_local_id_for_read_rng2) * __chunk_of_data_reading; // Cooperative data load from __rng2 to __rng2_cache_slm if (__idx_begin < __rng2_wg_data_size) { - const std::size_t __idx_end = std::min(__idx_begin + __chunk_of_data_reading, (std::size_t)__rng2_wg_data_size); + const _IdType __idx_end = std::min(__idx_begin + __chunk_of_data_reading, __rng2_wg_data_size); _ONEDPL_PRAGMA_UNROLL for (_IdType __idx = __idx_begin; __idx < __idx_end; ++__idx) From 7cda3e4291658a083dee9723b603a6ba8e647b62 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 20 Nov 2024 10:56:59 +0100 Subject: [PATCH 68/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix types in __serial_merge Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 47e12d20bed..74606208870 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -194,19 +194,19 @@ __serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, _Index _ { //copying a residual of the second seq const _Index __n = std::min<_Index>(__n2 - __start2, __chunk); - for (std::uint8_t __i = 0; __i < __n; ++__i) + for (_Index __i = 0; __i < __n; ++__i) __rng3[__start3 + __i] = __rng2[__start2 + __i]; } else if (__start2 >= __n2) { //copying a residual of the first seq const _Index __n = std::min<_Index>(__n1 - __start1, __chunk); - for (std::uint8_t __i = 0; __i < __n; ++__i) + for (_Index __i = 0; __i < __n; ++__i) __rng3[__start3 + __i] = __rng1[__start1 + __i]; } else { - for (std::uint8_t __i = 0; __i < __chunk && __start1 < __n1 && __start2 < __n2; ++__i) + for (_Index __i = 0; __i < __chunk && __start1 < __n1 && __start2 < __n2; ++__i) { const auto& __val1 = __rng1[__start1]; const auto& __val2 = __rng2[__start2]; From 253ca8d0cde9029b4b490e30ad6a3e42894bd082 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 20 Nov 2024 11:10:28 +0100 Subject: [PATCH 69/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - remove extra local variable Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 74606208870..e73c5ee5690 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -311,10 +311,9 @@ struct 
__parallel_merge_submitter_large<_IdType, _CustomName, // Pessimistically only use 2/3 of the memory to take into account memory used by compiled kernel const auto __slm_adjusted_work_group_size = oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, sizeof(_RangeValueType)); const auto __slm_adjusted_work_group_size_x_part = __slm_adjusted_work_group_size * 4 / 5; - const std::size_t __max_slm_size_adj = __slm_adjusted_work_group_size_x_part; // The amount of data must be a multiple of the chunk size. - const std::size_t __max_source_data_items_fit_into_slm = __max_slm_size_adj - __max_slm_size_adj % __chunk; + const std::size_t __max_source_data_items_fit_into_slm = __slm_adjusted_work_group_size_x_part - __slm_adjusted_work_group_size_x_part % __chunk; assert(__max_source_data_items_fit_into_slm > 0); assert(__max_source_data_items_fit_into_slm % __chunk == 0); From 952871e1247c9f39132e1f3d435885ed51e1519c Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 20 Nov 2024 16:28:51 +0100 Subject: [PATCH 70/80] @@@ include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - debug code under DUMP_DATA_LOADING Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index e73c5ee5690..a8c5f9045d3 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -26,6 +26,8 @@ #include "sycl_defs.h" #include "parallel_backend_sycl_utils.h" +//#define DUMP_DATA_LOADING 1 + namespace oneapi { namespace dpl @@ -280,6 +282,20 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __internal::__optional_kernel_name<_DiagonalsKernelName...>, __internal::__optional_kernel_name<_MergeKernelName...>> { +#if DUMP_DATA_LOADING + template 
+ static void + __load_item_into_slm(_Range&& __rng, _Index __idx_from, _Data* __slm, _Index __idx_to, std::size_t __range_index, + bool __b_check, std::size_t __group_linear_id, std::size_t __local_id) + { + // BP + // condition: __b_check + // action: __range_index = {__range_index}, __rng[{__idx_from}] -> __slm[{__idx_to}], __group_linear_id = {__group_linear_id}, __local_id = {__local_id} + // action: {__range_index}, {__idx_from}, {__idx_to}, {__group_linear_id}, {__local_id} + __slm[__idx_to] = __rng[__idx_from]; + } +#endif + template auto operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const @@ -294,6 +310,18 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const _IdType __n2 = __rng2.size(); const _IdType __n = __n1 + __n2; +#if DUMP_DATA_LOADING + //const bool __b_check = __n1 == 16144 && __n2 == 8072; + //const bool __b_check = __n1 == 50716 && __n2 == 25358; // __wi_in_one_wg = 51 __wg_count = 12 + const bool __b_check = false; + + if (__b_check) + { + int i = 0; + i = i; + } +#endif + assert(__n1 > 0 || __n2 > 0); _PRINT_INFO_IN_DEBUG_MODE(__exec); @@ -410,7 +438,11 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, _ONEDPL_PRAGMA_UNROLL for (_IdType __idx = __idx_begin; __idx < __idx_end; ++__idx) +#if !DUMP_DATA_LOADING __rng1_cache_slm[__idx] = __rng1[__sp_base_left_global.first + __idx]; +#else + __load_item_into_slm(__rng1, __sp_base_left_global.first + __idx, __rng1_cache_slm, __idx, 1, __b_check, __group_linear_id, __local_id); +#endif } } @@ -426,13 +458,44 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, _ONEDPL_PRAGMA_UNROLL for (_IdType __idx = __idx_begin; __idx < __idx_end; ++__idx) +#if !DUMP_DATA_LOADING __rng2_cache_slm[__idx] = __rng2[__sp_base_left_global.second + __idx]; +#else + __load_item_into_slm(__rng2, __sp_base_left_global.second + __idx, __rng2_cache_slm, __idx, 2, __b_check, __group_linear_id, __local_id); +#endif } } 
// Wait until all the data is loaded __dpl_sycl::__group_barrier(__nd_item); +#if DUMP_DATA_LOADING + if (__local_id == 0) + { + for (auto i = __sp_base_left_global.first; i < __sp_base_right_global.first; ++i) + { + auto _idx_slm = i - __sp_base_left_global.first; + if (__rng1_cache_slm[_idx_slm] != __rng1[i]) + { + auto __group_linear_id_tmp = __group_linear_id; + __group_linear_id_tmp = __group_linear_id_tmp; + assert(false); + } + } + + for (auto i = __sp_base_left_global.second; i < __sp_base_right_global.second; ++i) + { + auto _idx_slm = i - __sp_base_left_global.second; + if (__rng2_cache_slm[_idx_slm] != __rng2[i]) + { + auto __group_linear_id_tmp = __group_linear_id; + __group_linear_id_tmp = __group_linear_id_tmp; + assert(false); + } + } + } +#endif + // Current diagonal inside of the merge matrix? if (__global_linear_id * __chunk < __n) { From b6e1d1c58ca7a67407760a4c6e9a624efacca762 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 20 Nov 2024 16:30:32 +0100 Subject: [PATCH 71/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix an error in data loading Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index a8c5f9045d3..10bc5609cb9 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -446,7 +446,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, } } - const std::size_t __first_wi_local_id_for_read_rng2 = __wi_in_one_wg - __how_many_wi_reads_rng2 - 1; + const std::size_t __first_wi_local_id_for_read_rng2 = __wi_in_one_wg - __how_many_wi_reads_rng2; if (__local_id >= __first_wi_local_id_for_read_rng2) { const _IdType __idx_begin = (__local_id - 
__first_wi_local_id_for_read_rng2) * __chunk_of_data_reading; From ef568f08fbd1413cbfcde25f27c2d1780f8aacb1 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 20 Nov 2024 16:32:24 +0100 Subject: [PATCH 72/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix chunk size on GPU Signed-off-by: Sergey Kopienko --- .../pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 10bc5609cb9..3a2c893bf15 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -326,16 +326,16 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, _PRINT_INFO_IN_DEBUG_MODE(__exec); - // Empirical number of values to process per work-item - const _IdType __chunk = __exec.queue().get_device().is_cpu() ? 128 : 4; - assert(__chunk > 0); - // Define SLM bank size constexpr std::size_t __slm_bank_size = 32; // TODO is it correct value? How to get it from hardware? // Calculate how many data items we can read into one SLM bank constexpr std::size_t __data_items_in_slm_bank = std::max((std::size_t)1, __slm_bank_size / sizeof(_RangeValueType)); + // Empirical number of values to process per work-item + _IdType __chunk = __exec.queue().get_device().is_cpu() ? 
128 : __data_items_in_slm_bank; + assert(__chunk > 0); + // Pessimistically only use 2/3 of the memory to take into account memory used by compiled kernel const auto __slm_adjusted_work_group_size = oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, sizeof(_RangeValueType)); const auto __slm_adjusted_work_group_size_x_part = __slm_adjusted_work_group_size * 4 / 5; @@ -353,6 +353,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // - also it's the distance between two base diagonals is equal to the amount of work-items in each work-group const std::size_t __wg_count = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk * __wi_in_one_wg); + assert(__wg_count * __wi_in_one_wg * __chunk >= __n); + // Create storage for save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) // - in GLOBAL coordinates using __base_diagonals_sp_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>; From 3dced5122178241fbc3c46e16394e5f270ad5c65 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 20 Nov 2024 16:33:57 +0100 Subject: [PATCH 73/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix calculation of available SLM memory amount Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 3a2c893bf15..cc34a8144d8 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -336,17 +336,24 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, _IdType __chunk = __exec.queue().get_device().is_cpu() ? 
128 : __data_items_in_slm_bank; assert(__chunk > 0); - // Pessimistically only use 2/3 of the memory to take into account memory used by compiled kernel - const auto __slm_adjusted_work_group_size = oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, sizeof(_RangeValueType)); - const auto __slm_adjusted_work_group_size_x_part = __slm_adjusted_work_group_size * 4 / 5; + // Get the size of local memory arena in bytes. + const std::size_t __slm_mem_size = __exec.queue().get_device().template get_info(); - // The amount of data must be a multiple of the chunk size. - const std::size_t __max_source_data_items_fit_into_slm = __slm_adjusted_work_group_size_x_part - __slm_adjusted_work_group_size_x_part % __chunk; - assert(__max_source_data_items_fit_into_slm > 0); - assert(__max_source_data_items_fit_into_slm % __chunk == 0); + // Pessimistically only use 4/5 of the memory to take into account memory used by compiled kernel + const std::size_t __slm_mem_size_x_part = __slm_mem_size * 4 / 5; + + // Calculate how many items count we may place into SLM memory + const auto __slm_cached_items_count = __slm_mem_size_x_part / sizeof(_RangeValueType); // The amount of items in the each work-group is the amount of diagonals processing between two work-groups + 1 (for the left base diagonal in work-group) - const std::size_t __wi_in_one_wg = __max_source_data_items_fit_into_slm / __chunk; + std::size_t __wi_in_one_wg = __slm_cached_items_count / __chunk; + const std::size_t __max_wi_in_one_wg = __exec.queue().get_device().template get_info>()[0]; + if (__wi_in_one_wg > __max_wi_in_one_wg) + { + __chunk = oneapi::dpl::__internal::__dpl_ceiling_div(__slm_cached_items_count, __max_wi_in_one_wg); + __wi_in_one_wg = __slm_cached_items_count / __chunk; + assert(__wi_in_one_wg <= __max_wi_in_one_wg); + } assert(__wi_in_one_wg > 0); // The amount of the base diagonals is the amount of the work-groups @@ -396,7 +403,7 @@ struct __parallel_merge_submitter_large<_IdType, 
_CustomName, auto __base_diagonals_sp_global_ptr = __base_diagonals_sp_storage_t::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); const std::size_t __slm_cached_data_size = __wi_in_one_wg * __chunk; - __dpl_sycl::__local_accessor<_RangeValueType> __loc_acc(2 * __slm_cached_data_size, __cgh); + __dpl_sycl::__local_accessor<_RangeValueType> __loc_acc(__slm_cached_data_size, __cgh); // Run nd_range parallel_for to process all the data // - each work-group caching source data in SLM and processing diagonals between two base diagonals; From 39b68e4a89294734e0b89c3cc48092224866fcbf Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 20 Nov 2024 16:35:44 +0100 Subject: [PATCH 74/80] Revert "@@@ include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - debug code under DUMP_DATA_LOADING" This reverts commit 952871e1247c9f39132e1f3d435885ed51e1519c. --- .../dpcpp/parallel_backend_sycl_merge.h | 63 ------------------- 1 file changed, 63 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index cc34a8144d8..cae18d8425a 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -26,8 +26,6 @@ #include "sycl_defs.h" #include "parallel_backend_sycl_utils.h" -//#define DUMP_DATA_LOADING 1 - namespace oneapi { namespace dpl @@ -282,20 +280,6 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __internal::__optional_kernel_name<_DiagonalsKernelName...>, __internal::__optional_kernel_name<_MergeKernelName...>> { -#if DUMP_DATA_LOADING - template - static void - __load_item_into_slm(_Range&& __rng, _Index __idx_from, _Data* __slm, _Index __idx_to, std::size_t __range_index, - bool __b_check, std::size_t __group_linear_id, std::size_t __local_id) - { - // BP - // condition: __b_check - // action: __range_index = {__range_index}, 
__rng[{__idx_from}] -> __slm[{__idx_to}], __group_linear_id = {__group_linear_id}, __local_id = {__local_id} - // action: {__range_index}, {__idx_from}, {__idx_to}, {__group_linear_id}, {__local_id} - __slm[__idx_to] = __rng[__idx_from]; - } -#endif - template auto operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const @@ -310,18 +294,6 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const _IdType __n2 = __rng2.size(); const _IdType __n = __n1 + __n2; -#if DUMP_DATA_LOADING - //const bool __b_check = __n1 == 16144 && __n2 == 8072; - //const bool __b_check = __n1 == 50716 && __n2 == 25358; // __wi_in_one_wg = 51 __wg_count = 12 - const bool __b_check = false; - - if (__b_check) - { - int i = 0; - i = i; - } -#endif - assert(__n1 > 0 || __n2 > 0); _PRINT_INFO_IN_DEBUG_MODE(__exec); @@ -447,11 +419,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, _ONEDPL_PRAGMA_UNROLL for (_IdType __idx = __idx_begin; __idx < __idx_end; ++__idx) -#if !DUMP_DATA_LOADING __rng1_cache_slm[__idx] = __rng1[__sp_base_left_global.first + __idx]; -#else - __load_item_into_slm(__rng1, __sp_base_left_global.first + __idx, __rng1_cache_slm, __idx, 1, __b_check, __group_linear_id, __local_id); -#endif } } @@ -467,44 +435,13 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, _ONEDPL_PRAGMA_UNROLL for (_IdType __idx = __idx_begin; __idx < __idx_end; ++__idx) -#if !DUMP_DATA_LOADING __rng2_cache_slm[__idx] = __rng2[__sp_base_left_global.second + __idx]; -#else - __load_item_into_slm(__rng2, __sp_base_left_global.second + __idx, __rng2_cache_slm, __idx, 2, __b_check, __group_linear_id, __local_id); -#endif } } // Wait until all the data is loaded __dpl_sycl::__group_barrier(__nd_item); -#if DUMP_DATA_LOADING - if (__local_id == 0) - { - for (auto i = __sp_base_left_global.first; i < __sp_base_right_global.first; ++i) - { - auto _idx_slm = i - __sp_base_left_global.first; - if 
(__rng1_cache_slm[_idx_slm] != __rng1[i]) - { - auto __group_linear_id_tmp = __group_linear_id; - __group_linear_id_tmp = __group_linear_id_tmp; - assert(false); - } - } - - for (auto i = __sp_base_left_global.second; i < __sp_base_right_global.second; ++i) - { - auto _idx_slm = i - __sp_base_left_global.second; - if (__rng2_cache_slm[_idx_slm] != __rng2[i]) - { - auto __group_linear_id_tmp = __group_linear_id; - __group_linear_id_tmp = __group_linear_id_tmp; - assert(false); - } - } - } -#endif - // Current diagonal inside of the merge matrix? if (__global_linear_id * __chunk < __n) { From 2da44ff5e64efaaa4fef9a0a1eba2f2e5bc30198 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 20 Nov 2024 18:21:26 +0100 Subject: [PATCH 75/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - another approach to calculate the amount of work-groups and work-items Signed-off-by: Sergey Kopienko --- .../hetero/dpcpp/parallel_backend_sycl_merge.h | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index cae18d8425a..63729d21bfb 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -305,7 +305,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, constexpr std::size_t __data_items_in_slm_bank = std::max((std::size_t)1, __slm_bank_size / sizeof(_RangeValueType)); // Empirical number of values to process per work-item - _IdType __chunk = __exec.queue().get_device().is_cpu() ? 128 : __data_items_in_slm_bank; + const _IdType __chunk = __exec.queue().get_device().is_cpu() ? 128 : __data_items_in_slm_bank; assert(__chunk > 0); // Get the size of local memory arena in bytes. 
@@ -315,17 +315,11 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const std::size_t __slm_mem_size_x_part = __slm_mem_size * 4 / 5; // Calculate how many items count we may place into SLM memory - const auto __slm_cached_items_count = __slm_mem_size_x_part / sizeof(_RangeValueType); + auto __slm_cached_items_count = __slm_mem_size_x_part / sizeof(_RangeValueType); // The amount of items in the each work-group is the amount of diagonals processing between two work-groups + 1 (for the left base diagonal in work-group) - std::size_t __wi_in_one_wg = __slm_cached_items_count / __chunk; - const std::size_t __max_wi_in_one_wg = __exec.queue().get_device().template get_info>()[0]; - if (__wi_in_one_wg > __max_wi_in_one_wg) - { - __chunk = oneapi::dpl::__internal::__dpl_ceiling_div(__slm_cached_items_count, __max_wi_in_one_wg); - __wi_in_one_wg = __slm_cached_items_count / __chunk; - assert(__wi_in_one_wg <= __max_wi_in_one_wg); - } + const std::size_t __max_wg_size = __exec.queue().get_device().template get_info(); + const std::size_t __wi_in_one_wg = std::min(__max_wg_size, __slm_cached_items_count / __chunk); assert(__wi_in_one_wg > 0); // The amount of the base diagonals is the amount of the work-groups From b04b25e156b0e5a8c647345bea353bfad4524f34 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 21 Nov 2024 09:57:02 +0100 Subject: [PATCH 76/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - do not use SLM bank size Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 63729d21bfb..ec32f822e93 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -299,13 +299,13 @@ struct 
__parallel_merge_submitter_large<_IdType, _CustomName, _PRINT_INFO_IN_DEBUG_MODE(__exec); // Define SLM bank size - constexpr std::size_t __slm_bank_size = 32; // TODO is it correct value? How to get it from hardware? + //constexpr std::size_t __slm_bank_size = 32; // TODO is it correct value? How to get it from hardware? // Calculate how many data items we can read into one SLM bank - constexpr std::size_t __data_items_in_slm_bank = std::max((std::size_t)1, __slm_bank_size / sizeof(_RangeValueType)); + //constexpr std::size_t __data_items_in_slm_bank = std::max((std::size_t)1, __slm_bank_size / sizeof(_RangeValueType)); // Empirical number of values to process per work-item - const _IdType __chunk = __exec.queue().get_device().is_cpu() ? 128 : __data_items_in_slm_bank; + const _IdType __chunk = __exec.queue().get_device().is_cpu() ? 128 : 4;// __data_items_in_slm_bank; assert(__chunk > 0); // Get the size of local memory arena in bytes. @@ -396,7 +396,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, _RangeValueType* __rng1_cache_slm = std::addressof(__loc_acc[0]); _RangeValueType* __rng2_cache_slm = std::addressof(__loc_acc[0]) + __rng1_wg_data_size; - const _IdType __chunk_of_data_reading = std::max(__data_items_in_slm_bank, oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size + __rng2_wg_data_size, __wi_in_one_wg)); + const _IdType __chunk_of_data_reading = std::max(__chunk/*__data_items_in_slm_bank*/, (_IdType)oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size + __rng2_wg_data_size, __wi_in_one_wg)); const _IdType __how_many_wi_reads_rng1 = oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size, __chunk_of_data_reading); const _IdType __how_many_wi_reads_rng2 = oneapi::dpl::__internal::__dpl_ceiling_div(__rng2_wg_data_size, __chunk_of_data_reading); From cc5f8f05363849a7c982ce4918ded4d5a8f1ec4a Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Sat, 23 Nov 2024 13:58:37 +0100 Subject: [PATCH 77/80] 
include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - use std::size_t instead of _IdType Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index ec32f822e93..1a5cb8a41d8 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -290,9 +290,9 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, using _RangeValueType = _Range1ValueType; - const _IdType __n1 = __rng1.size(); - const _IdType __n2 = __rng2.size(); - const _IdType __n = __n1 + __n2; + const std::size_t __n1 = __rng1.size(); + const std::size_t __n2 = __rng2.size(); + const std::size_t __n = __n1 + __n2; assert(__n1 > 0 || __n2 > 0); @@ -305,7 +305,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, //constexpr std::size_t __data_items_in_slm_bank = std::max((std::size_t)1, __slm_bank_size / sizeof(_RangeValueType)); // Empirical number of values to process per work-item - const _IdType __chunk = __exec.queue().get_device().is_cpu() ? 128 : 4;// __data_items_in_slm_bank; + const std::size_t __chunk = __exec.queue().get_device().is_cpu() ? 128 : 4; // __data_items_in_slm_bank; assert(__chunk > 0); // Get the size of local memory arena in bytes. 
@@ -390,26 +390,26 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, assert(__sp_base_right_global.first >= __sp_base_left_global.first); assert(__sp_base_right_global.second >= __sp_base_left_global.second); - const _IdType __rng1_wg_data_size = __sp_base_right_global.first - __sp_base_left_global.first; - const _IdType __rng2_wg_data_size = __sp_base_right_global.second - __sp_base_left_global.second; + const std::size_t __rng1_wg_data_size = __sp_base_right_global.first - __sp_base_left_global.first; + const std::size_t __rng2_wg_data_size = __sp_base_right_global.second - __sp_base_left_global.second; _RangeValueType* __rng1_cache_slm = std::addressof(__loc_acc[0]); _RangeValueType* __rng2_cache_slm = std::addressof(__loc_acc[0]) + __rng1_wg_data_size; - const _IdType __chunk_of_data_reading = std::max(__chunk/*__data_items_in_slm_bank*/, (_IdType)oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size + __rng2_wg_data_size, __wi_in_one_wg)); + const std::size_t __chunk_of_data_reading = std::max(__chunk/*__data_items_in_slm_bank*/, (_IdType)oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size + __rng2_wg_data_size, __wi_in_one_wg)); - const _IdType __how_many_wi_reads_rng1 = oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size, __chunk_of_data_reading); - const _IdType __how_many_wi_reads_rng2 = oneapi::dpl::__internal::__dpl_ceiling_div(__rng2_wg_data_size, __chunk_of_data_reading); + const std::size_t __how_many_wi_reads_rng1 = oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size, __chunk_of_data_reading); + const std::size_t __how_many_wi_reads_rng2 = oneapi::dpl::__internal::__dpl_ceiling_div(__rng2_wg_data_size, __chunk_of_data_reading); // Calculate the amount of WI for read data from rng1 if (__local_id < __how_many_wi_reads_rng1) { - const _IdType __idx_begin = __local_id * __chunk_of_data_reading; + const std::size_t __idx_begin = __local_id * __chunk_of_data_reading; // Cooperative data load from 
__rng1 to __rng1_cache_slm if (__idx_begin < __rng1_wg_data_size) { - const _IdType __idx_end = std::min(__idx_begin + __chunk_of_data_reading, __rng1_wg_data_size); + const std::size_t __idx_end = std::min(__idx_begin + __chunk_of_data_reading, __rng1_wg_data_size); _ONEDPL_PRAGMA_UNROLL for (_IdType __idx = __idx_begin; __idx < __idx_end; ++__idx) @@ -420,12 +420,12 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const std::size_t __first_wi_local_id_for_read_rng2 = __wi_in_one_wg - __how_many_wi_reads_rng2; if (__local_id >= __first_wi_local_id_for_read_rng2) { - const _IdType __idx_begin = (__local_id - __first_wi_local_id_for_read_rng2) * __chunk_of_data_reading; + const std::size_t __idx_begin = (__local_id - __first_wi_local_id_for_read_rng2) * __chunk_of_data_reading; // Cooperative data load from __rng2 to __rng2_cache_slm if (__idx_begin < __rng2_wg_data_size) { - const _IdType __idx_end = std::min(__idx_begin + __chunk_of_data_reading, __rng2_wg_data_size); + const std::size_t __idx_end = std::min(__idx_begin + __chunk_of_data_reading, __rng2_wg_data_size); _ONEDPL_PRAGMA_UNROLL for (_IdType __idx = __idx_begin; __idx < __idx_end; ++__idx) From 4ec32e60647a2fd544ee849bc489500f0be0ac0a Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Sat, 23 Nov 2024 14:44:21 +0100 Subject: [PATCH 78/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h - fix compile errors Signed-off-by: Sergey Kopienko --- .../hetero/dpcpp/parallel_backend_sycl_merge_sort.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 0765f8ef7bc..c481a38beda 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -63,7 +63,7 @@ struct __group_merge_path_sorter template 
bool sort(const sycl::nd_item<1>& __item, const _StorageAcc& __storage_acc, _Compare __comp, std::uint32_t __start, - std::uint32_t __end, std::uint32_t __sorted, std::uint16_t __data_per_workitem, + std::uint32_t __end, std::uint32_t __sorted, std::uint32_t __data_per_workitem, std::uint32_t __workgroup_size) const { const std::uint32_t __sorted_final = __data_per_workitem * __workgroup_size; @@ -259,12 +259,12 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name __cgh.parallel_for<_GlobalSortName...>( sycl::range(__steps), [=](sycl::item __item_id) { - const _IndexT __i_elem = __item_id.get_linear_id() * __chunk; - const _IndexT __i_elem_local = __i_elem % (__n_sorted * 2); + const std::uint32_t __i_elem = __item_id.get_linear_id() * __chunk; + const std::uint32_t __i_elem_local = __i_elem % (__n_sorted * 2); - const _IndexT __offset = std::min<_IndexT>(__i_elem - __i_elem_local, __n); - const _IndexT __n1 = std::min<_IndexT>(__offset + __n_sorted, __n) - __offset; - const _IndexT __n2 = std::min<_IndexT>(__offset + __n1 + __n_sorted, __n) - (__offset + __n1); + const std::uint32_t __offset = std::min<_IndexT>(__i_elem - __i_elem_local, __n); + const std::uint32_t __n1 = std::min<_IndexT>(__offset + __n_sorted, __n) - __offset; + const std::uint32_t __n2 = std::min<_IndexT>(__offset + __n1 + __n_sorted, __n) - (__offset + __n1); if (__data_in_temp) { From 1810317eca283196501a8792a3ee5578e9c79474 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Sat, 23 Nov 2024 15:10:48 +0100 Subject: [PATCH 79/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix compile errors Signed-off-by: Sergey Kopienko --- .../pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 1a5cb8a41d8..f5562b67ecd 100644 --- 
a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -351,7 +351,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // - in GLOBAL coordinates _split_point_t<_IdType> __sp(__linear_id == 0 ? __zero_split_point : _split_point_t{__n1, __n2}); if (0 < __linear_id && __linear_id < __wg_count) - __sp = __find_start_point(__rng1, __rng2, (_IdType)(__linear_id * __wi_in_one_wg * __chunk), __n1, __n2, __comp); + __sp = __find_start_point(__rng1, __rng2, __linear_id * __wi_in_one_wg * __chunk, __n1, __n2, __comp); __base_diagonals_sp_global_ptr[__linear_id] = __sp; }); @@ -396,7 +396,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, _RangeValueType* __rng1_cache_slm = std::addressof(__loc_acc[0]); _RangeValueType* __rng2_cache_slm = std::addressof(__loc_acc[0]) + __rng1_wg_data_size; - const std::size_t __chunk_of_data_reading = std::max(__chunk/*__data_items_in_slm_bank*/, (_IdType)oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size + __rng2_wg_data_size, __wi_in_one_wg)); + const std::size_t __chunk_of_data_reading = std::max(__chunk/*__data_items_in_slm_bank*/, oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size + __rng2_wg_data_size, __wi_in_one_wg)); const std::size_t __how_many_wi_reads_rng1 = oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size, __chunk_of_data_reading); const std::size_t __how_many_wi_reads_rng2 = oneapi::dpl::__internal::__dpl_ceiling_div(__rng2_wg_data_size, __chunk_of_data_reading); @@ -443,7 +443,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // - bottom-right split point describes the size of current area between two base diagonals. 
const _split_point_t<_IdType> __sp_local = __find_start_point( __rng1_cache_slm, __rng2_cache_slm, // SLM cached copy of merging data - (_IdType)(__local_id * __chunk), // __i_elem in LOCAL coordinates because __rng1_cache_slm and __rng1_cache_slm is work-group SLM cached copy of source data + __local_id * __chunk, // __i_elem in LOCAL coordinates because __rng1_cache_slm and __rng1_cache_slm is work-group SLM cached copy of source data __rng1_wg_data_size, __rng2_wg_data_size, // size of rng1 and rng2 __comp); @@ -451,9 +451,9 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // - we should have here __sp_global in GLOBAL coordinates __serial_merge(__rng1_cache_slm, __rng2_cache_slm, // SLM cached copy of merging data __rng3, // Destination range - __sp_local.first, // __start1 in LOCAL coordinates because __rng1_cache_slm is work-group SLM cached copy of source data - __sp_local.second, // __start2 in LOCAL coordinates because __rng1_cache_slm is work-group SLM cached copy of source data - (_IdType)(__global_linear_id * __chunk), // __start3 in GLOBAL coordinates because __rng3 is not cached at all + (std::size_t)__sp_local.first, // __start1 in LOCAL coordinates because __rng1_cache_slm is work-group SLM cached copy of source data + (std::size_t)__sp_local.second, // __start2 in LOCAL coordinates because __rng1_cache_slm is work-group SLM cached copy of source data + __global_linear_id * __chunk, // __start3 in GLOBAL coordinates because __rng3 is not cached at all __chunk, __rng1_wg_data_size, __rng2_wg_data_size, // size of rng1 and rng2 __comp); From c8c0a26a82ac7a899f1a57dad5559f44e2057c25 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 26 Nov 2024 19:31:46 +0100 Subject: [PATCH 80/80] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - using oneapi::dpl::__internal::__value_t to detect range's value types Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 8 ++++---- 1 file 
changed, 4 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index f5562b67ecd..1453a078074 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -284,8 +284,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, auto operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const { - using _Range1ValueType = typename std::iterator_traits::value_type; - using _Range2ValueType = typename std::iterator_traits::value_type; + using _Range1ValueType = oneapi::dpl::__internal::__value_t<_Range1>; + using _Range2ValueType = oneapi::dpl::__internal::__value_t<_Range2>; static_assert(std::is_same_v<_Range1ValueType, _Range2ValueType>, "In this implementation we can merge only data of the same type"); using _RangeValueType = _Range1ValueType; @@ -482,8 +482,8 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy constexpr std::size_t __starting_size_limit_for_large_submitter = 1 * 1'048'576; // 1 Mb - using _Range1ValueType = typename std::iterator_traits::value_type; - using _Range2ValueType = typename std::iterator_traits::value_type; + using _Range1ValueType = oneapi::dpl::__internal::__value_t<_Range1>; + using _Range2ValueType = oneapi::dpl::__internal::__value_t<_Range2>; constexpr bool __same_merge_types = std::is_same_v<_Range1ValueType, _Range2ValueType>;