From e5c121b2ba797256dbd27dbd6e662462c7d9ee73 Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Wed, 29 Nov 2023 12:28:43 +0000 Subject: [PATCH 001/134] Enable pragma unroll for open-source DPC++ --- include/oneapi/dpl/pstl/onedpl_config.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/onedpl_config.h b/include/oneapi/dpl/pstl/onedpl_config.h index b5641fde37c..8101dc671a6 100644 --- a/include/oneapi/dpl/pstl/onedpl_config.h +++ b/include/oneapi/dpl/pstl/onedpl_config.h @@ -117,7 +117,7 @@ // Enable loop unrolling pragmas where supported #if (__INTEL_LLVM_COMPILER || __INTEL_COMPILER || \ - (!defined(__INTEL_LLVM_COMPILER) && !defined(__INTEL_COMPILER) && _ONEDPL_GCC_VERSION >= 80000)) + (!defined(__INTEL_LLVM_COMPILER) && !defined(__INTEL_COMPILER) && ((_ONEDPL_GCC_VERSION >= 80000) || (_ONEDPL_CLANG_VERSION >= 40000)))) # define _ONEDPL_PRAGMA_UNROLL _ONEDPL_PRAGMA(unroll) #else //no pragma unroll # define _ONEDPL_PRAGMA_UNROLL From 7c1cb0faec3f2462c99acd1ec7fab6f2d7615e78 Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Wed, 29 Nov 2023 14:28:07 +0000 Subject: [PATCH 002/134] clang-format --- include/oneapi/dpl/pstl/onedpl_config.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/onedpl_config.h b/include/oneapi/dpl/pstl/onedpl_config.h index 8101dc671a6..d860e2661f7 100644 --- a/include/oneapi/dpl/pstl/onedpl_config.h +++ b/include/oneapi/dpl/pstl/onedpl_config.h @@ -117,7 +117,8 @@ // Enable loop unrolling pragmas where supported #if (__INTEL_LLVM_COMPILER || __INTEL_COMPILER || \ - (!defined(__INTEL_LLVM_COMPILER) && !defined(__INTEL_COMPILER) && ((_ONEDPL_GCC_VERSION >= 80000) || (_ONEDPL_CLANG_VERSION >= 40000)))) + (!defined(__INTEL_LLVM_COMPILER) && !defined(__INTEL_COMPILER) && \ + ((_ONEDPL_GCC_VERSION >= 80000) || (_ONEDPL_CLANG_VERSION >= 40000)))) # define _ONEDPL_PRAGMA_UNROLL _ONEDPL_PRAGMA(unroll) #else //no pragma unroll # define _ONEDPL_PRAGMA_UNROLL From 154161f12f07e140b3feace770fd08d3cbd2009a Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Fri, 18 Aug 2023 14:52:41 -0500 Subject: [PATCH 003/134] Start of single-pass scan kernel template --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 2 + .../hetero/dpcpp/parallel_backend_sycl_scan.h | 150 ++++++++++++++++++ .../numeric/numeric.ops/scan_kt.pass.cpp | 30 ++++ 3 files changed, 182 insertions(+) create mode 100644 include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h create mode 100644 test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 2335bad252e..2299a0e26d8 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -40,6 +40,8 @@ # include "parallel_backend_sycl_radix_sort.h" #endif +#include "parallel_backend_sycl_scan.h" + namespace oneapi { namespace dpl diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h new file mode 100644 index 00000000000..4fc2dbe4d44 --- /dev/null +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -0,0 +1,150 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// This file incorporates work covered 
by the following copyright and permission +// notice: +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#ifndef _ONEDPL_parallel_backend_sycl_scan_H +#define _ONEDPL_parallel_backend_sycl_scan_H + +namespace oneapi::dpl::experimental::igpu +{ + +template +struct __scan_status_flag +{ + using _AtomicRefT = sycl::atomic_ref<::std::uint32_t, sycl::memory_order::relaxed, sycl::memory_scope::device, sycl::access::address_space::global_space>; + static constexpr std::uint32_t partial_mask = 1 << (sizeof(std::uint32_t)*8 - 2); + static constexpr std::uint32_t full_mask = 1 << (sizeof(std::uint32_t)*8 - 1); + static constexpr std::uint32_t value_mask = ~(partial_mask | full_mask); + + __scan_status_flag(std::uint32_t* flags_begin, const std::uint32_t tile_id) + : atomic_flag(*(flags_begin + tile_id)) + { + + } + + void set_partial(std::uint32_t val) + { + atomic_flag.store(val | partial_mask); + } + + void set_full(std::uint32_t val) + { + atomic_flag.store(val | full_mask); + } + + _T lookback(const std::uint32_t tile_id, std::uint32_t* flags_begin) + { + _T sum = 0; + int i = 0; + for (std::int32_t tile = static_cast(tile_id) - 1; tile >= 0; --tile) + { + _AtomicRefT tile_atomic(*(flags_begin + tile)); + std::uint32_t tile_val = 0; + do { + tile_val = tile_atomic.load(); + } while (tile_val == 0); + + sum += tile_val & value_mask; + + // If this was a full value, we can stop looking at previous tiles. Otherwise, + // keep going through tiles until we either find a full tile or we've completely + // recomputed the prefix using partial values + if (tile_val & full_mask) + break; + } + return sum; + } + + _AtomicRefT atomic_flag; +}; + +template +void +single_pass_scan_impl(_Policy&& __exec, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op) +{ + using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + + const ::std::size_t n = __in_rng.size(); + auto __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec); + std::size_t num_wgs = __max_cu; + + std::size_t wgsize = n/__max_cu; + + std::uint32_t status_flags_buf_size = num_wgs+1; + sycl::buffer status_flags_buf(status_flags_buf_size); + + // TODO: this probably isn't the best way to do this + sycl::host_accessor status_flags(status_flags_buf); + for (std::size_t i = 0; i < status_flags_buf_size; ++i) + status_flags[i] = 0; + + + auto event = __exec.queue().submit([&](sycl::handler& hdl) { + auto status_flags = sycl::accessor(status_flags_buf, hdl); + auto tile_id_lacc = sycl::accessor(sycl::range<1>{1}, hdl); + + oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); + hdl.parallel_for(sycl::nd_range<1>(n, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(32)]] { + auto item_id = item.get_local_linear_id(); + auto group = item.get_group(); + + //std::uint32_t elems_in_tile = elems_per_item*wgsize; + std::uint32_t elems_in_tile = wgsize; + + // Obtain unique ID for this work-group that will be used in decoupled lookback + if (group.leader()) + { + sycl::atomic_ref<::std::uint32_t, sycl::memory_order::relaxed, sycl::memory_scope::device, sycl::access::address_space::global_space> idx_atomic(status_flags[status_flags_buf_size-1]); + tile_id_lacc[0] = idx_atomic.fetch_add(1); + } + sycl::group_barrier(group); + std::uint32_t tile_id = tile_id_lacc[0]; + + auto in_begin = __in_rng.begin() + 
(tile_id*elems_in_tile); + auto in_end = __in_rng.begin() + ((tile_id+1)*elems_in_tile); + auto out_begin = __out_rng.begin() + (tile_id*elems_in_tile); + + auto local_sum = sycl::joint_reduce(group, in_begin, in_end, __binary_op); + + __scan_status_flag<_Type> flag(status_flags.get_pointer(), tile_id); + flag.set_partial(local_sum); + + auto prev_sum = flag.lookback(tile_id, status_flags.get_pointer()); + flag.set_full(prev_sum + local_sum); + + sycl::joint_inclusive_scan(group, in_begin, in_end, out_begin, __binary_op, prev_sum); + }); + }); + + event.wait(); +} + +template +void +single_pass_inclusive_scan(_Policy&& __exec, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _BinaryOp __binary_op) +{ + auto __n = __in_end - __in_begin; + auto __keep1 = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); + auto __buf1 = __keep1(__in_begin, __in_end); + auto __keep2 = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); + auto __buf2 = __keep2(__out_begin, __out_begin + __n); + + single_pass_scan_impl(__exec, __buf1.all_view(), __buf2.all_view(), __binary_op); +} + +} // namespace oneapi::dpl::experimental::igpu + +#endif /* _ONEDPL_parallel_backend_sycl_scan_H */ diff --git a/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp new file mode 100644 index 00000000000..71a725563d4 --- /dev/null +++ b/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp @@ -0,0 +1,30 @@ +// -*- C++ -*- +//===-- scan.pass.cpp -----------------------------------------------------===// +// +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// This file incorporates work covered by the following copyright and permission +// notice: +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// +//===----------------------------------------------------------------------===// + +#include "support/test_config.h" + +#include _PSTL_TEST_HEADER(execution) +#include _PSTL_TEST_HEADER(numeric) + +int +main() +{ + int n = 1 << 16; + sycl::queue q; + int* in_ptr = sycl::malloc_device(n, q); + int* out_ptr = sycl::malloc_device(n, q); + oneapi::dpl::experimental::igpu::single_pass_inclusive_scan(oneapi::dpl::execution::dpcpp_default, in_ptr, in_ptr+n, out_ptr, ::std::plus()); + return 0; +} From 16ec5adce45e1e35109e0e15cad0c9174678bcdc Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Thu, 24 Aug 2023 08:48:42 -0500 Subject: [PATCH 004/134] Fix hang in inclusive scan --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 44 ++++++++++++++----- .../numeric/numeric.ops/scan_kt.pass.cpp | 30 ++++++++++++- 2 files changed, 61 insertions(+), 13 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 4fc2dbe4d44..e71398a44b7 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -16,9 +16,11 @@ #ifndef _ONEDPL_parallel_backend_sycl_scan_H #define _ONEDPL_parallel_backend_sycl_scan_H -namespace oneapi::dpl::experimental::igpu +namespace oneapi::dpl::experimental::kt { +inline namespace igpu { + template struct __scan_status_flag { @@ -69,28 +71,36 @@ struct __scan_status_flag _AtomicRefT atomic_flag; }; -template +template void -single_pass_scan_impl(_Policy&& __exec, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op) +single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op) { using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + static_assert(_Inclusive, "Single-pass scan only available for inclusive scan"); + const ::std::size_t n = __in_rng.size(); - auto __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec); - std::size_t num_wgs = __max_cu; + auto __max_cu = __queue.get_device().template get_info(); + //std::size_t num_wgs = __max_cu; + std::size_t num_wgs = 64; - std::size_t wgsize = n/__max_cu; + // TODO: use wgsize and iters per item from _KernelParam + std::size_t wgsize = n/num_wgs; std::uint32_t status_flags_buf_size = num_wgs+1; sycl::buffer status_flags_buf(status_flags_buf_size); // TODO: this probably isn't the best way to do this + { sycl::host_accessor status_flags(status_flags_buf); for (std::size_t i = 0; i < status_flags_buf_size; ++i) status_flags[i] = 0; + } + +// printf("launching kernel items=%lu wgs=%lu wgsize=%lu max_cu=%lu\n", n, num_wgs, wgsize, __max_cu); - auto event = __exec.queue().submit([&](sycl::handler& hdl) { + auto event = __queue.submit([&](sycl::handler& hdl) { auto status_flags = sycl::accessor(status_flags_buf, hdl); auto tile_id_lacc = sycl::accessor(sycl::range<1>{1}, hdl); @@ -121,6 +131,7 @@ single_pass_scan_impl(_Policy&& __exec, _InRange&& __in_rng, _OutRange&& __out_r flag.set_partial(local_sum); auto prev_sum = flag.lookback(tile_id, status_flags.get_pointer()); + //auto prev_sum = 0; flag.set_full(prev_sum + local_sum); sycl::joint_inclusive_scan(group, in_begin, in_end, out_begin, __binary_op, prev_sum); @@ -130,9 +141,18 @@ single_pass_scan_impl(_Policy&& __exec, _InRange&& __in_rng, _OutRange&& __out_r event.wait(); } -template +// The generic structure for configuring a kernel +template +struct kernel_param +{ + static constexpr std::uint16_t 
data_per_workitem = DataPerWorkItem; + static constexpr std::uint16_t workgroup_size = WorkGroupSize; + using kernel_name = KernelName; +}; + +template void -single_pass_inclusive_scan(_Policy&& __exec, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _BinaryOp __binary_op) +single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _BinaryOp __binary_op) { auto __n = __in_end - __in_begin; auto __keep1 = @@ -142,9 +162,11 @@ single_pass_inclusive_scan(_Policy&& __exec, _InIterator __in_begin, _InIterator oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); auto __buf2 = __keep2(__out_begin, __out_begin + __n); - single_pass_scan_impl(__exec, __buf1.all_view(), __buf2.all_view(), __binary_op); + single_pass_scan_impl<_KernelParam, true>(__queue, __buf1.all_view(), __buf2.all_view(), __binary_op); } -} // namespace oneapi::dpl::experimental::igpu +} // inline namespace igpu + +} // namespace oneapi::dpl::experimental::kt #endif /* _ONEDPL_parallel_backend_sycl_scan_H */ diff --git a/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp index 71a725563d4..4ae83a92041 100644 --- a/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp @@ -22,9 +22,35 @@ int main() { int n = 1 << 16; + std::vector v(n, 1); sycl::queue q; int* in_ptr = sycl::malloc_device(n, q); int* out_ptr = sycl::malloc_device(n, q); - oneapi::dpl::experimental::igpu::single_pass_inclusive_scan(oneapi::dpl::execution::dpcpp_default, in_ptr, in_ptr+n, out_ptr, ::std::plus()); - return 0; + + + q.copy(v.data(), in_ptr, n); + using KernelParams = oneapi::dpl::experimental::kt::kernel_param<128, 2, class ScanKernel>; + oneapi::dpl::experimental::kt::single_pass_inclusive_scan(q, in_ptr, in_ptr+n, out_ptr, ::std::plus()); + + std::vector tmp(n, 0); + q.copy(out_ptr, tmp.data(), n); + + std::inclusive_scan(v.begin(), v.end(), v.begin()); + + bool passed = true; + for (size_t i = 0; i < n; ++i) + { + if (tmp[i] != v[i]) + { + passed = false; + std::cout << "expected " << i << ' ' << v[i] << ' ' << tmp[i] << '\n'; + } + } + + if (passed) + std::cout << "passed" << std::endl; + else + std::cout << "failed" << std::endl; + + return !passed; } From bd8960153adb9d62090530479a4c5e7a51d6f142 Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Thu, 31 Aug 2023 06:18:55 -0700 Subject: [PATCH 005/134] Debug statements for scan kernel template --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 81 +++++++++++++------ 1 file changed, 58 insertions(+), 23 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index e71398a44b7..c70bbabb82b 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -82,56 +82,67 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou const ::std::size_t n = __in_rng.size(); auto __max_cu = __queue.get_device().template get_info(); //std::size_t num_wgs = __max_cu; - std::size_t num_wgs = 64; + std::size_t num_wgs = 256; // TODO: use wgsize and iters per item from _KernelParam - std::size_t wgsize = n/num_wgs; + //constexpr ::std::size_t __elems_per_item = _KernelParam::data_per_workitem; + constexpr ::std::size_t __elems_per_item = 2; + std::size_t 
wgsize = n/num_wgs/__elems_per_item; + std::size_t num_items = n/__elems_per_item; - std::uint32_t status_flags_buf_size = num_wgs+1; - sycl::buffer status_flags_buf(status_flags_buf_size); - // TODO: this probably isn't the best way to do this - { - sycl::host_accessor status_flags(status_flags_buf); - for (std::size_t i = 0; i < status_flags_buf_size; ++i) - status_flags[i] = 0; - } + std::uint32_t status_flags_size = num_wgs+1; + + uint32_t* status_flags = sycl::malloc_device(status_flags_size, __queue); + __queue.memset(status_flags, 0, status_flags_size * sizeof(uint32_t)); -// printf("launching kernel items=%lu wgs=%lu wgsize=%lu max_cu=%lu\n", n, num_wgs, wgsize, __max_cu); + //printf("launching kernel items=%lu wgs=%lu wgsize=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __max_cu); + /*printf("launching kernel items=%lu wgs=%lu wgsize=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __max_cu); + uint32_t* debug1 = sycl::malloc_device(status_flags_size, __queue); + uint32_t* debug2 = sycl::malloc_device(status_flags_size, __queue); + uint32_t* debug3 = sycl::malloc_device(status_flags_size, __queue); + uint32_t* debug4 = sycl::malloc_device(status_flags_size, __queue); + uint32_t* debug5 = sycl::malloc_device(status_flags_size, __queue);*/ auto event = __queue.submit([&](sycl::handler& hdl) { - auto status_flags = sycl::accessor(status_flags_buf, hdl); - auto tile_id_lacc = sycl::accessor(sycl::range<1>{1}, hdl); + auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); - hdl.parallel_for(sycl::nd_range<1>(n, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(32)]] { - auto item_id = item.get_local_linear_id(); + hdl.parallel_for(sycl::nd_range<1>(num_items, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(32)]] { auto group = item.get_group(); - //std::uint32_t elems_in_tile = elems_per_item*wgsize; - std::uint32_t elems_in_tile = wgsize; + std::uint32_t elems_in_tile = wgsize*__elems_per_item; // Obtain unique ID for this work-group that will be used in decoupled lookback if (group.leader()) { - sycl::atomic_ref<::std::uint32_t, sycl::memory_order::relaxed, sycl::memory_scope::device, sycl::access::address_space::global_space> idx_atomic(status_flags[status_flags_buf_size-1]); + sycl::atomic_ref<::std::uint32_t, sycl::memory_order::relaxed, sycl::memory_scope::device, sycl::access::address_space::global_space> idx_atomic(status_flags[status_flags_size-1]); tile_id_lacc[0] = idx_atomic.fetch_add(1); } sycl::group_barrier(group); std::uint32_t tile_id = tile_id_lacc[0]; + //debug5[group.get_local_id()] = tile_id; - auto in_begin = __in_rng.begin() + (tile_id*elems_in_tile); - auto in_end = __in_rng.begin() + ((tile_id+1)*elems_in_tile); - auto out_begin = __out_rng.begin() + (tile_id*elems_in_tile); + auto current_offset = (tile_id*elems_in_tile); + auto next_offset = ((tile_id+1)*elems_in_tile); + auto in_begin = __in_rng.begin() + current_offset; + auto in_end = __in_rng.begin() + next_offset; + auto out_begin = __out_rng.begin() + current_offset; + + //debug3[tile_id] = current_offset; + //debug4[tile_id] = next_offset; auto local_sum = sycl::joint_reduce(group, in_begin, in_end, __binary_op); + //auto local_sum = 0; + ///debug1[tile_id] = local_sum; - __scan_status_flag<_Type> flag(status_flags.get_pointer(), tile_id); + __scan_status_flag<_Type> flag(status_flags, tile_id); flag.set_partial(local_sum); - auto prev_sum = flag.lookback(tile_id, 
status_flags.get_pointer()); + auto prev_sum = flag.lookback(tile_id, status_flags); //auto prev_sum = 0; + //debug2[tile_id] = prev_sum; flag.set_full(prev_sum + local_sum); sycl::joint_inclusive_scan(group, in_begin, in_end, out_begin, __binary_op, prev_sum); @@ -139,6 +150,30 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou }); event.wait(); + +#if 0 + std::vector debug1v(status_flags_size); + std::vector debug2v(status_flags_size); + std::vector debug3v(status_flags_size); + std::vector debug4v(status_flags_size); + std::vector debug5v(status_flags_size); + __queue.memcpy(debug1v.data(), debug1, status_flags_size * sizeof(uint32_t)); + __queue.memcpy(debug2v.data(), debug2, status_flags_size * sizeof(uint32_t)); + __queue.memcpy(debug3v.data(), debug3, status_flags_size * sizeof(uint32_t)); + __queue.memcpy(debug4v.data(), debug4, status_flags_size * sizeof(uint32_t)); + __queue.memcpy(debug5v.data(), debug5, status_flags_size * sizeof(uint32_t)); + + for (int i = 0; i < status_flags_size-1; ++i) + std::cout << "local_sum " << i << " " << debug1v[i] << std::endl; + for (int i = 0; i < status_flags_size-1; ++i) + std::cout << "lookback " << i << " " << debug2v[i] << std::endl; + for (int i = 0; i < status_flags_size-1; ++i) + std::cout << "offset " << i << " " << debug3v[i] << std::endl; + for (int i = 0; i < status_flags_size-1; ++i) + std::cout << "end " << i << " " << debug4v[i] << std::endl; +#endif + + sycl::free(status_flags, __queue); } // The generic structure for configuring a kernel From 60a69fcdf20ccfd0341dd72084211096a8965f2f Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Wed, 6 Sep 2023 08:46:10 -0500 Subject: [PATCH 006/134] Update scan kernel template test --- .../numeric/numeric.ops/scan_kt.pass.cpp | 54 +++++++++++-------- 1 file changed, 31 insertions(+), 23 deletions(-) diff --git a/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp index 4ae83a92041..de5ecafc25b 100644 --- a/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp @@ -21,36 +21,44 @@ int main() { - int n = 1 << 16; - std::vector v(n, 1); - sycl::queue q; - int* in_ptr = sycl::malloc_device(n, q); - int* out_ptr = sycl::malloc_device(n, q); + bool all_passed = true; + for (int logn : {4, 8, 11, 16, 19, 21}) + { + std::cout << "Testing 2^" << logn << '\n'; + int n = 1 << logn; + std::vector v(n, 1); + sycl::queue q; + int* in_ptr = sycl::malloc_device(n, q); + int* out_ptr = sycl::malloc_device(n, q); - q.copy(v.data(), in_ptr, n); - using KernelParams = oneapi::dpl::experimental::kt::kernel_param<128, 2, class ScanKernel>; - oneapi::dpl::experimental::kt::single_pass_inclusive_scan(q, in_ptr, in_ptr+n, out_ptr, ::std::plus()); - std::vector tmp(n, 0); - q.copy(out_ptr, tmp.data(), n); + q.copy(v.data(), in_ptr, n); + using KernelParams = oneapi::dpl::experimental::kt::kernel_param<128, 2, class ScanKernel>; + oneapi::dpl::experimental::kt::single_pass_inclusive_scan(q, in_ptr, in_ptr+n, out_ptr, ::std::plus()); - std::inclusive_scan(v.begin(), v.end(), v.begin()); + std::vector tmp(n, 0); + q.copy(out_ptr, tmp.data(), n); - bool passed = true; - for (size_t i = 0; i < n; ++i) - { - if (tmp[i] != v[i]) + std::inclusive_scan(v.begin(), v.end(), v.begin()); + + bool passed = true; + for (size_t i = 0; i < n; ++i) { - passed = false; - std::cout << "expected " << i << ' ' << v[i] << ' ' << tmp[i] << '\n'; + if (tmp[i] != v[i]) + { + passed = false; + 
std::cout << "expected " << i << ' ' << v[i] << ' ' << tmp[i] << '\n'; + } } - } - if (passed) - std::cout << "passed" << std::endl; - else - std::cout << "failed" << std::endl; + if (passed) + std::cout << "passed" << std::endl; + else + std::cout << "failed" << std::endl; + + all_passed &= passed; + } - return !passed; + return !all_passed; } From d526f0431baf85cdf96143d332f2f6a24c44fec9 Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Thu, 14 Sep 2023 09:08:55 -0700 Subject: [PATCH 007/134] Only have a single work-item per group query for previous tile status --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 27 +++++++++++++------ 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index c70bbabb82b..b01f56ac539 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -86,7 +86,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou // TODO: use wgsize and iters per item from _KernelParam //constexpr ::std::size_t __elems_per_item = _KernelParam::data_per_workitem; - constexpr ::std::size_t __elems_per_item = 2; + constexpr ::std::size_t __elems_per_item = 16; std::size_t wgsize = n/num_wgs/__elems_per_item; std::size_t num_items = n/__elems_per_item; @@ -96,14 +96,16 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou uint32_t* status_flags = sycl::malloc_device(status_flags_size, __queue); __queue.memset(status_flags, 0, status_flags_size * sizeof(uint32_t)); - //printf("launching kernel items=%lu wgs=%lu wgsize=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __max_cu); - /*printf("launching kernel items=%lu wgs=%lu wgsize=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __max_cu); +#if SCAN_KT_DEBUG + printf("launching kernel items=%lu wgs=%lu wgsize=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __max_cu); + printf("launching kernel items=%lu wgs=%lu wgsize=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __max_cu); uint32_t* debug1 = sycl::malloc_device(status_flags_size, __queue); uint32_t* debug2 = sycl::malloc_device(status_flags_size, __queue); uint32_t* debug3 = sycl::malloc_device(status_flags_size, __queue); uint32_t* debug4 = sycl::malloc_device(status_flags_size, __queue); - uint32_t* debug5 = sycl::malloc_device(status_flags_size, __queue);*/ + uint32_t* debug5 = sycl::malloc_device(status_flags_size, __queue); +#endif auto event = __queue.submit([&](sycl::handler& hdl) { auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); @@ -138,12 +140,21 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou ///debug1[tile_id] = local_sum; __scan_status_flag<_Type> flag(status_flags, tile_id); - flag.set_partial(local_sum); - auto prev_sum = flag.lookback(tile_id, status_flags); - //auto prev_sum = 0; + if (group.leader()) + flag.set_partial(local_sum); + + // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum + //sycl::reduce_over_group(item.get_subgroup()) + + auto prev_sum = 0; + + if (group.leader()) + prev_sum = flag.lookback(tile_id, status_flags); //debug2[tile_id] = prev_sum; - flag.set_full(prev_sum + local_sum); + + if (group.leader()) + flag.set_full(prev_sum + local_sum); sycl::joint_inclusive_scan(group, in_begin, in_end, out_begin, __binary_op, prev_sum); }); From 
09e9bbf4329623afa46fbe3ed6e6029835094157 Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Mon, 18 Sep 2023 08:06:43 -0700 Subject: [PATCH 008/134] First attempt at parallel lookback --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 106 +++++++++++++++--- 1 file changed, 89 insertions(+), 17 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index b01f56ac539..27fdc1d09b4 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -24,13 +24,21 @@ inline namespace igpu { template struct __scan_status_flag { + // 00xxxx - not computed + // 01xxxx - partial + // 10xxxx - full + // 110000 - out of bounds + using _AtomicRefT = sycl::atomic_ref<::std::uint32_t, sycl::memory_order::relaxed, sycl::memory_scope::device, sycl::access::address_space::global_space>; static constexpr std::uint32_t partial_mask = 1 << (sizeof(std::uint32_t)*8 - 2); static constexpr std::uint32_t full_mask = 1 << (sizeof(std::uint32_t)*8 - 1); static constexpr std::uint32_t value_mask = ~(partial_mask | full_mask); + static constexpr std::uint32_t oob_value = partial_mask | full_mask; + + static constexpr int padding = 32; __scan_status_flag(std::uint32_t* flags_begin, const std::uint32_t tile_id) - : atomic_flag(*(flags_begin + tile_id)) + : atomic_flag(*(flags_begin + tile_id + padding)) { } @@ -42,16 +50,57 @@ struct __scan_status_flag void set_full(std::uint32_t val) { - atomic_flag.store(val | full_mask); + atomic_flag.store((val ^ partial_mask) | full_mask); + } + + template + _T cooperative_lookback(const std::uint32_t tile_id, const _Subgroup& subgroup, BinOp bin_op, std::uint32_t* flags_begin) + { + _T sum = 0; + int offset = -1; + int i = 0; + int local_id = subgroup.get_local_id(); + + for (int tile = static_cast(tile_id) + offset; tile >= 0; offset -= 32) + { + _AtomicRefT tile_atomic(*(flags_begin + tile + padding - local_id)); + std::uint32_t tile_val = 0; + do { + tile_val = tile_atomic.load(); + + //} while (!sycl::all_of_group(subgroup, tile_val != 0)); + } while (0); + + bool is_full = (tile_val & full_mask) && ((tile_val & partial_mask) == 0); + auto is_full_ballot = sycl::ext::oneapi::group_ballot(subgroup, is_full); + ::std::uint32_t is_full_ballot_bits{}; + is_full_ballot.extract_bits(is_full_ballot_bits); + + auto lowest_item_with_full = sycl::ctz(is_full_ballot_bits); + _T contribution = local_id <= lowest_item_with_full ? tile_val & value_mask : _T{}; + + // Sum all of the partial results from the tiles found, as well as the full contribution from the closest tile (if any) + sum += sycl::reduce_over_group(subgroup, contribution, bin_op); + + // If we found a full value, we can stop looking at previous tiles. 
Otherwise, + // keep going through tiles until we either find a full tile or we've completely + // recomputed the prefix using partial values + if (is_full_ballot_bits) + break; + + //if (i++ > 10) break; + } + return sum; } +#if 0 _T lookback(const std::uint32_t tile_id, std::uint32_t* flags_begin) { _T sum = 0; int i = 0; for (std::int32_t tile = static_cast(tile_id) - 1; tile >= 0; --tile) { - _AtomicRefT tile_atomic(*(flags_begin + tile)); + _AtomicRefT tile_atomic(*(flags_begin + tile + padding)); std::uint32_t tile_val = 0; do { tile_val = tile_atomic.load(); @@ -67,6 +116,7 @@ struct __scan_status_flag } return sum; } +#endif _AtomicRefT atomic_flag; }; @@ -86,15 +136,28 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou // TODO: use wgsize and iters per item from _KernelParam //constexpr ::std::size_t __elems_per_item = _KernelParam::data_per_workitem; - constexpr ::std::size_t __elems_per_item = 16; +#ifdef _ONEDPL_SCAN_ITER_SIZE + constexpr ::std::size_t __elems_per_item = _ONEDPL_SCAN_ITER_SIZE; +#else + constexpr ::std::size_t __elems_per_item = 8; +#endif std::size_t wgsize = n/num_wgs/__elems_per_item; std::size_t num_items = n/__elems_per_item; - std::uint32_t status_flags_size = num_wgs+1; + constexpr int status_flag_padding = 32; + std::uint32_t status_flags_size = num_wgs+1+status_flag_padding; uint32_t* status_flags = sycl::malloc_device(status_flags_size, __queue); - __queue.memset(status_flags, 0, status_flags_size * sizeof(uint32_t)); + //__queue.memset(status_flags, 0, status_flags_size * sizeof(uint32_t)); + + auto fill_event = __queue.submit([&](sycl::handler& hdl) { + + hdl.parallel_for(sycl::range<1>{status_flags_size}, [=](const sycl::item<1>& item) { + int id = item.get_linear_id(); + status_flags[id] = id < status_flag_padding ? 
__scan_status_flag<_Type>::oob_value : 0; + }); + }); #if SCAN_KT_DEBUG printf("launching kernel items=%lu wgs=%lu wgsize=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __max_cu); @@ -109,10 +172,12 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou auto event = __queue.submit([&](sycl::handler& hdl) { auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); + hdl.depends_on(fill_event); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); hdl.parallel_for(sycl::nd_range<1>(num_items, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(32)]] { auto group = item.get_group(); + auto subgroup = item.get_sub_group(); std::uint32_t elems_in_tile = wgsize*__elems_per_item; @@ -139,23 +204,30 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou //auto local_sum = 0; ///debug1[tile_id] = local_sum; - __scan_status_flag<_Type> flag(status_flags, tile_id); + auto prev_sum = 0; - if (group.leader()) - flag.set_partial(local_sum); + // The first sub-group will query the previous tiles to find a prefix + if (subgroup.get_group_id() == 0) + { + __scan_status_flag<_Type> flag(status_flags, tile_id); - // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum - //sycl::reduce_over_group(item.get_subgroup()) + if (group.leader()) + flag.set_partial(local_sum); - auto prev_sum = 0; + // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum + //sycl::reduce_over_group(item.get_subgroup()) - if (group.leader()) - prev_sum = flag.lookback(tile_id, status_flags); - //debug2[tile_id] = prev_sum; - if (group.leader()) - flag.set_full(prev_sum + local_sum); + prev_sum = flag.cooperative_lookback(tile_id, subgroup, __binary_op, status_flags); + //if (group.leader()) + // prev_sum = flag.lookback(tile_id, status_flags); + //debug2[tile_id] = prev_sum; + + if (group.leader()) + flag.set_full(prev_sum + local_sum); + } + prev_sum = sycl::group_broadcast(group, prev_sum, 0); sycl::joint_inclusive_scan(group, in_begin, in_end, out_begin, __binary_op, prev_sum); }); }); From 30e0da7811689d75de697aafefeec7bac2ec8526 Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Fri, 22 Sep 2023 11:42:33 -0700 Subject: [PATCH 009/134] Working cooperative lookback --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 61 +++++++++++++------ 1 file changed, 42 insertions(+), 19 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 27fdc1d09b4..963de2952e6 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -50,26 +50,26 @@ struct __scan_status_flag void set_full(std::uint32_t val) { - atomic_flag.store((val ^ partial_mask) | full_mask); + atomic_flag.store(val | full_mask); } template - _T cooperative_lookback(const std::uint32_t tile_id, const _Subgroup& subgroup, BinOp bin_op, std::uint32_t* flags_begin) + _T cooperative_lookback(std::uint32_t tile_id, const _Subgroup& subgroup, BinOp bin_op, std::uint32_t* flags_begin) { _T sum = 0; int offset = -1; int i = 0; int local_id = subgroup.get_local_id(); - for (int tile = static_cast(tile_id) + offset; tile >= 0; offset -= 32) + for (int tile = static_cast(tile_id) + offset; tile >= 0; tile -= 32) { _AtomicRefT tile_atomic(*(flags_begin 
+ tile + padding - local_id)); std::uint32_t tile_val = 0; do { tile_val = tile_atomic.load(); - //} while (!sycl::all_of_group(subgroup, tile_val != 0)); - } while (0); + } while (!sycl::all_of_group(subgroup, tile_val != 0)); + //} while (0); bool is_full = (tile_val & full_mask) && ((tile_val & partial_mask) == 0); auto is_full_ballot = sycl::ext::oneapi::group_ballot(subgroup, is_full); @@ -77,7 +77,7 @@ struct __scan_status_flag is_full_ballot.extract_bits(is_full_ballot_bits); auto lowest_item_with_full = sycl::ctz(is_full_ballot_bits); - _T contribution = local_id <= lowest_item_with_full ? tile_val & value_mask : _T{}; + _T contribution = local_id <= lowest_item_with_full ? tile_val & value_mask : _T{0}; // Sum all of the partial results from the tiles found, as well as the full contribution from the closest tile (if any) sum += sycl::reduce_over_group(subgroup, contribution, bin_op); @@ -132,6 +132,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou const ::std::size_t n = __in_rng.size(); auto __max_cu = __queue.get_device().template get_info(); //std::size_t num_wgs = __max_cu; + //std::size_t num_wgs = 448; std::size_t num_wgs = 256; // TODO: use wgsize and iters per item from _KernelParam @@ -143,26 +144,31 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou #endif std::size_t wgsize = n/num_wgs/__elems_per_item; std::size_t num_items = n/__elems_per_item; + // + //std::size_t wgsize = 256; + //std::size_t num_items = 114688; constexpr int status_flag_padding = 32; std::uint32_t status_flags_size = num_wgs+1+status_flag_padding; + printf("launching kernel items=%lu wgs=%lu wgsize=%lu elems_per_iter=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __elems_per_item, __max_cu); + uint32_t* status_flags = sycl::malloc_device(status_flags_size, __queue); //__queue.memset(status_flags, 0, status_flags_size * sizeof(uint32_t)); auto fill_event = __queue.submit([&](sycl::handler& hdl) { - hdl.parallel_for(sycl::range<1>{status_flags_size}, [=](const sycl::item<1>& item) { + hdl.parallel_for(sycl::range<1>{status_flags_size}, [=](const sycl::item<1>& item) { int id = item.get_linear_id(); status_flags[id] = id < status_flag_padding ? 
__scan_status_flag<_Type>::oob_value : 0; }); }); -#if SCAN_KT_DEBUG - printf("launching kernel items=%lu wgs=%lu wgsize=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __max_cu); - printf("launching kernel items=%lu wgs=%lu wgsize=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __max_cu); + std::uint32_t elems_in_tile = wgsize*__elems_per_item; +#define SCAN_KT_DEBUG 1 +#if SCAN_KT_DEBUG uint32_t* debug1 = sycl::malloc_device(status_flags_size, __queue); uint32_t* debug2 = sycl::malloc_device(status_flags_size, __queue); uint32_t* debug3 = sycl::malloc_device(status_flags_size, __queue); @@ -175,11 +181,10 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou hdl.depends_on(fill_event); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); - hdl.parallel_for(sycl::nd_range<1>(num_items, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(32)]] { + hdl.parallel_for(sycl::nd_range<1>(num_items, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(32)]] { auto group = item.get_group(); auto subgroup = item.get_sub_group(); - std::uint32_t elems_in_tile = wgsize*__elems_per_item; // Obtain unique ID for this work-group that will be used in decoupled lookback if (group.leader()) @@ -189,7 +194,9 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou } sycl::group_barrier(group); std::uint32_t tile_id = tile_id_lacc[0]; - //debug5[group.get_local_id()] = tile_id; +#if SCAN_KT_DEBUG + debug5[group.get_group_linear_id()] = tile_id; +#endif auto current_offset = (tile_id*elems_in_tile); auto next_offset = ((tile_id+1)*elems_in_tile); @@ -197,12 +204,15 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou auto in_end = __in_rng.begin() + next_offset; auto out_begin = __out_rng.begin() + current_offset; - //debug3[tile_id] = current_offset; - //debug4[tile_id] = next_offset; +#if SCAN_KT_DEBUG + debug3[tile_id] = current_offset; + debug4[tile_id] = next_offset; +#endif auto local_sum = sycl::joint_reduce(group, in_begin, in_end, __binary_op); - //auto local_sum = 0; - ///debug1[tile_id] = local_sum; +#if SCAN_KT_DEBUG + debug1[tile_id] = local_sum; +#endif auto prev_sum = 0; @@ -221,7 +231,9 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou prev_sum = flag.cooperative_lookback(tile_id, subgroup, __binary_op, status_flags); //if (group.leader()) // prev_sum = flag.lookback(tile_id, status_flags); - //debug2[tile_id] = prev_sum; +#if SCAN_KT_DEBUG + debug2[tile_id] = prev_sum; +#endif if (group.leader()) flag.set_full(prev_sum + local_sum); @@ -234,20 +246,31 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou event.wait(); -#if 0 +#if SCAN_KT_DEBUG std::vector debug1v(status_flags_size); std::vector debug2v(status_flags_size); std::vector debug3v(status_flags_size); std::vector debug4v(status_flags_size); std::vector debug5v(status_flags_size); + std::vector debug6v(status_flags_size); __queue.memcpy(debug1v.data(), debug1, status_flags_size * sizeof(uint32_t)); __queue.memcpy(debug2v.data(), debug2, status_flags_size * sizeof(uint32_t)); __queue.memcpy(debug3v.data(), debug3, status_flags_size * sizeof(uint32_t)); __queue.memcpy(debug4v.data(), debug4, status_flags_size * sizeof(uint32_t)); __queue.memcpy(debug5v.data(), debug5, status_flags_size * sizeof(uint32_t)); + __queue.memcpy(debug6v.data(), status_flags, status_flags_size * sizeof(uint32_t)); + for (int i = 0; i < status_flags_size-1; ++i) 
+ std::cout << "tile " << i << " " << debug5v[i] << std::endl; for (int i = 0; i < status_flags_size-1; ++i) std::cout << "local_sum " << i << " " << debug1v[i] << std::endl; + for (int i = 0; i < status_flags_size-1; ++i) + { + auto val = (debug6v[i] & __scan_status_flag<_Type>::value_mask); + int a = val / elems_in_tile; + int b = val % elems_in_tile; + std::cout << "flags " << i << " " << std::bitset<32>(debug6v[i]) << " (" << val<< " = " << a << "/" << elems_in_tile << "+" << b <<")" << std::endl; + } for (int i = 0; i < status_flags_size-1; ++i) std::cout << "lookback " << i << " " << debug2v[i] << std::endl; for (int i = 0; i < status_flags_size-1; ++i) From 2311929486e18d8b4eee18208e932f20409e4489 Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Wed, 25 Oct 2023 11:13:53 -0700 Subject: [PATCH 010/134] Fix correctness issue with non-power-of-2 sizes --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 52 ++++++++++++++++--- 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 963de2952e6..7aaf3f2a255 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -133,7 +133,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou auto __max_cu = __queue.get_device().template get_info(); //std::size_t num_wgs = __max_cu; //std::size_t num_wgs = 448; - std::size_t num_wgs = 256; + //std::size_t num_wgs = 256; // TODO: use wgsize and iters per item from _KernelParam //constexpr ::std::size_t __elems_per_item = _KernelParam::data_per_workitem; @@ -142,8 +142,14 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou #else constexpr ::std::size_t __elems_per_item = 8; #endif - std::size_t wgsize = n/num_wgs/__elems_per_item; - std::size_t num_items = n/__elems_per_item; + // Next power of 2 greater than or equal to __n + auto __n_uniform = n; + if ((__n_uniform & (__n_uniform - 1)) != 0) + __n_uniform = oneapi::dpl::__internal::__dpl_bit_floor(n) << 1; + //std::size_t wgsize = n/num_wgs/__elems_per_item; + std::size_t wgsize = 256; + std::size_t num_items = __n_uniform/__elems_per_item; + std::size_t num_wgs = num_items/wgsize; // //std::size_t wgsize = 256; //std::size_t num_items = 114688; @@ -152,7 +158,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou constexpr int status_flag_padding = 32; std::uint32_t status_flags_size = num_wgs+1+status_flag_padding; - printf("launching kernel items=%lu wgs=%lu wgsize=%lu elems_per_iter=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __elems_per_item, __max_cu); + //printf("launching kernel items=%lu wgs=%lu wgsize=%lu elems_per_iter=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __elems_per_item, __max_cu); uint32_t* status_flags = sycl::malloc_device(status_flags_size, __queue); //__queue.memset(status_flags, 0, status_flags_size * sizeof(uint32_t)); @@ -165,10 +171,17 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou }); }); + std::uint32_t elems_in_tile = wgsize*__elems_per_item; -#define SCAN_KT_DEBUG 1 +#define SCAN_KT_DEBUG 0 #if SCAN_KT_DEBUG + std::vector debug11v(status_flags_size); + __queue.memcpy(debug11v.data(), status_flags, status_flags_size * sizeof(uint32_t)); + + for (int i = 0; i < status_flags_size-1; ++i) + std::cout << "flag_before " << i << " " << debug11v[i] << 
std::endl; + uint32_t* debug1 = sycl::malloc_device(status_flags_size, __queue); uint32_t* debug2 = sycl::malloc_device(status_flags_size, __queue); uint32_t* debug3 = sycl::malloc_device(status_flags_size, __queue); @@ -200,21 +213,27 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou auto current_offset = (tile_id*elems_in_tile); auto next_offset = ((tile_id+1)*elems_in_tile); + if (next_offset > n) + next_offset = n; auto in_begin = __in_rng.begin() + current_offset; auto in_end = __in_rng.begin() + next_offset; auto out_begin = __out_rng.begin() + current_offset; + #if SCAN_KT_DEBUG debug3[tile_id] = current_offset; debug4[tile_id] = next_offset; #endif + if (current_offset >= n) + return; + auto local_sum = sycl::joint_reduce(group, in_begin, in_end, __binary_op); #if SCAN_KT_DEBUG debug1[tile_id] = local_sum; #endif - auto prev_sum = 0; + _Type prev_sum = 0; // The first sub-group will query the previous tiles to find a prefix if (subgroup.get_group_id() == 0) @@ -296,6 +315,17 @@ void single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _BinaryOp __binary_op) { auto __n = __in_end - __in_begin; + +#if SCAN_KT_DEBUG + using _Type = std::remove_pointer_t<_InIterator>; + std::vector<_Type> in_debug(__n); + __queue.memcpy(in_debug.data(), __in_begin, __n * sizeof(_Type)); + + for (int i = 0; i < __n; ++i) + std::cout << "input_before " << i << " " << in_debug[i] << std::endl; +#endif + + //printf("KERNEL_TEMPLATE %lu\n", __n); auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); auto __buf1 = __keep1(__in_begin, __in_end); @@ -304,6 +334,16 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera auto __buf2 = __keep2(__out_begin, __out_begin + __n); single_pass_scan_impl<_KernelParam, true>(__queue, __buf1.all_view(), __buf2.all_view(), __binary_op); + +#if SCAN_KT_DEBUG + std::vector<_Type> in_debug2(__n); + __queue.memcpy(in_debug2.data(), __in_begin, __n * sizeof(_Type)); + + for (int i = 0; i < __n; ++i) + std::cout << "input_after " << i << " " << in_debug2[i] << std::endl; +#endif + + //printf("KERNEL_TEMPLATE DONE %lu\n", __n); } } // inline namespace igpu From 0f58c07c24ea397f1d63eabfe7ae7dac82cdf14f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alberto=20Cabrera=20P=C3=A9rez?= Date: Tue, 7 Nov 2023 10:51:21 +0000 Subject: [PATCH 011/134] Scan_kt Flags and Values separated (#15) Atomic flags and the values used in Scan_kt separated to avoid truncating the range to 30bit values, and prepare for a more general scan implementation. 
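
As an aside for reviewers, a minimal host-side C++ sketch of the separated
layout (illustrative only: the name tile_state, its members, and the use of
std::atomic are stand-ins for the SYCL atomics and USM buffers in the actual
patch):

    #include <atomic>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // One 32-bit word per tile carries only a status code; the partial and
    // full sums live in a separate value array (partial at [tile], full at
    // [tile + num_tiles]), so values are no longer squeezed into the 30 bits
    // left over after the status bits.
    template <typename T>
    struct tile_state
    {
        static constexpr std::uint32_t NOT_READY = 0, PARTIAL = 1, FULL = 2;

        std::vector<std::atomic<std::uint32_t>> flags; // status only
        std::vector<T> sums; // [0, n): partial sums, [n, 2n): full sums

        explicit tile_state(std::size_t n) : flags(n), sums(2 * n)
        {
            for (auto& f : flags)
                f.store(NOT_READY, std::memory_order_relaxed);
        }

        void set_partial(std::size_t tile, T v)
        {
            sums[tile] = v;
            flags[tile].store(PARTIAL, std::memory_order_release);
        }

        void set_full(std::size_t tile, T v)
        {
            sums[tile + flags.size()] = v;
            flags[tile].store(FULL, std::memory_order_release);
        }
    };

A reader that loads the flag with acquire semantics and only then reads the
corresponding sum observes a consistent value, which is why the device-side
flag atomic in this patch moves from relaxed to acq_rel ordering.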
--- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 242 +++++------------- .../numeric/numeric.ops/scan_kt.pass.cpp | 14 +- 2 files changed, 75 insertions(+), 181 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 7aaf3f2a255..f52e4ef532f 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -21,63 +21,75 @@ namespace oneapi::dpl::experimental::kt inline namespace igpu { +constexpr size_t SUBGROUP_SIZE = 32; + template struct __scan_status_flag { - // 00xxxx - not computed - // 01xxxx - partial - // 10xxxx - full - // 110000 - out of bounds - - using _AtomicRefT = sycl::atomic_ref<::std::uint32_t, sycl::memory_order::relaxed, sycl::memory_scope::device, sycl::access::address_space::global_space>; - static constexpr std::uint32_t partial_mask = 1 << (sizeof(std::uint32_t)*8 - 2); - static constexpr std::uint32_t full_mask = 1 << (sizeof(std::uint32_t)*8 - 1); - static constexpr std::uint32_t value_mask = ~(partial_mask | full_mask); - static constexpr std::uint32_t oob_value = partial_mask | full_mask; - - static constexpr int padding = 32; - - __scan_status_flag(std::uint32_t* flags_begin, const std::uint32_t tile_id) - : atomic_flag(*(flags_begin + tile_id + padding)) + using _AtomicRefT = sycl::atomic_ref<::std::uint32_t, sycl::memory_order::acq_rel, sycl::memory_scope::device, + sycl::access::address_space::global_space>; + static constexpr std::uint32_t NOT_READY = 0; + static constexpr std::uint32_t PARTIAL_MASK = 1; + static constexpr std::uint32_t FULL_MASK = 2; + static constexpr std::uint32_t OUT_OF_BOUNDS = 4; + + static constexpr int padding = SUBGROUP_SIZE; + + __scan_status_flag(const std::uint32_t tile_id, std::uint32_t* flags_begin, _T* tile_sums, + size_t num_elements) + : atomic_flag(*(flags_begin + tile_id + padding)), scanned_partial_value(tile_sums + tile_id + padding), + scanned_full_value(tile_sums + tile_id + padding + num_elements), num_elements{num_elements} { - } - void set_partial(std::uint32_t val) + void + set_partial(_T val) { - atomic_flag.store(val | partial_mask); + (*scanned_partial_value) = val; + atomic_flag.store(PARTIAL_MASK); } - void set_full(std::uint32_t val) + void + set_full(_T val) { - atomic_flag.store(val | full_mask); + (*scanned_full_value) = val; + atomic_flag.store(FULL_MASK); } - template - _T cooperative_lookback(std::uint32_t tile_id, const _Subgroup& subgroup, BinOp bin_op, std::uint32_t* flags_begin) + template + _T + cooperative_lookback(std::uint32_t tile_id, const _Subgroup& subgroup, BinOp bin_op, std::uint32_t* flags_begin, + _T* tile_sums) { _T sum = 0; int offset = -1; int i = 0; int local_id = subgroup.get_local_id(); - for (int tile = static_cast(tile_id) + offset; tile >= 0; tile -= 32) + for (int tile = static_cast(tile_id) + offset; tile >= 0; tile -= SUBGROUP_SIZE) { _AtomicRefT tile_atomic(*(flags_begin + tile + padding - local_id)); - std::uint32_t tile_val = 0; - do { - tile_val = tile_atomic.load(); + std::uint32_t flag; + do + { + flag = tile_atomic.load(); + } while (!sycl::all_of_group(subgroup, flag != NOT_READY)); // Loop till all ready - } while (!sycl::all_of_group(subgroup, tile_val != 0)); - //} while (0); + bool is_full = flag == FULL_MASK; - bool is_full = (tile_val & full_mask) && ((tile_val & partial_mask) == 0); auto is_full_ballot = sycl::ext::oneapi::group_ballot(subgroup, is_full); 
::std::uint32_t is_full_ballot_bits{}; is_full_ballot.extract_bits(is_full_ballot_bits); auto lowest_item_with_full = sycl::ctz(is_full_ballot_bits); - _T contribution = local_id <= lowest_item_with_full ? tile_val & value_mask : _T{0}; + + // The partial scan results and the full scan sum values are in contiguous memory. + // Each section of the memory is of size num_elements. + // The partial sum for a tile is at [i] and the full sum is at [i + num_elements] + // is_full * num_elements allows to select between the two values without branching the code. + size_t contrib_offset = tile + padding - local_id + is_full * num_elements; + _T val = *(tile_sums + contrib_offset); + _T contribution = local_id <= lowest_item_with_full && (tile - local_id >= 0) ? val : _T{0}; // Sum all of the partial results from the tiles found, as well as the full contribution from the closest tile (if any) sum += sycl::reduce_over_group(subgroup, contribution, bin_op); @@ -88,37 +100,16 @@ struct __scan_status_flag if (is_full_ballot_bits) break; - //if (i++ > 10) break; } - return sum; - } - -#if 0 - _T lookback(const std::uint32_t tile_id, std::uint32_t* flags_begin) - { - _T sum = 0; - int i = 0; - for (std::int32_t tile = static_cast(tile_id) - 1; tile >= 0; --tile) - { - _AtomicRefT tile_atomic(*(flags_begin + tile + padding)); - std::uint32_t tile_val = 0; - do { - tile_val = tile_atomic.load(); - } while (tile_val == 0); - - sum += tile_val & value_mask; - // If this was a full value, we can stop looking at previous tiles. Otherwise, - // keep going through tiles until we either find a full tile or we've completely - // recomputed the prefix using partial values - if (tile_val & full_mask) - break; - } return sum; } -#endif _AtomicRefT atomic_flag; + _T* scanned_partial_value; + _T* scanned_full_value; + + size_t num_elements; }; template @@ -130,86 +121,57 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou static_assert(_Inclusive, "Single-pass scan only available for inclusive scan"); const ::std::size_t n = __in_rng.size(); - auto __max_cu = __queue.get_device().template get_info(); - //std::size_t num_wgs = __max_cu; - //std::size_t num_wgs = 448; - //std::size_t num_wgs = 256; - - // TODO: use wgsize and iters per item from _KernelParam - //constexpr ::std::size_t __elems_per_item = _KernelParam::data_per_workitem; #ifdef _ONEDPL_SCAN_ITER_SIZE - constexpr ::std::size_t __elems_per_item = _ONEDPL_SCAN_ITER_SIZE; + constexpr ::std::size_t __elems_per_workitem = _ONEDPL_SCAN_ITER_SIZE; #else - constexpr ::std::size_t __elems_per_item = 8; + constexpr ::std::size_t __elems_per_workitem = 8; #endif // Next power of 2 greater than or equal to __n auto __n_uniform = n; if ((__n_uniform & (__n_uniform - 1)) != 0) __n_uniform = oneapi::dpl::__internal::__dpl_bit_floor(n) << 1; - //std::size_t wgsize = n/num_wgs/__elems_per_item; - std::size_t wgsize = 256; - std::size_t num_items = __n_uniform/__elems_per_item; - std::size_t num_wgs = num_items/wgsize; - // - //std::size_t wgsize = 256; - //std::size_t num_items = 114688; - + std::size_t num_workitems = __n_uniform / __elems_per_workitem; + std::size_t wgsize = num_workitems > 256 ? 
256 : num_workitems; + std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(num_workitems, wgsize); - constexpr int status_flag_padding = 32; - std::uint32_t status_flags_size = num_wgs+1+status_flag_padding; - - //printf("launching kernel items=%lu wgs=%lu wgsize=%lu elems_per_iter=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __elems_per_item, __max_cu); + constexpr int status_flag_padding = SUBGROUP_SIZE; + std::uint32_t status_flags_size = num_wgs + status_flag_padding + 1; + std::uint32_t tile_sums_size = num_wgs + status_flag_padding; uint32_t* status_flags = sycl::malloc_device(status_flags_size, __queue); - //__queue.memset(status_flags, 0, status_flags_size * sizeof(uint32_t)); + // First status_flags_size elements: partial scanned values (using _BinaryOp) of each workgroup + // Second status_flags_size elements: full scanned values, i.e. sum of the previous and current workgroup's partial sums + _Type* tile_sums = sycl::malloc_device<_Type>(tile_sums_size * 2, __queue); auto fill_event = __queue.submit([&](sycl::handler& hdl) { - hdl.parallel_for(sycl::range<1>{status_flags_size}, [=](const sycl::item<1>& item) { int id = item.get_linear_id(); - status_flags[id] = id < status_flag_padding ? __scan_status_flag<_Type>::oob_value : 0; + status_flags[id] = id < status_flag_padding ? __scan_status_flag<_Type>::OUT_OF_BOUNDS + : __scan_status_flag<_Type>::NOT_READY; }); }); - - std::uint32_t elems_in_tile = wgsize*__elems_per_item; - -#define SCAN_KT_DEBUG 0 -#if SCAN_KT_DEBUG - std::vector debug11v(status_flags_size); - __queue.memcpy(debug11v.data(), status_flags, status_flags_size * sizeof(uint32_t)); - - for (int i = 0; i < status_flags_size-1; ++i) - std::cout << "flag_before " << i << " " << debug11v[i] << std::endl; - - uint32_t* debug1 = sycl::malloc_device(status_flags_size, __queue); - uint32_t* debug2 = sycl::malloc_device(status_flags_size, __queue); - uint32_t* debug3 = sycl::malloc_device(status_flags_size, __queue); - uint32_t* debug4 = sycl::malloc_device(status_flags_size, __queue); - uint32_t* debug5 = sycl::malloc_device(status_flags_size, __queue); -#endif + std::uint32_t elems_in_tile = wgsize*__elems_per_workitem; auto event = __queue.submit([&](sycl::handler& hdl) { auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); hdl.depends_on(fill_event); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); - hdl.parallel_for(sycl::nd_range<1>(num_items, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(32)]] { + hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { auto group = item.get_group(); auto subgroup = item.get_sub_group(); - // Obtain unique ID for this work-group that will be used in decoupled lookback if (group.leader()) { - sycl::atomic_ref<::std::uint32_t, sycl::memory_order::relaxed, sycl::memory_scope::device, sycl::access::address_space::global_space> idx_atomic(status_flags[status_flags_size-1]); + sycl::atomic_ref<::std::uint32_t, sycl::memory_order::relaxed, sycl::memory_scope::device, + sycl::access::address_space::global_space> + idx_atomic(status_flags[status_flags_size - 1]); tile_id_lacc[0] = idx_atomic.fetch_add(1); } sycl::group_barrier(group); std::uint32_t tile_id = tile_id_lacc[0]; -#if SCAN_KT_DEBUG - debug5[group.get_group_linear_id()] = tile_id; -#endif auto current_offset = (tile_id*elems_in_tile); auto next_offset = ((tile_id+1)*elems_in_tile); @@ -219,40 +181,22 @@ 
single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou auto in_end = __in_rng.begin() + next_offset; auto out_begin = __out_rng.begin() + current_offset; - -#if SCAN_KT_DEBUG - debug3[tile_id] = current_offset; - debug4[tile_id] = next_offset; -#endif - if (current_offset >= n) return; auto local_sum = sycl::joint_reduce(group, in_begin, in_end, __binary_op); -#if SCAN_KT_DEBUG - debug1[tile_id] = local_sum; -#endif - _Type prev_sum = 0; // The first sub-group will query the previous tiles to find a prefix if (subgroup.get_group_id() == 0) { - __scan_status_flag<_Type> flag(status_flags, tile_id); + __scan_status_flag<_Type> flag(tile_id, status_flags, tile_sums, status_flags_size); if (group.leader()) flag.set_partial(local_sum); // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum - //sycl::reduce_over_group(item.get_subgroup()) - - - prev_sum = flag.cooperative_lookback(tile_id, subgroup, __binary_op, status_flags); - //if (group.leader()) - // prev_sum = flag.lookback(tile_id, status_flags); -#if SCAN_KT_DEBUG - debug2[tile_id] = prev_sum; -#endif + prev_sum = flag.cooperative_lookback(tile_id, subgroup, __binary_op, status_flags, tile_sums); if (group.leader()) flag.set_full(prev_sum + local_sum); @@ -265,40 +209,8 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou event.wait(); -#if SCAN_KT_DEBUG - std::vector debug1v(status_flags_size); - std::vector debug2v(status_flags_size); - std::vector debug3v(status_flags_size); - std::vector debug4v(status_flags_size); - std::vector debug5v(status_flags_size); - std::vector debug6v(status_flags_size); - __queue.memcpy(debug1v.data(), debug1, status_flags_size * sizeof(uint32_t)); - __queue.memcpy(debug2v.data(), debug2, status_flags_size * sizeof(uint32_t)); - __queue.memcpy(debug3v.data(), debug3, status_flags_size * sizeof(uint32_t)); - __queue.memcpy(debug4v.data(), debug4, status_flags_size * sizeof(uint32_t)); - __queue.memcpy(debug5v.data(), debug5, status_flags_size * sizeof(uint32_t)); - __queue.memcpy(debug6v.data(), status_flags, status_flags_size * sizeof(uint32_t)); - - for (int i = 0; i < status_flags_size-1; ++i) - std::cout << "tile " << i << " " << debug5v[i] << std::endl; - for (int i = 0; i < status_flags_size-1; ++i) - std::cout << "local_sum " << i << " " << debug1v[i] << std::endl; - for (int i = 0; i < status_flags_size-1; ++i) - { - auto val = (debug6v[i] & __scan_status_flag<_Type>::value_mask); - int a = val / elems_in_tile; - int b = val % elems_in_tile; - std::cout << "flags " << i << " " << std::bitset<32>(debug6v[i]) << " (" << val<< " = " << a << "/" << elems_in_tile << "+" << b <<")" << std::endl; - } - for (int i = 0; i < status_flags_size-1; ++i) - std::cout << "lookback " << i << " " << debug2v[i] << std::endl; - for (int i = 0; i < status_flags_size-1; ++i) - std::cout << "offset " << i << " " << debug3v[i] << std::endl; - for (int i = 0; i < status_flags_size-1; ++i) - std::cout << "end " << i << " " << debug4v[i] << std::endl; -#endif - sycl::free(status_flags, __queue); + sycl::free(tile_sums, __queue); } // The generic structure for configuring a kernel @@ -316,16 +228,6 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera { auto __n = __in_end - __in_begin; -#if SCAN_KT_DEBUG - using _Type = std::remove_pointer_t<_InIterator>; - std::vector<_Type> in_debug(__n); - __queue.memcpy(in_debug.data(), __in_begin, __n * sizeof(_Type)); - - for 
(int i = 0; i < __n; ++i) - std::cout << "input_before " << i << " " << in_debug[i] << std::endl; -#endif - - //printf("KERNEL_TEMPLATE %lu\n", __n); auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); auto __buf1 = __keep1(__in_begin, __in_end); @@ -334,16 +236,6 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera auto __buf2 = __keep2(__out_begin, __out_begin + __n); single_pass_scan_impl<_KernelParam, true>(__queue, __buf1.all_view(), __buf2.all_view(), __binary_op); - -#if SCAN_KT_DEBUG - std::vector<_Type> in_debug2(__n); - __queue.memcpy(in_debug2.data(), __in_begin, __n * sizeof(_Type)); - - for (int i = 0; i < __n; ++i) - std::cout << "input_after " << i << " " << in_debug2[i] << std::endl; -#endif - - //printf("KERNEL_TEMPLATE DONE %lu\n", __n); } } // inline namespace igpu diff --git a/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp index de5ecafc25b..38a82b026d7 100644 --- a/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp @@ -22,23 +22,23 @@ int main() { bool all_passed = true; + sycl::queue q; for (int logn : {4, 8, 11, 16, 19, 21}) { - std::cout << "Testing 2^" << logn << '\n'; + std::cout << "Testing 2^" << logn << std::endl; int n = 1 << logn; std::vector v(n, 1); - sycl::queue q; int* in_ptr = sycl::malloc_device(n, q); int* out_ptr = sycl::malloc_device(n, q); - - q.copy(v.data(), in_ptr, n); + q.copy(v.data(), in_ptr, n).wait(); using KernelParams = oneapi::dpl::experimental::kt::kernel_param<128, 2, class ScanKernel>; oneapi::dpl::experimental::kt::single_pass_inclusive_scan(q, in_ptr, in_ptr+n, out_ptr, ::std::plus()); std::vector tmp(n, 0); q.copy(out_ptr, tmp.data(), n); + q.wait(); std::inclusive_scan(v.begin(), v.end(), v.begin()); @@ -53,11 +53,13 @@ main() } if (passed) - std::cout << "passed" << std::endl; + std::cout << " passed" << std::endl; else - std::cout << "failed" << std::endl; + std::cout << " failed" << std::endl; all_passed &= passed; + sycl::free(in_ptr, q); + sycl::free(out_ptr, q); } return !all_passed; From 8af98d60dc7c7ed9072a235235efc1b934e63a43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alberto=20Cabrera=20P=C3=A9rez?= Date: Tue, 7 Nov 2023 13:07:04 +0000 Subject: [PATCH 012/134] Refactored Scan_kt code (#16) * Improved Scan_kt: templated parameters, ballot, wgsize calculation. - Changed number of workgroups calculation from next power of two to next multiple of wgsize - Improved group_ballot by using the class member functions - Using kernel_param struct to determine wgsize and elems per work item. 
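Illustrative sketch (an editor's example, not part of the patch): the sizing change described above replaces the power-of-two padding of n with whole tiles of workgroup_size * elems_per_workitem elements, so the work-group count becomes a ceiling division. The ceil_div helper and the sample kernel_param values below are assumptions for illustration only.

    #include <cassert>
    #include <cstddef>

    // One tile covers workgroup_size * elems_per_workitem input elements.
    constexpr std::size_t ceil_div(std::size_t a, std::size_t b) { return (a + b - 1) / b; }

    int main()
    {
        constexpr std::size_t wgsize = 128;           // example kernel_param workgroup size
        constexpr std::size_t elems_per_workitem = 8; // example kernel_param elements per work-item
        constexpr std::size_t elems_in_tile = wgsize * elems_per_workitem;

        std::size_t n = 1000;                             // input size, no longer rounded to a power of two
        std::size_t num_wgs = ceil_div(n, elems_in_tile); // next multiple of a tile
        std::size_t num_workitems = num_wgs * wgsize;     // global size handed to the nd_range

        assert(num_wgs * elems_in_tile >= n); // tile padding, not power-of-two rounding, absorbs non-uniform n
        return 0;
    }
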
--- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 34 +++++++------------ .../numeric/numeric.ops/scan_kt.pass.cpp | 2 +- 2 files changed, 13 insertions(+), 23 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index f52e4ef532f..e7a0ca345e6 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -76,12 +76,8 @@ struct __scan_status_flag } while (!sycl::all_of_group(subgroup, flag != NOT_READY)); // Loop till all ready bool is_full = flag == FULL_MASK; - auto is_full_ballot = sycl::ext::oneapi::group_ballot(subgroup, is_full); - ::std::uint32_t is_full_ballot_bits{}; - is_full_ballot.extract_bits(is_full_ballot_bits); - - auto lowest_item_with_full = sycl::ctz(is_full_ballot_bits); + auto lowest_item_with_full = is_full_ballot.find_low(); // The partial scan results and the full scan sum values are in contiguous memory. // Each section of the memory is of size num_elements. @@ -97,7 +93,7 @@ struct __scan_status_flag // If we found a full value, we can stop looking at previous tiles. Otherwise, // keep going through tiles until we either find a full tile or we've completely // recomputed the prefix using partial values - if (is_full_ballot_bits) + if (is_full_ballot.any()) break; } @@ -121,18 +117,14 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou static_assert(_Inclusive, "Single-pass scan only available for inclusive scan"); const ::std::size_t n = __in_rng.size(); -#ifdef _ONEDPL_SCAN_ITER_SIZE - constexpr ::std::size_t __elems_per_workitem = _ONEDPL_SCAN_ITER_SIZE; -#else - constexpr ::std::size_t __elems_per_workitem = 8; -#endif - // Next power of 2 greater than or equal to __n - auto __n_uniform = n; - if ((__n_uniform & (__n_uniform - 1)) != 0) - __n_uniform = oneapi::dpl::__internal::__dpl_bit_floor(n) << 1; - std::size_t num_workitems = __n_uniform / __elems_per_workitem; - std::size_t wgsize = num_workitems > 256 ? 
256 : num_workitems; - std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(num_workitems, wgsize); + + constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; + constexpr ::std::size_t elems_per_workitem = _KernelParam::elems_per_workitem; + + // Avoid non_uniform n by padding up to a multiple of wgsize + std::uint32_t elems_in_tile = wgsize * elems_per_workitem; + ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); + ::std::size_t num_workitems = num_wgs * wgsize; constexpr int status_flag_padding = SUBGROUP_SIZE; std::uint32_t status_flags_size = num_wgs + status_flag_padding + 1; @@ -151,8 +143,6 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou }); }); - std::uint32_t elems_in_tile = wgsize*__elems_per_workitem; - auto event = __queue.submit([&](sycl::handler& hdl) { auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); hdl.depends_on(fill_event); @@ -214,10 +204,10 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou } // The generic structure for configuring a kernel -template +template struct kernel_param { - static constexpr std::uint16_t data_per_workitem = DataPerWorkItem; + static constexpr std::uint16_t elems_per_workitem = ElemsPerWorkItem; static constexpr std::uint16_t workgroup_size = WorkGroupSize; using kernel_name = KernelName; }; diff --git a/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp index 38a82b026d7..b3407581f37 100644 --- a/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp @@ -33,7 +33,7 @@ main() int* out_ptr = sycl::malloc_device(n, q); q.copy(v.data(), in_ptr, n).wait(); - using KernelParams = oneapi::dpl::experimental::kt::kernel_param<128, 2, class ScanKernel>; + using KernelParams = oneapi::dpl::experimental::kt::kernel_param<8, 128, class ScanKernel>; oneapi::dpl::experimental::kt::single_pass_inclusive_scan(q, in_ptr, in_ptr+n, out_ptr, ::std::plus()); std::vector tmp(n, 0); From 3de596ea2ba5a889b841a0ab96c8aa6055ff6fba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alberto=20Cabrera=20P=C3=A9rez?= Date: Wed, 8 Nov 2023 16:47:52 +0000 Subject: [PATCH 013/134] Scan_kt: Single memory allocation for device_memory (#17) and async free of the device memory (#18) * Single memory allocation for device_memory * async free of device memory --------- Co-authored-by: Joe Todd Co-authored-by: AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com> --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 35 +++++++++++++------ 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index e7a0ca345e6..5773b80e1be 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -127,13 +127,24 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou ::std::size_t num_workitems = num_wgs * wgsize; constexpr int status_flag_padding = SUBGROUP_SIZE; - std::uint32_t status_flags_size = num_wgs + status_flag_padding + 1; - std::uint32_t tile_sums_size = num_wgs + status_flag_padding; + std::size_t status_flags_elems = num_wgs + status_flag_padding + 1; + std::size_t status_flags_size = status_flags_elems * sizeof(std::uint32_t); - uint32_t* status_flags = 
sycl::malloc_device(status_flags_size, __queue); - // First status_flags_size elements: partial scanned values (using _BinaryOp) of each workgroup - // Second status_flags_size elements: full scanned values, i.e. sum of the previous and current workgroup's partial sums - _Type* tile_sums = sycl::malloc_device<_Type>(tile_sums_size * 2, __queue); + std::size_t tile_sums_elems = num_wgs + status_flag_padding; + std::size_t tile_sums_size = status_flags_elems * sizeof(_Type); + + std::size_t extra_mem_for_aligment = alignof(_Type) - (status_flags_size % alignof(_Type)); + // status_flags_size for the status_flags + // extra_mem_for_aligment of the datatype _Type + // First tile_sums_size partial scanned values + // Second tile_sums_size full scanned values (current partial plus all previous workgroups partial) + char* mem_pool = + sycl::malloc_device(status_flags_size + extra_mem_for_aligment + 2 * tile_sums_size, __queue); + + std::size_t tile_sums_offset = status_flags_size + extra_mem_for_aligment; + + std::uint32_t* status_flags = reinterpret_cast(mem_pool); + _Type* tile_sums = reinterpret_cast<_Type*>(mem_pool + tile_sums_offset); auto fill_event = __queue.submit([&](sycl::handler& hdl) { hdl.parallel_for(sycl::range<1>{status_flags_size}, [=](const sycl::item<1>& item) { @@ -180,7 +191,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou // The first sub-group will query the previous tiles to find a prefix if (subgroup.get_group_id() == 0) { - __scan_status_flag<_Type> flag(tile_id, status_flags, tile_sums, status_flags_size); + __scan_status_flag<_Type> flag(tile_id, status_flags, tile_sums, tile_sums_elems); if (group.leader()) flag.set_partial(local_sum); @@ -197,10 +208,14 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou }); }); - event.wait(); + auto free_event = __queue.submit( + [=](sycl::handler& hdl) + { + hdl.depends_on(event); + hdl.host_task([=](){ sycl::free(mem_pool, __queue); }); + }); - sycl::free(status_flags, __queue); - sycl::free(tile_sums, __queue); + event.wait(); } // The generic structure for configuring a kernel From 2d6ff78f3d2facb0b384f4f71ac7465198b1e6b4 Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Wed, 8 Nov 2023 16:07:30 +0000 Subject: [PATCH 014/134] Replace sycl::range with sycl::nd_range for fill --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 5773b80e1be..53d925a14c8 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -146,13 +146,22 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou std::uint32_t* status_flags = reinterpret_cast(mem_pool); _Type* tile_sums = reinterpret_cast<_Type*>(mem_pool + tile_sums_offset); - auto fill_event = __queue.submit([&](sycl::handler& hdl) { - hdl.parallel_for(sycl::range<1>{status_flags_size}, [=](const sycl::item<1>& item) { - int id = item.get_linear_id(); - status_flags[id] = id < status_flag_padding ? 
__scan_status_flag<_Type>::OUT_OF_BOUNDS - : __scan_status_flag<_Type>::NOT_READY; + ::std::size_t fill_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(status_flags_size, wgsize); + + auto fill_event = __queue.submit( + [&](sycl::handler& hdl) + { + hdl.parallel_for(sycl::nd_range<1>{fill_num_wgs * wgsize, wgsize}, + [=](const sycl::nd_item<1>& item) + { + int id = item.get_global_linear_id(); + if (id < status_flags_size) + status_flags[id] = + id < status_flag_padding + ? __scan_status_flag<_Type>::OUT_OF_BOUNDS + : __scan_status_flag<_Type>::NOT_READY; + }); }); - }); auto event = __queue.submit([&](sycl::handler& hdl) { auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); From 124a912c6852f89a5e3e74041cd0a60e6351e4a2 Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Wed, 8 Nov 2023 19:14:32 +0000 Subject: [PATCH 015/134] Bug fix --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 53d925a14c8..038018a13ac 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -146,7 +146,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou std::uint32_t* status_flags = reinterpret_cast(mem_pool); _Type* tile_sums = reinterpret_cast<_Type*>(mem_pool + tile_sums_offset); - ::std::size_t fill_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(status_flags_size, wgsize); + ::std::size_t fill_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(status_flags_elems, wgsize); auto fill_event = __queue.submit( [&](sycl::handler& hdl) @@ -155,7 +155,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou [=](const sycl::nd_item<1>& item) { int id = item.get_global_linear_id(); - if (id < status_flags_size) + if (id < status_flags_elems) status_flags[id] = id < status_flag_padding ? 
__scan_status_flag<_Type>::OUT_OF_BOUNDS @@ -177,7 +177,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou { sycl::atomic_ref<::std::uint32_t, sycl::memory_order::relaxed, sycl::memory_scope::device, sycl::access::address_space::global_space> - idx_atomic(status_flags[status_flags_size - 1]); + idx_atomic(status_flags[status_flags_elems - 1]); tile_id_lacc[0] = idx_atomic.fetch_add(1); } sycl::group_barrier(group); From d716bbd21a451b0b1424de131f00eb48b1a7a0e8 Mon Sep 17 00:00:00 2001 From: Aidan Date: Wed, 8 Nov 2023 13:21:32 +0000 Subject: [PATCH 016/134] Global to local then perform op --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 38 ++++++++++++++----- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 038018a13ac..846208007da 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -165,11 +165,14 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou auto event = __queue.submit([&](sycl::handler& hdl) { auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); + auto tile_vals = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); hdl.depends_on(fill_event); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { auto group = item.get_group(); + auto local_id = item.get_local_id(0); + auto stride = item.get_local_range(0); auto subgroup = item.get_sub_group(); // Obtain unique ID for this work-group that will be used in decoupled lookback @@ -183,16 +186,33 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou sycl::group_barrier(group); std::uint32_t tile_id = tile_id_lacc[0]; - auto current_offset = (tile_id*elems_in_tile); - auto next_offset = ((tile_id+1)*elems_in_tile); - if (next_offset > n) - next_offset = n; - auto in_begin = __in_rng.begin() + current_offset; - auto in_end = __in_rng.begin() + next_offset; - auto out_begin = __out_rng.begin() + current_offset; - - if (current_offset >= n) + // Global load into local + auto wg_current_offset = (tile_id*elems_in_tile); + auto wg_next_offset = ((tile_id+1)*elems_in_tile); + size_t wg_local_memory_size = elems_in_tile; + if (wg_current_offset >= n) return; + if (wg_next_offset >= n) { + wg_local_memory_size = n - wg_current_offset; + wg_next_offset = n; // Not needed + } + + // TODO: vectorize loads, where possible + if (wg_next_offset <= n) { + _ONEDPL_PRAGMA_UNROLL + for (std::uint32_t i = 0; i < elems_per_workitem; ++i) + tile_vals[local_id + stride * i] = __in_rng[wg_current_offset + local_id + stride * i]; + } else { + for (std::uint32_t i = 0; i < elems_per_workitem; ++i) { + if (wg_current_offset + stride * i < n) + tile_vals[local_id + stride * i] = __in_rng[wg_current_offset + stride * i]; + } + } + sycl::group_barrier(group); + + auto in_begin = tile_vals.get_pointer(); + auto in_end = in_begin + wg_local_memory_size; + auto out_begin = __out_rng.begin() + wg_current_offset; auto local_sum = sycl::joint_reduce(group, in_begin, in_end, __binary_op); _Type prev_sum = 0; From 6a474c7dc2aeee9082d8183db8fefbc8355a6bd0 Mon Sep 17 00:00:00 2001 From: Aidan Date: Fri, 10 Nov 2023 13:51:35 +0000 Subject: [PATCH 
017/134] Update based on feedback --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 846208007da..1bd10595413 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -192,20 +192,17 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou size_t wg_local_memory_size = elems_in_tile; if (wg_current_offset >= n) return; - if (wg_next_offset >= n) { + if (wg_next_offset > n) wg_local_memory_size = n - wg_current_offset; - wg_next_offset = n; // Not needed - } - // TODO: vectorize loads, where possible if (wg_next_offset <= n) { _ONEDPL_PRAGMA_UNROLL for (std::uint32_t i = 0; i < elems_per_workitem; ++i) tile_vals[local_id + stride * i] = __in_rng[wg_current_offset + local_id + stride * i]; } else { for (std::uint32_t i = 0; i < elems_per_workitem; ++i) { - if (wg_current_offset + stride * i < n) - tile_vals[local_id + stride * i] = __in_rng[wg_current_offset + stride * i]; + if (wg_current_offset + local_id + stride * i < n) + tile_vals[local_id + stride * i] = __in_rng[wg_current_offset + local_id + stride * i]; } } sycl::group_barrier(group); From ba7be34eb82634cb9c81757050ad51b30210bec4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alberto=20Cabrera=20P=C3=A9rez?= Date: Tue, 21 Nov 2023 11:48:48 +0000 Subject: [PATCH 018/134] Refactored cooperative_loopback and memory implementation (#24) * Refactored cooperative_loopback and memory implementation detail * renamed load_counter to fetch_add_counter * Removed dynamic tile counter from the scan memory struct * scratch memory Reordering * Fixed wrong values returned in LoopbackScanMemory.get_value * Improved Class and variable naming --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 334 +++++++++++++----- 1 file changed, 253 insertions(+), 81 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 1bd10595413..314ace11410 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -16,51 +16,244 @@ #ifndef _ONEDPL_parallel_backend_sycl_scan_H #define _ONEDPL_parallel_backend_sycl_scan_H +#include +#include + namespace oneapi::dpl::experimental::kt { inline namespace igpu { -constexpr size_t SUBGROUP_SIZE = 32; +constexpr ::std::size_t SUBGROUP_SIZE = 32; + +template typename LoopbackScanMemory, typename TileId> +struct ScanMemoryManager +{ + using _TileIdT = typename TileId::_TileIdT; + using _FlagT = typename LoopbackScanMemory::_FlagT; + + ScanMemoryManager(sycl::queue q) : q{q} {}; + + ::std::uint8_t* + scan_memory_ptr() noexcept + { + return scan_memory_begin; + }; + + _TileIdT* + tile_id_ptr() noexcept + { + return tile_id_begin; + }; + + void + allocate(::std::size_t num_wgs) + { + ::std::size_t scan_memory_size = LoopbackScanMemory::get_memory_size(num_wgs); + constexpr ::std::size_t padded_tileid_size = TileId::get_padded_memory_size(); + constexpr ::std::size_t tileid_size = TileId::get_memory_size(); + + auto mem_size_bytes = scan_memory_size + padded_tileid_size; + + scratch = sycl::malloc_device<::std::uint8_t>(mem_size_bytes, q); + + scan_memory_begin = scratch; + + void* base_tileid_ptr = 
reinterpret_cast(scan_memory_begin + scan_memory_size); + size_t remainder = mem_size_bytes - scan_memory_size; + + tile_id_begin = reinterpret_cast<_TileIdT*>( + ::std::align(::std::alignment_of_v<_TileIdT>, tileid_size, base_tileid_ptr, remainder)); + } + + sycl::event + async_free(sycl::event dependency) + { + return q.submit( + [e = dependency, ptr = scratch, q_ = q](sycl::handler& hdl) + { + hdl.depends_on(e); + hdl.host_task([=]() { sycl::free(ptr, q_); }); + }); + } + + private: + ::std::uint8_t* scratch = nullptr; + ::std::uint8_t* scan_memory_begin = nullptr; + _TileIdT* tile_id_begin = nullptr; + + sycl::queue q; +}; -template -struct __scan_status_flag +template +struct LoopbackScanMemory { - using _AtomicRefT = sycl::atomic_ref<::std::uint32_t, sycl::memory_order::acq_rel, sycl::memory_scope::device, - sycl::access::address_space::global_space>; - static constexpr std::uint32_t NOT_READY = 0; - static constexpr std::uint32_t PARTIAL_MASK = 1; - static constexpr std::uint32_t FULL_MASK = 2; - static constexpr std::uint32_t OUT_OF_BOUNDS = 4; - - static constexpr int padding = SUBGROUP_SIZE; - - __scan_status_flag(const std::uint32_t tile_id, std::uint32_t* flags_begin, _T* tile_sums, - size_t num_elements) - : atomic_flag(*(flags_begin + tile_id + padding)), scanned_partial_value(tile_sums + tile_id + padding), - scanned_full_value(tile_sums + tile_id + padding + num_elements), num_elements{num_elements} + using _FlagT = ::std::uint32_t; + using _AtomicFlagRefT = sycl::atomic_ref<_FlagT, sycl::memory_order::acq_rel, sycl::memory_scope::device, + sycl::access::address_space::global_space>; + + static constexpr _FlagT NOT_READY = 0; + static constexpr _FlagT PARTIAL_MASK = 1; + static constexpr _FlagT FULL_MASK = 2; + static constexpr _FlagT OUT_OF_BOUNDS = 4; + + static constexpr ::std::size_t padding = SUBGROUP_SIZE; + + LoopbackScanMemory(::std::uint8_t* scan_memory_begin, ::std::size_t num_wgs) + : num_elements(get_num_elements(num_wgs)) { + // LoopbackScanMemory: [Partial Value, ..., Full Value, ..., Flag, ...] 
+ // Each section has num_wgs + padding elements + tile_values_begin = reinterpret_cast<_T*>(scan_memory_begin); + flags_begin = get_flags_begin(scan_memory_begin, num_wgs); } void - set_partial(_T val) + set_partial(::std::size_t tile_id, _T val) { - (*scanned_partial_value) = val; + _AtomicFlagRefT atomic_flag(*(flags_begin + tile_id + padding)); + + tile_values_begin[tile_id + padding] = val; atomic_flag.store(PARTIAL_MASK); } void - set_full(_T val) + set_full(::std::size_t tile_id, _T val) { - (*scanned_full_value) = val; + _AtomicFlagRefT atomic_flag(*(flags_begin + tile_id + padding)); + + tile_values_begin[tile_id + padding + num_elements] = val; atomic_flag.store(FULL_MASK); } - template + _FlagT + load_flag(::std::size_t tile_id) const + { + _AtomicFlagRefT atomic_flag(*(flags_begin + tile_id + padding)); + + return atomic_flag.load(); + } + + _T + get_value(::std::size_t tile_id, _FlagT flag) const + { + ::std::size_t offset = tile_id + padding + num_elements * is_full(flag); + return tile_values_begin[offset]; + } + + static ::std::size_t + get_tile_values_bytes(::std::size_t num_elements) + { + return (2 * num_elements) * sizeof(_T); + } + + static ::std::size_t + get_flag_bytes(::std::size_t num_elements) + { + return num_elements * sizeof(_FlagT); + } + + static ::std::size_t + get_padded_flag_bytes(::std::size_t num_elements) + { + // sizeof(_FlagT) extra bytes for possible intenal alignment + return get_flag_bytes(num_elements) + sizeof(_FlagT); + } + + static _FlagT* + get_flags_begin(::std::uint8_t* scan_memory_begin, ::std::size_t num_wgs) + { + // Aligned flags + ::std::size_t num_elements = get_num_elements(num_wgs); + ::std::size_t tile_values_bytes = get_tile_values_bytes(num_elements); + void* base_flags = reinterpret_cast(scan_memory_begin + tile_values_bytes); + auto remainder = get_padded_flag_bytes(num_elements); // scan_memory_bytes - tile_values_bytes + return reinterpret_cast<_FlagT*>( + ::std::align(::std::alignment_of_v<_FlagT>, get_flag_bytes(num_elements), base_flags, remainder)); + } + + static ::std::size_t + get_memory_size(::std::size_t num_wgs) + { + ::std::size_t num_elements = get_num_elements(num_wgs); + // sizeof(_T) extra bytes are not needed because LoopbackScanMemory is going at the beginning of the scratch + ::std::size_t tile_values_bytes = get_tile_values_bytes(num_elements); + // Padding to provide room for aligment + ::std::size_t flag_bytes = get_padded_flag_bytes(num_elements); + + return tile_values_bytes + flag_bytes; + } + + static ::std::size_t + get_num_elements(::std::size_t num_wgs) + { + return padding + num_wgs; + } + + static bool + is_ready(_FlagT flag) + { + return flag != NOT_READY; + } + + static bool + is_full(_FlagT flag) + { + return flag == FULL_MASK; + } + + static bool + is_out_of_bounds(_FlagT flag) + { + return flag == OUT_OF_BOUNDS; + } + + private: + ::std::size_t num_elements; + _FlagT* flags_begin; + _T* tile_values_begin; +}; + +struct TileId +{ + using _TileIdT = ::std::uint32_t; + using _AtomicTileRefT = sycl::atomic_ref<_TileIdT, sycl::memory_order::relaxed, sycl::memory_scope::device, + sycl::access::address_space::global_space>; + + TileId(_TileIdT* tileid_memory) : tile_counter(*(tileid_memory)) {} + + constexpr static ::std::size_t + get_padded_memory_size() + { + // extra sizeof(_TileIdT) for possible aligment issues + return sizeof(_TileIdT) + sizeof(_TileIdT); + } + + constexpr static ::std::size_t + get_memory_size() + { + // extra sizeof(_TileIdT) for possible aligment issues + return 
sizeof(_TileIdT); + } + + _TileIdT + fetch_inc() + { + return tile_counter.fetch_add(1); + } + + _AtomicTileRefT tile_counter; +}; + +struct cooperative_lookback +{ + + template typename LoopbackScanMemory> _T - cooperative_lookback(std::uint32_t tile_id, const _Subgroup& subgroup, BinOp bin_op, std::uint32_t* flags_begin, - _T* tile_sums) + operator()(std::uint32_t tile_id, const _Subgroup& subgroup, BinOp bin_op, LoopbackScanMemory<_T> memory) { + using FlagT = typename LoopbackScanMemory<_T>::_FlagT; + _T sum = 0; int offset = -1; int i = 0; @@ -68,24 +261,20 @@ struct __scan_status_flag for (int tile = static_cast(tile_id) + offset; tile >= 0; tile -= SUBGROUP_SIZE) { - _AtomicRefT tile_atomic(*(flags_begin + tile + padding - local_id)); - std::uint32_t flag; + FlagT flag; do { - flag = tile_atomic.load(); - } while (!sycl::all_of_group(subgroup, flag != NOT_READY)); // Loop till all ready + flag = memory.load_flag(tile - local_id); + } while (!sycl::all_of_group(subgroup, LoopbackScanMemory<_T>::is_ready(flag))); // Loop till all ready - bool is_full = flag == FULL_MASK; + bool is_full = LoopbackScanMemory<_T>::is_full(flag); auto is_full_ballot = sycl::ext::oneapi::group_ballot(subgroup, is_full); auto lowest_item_with_full = is_full_ballot.find_low(); - // The partial scan results and the full scan sum values are in contiguous memory. - // Each section of the memory is of size num_elements. - // The partial sum for a tile is at [i] and the full sum is at [i + num_elements] - // is_full * num_elements allows to select between the two values without branching the code. - size_t contrib_offset = tile + padding - local_id + is_full * num_elements; - _T val = *(tile_sums + contrib_offset); - _T contribution = local_id <= lowest_item_with_full && (tile - local_id >= 0) ? val : _T{0}; + // TODO: Use identity_fn for out of bounds values + _T contribution = local_id <= lowest_item_with_full && !LoopbackScanMemory<_T>::is_out_of_bounds(flag) + ? 
memory.get_value(tile - local_id, flag) + : _T{0}; // Sum all of the partial results from the tiles found, as well as the full contribution from the closest tile (if any) sum += sycl::reduce_over_group(subgroup, contribution, bin_op); @@ -100,12 +289,6 @@ struct __scan_status_flag return sum; } - - _AtomicRefT atomic_flag; - _T* scanned_partial_value; - _T* scanned_full_value; - - size_t num_elements; }; template @@ -113,6 +296,8 @@ void single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op) { using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + using _TileIdT = TileId::_TileIdT; + using _FlagT = typename LoopbackScanMemory<_Type>::_FlagT; static_assert(_Inclusive, "Single-pass scan only available for inclusive scan"); @@ -122,31 +307,22 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou constexpr ::std::size_t elems_per_workitem = _KernelParam::elems_per_workitem; // Avoid non_uniform n by padding up to a multiple of wgsize - std::uint32_t elems_in_tile = wgsize * elems_per_workitem; + ::std::uint32_t elems_in_tile = wgsize * elems_per_workitem; ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); ::std::size_t num_workitems = num_wgs * wgsize; - constexpr int status_flag_padding = SUBGROUP_SIZE; - std::size_t status_flags_elems = num_wgs + status_flag_padding + 1; - std::size_t status_flags_size = status_flags_elems * sizeof(std::uint32_t); - - std::size_t tile_sums_elems = num_wgs + status_flag_padding; - std::size_t tile_sums_size = status_flags_elems * sizeof(_Type); + ScanMemoryManager<_Type, LoopbackScanMemory, TileId> scratch(__queue); + scratch.allocate(num_wgs); - std::size_t extra_mem_for_aligment = alignof(_Type) - (status_flags_size % alignof(_Type)); - // status_flags_size for the status_flags - // extra_mem_for_aligment of the datatype _Type - // First tile_sums_size partial scanned values - // Second tile_sums_size full scanned values (current partial plus all previous workgroups partial) - char* mem_pool = - sycl::malloc_device(status_flags_size + extra_mem_for_aligment + 2 * tile_sums_size, __queue); + // Memory Structure: + // [Loopback Scan Memory, Tile Id Counter] + auto scan_memory_begin = scratch.scan_memory_ptr(); + auto status_flags_begin = LoopbackScanMemory<_Type>::get_flags_begin(scan_memory_begin, num_wgs); + auto tile_id_begin = scratch.tile_id_ptr(); - std::size_t tile_sums_offset = status_flags_size + extra_mem_for_aligment; - - std::uint32_t* status_flags = reinterpret_cast(mem_pool); - _Type* tile_sums = reinterpret_cast<_Type*>(mem_pool + tile_sums_offset); - - ::std::size_t fill_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(status_flags_elems, wgsize); + ::std::size_t num_elements = LoopbackScanMemory<_Type>::get_num_elements(num_wgs); + // fill_num_wgs num_elements + 1 to also initialize tile_id_counter + ::std::size_t fill_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(num_elements + 1, wgsize); auto fill_event = __queue.submit( [&](sycl::handler& hdl) @@ -155,14 +331,17 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou [=](const sycl::nd_item<1>& item) { int id = item.get_global_linear_id(); - if (id < status_flags_elems) - status_flags[id] = - id < status_flag_padding - ? __scan_status_flag<_Type>::OUT_OF_BOUNDS - : __scan_status_flag<_Type>::NOT_READY; + if (id < num_elements) + status_flags_begin[id] = + id < LoopbackScanMemory<_Type>::padding + ? 
LoopbackScanMemory<_Type>::OUT_OF_BOUNDS + : LoopbackScanMemory<_Type>::NOT_READY; + if (id == num_elements) + tile_id_begin[0] = 0; }); }); + auto event = __queue.submit([&](sycl::handler& hdl) { auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); auto tile_vals = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); @@ -176,12 +355,10 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou auto subgroup = item.get_sub_group(); // Obtain unique ID for this work-group that will be used in decoupled lookback + TileId dynamic_tile_id(tile_id_begin); if (group.leader()) { - sycl::atomic_ref<::std::uint32_t, sycl::memory_order::relaxed, sycl::memory_scope::device, - sycl::access::address_space::global_space> - idx_atomic(status_flags[status_flags_elems - 1]); - tile_id_lacc[0] = idx_atomic.fetch_add(1); + tile_id_lacc[0] = dynamic_tile_id.fetch_inc(); } sycl::group_barrier(group); std::uint32_t tile_id = tile_id_lacc[0]; @@ -207,7 +384,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou } sycl::group_barrier(group); - auto in_begin = tile_vals.get_pointer(); + auto in_begin = tile_vals.template get_multi_ptr().get(); auto in_end = in_begin + wg_local_memory_size; auto out_begin = __out_rng.begin() + wg_current_offset; @@ -217,16 +394,16 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou // The first sub-group will query the previous tiles to find a prefix if (subgroup.get_group_id() == 0) { - __scan_status_flag<_Type> flag(tile_id, status_flags, tile_sums, tile_sums_elems); + LoopbackScanMemory<_Type> scan_mem(scan_memory_begin, num_wgs); if (group.leader()) - flag.set_partial(local_sum); + scan_mem.set_partial(tile_id, local_sum); // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum - prev_sum = flag.cooperative_lookback(tile_id, subgroup, __binary_op, status_flags, tile_sums); + prev_sum = cooperative_lookback()(tile_id, subgroup, __binary_op, scan_mem); if (group.leader()) - flag.set_full(prev_sum + local_sum); + scan_mem.set_full(tile_id, prev_sum + local_sum); } prev_sum = sycl::group_broadcast(group, prev_sum, 0); @@ -234,12 +411,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou }); }); - auto free_event = __queue.submit( - [=](sycl::handler& hdl) - { - hdl.depends_on(event); - hdl.host_task([=](){ sycl::free(mem_pool, __queue); }); - }); + scratch.async_free(event); event.wait(); } From 69cc2fadc20a5ac89d4e2c2e76ab85b55f7521fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alberto=20Cabrera=20P=C3=A9rez?= Date: Thu, 23 Nov 2023 14:11:27 +0000 Subject: [PATCH 019/134] [Scan_kt] Atomic64 flags + value implementation (#25) * Implemented atomic64 version of the scan_kt pass * Removed repeated offset calculation for tile id atomic flag * Loopback -> Lookback. Removed unused var. 
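Illustrative sketch (an editor's example, not part of the patch): the atomic64 path packs the 32-bit tile value into the low half of a 64-bit word and the status into the top two bits, so a single relaxed 64-bit atomic store publishes value and status together. The masks mirror the ones introduced in the diff below; the pack/unpack helper names are assumptions for illustration only.

    #include <cassert>
    #include <cstdint>

    using FlagT = std::uint64_t;

    constexpr FlagT PARTIAL_MASK  = FlagT{1} << 62;           // partial tile sum published
    constexpr FlagT FULL_MASK     = FlagT{1} << 63;           // full (inclusive) prefix published
    constexpr FlagT OUT_OF_BOUNDS = PARTIAL_MASK | FULL_MASK; // padding slots before tile 0
    constexpr FlagT VALUE_MASK    = (FlagT{1} << 32) - 1;     // low 32 bits carry the value

    constexpr FlagT pack(FlagT status, std::uint32_t value) { return status | value; }
    constexpr std::uint32_t unpack_value(FlagT flag) { return static_cast<std::uint32_t>(flag & VALUE_MASK); }
    constexpr bool is_full(FlagT flag) { return (flag & OUT_OF_BOUNDS) == FULL_MASK; }

    int main()
    {
        // Example from the patch comment: a full flag carrying the value 15.
        FlagT f = pack(FULL_MASK, 15u);
        assert(is_full(f) && unpack_value(f) == 15u);
        return 0;
    }
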
--- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 351 ++++++++++++------ 1 file changed, 243 insertions(+), 108 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 314ace11410..6dfe1bb6ef1 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -26,11 +26,13 @@ inline namespace igpu { constexpr ::std::size_t SUBGROUP_SIZE = 32; -template typename LoopbackScanMemory, typename TileId> +template typename LookbackScanMemory, + typename TileId> struct ScanMemoryManager { using _TileIdT = typename TileId::_TileIdT; - using _FlagT = typename LoopbackScanMemory::_FlagT; + using _LookbackScanMemory = LookbackScanMemory; + using _FlagT = typename _LookbackScanMemory::_FlagT; ScanMemoryManager(sycl::queue q) : q{q} {}; @@ -49,7 +51,7 @@ struct ScanMemoryManager void allocate(::std::size_t num_wgs) { - ::std::size_t scan_memory_size = LoopbackScanMemory::get_memory_size(num_wgs); + ::std::size_t scan_memory_size = _LookbackScanMemory::get_memory_size(num_wgs); constexpr ::std::size_t padded_tileid_size = TileId::get_padded_memory_size(); constexpr ::std::size_t tileid_size = TileId::get_memory_size(); @@ -85,8 +87,11 @@ struct ScanMemoryManager sycl::queue q; }; +template +struct LookbackScanMemory; + template -struct LoopbackScanMemory +struct LookbackScanMemory<_T, /* UseAtomic64=*/::std::false_type> { using _FlagT = ::std::uint32_t; using _AtomicFlagRefT = sycl::atomic_ref<_FlagT, sycl::memory_order::acq_rel, sycl::memory_scope::device, @@ -99,13 +104,12 @@ struct LoopbackScanMemory static constexpr ::std::size_t padding = SUBGROUP_SIZE; - LoopbackScanMemory(::std::uint8_t* scan_memory_begin, ::std::size_t num_wgs) - : num_elements(get_num_elements(num_wgs)) + // LookbackScanMemory: [Partial Value, ..., Full Value, ..., Flag, ...] + // Each section has num_wgs + padding elements + LookbackScanMemory(::std::uint8_t* scan_memory_begin, ::std::size_t num_wgs) + : num_elements(get_num_elements(num_wgs)), tile_values_begin(reinterpret_cast<_T*>(scan_memory_begin)), + flags_begin(get_flags_begin(scan_memory_begin, num_wgs)) { - // LoopbackScanMemory: [Partial Value, ..., Full Value, ..., Flag, ...] 
- // Each section has num_wgs + padding elements - tile_values_begin = reinterpret_cast<_T*>(scan_memory_begin); - flags_begin = get_flags_begin(scan_memory_begin, num_wgs); } void @@ -126,19 +130,17 @@ struct LoopbackScanMemory atomic_flag.store(FULL_MASK); } - _FlagT - load_flag(::std::size_t tile_id) const + _AtomicFlagRefT + get_flag(::std::size_t tile_id) const { - _AtomicFlagRefT atomic_flag(*(flags_begin + tile_id + padding)); - - return atomic_flag.load(); + return _AtomicFlagRefT(*(flags_begin + tile_id + padding)); } _T get_value(::std::size_t tile_id, _FlagT flag) const { - ::std::size_t offset = tile_id + padding + num_elements * is_full(flag); - return tile_values_begin[offset]; + // full_value and partial_value are num_elements apart + return *(tile_values_begin + tile_id + padding + num_elements * is_full(flag)); } static ::std::size_t @@ -176,7 +178,7 @@ struct LoopbackScanMemory get_memory_size(::std::size_t num_wgs) { ::std::size_t num_elements = get_num_elements(num_wgs); - // sizeof(_T) extra bytes are not needed because LoopbackScanMemory is going at the beginning of the scratch + // sizeof(_T) extra bytes are not needed because LookbackScanMemory is going at the beginning of the scratch ::std::size_t tile_values_bytes = get_tile_values_bytes(num_elements); // Padding to provide room for aligment ::std::size_t flag_bytes = get_padded_flag_bytes(num_elements); @@ -214,6 +216,110 @@ struct LoopbackScanMemory _T* tile_values_begin; }; +template +struct LookbackScanMemory<_T, /* UseAtomic64=*/::std::true_type> +{ + using _FlagT = ::std::uint64_t; + using _AtomicFlagRefT = sycl::atomic_ref<_FlagT, sycl::memory_order::relaxed, sycl::memory_scope::device, + sycl::access::address_space::global_space>; + + // Each flag is divided in 2 32bit values + // 32..63 status bits + // 00..31 value bits + // Example: status = full scanned value, int value = 15: + // 1000 0000 0000 0000 0000 0000 0000 0000 | 0000 0000 0000 0000 0000 0000 0000 1111 + + // Status values: + // 00xxxx - not computed + // 01xxxx - partial + // 10xxxx - full + // 110000 - out of bounds + + static constexpr _FlagT NOT_READY = 0; + static constexpr _FlagT PARTIAL_MASK = 1l << (sizeof(_FlagT) * 8 - 2); + static constexpr _FlagT FULL_MASK = 1l << (sizeof(_FlagT) * 8 - 1); + static constexpr _FlagT OUT_OF_BOUNDS = PARTIAL_MASK | FULL_MASK; + + static constexpr _FlagT VALUE_MASK = (1l << sizeof(::std::uint32_t) * 8) - 1; // 32 bit mask to store value + + static constexpr ::std::size_t padding = SUBGROUP_SIZE; + + LookbackScanMemory(::std::uint8_t* scan_memory_begin, ::std::size_t num_wgs) + : num_elements(get_num_elements(num_wgs)), flags_begin(get_flags_begin(scan_memory_begin, num_wgs)) + { + } + + void + set_partial(::std::size_t tile_id, _T val) + { + _AtomicFlagRefT atomic_flag(*(flags_begin + tile_id + padding)); + + atomic_flag.store(PARTIAL_MASK | static_cast<::std::uint32_t>(val)); + } + + void + set_full(::std::size_t tile_id, _T val) + { + _AtomicFlagRefT atomic_flag(*(flags_begin + tile_id + padding)); + + atomic_flag.store(FULL_MASK | static_cast<::std::uint32_t>(val)); + } + + _AtomicFlagRefT + get_flag(::std::size_t tile_id) const + { + return _AtomicFlagRefT(*(flags_begin + tile_id + padding)); + } + + _T + get_value(::std::size_t, _FlagT flag) const + { + return static_cast<::std::uint32_t>(flag & VALUE_MASK); + } + + static _FlagT* + get_flags_begin(::std::uint8_t* scan_memory_begin, ::std::size_t) + { + return reinterpret_cast<_FlagT*>(scan_memory_begin); + } + + static ::std::size_t + 
get_memory_size(::std::size_t num_wgs) + { + ::std::size_t num_elements = get_num_elements(num_wgs); + return num_elements * sizeof(_FlagT); + } + + static ::std::size_t + get_num_elements(::std::size_t num_wgs) + { + return padding + num_wgs; + } + + static bool + is_ready(_FlagT flag) + { + // flag & OUT_OF_BOUNDS != NOT_READY means it has either partial or full value, or is out of bounds + return (flag & OUT_OF_BOUNDS) != NOT_READY; + } + + static bool + is_full(_FlagT flag) + { + return (flag & OUT_OF_BOUNDS) == FULL_MASK; + } + + static bool + is_out_of_bounds(_FlagT flag) + { + return (flag & OUT_OF_BOUNDS) == OUT_OF_BOUNDS; + } + + private: + ::std::size_t num_elements; + _FlagT* flags_begin; +}; + struct TileId { using _TileIdT = ::std::uint32_t; @@ -248,11 +354,14 @@ struct TileId struct cooperative_lookback { - template typename LoopbackScanMemory> + template typename LookbackScanMemory, typename UseAtomic64> _T - operator()(std::uint32_t tile_id, const _Subgroup& subgroup, BinOp bin_op, LoopbackScanMemory<_T> memory) + operator()(std::uint32_t tile_id, const _Subgroup& subgroup, BinOp bin_op, + LookbackScanMemory<_T, UseAtomic64> memory) { - using FlagT = typename LoopbackScanMemory<_T>::_FlagT; + using _LookbackScanMemory = LookbackScanMemory<_T, UseAtomic64>; + using FlagT = typename _LookbackScanMemory::_FlagT; _T sum = 0; int offset = -1; @@ -261,18 +370,19 @@ struct cooperative_lookback for (int tile = static_cast(tile_id) + offset; tile >= 0; tile -= SUBGROUP_SIZE) { + auto atomic_flag = memory.get_flag(tile - local_id); FlagT flag; do { - flag = memory.load_flag(tile - local_id); - } while (!sycl::all_of_group(subgroup, LoopbackScanMemory<_T>::is_ready(flag))); // Loop till all ready + flag = atomic_flag.load(); + } while (!sycl::all_of_group(subgroup, _LookbackScanMemory::is_ready(flag))); // Loop till all ready - bool is_full = LoopbackScanMemory<_T>::is_full(flag); + bool is_full = _LookbackScanMemory::is_full(flag); auto is_full_ballot = sycl::ext::oneapi::group_ballot(subgroup, is_full); auto lowest_item_with_full = is_full_ballot.find_low(); // TODO: Use identity_fn for out of bounds values - _T contribution = local_id <= lowest_item_with_full && !LoopbackScanMemory<_T>::is_out_of_bounds(flag) + _T contribution = local_id <= lowest_item_with_full && !_LookbackScanMemory::is_out_of_bounds(flag) ? 
memory.get_value(tile - local_id, flag) : _T{0}; @@ -291,124 +401,131 @@ struct cooperative_lookback } }; -template +template void single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op) { using _Type = oneapi::dpl::__internal::__value_t<_InRange>; using _TileIdT = TileId::_TileIdT; - using _FlagT = typename LoopbackScanMemory<_Type>::_FlagT; + using _LookbackScanMemory = LookbackScanMemory<_Type, _UseAtomic64>; + using _FlagT = typename _LookbackScanMemory::_FlagT; - static_assert(_Inclusive, "Single-pass scan only available for inclusive scan"); + static_assert(std::is_same_v<_Inclusive, ::std::true_type>, "Single-pass scan only available for inclusive scan"); const ::std::size_t n = __in_rng.size(); constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; constexpr ::std::size_t elems_per_workitem = _KernelParam::elems_per_workitem; - // Avoid non_uniform n by padding up to a multiple of wgsize ::std::uint32_t elems_in_tile = wgsize * elems_per_workitem; ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); ::std::size_t num_workitems = num_wgs * wgsize; - ScanMemoryManager<_Type, LoopbackScanMemory, TileId> scratch(__queue); + ScanMemoryManager<_Type, _UseAtomic64, LookbackScanMemory, TileId> scratch(__queue); scratch.allocate(num_wgs); // Memory Structure: - // [Loopback Scan Memory, Tile Id Counter] + // [Lookback Scan Memory, Tile Id Counter] auto scan_memory_begin = scratch.scan_memory_ptr(); - auto status_flags_begin = LoopbackScanMemory<_Type>::get_flags_begin(scan_memory_begin, num_wgs); + auto status_flags_begin = _LookbackScanMemory::get_flags_begin(scan_memory_begin, num_wgs); auto tile_id_begin = scratch.tile_id_ptr(); - ::std::size_t num_elements = LoopbackScanMemory<_Type>::get_num_elements(num_wgs); + ::std::size_t num_elements = _LookbackScanMemory::get_num_elements(num_wgs); // fill_num_wgs num_elements + 1 to also initialize tile_id_counter ::std::size_t fill_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(num_elements + 1, wgsize); auto fill_event = __queue.submit( [&](sycl::handler& hdl) { - hdl.parallel_for(sycl::nd_range<1>{fill_num_wgs * wgsize, wgsize}, - [=](const sycl::nd_item<1>& item) - { - int id = item.get_global_linear_id(); - if (id < num_elements) - status_flags_begin[id] = - id < LoopbackScanMemory<_Type>::padding - ? LoopbackScanMemory<_Type>::OUT_OF_BOUNDS - : LoopbackScanMemory<_Type>::NOT_READY; - if (id == num_elements) - tile_id_begin[0] = 0; - }); + hdl.parallel_for(sycl::nd_range<1>{fill_num_wgs * wgsize, wgsize}, + [=](const sycl::nd_item<1>& item) + { + int id = item.get_global_linear_id(); + if (id < num_elements) + status_flags_begin[id] = id < _LookbackScanMemory::padding + ? 
_LookbackScanMemory::OUT_OF_BOUNDS + : _LookbackScanMemory::NOT_READY; + if (id == num_elements) + tile_id_begin[0] = 0; + }); }); - auto event = __queue.submit([&](sycl::handler& hdl) { auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); auto tile_vals = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); hdl.depends_on(fill_event); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); - hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { - auto group = item.get_group(); - auto local_id = item.get_local_id(0); - auto stride = item.get_local_range(0); - auto subgroup = item.get_sub_group(); - - // Obtain unique ID for this work-group that will be used in decoupled lookback - TileId dynamic_tile_id(tile_id_begin); - if (group.leader()) - { - tile_id_lacc[0] = dynamic_tile_id.fetch_inc(); - } - sycl::group_barrier(group); - std::uint32_t tile_id = tile_id_lacc[0]; - - // Global load into local - auto wg_current_offset = (tile_id*elems_in_tile); - auto wg_next_offset = ((tile_id+1)*elems_in_tile); - size_t wg_local_memory_size = elems_in_tile; - if (wg_current_offset >= n) - return; - if (wg_next_offset > n) - wg_local_memory_size = n - wg_current_offset; - - if (wg_next_offset <= n) { - _ONEDPL_PRAGMA_UNROLL - for (std::uint32_t i = 0; i < elems_per_workitem; ++i) - tile_vals[local_id + stride * i] = __in_rng[wg_current_offset + local_id + stride * i]; - } else { - for (std::uint32_t i = 0; i < elems_per_workitem; ++i) { - if (wg_current_offset + local_id + stride * i < n) - tile_vals[local_id + stride * i] = __in_rng[wg_current_offset + local_id + stride * i]; - } - } - sycl::group_barrier(group); - - auto in_begin = tile_vals.template get_multi_ptr().get(); - auto in_end = in_begin + wg_local_memory_size; - auto out_begin = __out_rng.begin() + wg_current_offset; - - auto local_sum = sycl::joint_reduce(group, in_begin, in_end, __binary_op); - _Type prev_sum = 0; - - // The first sub-group will query the previous tiles to find a prefix - if (subgroup.get_group_id() == 0) - { - LoopbackScanMemory<_Type> scan_mem(scan_memory_begin, num_wgs); - - if (group.leader()) - scan_mem.set_partial(tile_id, local_sum); - - // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum - prev_sum = cooperative_lookback()(tile_id, subgroup, __binary_op, scan_mem); - - if (group.leader()) - scan_mem.set_full(tile_id, prev_sum + local_sum); - } - - prev_sum = sycl::group_broadcast(group, prev_sum, 0); - sycl::joint_inclusive_scan(group, in_begin, in_end, out_begin, __binary_op, prev_sum); - }); + hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), + [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] + { + auto group = item.get_group(); + auto local_id = item.get_local_id(0); + auto stride = item.get_local_range(0); + auto subgroup = item.get_sub_group(); + + // Obtain unique ID for this work-group that will be used in decoupled lookback + TileId dynamic_tile_id(tile_id_begin); + if (group.leader()) + { + tile_id_lacc[0] = dynamic_tile_id.fetch_inc(); + } + sycl::group_barrier(group); + std::uint32_t tile_id = tile_id_lacc[0]; + + // Global load into local + auto wg_current_offset = (tile_id * elems_in_tile); + auto wg_next_offset = ((tile_id + 1) * elems_in_tile); + size_t wg_local_memory_size = elems_in_tile; + if (wg_current_offset >= n) + return; + if (wg_next_offset > n) + 
wg_local_memory_size = n - wg_current_offset; + + if (wg_next_offset <= n) + { + _ONEDPL_PRAGMA_UNROLL + for (std::uint32_t i = 0; i < elems_per_workitem; ++i) + tile_vals[local_id + stride * i] = + __in_rng[wg_current_offset + local_id + stride * i]; + } + else + { + for (std::uint32_t i = 0; i < elems_per_workitem; ++i) + { + if (wg_current_offset + local_id + stride * i < n) + tile_vals[local_id + stride * i] = + __in_rng[wg_current_offset + local_id + stride * i]; + } + } + sycl::group_barrier(group); + + auto in_begin = tile_vals.template get_multi_ptr().get(); + auto in_end = in_begin + wg_local_memory_size; + auto out_begin = __out_rng.begin() + wg_current_offset; + + auto local_sum = sycl::joint_reduce(group, in_begin, in_end, __binary_op); + _Type prev_sum = 0; + + // The first sub-group will query the previous tiles to find a prefix + if (subgroup.get_group_id() == 0) + { + _LookbackScanMemory scan_mem(scan_memory_begin, num_wgs); + + if (group.leader()) + scan_mem.set_partial(tile_id, local_sum); + + // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum + prev_sum = cooperative_lookback()(tile_id, subgroup, __binary_op, scan_mem); + + if (group.leader()) + scan_mem.set_full(tile_id, prev_sum + local_sum); + } + + prev_sum = sycl::group_broadcast(group, prev_sum, 0); + sycl::joint_inclusive_scan(group, in_begin, in_end, out_begin, __binary_op, prev_sum); + }); }); scratch.async_free(event); @@ -438,7 +555,25 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); auto __buf2 = __keep2(__out_begin, __out_begin + __n); - single_pass_scan_impl<_KernelParam, true>(__queue, __buf1.all_view(), __buf2.all_view(), __binary_op); + // Avoid aspect query overhead for sizeof(Types) > 32 bits + if constexpr (sizeof(typename std::iterator_traits<_InIterator>::value_type) <= sizeof(std::uint32_t)) + { + if (__queue.get_device().has(sycl::aspect::atomic64)) + { + single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::true_type>( + __queue, __buf1.all_view(), __buf2.all_view(), __binary_op); + } + else + { + single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::false_type>( + __queue, __buf1.all_view(), __buf2.all_view(), __binary_op); + } + } + else + { + single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::false_type>( + __queue, __buf1.all_view(), __buf2.all_view(), __binary_op); + } } } // inline namespace igpu From b5851cea3b280d5a1f0739b25a06e995d7ddce27 Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Thu, 23 Nov 2023 14:39:49 +0000 Subject: [PATCH 020/134] constexpr, types and remove an unneeded check --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 6dfe1bb6ef1..266d4b18657 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -364,8 +364,7 @@ struct cooperative_lookback using FlagT = typename _LookbackScanMemory::_FlagT; _T sum = 0; - int offset = -1; - int i = 0; + constexpr int offset = -1; int local_id = subgroup.get_local_id(); for (int tile = 
static_cast(tile_id) + offset; tile >= 0; tile -= SUBGROUP_SIZE) @@ -418,7 +417,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; constexpr ::std::size_t elems_per_workitem = _KernelParam::elems_per_workitem; // Avoid non_uniform n by padding up to a multiple of wgsize - ::std::uint32_t elems_in_tile = wgsize * elems_per_workitem; + constexpr ::std::uint32_t elems_in_tile = wgsize * elems_per_workitem; ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); ::std::size_t num_workitems = num_wgs * wgsize; @@ -461,8 +460,8 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { auto group = item.get_group(); - auto local_id = item.get_local_id(0); - auto stride = item.get_local_range(0); + ::std::uint32_t local_id = item.get_local_id(0); + constexpr ::std::uint32_t stride = wgsize; auto subgroup = item.get_sub_group(); // Obtain unique ID for this work-group that will be used in decoupled lookback @@ -477,9 +476,8 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou // Global load into local auto wg_current_offset = (tile_id * elems_in_tile); auto wg_next_offset = ((tile_id + 1) * elems_in_tile); - size_t wg_local_memory_size = elems_in_tile; - if (wg_current_offset >= n) - return; + auto wg_local_memory_size = elems_in_tile; + if (wg_next_offset > n) wg_local_memory_size = n - wg_current_offset; @@ -502,7 +500,6 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou sycl::group_barrier(group); auto in_begin = tile_vals.template get_multi_ptr().get(); - auto in_end = in_begin + wg_local_memory_size; auto out_begin = __out_rng.begin() + wg_current_offset; auto local_sum = sycl::joint_reduce(group, in_begin, in_end, __binary_op); From c9736c197d85d55703754a4e1791ec03545524e5 Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Thu, 23 Nov 2023 14:40:04 +0000 Subject: [PATCH 021/134] Correct static_cast ? 
--- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 266d4b18657..0655b60deb1 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -274,7 +274,7 @@ struct LookbackScanMemory<_T, /* UseAtomic64=*/::std::true_type> _T get_value(::std::size_t, _FlagT flag) const { - return static_cast<::std::uint32_t>(flag & VALUE_MASK); + return static_cast<_T>(flag & VALUE_MASK); } static _FlagT* From 0e450f79ffdbbb47c6aa71052d394e0fc57a73e8 Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Thu, 23 Nov 2023 14:40:19 +0000 Subject: [PATCH 022/134] Defer group comms in lookback --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 0655b60deb1..ce186b4ffa4 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -386,8 +386,7 @@ struct cooperative_lookback : _T{0}; // Sum all of the partial results from the tiles found, as well as the full contribution from the closest tile (if any) - sum += sycl::reduce_over_group(subgroup, contribution, bin_op); - + sum = bin_op(sum, contribution); // If we found a full value, we can stop looking at previous tiles. Otherwise, // keep going through tiles until we either find a full tile or we've completely // recomputed the prefix using partial values @@ -395,6 +394,7 @@ struct cooperative_lookback break; } + sum = sycl::reduce_over_group(subgroup, sum, bin_op); return sum; } From 95b55528d6730b245290d686ef53c97554d5e34b Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Thu, 23 Nov 2023 14:40:52 +0000 Subject: [PATCH 023/134] Disable dynamic tile ID by default TODO: we still allocate & initialize the memory for the counter --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 35 ++++++++++++------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index ce186b4ffa4..007186a2f9a 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -400,8 +400,8 @@ struct cooperative_lookback } }; -template +template void single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op) { @@ -464,14 +464,23 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou constexpr ::std::uint32_t stride = wgsize; auto subgroup = item.get_sub_group(); - // Obtain unique ID for this work-group that will be used in decoupled lookback - TileId dynamic_tile_id(tile_id_begin); - if (group.leader()) - { - tile_id_lacc[0] = dynamic_tile_id.fetch_inc(); - } - sycl::group_barrier(group); - std::uint32_t tile_id = tile_id_lacc[0]; + std::uint32_t tile_id; + if constexpr (std::is_same_v<_UseDynamicTileID, ::std::true_type>) + { + // Obtain unique ID for this work-group that will be used in decoupled lookback + TileId dynamic_tile_id(tile_id_begin); + if (group.leader()) + { + 
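                        // Only the group leader bumps the shared tile counter; the
                        // barrier below then publishes the id to the whole work-group
                        // through local memory. Numbering tiles with an atomic counter
                        // orders them by the time their work-group actually starts,
                        // which is the usual way a decoupled-lookback scan avoids a
                        // tile waiting on a predecessor that has not been scheduled
                        // yet; the static path below effectively assumes work-groups
                        // are launched in linear-id order.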
tile_id_lacc[0] = dynamic_tile_id.fetch_inc(); + } + sycl::group_barrier(group); + tile_id = tile_id_lacc[0]; + } + else + { + tile_id = group.get_group_linear_id(); + } + // Global load into local auto wg_current_offset = (tile_id * elems_in_tile); @@ -557,18 +566,18 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera { if (__queue.get_device().has(sycl::aspect::atomic64)) { - single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::true_type>( + single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::true_type, /* UseDynamicTileID */ std::false_type>( __queue, __buf1.all_view(), __buf2.all_view(), __binary_op); } else { - single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::false_type>( + single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::false_type, /* UseDynamicTileID */ std::false_type>( __queue, __buf1.all_view(), __buf2.all_view(), __binary_op); } } else { - single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::false_type>( + single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::false_type, /* UseDynamicTileID */ std::false_type>( __queue, __buf1.all_view(), __buf2.all_view(), __binary_op); } } From 3f30ec8e4eb0ee8fb9e9589a3ecb32cdda85370b Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Thu, 23 Nov 2023 14:41:32 +0000 Subject: [PATCH 024/134] Reduce from register sums instead of local mem Also use #pragma unroll for now --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 007186a2f9a..e43cfee6aa6 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -489,29 +489,36 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou if (wg_next_offset > n) wg_local_memory_size = n - wg_current_offset; - + _Type my_reducer{}; if (wg_next_offset <= n) { - _ONEDPL_PRAGMA_UNROLL + #pragma unroll for (std::uint32_t i = 0; i < elems_per_workitem; ++i) - tile_vals[local_id + stride * i] = - __in_rng[wg_current_offset + local_id + stride * i]; + { + _Type in_val = __in_rng[wg_current_offset + local_id + stride * i]; + my_reducer = __binary_op(my_reducer, in_val); + tile_vals[local_id + stride * i] = in_val; + } } else { + #pragma unroll for (std::uint32_t i = 0; i < elems_per_workitem; ++i) { if (wg_current_offset + local_id + stride * i < n) - tile_vals[local_id + stride * i] = - __in_rng[wg_current_offset + local_id + stride * i]; + { + _Type in_val = __in_rng[wg_current_offset + local_id + stride * i]; + my_reducer = __binary_op(my_reducer, in_val); + tile_vals[local_id + stride * i] = in_val; + } } } - sycl::group_barrier(group); + + auto local_sum = sycl::reduce_over_group(group, my_reducer, __binary_op); auto in_begin = tile_vals.template get_multi_ptr().get(); auto out_begin = __out_rng.begin() + wg_current_offset; - auto local_sum = sycl::joint_reduce(group, in_begin, in_end, __binary_op); _Type prev_sum = 0; // The first sub-group will query the previous tiles to find a prefix From c147f053768c4d38fb71b89be1416f58919b1d3a Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Thu, 23 Nov 2023 14:42:06 +0000 
Subject: [PATCH 025/134] Unrolled version of joint_inclusive_scan --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index e43cfee6aa6..68921c08c3c 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -536,8 +536,23 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou scan_mem.set_full(tile_id, prev_sum + local_sum); } - prev_sum = sycl::group_broadcast(group, prev_sum, 0); - sycl::joint_inclusive_scan(group, in_begin, in_end, out_begin, __binary_op, prev_sum); + _Type carry = sycl::group_broadcast(group, prev_sum, 0); + #pragma unroll + for (::std::uint32_t step = 0; step < elems_per_workitem; ++step) + { + ::std::uint32_t i = stride * step; + _Type x; + if (i + local_id < wg_local_memory_size) + { + x = in_begin[i + local_id]; + } + _Type out = sycl::inclusive_scan_over_group(group, x, __binary_op, carry); + if (i + local_id < wg_local_memory_size) + { + out_begin[i + local_id] = out; + } + carry = group_broadcast(group, out, stride - 1); + } }); }); From ab69568d21c5b6839cb10a7a69778519ff0ff430 Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Thu, 23 Nov 2023 15:25:30 +0000 Subject: [PATCH 026/134] Update include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Alberto Cabrera Pérez --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 68921c08c3c..dae5cd7a48e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -537,6 +537,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou } _Type carry = sycl::group_broadcast(group, prev_sum, 0); + // TODO: Find a fix for _ONEDPL_PRAGMA_UNROLL #pragma unroll for (::std::uint32_t step = 0; step < elems_per_workitem; ++step) { From b992b847972a1f2fc83ace6443b280029bee1b20 Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Thu, 23 Nov 2023 15:27:43 +0000 Subject: [PATCH 027/134] Add TODO --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index dae5cd7a48e..a85d86aeb31 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -489,6 +489,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou if (wg_next_offset > n) wg_local_memory_size = n - wg_current_offset; + //TODO: assumes default ctor produces identity w.r.t. 
__binary_op _Type my_reducer{}; if (wg_next_offset <= n) { From 37726be95a53dc76791b0fbbcc02dc39b8acda9c Mon Sep 17 00:00:00 2001 From: Alberto Cabrera Date: Tue, 28 Nov 2023 15:55:38 +0000 Subject: [PATCH 028/134] Changing fill kernel for a memset --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 39 ++++++++++--------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index a85d86aeb31..c1e1d2c0cbd 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -369,19 +369,20 @@ struct cooperative_lookback for (int tile = static_cast(tile_id) + offset; tile >= 0; tile -= SUBGROUP_SIZE) { - auto atomic_flag = memory.get_flag(tile - local_id); + auto atomic_flag = memory.get_flag(tile - local_id); // FlagT flag; do { flag = atomic_flag.load(); - } while (!sycl::all_of_group(subgroup, _LookbackScanMemory::is_ready(flag))); // Loop till all ready + } while (!sycl::all_of_group(subgroup, _LookbackScanMemory::is_ready(flag) || + (tile - local_id < 0))); // Loop till all ready bool is_full = _LookbackScanMemory::is_full(flag); auto is_full_ballot = sycl::ext::oneapi::group_ballot(subgroup, is_full); auto lowest_item_with_full = is_full_ballot.find_low(); // TODO: Use identity_fn for out of bounds values - _T contribution = local_id <= lowest_item_with_full && !_LookbackScanMemory::is_out_of_bounds(flag) + _T contribution = local_id <= lowest_item_with_full && (tile - local_id >= 0) ? memory.get_value(tile - local_id, flag) : _T{0}; @@ -434,21 +435,23 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou // fill_num_wgs num_elements + 1 to also initialize tile_id_counter ::std::size_t fill_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(num_elements + 1, wgsize); - auto fill_event = __queue.submit( - [&](sycl::handler& hdl) - { - hdl.parallel_for(sycl::nd_range<1>{fill_num_wgs * wgsize, wgsize}, - [=](const sycl::nd_item<1>& item) - { - int id = item.get_global_linear_id(); - if (id < num_elements) - status_flags_begin[id] = id < _LookbackScanMemory::padding - ? _LookbackScanMemory::OUT_OF_BOUNDS - : _LookbackScanMemory::NOT_READY; - if (id == num_elements) - tile_id_begin[0] = 0; - }); - }); + auto fill_event = __queue.memset(status_flags_begin, 0, num_elements * sizeof(_FlagT) + 1 * sizeof(_TileIdT)); + + // auto fill_event = __queue.submit( + // [&](sycl::handler& hdl) + // { + // hdl.parallel_for(sycl::nd_range<1>{fill_num_wgs * wgsize, wgsize}, + // [=](const sycl::nd_item<1>& item) + // { + // int id = item.get_global_linear_id(); + // if (id < num_elements) + // status_flags_begin[id] = id < _LookbackScanMemory::padding + // ? 
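        // Note on the memset above: it zero-fills the status flags and the tile-id
        // counter in one call, which is only equivalent to the old fill kernel if
        // NOT_READY is an all-zero bit pattern and the padding entries no longer
        // need a distinct OUT_OF_BOUNDS tag, consistent with the lookback spin now
        // also treating (tile - local_id < 0) as ready. The previous fill kernel is
        // kept commented out below for reference.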
_LookbackScanMemory::OUT_OF_BOUNDS + // : _LookbackScanMemory::NOT_READY; + // if (id == num_elements) + // tile_id_begin[0] = 0; + // }); + // }); auto event = __queue.submit([&](sycl::handler& hdl) { auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); From d7c3c7860ffa0de52b9fd92941fc78be9be7955e Mon Sep 17 00:00:00 2001 From: Alberto Cabrera Date: Wed, 29 Nov 2023 15:19:30 +0000 Subject: [PATCH 029/134] Single wg implementation --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 358 ++++++++++++------ 1 file changed, 234 insertions(+), 124 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index c1e1d2c0cbd..345da745608 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -401,6 +401,89 @@ struct cooperative_lookback } }; +template +void +single_pass_scan_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op) +{ + using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + + static_assert(std::is_same_v<_Inclusive, ::std::true_type>, "Single-pass scan only available for inclusive scan"); + + const ::std::size_t n = __in_rng.size(); + + constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; + constexpr ::std::size_t elems_per_workitem = _KernelParam::elems_per_workitem; + // Avoid non_uniform n by padding up to a multiple of wgsize + constexpr ::std::uint32_t elems_in_tile = wgsize * elems_per_workitem; + constexpr ::std::size_t num_workitems = wgsize; + + auto event = __queue.submit([&](sycl::handler& hdl) { + auto tile_vals = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); + + oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); + hdl.parallel_for( + sycl::nd_range<1>(num_workitems, wgsize), [= + ](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { + auto group = item.get_group(); + ::std::uint32_t local_id = item.get_local_id(0); + constexpr ::std::uint32_t stride = wgsize; + auto subgroup = item.get_sub_group(); + + constexpr std::uint32_t tile_id = 0; + constexpr std::uint32_t wg_begin = 0; + constexpr std::uint32_t wg_end = elems_in_tile; + + std::uint32_t wg_local_memory_size = elems_in_tile; + + auto out_begin = __out_rng.begin(); + _Type carry = 0; + + // Global load into local + if (wg_end > n) + wg_local_memory_size = n; + + //TODO: assumes default ctor produces identity w.r.t. 
__binary_op + // _Type my_reducer{}; + if (wg_end <= n) + { +#pragma unroll + for (std::uint32_t step = 0; step < elems_per_workitem; ++step) + { + ::std::uint32_t i = stride * step; + _Type in_val = __in_rng[i + local_id]; + // my_reducer = __binary_op(my_reducer, in_val); + _Type out = sycl::inclusive_scan_over_group(group, in_val, __binary_op, carry); + out_begin[i + local_id] = out; + carry = group_broadcast(group, out, stride - 1); + } + } + else + { +#pragma unroll + for (std::uint32_t step = 0; step < elems_per_workitem; ++step) + { + ::std::uint32_t i = stride * step; + _Type in_val; + + if (i + local_id < n) + { + in_val = __in_rng[i + local_id]; + // my_reducer = __binary_op(my_reducer, in_val); + } + _Type out = sycl::inclusive_scan_over_group(group, in_val, __binary_op, carry); + if (i + local_id < n) + { + out_begin[i + local_id] = out; + } + carry = group_broadcast(group, out, stride - 1); + } + } + }); + }); + + event.wait(); +} + template void @@ -437,128 +520,111 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou auto fill_event = __queue.memset(status_flags_begin, 0, num_elements * sizeof(_FlagT) + 1 * sizeof(_TileIdT)); - // auto fill_event = __queue.submit( - // [&](sycl::handler& hdl) - // { - // hdl.parallel_for(sycl::nd_range<1>{fill_num_wgs * wgsize, wgsize}, - // [=](const sycl::nd_item<1>& item) - // { - // int id = item.get_global_linear_id(); - // if (id < num_elements) - // status_flags_begin[id] = id < _LookbackScanMemory::padding - // ? _LookbackScanMemory::OUT_OF_BOUNDS - // : _LookbackScanMemory::NOT_READY; - // if (id == num_elements) - // tile_id_begin[0] = 0; - // }); - // }); - auto event = __queue.submit([&](sycl::handler& hdl) { auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); auto tile_vals = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); hdl.depends_on(fill_event); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); - hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), - [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] - { - auto group = item.get_group(); - ::std::uint32_t local_id = item.get_local_id(0); - constexpr ::std::uint32_t stride = wgsize; - auto subgroup = item.get_sub_group(); - - std::uint32_t tile_id; - if constexpr (std::is_same_v<_UseDynamicTileID, ::std::true_type>) - { - // Obtain unique ID for this work-group that will be used in decoupled lookback - TileId dynamic_tile_id(tile_id_begin); - if (group.leader()) - { - tile_id_lacc[0] = dynamic_tile_id.fetch_inc(); - } - sycl::group_barrier(group); - tile_id = tile_id_lacc[0]; - } - else - { - tile_id = group.get_group_linear_id(); - } - - - // Global load into local - auto wg_current_offset = (tile_id * elems_in_tile); - auto wg_next_offset = ((tile_id + 1) * elems_in_tile); - auto wg_local_memory_size = elems_in_tile; - - if (wg_next_offset > n) - wg_local_memory_size = n - wg_current_offset; - //TODO: assumes default ctor produces identity w.r.t. 
__binary_op - _Type my_reducer{}; - if (wg_next_offset <= n) - { - #pragma unroll - for (std::uint32_t i = 0; i < elems_per_workitem; ++i) - { - _Type in_val = __in_rng[wg_current_offset + local_id + stride * i]; - my_reducer = __binary_op(my_reducer, in_val); - tile_vals[local_id + stride * i] = in_val; - } - } - else - { - #pragma unroll - for (std::uint32_t i = 0; i < elems_per_workitem; ++i) - { - if (wg_current_offset + local_id + stride * i < n) - { - _Type in_val = __in_rng[wg_current_offset + local_id + stride * i]; - my_reducer = __binary_op(my_reducer, in_val); - tile_vals[local_id + stride * i] = in_val; - } - } - } - - auto local_sum = sycl::reduce_over_group(group, my_reducer, __binary_op); - - auto in_begin = tile_vals.template get_multi_ptr().get(); - auto out_begin = __out_rng.begin() + wg_current_offset; - - _Type prev_sum = 0; - - // The first sub-group will query the previous tiles to find a prefix - if (subgroup.get_group_id() == 0) - { - _LookbackScanMemory scan_mem(scan_memory_begin, num_wgs); - - if (group.leader()) - scan_mem.set_partial(tile_id, local_sum); - - // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum - prev_sum = cooperative_lookback()(tile_id, subgroup, __binary_op, scan_mem); - - if (group.leader()) - scan_mem.set_full(tile_id, prev_sum + local_sum); - } - - _Type carry = sycl::group_broadcast(group, prev_sum, 0); - // TODO: Find a fix for _ONEDPL_PRAGMA_UNROLL - #pragma unroll - for (::std::uint32_t step = 0; step < elems_per_workitem; ++step) - { - ::std::uint32_t i = stride * step; - _Type x; - if (i + local_id < wg_local_memory_size) - { - x = in_begin[i + local_id]; - } - _Type out = sycl::inclusive_scan_over_group(group, x, __binary_op, carry); - if (i + local_id < wg_local_memory_size) - { - out_begin[i + local_id] = out; - } - carry = group_broadcast(group, out, stride - 1); - } - }); + hdl.parallel_for( + sycl::nd_range<1>(num_workitems, wgsize), [= + ](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { + auto group = item.get_group(); + ::std::uint32_t local_id = item.get_local_id(0); + constexpr ::std::uint32_t stride = wgsize; + auto subgroup = item.get_sub_group(); + + std::uint32_t tile_id; + if constexpr (std::is_same_v<_UseDynamicTileID, ::std::true_type>) + { + // Obtain unique ID for this work-group that will be used in decoupled lookback + TileId dynamic_tile_id(tile_id_begin); + if (group.leader()) + { + tile_id_lacc[0] = dynamic_tile_id.fetch_inc(); + } + sycl::group_barrier(group); + tile_id = tile_id_lacc[0]; + } + else + { + tile_id = group.get_group_linear_id(); + } + + // Global load into local + auto wg_current_offset = (tile_id * elems_in_tile); + auto wg_next_offset = ((tile_id + 1) * elems_in_tile); + auto wg_local_memory_size = elems_in_tile; + + if (wg_next_offset > n) + wg_local_memory_size = n - wg_current_offset; + //TODO: assumes default ctor produces identity w.r.t. 
__binary_op + _Type my_reducer{}; + if (wg_next_offset <= n) + { +#pragma unroll + for (std::uint32_t i = 0; i < elems_per_workitem; ++i) + { + _Type in_val = __in_rng[wg_current_offset + local_id + stride * i]; + my_reducer = __binary_op(my_reducer, in_val); + tile_vals[local_id + stride * i] = in_val; + } + } + else + { +#pragma unroll + for (std::uint32_t i = 0; i < elems_per_workitem; ++i) + { + if (wg_current_offset + local_id + stride * i < n) + { + _Type in_val = __in_rng[wg_current_offset + local_id + stride * i]; + my_reducer = __binary_op(my_reducer, in_val); + tile_vals[local_id + stride * i] = in_val; + } + } + } + + auto local_sum = sycl::reduce_over_group(group, my_reducer, __binary_op); + + auto in_begin = tile_vals.template get_multi_ptr().get(); + auto out_begin = __out_rng.begin() + wg_current_offset; + + _Type prev_sum = 0; + + // The first sub-group will query the previous tiles to find a prefix + if (subgroup.get_group_id() == 0) + { + _LookbackScanMemory scan_mem(scan_memory_begin, num_wgs); + + if (group.leader()) + scan_mem.set_partial(tile_id, local_sum); + + // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum + prev_sum = cooperative_lookback()(tile_id, subgroup, __binary_op, scan_mem); + + if (group.leader()) + scan_mem.set_full(tile_id, prev_sum + local_sum); + } + + _Type carry = sycl::group_broadcast(group, prev_sum, 0); +// TODO: Find a fix for _ONEDPL_PRAGMA_UNROLL +#pragma unroll + for (::std::uint32_t step = 0; step < elems_per_workitem; ++step) + { + ::std::uint32_t i = stride * step; + _Type x; + if (i + local_id < wg_local_memory_size) + { + x = in_begin[i + local_id]; + } + _Type out = sycl::inclusive_scan_over_group(group, x, __binary_op, carry); + if (i + local_id < wg_local_memory_size) + { + out_begin[i + local_id] = out; + } + carry = group_broadcast(group, out, stride - 1); + } + }); }); scratch.async_free(event); @@ -575,9 +641,10 @@ struct kernel_param using kernel_name = KernelName; }; -template +template void -single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _BinaryOp __binary_op) +single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, + _BinaryOp __binary_op) { auto __n = __in_end - __in_begin; @@ -593,19 +660,62 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera { if (__queue.get_device().has(sycl::aspect::atomic64)) { - single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::true_type, /* UseDynamicTileID */ std::false_type>( - __queue, __buf1.all_view(), __buf2.all_view(), __binary_op); + single_pass_scan_impl<_KernelParam, _Inclusive, /* UseAtomic64 */ std::true_type, + /* UseDynamicTileID */ std::false_type>(__queue, __buf1.all_view(), __buf2.all_view(), + __binary_op); } else { - single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::false_type, /* UseDynamicTileID */ std::false_type>( - __queue, __buf1.all_view(), __buf2.all_view(), __binary_op); + single_pass_scan_impl<_KernelParam, _Inclusive, /* UseAtomic64 */ std::false_type, + /* UseDynamicTileID */ std::false_type>(__queue, __buf1.all_view(), __buf2.all_view(), + __binary_op); } } else { - single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::false_type, /* UseDynamicTileID */ std::false_type>( - __queue, __buf1.all_view(), 
__buf2.all_view(), __binary_op); + single_pass_scan_impl<_KernelParam, _Inclusive, /* UseAtomic64 */ std::false_type, + /* UseDynamicTileID */ std::false_type>(__queue, __buf1.all_view(), __buf2.all_view(), + __binary_op); + } +} + +template +void +single_pass_single_wg_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, + _OutIterator __out_begin, _BinaryOp __binary_op) +{ + auto __n = __in_end - __in_begin; + + auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); + auto __buf1 = __keep1(__in_begin, __in_end); + auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); + auto __buf2 = __keep2(__out_begin, __out_begin + __n); + + // Avoid aspect query overhead for sizeof(Types) > 32 bits + single_pass_scan_impl_single_wg<_KernelParam, /* Inclusive */ std::true_type>(__queue, __buf1.all_view(), + __buf2.all_view(), __binary_op); +} + +template +void +single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, + _BinaryOp __binary_op) +{ + constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; + constexpr ::std::size_t elems_per_workitem = _KernelParam::elems_per_workitem; + // Avoid non_uniform n by padding up to a multiple of wgsize + constexpr ::std::uint32_t elems_in_tile = wgsize * elems_per_workitem; + auto __n = __in_end - __in_begin; + + if (__n <= elems_in_tile) + { + single_pass_single_wg_inclusive_scan<_KernelParam, /* Inclusive */ std::true_type>( + __queue, __in_begin, __in_end, __out_begin, __binary_op); + } + else + { + single_pass_inclusive_scan<_KernelParam, /* Inclusive */ std::true_type>(__queue, __in_begin, __in_end, + __out_begin, __binary_op); } } From e42e68dfa0f3c722df13fce98f802222893df94b Mon Sep 17 00:00:00 2001 From: "aidan.belton" Date: Tue, 21 Nov 2023 10:38:29 +0000 Subject: [PATCH 030/134] Add phase 1 --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 123 ++++++++++++++++++ .../numeric/numeric.ops/copy_if_kt.pass.cpp | 77 +++++++++++ 2 files changed, 200 insertions(+) create mode 100644 test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 345da745608..c6da15a17b0 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -719,6 +719,129 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera } } +// Load function to try and get some PVC perf w/ coalesced +template +inline Tp load(sycl::sub_group sg, _InRange src, size_t i, size_t wg_stride, size_t wg_group_id) { + // if constexpr (std::is_arithmetic_v) { + // return sg.load(src.begin() + i / SUBGROUP_SIZE + wg_stride * wg_group_id); + // } + return src[i + wg_stride * wg_group_id]; +} + +// Load with checking for the subgroup case +template +inline Tp load(sycl::sub_group sg, _InRange src, size_t i, size_t wg_stride, size_t wg_group_id, size_t input_size) { + // if constexpr (std::is_arithmetic_v) { + // if (i / SUBGROUP_SIZE + SUBGROUP_SIZE + wg_stride * wg_group_id <= input_size) + // return sg.load(src.begin() + i / SUBGROUP_SIZE + wg_stride * wg_group_id); + // return src[i + wg_stride * wg_group_id]; + // } + return src[i + wg_stride * wg_group_id]; +} + +template +void +single_pass_copy_if_impl(sycl::queue __queue, 
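// Overview (descriptive note): the copy_if kernel is organised as three phases
// per tile (work-group).
//   Phase 1: stream the tile through the predicate and compact the selected
//            values into local memory, using an exclusive scan over the group
//            to give every selected element a stable slot.
//   Phase 2: combine the per-tile selected counts with a decoupled-lookback
//            scan (the same machinery as the inclusive scan above) to obtain
//            each tile's starting offset in the output.
//   Phase 3: copy the compacted values from local memory to that offset, with
//            the last tile recording the total number of selected elements.
// This commit implements Phase 1; the following commits add Phases 2 and 3.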
_InRange&& __in_rng, _OutRange&& __out_rng, _UnaryPredicate pred) +{ + using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + + const ::std::size_t n = __in_rng.size(); + + constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; + constexpr ::std::size_t elems_per_workitem = _KernelParam::elems_per_workitem; + + // Avoid non_uniform n by padding up to a multiple of wgsize + std::uint32_t elems_in_tile = wgsize * elems_per_workitem; + ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); + ::std::size_t num_workitems = num_wgs * wgsize; + + auto event = __queue.submit([&](sycl::handler& hdl) { + auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_per_workitem*wgsize}, hdl); + auto l_wg_count = sycl::local_accessor(sycl::range<1>{1}, hdl); + + oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); + hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { + auto group = item.get_group(); + auto wg_group_id = item.get_group(0); + auto wg_local_id = item.get_local_id(0); + auto sg = item.get_sub_group(); + + // Must be a better way to init atomics + l_wg_count[0] = 0; + sycl::group_barrier(group); + sycl::atomic_ref wg_count(l_wg_count[0]); + + constexpr size_t elems_per_workgroup = elems_per_workitem*wgsize; + + // Phase 1: Create wg_count and construct in-order wg_copy_if_values + if ((wg_group_id + 1) * elems_per_workgroup <= n) { + #pragma unroll + for (size_t i = wg_local_id; i < elems_per_workgroup; i += wgsize) { + _Type val = load<_Type>(sg, __in_rng, i, elems_per_workgroup, wg_group_id); + + size_t satisfies_pred = pred(val); + //size_t satisfies_pred = 0; + size_t count = sycl::exclusive_scan_over_group(group, satisfies_pred, sycl::plus()); + + if (satisfies_pred) + wg_copy_if_values[count + wg_count.load()] = val; + + if (wg_local_id == (wgsize - 1)) + wg_count += (count + satisfies_pred); + sycl::group_barrier(group); + } + } + else { + // Edge of input, have to handle memory bounds + // Might have unneccessary group_barrier calls + #pragma unroll + for (size_t i = wg_local_id; i < elems_per_workgroup; i += wgsize) { + size_t satisfies_pred = 0; + _Type val; // TODO: alloca + if (i + elems_per_workgroup * wg_group_id < n) { + val = load<_Type>(sg, __in_rng, i, elems_per_workgroup, wg_group_id, n); + + satisfies_pred = pred(val); + } + size_t count = sycl::exclusive_scan_over_group(group, satisfies_pred, sycl::plus()); + + if (satisfies_pred) + wg_copy_if_values[count + wg_count.load()] = val; + + if (wg_local_id == (wgsize - 1)) + wg_count += (count + satisfies_pred); + sycl::group_barrier(group); + } + } + // Check behaviour + if (group.leader()) { + __out_rng[wg_group_id] = wg_count.load(); + } + + // Phase 2: Global scan across wg_count + + // Phase 3: copy values to global memory + }); + }); + event.wait(); +} + +template +void +single_pass_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _UnaryPredicate pred) +{ + auto __n = __in_end - __in_begin; + + auto __keep1 = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); + auto __buf1 = __keep1(__in_begin, __in_end); + auto __keep2 = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); + auto __buf2 = __keep2(__out_begin, __out_begin + __n); + + single_pass_copy_if_impl<_KernelParam>(__queue, __buf1.all_view(), 
__buf2.all_view(), pred); +} + } // inline namespace igpu } // namespace oneapi::dpl::experimental::kt diff --git a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp new file mode 100644 index 00000000000..459449d933d --- /dev/null +++ b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp @@ -0,0 +1,77 @@ +// -*- C++ -*- +//===-- scan.pass.cpp -----------------------------------------------------===// +// +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// This file incorporates work covered by the following copyright and permission +// notice: +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#include "support/test_config.h" + +#include _PSTL_TEST_HEADER(execution) +#include _PSTL_TEST_HEADER(numeric) + +int +main() +{ + bool all_passed = true; + sycl::queue q; + + for (int logn : {4, 8, 10, 12, 14}) + { + std::cout << "Testing 2^" << logn << std::endl; + int n = 1 << logn; + std::cout << "n:" << n << std::endl; + std::vector v(n, 0); + for (size_t i = 0; i < v.size(); ++i) + std::cout << v[i] << ","; + std::cout << std::endl; + + int* in_ptr = sycl::malloc_device(n, q); + int* out_ptr = sycl::malloc_device(n, q); + + constexpr int n_elements_per_workitem = 8; + + q.copy(v.data(), in_ptr, n).wait(); + using KernelParams = oneapi::dpl::experimental::kt::kernel_param; + oneapi::dpl::experimental::kt::single_pass_copy_if(q, in_ptr, in_ptr+n, out_ptr, [](int x) { return x == 0; }); + + std::vector tmp(n, 0); + q.copy(out_ptr, tmp.data(), n); + q.wait(); + + std::copy_if(v.begin(), v.end(), v.begin(), [](int x) { return x == 0; }); + + bool passed = true; + // for (size_t i = 0; i < n; ++i) + // { + // if (tmp[i] != v[i]) + // { + // passed = false; + // std::cout << "expected " << i << ' ' << v[i] << ' ' << tmp[i] << '\n'; + // } + // } + + // if (passed) + // std::cout << " passed" << std::endl; + // else + // std::cout << " failed" << std::endl; + + for (size_t i = 0; i < n/(n_elements_per_workitem*128) + 1; ++i) { + std::cout << "i:" << i << " count:" << tmp[i] << std::endl; + } + + all_passed &= passed; + sycl::free(in_ptr, q); + sycl::free(out_ptr, q); + } + + return !all_passed; +} From 54c0ae9a66a049d62dcac83e891e225ab3e50e1a Mon Sep 17 00:00:00 2001 From: "aidan.belton" Date: Mon, 27 Nov 2023 13:26:38 +0000 Subject: [PATCH 031/134] Add phase 2 --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 112 +++++++++++++++--- .../numeric/numeric.ops/copy_if_kt.pass.cpp | 6 +- 2 files changed, 99 insertions(+), 19 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index c6da15a17b0..5a9d3241574 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -739,11 +739,14 @@ inline Tp load(sycl::sub_group sg, _InRange src, size_t i, size_t wg_stride, siz return src[i + wg_stride * wg_group_id]; } -template +template void single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _UnaryPredicate pred) { using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + using _TileIdT = TileId::_TileIdT; + using _LookbackScanMemory = LookbackScanMemory<_Type, 
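// For copy_if the decoupled lookback scans element counts rather than data
// values: each tile publishes how many elements it selected, and the scanned
// prefix becomes that tile's starting offset into the output range.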
_UseAtomic64>; + using _FlagT = typename _LookbackScanMemory::_FlagT; const ::std::size_t n = __in_rng.size(); @@ -751,33 +754,87 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ constexpr ::std::size_t elems_per_workitem = _KernelParam::elems_per_workitem; // Avoid non_uniform n by padding up to a multiple of wgsize - std::uint32_t elems_in_tile = wgsize * elems_per_workitem; + constexpr std::uint32_t elems_in_tile = wgsize * elems_per_workitem; ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); ::std::size_t num_workitems = num_wgs * wgsize; + ScanMemoryManager<_Type, _UseAtomic64, LookbackScanMemory, TileId> scratch(__queue); + scratch.allocate(num_wgs); + + // Memory Structure: + // [Lookback Scan Memory, Tile Id Counter] + auto scan_memory_begin = scratch.scan_memory_ptr(); + auto status_flags_begin = _LookbackScanMemory::get_flags_begin(scan_memory_begin, num_wgs); + auto tile_id_begin = scratch.tile_id_ptr(); + + ::std::size_t num_elements = _LookbackScanMemory::get_num_elements(num_wgs); + // fill_num_wgs num_elements + 1 to also initialize tile_id_counter + ::std::size_t fill_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(num_elements + 1, wgsize); + + auto fill_event = __queue.submit( + [&](sycl::handler& hdl) + { + hdl.parallel_for(sycl::nd_range<1>{fill_num_wgs * wgsize, wgsize}, + [=](const sycl::nd_item<1>& item) + { + int id = item.get_global_linear_id(); + if (id < num_elements) + status_flags_begin[id] = id < _LookbackScanMemory::padding + ? _LookbackScanMemory::OUT_OF_BOUNDS + : _LookbackScanMemory::NOT_READY; + if (id == num_elements) + tile_id_begin[0] = 0; + }); + }); + auto event = __queue.submit([&](sycl::handler& hdl) { auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_per_workitem*wgsize}, hdl); auto l_wg_count = sycl::local_accessor(sycl::range<1>{1}, hdl); + auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); + auto tile_vals = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); + oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); - hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { + hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { auto group = item.get_group(); auto wg_group_id = item.get_group(0); auto wg_local_id = item.get_local_id(0); auto sg = item.get_sub_group(); + constexpr ::std::uint32_t stride = wgsize; + + // Init tile_id + std::uint32_t tile_id; + if constexpr (std::is_same_v<_UseDynamicTileID, ::std::true_type>) + { + // Obtain unique ID for this work-group that will be used in decoupled lookback + TileId dynamic_tile_id(tile_id_begin); + if (group.leader()) + { + tile_id_lacc[0] = dynamic_tile_id.fetch_inc(); + } + sycl::group_barrier(group); + tile_id = tile_id_lacc[0]; + } + else + { + tile_id = group.get_group_linear_id(); + } + + // Global load into local + auto wg_current_offset = (tile_id * elems_in_tile); + auto wg_next_offset = ((tile_id + 1) * elems_in_tile); + auto wg_local_memory_size = elems_in_tile; // Must be a better way to init atomics l_wg_count[0] = 0; sycl::group_barrier(group); sycl::atomic_ref wg_count(l_wg_count[0]); - constexpr size_t elems_per_workgroup = elems_per_workitem*wgsize; - // Phase 1: Create wg_count and construct in-order wg_copy_if_values - if ((wg_group_id + 1) * elems_per_workgroup 
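            // Fast path: the whole tile is in range, so no per-element bounds checks
            // are needed. On each pass over the group: evaluate the predicate,
            // exclusive-scan the 0/1 results to get a stable slot for every selected
            // value, append into local memory at (running count + slot), and have the
            // last work-item advance the running count for the next pass.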
<= n) { + if ((wg_group_id + 1) * elems_in_tile <= n) { #pragma unroll - for (size_t i = wg_local_id; i < elems_per_workgroup; i += wgsize) { - _Type val = load<_Type>(sg, __in_rng, i, elems_per_workgroup, wg_group_id); + for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { + _Type val = load<_Type>(sg, __in_rng, i, elems_in_tile, wg_group_id); size_t satisfies_pred = pred(val); //size_t satisfies_pred = 0; @@ -794,12 +851,12 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ else { // Edge of input, have to handle memory bounds // Might have unneccessary group_barrier calls - #pragma unroll - for (size_t i = wg_local_id; i < elems_per_workgroup; i += wgsize) { + //#pragma unroll + for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { size_t satisfies_pred = 0; _Type val; // TODO: alloca - if (i + elems_per_workgroup * wg_group_id < n) { - val = load<_Type>(sg, __in_rng, i, elems_per_workgroup, wg_group_id, n); + if (i + elems_in_tile * wg_group_id < n) { + val = load<_Type>(sg, __in_rng, i, elems_in_tile, wg_group_id, n); satisfies_pred = pred(val); } @@ -813,13 +870,36 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ sycl::group_barrier(group); } } + + // Phase 2: Global scan across wg_count + auto local_sum = wg_count.load(); + + auto in_begin = tile_vals.get_pointer(); + + _Type prev_sum = 0; + + // The first sub-group will query the previous tiles to find a prefix + if (sg.get_group_id() == 0) + { + _LookbackScanMemory scan_mem(scan_memory_begin, num_wgs); + + if (group.leader()) + scan_mem.set_partial(tile_id, local_sum); + + // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum + prev_sum = cooperative_lookback()(tile_id, sg, sycl::plus<_Type>(), scan_mem); + + if (group.leader()) + scan_mem.set_full(tile_id, prev_sum + local_sum); + } + + _Type carry = sycl::group_broadcast(group, prev_sum, 0); + // Check behaviour if (group.leader()) { - __out_rng[wg_group_id] = wg_count.load(); + __out_rng[wg_group_id] = carry; } - // Phase 2: Global scan across wg_count - // Phase 3: copy values to global memory }); }); @@ -839,7 +919,7 @@ single_pass_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __i oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); auto __buf2 = __keep2(__out_begin, __out_begin + __n); - single_pass_copy_if_impl<_KernelParam>(__queue, __buf1.all_view(), __buf2.all_view(), pred); + single_pass_copy_if_impl<_KernelParam, /* UseAtomic64 */ std::true_type, /* UseDynamicTileID */ std::true_type>(__queue, __buf1.all_view(), __buf2.all_view(), pred); } } // inline namespace igpu diff --git a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp index 459449d933d..917e88a7707 100644 --- a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp @@ -30,9 +30,9 @@ main() int n = 1 << logn; std::cout << "n:" << n << std::endl; std::vector v(n, 0); - for (size_t i = 0; i < v.size(); ++i) - std::cout << v[i] << ","; - std::cout << std::endl; + //for (size_t i = 0; i < v.size(); ++i) + // std::cout << v[i] << ","; + //std::cout << std::endl; int* in_ptr = sycl::malloc_device(n, q); int* out_ptr = sycl::malloc_device(n, q); From ba543ed4040c5faa4daca25cc77bb3679d15d4c6 Mon Sep 17 00:00:00 2001 From: Aidan Date: Tue, 28 Nov 2023 15:19:56 
+0000 Subject: [PATCH 032/134] Add phase 3 --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 67 ++++++++------- .../numeric/numeric.ops/copy_if_kt.pass.cpp | 86 ++++++++++++------- 2 files changed, 87 insertions(+), 66 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 5a9d3241574..63a59476234 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -721,27 +721,27 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera // Load function to try and get some PVC perf w/ coalesced template -inline Tp load(sycl::sub_group sg, _InRange src, size_t i, size_t wg_stride, size_t wg_group_id) { +inline Tp load(sycl::sub_group sg, _InRange src, size_t i, size_t wg_stride, size_t tile_id) { // if constexpr (std::is_arithmetic_v) { - // return sg.load(src.begin() + i / SUBGROUP_SIZE + wg_stride * wg_group_id); + // return sg.load(src.begin() + i / SUBGROUP_SIZE + wg_stride * tile_id); // } - return src[i + wg_stride * wg_group_id]; + return src[i + wg_stride * tile_id]; } // Load with checking for the subgroup case template -inline Tp load(sycl::sub_group sg, _InRange src, size_t i, size_t wg_stride, size_t wg_group_id, size_t input_size) { +inline Tp load(sycl::sub_group sg, _InRange src, size_t i, size_t wg_stride, size_t tile_id, size_t input_size) { // if constexpr (std::is_arithmetic_v) { - // if (i / SUBGROUP_SIZE + SUBGROUP_SIZE + wg_stride * wg_group_id <= input_size) - // return sg.load(src.begin() + i / SUBGROUP_SIZE + wg_stride * wg_group_id); - // return src[i + wg_stride * wg_group_id]; + // if (i / SUBGROUP_SIZE + SUBGROUP_SIZE + wg_stride * tile_id <= input_size) + // return sg.load(src.begin() + i / SUBGROUP_SIZE + wg_stride * tile_id); + // return src[i + wg_stride * tile_id]; // } - return src[i + wg_stride * wg_group_id]; + return src[i + wg_stride * tile_id]; } -template +template void -single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _UnaryPredicate pred) +single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, _UnaryPredicate pred) { using _Type = oneapi::dpl::__internal::__value_t<_InRange>; using _TileIdT = TileId::_TileIdT; @@ -793,11 +793,11 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); auto tile_vals = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); + hdl.depends_on(fill_event); - oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); - hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { + oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng, __num_rng); + hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { auto group = item.get_group(); - auto wg_group_id = item.get_group(0); auto wg_local_id = item.get_local_id(0); auto sg = item.get_sub_group(); constexpr ::std::uint32_t stride = wgsize; @@ -822,7 +822,6 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ // Global load into local auto wg_current_offset = (tile_id * elems_in_tile); - auto wg_next_offset = ((tile_id + 1) * 
elems_in_tile); auto wg_local_memory_size = elems_in_tile; // Must be a better way to init atomics @@ -831,10 +830,10 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ sycl::atomic_ref wg_count(l_wg_count[0]); // Phase 1: Create wg_count and construct in-order wg_copy_if_values - if ((wg_group_id + 1) * elems_in_tile <= n) { + if ((tile_id + 1) * elems_in_tile <= n) { #pragma unroll for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { - _Type val = load<_Type>(sg, __in_rng, i, elems_in_tile, wg_group_id); + _Type val = load<_Type>(sg, __in_rng, i, elems_in_tile, tile_id); size_t satisfies_pred = pred(val); //size_t satisfies_pred = 0; @@ -847,16 +846,15 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ wg_count += (count + satisfies_pred); sycl::group_barrier(group); } - } - else { + } else { // Edge of input, have to handle memory bounds // Might have unneccessary group_barrier calls - //#pragma unroll + #pragma unroll for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { size_t satisfies_pred = 0; _Type val; // TODO: alloca - if (i + elems_in_tile * wg_group_id < n) { - val = load<_Type>(sg, __in_rng, i, elems_in_tile, wg_group_id, n); + if (i + elems_in_tile * tile_id < n) { + val = load<_Type>(sg, __in_rng, i, elems_in_tile, tile_id, n); satisfies_pred = pred(val); } @@ -873,10 +871,8 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ // Phase 2: Global scan across wg_count auto local_sum = wg_count.load(); - auto in_begin = tile_vals.get_pointer(); - - _Type prev_sum = 0; + size_t prev_sum = 0; // The first sub-group will query the previous tiles to find a prefix if (sg.get_group_id() == 0) @@ -893,22 +889,23 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ scan_mem.set_full(tile_id, prev_sum + local_sum); } - _Type carry = sycl::group_broadcast(group, prev_sum, 0); + size_t start_idx = sycl::group_broadcast(group, prev_sum, 0); - // Check behaviour - if (group.leader()) { - __out_rng[wg_group_id] = carry; - } - // Phase 3: copy values to global memory + for (int i = wg_local_id; i < local_sum; i += wgsize) { + // Probably adjust method to try and get some perf on PVC for arithmetic types using sg.store + __out_rng[start_idx + i] = wg_copy_if_values[i]; + } + if (tile_id == (num_wgs - 1) && group.leader()) + __num_rng[0] = start_idx + local_sum; }); }); event.wait(); } -template +template void -single_pass_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _UnaryPredicate pred) +single_pass_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _NumSelectedRange __num_begin, _UnaryPredicate pred) { auto __n = __in_end - __in_begin; @@ -919,7 +916,11 @@ single_pass_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __i oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); auto __buf2 = __keep2(__out_begin, __out_begin + __n); - single_pass_copy_if_impl<_KernelParam, /* UseAtomic64 */ std::true_type, /* UseDynamicTileID */ std::true_type>(__queue, __buf1.all_view(), __buf2.all_view(), pred); + auto __keep_num = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _NumSelectedRange>(); + auto __buf_num = __keep2(__num_begin, __num_begin + 1); + + single_pass_copy_if_impl<_KernelParam, /* UseAtomic64 */ std::true_type, /* UseDynamicTileID */ 
std::true_type>(__queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), pred); } } // inline namespace igpu diff --git a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp index 917e88a7707..202f28fbaad 100644 --- a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp @@ -14,64 +14,84 @@ //===----------------------------------------------------------------------===// #include "support/test_config.h" +#include "support/utils.h" #include _PSTL_TEST_HEADER(execution) #include _PSTL_TEST_HEADER(numeric) -int -main() +using namespace TestUtils; + +template +class CopyIfKernel; + +template +bool test(Predicate pred, Generator gen) { bool all_passed = true; sycl::queue q; - for (int logn : {4, 8, 10, 12, 14}) + for (int logn : {4, 8, 10, 12, 14, 15, 18}) { - std::cout << "Testing 2^" << logn << std::endl; int n = 1 << logn; - std::cout << "n:" << n << std::endl; - std::vector v(n, 0); - //for (size_t i = 0; i < v.size(); ++i) - // std::cout << v[i] << ","; - //std::cout << std::endl; - int* in_ptr = sycl::malloc_device(n, q); - int* out_ptr = sycl::malloc_device(n, q); + Sequence in(n, [&](size_t k) -> T { + return gen(n ^ k); + }); + + Sequence std_out(n); + + T* in_ptr = sycl::malloc_device(n, q); + T* out_ptr = sycl::malloc_device(n, q); + size_t* out_num = sycl::malloc_device(1, q); constexpr int n_elements_per_workitem = 8; - q.copy(v.data(), in_ptr, n).wait(); - using KernelParams = oneapi::dpl::experimental::kt::kernel_param; - oneapi::dpl::experimental::kt::single_pass_copy_if(q, in_ptr, in_ptr+n, out_ptr, [](int x) { return x == 0; }); + q.copy(in.data(), in_ptr, n).wait(); + using KernelParams = oneapi::dpl::experimental::kt::kernel_param>; + oneapi::dpl::experimental::kt::single_pass_copy_if(q, in_ptr, in_ptr+n, out_ptr, out_num, pred); - std::vector tmp(n, 0); - q.copy(out_ptr, tmp.data(), n); + Sequence kt_out(n); + size_t num_selected = 0; + q.copy(out_ptr, kt_out.data(), n); + q.copy(out_num, &num_selected, 1); q.wait(); - std::copy_if(v.begin(), v.end(), v.begin(), [](int x) { return x == 0; }); + auto std_out_end = std::copy_if(in.begin(), in.end(), std_out.begin(), pred); bool passed = true; - // for (size_t i = 0; i < n; ++i) - // { - // if (tmp[i] != v[i]) - // { - // passed = false; - // std::cout << "expected " << i << ' ' << v[i] << ' ' << tmp[i] << '\n'; - // } - // } - - // if (passed) - // std::cout << " passed" << std::endl; - // else - // std::cout << " failed" << std::endl; - - for (size_t i = 0; i < n/(n_elements_per_workitem*128) + 1; ++i) { - std::cout << "i:" << i << " count:" << tmp[i] << std::endl; + if (num_selected != (std_out_end - std_out.begin())) { + passed = false; + std::cout << "Num selected wrong: expected " << (std_out_end - std_out.begin()) << " " << num_selected << "\n"; + } + + for (size_t i = 0; i < (std_out_end - std_out.begin()); ++i) + { + if (kt_out[i] != std_out[i]) + { + passed = false; + std::cout << "expected " << i << ' ' << std_out[i] << ' ' << kt_out[i] << '\n'; + } } + if (passed) + std::cout << " passed" << std::endl; + else + std::cout << " failed" << std::endl; + all_passed &= passed; sycl::free(in_ptr, q); sycl::free(out_ptr, q); + sycl::free(out_num, q); } return !all_passed; } + +int main() { + bool all_passed; + all_passed &= test([](const float64_t& x) { return x * x <= 1024; }, [](size_t j) { return ((j + 1) % 7 & 2) != 0 ? 
float64_t(j % 32) : float64_t(j % 33 + 34); }); + all_passed &= test([](const int&) { return true; }, [](size_t j) { return j; }); + all_passed &= test([](const std::int32_t& x) { return x != 42; }, [](size_t j) { return ((j + 1) % 5 & 2) != 0 ? std::int32_t(j + 1) : 42; }); + + return all_passed; +} From cdf74d0aed9c8f64bb5fc10e3ed96ecf1626732d Mon Sep 17 00:00:00 2001 From: Aidan Date: Tue, 5 Dec 2023 11:28:40 +0000 Subject: [PATCH 033/134] Add count datatype _SizeT --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 63a59476234..3d6289642bc 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -744,8 +744,9 @@ void single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, _UnaryPredicate pred) { using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + using _SizeT = uint64_t; using _TileIdT = TileId::_TileIdT; - using _LookbackScanMemory = LookbackScanMemory<_Type, _UseAtomic64>; + using _LookbackScanMemory = LookbackScanMemory<_SizeT, _UseAtomic64>; using _FlagT = typename _LookbackScanMemory::_FlagT; const ::std::size_t n = __in_rng.size(); @@ -758,7 +759,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); ::std::size_t num_workitems = num_wgs * wgsize; - ScanMemoryManager<_Type, _UseAtomic64, LookbackScanMemory, TileId> scratch(__queue); + ScanMemoryManager<_SizeT, _UseAtomic64, LookbackScanMemory, TileId> scratch(__queue); scratch.allocate(num_wgs); // Memory Structure: @@ -792,7 +793,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ auto l_wg_count = sycl::local_accessor(sycl::range<1>{1}, hdl); auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); - auto tile_vals = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); + auto tile_vals = sycl::local_accessor<_SizeT, 1>(sycl::range<1>{elems_in_tile}, hdl); hdl.depends_on(fill_event); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng, __num_rng); @@ -827,7 +828,8 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ // Must be a better way to init atomics l_wg_count[0] = 0; sycl::group_barrier(group); - sycl::atomic_ref wg_count(l_wg_count[0]); + sycl::atomic_ref<_SizeT, sycl::memory_order::acq_rel, sycl::memory_scope::work_group, sycl::access::address_space::local_space> wg_count(l_wg_count[0]); + sycl::group_barrier(group); // Phase 1: Create wg_count and construct in-order wg_copy_if_values if ((tile_id + 1) * elems_in_tile <= n) { @@ -835,9 +837,8 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { _Type val = load<_Type>(sg, __in_rng, i, elems_in_tile, tile_id); - size_t satisfies_pred = pred(val); - //size_t satisfies_pred = 0; - size_t count = sycl::exclusive_scan_over_group(group, satisfies_pred, sycl::plus()); + _SizeT satisfies_pred = pred(val); + _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, sycl::plus<_SizeT>()); if (satisfies_pred) wg_copy_if_values[count + wg_count.load()] = val; @@ -851,14 
+852,14 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ // Might have unneccessary group_barrier calls #pragma unroll for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { - size_t satisfies_pred = 0; + _SizeT satisfies_pred = 0; _Type val; // TODO: alloca if (i + elems_in_tile * tile_id < n) { val = load<_Type>(sg, __in_rng, i, elems_in_tile, tile_id, n); satisfies_pred = pred(val); } - size_t count = sycl::exclusive_scan_over_group(group, satisfies_pred, sycl::plus()); + _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, sycl::plus<_SizeT>()); if (satisfies_pred) wg_copy_if_values[count + wg_count.load()] = val; @@ -870,9 +871,9 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ } // Phase 2: Global scan across wg_count - auto local_sum = wg_count.load(); - auto in_begin = tile_vals.get_pointer(); - size_t prev_sum = 0; + _SizeT local_sum = wg_count.load(); + _SizeT* in_begin = tile_vals.get_pointer(); + _SizeT prev_sum = 0; // The first sub-group will query the previous tiles to find a prefix if (sg.get_group_id() == 0) @@ -883,13 +884,13 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ scan_mem.set_partial(tile_id, local_sum); // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum - prev_sum = cooperative_lookback()(tile_id, sg, sycl::plus<_Type>(), scan_mem); + prev_sum = cooperative_lookback()(tile_id, sg, sycl::plus<_SizeT>(), scan_mem); if (group.leader()) scan_mem.set_full(tile_id, prev_sum + local_sum); } - size_t start_idx = sycl::group_broadcast(group, prev_sum, 0); + _SizeT start_idx = sycl::group_broadcast(group, prev_sum, 0); // Phase 3: copy values to global memory for (int i = wg_local_id; i < local_sum; i += wgsize) { From c5670d813d8a1d9b77f9ad93e86e940a859b9a3b Mon Sep 17 00:00:00 2001 From: Aidan Date: Tue, 5 Dec 2023 12:24:51 +0000 Subject: [PATCH 034/134] Move away from atomics --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 36 +++++++------------ 1 file changed, 12 insertions(+), 24 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 3d6289642bc..60c2db24b78 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -789,11 +789,9 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ }); auto event = __queue.submit([&](sycl::handler& hdl) { - auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_per_workitem*wgsize}, hdl); - auto l_wg_count = sycl::local_accessor(sycl::range<1>{1}, hdl); + auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); - auto tile_vals = sycl::local_accessor<_SizeT, 1>(sycl::range<1>{elems_in_tile}, hdl); hdl.depends_on(fill_event); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng, __num_rng); @@ -825,11 +823,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ auto wg_current_offset = (tile_id * elems_in_tile); auto wg_local_memory_size = elems_in_tile; - // Must be a better way to init atomics - l_wg_count[0] = 0; - sycl::group_barrier(group); - sycl::atomic_ref<_SizeT, sycl::memory_order::acq_rel, 
sycl::memory_scope::work_group, sycl::access::address_space::local_space> wg_count(l_wg_count[0]); - sycl::group_barrier(group); + _SizeT wg_count = 0; // Phase 1: Create wg_count and construct in-order wg_copy_if_values if ((tile_id + 1) * elems_in_tile <= n) { @@ -838,14 +832,12 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ _Type val = load<_Type>(sg, __in_rng, i, elems_in_tile, tile_id); _SizeT satisfies_pred = pred(val); - _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, sycl::plus<_SizeT>()); + _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); if (satisfies_pred) - wg_copy_if_values[count + wg_count.load()] = val; + wg_copy_if_values[count] = val; - if (wg_local_id == (wgsize - 1)) - wg_count += (count + satisfies_pred); - sycl::group_barrier(group); + wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); } } else { // Edge of input, have to handle memory bounds @@ -859,20 +851,16 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ satisfies_pred = pred(val); } - _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, sycl::plus<_SizeT>()); + _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); if (satisfies_pred) - wg_copy_if_values[count + wg_count.load()] = val; + wg_copy_if_values[count] = val; - if (wg_local_id == (wgsize - 1)) - wg_count += (count + satisfies_pred); - sycl::group_barrier(group); + wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); } } // Phase 2: Global scan across wg_count - _SizeT local_sum = wg_count.load(); - _SizeT* in_begin = tile_vals.get_pointer(); _SizeT prev_sum = 0; // The first sub-group will query the previous tiles to find a prefix @@ -881,24 +869,24 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ _LookbackScanMemory scan_mem(scan_memory_begin, num_wgs); if (group.leader()) - scan_mem.set_partial(tile_id, local_sum); + scan_mem.set_partial(tile_id, wg_count); // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum prev_sum = cooperative_lookback()(tile_id, sg, sycl::plus<_SizeT>(), scan_mem); if (group.leader()) - scan_mem.set_full(tile_id, prev_sum + local_sum); + scan_mem.set_full(tile_id, prev_sum + wg_count); } _SizeT start_idx = sycl::group_broadcast(group, prev_sum, 0); // Phase 3: copy values to global memory - for (int i = wg_local_id; i < local_sum; i += wgsize) { + for (int i = wg_local_id; i < wg_count; i += wgsize) { // Probably adjust method to try and get some perf on PVC for arithmetic types using sg.store __out_rng[start_idx + i] = wg_copy_if_values[i]; } if (tile_id == (num_wgs - 1) && group.leader()) - __num_rng[0] = start_idx + local_sum; + __num_rng[0] = start_idx + wg_count; }); }); event.wait(); From 8918b42eeb91ffa6baac73f7fb5276d80e3d7758 Mon Sep 17 00:00:00 2001 From: Aidan Date: Tue, 5 Dec 2023 19:42:54 +0000 Subject: [PATCH 035/134] Sort out test logic --- test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp index 202f28fbaad..75769131522 100644 --- a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp 
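[Editor's note — illustrative sketch, not part of the patch series.] The "Move away from atomics" change above replaces the local-memory atomic counter with a carried group scan: each strided pass over the tile feeds the running total (wg_count) into sycl::exclusive_scan_over_group as the init value, and the updated total is broadcast from the last work-item. A minimal host-side model of that carry logic in plain C++ (sequential; the names chunk, carry and the sample predicate are invented for illustration and do not come from the patch):

#include <cstddef>
#include <vector>

int main()
{
    const std::size_t chunk = 4;                    // stands in for wgsize
    std::vector<int> in{5, 42, 7, 42, 42, 1, 2, 42};
    std::vector<int> out(in.size());
    auto pred = [](int x) { return x != 42; };      // sample predicate

    std::size_t carry = 0;                          // plays the role of wg_count
    for (std::size_t base = 0; base < in.size(); base += chunk)
    {
        std::size_t scan = 0;                       // exclusive scan position within the chunk
        for (std::size_t i = base; i < base + chunk && i < in.size(); ++i)
        {
            const std::size_t flag = pred(in[i]) ? 1 : 0;
            if (flag)
                out[carry + scan] = in[i];          // kernel: wg_copy_if_values[count] = val
            scan += flag;
        }
        carry += scan;                              // kernel: broadcast of count + satisfies_pred
    }
    // carry now holds the number of selected elements: 4 of the 8 inputs are kept.
    return carry == 4 ? 0 : 1;
}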
@@ -84,11 +84,11 @@ bool test(Predicate pred, Generator gen) sycl::free(out_num, q); } - return !all_passed; + return all_passed; } int main() { - bool all_passed; + bool all_passed = true; all_passed &= test([](const float64_t& x) { return x * x <= 1024; }, [](size_t j) { return ((j + 1) % 7 & 2) != 0 ? float64_t(j % 32) : float64_t(j % 33 + 34); }); all_passed &= test([](const int&) { return true; }, [](size_t j) { return j; }); all_passed &= test([](const std::int32_t& x) { return x != 42; }, [](size_t j) { return ((j + 1) % 5 & 2) != 0 ? std::int32_t(j + 1) : 42; }); From 625f3156d814ecb4d57666e35515098744fcaf15 Mon Sep 17 00:00:00 2001 From: Aidan Date: Tue, 5 Dec 2023 19:50:48 +0000 Subject: [PATCH 036/134] Remove unnecessary load and store functions --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 25 ++----------------- 1 file changed, 2 insertions(+), 23 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 60c2db24b78..68d11740df0 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -719,26 +719,6 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera } } -// Load function to try and get some PVC perf w/ coalesced -template -inline Tp load(sycl::sub_group sg, _InRange src, size_t i, size_t wg_stride, size_t tile_id) { - // if constexpr (std::is_arithmetic_v) { - // return sg.load(src.begin() + i / SUBGROUP_SIZE + wg_stride * tile_id); - // } - return src[i + wg_stride * tile_id]; -} - -// Load with checking for the subgroup case -template -inline Tp load(sycl::sub_group sg, _InRange src, size_t i, size_t wg_stride, size_t tile_id, size_t input_size) { - // if constexpr (std::is_arithmetic_v) { - // if (i / SUBGROUP_SIZE + SUBGROUP_SIZE + wg_stride * tile_id <= input_size) - // return sg.load(src.begin() + i / SUBGROUP_SIZE + wg_stride * tile_id); - // return src[i + wg_stride * tile_id]; - // } - return src[i + wg_stride * tile_id]; -} - template void single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, _UnaryPredicate pred) @@ -829,7 +809,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ if ((tile_id + 1) * elems_in_tile <= n) { #pragma unroll for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { - _Type val = load<_Type>(sg, __in_rng, i, elems_in_tile, tile_id); + _Type val = __in_rng[i + elems_in_tile * tile_id]; _SizeT satisfies_pred = pred(val); _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); @@ -847,7 +827,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ _SizeT satisfies_pred = 0; _Type val; // TODO: alloca if (i + elems_in_tile * tile_id < n) { - val = load<_Type>(sg, __in_rng, i, elems_in_tile, tile_id, n); + val = __in_rng[i + elems_in_tile * tile_id]; satisfies_pred = pred(val); } @@ -882,7 +862,6 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ // Phase 3: copy values to global memory for (int i = wg_local_id; i < wg_count; i += wgsize) { - // Probably adjust method to try and get some perf on PVC for arithmetic types using sg.store __out_rng[start_idx + i] = wg_copy_if_values[i]; } if (tile_id == (num_wgs - 1) && group.leader()) From ca7a8306df3c0ed75d3b3d9d958c8ec81d33a3b5 Mon Sep 17 00:00:00 
2001 From: Aidan Date: Wed, 6 Dec 2023 11:03:59 +0000 Subject: [PATCH 037/134] Release scratch mem --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 68d11740df0..db642fc7177 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -868,6 +868,9 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ __num_rng[0] = start_idx + wg_count; }); }); + + scratch.async_free(event); + event.wait(); } From 25238ebee912c4b1419b15c3aa0310cbf182ae9c Mon Sep 17 00:00:00 2001 From: Aidan Date: Fri, 8 Dec 2023 11:15:29 +0000 Subject: [PATCH 038/134] Add single wg copy if --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 108 +++++++++++++++++- 1 file changed, 106 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index db642fc7177..36e395b7285 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -79,6 +79,11 @@ struct ScanMemoryManager }); } + void free() + { + sycl::free(scratch, q); + } + private: ::std::uint8_t* scratch = nullptr; ::std::uint8_t* scan_memory_begin = nullptr; @@ -719,6 +724,86 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera } } +template +void +single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, _UnaryPredicate pred) +{ + using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + using _SizeT = uint64_t; + using _TileIdT = TileId::_TileIdT; + + const ::std::size_t n = __in_rng.size(); + + constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; + constexpr ::std::size_t elems_per_workitem = _KernelParam::elems_per_workitem; + + // Avoid non_uniform n by padding up to a multiple of wgsize + constexpr std::uint32_t elems_in_tile = wgsize * elems_per_workitem; + ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); + ::std::size_t num_workitems = num_wgs * wgsize; + + auto event = __queue.submit([&](sycl::handler& hdl) { + auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); + + oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng, __num_rng); + hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { + auto group = item.get_group(); + auto wg_local_id = item.get_local_id(0); + constexpr ::std::uint32_t stride = wgsize; + + // Global load into local + auto wg_current_offset = 0; + auto wg_local_memory_size = elems_in_tile; + + _SizeT wg_count = 0; + + // Phase 1: Create wg_count and construct in-order wg_copy_if_values + if (elems_in_tile <= n) { + #pragma unroll + for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { + _Type val = __in_rng[i]; + + _SizeT satisfies_pred = pred(val); + _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); + + if (satisfies_pred) + wg_copy_if_values[count] = val; + + wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); + } + } else { + // 
Edge of input, have to handle memory bounds + // Might have unneccessary group_barrier calls + #pragma unroll + for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { + _SizeT satisfies_pred = 0; + _Type val; // TODO: alloca + if (i < n) { + val = __in_rng[i]; + + satisfies_pred = pred(val); + } + _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); + + if (satisfies_pred) + wg_copy_if_values[count] = val; + + wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); + } + } + + // Phase 3: copy values to global memory + for (int i = wg_local_id; i < wg_count; i += wgsize) { + __out_rng[i] = wg_copy_if_values[i]; + } + if (group.leader()) + __num_rng[0] = wg_count; + }); + }); + + event.wait(); +} + template void single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, _UnaryPredicate pred) @@ -869,9 +954,28 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ }); }); - scratch.async_free(event); - event.wait(); + scratch.free(); +} + +template +void +single_pass_single_wg_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _NumSelectedRange __num_begin, _UnaryPredicate pred) +{ + auto __n = __in_end - __in_begin; + + auto __keep1 = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); + auto __buf1 = __keep1(__in_begin, __in_end); + auto __keep2 = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); + auto __buf2 = __keep2(__out_begin, __out_begin + __n); + + auto __keep_num = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _NumSelectedRange>(); + auto __buf_num = __keep2(__num_begin, __num_begin + 1); + + single_pass_copy_if_impl_single_wg<_KernelParam>(__queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), pred); } template From 2f2ccb2ba8d379fcc1569e149784c5566f6407e8 Mon Sep 17 00:00:00 2001 From: Aidan Date: Fri, 8 Dec 2023 11:38:45 +0000 Subject: [PATCH 039/134] Fix unrolls and use memset --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 36 ++++++------------- 1 file changed, 10 insertions(+), 26 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 36e395b7285..fcfb3ad1b84 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -753,13 +753,12 @@ single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _Ou // Global load into local auto wg_current_offset = 0; - auto wg_local_memory_size = elems_in_tile; _SizeT wg_count = 0; // Phase 1: Create wg_count and construct in-order wg_copy_if_values if (elems_in_tile <= n) { - #pragma unroll +#pragma unroll for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { _Type val = __in_rng[i]; @@ -774,7 +773,7 @@ single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _Ou } else { // Edge of input, have to handle memory bounds // Might have unneccessary group_barrier calls - #pragma unroll +#pragma unroll for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { _SizeT satisfies_pred = 0; _Type val; // TODO: alloca @@ -837,21 +836,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ // fill_num_wgs 
num_elements + 1 to also initialize tile_id_counter ::std::size_t fill_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(num_elements + 1, wgsize); - auto fill_event = __queue.submit( - [&](sycl::handler& hdl) - { - hdl.parallel_for(sycl::nd_range<1>{fill_num_wgs * wgsize, wgsize}, - [=](const sycl::nd_item<1>& item) - { - int id = item.get_global_linear_id(); - if (id < num_elements) - status_flags_begin[id] = id < _LookbackScanMemory::padding - ? _LookbackScanMemory::OUT_OF_BOUNDS - : _LookbackScanMemory::NOT_READY; - if (id == num_elements) - tile_id_begin[0] = 0; - }); - }); + auto fill_event = __queue.memset(status_flags_begin, 0, num_elements * sizeof(_FlagT) + 1 * sizeof(_TileIdT)); auto event = __queue.submit([&](sycl::handler& hdl) { auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); @@ -886,15 +871,14 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ // Global load into local auto wg_current_offset = (tile_id * elems_in_tile); - auto wg_local_memory_size = elems_in_tile; _SizeT wg_count = 0; // Phase 1: Create wg_count and construct in-order wg_copy_if_values if ((tile_id + 1) * elems_in_tile <= n) { - #pragma unroll - for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { - _Type val = __in_rng[i + elems_in_tile * tile_id]; +#pragma unroll + for (size_t i = 0; i < elems_in_tile; i += wgsize) { + _Type val = __in_rng[i + wg_local_id + elems_in_tile * tile_id]; _SizeT satisfies_pred = pred(val); _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); @@ -907,12 +891,12 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ } else { // Edge of input, have to handle memory bounds // Might have unneccessary group_barrier calls - #pragma unroll - for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { +#pragma unroll + for (size_t i = 0; i < elems_in_tile; i += wgsize) { _SizeT satisfies_pred = 0; _Type val; // TODO: alloca - if (i + elems_in_tile * tile_id < n) { - val = __in_rng[i + elems_in_tile * tile_id]; + if (i + wg_local_id + elems_in_tile * tile_id < n) { + val = __in_rng[i + wg_local_id + elems_in_tile * tile_id]; satisfies_pred = pred(val); } From 021fb9a1ef3e8f0ac5ca4fb2c1cacb15cdb1d0b4 Mon Sep 17 00:00:00 2001 From: Aidan Date: Fri, 8 Dec 2023 11:48:50 +0000 Subject: [PATCH 040/134] apply changes to single wg --- .../pstl/hetero/dpcpp/parallel_backend_sycl_scan.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index fcfb3ad1b84..60007e4566c 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -759,8 +759,8 @@ single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _Ou // Phase 1: Create wg_count and construct in-order wg_copy_if_values if (elems_in_tile <= n) { #pragma unroll - for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { - _Type val = __in_rng[i]; + for (size_t i = 0; i < elems_in_tile; i += wgsize) { + _Type val = __in_rng[i + wg_local_id]; _SizeT satisfies_pred = pred(val); _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); @@ -774,11 +774,11 @@ single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _Ou // Edge of input, have to 
handle memory bounds // Might have unneccessary group_barrier calls #pragma unroll - for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { + for (size_t i = 0; i < elems_in_tile; i += wgsize) { _SizeT satisfies_pred = 0; - _Type val; // TODO: alloca - if (i < n) { - val = __in_rng[i]; + _Type val = *std::launder(reinterpret_cast<_Type*>(alloca(sizeof(_Type)))); + if (i + wg_local_id < n) { + val = __in_rng[i + wg_local_id]; satisfies_pred = pred(val); } @@ -894,7 +894,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ #pragma unroll for (size_t i = 0; i < elems_in_tile; i += wgsize) { _SizeT satisfies_pred = 0; - _Type val; // TODO: alloca + _Type val = *std::launder(reinterpret_cast<_Type*>(alloca(sizeof(_Type)))); if (i + wg_local_id + elems_in_tile * tile_id < n) { val = __in_rng[i + wg_local_id + elems_in_tile * tile_id]; From c4b05a4cf175bdb24c38bd6fa85c0f862b2352b7 Mon Sep 17 00:00:00 2001 From: Aidan Date: Fri, 8 Dec 2023 15:31:31 +0000 Subject: [PATCH 041/134] Remove unused variables --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 60007e4566c..fcb539cab2b 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -741,6 +741,7 @@ single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _Ou constexpr std::uint32_t elems_in_tile = wgsize * elems_per_workitem; ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); ::std::size_t num_workitems = num_wgs * wgsize; + assert(num_wgs == 1); auto event = __queue.submit([&](sycl::handler& hdl) { auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); @@ -752,8 +753,6 @@ single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _Ou constexpr ::std::uint32_t stride = wgsize; // Global load into local - auto wg_current_offset = 0; - _SizeT wg_count = 0; // Phase 1: Create wg_count and construct in-order wg_copy_if_values @@ -869,9 +868,6 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ tile_id = group.get_group_linear_id(); } - // Global load into local - auto wg_current_offset = (tile_id * elems_in_tile); - _SizeT wg_count = 0; // Phase 1: Create wg_count and construct in-order wg_copy_if_values From 5d1ed8ec303544cdadcc7938b1b555231a05cb34 Mon Sep 17 00:00:00 2001 From: "aidan.belton" Date: Fri, 8 Dec 2023 15:35:41 +0000 Subject: [PATCH 042/134] Clang-format copy_if_kt commits --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 333 ++++++++++-------- .../numeric/numeric.ops/copy_if_kt.pass.cpp | 33 +- 2 files changed, 200 insertions(+), 166 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index fcb539cab2b..0838817fd4f 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -79,7 +79,8 @@ struct ScanMemoryManager }); } - void free() + void + free() { sycl::free(scratch, q); } @@ -724,9 +725,11 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera } } -template +template void 
-single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, _UnaryPredicate pred) +single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, + _NumSelectedRange __num_rng, _UnaryPredicate pred) { using _Type = oneapi::dpl::__internal::__value_t<_InRange>; using _SizeT = uint64_t; @@ -747,64 +750,76 @@ single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _Ou auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng, __num_rng); - hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { - auto group = item.get_group(); - auto wg_local_id = item.get_local_id(0); - constexpr ::std::uint32_t stride = wgsize; - - // Global load into local - _SizeT wg_count = 0; - - // Phase 1: Create wg_count and construct in-order wg_copy_if_values - if (elems_in_tile <= n) { + hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), + [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { + auto group = item.get_group(); + auto wg_local_id = item.get_local_id(0); + constexpr ::std::uint32_t stride = wgsize; + + // Global load into local + _SizeT wg_count = 0; + + // Phase 1: Create wg_count and construct in-order wg_copy_if_values + if (elems_in_tile <= n) + { #pragma unroll - for (size_t i = 0; i < elems_in_tile; i += wgsize) { - _Type val = __in_rng[i + wg_local_id]; - - _SizeT satisfies_pred = pred(val); - _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); - - if (satisfies_pred) - wg_copy_if_values[count] = val; - - wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); - } - } else { - // Edge of input, have to handle memory bounds - // Might have unneccessary group_barrier calls + for (size_t i = 0; i < elems_in_tile; i += wgsize) + { + _Type val = __in_rng[i + wg_local_id]; + + _SizeT satisfies_pred = pred(val); + _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, + sycl::plus<_SizeT>()); + + if (satisfies_pred) + wg_copy_if_values[count] = val; + + wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); + } + } + else + { + // Edge of input, have to handle memory bounds + // Might have unneccessary group_barrier calls #pragma unroll - for (size_t i = 0; i < elems_in_tile; i += wgsize) { - _SizeT satisfies_pred = 0; - _Type val = *std::launder(reinterpret_cast<_Type*>(alloca(sizeof(_Type)))); - if (i + wg_local_id < n) { - val = __in_rng[i + wg_local_id]; - - satisfies_pred = pred(val); - } - _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); - - if (satisfies_pred) - wg_copy_if_values[count] = val; - - wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); - } - } - - // Phase 3: copy values to global memory - for (int i = wg_local_id; i < wg_count; i += wgsize) { - __out_rng[i] = wg_copy_if_values[i]; - } - if (group.leader()) - __num_rng[0] = wg_count; - }); + for (size_t i = 0; i < elems_in_tile; i += wgsize) + { + _SizeT satisfies_pred = 0; + _Type val = *std::launder(reinterpret_cast<_Type*>(alloca(sizeof(_Type)))); + if (i + wg_local_id < n) + { + val = __in_rng[i + wg_local_id]; + + satisfies_pred = pred(val); + } + _SizeT count = 
sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, + sycl::plus<_SizeT>()); + + if (satisfies_pred) + wg_copy_if_values[count] = val; + + wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); + } + } + + // Phase 3: copy values to global memory + for (int i = wg_local_id; i < wg_count; i += wgsize) + { + __out_rng[i] = wg_copy_if_values[i]; + } + if (group.leader()) + __num_rng[0] = wg_count; + }); }); event.wait(); } -template +template void -single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, _UnaryPredicate pred) +single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, + _UnaryPredicate pred) { using _Type = oneapi::dpl::__internal::__value_t<_InRange>; using _SizeT = uint64_t; @@ -844,138 +859,150 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ hdl.depends_on(fill_event); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng, __num_rng); - hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { - auto group = item.get_group(); - auto wg_local_id = item.get_local_id(0); - auto sg = item.get_sub_group(); - constexpr ::std::uint32_t stride = wgsize; - - // Init tile_id - std::uint32_t tile_id; - if constexpr (std::is_same_v<_UseDynamicTileID, ::std::true_type>) - { - // Obtain unique ID for this work-group that will be used in decoupled lookback - TileId dynamic_tile_id(tile_id_begin); - if (group.leader()) - { - tile_id_lacc[0] = dynamic_tile_id.fetch_inc(); - } - sycl::group_barrier(group); - tile_id = tile_id_lacc[0]; - } - else - { - tile_id = group.get_group_linear_id(); - } - - _SizeT wg_count = 0; - - // Phase 1: Create wg_count and construct in-order wg_copy_if_values - if ((tile_id + 1) * elems_in_tile <= n) { + hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), + [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { + auto group = item.get_group(); + auto wg_local_id = item.get_local_id(0); + auto sg = item.get_sub_group(); + constexpr ::std::uint32_t stride = wgsize; + + // Init tile_id + std::uint32_t tile_id; + if constexpr (std::is_same_v<_UseDynamicTileID, ::std::true_type>) + { + // Obtain unique ID for this work-group that will be used in decoupled lookback + TileId dynamic_tile_id(tile_id_begin); + if (group.leader()) + { + tile_id_lacc[0] = dynamic_tile_id.fetch_inc(); + } + sycl::group_barrier(group); + tile_id = tile_id_lacc[0]; + } + else + { + tile_id = group.get_group_linear_id(); + } + + _SizeT wg_count = 0; + + // Phase 1: Create wg_count and construct in-order wg_copy_if_values + if ((tile_id + 1) * elems_in_tile <= n) + { #pragma unroll - for (size_t i = 0; i < elems_in_tile; i += wgsize) { - _Type val = __in_rng[i + wg_local_id + elems_in_tile * tile_id]; - - _SizeT satisfies_pred = pred(val); - _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); - - if (satisfies_pred) - wg_copy_if_values[count] = val; - - wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); - } - } else { - // Edge of input, have to handle memory bounds - // Might have unneccessary group_barrier calls + for (size_t i = 0; i < elems_in_tile; i += wgsize) + { + _Type val = __in_rng[i + wg_local_id + elems_in_tile * tile_id]; + + _SizeT satisfies_pred = pred(val); + _SizeT count = 
sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, + sycl::plus<_SizeT>()); + + if (satisfies_pred) + wg_copy_if_values[count] = val; + + wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); + } + } + else + { + // Edge of input, have to handle memory bounds + // Might have unneccessary group_barrier calls #pragma unroll - for (size_t i = 0; i < elems_in_tile; i += wgsize) { - _SizeT satisfies_pred = 0; - _Type val = *std::launder(reinterpret_cast<_Type*>(alloca(sizeof(_Type)))); - if (i + wg_local_id + elems_in_tile * tile_id < n) { - val = __in_rng[i + wg_local_id + elems_in_tile * tile_id]; - - satisfies_pred = pred(val); - } - _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); - - if (satisfies_pred) - wg_copy_if_values[count] = val; - - wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); - } - } - - // Phase 2: Global scan across wg_count - _SizeT prev_sum = 0; - - // The first sub-group will query the previous tiles to find a prefix - if (sg.get_group_id() == 0) - { - _LookbackScanMemory scan_mem(scan_memory_begin, num_wgs); - - if (group.leader()) - scan_mem.set_partial(tile_id, wg_count); - - // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum - prev_sum = cooperative_lookback()(tile_id, sg, sycl::plus<_SizeT>(), scan_mem); - - if (group.leader()) - scan_mem.set_full(tile_id, prev_sum + wg_count); - } - - _SizeT start_idx = sycl::group_broadcast(group, prev_sum, 0); - - // Phase 3: copy values to global memory - for (int i = wg_local_id; i < wg_count; i += wgsize) { - __out_rng[start_idx + i] = wg_copy_if_values[i]; - } - if (tile_id == (num_wgs - 1) && group.leader()) - __num_rng[0] = start_idx + wg_count; - }); + for (size_t i = 0; i < elems_in_tile; i += wgsize) + { + _SizeT satisfies_pred = 0; + _Type val = *std::launder(reinterpret_cast<_Type*>(alloca(sizeof(_Type)))); + if (i + wg_local_id + elems_in_tile * tile_id < n) + { + val = __in_rng[i + wg_local_id + elems_in_tile * tile_id]; + + satisfies_pred = pred(val); + } + _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, + sycl::plus<_SizeT>()); + + if (satisfies_pred) + wg_copy_if_values[count] = val; + + wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); + } + } + + // Phase 2: Global scan across wg_count + _SizeT prev_sum = 0; + + // The first sub-group will query the previous tiles to find a prefix + if (sg.get_group_id() == 0) + { + _LookbackScanMemory scan_mem(scan_memory_begin, num_wgs); + + if (group.leader()) + scan_mem.set_partial(tile_id, wg_count); + + // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum + prev_sum = cooperative_lookback()(tile_id, sg, sycl::plus<_SizeT>(), scan_mem); + + if (group.leader()) + scan_mem.set_full(tile_id, prev_sum + wg_count); + } + + _SizeT start_idx = sycl::group_broadcast(group, prev_sum, 0); + + // Phase 3: copy values to global memory + for (int i = wg_local_id; i < wg_count; i += wgsize) + { + __out_rng[start_idx + i] = wg_copy_if_values[i]; + } + if (tile_id == (num_wgs - 1) && group.leader()) + __num_rng[0] = start_idx + wg_count; + }); }); event.wait(); scratch.free(); } -template +template void -single_pass_single_wg_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _NumSelectedRange __num_begin, 
_UnaryPredicate pred) +single_pass_single_wg_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, + _OutIterator __out_begin, _NumSelectedRange __num_begin, _UnaryPredicate pred) { auto __n = __in_end - __in_begin; - auto __keep1 = - oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); + auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); auto __buf1 = __keep1(__in_begin, __in_end); - auto __keep2 = - oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); + auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); auto __buf2 = __keep2(__out_begin, __out_begin + __n); auto __keep_num = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _NumSelectedRange>(); auto __buf_num = __keep2(__num_begin, __num_begin + 1); - single_pass_copy_if_impl_single_wg<_KernelParam>(__queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), pred); + single_pass_copy_if_impl_single_wg<_KernelParam>(__queue, __buf1.all_view(), __buf2.all_view(), + __buf_num.all_view(), pred); } -template +template void -single_pass_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _NumSelectedRange __num_begin, _UnaryPredicate pred) +single_pass_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, + _NumSelectedRange __num_begin, _UnaryPredicate pred) { auto __n = __in_end - __in_begin; - auto __keep1 = - oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); + auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); auto __buf1 = __keep1(__in_begin, __in_end); - auto __keep2 = - oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); + auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); auto __buf2 = __keep2(__out_begin, __out_begin + __n); auto __keep_num = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _NumSelectedRange>(); auto __buf_num = __keep2(__num_begin, __num_begin + 1); - single_pass_copy_if_impl<_KernelParam, /* UseAtomic64 */ std::true_type, /* UseDynamicTileID */ std::true_type>(__queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), pred); + single_pass_copy_if_impl<_KernelParam, /* UseAtomic64 */ std::true_type, /* UseDynamicTileID */ std::true_type>( + __queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), pred); } } // inline namespace igpu diff --git a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp index 75769131522..a77b76491e7 100644 --- a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp @@ -24,8 +24,9 @@ using namespace TestUtils; template class CopyIfKernel; -template -bool test(Predicate pred, Generator gen) +template +bool +test(Predicate pred, Generator gen) { bool all_passed = true; sycl::queue q; @@ -34,9 +35,7 @@ bool test(Predicate pred, Generator gen) { int n = 1 << logn; - Sequence in(n, [&](size_t k) -> T { - return gen(n ^ k); - }); + Sequence in(n, [&](size_t k) -> T { return gen(n ^ k); }); Sequence std_out(n); @@ -47,8 +46,9 @@ bool 
test(Predicate pred, Generator gen) constexpr int n_elements_per_workitem = 8; q.copy(in.data(), in_ptr, n).wait(); - using KernelParams = oneapi::dpl::experimental::kt::kernel_param>; - oneapi::dpl::experimental::kt::single_pass_copy_if(q, in_ptr, in_ptr+n, out_ptr, out_num, pred); + using KernelParams = + oneapi::dpl::experimental::kt::kernel_param>; + oneapi::dpl::experimental::kt::single_pass_copy_if(q, in_ptr, in_ptr + n, out_ptr, out_num, pred); Sequence kt_out(n); size_t num_selected = 0; @@ -59,12 +59,14 @@ bool test(Predicate pred, Generator gen) auto std_out_end = std::copy_if(in.begin(), in.end(), std_out.begin(), pred); bool passed = true; - if (num_selected != (std_out_end - std_out.begin())) { + if (num_selected != (std_out_end - std_out.begin())) + { passed = false; - std::cout << "Num selected wrong: expected " << (std_out_end - std_out.begin()) << " " << num_selected << "\n"; + std::cout << "Num selected wrong: expected " << (std_out_end - std_out.begin()) << " " << num_selected + << "\n"; } - for (size_t i = 0; i < (std_out_end - std_out.begin()); ++i) + for (size_t i = 0; i < (std_out_end - std_out.begin()); ++i) { if (kt_out[i] != std_out[i]) { @@ -87,11 +89,16 @@ bool test(Predicate pred, Generator gen) return all_passed; } -int main() { +int +main() +{ bool all_passed = true; - all_passed &= test([](const float64_t& x) { return x * x <= 1024; }, [](size_t j) { return ((j + 1) % 7 & 2) != 0 ? float64_t(j % 32) : float64_t(j % 33 + 34); }); + all_passed &= + test([](const float64_t& x) { return x * x <= 1024; }, + [](size_t j) { return ((j + 1) % 7 & 2) != 0 ? float64_t(j % 32) : float64_t(j % 33 + 34); }); all_passed &= test([](const int&) { return true; }, [](size_t j) { return j; }); - all_passed &= test([](const std::int32_t& x) { return x != 42; }, [](size_t j) { return ((j + 1) % 5 & 2) != 0 ? std::int32_t(j + 1) : 42; }); + all_passed &= test([](const std::int32_t& x) { return x != 42; }, + [](size_t j) { return ((j + 1) % 5 & 2) != 0 ? 
std::int32_t(j + 1) : 42; }); return all_passed; } From 70e751a99ad1f3d9b2d681713a5392f1bb9adfa0 Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Wed, 29 Nov 2023 12:28:43 +0000 Subject: [PATCH 043/134] Enable pragma unroll for open-source DPC++ --- include/oneapi/dpl/pstl/onedpl_config.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/onedpl_config.h b/include/oneapi/dpl/pstl/onedpl_config.h index 450cae9a347..fff5f2405b5 100644 --- a/include/oneapi/dpl/pstl/onedpl_config.h +++ b/include/oneapi/dpl/pstl/onedpl_config.h @@ -123,7 +123,7 @@ // Enable loop unrolling pragmas where supported #if (__INTEL_LLVM_COMPILER || __INTEL_COMPILER || \ (!defined(__INTEL_LLVM_COMPILER) && !defined(__INTEL_COMPILER) && \ - ((_ONEDPL_GCC_VERSION >= 80000) || (_ONEDPL_CLANG_VERSION >= 30700)))) + ((_ONEDPL_GCC_VERSION >= 80000) || (_ONEDPL_CLANG_VERSION >= 40000)))) # define _ONEDPL_PRAGMA_UNROLL _ONEDPL_PRAGMA(unroll) #else //no pragma unroll # define _ONEDPL_PRAGMA_UNROLL From a9fdaa365ec80fc2cc2d640fb946cce227c1373d Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Fri, 18 Aug 2023 14:52:41 -0500 Subject: [PATCH 044/134] Start of single-pass scan kernel template --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 1 + .../hetero/dpcpp/parallel_backend_sycl_scan.h | 150 ++++++++++++++++++ .../numeric/numeric.ops/scan_kt.pass.cpp | 30 ++++ 3 files changed, 181 insertions(+) create mode 100644 include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h create mode 100644 test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index b006eae051b..162fcf2c282 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -46,6 +46,7 @@ #endif #include "sycl_traits.h" //SYCL traits specialization for some oneDPL types. +#include "parallel_backend_sycl_scan.h" namespace oneapi { diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h new file mode 100644 index 00000000000..4fc2dbe4d44 --- /dev/null +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -0,0 +1,150 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// This file incorporates work covered by the following copyright and permission +// notice: +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _ONEDPL_parallel_backend_sycl_scan_H +#define _ONEDPL_parallel_backend_sycl_scan_H + +namespace oneapi::dpl::experimental::igpu +{ + +template +struct __scan_status_flag +{ + using _AtomicRefT = sycl::atomic_ref<::std::uint32_t, sycl::memory_order::relaxed, sycl::memory_scope::device, sycl::access::address_space::global_space>; + static constexpr std::uint32_t partial_mask = 1 << (sizeof(std::uint32_t)*8 - 2); + static constexpr std::uint32_t full_mask = 1 << (sizeof(std::uint32_t)*8 - 1); + static constexpr std::uint32_t value_mask = ~(partial_mask | full_mask); + + __scan_status_flag(std::uint32_t* flags_begin, const std::uint32_t tile_id) + : atomic_flag(*(flags_begin + tile_id)) + { + + } + + void set_partial(std::uint32_t val) + { + atomic_flag.store(val | partial_mask); + } + + void set_full(std::uint32_t val) + { + atomic_flag.store(val | full_mask); + } + + _T lookback(const std::uint32_t tile_id, std::uint32_t* flags_begin) + { + _T sum = 0; + int i = 0; + for (std::int32_t tile = static_cast(tile_id) - 1; tile >= 0; --tile) + { + _AtomicRefT tile_atomic(*(flags_begin + tile)); + std::uint32_t tile_val = 0; + do { + tile_val = tile_atomic.load(); + } while (tile_val == 0); + + sum += tile_val & value_mask; + + // If this was a full value, we can stop looking at previous tiles. Otherwise, + // keep going through tiles until we either find a full tile or we've completely + // recomputed the prefix using partial values + if (tile_val & full_mask) + break; + } + return sum; + } + + _AtomicRefT atomic_flag; +}; + +template +void +single_pass_scan_impl(_Policy&& __exec, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op) +{ + using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + + const ::std::size_t n = __in_rng.size(); + auto __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec); + std::size_t num_wgs = __max_cu; + + std::size_t wgsize = n/__max_cu; + + std::uint32_t status_flags_buf_size = num_wgs+1; + sycl::buffer status_flags_buf(status_flags_buf_size); + + // TODO: this probably isn't the best way to do this + sycl::host_accessor status_flags(status_flags_buf); + for (std::size_t i = 0; i < status_flags_buf_size; ++i) + status_flags[i] = 0; + + + auto event = __exec.queue().submit([&](sycl::handler& hdl) { + auto status_flags = sycl::accessor(status_flags_buf, hdl); + auto tile_id_lacc = sycl::accessor(sycl::range<1>{1}, hdl); + + oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); + hdl.parallel_for(sycl::nd_range<1>(n, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(32)]] { + auto item_id = item.get_local_linear_id(); + auto group = item.get_group(); + + //std::uint32_t elems_in_tile = elems_per_item*wgsize; + std::uint32_t elems_in_tile = wgsize; + + // Obtain unique ID for this work-group that will be used in decoupled lookback + if (group.leader()) + { + sycl::atomic_ref<::std::uint32_t, sycl::memory_order::relaxed, sycl::memory_scope::device, sycl::access::address_space::global_space> idx_atomic(status_flags[status_flags_buf_size-1]); + tile_id_lacc[0] = idx_atomic.fetch_add(1); + } + sycl::group_barrier(group); + std::uint32_t tile_id = tile_id_lacc[0]; + + auto in_begin = __in_rng.begin() + (tile_id*elems_in_tile); + auto in_end = __in_rng.begin() + ((tile_id+1)*elems_in_tile); + auto out_begin = __out_rng.begin() + (tile_id*elems_in_tile); + + auto local_sum = sycl::joint_reduce(group, in_begin, in_end, 
__binary_op); + + __scan_status_flag<_Type> flag(status_flags.get_pointer(), tile_id); + flag.set_partial(local_sum); + + auto prev_sum = flag.lookback(tile_id, status_flags.get_pointer()); + flag.set_full(prev_sum + local_sum); + + sycl::joint_inclusive_scan(group, in_begin, in_end, out_begin, __binary_op, prev_sum); + }); + }); + + event.wait(); +} + +template +void +single_pass_inclusive_scan(_Policy&& __exec, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _BinaryOp __binary_op) +{ + auto __n = __in_end - __in_begin; + auto __keep1 = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); + auto __buf1 = __keep1(__in_begin, __in_end); + auto __keep2 = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); + auto __buf2 = __keep2(__out_begin, __out_begin + __n); + + single_pass_scan_impl(__exec, __buf1.all_view(), __buf2.all_view(), __binary_op); +} + +} // namespace oneapi::dpl::experimental::igpu + +#endif /* _ONEDPL_parallel_backend_sycl_scan_H */ diff --git a/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp new file mode 100644 index 00000000000..71a725563d4 --- /dev/null +++ b/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp @@ -0,0 +1,30 @@ +// -*- C++ -*- +//===-- scan.pass.cpp -----------------------------------------------------===// +// +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// This file incorporates work covered by the following copyright and permission +// notice: +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// +//===----------------------------------------------------------------------===// + +#include "support/test_config.h" + +#include _PSTL_TEST_HEADER(execution) +#include _PSTL_TEST_HEADER(numeric) + +int +main() +{ + int n = 1 << 16; + sycl::queue q; + int* in_ptr = sycl::malloc_device(n, q); + int* out_ptr = sycl::malloc_device(n, q); + oneapi::dpl::experimental::igpu::single_pass_inclusive_scan(oneapi::dpl::execution::dpcpp_default, in_ptr, in_ptr+n, out_ptr, ::std::plus()); + return 0; +} From dfef06f7db2bc09249df4a65769084ab26b7d2e1 Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Thu, 24 Aug 2023 08:48:42 -0500 Subject: [PATCH 045/134] Fix hang in inclusive scan --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 44 ++++++++++++++----- .../numeric/numeric.ops/scan_kt.pass.cpp | 30 ++++++++++++- 2 files changed, 61 insertions(+), 13 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 4fc2dbe4d44..e71398a44b7 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -16,9 +16,11 @@ #ifndef _ONEDPL_parallel_backend_sycl_scan_H #define _ONEDPL_parallel_backend_sycl_scan_H -namespace oneapi::dpl::experimental::igpu +namespace oneapi::dpl::experimental::kt { +inline namespace igpu { + template struct __scan_status_flag { @@ -69,28 +71,36 @@ struct __scan_status_flag _AtomicRefT atomic_flag; }; -template +template void -single_pass_scan_impl(_Policy&& __exec, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op) +single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op) { using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + static_assert(_Inclusive, "Single-pass scan only available for inclusive scan"); + const ::std::size_t n = __in_rng.size(); - auto __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec); - std::size_t num_wgs = __max_cu; + auto __max_cu = __queue.get_device().template get_info(); + //std::size_t num_wgs = __max_cu; + std::size_t num_wgs = 64; - std::size_t wgsize = n/__max_cu; + // TODO: use wgsize and iters per item from _KernelParam + std::size_t wgsize = n/num_wgs; std::uint32_t status_flags_buf_size = num_wgs+1; sycl::buffer status_flags_buf(status_flags_buf_size); // TODO: this probably isn't the best way to do this + { sycl::host_accessor status_flags(status_flags_buf); for (std::size_t i = 0; i < status_flags_buf_size; ++i) status_flags[i] = 0; + } + +// printf("launching kernel items=%lu wgs=%lu wgsize=%lu max_cu=%lu\n", n, num_wgs, wgsize, __max_cu); - auto event = __exec.queue().submit([&](sycl::handler& hdl) { + auto event = __queue.submit([&](sycl::handler& hdl) { auto status_flags = sycl::accessor(status_flags_buf, hdl); auto tile_id_lacc = sycl::accessor(sycl::range<1>{1}, hdl); @@ -121,6 +131,7 @@ single_pass_scan_impl(_Policy&& __exec, _InRange&& __in_rng, _OutRange&& __out_r flag.set_partial(local_sum); auto prev_sum = flag.lookback(tile_id, status_flags.get_pointer()); + //auto prev_sum = 0; flag.set_full(prev_sum + local_sum); sycl::joint_inclusive_scan(group, in_begin, in_end, out_begin, __binary_op, prev_sum); @@ -130,9 +141,18 @@ single_pass_scan_impl(_Policy&& __exec, _InRange&& __in_rng, _OutRange&& __out_r event.wait(); } -template +// The generic structure for configuring a kernel +template +struct kernel_param +{ + static constexpr std::uint16_t 
data_per_workitem = DataPerWorkItem; + static constexpr std::uint16_t workgroup_size = WorkGroupSize; + using kernel_name = KernelName; +}; + +template void -single_pass_inclusive_scan(_Policy&& __exec, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _BinaryOp __binary_op) +single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _BinaryOp __binary_op) { auto __n = __in_end - __in_begin; auto __keep1 = @@ -142,9 +162,11 @@ single_pass_inclusive_scan(_Policy&& __exec, _InIterator __in_begin, _InIterator oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); auto __buf2 = __keep2(__out_begin, __out_begin + __n); - single_pass_scan_impl(__exec, __buf1.all_view(), __buf2.all_view(), __binary_op); + single_pass_scan_impl<_KernelParam, true>(__queue, __buf1.all_view(), __buf2.all_view(), __binary_op); } -} // namespace oneapi::dpl::experimental::igpu +} // inline namespace igpu + +} // namespace oneapi::dpl::experimental::kt #endif /* _ONEDPL_parallel_backend_sycl_scan_H */ diff --git a/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp index 71a725563d4..4ae83a92041 100644 --- a/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp @@ -22,9 +22,35 @@ int main() { int n = 1 << 16; + std::vector v(n, 1); sycl::queue q; int* in_ptr = sycl::malloc_device(n, q); int* out_ptr = sycl::malloc_device(n, q); - oneapi::dpl::experimental::igpu::single_pass_inclusive_scan(oneapi::dpl::execution::dpcpp_default, in_ptr, in_ptr+n, out_ptr, ::std::plus()); - return 0; + + + q.copy(v.data(), in_ptr, n); + using KernelParams = oneapi::dpl::experimental::kt::kernel_param<128, 2, class ScanKernel>; + oneapi::dpl::experimental::kt::single_pass_inclusive_scan(q, in_ptr, in_ptr+n, out_ptr, ::std::plus()); + + std::vector tmp(n, 0); + q.copy(out_ptr, tmp.data(), n); + + std::inclusive_scan(v.begin(), v.end(), v.begin()); + + bool passed = true; + for (size_t i = 0; i < n; ++i) + { + if (tmp[i] != v[i]) + { + passed = false; + std::cout << "expected " << i << ' ' << v[i] << ' ' << tmp[i] << '\n'; + } + } + + if (passed) + std::cout << "passed" << std::endl; + else + std::cout << "failed" << std::endl; + + return !passed; } From 555f6f9b714c825ffd17577e75c2e2ad0f1c5786 Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Thu, 31 Aug 2023 06:18:55 -0700 Subject: [PATCH 046/134] Debug statements for scan kernel template --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 81 +++++++++++++------ 1 file changed, 58 insertions(+), 23 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index e71398a44b7..c70bbabb82b 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -82,56 +82,67 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou const ::std::size_t n = __in_rng.size(); auto __max_cu = __queue.get_device().template get_info(); //std::size_t num_wgs = __max_cu; - std::size_t num_wgs = 64; + std::size_t num_wgs = 256; // TODO: use wgsize and iters per item from _KernelParam - std::size_t wgsize = n/num_wgs; + //constexpr ::std::size_t __elems_per_item = _KernelParam::data_per_workitem; + constexpr ::std::size_t __elems_per_item = 2; + std::size_t 
wgsize = n/num_wgs/__elems_per_item; + std::size_t num_items = n/__elems_per_item; - std::uint32_t status_flags_buf_size = num_wgs+1; - sycl::buffer status_flags_buf(status_flags_buf_size); - // TODO: this probably isn't the best way to do this - { - sycl::host_accessor status_flags(status_flags_buf); - for (std::size_t i = 0; i < status_flags_buf_size; ++i) - status_flags[i] = 0; - } + std::uint32_t status_flags_size = num_wgs+1; + + uint32_t* status_flags = sycl::malloc_device(status_flags_size, __queue); + __queue.memset(status_flags, 0, status_flags_size * sizeof(uint32_t)); -// printf("launching kernel items=%lu wgs=%lu wgsize=%lu max_cu=%lu\n", n, num_wgs, wgsize, __max_cu); + //printf("launching kernel items=%lu wgs=%lu wgsize=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __max_cu); + /*printf("launching kernel items=%lu wgs=%lu wgsize=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __max_cu); + uint32_t* debug1 = sycl::malloc_device(status_flags_size, __queue); + uint32_t* debug2 = sycl::malloc_device(status_flags_size, __queue); + uint32_t* debug3 = sycl::malloc_device(status_flags_size, __queue); + uint32_t* debug4 = sycl::malloc_device(status_flags_size, __queue); + uint32_t* debug5 = sycl::malloc_device(status_flags_size, __queue);*/ auto event = __queue.submit([&](sycl::handler& hdl) { - auto status_flags = sycl::accessor(status_flags_buf, hdl); - auto tile_id_lacc = sycl::accessor(sycl::range<1>{1}, hdl); + auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); - hdl.parallel_for(sycl::nd_range<1>(n, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(32)]] { - auto item_id = item.get_local_linear_id(); + hdl.parallel_for(sycl::nd_range<1>(num_items, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(32)]] { auto group = item.get_group(); - //std::uint32_t elems_in_tile = elems_per_item*wgsize; - std::uint32_t elems_in_tile = wgsize; + std::uint32_t elems_in_tile = wgsize*__elems_per_item; // Obtain unique ID for this work-group that will be used in decoupled lookback if (group.leader()) { - sycl::atomic_ref<::std::uint32_t, sycl::memory_order::relaxed, sycl::memory_scope::device, sycl::access::address_space::global_space> idx_atomic(status_flags[status_flags_buf_size-1]); + sycl::atomic_ref<::std::uint32_t, sycl::memory_order::relaxed, sycl::memory_scope::device, sycl::access::address_space::global_space> idx_atomic(status_flags[status_flags_size-1]); tile_id_lacc[0] = idx_atomic.fetch_add(1); } sycl::group_barrier(group); std::uint32_t tile_id = tile_id_lacc[0]; + //debug5[group.get_local_id()] = tile_id; - auto in_begin = __in_rng.begin() + (tile_id*elems_in_tile); - auto in_end = __in_rng.begin() + ((tile_id+1)*elems_in_tile); - auto out_begin = __out_rng.begin() + (tile_id*elems_in_tile); + auto current_offset = (tile_id*elems_in_tile); + auto next_offset = ((tile_id+1)*elems_in_tile); + auto in_begin = __in_rng.begin() + current_offset; + auto in_end = __in_rng.begin() + next_offset; + auto out_begin = __out_rng.begin() + current_offset; + + //debug3[tile_id] = current_offset; + //debug4[tile_id] = next_offset; auto local_sum = sycl::joint_reduce(group, in_begin, in_end, __binary_op); + //auto local_sum = 0; + ///debug1[tile_id] = local_sum; - __scan_status_flag<_Type> flag(status_flags.get_pointer(), tile_id); + __scan_status_flag<_Type> flag(status_flags, tile_id); flag.set_partial(local_sum); - auto prev_sum = flag.lookback(tile_id, 
status_flags.get_pointer()); + auto prev_sum = flag.lookback(tile_id, status_flags); //auto prev_sum = 0; + //debug2[tile_id] = prev_sum; flag.set_full(prev_sum + local_sum); sycl::joint_inclusive_scan(group, in_begin, in_end, out_begin, __binary_op, prev_sum); @@ -139,6 +150,30 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou }); event.wait(); + +#if 0 + std::vector debug1v(status_flags_size); + std::vector debug2v(status_flags_size); + std::vector debug3v(status_flags_size); + std::vector debug4v(status_flags_size); + std::vector debug5v(status_flags_size); + __queue.memcpy(debug1v.data(), debug1, status_flags_size * sizeof(uint32_t)); + __queue.memcpy(debug2v.data(), debug2, status_flags_size * sizeof(uint32_t)); + __queue.memcpy(debug3v.data(), debug3, status_flags_size * sizeof(uint32_t)); + __queue.memcpy(debug4v.data(), debug4, status_flags_size * sizeof(uint32_t)); + __queue.memcpy(debug5v.data(), debug5, status_flags_size * sizeof(uint32_t)); + + for (int i = 0; i < status_flags_size-1; ++i) + std::cout << "local_sum " << i << " " << debug1v[i] << std::endl; + for (int i = 0; i < status_flags_size-1; ++i) + std::cout << "lookback " << i << " " << debug2v[i] << std::endl; + for (int i = 0; i < status_flags_size-1; ++i) + std::cout << "offset " << i << " " << debug3v[i] << std::endl; + for (int i = 0; i < status_flags_size-1; ++i) + std::cout << "end " << i << " " << debug4v[i] << std::endl; +#endif + + sycl::free(status_flags, __queue); } // The generic structure for configuring a kernel From 10cfc687cd7c1feb9c8315fb30ffab034289bee1 Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Wed, 6 Sep 2023 08:46:10 -0500 Subject: [PATCH 047/134] Update scan kernel template test --- .../numeric/numeric.ops/scan_kt.pass.cpp | 54 +++++++++++-------- 1 file changed, 31 insertions(+), 23 deletions(-) diff --git a/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp index 4ae83a92041..de5ecafc25b 100644 --- a/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp @@ -21,36 +21,44 @@ int main() { - int n = 1 << 16; - std::vector v(n, 1); - sycl::queue q; - int* in_ptr = sycl::malloc_device(n, q); - int* out_ptr = sycl::malloc_device(n, q); + bool all_passed = true; + for (int logn : {4, 8, 11, 16, 19, 21}) + { + std::cout << "Testing 2^" << logn << '\n'; + int n = 1 << logn; + std::vector v(n, 1); + sycl::queue q; + int* in_ptr = sycl::malloc_device(n, q); + int* out_ptr = sycl::malloc_device(n, q); - q.copy(v.data(), in_ptr, n); - using KernelParams = oneapi::dpl::experimental::kt::kernel_param<128, 2, class ScanKernel>; - oneapi::dpl::experimental::kt::single_pass_inclusive_scan(q, in_ptr, in_ptr+n, out_ptr, ::std::plus()); - std::vector tmp(n, 0); - q.copy(out_ptr, tmp.data(), n); + q.copy(v.data(), in_ptr, n); + using KernelParams = oneapi::dpl::experimental::kt::kernel_param<128, 2, class ScanKernel>; + oneapi::dpl::experimental::kt::single_pass_inclusive_scan(q, in_ptr, in_ptr+n, out_ptr, ::std::plus()); - std::inclusive_scan(v.begin(), v.end(), v.begin()); + std::vector tmp(n, 0); + q.copy(out_ptr, tmp.data(), n); - bool passed = true; - for (size_t i = 0; i < n; ++i) - { - if (tmp[i] != v[i]) + std::inclusive_scan(v.begin(), v.end(), v.begin()); + + bool passed = true; + for (size_t i = 0; i < n; ++i) { - passed = false; - std::cout << "expected " << i << ' ' << v[i] << ' ' << tmp[i] << '\n'; + if (tmp[i] != v[i]) + { + passed = false; + 
std::cout << "expected " << i << ' ' << v[i] << ' ' << tmp[i] << '\n'; + } } - } - if (passed) - std::cout << "passed" << std::endl; - else - std::cout << "failed" << std::endl; + if (passed) + std::cout << "passed" << std::endl; + else + std::cout << "failed" << std::endl; + + all_passed &= passed; + } - return !passed; + return !all_passed; } From 53faf10fe9e585654e77fe87a82826b370a7b2f6 Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Thu, 14 Sep 2023 09:08:55 -0700 Subject: [PATCH 048/134] Only have a single work-item per group query for previous tile status --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 27 +++++++++++++------ 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index c70bbabb82b..b01f56ac539 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -86,7 +86,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou // TODO: use wgsize and iters per item from _KernelParam //constexpr ::std::size_t __elems_per_item = _KernelParam::data_per_workitem; - constexpr ::std::size_t __elems_per_item = 2; + constexpr ::std::size_t __elems_per_item = 16; std::size_t wgsize = n/num_wgs/__elems_per_item; std::size_t num_items = n/__elems_per_item; @@ -96,14 +96,16 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou uint32_t* status_flags = sycl::malloc_device(status_flags_size, __queue); __queue.memset(status_flags, 0, status_flags_size * sizeof(uint32_t)); - //printf("launching kernel items=%lu wgs=%lu wgsize=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __max_cu); - /*printf("launching kernel items=%lu wgs=%lu wgsize=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __max_cu); +#if SCAN_KT_DEBUG + printf("launching kernel items=%lu wgs=%lu wgsize=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __max_cu); + printf("launching kernel items=%lu wgs=%lu wgsize=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __max_cu); uint32_t* debug1 = sycl::malloc_device(status_flags_size, __queue); uint32_t* debug2 = sycl::malloc_device(status_flags_size, __queue); uint32_t* debug3 = sycl::malloc_device(status_flags_size, __queue); uint32_t* debug4 = sycl::malloc_device(status_flags_size, __queue); - uint32_t* debug5 = sycl::malloc_device(status_flags_size, __queue);*/ + uint32_t* debug5 = sycl::malloc_device(status_flags_size, __queue); +#endif auto event = __queue.submit([&](sycl::handler& hdl) { auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); @@ -138,12 +140,21 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou ///debug1[tile_id] = local_sum; __scan_status_flag<_Type> flag(status_flags, tile_id); - flag.set_partial(local_sum); - auto prev_sum = flag.lookback(tile_id, status_flags); - //auto prev_sum = 0; + if (group.leader()) + flag.set_partial(local_sum); + + // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum + //sycl::reduce_over_group(item.get_subgroup()) + + auto prev_sum = 0; + + if (group.leader()) + prev_sum = flag.lookback(tile_id, status_flags); //debug2[tile_id] = prev_sum; - flag.set_full(prev_sum + local_sum); + + if (group.leader()) + flag.set_full(prev_sum + local_sum); sycl::joint_inclusive_scan(group, in_begin, in_end, out_begin, __binary_op, prev_sum); }); From 
dc63d16f8832f9c10a1ec6d76508d4c1b1f2c455 Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Mon, 18 Sep 2023 08:06:43 -0700 Subject: [PATCH 049/134] First attempt at parallel lookback --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 106 +++++++++++++++--- 1 file changed, 89 insertions(+), 17 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index b01f56ac539..27fdc1d09b4 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -24,13 +24,21 @@ inline namespace igpu { template struct __scan_status_flag { + // 00xxxx - not computed + // 01xxxx - partial + // 10xxxx - full + // 110000 - out of bounds + using _AtomicRefT = sycl::atomic_ref<::std::uint32_t, sycl::memory_order::relaxed, sycl::memory_scope::device, sycl::access::address_space::global_space>; static constexpr std::uint32_t partial_mask = 1 << (sizeof(std::uint32_t)*8 - 2); static constexpr std::uint32_t full_mask = 1 << (sizeof(std::uint32_t)*8 - 1); static constexpr std::uint32_t value_mask = ~(partial_mask | full_mask); + static constexpr std::uint32_t oob_value = partial_mask | full_mask; + + static constexpr int padding = 32; __scan_status_flag(std::uint32_t* flags_begin, const std::uint32_t tile_id) - : atomic_flag(*(flags_begin + tile_id)) + : atomic_flag(*(flags_begin + tile_id + padding)) { } @@ -42,16 +50,57 @@ struct __scan_status_flag void set_full(std::uint32_t val) { - atomic_flag.store(val | full_mask); + atomic_flag.store((val ^ partial_mask) | full_mask); + } + + template + _T cooperative_lookback(const std::uint32_t tile_id, const _Subgroup& subgroup, BinOp bin_op, std::uint32_t* flags_begin) + { + _T sum = 0; + int offset = -1; + int i = 0; + int local_id = subgroup.get_local_id(); + + for (int tile = static_cast(tile_id) + offset; tile >= 0; offset -= 32) + { + _AtomicRefT tile_atomic(*(flags_begin + tile + padding - local_id)); + std::uint32_t tile_val = 0; + do { + tile_val = tile_atomic.load(); + + //} while (!sycl::all_of_group(subgroup, tile_val != 0)); + } while (0); + + bool is_full = (tile_val & full_mask) && ((tile_val & partial_mask) == 0); + auto is_full_ballot = sycl::ext::oneapi::group_ballot(subgroup, is_full); + ::std::uint32_t is_full_ballot_bits{}; + is_full_ballot.extract_bits(is_full_ballot_bits); + + auto lowest_item_with_full = sycl::ctz(is_full_ballot_bits); + _T contribution = local_id <= lowest_item_with_full ? tile_val & value_mask : _T{}; + + // Sum all of the partial results from the tiles found, as well as the full contribution from the closest tile (if any) + sum += sycl::reduce_over_group(subgroup, contribution, bin_op); + + // If we found a full value, we can stop looking at previous tiles. 
Otherwise, + // keep going through tiles until we either find a full tile or we've completely + // recomputed the prefix using partial values + if (is_full_ballot_bits) + break; + + //if (i++ > 10) break; + } + return sum; } +#if 0 _T lookback(const std::uint32_t tile_id, std::uint32_t* flags_begin) { _T sum = 0; int i = 0; for (std::int32_t tile = static_cast(tile_id) - 1; tile >= 0; --tile) { - _AtomicRefT tile_atomic(*(flags_begin + tile)); + _AtomicRefT tile_atomic(*(flags_begin + tile + padding)); std::uint32_t tile_val = 0; do { tile_val = tile_atomic.load(); @@ -67,6 +116,7 @@ struct __scan_status_flag } return sum; } +#endif _AtomicRefT atomic_flag; }; @@ -86,15 +136,28 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou // TODO: use wgsize and iters per item from _KernelParam //constexpr ::std::size_t __elems_per_item = _KernelParam::data_per_workitem; - constexpr ::std::size_t __elems_per_item = 16; +#ifdef _ONEDPL_SCAN_ITER_SIZE + constexpr ::std::size_t __elems_per_item = _ONEDPL_SCAN_ITER_SIZE; +#else + constexpr ::std::size_t __elems_per_item = 8; +#endif std::size_t wgsize = n/num_wgs/__elems_per_item; std::size_t num_items = n/__elems_per_item; - std::uint32_t status_flags_size = num_wgs+1; + constexpr int status_flag_padding = 32; + std::uint32_t status_flags_size = num_wgs+1+status_flag_padding; uint32_t* status_flags = sycl::malloc_device(status_flags_size, __queue); - __queue.memset(status_flags, 0, status_flags_size * sizeof(uint32_t)); + //__queue.memset(status_flags, 0, status_flags_size * sizeof(uint32_t)); + + auto fill_event = __queue.submit([&](sycl::handler& hdl) { + + hdl.parallel_for(sycl::range<1>{status_flags_size}, [=](const sycl::item<1>& item) { + int id = item.get_linear_id(); + status_flags[id] = id < status_flag_padding ? 
__scan_status_flag<_Type>::oob_value : 0; + }); + }); #if SCAN_KT_DEBUG printf("launching kernel items=%lu wgs=%lu wgsize=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __max_cu); @@ -109,10 +172,12 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou auto event = __queue.submit([&](sycl::handler& hdl) { auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); + hdl.depends_on(fill_event); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); hdl.parallel_for(sycl::nd_range<1>(num_items, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(32)]] { auto group = item.get_group(); + auto subgroup = item.get_sub_group(); std::uint32_t elems_in_tile = wgsize*__elems_per_item; @@ -139,23 +204,30 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou //auto local_sum = 0; ///debug1[tile_id] = local_sum; - __scan_status_flag<_Type> flag(status_flags, tile_id); + auto prev_sum = 0; - if (group.leader()) - flag.set_partial(local_sum); + // The first sub-group will query the previous tiles to find a prefix + if (subgroup.get_group_id() == 0) + { + __scan_status_flag<_Type> flag(status_flags, tile_id); - // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum - //sycl::reduce_over_group(item.get_subgroup()) + if (group.leader()) + flag.set_partial(local_sum); - auto prev_sum = 0; + // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum + //sycl::reduce_over_group(item.get_subgroup()) - if (group.leader()) - prev_sum = flag.lookback(tile_id, status_flags); - //debug2[tile_id] = prev_sum; - if (group.leader()) - flag.set_full(prev_sum + local_sum); + prev_sum = flag.cooperative_lookback(tile_id, subgroup, __binary_op, status_flags); + //if (group.leader()) + // prev_sum = flag.lookback(tile_id, status_flags); + //debug2[tile_id] = prev_sum; + + if (group.leader()) + flag.set_full(prev_sum + local_sum); + } + prev_sum = sycl::group_broadcast(group, prev_sum, 0); sycl::joint_inclusive_scan(group, in_begin, in_end, out_begin, __binary_op, prev_sum); }); }); From f8c3f2ba26c03476b9fb35716b764152a0addbe3 Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Fri, 22 Sep 2023 11:42:33 -0700 Subject: [PATCH 050/134] Working cooperative lookback --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 61 +++++++++++++------ 1 file changed, 42 insertions(+), 19 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 27fdc1d09b4..963de2952e6 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -50,26 +50,26 @@ struct __scan_status_flag void set_full(std::uint32_t val) { - atomic_flag.store((val ^ partial_mask) | full_mask); + atomic_flag.store(val | full_mask); } template - _T cooperative_lookback(const std::uint32_t tile_id, const _Subgroup& subgroup, BinOp bin_op, std::uint32_t* flags_begin) + _T cooperative_lookback(std::uint32_t tile_id, const _Subgroup& subgroup, BinOp bin_op, std::uint32_t* flags_begin) { _T sum = 0; int offset = -1; int i = 0; int local_id = subgroup.get_local_id(); - for (int tile = static_cast(tile_id) + offset; tile >= 0; offset -= 32) + for (int tile = static_cast(tile_id) + offset; tile >= 0; tile -= 32) { _AtomicRefT tile_atomic(*(flags_begin 
+ tile + padding - local_id)); std::uint32_t tile_val = 0; do { tile_val = tile_atomic.load(); - //} while (!sycl::all_of_group(subgroup, tile_val != 0)); - } while (0); + } while (!sycl::all_of_group(subgroup, tile_val != 0)); + //} while (0); bool is_full = (tile_val & full_mask) && ((tile_val & partial_mask) == 0); auto is_full_ballot = sycl::ext::oneapi::group_ballot(subgroup, is_full); @@ -77,7 +77,7 @@ struct __scan_status_flag is_full_ballot.extract_bits(is_full_ballot_bits); auto lowest_item_with_full = sycl::ctz(is_full_ballot_bits); - _T contribution = local_id <= lowest_item_with_full ? tile_val & value_mask : _T{}; + _T contribution = local_id <= lowest_item_with_full ? tile_val & value_mask : _T{0}; // Sum all of the partial results from the tiles found, as well as the full contribution from the closest tile (if any) sum += sycl::reduce_over_group(subgroup, contribution, bin_op); @@ -132,6 +132,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou const ::std::size_t n = __in_rng.size(); auto __max_cu = __queue.get_device().template get_info(); //std::size_t num_wgs = __max_cu; + //std::size_t num_wgs = 448; std::size_t num_wgs = 256; // TODO: use wgsize and iters per item from _KernelParam @@ -143,26 +144,31 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou #endif std::size_t wgsize = n/num_wgs/__elems_per_item; std::size_t num_items = n/__elems_per_item; + // + //std::size_t wgsize = 256; + //std::size_t num_items = 114688; constexpr int status_flag_padding = 32; std::uint32_t status_flags_size = num_wgs+1+status_flag_padding; + printf("launching kernel items=%lu wgs=%lu wgsize=%lu elems_per_iter=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __elems_per_item, __max_cu); + uint32_t* status_flags = sycl::malloc_device(status_flags_size, __queue); //__queue.memset(status_flags, 0, status_flags_size * sizeof(uint32_t)); auto fill_event = __queue.submit([&](sycl::handler& hdl) { - hdl.parallel_for(sycl::range<1>{status_flags_size}, [=](const sycl::item<1>& item) { + hdl.parallel_for(sycl::range<1>{status_flags_size}, [=](const sycl::item<1>& item) { int id = item.get_linear_id(); status_flags[id] = id < status_flag_padding ? 
__scan_status_flag<_Type>::oob_value : 0; }); }); -#if SCAN_KT_DEBUG - printf("launching kernel items=%lu wgs=%lu wgsize=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __max_cu); - printf("launching kernel items=%lu wgs=%lu wgsize=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __max_cu); + std::uint32_t elems_in_tile = wgsize*__elems_per_item; +#define SCAN_KT_DEBUG 1 +#if SCAN_KT_DEBUG uint32_t* debug1 = sycl::malloc_device(status_flags_size, __queue); uint32_t* debug2 = sycl::malloc_device(status_flags_size, __queue); uint32_t* debug3 = sycl::malloc_device(status_flags_size, __queue); @@ -175,11 +181,10 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou hdl.depends_on(fill_event); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); - hdl.parallel_for(sycl::nd_range<1>(num_items, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(32)]] { + hdl.parallel_for(sycl::nd_range<1>(num_items, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(32)]] { auto group = item.get_group(); auto subgroup = item.get_sub_group(); - std::uint32_t elems_in_tile = wgsize*__elems_per_item; // Obtain unique ID for this work-group that will be used in decoupled lookback if (group.leader()) @@ -189,7 +194,9 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou } sycl::group_barrier(group); std::uint32_t tile_id = tile_id_lacc[0]; - //debug5[group.get_local_id()] = tile_id; +#if SCAN_KT_DEBUG + debug5[group.get_group_linear_id()] = tile_id; +#endif auto current_offset = (tile_id*elems_in_tile); auto next_offset = ((tile_id+1)*elems_in_tile); @@ -197,12 +204,15 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou auto in_end = __in_rng.begin() + next_offset; auto out_begin = __out_rng.begin() + current_offset; - //debug3[tile_id] = current_offset; - //debug4[tile_id] = next_offset; +#if SCAN_KT_DEBUG + debug3[tile_id] = current_offset; + debug4[tile_id] = next_offset; +#endif auto local_sum = sycl::joint_reduce(group, in_begin, in_end, __binary_op); - //auto local_sum = 0; - ///debug1[tile_id] = local_sum; +#if SCAN_KT_DEBUG + debug1[tile_id] = local_sum; +#endif auto prev_sum = 0; @@ -221,7 +231,9 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou prev_sum = flag.cooperative_lookback(tile_id, subgroup, __binary_op, status_flags); //if (group.leader()) // prev_sum = flag.lookback(tile_id, status_flags); - //debug2[tile_id] = prev_sum; +#if SCAN_KT_DEBUG + debug2[tile_id] = prev_sum; +#endif if (group.leader()) flag.set_full(prev_sum + local_sum); @@ -234,20 +246,31 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou event.wait(); -#if 0 +#if SCAN_KT_DEBUG std::vector debug1v(status_flags_size); std::vector debug2v(status_flags_size); std::vector debug3v(status_flags_size); std::vector debug4v(status_flags_size); std::vector debug5v(status_flags_size); + std::vector debug6v(status_flags_size); __queue.memcpy(debug1v.data(), debug1, status_flags_size * sizeof(uint32_t)); __queue.memcpy(debug2v.data(), debug2, status_flags_size * sizeof(uint32_t)); __queue.memcpy(debug3v.data(), debug3, status_flags_size * sizeof(uint32_t)); __queue.memcpy(debug4v.data(), debug4, status_flags_size * sizeof(uint32_t)); __queue.memcpy(debug5v.data(), debug5, status_flags_size * sizeof(uint32_t)); + __queue.memcpy(debug6v.data(), status_flags, status_flags_size * sizeof(uint32_t)); + for (int i = 0; i < status_flags_size-1; ++i) 
+ std::cout << "tile " << i << " " << debug5v[i] << std::endl; for (int i = 0; i < status_flags_size-1; ++i) std::cout << "local_sum " << i << " " << debug1v[i] << std::endl; + for (int i = 0; i < status_flags_size-1; ++i) + { + auto val = (debug6v[i] & __scan_status_flag<_Type>::value_mask); + int a = val / elems_in_tile; + int b = val % elems_in_tile; + std::cout << "flags " << i << " " << std::bitset<32>(debug6v[i]) << " (" << val<< " = " << a << "/" << elems_in_tile << "+" << b <<")" << std::endl; + } for (int i = 0; i < status_flags_size-1; ++i) std::cout << "lookback " << i << " " << debug2v[i] << std::endl; for (int i = 0; i < status_flags_size-1; ++i) From 1d72d3f0d4a962eca63eb2c16d6c9db04f8941da Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Wed, 25 Oct 2023 11:13:53 -0700 Subject: [PATCH 051/134] Fix correctness issue with non-power-of-2 sizes --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 52 ++++++++++++++++--- 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 963de2952e6..7aaf3f2a255 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -133,7 +133,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou auto __max_cu = __queue.get_device().template get_info(); //std::size_t num_wgs = __max_cu; //std::size_t num_wgs = 448; - std::size_t num_wgs = 256; + //std::size_t num_wgs = 256; // TODO: use wgsize and iters per item from _KernelParam //constexpr ::std::size_t __elems_per_item = _KernelParam::data_per_workitem; @@ -142,8 +142,14 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou #else constexpr ::std::size_t __elems_per_item = 8; #endif - std::size_t wgsize = n/num_wgs/__elems_per_item; - std::size_t num_items = n/__elems_per_item; + // Next power of 2 greater than or equal to __n + auto __n_uniform = n; + if ((__n_uniform & (__n_uniform - 1)) != 0) + __n_uniform = oneapi::dpl::__internal::__dpl_bit_floor(n) << 1; + //std::size_t wgsize = n/num_wgs/__elems_per_item; + std::size_t wgsize = 256; + std::size_t num_items = __n_uniform/__elems_per_item; + std::size_t num_wgs = num_items/wgsize; // //std::size_t wgsize = 256; //std::size_t num_items = 114688; @@ -152,7 +158,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou constexpr int status_flag_padding = 32; std::uint32_t status_flags_size = num_wgs+1+status_flag_padding; - printf("launching kernel items=%lu wgs=%lu wgsize=%lu elems_per_iter=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __elems_per_item, __max_cu); + //printf("launching kernel items=%lu wgs=%lu wgsize=%lu elems_per_iter=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __elems_per_item, __max_cu); uint32_t* status_flags = sycl::malloc_device(status_flags_size, __queue); //__queue.memset(status_flags, 0, status_flags_size * sizeof(uint32_t)); @@ -165,10 +171,17 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou }); }); + std::uint32_t elems_in_tile = wgsize*__elems_per_item; -#define SCAN_KT_DEBUG 1 +#define SCAN_KT_DEBUG 0 #if SCAN_KT_DEBUG + std::vector debug11v(status_flags_size); + __queue.memcpy(debug11v.data(), status_flags, status_flags_size * sizeof(uint32_t)); + + for (int i = 0; i < status_flags_size-1; ++i) + std::cout << "flag_before " << i << " " << debug11v[i] << 
std::endl; + uint32_t* debug1 = sycl::malloc_device(status_flags_size, __queue); uint32_t* debug2 = sycl::malloc_device(status_flags_size, __queue); uint32_t* debug3 = sycl::malloc_device(status_flags_size, __queue); @@ -200,21 +213,27 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou auto current_offset = (tile_id*elems_in_tile); auto next_offset = ((tile_id+1)*elems_in_tile); + if (next_offset > n) + next_offset = n; auto in_begin = __in_rng.begin() + current_offset; auto in_end = __in_rng.begin() + next_offset; auto out_begin = __out_rng.begin() + current_offset; + #if SCAN_KT_DEBUG debug3[tile_id] = current_offset; debug4[tile_id] = next_offset; #endif + if (current_offset >= n) + return; + auto local_sum = sycl::joint_reduce(group, in_begin, in_end, __binary_op); #if SCAN_KT_DEBUG debug1[tile_id] = local_sum; #endif - auto prev_sum = 0; + _Type prev_sum = 0; // The first sub-group will query the previous tiles to find a prefix if (subgroup.get_group_id() == 0) @@ -296,6 +315,17 @@ void single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _BinaryOp __binary_op) { auto __n = __in_end - __in_begin; + +#if SCAN_KT_DEBUG + using _Type = std::remove_pointer_t<_InIterator>; + std::vector<_Type> in_debug(__n); + __queue.memcpy(in_debug.data(), __in_begin, __n * sizeof(_Type)); + + for (int i = 0; i < __n; ++i) + std::cout << "input_before " << i << " " << in_debug[i] << std::endl; +#endif + + //printf("KERNEL_TEMPLATE %lu\n", __n); auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); auto __buf1 = __keep1(__in_begin, __in_end); @@ -304,6 +334,16 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera auto __buf2 = __keep2(__out_begin, __out_begin + __n); single_pass_scan_impl<_KernelParam, true>(__queue, __buf1.all_view(), __buf2.all_view(), __binary_op); + +#if SCAN_KT_DEBUG + std::vector<_Type> in_debug2(__n); + __queue.memcpy(in_debug2.data(), __in_begin, __n * sizeof(_Type)); + + for (int i = 0; i < __n; ++i) + std::cout << "input_after " << i << " " << in_debug2[i] << std::endl; +#endif + + //printf("KERNEL_TEMPLATE DONE %lu\n", __n); } } // inline namespace igpu From 567a50ebab7a53ae9fe4539d4640b7c46d8546e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alberto=20Cabrera=20P=C3=A9rez?= Date: Tue, 7 Nov 2023 10:51:21 +0000 Subject: [PATCH 052/134] Scan_kt Flags and Values separated (#15) Atomic flags and the values used in Scan_kt separated to avoid truncating the range to 30bit values, and prepare for a more general scan implementation. 
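
The practical effect is that a tile's status word no longer has to share its 32 bits with the scanned value: the flag stays a plain uint32_t while the partial and full sums live in separate arrays of the value type. A rough host-side sketch of that layout follows (illustrative only, with hypothetical names, and std::atomic standing in for sycl::atomic_ref):

    #include <atomic>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    template <typename T>
    struct tile_state
    {
        // Status values; kept apart from the sums so no value bits are reserved for status.
        static constexpr std::uint32_t not_ready = 0, partial = 1, full = 2, out_of_bounds = 4;

        std::vector<std::atomic<std::uint32_t>> flags; // one status word per tile
        std::vector<T> partial_sums;                   // scan of this tile only
        std::vector<T> full_sums;                      // this tile plus all preceding tiles

        explicit tile_state(std::size_t num_tiles)
            : flags(num_tiles), partial_sums(num_tiles), full_sums(num_tiles) {}

        void set_partial(std::size_t tile, T value)
        {
            partial_sums[tile] = value;                            // publish the value first...
            flags[tile].store(partial, std::memory_order_release); // ...then flip the status
        }

        void set_full(std::size_t tile, T value)
        {
            full_sums[tile] = value;
            flags[tile].store(full, std::memory_order_release);
        }
    };

A reader polls a tile's flag until it is no longer not_ready and then picks partial_sums[tile] or full_sums[tile] depending on the status it saw, which is the same selection the cooperative lookback in the diff below performs via the is_full * num_elements offset.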
--- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 242 +++++------------- .../numeric/numeric.ops/scan_kt.pass.cpp | 14 +- 2 files changed, 75 insertions(+), 181 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 7aaf3f2a255..f52e4ef532f 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -21,63 +21,75 @@ namespace oneapi::dpl::experimental::kt inline namespace igpu { +constexpr size_t SUBGROUP_SIZE = 32; + template struct __scan_status_flag { - // 00xxxx - not computed - // 01xxxx - partial - // 10xxxx - full - // 110000 - out of bounds - - using _AtomicRefT = sycl::atomic_ref<::std::uint32_t, sycl::memory_order::relaxed, sycl::memory_scope::device, sycl::access::address_space::global_space>; - static constexpr std::uint32_t partial_mask = 1 << (sizeof(std::uint32_t)*8 - 2); - static constexpr std::uint32_t full_mask = 1 << (sizeof(std::uint32_t)*8 - 1); - static constexpr std::uint32_t value_mask = ~(partial_mask | full_mask); - static constexpr std::uint32_t oob_value = partial_mask | full_mask; - - static constexpr int padding = 32; - - __scan_status_flag(std::uint32_t* flags_begin, const std::uint32_t tile_id) - : atomic_flag(*(flags_begin + tile_id + padding)) + using _AtomicRefT = sycl::atomic_ref<::std::uint32_t, sycl::memory_order::acq_rel, sycl::memory_scope::device, + sycl::access::address_space::global_space>; + static constexpr std::uint32_t NOT_READY = 0; + static constexpr std::uint32_t PARTIAL_MASK = 1; + static constexpr std::uint32_t FULL_MASK = 2; + static constexpr std::uint32_t OUT_OF_BOUNDS = 4; + + static constexpr int padding = SUBGROUP_SIZE; + + __scan_status_flag(const std::uint32_t tile_id, std::uint32_t* flags_begin, _T* tile_sums, + size_t num_elements) + : atomic_flag(*(flags_begin + tile_id + padding)), scanned_partial_value(tile_sums + tile_id + padding), + scanned_full_value(tile_sums + tile_id + padding + num_elements), num_elements{num_elements} { - } - void set_partial(std::uint32_t val) + void + set_partial(_T val) { - atomic_flag.store(val | partial_mask); + (*scanned_partial_value) = val; + atomic_flag.store(PARTIAL_MASK); } - void set_full(std::uint32_t val) + void + set_full(_T val) { - atomic_flag.store(val | full_mask); + (*scanned_full_value) = val; + atomic_flag.store(FULL_MASK); } - template - _T cooperative_lookback(std::uint32_t tile_id, const _Subgroup& subgroup, BinOp bin_op, std::uint32_t* flags_begin) + template + _T + cooperative_lookback(std::uint32_t tile_id, const _Subgroup& subgroup, BinOp bin_op, std::uint32_t* flags_begin, + _T* tile_sums) { _T sum = 0; int offset = -1; int i = 0; int local_id = subgroup.get_local_id(); - for (int tile = static_cast(tile_id) + offset; tile >= 0; tile -= 32) + for (int tile = static_cast(tile_id) + offset; tile >= 0; tile -= SUBGROUP_SIZE) { _AtomicRefT tile_atomic(*(flags_begin + tile + padding - local_id)); - std::uint32_t tile_val = 0; - do { - tile_val = tile_atomic.load(); + std::uint32_t flag; + do + { + flag = tile_atomic.load(); + } while (!sycl::all_of_group(subgroup, flag != NOT_READY)); // Loop till all ready - } while (!sycl::all_of_group(subgroup, tile_val != 0)); - //} while (0); + bool is_full = flag == FULL_MASK; - bool is_full = (tile_val & full_mask) && ((tile_val & partial_mask) == 0); auto is_full_ballot = sycl::ext::oneapi::group_ballot(subgroup, is_full); 
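            // Each lane of the sub-group has just inspected one predecessor tile (lane 0 the
            // closest). The ballot packs every lane's is_full predicate into a single mask, so
            // the lowest set bit identifies the nearest tile that has already published a FULL
            // prefix; only lanes at or before that position need to feed the reduction below.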
::std::uint32_t is_full_ballot_bits{}; is_full_ballot.extract_bits(is_full_ballot_bits); auto lowest_item_with_full = sycl::ctz(is_full_ballot_bits); - _T contribution = local_id <= lowest_item_with_full ? tile_val & value_mask : _T{0}; + + // The partial scan results and the full scan sum values are in contiguous memory. + // Each section of the memory is of size num_elements. + // The partial sum for a tile is at [i] and the full sum is at [i + num_elements] + // is_full * num_elements allows to select between the two values without branching the code. + size_t contrib_offset = tile + padding - local_id + is_full * num_elements; + _T val = *(tile_sums + contrib_offset); + _T contribution = local_id <= lowest_item_with_full && (tile - local_id >= 0) ? val : _T{0}; // Sum all of the partial results from the tiles found, as well as the full contribution from the closest tile (if any) sum += sycl::reduce_over_group(subgroup, contribution, bin_op); @@ -88,37 +100,16 @@ struct __scan_status_flag if (is_full_ballot_bits) break; - //if (i++ > 10) break; } - return sum; - } - -#if 0 - _T lookback(const std::uint32_t tile_id, std::uint32_t* flags_begin) - { - _T sum = 0; - int i = 0; - for (std::int32_t tile = static_cast(tile_id) - 1; tile >= 0; --tile) - { - _AtomicRefT tile_atomic(*(flags_begin + tile + padding)); - std::uint32_t tile_val = 0; - do { - tile_val = tile_atomic.load(); - } while (tile_val == 0); - - sum += tile_val & value_mask; - // If this was a full value, we can stop looking at previous tiles. Otherwise, - // keep going through tiles until we either find a full tile or we've completely - // recomputed the prefix using partial values - if (tile_val & full_mask) - break; - } return sum; } -#endif _AtomicRefT atomic_flag; + _T* scanned_partial_value; + _T* scanned_full_value; + + size_t num_elements; }; template @@ -130,86 +121,57 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou static_assert(_Inclusive, "Single-pass scan only available for inclusive scan"); const ::std::size_t n = __in_rng.size(); - auto __max_cu = __queue.get_device().template get_info(); - //std::size_t num_wgs = __max_cu; - //std::size_t num_wgs = 448; - //std::size_t num_wgs = 256; - - // TODO: use wgsize and iters per item from _KernelParam - //constexpr ::std::size_t __elems_per_item = _KernelParam::data_per_workitem; #ifdef _ONEDPL_SCAN_ITER_SIZE - constexpr ::std::size_t __elems_per_item = _ONEDPL_SCAN_ITER_SIZE; + constexpr ::std::size_t __elems_per_workitem = _ONEDPL_SCAN_ITER_SIZE; #else - constexpr ::std::size_t __elems_per_item = 8; + constexpr ::std::size_t __elems_per_workitem = 8; #endif // Next power of 2 greater than or equal to __n auto __n_uniform = n; if ((__n_uniform & (__n_uniform - 1)) != 0) __n_uniform = oneapi::dpl::__internal::__dpl_bit_floor(n) << 1; - //std::size_t wgsize = n/num_wgs/__elems_per_item; - std::size_t wgsize = 256; - std::size_t num_items = __n_uniform/__elems_per_item; - std::size_t num_wgs = num_items/wgsize; - // - //std::size_t wgsize = 256; - //std::size_t num_items = 114688; - + std::size_t num_workitems = __n_uniform / __elems_per_workitem; + std::size_t wgsize = num_workitems > 256 ? 
256 : num_workitems; + std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(num_workitems, wgsize); - constexpr int status_flag_padding = 32; - std::uint32_t status_flags_size = num_wgs+1+status_flag_padding; - - //printf("launching kernel items=%lu wgs=%lu wgsize=%lu elems_per_iter=%lu max_cu=%u\n", num_items, num_wgs, wgsize, __elems_per_item, __max_cu); + constexpr int status_flag_padding = SUBGROUP_SIZE; + std::uint32_t status_flags_size = num_wgs + status_flag_padding + 1; + std::uint32_t tile_sums_size = num_wgs + status_flag_padding; uint32_t* status_flags = sycl::malloc_device(status_flags_size, __queue); - //__queue.memset(status_flags, 0, status_flags_size * sizeof(uint32_t)); + // First status_flags_size elements: partial scanned values (using _BinaryOp) of each workgroup + // Second status_flags_size elements: full scanned values, i.e. sum of the previous and current workgroup's partial sums + _Type* tile_sums = sycl::malloc_device<_Type>(tile_sums_size * 2, __queue); auto fill_event = __queue.submit([&](sycl::handler& hdl) { - hdl.parallel_for(sycl::range<1>{status_flags_size}, [=](const sycl::item<1>& item) { int id = item.get_linear_id(); - status_flags[id] = id < status_flag_padding ? __scan_status_flag<_Type>::oob_value : 0; + status_flags[id] = id < status_flag_padding ? __scan_status_flag<_Type>::OUT_OF_BOUNDS + : __scan_status_flag<_Type>::NOT_READY; }); }); - - std::uint32_t elems_in_tile = wgsize*__elems_per_item; - -#define SCAN_KT_DEBUG 0 -#if SCAN_KT_DEBUG - std::vector debug11v(status_flags_size); - __queue.memcpy(debug11v.data(), status_flags, status_flags_size * sizeof(uint32_t)); - - for (int i = 0; i < status_flags_size-1; ++i) - std::cout << "flag_before " << i << " " << debug11v[i] << std::endl; - - uint32_t* debug1 = sycl::malloc_device(status_flags_size, __queue); - uint32_t* debug2 = sycl::malloc_device(status_flags_size, __queue); - uint32_t* debug3 = sycl::malloc_device(status_flags_size, __queue); - uint32_t* debug4 = sycl::malloc_device(status_flags_size, __queue); - uint32_t* debug5 = sycl::malloc_device(status_flags_size, __queue); -#endif + std::uint32_t elems_in_tile = wgsize*__elems_per_workitem; auto event = __queue.submit([&](sycl::handler& hdl) { auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); hdl.depends_on(fill_event); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); - hdl.parallel_for(sycl::nd_range<1>(num_items, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(32)]] { + hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { auto group = item.get_group(); auto subgroup = item.get_sub_group(); - // Obtain unique ID for this work-group that will be used in decoupled lookback if (group.leader()) { - sycl::atomic_ref<::std::uint32_t, sycl::memory_order::relaxed, sycl::memory_scope::device, sycl::access::address_space::global_space> idx_atomic(status_flags[status_flags_size-1]); + sycl::atomic_ref<::std::uint32_t, sycl::memory_order::relaxed, sycl::memory_scope::device, + sycl::access::address_space::global_space> + idx_atomic(status_flags[status_flags_size - 1]); tile_id_lacc[0] = idx_atomic.fetch_add(1); } sycl::group_barrier(group); std::uint32_t tile_id = tile_id_lacc[0]; -#if SCAN_KT_DEBUG - debug5[group.get_group_linear_id()] = tile_id; -#endif auto current_offset = (tile_id*elems_in_tile); auto next_offset = ((tile_id+1)*elems_in_tile); @@ -219,40 +181,22 @@ 
single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou auto in_end = __in_rng.begin() + next_offset; auto out_begin = __out_rng.begin() + current_offset; - -#if SCAN_KT_DEBUG - debug3[tile_id] = current_offset; - debug4[tile_id] = next_offset; -#endif - if (current_offset >= n) return; auto local_sum = sycl::joint_reduce(group, in_begin, in_end, __binary_op); -#if SCAN_KT_DEBUG - debug1[tile_id] = local_sum; -#endif - _Type prev_sum = 0; // The first sub-group will query the previous tiles to find a prefix if (subgroup.get_group_id() == 0) { - __scan_status_flag<_Type> flag(status_flags, tile_id); + __scan_status_flag<_Type> flag(tile_id, status_flags, tile_sums, status_flags_size); if (group.leader()) flag.set_partial(local_sum); // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum - //sycl::reduce_over_group(item.get_subgroup()) - - - prev_sum = flag.cooperative_lookback(tile_id, subgroup, __binary_op, status_flags); - //if (group.leader()) - // prev_sum = flag.lookback(tile_id, status_flags); -#if SCAN_KT_DEBUG - debug2[tile_id] = prev_sum; -#endif + prev_sum = flag.cooperative_lookback(tile_id, subgroup, __binary_op, status_flags, tile_sums); if (group.leader()) flag.set_full(prev_sum + local_sum); @@ -265,40 +209,8 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou event.wait(); -#if SCAN_KT_DEBUG - std::vector debug1v(status_flags_size); - std::vector debug2v(status_flags_size); - std::vector debug3v(status_flags_size); - std::vector debug4v(status_flags_size); - std::vector debug5v(status_flags_size); - std::vector debug6v(status_flags_size); - __queue.memcpy(debug1v.data(), debug1, status_flags_size * sizeof(uint32_t)); - __queue.memcpy(debug2v.data(), debug2, status_flags_size * sizeof(uint32_t)); - __queue.memcpy(debug3v.data(), debug3, status_flags_size * sizeof(uint32_t)); - __queue.memcpy(debug4v.data(), debug4, status_flags_size * sizeof(uint32_t)); - __queue.memcpy(debug5v.data(), debug5, status_flags_size * sizeof(uint32_t)); - __queue.memcpy(debug6v.data(), status_flags, status_flags_size * sizeof(uint32_t)); - - for (int i = 0; i < status_flags_size-1; ++i) - std::cout << "tile " << i << " " << debug5v[i] << std::endl; - for (int i = 0; i < status_flags_size-1; ++i) - std::cout << "local_sum " << i << " " << debug1v[i] << std::endl; - for (int i = 0; i < status_flags_size-1; ++i) - { - auto val = (debug6v[i] & __scan_status_flag<_Type>::value_mask); - int a = val / elems_in_tile; - int b = val % elems_in_tile; - std::cout << "flags " << i << " " << std::bitset<32>(debug6v[i]) << " (" << val<< " = " << a << "/" << elems_in_tile << "+" << b <<")" << std::endl; - } - for (int i = 0; i < status_flags_size-1; ++i) - std::cout << "lookback " << i << " " << debug2v[i] << std::endl; - for (int i = 0; i < status_flags_size-1; ++i) - std::cout << "offset " << i << " " << debug3v[i] << std::endl; - for (int i = 0; i < status_flags_size-1; ++i) - std::cout << "end " << i << " " << debug4v[i] << std::endl; -#endif - sycl::free(status_flags, __queue); + sycl::free(tile_sums, __queue); } // The generic structure for configuring a kernel @@ -316,16 +228,6 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera { auto __n = __in_end - __in_begin; -#if SCAN_KT_DEBUG - using _Type = std::remove_pointer_t<_InIterator>; - std::vector<_Type> in_debug(__n); - __queue.memcpy(in_debug.data(), __in_begin, __n * sizeof(_Type)); - - for 
(int i = 0; i < __n; ++i) - std::cout << "input_before " << i << " " << in_debug[i] << std::endl; -#endif - - //printf("KERNEL_TEMPLATE %lu\n", __n); auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); auto __buf1 = __keep1(__in_begin, __in_end); @@ -334,16 +236,6 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera auto __buf2 = __keep2(__out_begin, __out_begin + __n); single_pass_scan_impl<_KernelParam, true>(__queue, __buf1.all_view(), __buf2.all_view(), __binary_op); - -#if SCAN_KT_DEBUG - std::vector<_Type> in_debug2(__n); - __queue.memcpy(in_debug2.data(), __in_begin, __n * sizeof(_Type)); - - for (int i = 0; i < __n; ++i) - std::cout << "input_after " << i << " " << in_debug2[i] << std::endl; -#endif - - //printf("KERNEL_TEMPLATE DONE %lu\n", __n); } } // inline namespace igpu diff --git a/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp index de5ecafc25b..38a82b026d7 100644 --- a/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp @@ -22,23 +22,23 @@ int main() { bool all_passed = true; + sycl::queue q; for (int logn : {4, 8, 11, 16, 19, 21}) { - std::cout << "Testing 2^" << logn << '\n'; + std::cout << "Testing 2^" << logn << std::endl; int n = 1 << logn; std::vector v(n, 1); - sycl::queue q; int* in_ptr = sycl::malloc_device(n, q); int* out_ptr = sycl::malloc_device(n, q); - - q.copy(v.data(), in_ptr, n); + q.copy(v.data(), in_ptr, n).wait(); using KernelParams = oneapi::dpl::experimental::kt::kernel_param<128, 2, class ScanKernel>; oneapi::dpl::experimental::kt::single_pass_inclusive_scan(q, in_ptr, in_ptr+n, out_ptr, ::std::plus()); std::vector tmp(n, 0); q.copy(out_ptr, tmp.data(), n); + q.wait(); std::inclusive_scan(v.begin(), v.end(), v.begin()); @@ -53,11 +53,13 @@ main() } if (passed) - std::cout << "passed" << std::endl; + std::cout << " passed" << std::endl; else - std::cout << "failed" << std::endl; + std::cout << " failed" << std::endl; all_passed &= passed; + sycl::free(in_ptr, q); + sycl::free(out_ptr, q); } return !all_passed; From 0c91640f4e128a1f8a574dba72563c03ce1f88e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alberto=20Cabrera=20P=C3=A9rez?= Date: Tue, 7 Nov 2023 13:07:04 +0000 Subject: [PATCH 053/134] Refactored Scan_kt code (#16) * Improved Scan_kt: templated parameters, ballot, wgsize calculation. - Changed number of workgroups calculation from next power of two to next multiple of wgsize - Improved group_ballot by using the class member functions - Using kernel_param struct to determine wgsize and elems per work item. 
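
With this change the launch geometry is derived directly from kernel_param: the input is rounded up to a whole number of tiles of wgsize * elems_per_workitem elements instead of to the next power of two. A standalone sketch of the arithmetic (values mirror the kernel_param<8, 128> used by the test, not a library default):

    #include <cstddef>
    #include <cstdio>

    constexpr std::size_t ceil_div(std::size_t a, std::size_t b) { return (a + b - 1) / b; }

    int main()
    {
        constexpr std::size_t wgsize = 128;           // kernel_param::workgroup_size
        constexpr std::size_t elems_per_workitem = 8; // kernel_param::elems_per_workitem
        constexpr std::size_t elems_in_tile = wgsize * elems_per_workitem;

        std::size_t n = (1u << 16) + 5;                   // deliberately not a power of two
        std::size_t num_wgs = ceil_div(n, elems_in_tile); // whole tiles, rounded up
        std::size_t num_workitems = num_wgs * wgsize;     // size of the padded nd_range

        std::printf("tiles=%zu work-items=%zu\n", num_wgs, num_workitems);
        return 0;
    }

The final, partially filled tile needs no separate padding pass because the kernel clamps that tile's range to n.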
--- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 34 +++++++------------ .../numeric/numeric.ops/scan_kt.pass.cpp | 2 +- 2 files changed, 13 insertions(+), 23 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index f52e4ef532f..e7a0ca345e6 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -76,12 +76,8 @@ struct __scan_status_flag } while (!sycl::all_of_group(subgroup, flag != NOT_READY)); // Loop till all ready bool is_full = flag == FULL_MASK; - auto is_full_ballot = sycl::ext::oneapi::group_ballot(subgroup, is_full); - ::std::uint32_t is_full_ballot_bits{}; - is_full_ballot.extract_bits(is_full_ballot_bits); - - auto lowest_item_with_full = sycl::ctz(is_full_ballot_bits); + auto lowest_item_with_full = is_full_ballot.find_low(); // The partial scan results and the full scan sum values are in contiguous memory. // Each section of the memory is of size num_elements. @@ -97,7 +93,7 @@ struct __scan_status_flag // If we found a full value, we can stop looking at previous tiles. Otherwise, // keep going through tiles until we either find a full tile or we've completely // recomputed the prefix using partial values - if (is_full_ballot_bits) + if (is_full_ballot.any()) break; } @@ -121,18 +117,14 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou static_assert(_Inclusive, "Single-pass scan only available for inclusive scan"); const ::std::size_t n = __in_rng.size(); -#ifdef _ONEDPL_SCAN_ITER_SIZE - constexpr ::std::size_t __elems_per_workitem = _ONEDPL_SCAN_ITER_SIZE; -#else - constexpr ::std::size_t __elems_per_workitem = 8; -#endif - // Next power of 2 greater than or equal to __n - auto __n_uniform = n; - if ((__n_uniform & (__n_uniform - 1)) != 0) - __n_uniform = oneapi::dpl::__internal::__dpl_bit_floor(n) << 1; - std::size_t num_workitems = __n_uniform / __elems_per_workitem; - std::size_t wgsize = num_workitems > 256 ? 
256 : num_workitems; - std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(num_workitems, wgsize); + + constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; + constexpr ::std::size_t elems_per_workitem = _KernelParam::elems_per_workitem; + + // Avoid non_uniform n by padding up to a multiple of wgsize + std::uint32_t elems_in_tile = wgsize * elems_per_workitem; + ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); + ::std::size_t num_workitems = num_wgs * wgsize; constexpr int status_flag_padding = SUBGROUP_SIZE; std::uint32_t status_flags_size = num_wgs + status_flag_padding + 1; @@ -151,8 +143,6 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou }); }); - std::uint32_t elems_in_tile = wgsize*__elems_per_workitem; - auto event = __queue.submit([&](sycl::handler& hdl) { auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); hdl.depends_on(fill_event); @@ -214,10 +204,10 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou } // The generic structure for configuring a kernel -template +template struct kernel_param { - static constexpr std::uint16_t data_per_workitem = DataPerWorkItem; + static constexpr std::uint16_t elems_per_workitem = ElemsPerWorkItem; static constexpr std::uint16_t workgroup_size = WorkGroupSize; using kernel_name = KernelName; }; diff --git a/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp index 38a82b026d7..b3407581f37 100644 --- a/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp @@ -33,7 +33,7 @@ main() int* out_ptr = sycl::malloc_device(n, q); q.copy(v.data(), in_ptr, n).wait(); - using KernelParams = oneapi::dpl::experimental::kt::kernel_param<128, 2, class ScanKernel>; + using KernelParams = oneapi::dpl::experimental::kt::kernel_param<8, 128, class ScanKernel>; oneapi::dpl::experimental::kt::single_pass_inclusive_scan(q, in_ptr, in_ptr+n, out_ptr, ::std::plus()); std::vector tmp(n, 0); From 78d2d7d1d4cd264f673e3c2eb587f08a58552f70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alberto=20Cabrera=20P=C3=A9rez?= Date: Wed, 8 Nov 2023 16:47:52 +0000 Subject: [PATCH 054/134] Scan_kt: Single memory allocation for device_memory (#17) and async free of the device memory (#18) * Single memory allocation for device_memory * async free of device memory --------- Co-authored-by: Joe Todd Co-authored-by: AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com> --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 35 +++++++++++++------ 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index e7a0ca345e6..5773b80e1be 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -127,13 +127,24 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou ::std::size_t num_workitems = num_wgs * wgsize; constexpr int status_flag_padding = SUBGROUP_SIZE; - std::uint32_t status_flags_size = num_wgs + status_flag_padding + 1; - std::uint32_t tile_sums_size = num_wgs + status_flag_padding; + std::size_t status_flags_elems = num_wgs + status_flag_padding + 1; + std::size_t status_flags_size = status_flags_elems * sizeof(std::uint32_t); - uint32_t* status_flags = 
sycl::malloc_device(status_flags_size, __queue); - // First status_flags_size elements: partial scanned values (using _BinaryOp) of each workgroup - // Second status_flags_size elements: full scanned values, i.e. sum of the previous and current workgroup's partial sums - _Type* tile_sums = sycl::malloc_device<_Type>(tile_sums_size * 2, __queue); + std::size_t tile_sums_elems = num_wgs + status_flag_padding; + std::size_t tile_sums_size = status_flags_elems * sizeof(_Type); + + std::size_t extra_mem_for_aligment = alignof(_Type) - (status_flags_size % alignof(_Type)); + // status_flags_size for the status_flags + // extra_mem_for_aligment of the datatype _Type + // First tile_sums_size partial scanned values + // Second tile_sums_size full scanned values (current partial plus all previous workgroups partial) + char* mem_pool = + sycl::malloc_device(status_flags_size + extra_mem_for_aligment + 2 * tile_sums_size, __queue); + + std::size_t tile_sums_offset = status_flags_size + extra_mem_for_aligment; + + std::uint32_t* status_flags = reinterpret_cast(mem_pool); + _Type* tile_sums = reinterpret_cast<_Type*>(mem_pool + tile_sums_offset); auto fill_event = __queue.submit([&](sycl::handler& hdl) { hdl.parallel_for(sycl::range<1>{status_flags_size}, [=](const sycl::item<1>& item) { @@ -180,7 +191,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou // The first sub-group will query the previous tiles to find a prefix if (subgroup.get_group_id() == 0) { - __scan_status_flag<_Type> flag(tile_id, status_flags, tile_sums, status_flags_size); + __scan_status_flag<_Type> flag(tile_id, status_flags, tile_sums, tile_sums_elems); if (group.leader()) flag.set_partial(local_sum); @@ -197,10 +208,14 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou }); }); - event.wait(); + auto free_event = __queue.submit( + [=](sycl::handler& hdl) + { + hdl.depends_on(event); + hdl.host_task([=](){ sycl::free(mem_pool, __queue); }); + }); - sycl::free(status_flags, __queue); - sycl::free(tile_sums, __queue); + event.wait(); } // The generic structure for configuring a kernel From 55dc287d7b2c17188275a0acfd3a186c98ffad4f Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Wed, 8 Nov 2023 16:07:30 +0000 Subject: [PATCH 055/134] Replace sycl::range with sycl::nd_range for fill --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 5773b80e1be..53d925a14c8 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -146,13 +146,22 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou std::uint32_t* status_flags = reinterpret_cast(mem_pool); _Type* tile_sums = reinterpret_cast<_Type*>(mem_pool + tile_sums_offset); - auto fill_event = __queue.submit([&](sycl::handler& hdl) { - hdl.parallel_for(sycl::range<1>{status_flags_size}, [=](const sycl::item<1>& item) { - int id = item.get_linear_id(); - status_flags[id] = id < status_flag_padding ? 
__scan_status_flag<_Type>::OUT_OF_BOUNDS - : __scan_status_flag<_Type>::NOT_READY; + ::std::size_t fill_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(status_flags_size, wgsize); + + auto fill_event = __queue.submit( + [&](sycl::handler& hdl) + { + hdl.parallel_for(sycl::nd_range<1>{fill_num_wgs * wgsize, wgsize}, + [=](const sycl::nd_item<1>& item) + { + int id = item.get_global_linear_id(); + if (id < status_flags_size) + status_flags[id] = + id < status_flag_padding + ? __scan_status_flag<_Type>::OUT_OF_BOUNDS + : __scan_status_flag<_Type>::NOT_READY; + }); }); - }); auto event = __queue.submit([&](sycl::handler& hdl) { auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); From 37bfd1de142f82e91aa9e4fbeac857e38d467702 Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Wed, 8 Nov 2023 19:14:32 +0000 Subject: [PATCH 056/134] Bug fix --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 53d925a14c8..038018a13ac 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -146,7 +146,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou std::uint32_t* status_flags = reinterpret_cast(mem_pool); _Type* tile_sums = reinterpret_cast<_Type*>(mem_pool + tile_sums_offset); - ::std::size_t fill_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(status_flags_size, wgsize); + ::std::size_t fill_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(status_flags_elems, wgsize); auto fill_event = __queue.submit( [&](sycl::handler& hdl) @@ -155,7 +155,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou [=](const sycl::nd_item<1>& item) { int id = item.get_global_linear_id(); - if (id < status_flags_size) + if (id < status_flags_elems) status_flags[id] = id < status_flag_padding ? 
__scan_status_flag<_Type>::OUT_OF_BOUNDS @@ -177,7 +177,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou { sycl::atomic_ref<::std::uint32_t, sycl::memory_order::relaxed, sycl::memory_scope::device, sycl::access::address_space::global_space> - idx_atomic(status_flags[status_flags_size - 1]); + idx_atomic(status_flags[status_flags_elems - 1]); tile_id_lacc[0] = idx_atomic.fetch_add(1); } sycl::group_barrier(group); From 21038df158b2196188f675827cba0b5e2bd47f97 Mon Sep 17 00:00:00 2001 From: Aidan Date: Wed, 8 Nov 2023 13:21:32 +0000 Subject: [PATCH 057/134] Global to local then perform op --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 38 ++++++++++++++----- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 038018a13ac..846208007da 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -165,11 +165,14 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou auto event = __queue.submit([&](sycl::handler& hdl) { auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); + auto tile_vals = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); hdl.depends_on(fill_event); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { auto group = item.get_group(); + auto local_id = item.get_local_id(0); + auto stride = item.get_local_range(0); auto subgroup = item.get_sub_group(); // Obtain unique ID for this work-group that will be used in decoupled lookback @@ -183,16 +186,33 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou sycl::group_barrier(group); std::uint32_t tile_id = tile_id_lacc[0]; - auto current_offset = (tile_id*elems_in_tile); - auto next_offset = ((tile_id+1)*elems_in_tile); - if (next_offset > n) - next_offset = n; - auto in_begin = __in_rng.begin() + current_offset; - auto in_end = __in_rng.begin() + next_offset; - auto out_begin = __out_rng.begin() + current_offset; - - if (current_offset >= n) + // Global load into local + auto wg_current_offset = (tile_id*elems_in_tile); + auto wg_next_offset = ((tile_id+1)*elems_in_tile); + size_t wg_local_memory_size = elems_in_tile; + if (wg_current_offset >= n) return; + if (wg_next_offset >= n) { + wg_local_memory_size = n - wg_current_offset; + wg_next_offset = n; // Not needed + } + + // TODO: vectorize loads, where possible + if (wg_next_offset <= n) { + _ONEDPL_PRAGMA_UNROLL + for (std::uint32_t i = 0; i < elems_per_workitem; ++i) + tile_vals[local_id + stride * i] = __in_rng[wg_current_offset + local_id + stride * i]; + } else { + for (std::uint32_t i = 0; i < elems_per_workitem; ++i) { + if (wg_current_offset + stride * i < n) + tile_vals[local_id + stride * i] = __in_rng[wg_current_offset + stride * i]; + } + } + sycl::group_barrier(group); + + auto in_begin = tile_vals.get_pointer(); + auto in_end = in_begin + wg_local_memory_size; + auto out_begin = __out_rng.begin() + wg_current_offset; auto local_sum = sycl::joint_reduce(group, in_begin, in_end, __binary_op); _Type prev_sum = 0; From bdcc9c9da190cbcc7499a7ba4ff50b824c4f9447 Mon Sep 17 00:00:00 2001 From: Aidan Date: Fri, 10 Nov 2023 13:51:35 +0000 Subject: [PATCH 
058/134] Update based on feedback --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 846208007da..1bd10595413 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -192,20 +192,17 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou size_t wg_local_memory_size = elems_in_tile; if (wg_current_offset >= n) return; - if (wg_next_offset >= n) { + if (wg_next_offset > n) wg_local_memory_size = n - wg_current_offset; - wg_next_offset = n; // Not needed - } - // TODO: vectorize loads, where possible if (wg_next_offset <= n) { _ONEDPL_PRAGMA_UNROLL for (std::uint32_t i = 0; i < elems_per_workitem; ++i) tile_vals[local_id + stride * i] = __in_rng[wg_current_offset + local_id + stride * i]; } else { for (std::uint32_t i = 0; i < elems_per_workitem; ++i) { - if (wg_current_offset + stride * i < n) - tile_vals[local_id + stride * i] = __in_rng[wg_current_offset + stride * i]; + if (wg_current_offset + local_id + stride * i < n) + tile_vals[local_id + stride * i] = __in_rng[wg_current_offset + local_id + stride * i]; } } sycl::group_barrier(group); From 9717e095cd3155db9f7175b8a580e114f7a178c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alberto=20Cabrera=20P=C3=A9rez?= Date: Tue, 21 Nov 2023 11:48:48 +0000 Subject: [PATCH 059/134] Refactored cooperative_loopback and memory implementation (#24) * Refactored cooperative_loopback and memory implementation detail * renamed load_counter to fetch_add_counter * Removed dynamic tile counter from the scan memory struct * scratch memory Reordering * Fixed wrong values returned in LoopbackScanMemory.get_value * Improved Class and variable naming --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 334 +++++++++++++----- 1 file changed, 253 insertions(+), 81 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 1bd10595413..314ace11410 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -16,51 +16,244 @@ #ifndef _ONEDPL_parallel_backend_sycl_scan_H #define _ONEDPL_parallel_backend_sycl_scan_H +#include +#include + namespace oneapi::dpl::experimental::kt { inline namespace igpu { -constexpr size_t SUBGROUP_SIZE = 32; +constexpr ::std::size_t SUBGROUP_SIZE = 32; + +template typename LoopbackScanMemory, typename TileId> +struct ScanMemoryManager +{ + using _TileIdT = typename TileId::_TileIdT; + using _FlagT = typename LoopbackScanMemory::_FlagT; + + ScanMemoryManager(sycl::queue q) : q{q} {}; + + ::std::uint8_t* + scan_memory_ptr() noexcept + { + return scan_memory_begin; + }; + + _TileIdT* + tile_id_ptr() noexcept + { + return tile_id_begin; + }; + + void + allocate(::std::size_t num_wgs) + { + ::std::size_t scan_memory_size = LoopbackScanMemory::get_memory_size(num_wgs); + constexpr ::std::size_t padded_tileid_size = TileId::get_padded_memory_size(); + constexpr ::std::size_t tileid_size = TileId::get_memory_size(); + + auto mem_size_bytes = scan_memory_size + padded_tileid_size; + + scratch = sycl::malloc_device<::std::uint8_t>(mem_size_bytes, q); + + scan_memory_begin = scratch; + + void* base_tileid_ptr = 
reinterpret_cast(scan_memory_begin + scan_memory_size); + size_t remainder = mem_size_bytes - scan_memory_size; + + tile_id_begin = reinterpret_cast<_TileIdT*>( + ::std::align(::std::alignment_of_v<_TileIdT>, tileid_size, base_tileid_ptr, remainder)); + } + + sycl::event + async_free(sycl::event dependency) + { + return q.submit( + [e = dependency, ptr = scratch, q_ = q](sycl::handler& hdl) + { + hdl.depends_on(e); + hdl.host_task([=]() { sycl::free(ptr, q_); }); + }); + } + + private: + ::std::uint8_t* scratch = nullptr; + ::std::uint8_t* scan_memory_begin = nullptr; + _TileIdT* tile_id_begin = nullptr; + + sycl::queue q; +}; -template -struct __scan_status_flag +template +struct LoopbackScanMemory { - using _AtomicRefT = sycl::atomic_ref<::std::uint32_t, sycl::memory_order::acq_rel, sycl::memory_scope::device, - sycl::access::address_space::global_space>; - static constexpr std::uint32_t NOT_READY = 0; - static constexpr std::uint32_t PARTIAL_MASK = 1; - static constexpr std::uint32_t FULL_MASK = 2; - static constexpr std::uint32_t OUT_OF_BOUNDS = 4; - - static constexpr int padding = SUBGROUP_SIZE; - - __scan_status_flag(const std::uint32_t tile_id, std::uint32_t* flags_begin, _T* tile_sums, - size_t num_elements) - : atomic_flag(*(flags_begin + tile_id + padding)), scanned_partial_value(tile_sums + tile_id + padding), - scanned_full_value(tile_sums + tile_id + padding + num_elements), num_elements{num_elements} + using _FlagT = ::std::uint32_t; + using _AtomicFlagRefT = sycl::atomic_ref<_FlagT, sycl::memory_order::acq_rel, sycl::memory_scope::device, + sycl::access::address_space::global_space>; + + static constexpr _FlagT NOT_READY = 0; + static constexpr _FlagT PARTIAL_MASK = 1; + static constexpr _FlagT FULL_MASK = 2; + static constexpr _FlagT OUT_OF_BOUNDS = 4; + + static constexpr ::std::size_t padding = SUBGROUP_SIZE; + + LoopbackScanMemory(::std::uint8_t* scan_memory_begin, ::std::size_t num_wgs) + : num_elements(get_num_elements(num_wgs)) { + // LoopbackScanMemory: [Partial Value, ..., Full Value, ..., Flag, ...] 
+ // Each section has num_wgs + padding elements + tile_values_begin = reinterpret_cast<_T*>(scan_memory_begin); + flags_begin = get_flags_begin(scan_memory_begin, num_wgs); } void - set_partial(_T val) + set_partial(::std::size_t tile_id, _T val) { - (*scanned_partial_value) = val; + _AtomicFlagRefT atomic_flag(*(flags_begin + tile_id + padding)); + + tile_values_begin[tile_id + padding] = val; atomic_flag.store(PARTIAL_MASK); } void - set_full(_T val) + set_full(::std::size_t tile_id, _T val) { - (*scanned_full_value) = val; + _AtomicFlagRefT atomic_flag(*(flags_begin + tile_id + padding)); + + tile_values_begin[tile_id + padding + num_elements] = val; atomic_flag.store(FULL_MASK); } - template + _FlagT + load_flag(::std::size_t tile_id) const + { + _AtomicFlagRefT atomic_flag(*(flags_begin + tile_id + padding)); + + return atomic_flag.load(); + } + + _T + get_value(::std::size_t tile_id, _FlagT flag) const + { + ::std::size_t offset = tile_id + padding + num_elements * is_full(flag); + return tile_values_begin[offset]; + } + + static ::std::size_t + get_tile_values_bytes(::std::size_t num_elements) + { + return (2 * num_elements) * sizeof(_T); + } + + static ::std::size_t + get_flag_bytes(::std::size_t num_elements) + { + return num_elements * sizeof(_FlagT); + } + + static ::std::size_t + get_padded_flag_bytes(::std::size_t num_elements) + { + // sizeof(_FlagT) extra bytes for possible intenal alignment + return get_flag_bytes(num_elements) + sizeof(_FlagT); + } + + static _FlagT* + get_flags_begin(::std::uint8_t* scan_memory_begin, ::std::size_t num_wgs) + { + // Aligned flags + ::std::size_t num_elements = get_num_elements(num_wgs); + ::std::size_t tile_values_bytes = get_tile_values_bytes(num_elements); + void* base_flags = reinterpret_cast(scan_memory_begin + tile_values_bytes); + auto remainder = get_padded_flag_bytes(num_elements); // scan_memory_bytes - tile_values_bytes + return reinterpret_cast<_FlagT*>( + ::std::align(::std::alignment_of_v<_FlagT>, get_flag_bytes(num_elements), base_flags, remainder)); + } + + static ::std::size_t + get_memory_size(::std::size_t num_wgs) + { + ::std::size_t num_elements = get_num_elements(num_wgs); + // sizeof(_T) extra bytes are not needed because LoopbackScanMemory is going at the beginning of the scratch + ::std::size_t tile_values_bytes = get_tile_values_bytes(num_elements); + // Padding to provide room for aligment + ::std::size_t flag_bytes = get_padded_flag_bytes(num_elements); + + return tile_values_bytes + flag_bytes; + } + + static ::std::size_t + get_num_elements(::std::size_t num_wgs) + { + return padding + num_wgs; + } + + static bool + is_ready(_FlagT flag) + { + return flag != NOT_READY; + } + + static bool + is_full(_FlagT flag) + { + return flag == FULL_MASK; + } + + static bool + is_out_of_bounds(_FlagT flag) + { + return flag == OUT_OF_BOUNDS; + } + + private: + ::std::size_t num_elements; + _FlagT* flags_begin; + _T* tile_values_begin; +}; + +struct TileId +{ + using _TileIdT = ::std::uint32_t; + using _AtomicTileRefT = sycl::atomic_ref<_TileIdT, sycl::memory_order::relaxed, sycl::memory_scope::device, + sycl::access::address_space::global_space>; + + TileId(_TileIdT* tileid_memory) : tile_counter(*(tileid_memory)) {} + + constexpr static ::std::size_t + get_padded_memory_size() + { + // extra sizeof(_TileIdT) for possible aligment issues + return sizeof(_TileIdT) + sizeof(_TileIdT); + } + + constexpr static ::std::size_t + get_memory_size() + { + // extra sizeof(_TileIdT) for possible aligment issues + return 
sizeof(_TileIdT); + } + + _TileIdT + fetch_inc() + { + return tile_counter.fetch_add(1); + } + + _AtomicTileRefT tile_counter; +}; + +struct cooperative_lookback +{ + + template typename LoopbackScanMemory> _T - cooperative_lookback(std::uint32_t tile_id, const _Subgroup& subgroup, BinOp bin_op, std::uint32_t* flags_begin, - _T* tile_sums) + operator()(std::uint32_t tile_id, const _Subgroup& subgroup, BinOp bin_op, LoopbackScanMemory<_T> memory) { + using FlagT = typename LoopbackScanMemory<_T>::_FlagT; + _T sum = 0; int offset = -1; int i = 0; @@ -68,24 +261,20 @@ struct __scan_status_flag for (int tile = static_cast(tile_id) + offset; tile >= 0; tile -= SUBGROUP_SIZE) { - _AtomicRefT tile_atomic(*(flags_begin + tile + padding - local_id)); - std::uint32_t flag; + FlagT flag; do { - flag = tile_atomic.load(); - } while (!sycl::all_of_group(subgroup, flag != NOT_READY)); // Loop till all ready + flag = memory.load_flag(tile - local_id); + } while (!sycl::all_of_group(subgroup, LoopbackScanMemory<_T>::is_ready(flag))); // Loop till all ready - bool is_full = flag == FULL_MASK; + bool is_full = LoopbackScanMemory<_T>::is_full(flag); auto is_full_ballot = sycl::ext::oneapi::group_ballot(subgroup, is_full); auto lowest_item_with_full = is_full_ballot.find_low(); - // The partial scan results and the full scan sum values are in contiguous memory. - // Each section of the memory is of size num_elements. - // The partial sum for a tile is at [i] and the full sum is at [i + num_elements] - // is_full * num_elements allows to select between the two values without branching the code. - size_t contrib_offset = tile + padding - local_id + is_full * num_elements; - _T val = *(tile_sums + contrib_offset); - _T contribution = local_id <= lowest_item_with_full && (tile - local_id >= 0) ? val : _T{0}; + // TODO: Use identity_fn for out of bounds values + _T contribution = local_id <= lowest_item_with_full && !LoopbackScanMemory<_T>::is_out_of_bounds(flag) + ? 
memory.get_value(tile - local_id, flag) + : _T{0}; // Sum all of the partial results from the tiles found, as well as the full contribution from the closest tile (if any) sum += sycl::reduce_over_group(subgroup, contribution, bin_op); @@ -100,12 +289,6 @@ struct __scan_status_flag return sum; } - - _AtomicRefT atomic_flag; - _T* scanned_partial_value; - _T* scanned_full_value; - - size_t num_elements; }; template @@ -113,6 +296,8 @@ void single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op) { using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + using _TileIdT = TileId::_TileIdT; + using _FlagT = typename LoopbackScanMemory<_Type>::_FlagT; static_assert(_Inclusive, "Single-pass scan only available for inclusive scan"); @@ -122,31 +307,22 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou constexpr ::std::size_t elems_per_workitem = _KernelParam::elems_per_workitem; // Avoid non_uniform n by padding up to a multiple of wgsize - std::uint32_t elems_in_tile = wgsize * elems_per_workitem; + ::std::uint32_t elems_in_tile = wgsize * elems_per_workitem; ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); ::std::size_t num_workitems = num_wgs * wgsize; - constexpr int status_flag_padding = SUBGROUP_SIZE; - std::size_t status_flags_elems = num_wgs + status_flag_padding + 1; - std::size_t status_flags_size = status_flags_elems * sizeof(std::uint32_t); - - std::size_t tile_sums_elems = num_wgs + status_flag_padding; - std::size_t tile_sums_size = status_flags_elems * sizeof(_Type); + ScanMemoryManager<_Type, LoopbackScanMemory, TileId> scratch(__queue); + scratch.allocate(num_wgs); - std::size_t extra_mem_for_aligment = alignof(_Type) - (status_flags_size % alignof(_Type)); - // status_flags_size for the status_flags - // extra_mem_for_aligment of the datatype _Type - // First tile_sums_size partial scanned values - // Second tile_sums_size full scanned values (current partial plus all previous workgroups partial) - char* mem_pool = - sycl::malloc_device(status_flags_size + extra_mem_for_aligment + 2 * tile_sums_size, __queue); + // Memory Structure: + // [Loopback Scan Memory, Tile Id Counter] + auto scan_memory_begin = scratch.scan_memory_ptr(); + auto status_flags_begin = LoopbackScanMemory<_Type>::get_flags_begin(scan_memory_begin, num_wgs); + auto tile_id_begin = scratch.tile_id_ptr(); - std::size_t tile_sums_offset = status_flags_size + extra_mem_for_aligment; - - std::uint32_t* status_flags = reinterpret_cast(mem_pool); - _Type* tile_sums = reinterpret_cast<_Type*>(mem_pool + tile_sums_offset); - - ::std::size_t fill_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(status_flags_elems, wgsize); + ::std::size_t num_elements = LoopbackScanMemory<_Type>::get_num_elements(num_wgs); + // fill_num_wgs num_elements + 1 to also initialize tile_id_counter + ::std::size_t fill_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(num_elements + 1, wgsize); auto fill_event = __queue.submit( [&](sycl::handler& hdl) @@ -155,14 +331,17 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou [=](const sycl::nd_item<1>& item) { int id = item.get_global_linear_id(); - if (id < status_flags_elems) - status_flags[id] = - id < status_flag_padding - ? __scan_status_flag<_Type>::OUT_OF_BOUNDS - : __scan_status_flag<_Type>::NOT_READY; + if (id < num_elements) + status_flags_begin[id] = + id < LoopbackScanMemory<_Type>::padding + ? 
LoopbackScanMemory<_Type>::OUT_OF_BOUNDS + : LoopbackScanMemory<_Type>::NOT_READY; + if (id == num_elements) + tile_id_begin[0] = 0; }); }); + auto event = __queue.submit([&](sycl::handler& hdl) { auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); auto tile_vals = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); @@ -176,12 +355,10 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou auto subgroup = item.get_sub_group(); // Obtain unique ID for this work-group that will be used in decoupled lookback + TileId dynamic_tile_id(tile_id_begin); if (group.leader()) { - sycl::atomic_ref<::std::uint32_t, sycl::memory_order::relaxed, sycl::memory_scope::device, - sycl::access::address_space::global_space> - idx_atomic(status_flags[status_flags_elems - 1]); - tile_id_lacc[0] = idx_atomic.fetch_add(1); + tile_id_lacc[0] = dynamic_tile_id.fetch_inc(); } sycl::group_barrier(group); std::uint32_t tile_id = tile_id_lacc[0]; @@ -207,7 +384,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou } sycl::group_barrier(group); - auto in_begin = tile_vals.get_pointer(); + auto in_begin = tile_vals.template get_multi_ptr().get(); auto in_end = in_begin + wg_local_memory_size; auto out_begin = __out_rng.begin() + wg_current_offset; @@ -217,16 +394,16 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou // The first sub-group will query the previous tiles to find a prefix if (subgroup.get_group_id() == 0) { - __scan_status_flag<_Type> flag(tile_id, status_flags, tile_sums, tile_sums_elems); + LoopbackScanMemory<_Type> scan_mem(scan_memory_begin, num_wgs); if (group.leader()) - flag.set_partial(local_sum); + scan_mem.set_partial(tile_id, local_sum); // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum - prev_sum = flag.cooperative_lookback(tile_id, subgroup, __binary_op, status_flags, tile_sums); + prev_sum = cooperative_lookback()(tile_id, subgroup, __binary_op, scan_mem); if (group.leader()) - flag.set_full(prev_sum + local_sum); + scan_mem.set_full(tile_id, prev_sum + local_sum); } prev_sum = sycl::group_broadcast(group, prev_sum, 0); @@ -234,12 +411,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou }); }); - auto free_event = __queue.submit( - [=](sycl::handler& hdl) - { - hdl.depends_on(event); - hdl.host_task([=](){ sycl::free(mem_pool, __queue); }); - }); + scratch.async_free(event); event.wait(); } From 8d23836ebb42aa9650cc0b3a865a4c716cb6e98b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alberto=20Cabrera=20P=C3=A9rez?= Date: Thu, 23 Nov 2023 14:11:27 +0000 Subject: [PATCH 060/134] [Scan_kt] Atomic64 flags + value implementation (#25) * Implemented atomic64 version of the scan_kt pass * Removed repeated offset calculation for tile id atomic flag * Loopback -> Lookback. Removed unused var. 
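Note on the encoding this patch introduces (an illustrative sketch only, not the code added below): each tile's status flag and its 32-bit scanned value are packed into one 64-bit word, so a single relaxed atomic store publishes both and the lookback can never observe a flag/value pair that is out of sync. The identifiers in this sketch are placeholders; the actual masks appear in the LookbackScanMemory<_T, std::true_type> specialization in the diff.

#include <cstdint>
#include <cassert>

using flag_t = std::uint64_t;

constexpr flag_t NOT_READY     = 0;
constexpr flag_t PARTIAL_MASK  = flag_t{1} << 62;           // 01... : partial tile sum published
constexpr flag_t FULL_MASK     = flag_t{1} << 63;           // 10... : full prefix published
constexpr flag_t OUT_OF_BOUNDS = PARTIAL_MASK | FULL_MASK;  // 11... : padding tile
constexpr flag_t VALUE_MASK    = (flag_t{1} << 32) - 1;     // low 32 bits carry the value

// One word carries both pieces, so a single 64-bit atomic store publishes them together.
constexpr flag_t pack(flag_t status, std::uint32_t value) { return status | value; }
constexpr std::uint32_t unpack_value(flag_t flag) { return static_cast<std::uint32_t>(flag & VALUE_MASK); }
constexpr bool is_full(flag_t flag) { return (flag & OUT_OF_BOUNDS) == FULL_MASK; }

int main()
{
    // Mirrors the worked example in the patch: a "full" status holding the value 15.
    flag_t f = pack(FULL_MASK, 15u);
    assert(is_full(f));
    assert(unpack_value(f) == 15u);
    return 0;
}

Because the value now travels inside the flag word, the atomic64 specialization's get_value() can ignore the tile index and the separate tile-values array disappears, which is why its get_memory_size() shrinks to num_elements * sizeof(_FlagT). This packing only works while the element type fits in 32 bits, matching the sizeof check and sycl::aspect::atomic64 dispatch added at the call site.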
--- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 351 ++++++++++++------ 1 file changed, 243 insertions(+), 108 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 314ace11410..6dfe1bb6ef1 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -26,11 +26,13 @@ inline namespace igpu { constexpr ::std::size_t SUBGROUP_SIZE = 32; -template typename LoopbackScanMemory, typename TileId> +template typename LookbackScanMemory, + typename TileId> struct ScanMemoryManager { using _TileIdT = typename TileId::_TileIdT; - using _FlagT = typename LoopbackScanMemory::_FlagT; + using _LookbackScanMemory = LookbackScanMemory; + using _FlagT = typename _LookbackScanMemory::_FlagT; ScanMemoryManager(sycl::queue q) : q{q} {}; @@ -49,7 +51,7 @@ struct ScanMemoryManager void allocate(::std::size_t num_wgs) { - ::std::size_t scan_memory_size = LoopbackScanMemory::get_memory_size(num_wgs); + ::std::size_t scan_memory_size = _LookbackScanMemory::get_memory_size(num_wgs); constexpr ::std::size_t padded_tileid_size = TileId::get_padded_memory_size(); constexpr ::std::size_t tileid_size = TileId::get_memory_size(); @@ -85,8 +87,11 @@ struct ScanMemoryManager sycl::queue q; }; +template +struct LookbackScanMemory; + template -struct LoopbackScanMemory +struct LookbackScanMemory<_T, /* UseAtomic64=*/::std::false_type> { using _FlagT = ::std::uint32_t; using _AtomicFlagRefT = sycl::atomic_ref<_FlagT, sycl::memory_order::acq_rel, sycl::memory_scope::device, @@ -99,13 +104,12 @@ struct LoopbackScanMemory static constexpr ::std::size_t padding = SUBGROUP_SIZE; - LoopbackScanMemory(::std::uint8_t* scan_memory_begin, ::std::size_t num_wgs) - : num_elements(get_num_elements(num_wgs)) + // LookbackScanMemory: [Partial Value, ..., Full Value, ..., Flag, ...] + // Each section has num_wgs + padding elements + LookbackScanMemory(::std::uint8_t* scan_memory_begin, ::std::size_t num_wgs) + : num_elements(get_num_elements(num_wgs)), tile_values_begin(reinterpret_cast<_T*>(scan_memory_begin)), + flags_begin(get_flags_begin(scan_memory_begin, num_wgs)) { - // LoopbackScanMemory: [Partial Value, ..., Full Value, ..., Flag, ...] 
- // Each section has num_wgs + padding elements - tile_values_begin = reinterpret_cast<_T*>(scan_memory_begin); - flags_begin = get_flags_begin(scan_memory_begin, num_wgs); } void @@ -126,19 +130,17 @@ struct LoopbackScanMemory atomic_flag.store(FULL_MASK); } - _FlagT - load_flag(::std::size_t tile_id) const + _AtomicFlagRefT + get_flag(::std::size_t tile_id) const { - _AtomicFlagRefT atomic_flag(*(flags_begin + tile_id + padding)); - - return atomic_flag.load(); + return _AtomicFlagRefT(*(flags_begin + tile_id + padding)); } _T get_value(::std::size_t tile_id, _FlagT flag) const { - ::std::size_t offset = tile_id + padding + num_elements * is_full(flag); - return tile_values_begin[offset]; + // full_value and partial_value are num_elements apart + return *(tile_values_begin + tile_id + padding + num_elements * is_full(flag)); } static ::std::size_t @@ -176,7 +178,7 @@ struct LoopbackScanMemory get_memory_size(::std::size_t num_wgs) { ::std::size_t num_elements = get_num_elements(num_wgs); - // sizeof(_T) extra bytes are not needed because LoopbackScanMemory is going at the beginning of the scratch + // sizeof(_T) extra bytes are not needed because LookbackScanMemory is going at the beginning of the scratch ::std::size_t tile_values_bytes = get_tile_values_bytes(num_elements); // Padding to provide room for aligment ::std::size_t flag_bytes = get_padded_flag_bytes(num_elements); @@ -214,6 +216,110 @@ struct LoopbackScanMemory _T* tile_values_begin; }; +template +struct LookbackScanMemory<_T, /* UseAtomic64=*/::std::true_type> +{ + using _FlagT = ::std::uint64_t; + using _AtomicFlagRefT = sycl::atomic_ref<_FlagT, sycl::memory_order::relaxed, sycl::memory_scope::device, + sycl::access::address_space::global_space>; + + // Each flag is divided in 2 32bit values + // 32..63 status bits + // 00..31 value bits + // Example: status = full scanned value, int value = 15: + // 1000 0000 0000 0000 0000 0000 0000 0000 | 0000 0000 0000 0000 0000 0000 0000 1111 + + // Status values: + // 00xxxx - not computed + // 01xxxx - partial + // 10xxxx - full + // 110000 - out of bounds + + static constexpr _FlagT NOT_READY = 0; + static constexpr _FlagT PARTIAL_MASK = 1l << (sizeof(_FlagT) * 8 - 2); + static constexpr _FlagT FULL_MASK = 1l << (sizeof(_FlagT) * 8 - 1); + static constexpr _FlagT OUT_OF_BOUNDS = PARTIAL_MASK | FULL_MASK; + + static constexpr _FlagT VALUE_MASK = (1l << sizeof(::std::uint32_t) * 8) - 1; // 32 bit mask to store value + + static constexpr ::std::size_t padding = SUBGROUP_SIZE; + + LookbackScanMemory(::std::uint8_t* scan_memory_begin, ::std::size_t num_wgs) + : num_elements(get_num_elements(num_wgs)), flags_begin(get_flags_begin(scan_memory_begin, num_wgs)) + { + } + + void + set_partial(::std::size_t tile_id, _T val) + { + _AtomicFlagRefT atomic_flag(*(flags_begin + tile_id + padding)); + + atomic_flag.store(PARTIAL_MASK | static_cast<::std::uint32_t>(val)); + } + + void + set_full(::std::size_t tile_id, _T val) + { + _AtomicFlagRefT atomic_flag(*(flags_begin + tile_id + padding)); + + atomic_flag.store(FULL_MASK | static_cast<::std::uint32_t>(val)); + } + + _AtomicFlagRefT + get_flag(::std::size_t tile_id) const + { + return _AtomicFlagRefT(*(flags_begin + tile_id + padding)); + } + + _T + get_value(::std::size_t, _FlagT flag) const + { + return static_cast<::std::uint32_t>(flag & VALUE_MASK); + } + + static _FlagT* + get_flags_begin(::std::uint8_t* scan_memory_begin, ::std::size_t) + { + return reinterpret_cast<_FlagT*>(scan_memory_begin); + } + + static ::std::size_t + 
get_memory_size(::std::size_t num_wgs) + { + ::std::size_t num_elements = get_num_elements(num_wgs); + return num_elements * sizeof(_FlagT); + } + + static ::std::size_t + get_num_elements(::std::size_t num_wgs) + { + return padding + num_wgs; + } + + static bool + is_ready(_FlagT flag) + { + // flag & OUT_OF_BOUNDS != NOT_READY means it has either partial or full value, or is out of bounds + return (flag & OUT_OF_BOUNDS) != NOT_READY; + } + + static bool + is_full(_FlagT flag) + { + return (flag & OUT_OF_BOUNDS) == FULL_MASK; + } + + static bool + is_out_of_bounds(_FlagT flag) + { + return (flag & OUT_OF_BOUNDS) == OUT_OF_BOUNDS; + } + + private: + ::std::size_t num_elements; + _FlagT* flags_begin; +}; + struct TileId { using _TileIdT = ::std::uint32_t; @@ -248,11 +354,14 @@ struct TileId struct cooperative_lookback { - template typename LoopbackScanMemory> + template typename LookbackScanMemory, typename UseAtomic64> _T - operator()(std::uint32_t tile_id, const _Subgroup& subgroup, BinOp bin_op, LoopbackScanMemory<_T> memory) + operator()(std::uint32_t tile_id, const _Subgroup& subgroup, BinOp bin_op, + LookbackScanMemory<_T, UseAtomic64> memory) { - using FlagT = typename LoopbackScanMemory<_T>::_FlagT; + using _LookbackScanMemory = LookbackScanMemory<_T, UseAtomic64>; + using FlagT = typename _LookbackScanMemory::_FlagT; _T sum = 0; int offset = -1; @@ -261,18 +370,19 @@ struct cooperative_lookback for (int tile = static_cast(tile_id) + offset; tile >= 0; tile -= SUBGROUP_SIZE) { + auto atomic_flag = memory.get_flag(tile - local_id); FlagT flag; do { - flag = memory.load_flag(tile - local_id); - } while (!sycl::all_of_group(subgroup, LoopbackScanMemory<_T>::is_ready(flag))); // Loop till all ready + flag = atomic_flag.load(); + } while (!sycl::all_of_group(subgroup, _LookbackScanMemory::is_ready(flag))); // Loop till all ready - bool is_full = LoopbackScanMemory<_T>::is_full(flag); + bool is_full = _LookbackScanMemory::is_full(flag); auto is_full_ballot = sycl::ext::oneapi::group_ballot(subgroup, is_full); auto lowest_item_with_full = is_full_ballot.find_low(); // TODO: Use identity_fn for out of bounds values - _T contribution = local_id <= lowest_item_with_full && !LoopbackScanMemory<_T>::is_out_of_bounds(flag) + _T contribution = local_id <= lowest_item_with_full && !_LookbackScanMemory::is_out_of_bounds(flag) ? 
memory.get_value(tile - local_id, flag) : _T{0}; @@ -291,124 +401,131 @@ struct cooperative_lookback } }; -template +template void single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op) { using _Type = oneapi::dpl::__internal::__value_t<_InRange>; using _TileIdT = TileId::_TileIdT; - using _FlagT = typename LoopbackScanMemory<_Type>::_FlagT; + using _LookbackScanMemory = LookbackScanMemory<_Type, _UseAtomic64>; + using _FlagT = typename _LookbackScanMemory::_FlagT; - static_assert(_Inclusive, "Single-pass scan only available for inclusive scan"); + static_assert(std::is_same_v<_Inclusive, ::std::true_type>, "Single-pass scan only available for inclusive scan"); const ::std::size_t n = __in_rng.size(); constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; constexpr ::std::size_t elems_per_workitem = _KernelParam::elems_per_workitem; - // Avoid non_uniform n by padding up to a multiple of wgsize ::std::uint32_t elems_in_tile = wgsize * elems_per_workitem; ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); ::std::size_t num_workitems = num_wgs * wgsize; - ScanMemoryManager<_Type, LoopbackScanMemory, TileId> scratch(__queue); + ScanMemoryManager<_Type, _UseAtomic64, LookbackScanMemory, TileId> scratch(__queue); scratch.allocate(num_wgs); // Memory Structure: - // [Loopback Scan Memory, Tile Id Counter] + // [Lookback Scan Memory, Tile Id Counter] auto scan_memory_begin = scratch.scan_memory_ptr(); - auto status_flags_begin = LoopbackScanMemory<_Type>::get_flags_begin(scan_memory_begin, num_wgs); + auto status_flags_begin = _LookbackScanMemory::get_flags_begin(scan_memory_begin, num_wgs); auto tile_id_begin = scratch.tile_id_ptr(); - ::std::size_t num_elements = LoopbackScanMemory<_Type>::get_num_elements(num_wgs); + ::std::size_t num_elements = _LookbackScanMemory::get_num_elements(num_wgs); // fill_num_wgs num_elements + 1 to also initialize tile_id_counter ::std::size_t fill_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(num_elements + 1, wgsize); auto fill_event = __queue.submit( [&](sycl::handler& hdl) { - hdl.parallel_for(sycl::nd_range<1>{fill_num_wgs * wgsize, wgsize}, - [=](const sycl::nd_item<1>& item) - { - int id = item.get_global_linear_id(); - if (id < num_elements) - status_flags_begin[id] = - id < LoopbackScanMemory<_Type>::padding - ? LoopbackScanMemory<_Type>::OUT_OF_BOUNDS - : LoopbackScanMemory<_Type>::NOT_READY; - if (id == num_elements) - tile_id_begin[0] = 0; - }); + hdl.parallel_for(sycl::nd_range<1>{fill_num_wgs * wgsize, wgsize}, + [=](const sycl::nd_item<1>& item) + { + int id = item.get_global_linear_id(); + if (id < num_elements) + status_flags_begin[id] = id < _LookbackScanMemory::padding + ? 
_LookbackScanMemory::OUT_OF_BOUNDS + : _LookbackScanMemory::NOT_READY; + if (id == num_elements) + tile_id_begin[0] = 0; + }); }); - auto event = __queue.submit([&](sycl::handler& hdl) { auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); auto tile_vals = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); hdl.depends_on(fill_event); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); - hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { - auto group = item.get_group(); - auto local_id = item.get_local_id(0); - auto stride = item.get_local_range(0); - auto subgroup = item.get_sub_group(); - - // Obtain unique ID for this work-group that will be used in decoupled lookback - TileId dynamic_tile_id(tile_id_begin); - if (group.leader()) - { - tile_id_lacc[0] = dynamic_tile_id.fetch_inc(); - } - sycl::group_barrier(group); - std::uint32_t tile_id = tile_id_lacc[0]; - - // Global load into local - auto wg_current_offset = (tile_id*elems_in_tile); - auto wg_next_offset = ((tile_id+1)*elems_in_tile); - size_t wg_local_memory_size = elems_in_tile; - if (wg_current_offset >= n) - return; - if (wg_next_offset > n) - wg_local_memory_size = n - wg_current_offset; - - if (wg_next_offset <= n) { - _ONEDPL_PRAGMA_UNROLL - for (std::uint32_t i = 0; i < elems_per_workitem; ++i) - tile_vals[local_id + stride * i] = __in_rng[wg_current_offset + local_id + stride * i]; - } else { - for (std::uint32_t i = 0; i < elems_per_workitem; ++i) { - if (wg_current_offset + local_id + stride * i < n) - tile_vals[local_id + stride * i] = __in_rng[wg_current_offset + local_id + stride * i]; - } - } - sycl::group_barrier(group); - - auto in_begin = tile_vals.template get_multi_ptr().get(); - auto in_end = in_begin + wg_local_memory_size; - auto out_begin = __out_rng.begin() + wg_current_offset; - - auto local_sum = sycl::joint_reduce(group, in_begin, in_end, __binary_op); - _Type prev_sum = 0; - - // The first sub-group will query the previous tiles to find a prefix - if (subgroup.get_group_id() == 0) - { - LoopbackScanMemory<_Type> scan_mem(scan_memory_begin, num_wgs); - - if (group.leader()) - scan_mem.set_partial(tile_id, local_sum); - - // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum - prev_sum = cooperative_lookback()(tile_id, subgroup, __binary_op, scan_mem); - - if (group.leader()) - scan_mem.set_full(tile_id, prev_sum + local_sum); - } - - prev_sum = sycl::group_broadcast(group, prev_sum, 0); - sycl::joint_inclusive_scan(group, in_begin, in_end, out_begin, __binary_op, prev_sum); - }); + hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), + [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] + { + auto group = item.get_group(); + auto local_id = item.get_local_id(0); + auto stride = item.get_local_range(0); + auto subgroup = item.get_sub_group(); + + // Obtain unique ID for this work-group that will be used in decoupled lookback + TileId dynamic_tile_id(tile_id_begin); + if (group.leader()) + { + tile_id_lacc[0] = dynamic_tile_id.fetch_inc(); + } + sycl::group_barrier(group); + std::uint32_t tile_id = tile_id_lacc[0]; + + // Global load into local + auto wg_current_offset = (tile_id * elems_in_tile); + auto wg_next_offset = ((tile_id + 1) * elems_in_tile); + size_t wg_local_memory_size = elems_in_tile; + if (wg_current_offset >= n) + return; + if (wg_next_offset > n) + 
wg_local_memory_size = n - wg_current_offset; + + if (wg_next_offset <= n) + { + _ONEDPL_PRAGMA_UNROLL + for (std::uint32_t i = 0; i < elems_per_workitem; ++i) + tile_vals[local_id + stride * i] = + __in_rng[wg_current_offset + local_id + stride * i]; + } + else + { + for (std::uint32_t i = 0; i < elems_per_workitem; ++i) + { + if (wg_current_offset + local_id + stride * i < n) + tile_vals[local_id + stride * i] = + __in_rng[wg_current_offset + local_id + stride * i]; + } + } + sycl::group_barrier(group); + + auto in_begin = tile_vals.template get_multi_ptr().get(); + auto in_end = in_begin + wg_local_memory_size; + auto out_begin = __out_rng.begin() + wg_current_offset; + + auto local_sum = sycl::joint_reduce(group, in_begin, in_end, __binary_op); + _Type prev_sum = 0; + + // The first sub-group will query the previous tiles to find a prefix + if (subgroup.get_group_id() == 0) + { + _LookbackScanMemory scan_mem(scan_memory_begin, num_wgs); + + if (group.leader()) + scan_mem.set_partial(tile_id, local_sum); + + // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum + prev_sum = cooperative_lookback()(tile_id, subgroup, __binary_op, scan_mem); + + if (group.leader()) + scan_mem.set_full(tile_id, prev_sum + local_sum); + } + + prev_sum = sycl::group_broadcast(group, prev_sum, 0); + sycl::joint_inclusive_scan(group, in_begin, in_end, out_begin, __binary_op, prev_sum); + }); }); scratch.async_free(event); @@ -438,7 +555,25 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); auto __buf2 = __keep2(__out_begin, __out_begin + __n); - single_pass_scan_impl<_KernelParam, true>(__queue, __buf1.all_view(), __buf2.all_view(), __binary_op); + // Avoid aspect query overhead for sizeof(Types) > 32 bits + if constexpr (sizeof(typename std::iterator_traits<_InIterator>::value_type) <= sizeof(std::uint32_t)) + { + if (__queue.get_device().has(sycl::aspect::atomic64)) + { + single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::true_type>( + __queue, __buf1.all_view(), __buf2.all_view(), __binary_op); + } + else + { + single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::false_type>( + __queue, __buf1.all_view(), __buf2.all_view(), __binary_op); + } + } + else + { + single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::false_type>( + __queue, __buf1.all_view(), __buf2.all_view(), __binary_op); + } } } // inline namespace igpu From c3c3218c6e2dedd844df5f3023d7a315ad12d0ac Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Thu, 23 Nov 2023 14:39:49 +0000 Subject: [PATCH 061/134] constexpr, types and remove an unneeded check --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 6dfe1bb6ef1..266d4b18657 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -364,8 +364,7 @@ struct cooperative_lookback using FlagT = typename _LookbackScanMemory::_FlagT; _T sum = 0; - int offset = -1; - int i = 0; + constexpr int offset = -1; int local_id = subgroup.get_local_id(); for (int tile = 
static_cast(tile_id) + offset; tile >= 0; tile -= SUBGROUP_SIZE) @@ -418,7 +417,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; constexpr ::std::size_t elems_per_workitem = _KernelParam::elems_per_workitem; // Avoid non_uniform n by padding up to a multiple of wgsize - ::std::uint32_t elems_in_tile = wgsize * elems_per_workitem; + constexpr ::std::uint32_t elems_in_tile = wgsize * elems_per_workitem; ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); ::std::size_t num_workitems = num_wgs * wgsize; @@ -461,8 +460,8 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { auto group = item.get_group(); - auto local_id = item.get_local_id(0); - auto stride = item.get_local_range(0); + ::std::uint32_t local_id = item.get_local_id(0); + constexpr ::std::uint32_t stride = wgsize; auto subgroup = item.get_sub_group(); // Obtain unique ID for this work-group that will be used in decoupled lookback @@ -477,9 +476,8 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou // Global load into local auto wg_current_offset = (tile_id * elems_in_tile); auto wg_next_offset = ((tile_id + 1) * elems_in_tile); - size_t wg_local_memory_size = elems_in_tile; - if (wg_current_offset >= n) - return; + auto wg_local_memory_size = elems_in_tile; + if (wg_next_offset > n) wg_local_memory_size = n - wg_current_offset; @@ -502,7 +500,6 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou sycl::group_barrier(group); auto in_begin = tile_vals.template get_multi_ptr().get(); - auto in_end = in_begin + wg_local_memory_size; auto out_begin = __out_rng.begin() + wg_current_offset; auto local_sum = sycl::joint_reduce(group, in_begin, in_end, __binary_op); From d2577024a0755c1cf9c3993ff2de02e6060af71f Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Thu, 23 Nov 2023 14:40:04 +0000 Subject: [PATCH 062/134] Correct static_cast ? 
--- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 266d4b18657..0655b60deb1 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -274,7 +274,7 @@ struct LookbackScanMemory<_T, /* UseAtomic64=*/::std::true_type> _T get_value(::std::size_t, _FlagT flag) const { - return static_cast<::std::uint32_t>(flag & VALUE_MASK); + return static_cast<_T>(flag & VALUE_MASK); } static _FlagT* From 43e17ba4c5aeba6a519ef8ba063210f8a08d73df Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Thu, 23 Nov 2023 14:40:19 +0000 Subject: [PATCH 063/134] Defer group comms in lookback --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 0655b60deb1..ce186b4ffa4 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -386,8 +386,7 @@ struct cooperative_lookback : _T{0}; // Sum all of the partial results from the tiles found, as well as the full contribution from the closest tile (if any) - sum += sycl::reduce_over_group(subgroup, contribution, bin_op); - + sum = bin_op(sum, contribution); // If we found a full value, we can stop looking at previous tiles. Otherwise, // keep going through tiles until we either find a full tile or we've completely // recomputed the prefix using partial values @@ -395,6 +394,7 @@ struct cooperative_lookback break; } + sum = sycl::reduce_over_group(subgroup, sum, bin_op); return sum; } From e5b3ca4bb386bb1a665f178348e63904f9aac61b Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Thu, 23 Nov 2023 14:40:52 +0000 Subject: [PATCH 064/134] Disable dynamic tile ID by default TODO: we still allocate & initialize the memory for the counter --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 35 ++++++++++++------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index ce186b4ffa4..007186a2f9a 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -400,8 +400,8 @@ struct cooperative_lookback } }; -template +template void single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op) { @@ -464,14 +464,23 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou constexpr ::std::uint32_t stride = wgsize; auto subgroup = item.get_sub_group(); - // Obtain unique ID for this work-group that will be used in decoupled lookback - TileId dynamic_tile_id(tile_id_begin); - if (group.leader()) - { - tile_id_lacc[0] = dynamic_tile_id.fetch_inc(); - } - sycl::group_barrier(group); - std::uint32_t tile_id = tile_id_lacc[0]; + std::uint32_t tile_id; + if constexpr (std::is_same_v<_UseDynamicTileID, ::std::true_type>) + { + // Obtain unique ID for this work-group that will be used in decoupled lookback + TileId dynamic_tile_id(tile_id_begin); + if (group.leader()) + { + 
tile_id_lacc[0] = dynamic_tile_id.fetch_inc(); + } + sycl::group_barrier(group); + tile_id = tile_id_lacc[0]; + } + else + { + tile_id = group.get_group_linear_id(); + } + // Global load into local auto wg_current_offset = (tile_id * elems_in_tile); @@ -557,18 +566,18 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera { if (__queue.get_device().has(sycl::aspect::atomic64)) { - single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::true_type>( + single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::true_type, /* UseDynamicTileID */ std::false_type>( __queue, __buf1.all_view(), __buf2.all_view(), __binary_op); } else { - single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::false_type>( + single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::false_type, /* UseDynamicTileID */ std::false_type>( __queue, __buf1.all_view(), __buf2.all_view(), __binary_op); } } else { - single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::false_type>( + single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::false_type, /* UseDynamicTileID */ std::false_type>( __queue, __buf1.all_view(), __buf2.all_view(), __binary_op); } } From ab346da026243d430f95b1f85ad86d20711f3939 Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Thu, 23 Nov 2023 14:41:32 +0000 Subject: [PATCH 065/134] Reduce from register sums instead of local mem Also use #pragma unroll for now --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 007186a2f9a..e43cfee6aa6 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -489,29 +489,36 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou if (wg_next_offset > n) wg_local_memory_size = n - wg_current_offset; - + _Type my_reducer{}; if (wg_next_offset <= n) { - _ONEDPL_PRAGMA_UNROLL + #pragma unroll for (std::uint32_t i = 0; i < elems_per_workitem; ++i) - tile_vals[local_id + stride * i] = - __in_rng[wg_current_offset + local_id + stride * i]; + { + _Type in_val = __in_rng[wg_current_offset + local_id + stride * i]; + my_reducer = __binary_op(my_reducer, in_val); + tile_vals[local_id + stride * i] = in_val; + } } else { + #pragma unroll for (std::uint32_t i = 0; i < elems_per_workitem; ++i) { if (wg_current_offset + local_id + stride * i < n) - tile_vals[local_id + stride * i] = - __in_rng[wg_current_offset + local_id + stride * i]; + { + _Type in_val = __in_rng[wg_current_offset + local_id + stride * i]; + my_reducer = __binary_op(my_reducer, in_val); + tile_vals[local_id + stride * i] = in_val; + } } } - sycl::group_barrier(group); + + auto local_sum = sycl::reduce_over_group(group, my_reducer, __binary_op); auto in_begin = tile_vals.template get_multi_ptr().get(); auto out_begin = __out_rng.begin() + wg_current_offset; - auto local_sum = sycl::joint_reduce(group, in_begin, in_end, __binary_op); _Type prev_sum = 0; // The first sub-group will query the previous tiles to find a prefix From f87573c3f7b835fb3fea91735d33036af6652931 Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Thu, 23 Nov 2023 14:42:06 +0000 
Subject: [PATCH 066/134] Unrolled version of joint_inclusive_scan --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index e43cfee6aa6..68921c08c3c 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -536,8 +536,23 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou scan_mem.set_full(tile_id, prev_sum + local_sum); } - prev_sum = sycl::group_broadcast(group, prev_sum, 0); - sycl::joint_inclusive_scan(group, in_begin, in_end, out_begin, __binary_op, prev_sum); + _Type carry = sycl::group_broadcast(group, prev_sum, 0); + #pragma unroll + for (::std::uint32_t step = 0; step < elems_per_workitem; ++step) + { + ::std::uint32_t i = stride * step; + _Type x; + if (i + local_id < wg_local_memory_size) + { + x = in_begin[i + local_id]; + } + _Type out = sycl::inclusive_scan_over_group(group, x, __binary_op, carry); + if (i + local_id < wg_local_memory_size) + { + out_begin[i + local_id] = out; + } + carry = group_broadcast(group, out, stride - 1); + } }); }); From 621adf7eb4d661a4d1a9ef0ee65b3990987d7f69 Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Thu, 23 Nov 2023 15:25:30 +0000 Subject: [PATCH 067/134] Update include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Alberto Cabrera Pérez --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 68921c08c3c..dae5cd7a48e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -537,6 +537,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou } _Type carry = sycl::group_broadcast(group, prev_sum, 0); + // TODO: Find a fix for _ONEDPL_PRAGMA_UNROLL #pragma unroll for (::std::uint32_t step = 0; step < elems_per_workitem; ++step) { From b8c837f099a6961b5b3eef75489f8c39058f39e0 Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Thu, 23 Nov 2023 15:27:43 +0000 Subject: [PATCH 068/134] Add TODO --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index dae5cd7a48e..a85d86aeb31 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -489,6 +489,7 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou if (wg_next_offset > n) wg_local_memory_size = n - wg_current_offset; + //TODO: assumes default ctor produces identity w.r.t. 
__binary_op _Type my_reducer{}; if (wg_next_offset <= n) { From 8367be7eec618670aa809659994e85f97f7fe976 Mon Sep 17 00:00:00 2001 From: Alberto Cabrera Date: Tue, 28 Nov 2023 15:55:38 +0000 Subject: [PATCH 069/134] Changing fill kernel for a memset --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 39 ++++++++++--------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index a85d86aeb31..c1e1d2c0cbd 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -369,19 +369,20 @@ struct cooperative_lookback for (int tile = static_cast(tile_id) + offset; tile >= 0; tile -= SUBGROUP_SIZE) { - auto atomic_flag = memory.get_flag(tile - local_id); + auto atomic_flag = memory.get_flag(tile - local_id); // FlagT flag; do { flag = atomic_flag.load(); - } while (!sycl::all_of_group(subgroup, _LookbackScanMemory::is_ready(flag))); // Loop till all ready + } while (!sycl::all_of_group(subgroup, _LookbackScanMemory::is_ready(flag) || + (tile - local_id < 0))); // Loop till all ready bool is_full = _LookbackScanMemory::is_full(flag); auto is_full_ballot = sycl::ext::oneapi::group_ballot(subgroup, is_full); auto lowest_item_with_full = is_full_ballot.find_low(); // TODO: Use identity_fn for out of bounds values - _T contribution = local_id <= lowest_item_with_full && !_LookbackScanMemory::is_out_of_bounds(flag) + _T contribution = local_id <= lowest_item_with_full && (tile - local_id >= 0) ? memory.get_value(tile - local_id, flag) : _T{0}; @@ -434,21 +435,23 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou // fill_num_wgs num_elements + 1 to also initialize tile_id_counter ::std::size_t fill_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(num_elements + 1, wgsize); - auto fill_event = __queue.submit( - [&](sycl::handler& hdl) - { - hdl.parallel_for(sycl::nd_range<1>{fill_num_wgs * wgsize, wgsize}, - [=](const sycl::nd_item<1>& item) - { - int id = item.get_global_linear_id(); - if (id < num_elements) - status_flags_begin[id] = id < _LookbackScanMemory::padding - ? _LookbackScanMemory::OUT_OF_BOUNDS - : _LookbackScanMemory::NOT_READY; - if (id == num_elements) - tile_id_begin[0] = 0; - }); - }); + auto fill_event = __queue.memset(status_flags_begin, 0, num_elements * sizeof(_FlagT) + 1 * sizeof(_TileIdT)); + + // auto fill_event = __queue.submit( + // [&](sycl::handler& hdl) + // { + // hdl.parallel_for(sycl::nd_range<1>{fill_num_wgs * wgsize, wgsize}, + // [=](const sycl::nd_item<1>& item) + // { + // int id = item.get_global_linear_id(); + // if (id < num_elements) + // status_flags_begin[id] = id < _LookbackScanMemory::padding + // ? 
_LookbackScanMemory::OUT_OF_BOUNDS + // : _LookbackScanMemory::NOT_READY; + // if (id == num_elements) + // tile_id_begin[0] = 0; + // }); + // }); auto event = __queue.submit([&](sycl::handler& hdl) { auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); From 02ff9f338a9703367bf387e9042d5d8de4739411 Mon Sep 17 00:00:00 2001 From: Alberto Cabrera Date: Wed, 29 Nov 2023 15:19:30 +0000 Subject: [PATCH 070/134] Single wg implementation --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 358 ++++++++++++------ 1 file changed, 234 insertions(+), 124 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index c1e1d2c0cbd..345da745608 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -401,6 +401,89 @@ struct cooperative_lookback } }; +template +void +single_pass_scan_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op) +{ + using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + + static_assert(std::is_same_v<_Inclusive, ::std::true_type>, "Single-pass scan only available for inclusive scan"); + + const ::std::size_t n = __in_rng.size(); + + constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; + constexpr ::std::size_t elems_per_workitem = _KernelParam::elems_per_workitem; + // Avoid non_uniform n by padding up to a multiple of wgsize + constexpr ::std::uint32_t elems_in_tile = wgsize * elems_per_workitem; + constexpr ::std::size_t num_workitems = wgsize; + + auto event = __queue.submit([&](sycl::handler& hdl) { + auto tile_vals = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); + + oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); + hdl.parallel_for( + sycl::nd_range<1>(num_workitems, wgsize), [= + ](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { + auto group = item.get_group(); + ::std::uint32_t local_id = item.get_local_id(0); + constexpr ::std::uint32_t stride = wgsize; + auto subgroup = item.get_sub_group(); + + constexpr std::uint32_t tile_id = 0; + constexpr std::uint32_t wg_begin = 0; + constexpr std::uint32_t wg_end = elems_in_tile; + + std::uint32_t wg_local_memory_size = elems_in_tile; + + auto out_begin = __out_rng.begin(); + _Type carry = 0; + + // Global load into local + if (wg_end > n) + wg_local_memory_size = n; + + //TODO: assumes default ctor produces identity w.r.t. 
__binary_op + // _Type my_reducer{}; + if (wg_end <= n) + { +#pragma unroll + for (std::uint32_t step = 0; step < elems_per_workitem; ++step) + { + ::std::uint32_t i = stride * step; + _Type in_val = __in_rng[i + local_id]; + // my_reducer = __binary_op(my_reducer, in_val); + _Type out = sycl::inclusive_scan_over_group(group, in_val, __binary_op, carry); + out_begin[i + local_id] = out; + carry = group_broadcast(group, out, stride - 1); + } + } + else + { +#pragma unroll + for (std::uint32_t step = 0; step < elems_per_workitem; ++step) + { + ::std::uint32_t i = stride * step; + _Type in_val; + + if (i + local_id < n) + { + in_val = __in_rng[i + local_id]; + // my_reducer = __binary_op(my_reducer, in_val); + } + _Type out = sycl::inclusive_scan_over_group(group, in_val, __binary_op, carry); + if (i + local_id < n) + { + out_begin[i + local_id] = out; + } + carry = group_broadcast(group, out, stride - 1); + } + } + }); + }); + + event.wait(); +} + template void @@ -437,128 +520,111 @@ single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __ou auto fill_event = __queue.memset(status_flags_begin, 0, num_elements * sizeof(_FlagT) + 1 * sizeof(_TileIdT)); - // auto fill_event = __queue.submit( - // [&](sycl::handler& hdl) - // { - // hdl.parallel_for(sycl::nd_range<1>{fill_num_wgs * wgsize, wgsize}, - // [=](const sycl::nd_item<1>& item) - // { - // int id = item.get_global_linear_id(); - // if (id < num_elements) - // status_flags_begin[id] = id < _LookbackScanMemory::padding - // ? _LookbackScanMemory::OUT_OF_BOUNDS - // : _LookbackScanMemory::NOT_READY; - // if (id == num_elements) - // tile_id_begin[0] = 0; - // }); - // }); - auto event = __queue.submit([&](sycl::handler& hdl) { auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); auto tile_vals = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); hdl.depends_on(fill_event); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); - hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), - [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] - { - auto group = item.get_group(); - ::std::uint32_t local_id = item.get_local_id(0); - constexpr ::std::uint32_t stride = wgsize; - auto subgroup = item.get_sub_group(); - - std::uint32_t tile_id; - if constexpr (std::is_same_v<_UseDynamicTileID, ::std::true_type>) - { - // Obtain unique ID for this work-group that will be used in decoupled lookback - TileId dynamic_tile_id(tile_id_begin); - if (group.leader()) - { - tile_id_lacc[0] = dynamic_tile_id.fetch_inc(); - } - sycl::group_barrier(group); - tile_id = tile_id_lacc[0]; - } - else - { - tile_id = group.get_group_linear_id(); - } - - - // Global load into local - auto wg_current_offset = (tile_id * elems_in_tile); - auto wg_next_offset = ((tile_id + 1) * elems_in_tile); - auto wg_local_memory_size = elems_in_tile; - - if (wg_next_offset > n) - wg_local_memory_size = n - wg_current_offset; - //TODO: assumes default ctor produces identity w.r.t. 
__binary_op - _Type my_reducer{}; - if (wg_next_offset <= n) - { - #pragma unroll - for (std::uint32_t i = 0; i < elems_per_workitem; ++i) - { - _Type in_val = __in_rng[wg_current_offset + local_id + stride * i]; - my_reducer = __binary_op(my_reducer, in_val); - tile_vals[local_id + stride * i] = in_val; - } - } - else - { - #pragma unroll - for (std::uint32_t i = 0; i < elems_per_workitem; ++i) - { - if (wg_current_offset + local_id + stride * i < n) - { - _Type in_val = __in_rng[wg_current_offset + local_id + stride * i]; - my_reducer = __binary_op(my_reducer, in_val); - tile_vals[local_id + stride * i] = in_val; - } - } - } - - auto local_sum = sycl::reduce_over_group(group, my_reducer, __binary_op); - - auto in_begin = tile_vals.template get_multi_ptr().get(); - auto out_begin = __out_rng.begin() + wg_current_offset; - - _Type prev_sum = 0; - - // The first sub-group will query the previous tiles to find a prefix - if (subgroup.get_group_id() == 0) - { - _LookbackScanMemory scan_mem(scan_memory_begin, num_wgs); - - if (group.leader()) - scan_mem.set_partial(tile_id, local_sum); - - // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum - prev_sum = cooperative_lookback()(tile_id, subgroup, __binary_op, scan_mem); - - if (group.leader()) - scan_mem.set_full(tile_id, prev_sum + local_sum); - } - - _Type carry = sycl::group_broadcast(group, prev_sum, 0); - // TODO: Find a fix for _ONEDPL_PRAGMA_UNROLL - #pragma unroll - for (::std::uint32_t step = 0; step < elems_per_workitem; ++step) - { - ::std::uint32_t i = stride * step; - _Type x; - if (i + local_id < wg_local_memory_size) - { - x = in_begin[i + local_id]; - } - _Type out = sycl::inclusive_scan_over_group(group, x, __binary_op, carry); - if (i + local_id < wg_local_memory_size) - { - out_begin[i + local_id] = out; - } - carry = group_broadcast(group, out, stride - 1); - } - }); + hdl.parallel_for( + sycl::nd_range<1>(num_workitems, wgsize), [= + ](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { + auto group = item.get_group(); + ::std::uint32_t local_id = item.get_local_id(0); + constexpr ::std::uint32_t stride = wgsize; + auto subgroup = item.get_sub_group(); + + std::uint32_t tile_id; + if constexpr (std::is_same_v<_UseDynamicTileID, ::std::true_type>) + { + // Obtain unique ID for this work-group that will be used in decoupled lookback + TileId dynamic_tile_id(tile_id_begin); + if (group.leader()) + { + tile_id_lacc[0] = dynamic_tile_id.fetch_inc(); + } + sycl::group_barrier(group); + tile_id = tile_id_lacc[0]; + } + else + { + tile_id = group.get_group_linear_id(); + } + + // Global load into local + auto wg_current_offset = (tile_id * elems_in_tile); + auto wg_next_offset = ((tile_id + 1) * elems_in_tile); + auto wg_local_memory_size = elems_in_tile; + + if (wg_next_offset > n) + wg_local_memory_size = n - wg_current_offset; + //TODO: assumes default ctor produces identity w.r.t. 
__binary_op + _Type my_reducer{}; + if (wg_next_offset <= n) + { +#pragma unroll + for (std::uint32_t i = 0; i < elems_per_workitem; ++i) + { + _Type in_val = __in_rng[wg_current_offset + local_id + stride * i]; + my_reducer = __binary_op(my_reducer, in_val); + tile_vals[local_id + stride * i] = in_val; + } + } + else + { +#pragma unroll + for (std::uint32_t i = 0; i < elems_per_workitem; ++i) + { + if (wg_current_offset + local_id + stride * i < n) + { + _Type in_val = __in_rng[wg_current_offset + local_id + stride * i]; + my_reducer = __binary_op(my_reducer, in_val); + tile_vals[local_id + stride * i] = in_val; + } + } + } + + auto local_sum = sycl::reduce_over_group(group, my_reducer, __binary_op); + + auto in_begin = tile_vals.template get_multi_ptr().get(); + auto out_begin = __out_rng.begin() + wg_current_offset; + + _Type prev_sum = 0; + + // The first sub-group will query the previous tiles to find a prefix + if (subgroup.get_group_id() == 0) + { + _LookbackScanMemory scan_mem(scan_memory_begin, num_wgs); + + if (group.leader()) + scan_mem.set_partial(tile_id, local_sum); + + // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum + prev_sum = cooperative_lookback()(tile_id, subgroup, __binary_op, scan_mem); + + if (group.leader()) + scan_mem.set_full(tile_id, prev_sum + local_sum); + } + + _Type carry = sycl::group_broadcast(group, prev_sum, 0); +// TODO: Find a fix for _ONEDPL_PRAGMA_UNROLL +#pragma unroll + for (::std::uint32_t step = 0; step < elems_per_workitem; ++step) + { + ::std::uint32_t i = stride * step; + _Type x; + if (i + local_id < wg_local_memory_size) + { + x = in_begin[i + local_id]; + } + _Type out = sycl::inclusive_scan_over_group(group, x, __binary_op, carry); + if (i + local_id < wg_local_memory_size) + { + out_begin[i + local_id] = out; + } + carry = group_broadcast(group, out, stride - 1); + } + }); }); scratch.async_free(event); @@ -575,9 +641,10 @@ struct kernel_param using kernel_name = KernelName; }; -template +template void -single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _BinaryOp __binary_op) +single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, + _BinaryOp __binary_op) { auto __n = __in_end - __in_begin; @@ -593,19 +660,62 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera { if (__queue.get_device().has(sycl::aspect::atomic64)) { - single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::true_type, /* UseDynamicTileID */ std::false_type>( - __queue, __buf1.all_view(), __buf2.all_view(), __binary_op); + single_pass_scan_impl<_KernelParam, _Inclusive, /* UseAtomic64 */ std::true_type, + /* UseDynamicTileID */ std::false_type>(__queue, __buf1.all_view(), __buf2.all_view(), + __binary_op); } else { - single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::false_type, /* UseDynamicTileID */ std::false_type>( - __queue, __buf1.all_view(), __buf2.all_view(), __binary_op); + single_pass_scan_impl<_KernelParam, _Inclusive, /* UseAtomic64 */ std::false_type, + /* UseDynamicTileID */ std::false_type>(__queue, __buf1.all_view(), __buf2.all_view(), + __binary_op); } } else { - single_pass_scan_impl<_KernelParam, /* Inclusive */ std::true_type, /* UseAtomic64 */ std::false_type, /* UseDynamicTileID */ std::false_type>( - __queue, __buf1.all_view(), 
__buf2.all_view(), __binary_op); + single_pass_scan_impl<_KernelParam, _Inclusive, /* UseAtomic64 */ std::false_type, + /* UseDynamicTileID */ std::false_type>(__queue, __buf1.all_view(), __buf2.all_view(), + __binary_op); + } +} + +template +void +single_pass_single_wg_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, + _OutIterator __out_begin, _BinaryOp __binary_op) +{ + auto __n = __in_end - __in_begin; + + auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); + auto __buf1 = __keep1(__in_begin, __in_end); + auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); + auto __buf2 = __keep2(__out_begin, __out_begin + __n); + + // Avoid aspect query overhead for sizeof(Types) > 32 bits + single_pass_scan_impl_single_wg<_KernelParam, /* Inclusive */ std::true_type>(__queue, __buf1.all_view(), + __buf2.all_view(), __binary_op); +} + +template +void +single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, + _BinaryOp __binary_op) +{ + constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; + constexpr ::std::size_t elems_per_workitem = _KernelParam::elems_per_workitem; + // Avoid non_uniform n by padding up to a multiple of wgsize + constexpr ::std::uint32_t elems_in_tile = wgsize * elems_per_workitem; + auto __n = __in_end - __in_begin; + + if (__n <= elems_in_tile) + { + single_pass_single_wg_inclusive_scan<_KernelParam, /* Inclusive */ std::true_type>( + __queue, __in_begin, __in_end, __out_begin, __binary_op); + } + else + { + single_pass_inclusive_scan<_KernelParam, /* Inclusive */ std::true_type>(__queue, __in_begin, __in_end, + __out_begin, __binary_op); } } From 25a93ff4640a6e33de6f66f0986f7ef05cadaa48 Mon Sep 17 00:00:00 2001 From: "aidan.belton" Date: Tue, 21 Nov 2023 10:38:29 +0000 Subject: [PATCH 071/134] Add phase 1 --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 123 ++++++++++++++++++ .../numeric/numeric.ops/copy_if_kt.pass.cpp | 77 +++++++++++ 2 files changed, 200 insertions(+) create mode 100644 test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 345da745608..c6da15a17b0 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -719,6 +719,129 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera } } +// Load function to try and get some PVC perf w/ coalesced +template +inline Tp load(sycl::sub_group sg, _InRange src, size_t i, size_t wg_stride, size_t wg_group_id) { + // if constexpr (std::is_arithmetic_v) { + // return sg.load(src.begin() + i / SUBGROUP_SIZE + wg_stride * wg_group_id); + // } + return src[i + wg_stride * wg_group_id]; +} + +// Load with checking for the subgroup case +template +inline Tp load(sycl::sub_group sg, _InRange src, size_t i, size_t wg_stride, size_t wg_group_id, size_t input_size) { + // if constexpr (std::is_arithmetic_v) { + // if (i / SUBGROUP_SIZE + SUBGROUP_SIZE + wg_stride * wg_group_id <= input_size) + // return sg.load(src.begin() + i / SUBGROUP_SIZE + wg_stride * wg_group_id); + // return src[i + wg_stride * wg_group_id]; + // } + return src[i + wg_stride * wg_group_id]; +} + +template +void +single_pass_copy_if_impl(sycl::queue __queue, 
_InRange&& __in_rng, _OutRange&& __out_rng, _UnaryPredicate pred) +{ + using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + + const ::std::size_t n = __in_rng.size(); + + constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; + constexpr ::std::size_t elems_per_workitem = _KernelParam::elems_per_workitem; + + // Avoid non_uniform n by padding up to a multiple of wgsize + std::uint32_t elems_in_tile = wgsize * elems_per_workitem; + ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); + ::std::size_t num_workitems = num_wgs * wgsize; + + auto event = __queue.submit([&](sycl::handler& hdl) { + auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_per_workitem*wgsize}, hdl); + auto l_wg_count = sycl::local_accessor(sycl::range<1>{1}, hdl); + + oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); + hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { + auto group = item.get_group(); + auto wg_group_id = item.get_group(0); + auto wg_local_id = item.get_local_id(0); + auto sg = item.get_sub_group(); + + // Must be a better way to init atomics + l_wg_count[0] = 0; + sycl::group_barrier(group); + sycl::atomic_ref wg_count(l_wg_count[0]); + + constexpr size_t elems_per_workgroup = elems_per_workitem*wgsize; + + // Phase 1: Create wg_count and construct in-order wg_copy_if_values + if ((wg_group_id + 1) * elems_per_workgroup <= n) { + #pragma unroll + for (size_t i = wg_local_id; i < elems_per_workgroup; i += wgsize) { + _Type val = load<_Type>(sg, __in_rng, i, elems_per_workgroup, wg_group_id); + + size_t satisfies_pred = pred(val); + //size_t satisfies_pred = 0; + size_t count = sycl::exclusive_scan_over_group(group, satisfies_pred, sycl::plus()); + + if (satisfies_pred) + wg_copy_if_values[count + wg_count.load()] = val; + + if (wg_local_id == (wgsize - 1)) + wg_count += (count + satisfies_pred); + sycl::group_barrier(group); + } + } + else { + // Edge of input, have to handle memory bounds + // Might have unneccessary group_barrier calls + #pragma unroll + for (size_t i = wg_local_id; i < elems_per_workgroup; i += wgsize) { + size_t satisfies_pred = 0; + _Type val; // TODO: alloca + if (i + elems_per_workgroup * wg_group_id < n) { + val = load<_Type>(sg, __in_rng, i, elems_per_workgroup, wg_group_id, n); + + satisfies_pred = pred(val); + } + size_t count = sycl::exclusive_scan_over_group(group, satisfies_pred, sycl::plus()); + + if (satisfies_pred) + wg_copy_if_values[count + wg_count.load()] = val; + + if (wg_local_id == (wgsize - 1)) + wg_count += (count + satisfies_pred); + sycl::group_barrier(group); + } + } + // Check behaviour + if (group.leader()) { + __out_rng[wg_group_id] = wg_count.load(); + } + + // Phase 2: Global scan across wg_count + + // Phase 3: copy values to global memory + }); + }); + event.wait(); +} + +template +void +single_pass_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _UnaryPredicate pred) +{ + auto __n = __in_end - __in_begin; + + auto __keep1 = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); + auto __buf1 = __keep1(__in_begin, __in_end); + auto __keep2 = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); + auto __buf2 = __keep2(__out_begin, __out_begin + __n); + + single_pass_copy_if_impl<_KernelParam>(__queue, __buf1.all_view(), 
__buf2.all_view(), pred); +} + } // inline namespace igpu } // namespace oneapi::dpl::experimental::kt diff --git a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp new file mode 100644 index 00000000000..459449d933d --- /dev/null +++ b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp @@ -0,0 +1,77 @@ +// -*- C++ -*- +//===-- scan.pass.cpp -----------------------------------------------------===// +// +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// This file incorporates work covered by the following copyright and permission +// notice: +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#include "support/test_config.h" + +#include _PSTL_TEST_HEADER(execution) +#include _PSTL_TEST_HEADER(numeric) + +int +main() +{ + bool all_passed = true; + sycl::queue q; + + for (int logn : {4, 8, 10, 12, 14}) + { + std::cout << "Testing 2^" << logn << std::endl; + int n = 1 << logn; + std::cout << "n:" << n << std::endl; + std::vector v(n, 0); + for (size_t i = 0; i < v.size(); ++i) + std::cout << v[i] << ","; + std::cout << std::endl; + + int* in_ptr = sycl::malloc_device(n, q); + int* out_ptr = sycl::malloc_device(n, q); + + constexpr int n_elements_per_workitem = 8; + + q.copy(v.data(), in_ptr, n).wait(); + using KernelParams = oneapi::dpl::experimental::kt::kernel_param; + oneapi::dpl::experimental::kt::single_pass_copy_if(q, in_ptr, in_ptr+n, out_ptr, [](int x) { return x == 0; }); + + std::vector tmp(n, 0); + q.copy(out_ptr, tmp.data(), n); + q.wait(); + + std::copy_if(v.begin(), v.end(), v.begin(), [](int x) { return x == 0; }); + + bool passed = true; + // for (size_t i = 0; i < n; ++i) + // { + // if (tmp[i] != v[i]) + // { + // passed = false; + // std::cout << "expected " << i << ' ' << v[i] << ' ' << tmp[i] << '\n'; + // } + // } + + // if (passed) + // std::cout << " passed" << std::endl; + // else + // std::cout << " failed" << std::endl; + + for (size_t i = 0; i < n/(n_elements_per_workitem*128) + 1; ++i) { + std::cout << "i:" << i << " count:" << tmp[i] << std::endl; + } + + all_passed &= passed; + sycl::free(in_ptr, q); + sycl::free(out_ptr, q); + } + + return !all_passed; +} From aea60093c080026093f671bdd843578b46f5ba0d Mon Sep 17 00:00:00 2001 From: "aidan.belton" Date: Mon, 27 Nov 2023 13:26:38 +0000 Subject: [PATCH 072/134] Add phase 2 --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 112 +++++++++++++++--- .../numeric/numeric.ops/copy_if_kt.pass.cpp | 6 +- 2 files changed, 99 insertions(+), 19 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index c6da15a17b0..5a9d3241574 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -739,11 +739,14 @@ inline Tp load(sycl::sub_group sg, _InRange src, size_t i, size_t wg_stride, siz return src[i + wg_stride * wg_group_id]; } -template +template void single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _UnaryPredicate pred) { using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + using _TileIdT = TileId::_TileIdT; + using _LookbackScanMemory = LookbackScanMemory<_Type, 
_UseAtomic64>; + using _FlagT = typename _LookbackScanMemory::_FlagT; const ::std::size_t n = __in_rng.size(); @@ -751,33 +754,87 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ constexpr ::std::size_t elems_per_workitem = _KernelParam::elems_per_workitem; // Avoid non_uniform n by padding up to a multiple of wgsize - std::uint32_t elems_in_tile = wgsize * elems_per_workitem; + constexpr std::uint32_t elems_in_tile = wgsize * elems_per_workitem; ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); ::std::size_t num_workitems = num_wgs * wgsize; + ScanMemoryManager<_Type, _UseAtomic64, LookbackScanMemory, TileId> scratch(__queue); + scratch.allocate(num_wgs); + + // Memory Structure: + // [Lookback Scan Memory, Tile Id Counter] + auto scan_memory_begin = scratch.scan_memory_ptr(); + auto status_flags_begin = _LookbackScanMemory::get_flags_begin(scan_memory_begin, num_wgs); + auto tile_id_begin = scratch.tile_id_ptr(); + + ::std::size_t num_elements = _LookbackScanMemory::get_num_elements(num_wgs); + // fill_num_wgs num_elements + 1 to also initialize tile_id_counter + ::std::size_t fill_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(num_elements + 1, wgsize); + + auto fill_event = __queue.submit( + [&](sycl::handler& hdl) + { + hdl.parallel_for(sycl::nd_range<1>{fill_num_wgs * wgsize, wgsize}, + [=](const sycl::nd_item<1>& item) + { + int id = item.get_global_linear_id(); + if (id < num_elements) + status_flags_begin[id] = id < _LookbackScanMemory::padding + ? _LookbackScanMemory::OUT_OF_BOUNDS + : _LookbackScanMemory::NOT_READY; + if (id == num_elements) + tile_id_begin[0] = 0; + }); + }); + auto event = __queue.submit([&](sycl::handler& hdl) { auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_per_workitem*wgsize}, hdl); auto l_wg_count = sycl::local_accessor(sycl::range<1>{1}, hdl); + auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); + auto tile_vals = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); + oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); - hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { + hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { auto group = item.get_group(); auto wg_group_id = item.get_group(0); auto wg_local_id = item.get_local_id(0); auto sg = item.get_sub_group(); + constexpr ::std::uint32_t stride = wgsize; + + // Init tile_id + std::uint32_t tile_id; + if constexpr (std::is_same_v<_UseDynamicTileID, ::std::true_type>) + { + // Obtain unique ID for this work-group that will be used in decoupled lookback + TileId dynamic_tile_id(tile_id_begin); + if (group.leader()) + { + tile_id_lacc[0] = dynamic_tile_id.fetch_inc(); + } + sycl::group_barrier(group); + tile_id = tile_id_lacc[0]; + } + else + { + tile_id = group.get_group_linear_id(); + } + + // Global load into local + auto wg_current_offset = (tile_id * elems_in_tile); + auto wg_next_offset = ((tile_id + 1) * elems_in_tile); + auto wg_local_memory_size = elems_in_tile; // Must be a better way to init atomics l_wg_count[0] = 0; sycl::group_barrier(group); sycl::atomic_ref wg_count(l_wg_count[0]); - constexpr size_t elems_per_workgroup = elems_per_workitem*wgsize; - // Phase 1: Create wg_count and construct in-order wg_copy_if_values - if ((wg_group_id + 1) * elems_per_workgroup 
<= n) { + if ((wg_group_id + 1) * elems_in_tile <= n) { #pragma unroll - for (size_t i = wg_local_id; i < elems_per_workgroup; i += wgsize) { - _Type val = load<_Type>(sg, __in_rng, i, elems_per_workgroup, wg_group_id); + for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { + _Type val = load<_Type>(sg, __in_rng, i, elems_in_tile, wg_group_id); size_t satisfies_pred = pred(val); //size_t satisfies_pred = 0; @@ -794,12 +851,12 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ else { // Edge of input, have to handle memory bounds // Might have unneccessary group_barrier calls - #pragma unroll - for (size_t i = wg_local_id; i < elems_per_workgroup; i += wgsize) { + //#pragma unroll + for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { size_t satisfies_pred = 0; _Type val; // TODO: alloca - if (i + elems_per_workgroup * wg_group_id < n) { - val = load<_Type>(sg, __in_rng, i, elems_per_workgroup, wg_group_id, n); + if (i + elems_in_tile * wg_group_id < n) { + val = load<_Type>(sg, __in_rng, i, elems_in_tile, wg_group_id, n); satisfies_pred = pred(val); } @@ -813,13 +870,36 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ sycl::group_barrier(group); } } + + // Phase 2: Global scan across wg_count + auto local_sum = wg_count.load(); + + auto in_begin = tile_vals.get_pointer(); + + _Type prev_sum = 0; + + // The first sub-group will query the previous tiles to find a prefix + if (sg.get_group_id() == 0) + { + _LookbackScanMemory scan_mem(scan_memory_begin, num_wgs); + + if (group.leader()) + scan_mem.set_partial(tile_id, local_sum); + + // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum + prev_sum = cooperative_lookback()(tile_id, sg, sycl::plus<_Type>(), scan_mem); + + if (group.leader()) + scan_mem.set_full(tile_id, prev_sum + local_sum); + } + + _Type carry = sycl::group_broadcast(group, prev_sum, 0); + // Check behaviour if (group.leader()) { - __out_rng[wg_group_id] = wg_count.load(); + __out_rng[wg_group_id] = carry; } - // Phase 2: Global scan across wg_count - // Phase 3: copy values to global memory }); }); @@ -839,7 +919,7 @@ single_pass_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __i oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); auto __buf2 = __keep2(__out_begin, __out_begin + __n); - single_pass_copy_if_impl<_KernelParam>(__queue, __buf1.all_view(), __buf2.all_view(), pred); + single_pass_copy_if_impl<_KernelParam, /* UseAtomic64 */ std::true_type, /* UseDynamicTileID */ std::true_type>(__queue, __buf1.all_view(), __buf2.all_view(), pred); } } // inline namespace igpu diff --git a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp index 459449d933d..917e88a7707 100644 --- a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp @@ -30,9 +30,9 @@ main() int n = 1 << logn; std::cout << "n:" << n << std::endl; std::vector v(n, 0); - for (size_t i = 0; i < v.size(); ++i) - std::cout << v[i] << ","; - std::cout << std::endl; + //for (size_t i = 0; i < v.size(); ++i) + // std::cout << v[i] << ","; + //std::cout << std::endl; int* in_ptr = sycl::malloc_device(n, q); int* out_ptr = sycl::malloc_device(n, q); From d5c2cb5d130cef60f25757915e02bcf16f6eb695 Mon Sep 17 00:00:00 2001 From: Aidan Date: Tue, 28 Nov 2023 15:19:56 
+0000 Subject: [PATCH 073/134] Add phase 3 --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 67 ++++++++------- .../numeric/numeric.ops/copy_if_kt.pass.cpp | 86 ++++++++++++------- 2 files changed, 87 insertions(+), 66 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 5a9d3241574..63a59476234 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -721,27 +721,27 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera // Load function to try and get some PVC perf w/ coalesced template -inline Tp load(sycl::sub_group sg, _InRange src, size_t i, size_t wg_stride, size_t wg_group_id) { +inline Tp load(sycl::sub_group sg, _InRange src, size_t i, size_t wg_stride, size_t tile_id) { // if constexpr (std::is_arithmetic_v) { - // return sg.load(src.begin() + i / SUBGROUP_SIZE + wg_stride * wg_group_id); + // return sg.load(src.begin() + i / SUBGROUP_SIZE + wg_stride * tile_id); // } - return src[i + wg_stride * wg_group_id]; + return src[i + wg_stride * tile_id]; } // Load with checking for the subgroup case template -inline Tp load(sycl::sub_group sg, _InRange src, size_t i, size_t wg_stride, size_t wg_group_id, size_t input_size) { +inline Tp load(sycl::sub_group sg, _InRange src, size_t i, size_t wg_stride, size_t tile_id, size_t input_size) { // if constexpr (std::is_arithmetic_v) { - // if (i / SUBGROUP_SIZE + SUBGROUP_SIZE + wg_stride * wg_group_id <= input_size) - // return sg.load(src.begin() + i / SUBGROUP_SIZE + wg_stride * wg_group_id); - // return src[i + wg_stride * wg_group_id]; + // if (i / SUBGROUP_SIZE + SUBGROUP_SIZE + wg_stride * tile_id <= input_size) + // return sg.load(src.begin() + i / SUBGROUP_SIZE + wg_stride * tile_id); + // return src[i + wg_stride * tile_id]; // } - return src[i + wg_stride * wg_group_id]; + return src[i + wg_stride * tile_id]; } -template +template void -single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _UnaryPredicate pred) +single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, _UnaryPredicate pred) { using _Type = oneapi::dpl::__internal::__value_t<_InRange>; using _TileIdT = TileId::_TileIdT; @@ -793,11 +793,11 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); auto tile_vals = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); + hdl.depends_on(fill_event); - oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); - hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { + oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng, __num_rng); + hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { auto group = item.get_group(); - auto wg_group_id = item.get_group(0); auto wg_local_id = item.get_local_id(0); auto sg = item.get_sub_group(); constexpr ::std::uint32_t stride = wgsize; @@ -822,7 +822,6 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ // Global load into local auto wg_current_offset = (tile_id * elems_in_tile); - auto wg_next_offset = ((tile_id + 1) * 
elems_in_tile); auto wg_local_memory_size = elems_in_tile; // Must be a better way to init atomics @@ -831,10 +830,10 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ sycl::atomic_ref wg_count(l_wg_count[0]); // Phase 1: Create wg_count and construct in-order wg_copy_if_values - if ((wg_group_id + 1) * elems_in_tile <= n) { + if ((tile_id + 1) * elems_in_tile <= n) { #pragma unroll for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { - _Type val = load<_Type>(sg, __in_rng, i, elems_in_tile, wg_group_id); + _Type val = load<_Type>(sg, __in_rng, i, elems_in_tile, tile_id); size_t satisfies_pred = pred(val); //size_t satisfies_pred = 0; @@ -847,16 +846,15 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ wg_count += (count + satisfies_pred); sycl::group_barrier(group); } - } - else { + } else { // Edge of input, have to handle memory bounds // Might have unneccessary group_barrier calls - //#pragma unroll + #pragma unroll for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { size_t satisfies_pred = 0; _Type val; // TODO: alloca - if (i + elems_in_tile * wg_group_id < n) { - val = load<_Type>(sg, __in_rng, i, elems_in_tile, wg_group_id, n); + if (i + elems_in_tile * tile_id < n) { + val = load<_Type>(sg, __in_rng, i, elems_in_tile, tile_id, n); satisfies_pred = pred(val); } @@ -873,10 +871,8 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ // Phase 2: Global scan across wg_count auto local_sum = wg_count.load(); - auto in_begin = tile_vals.get_pointer(); - - _Type prev_sum = 0; + size_t prev_sum = 0; // The first sub-group will query the previous tiles to find a prefix if (sg.get_group_id() == 0) @@ -893,22 +889,23 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ scan_mem.set_full(tile_id, prev_sum + local_sum); } - _Type carry = sycl::group_broadcast(group, prev_sum, 0); + size_t start_idx = sycl::group_broadcast(group, prev_sum, 0); - // Check behaviour - if (group.leader()) { - __out_rng[wg_group_id] = carry; - } - // Phase 3: copy values to global memory + for (int i = wg_local_id; i < local_sum; i += wgsize) { + // Probably adjust method to try and get some perf on PVC for arithmetic types using sg.store + __out_rng[start_idx + i] = wg_copy_if_values[i]; + } + if (tile_id == (num_wgs - 1) && group.leader()) + __num_rng[0] = start_idx + local_sum; }); }); event.wait(); } -template +template void -single_pass_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _UnaryPredicate pred) +single_pass_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _NumSelectedRange __num_begin, _UnaryPredicate pred) { auto __n = __in_end - __in_begin; @@ -919,7 +916,11 @@ single_pass_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __i oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); auto __buf2 = __keep2(__out_begin, __out_begin + __n); - single_pass_copy_if_impl<_KernelParam, /* UseAtomic64 */ std::true_type, /* UseDynamicTileID */ std::true_type>(__queue, __buf1.all_view(), __buf2.all_view(), pred); + auto __keep_num = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _NumSelectedRange>(); + auto __buf_num = __keep2(__num_begin, __num_begin + 1); + + single_pass_copy_if_impl<_KernelParam, /* UseAtomic64 */ std::true_type, /* UseDynamicTileID */ 
std::true_type>(__queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), pred); } } // inline namespace igpu diff --git a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp index 917e88a7707..202f28fbaad 100644 --- a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp @@ -14,64 +14,84 @@ //===----------------------------------------------------------------------===// #include "support/test_config.h" +#include "support/utils.h" #include _PSTL_TEST_HEADER(execution) #include _PSTL_TEST_HEADER(numeric) -int -main() +using namespace TestUtils; + +template +class CopyIfKernel; + +template +bool test(Predicate pred, Generator gen) { bool all_passed = true; sycl::queue q; - for (int logn : {4, 8, 10, 12, 14}) + for (int logn : {4, 8, 10, 12, 14, 15, 18}) { - std::cout << "Testing 2^" << logn << std::endl; int n = 1 << logn; - std::cout << "n:" << n << std::endl; - std::vector v(n, 0); - //for (size_t i = 0; i < v.size(); ++i) - // std::cout << v[i] << ","; - //std::cout << std::endl; - int* in_ptr = sycl::malloc_device(n, q); - int* out_ptr = sycl::malloc_device(n, q); + Sequence in(n, [&](size_t k) -> T { + return gen(n ^ k); + }); + + Sequence std_out(n); + + T* in_ptr = sycl::malloc_device(n, q); + T* out_ptr = sycl::malloc_device(n, q); + size_t* out_num = sycl::malloc_device(1, q); constexpr int n_elements_per_workitem = 8; - q.copy(v.data(), in_ptr, n).wait(); - using KernelParams = oneapi::dpl::experimental::kt::kernel_param; - oneapi::dpl::experimental::kt::single_pass_copy_if(q, in_ptr, in_ptr+n, out_ptr, [](int x) { return x == 0; }); + q.copy(in.data(), in_ptr, n).wait(); + using KernelParams = oneapi::dpl::experimental::kt::kernel_param>; + oneapi::dpl::experimental::kt::single_pass_copy_if(q, in_ptr, in_ptr+n, out_ptr, out_num, pred); - std::vector tmp(n, 0); - q.copy(out_ptr, tmp.data(), n); + Sequence kt_out(n); + size_t num_selected = 0; + q.copy(out_ptr, kt_out.data(), n); + q.copy(out_num, &num_selected, 1); q.wait(); - std::copy_if(v.begin(), v.end(), v.begin(), [](int x) { return x == 0; }); + auto std_out_end = std::copy_if(in.begin(), in.end(), std_out.begin(), pred); bool passed = true; - // for (size_t i = 0; i < n; ++i) - // { - // if (tmp[i] != v[i]) - // { - // passed = false; - // std::cout << "expected " << i << ' ' << v[i] << ' ' << tmp[i] << '\n'; - // } - // } - - // if (passed) - // std::cout << " passed" << std::endl; - // else - // std::cout << " failed" << std::endl; - - for (size_t i = 0; i < n/(n_elements_per_workitem*128) + 1; ++i) { - std::cout << "i:" << i << " count:" << tmp[i] << std::endl; + if (num_selected != (std_out_end - std_out.begin())) { + passed = false; + std::cout << "Num selected wrong: expected " << (std_out_end - std_out.begin()) << " " << num_selected << "\n"; + } + + for (size_t i = 0; i < (std_out_end - std_out.begin()); ++i) + { + if (kt_out[i] != std_out[i]) + { + passed = false; + std::cout << "expected " << i << ' ' << std_out[i] << ' ' << kt_out[i] << '\n'; + } } + if (passed) + std::cout << " passed" << std::endl; + else + std::cout << " failed" << std::endl; + all_passed &= passed; sycl::free(in_ptr, q); sycl::free(out_ptr, q); + sycl::free(out_num, q); } return !all_passed; } + +int main() { + bool all_passed; + all_passed &= test([](const float64_t& x) { return x * x <= 1024; }, [](size_t j) { return ((j + 1) % 7 & 2) != 0 ? 
float64_t(j % 32) : float64_t(j % 33 + 34); }); + all_passed &= test([](const int&) { return true; }, [](size_t j) { return j; }); + all_passed &= test([](const std::int32_t& x) { return x != 42; }, [](size_t j) { return ((j + 1) % 5 & 2) != 0 ? std::int32_t(j + 1) : 42; }); + + return all_passed; +} From 1f574b8f501922f22c2b919e1fb11b0bae6480a2 Mon Sep 17 00:00:00 2001 From: Aidan Date: Tue, 5 Dec 2023 11:28:40 +0000 Subject: [PATCH 074/134] Add count datatype _SizeT --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 63a59476234..3d6289642bc 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -744,8 +744,9 @@ void single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, _UnaryPredicate pred) { using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + using _SizeT = uint64_t; using _TileIdT = TileId::_TileIdT; - using _LookbackScanMemory = LookbackScanMemory<_Type, _UseAtomic64>; + using _LookbackScanMemory = LookbackScanMemory<_SizeT, _UseAtomic64>; using _FlagT = typename _LookbackScanMemory::_FlagT; const ::std::size_t n = __in_rng.size(); @@ -758,7 +759,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); ::std::size_t num_workitems = num_wgs * wgsize; - ScanMemoryManager<_Type, _UseAtomic64, LookbackScanMemory, TileId> scratch(__queue); + ScanMemoryManager<_SizeT, _UseAtomic64, LookbackScanMemory, TileId> scratch(__queue); scratch.allocate(num_wgs); // Memory Structure: @@ -792,7 +793,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ auto l_wg_count = sycl::local_accessor(sycl::range<1>{1}, hdl); auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); - auto tile_vals = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); + auto tile_vals = sycl::local_accessor<_SizeT, 1>(sycl::range<1>{elems_in_tile}, hdl); hdl.depends_on(fill_event); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng, __num_rng); @@ -827,7 +828,8 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ // Must be a better way to init atomics l_wg_count[0] = 0; sycl::group_barrier(group); - sycl::atomic_ref wg_count(l_wg_count[0]); + sycl::atomic_ref<_SizeT, sycl::memory_order::acq_rel, sycl::memory_scope::work_group, sycl::access::address_space::local_space> wg_count(l_wg_count[0]); + sycl::group_barrier(group); // Phase 1: Create wg_count and construct in-order wg_copy_if_values if ((tile_id + 1) * elems_in_tile <= n) { @@ -835,9 +837,8 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { _Type val = load<_Type>(sg, __in_rng, i, elems_in_tile, tile_id); - size_t satisfies_pred = pred(val); - //size_t satisfies_pred = 0; - size_t count = sycl::exclusive_scan_over_group(group, satisfies_pred, sycl::plus()); + _SizeT satisfies_pred = pred(val); + _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, sycl::plus<_SizeT>()); if (satisfies_pred) wg_copy_if_values[count + wg_count.load()] = val; @@ -851,14 
+852,14 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ // Might have unneccessary group_barrier calls #pragma unroll for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { - size_t satisfies_pred = 0; + _SizeT satisfies_pred = 0; _Type val; // TODO: alloca if (i + elems_in_tile * tile_id < n) { val = load<_Type>(sg, __in_rng, i, elems_in_tile, tile_id, n); satisfies_pred = pred(val); } - size_t count = sycl::exclusive_scan_over_group(group, satisfies_pred, sycl::plus()); + _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, sycl::plus<_SizeT>()); if (satisfies_pred) wg_copy_if_values[count + wg_count.load()] = val; @@ -870,9 +871,9 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ } // Phase 2: Global scan across wg_count - auto local_sum = wg_count.load(); - auto in_begin = tile_vals.get_pointer(); - size_t prev_sum = 0; + _SizeT local_sum = wg_count.load(); + _SizeT* in_begin = tile_vals.get_pointer(); + _SizeT prev_sum = 0; // The first sub-group will query the previous tiles to find a prefix if (sg.get_group_id() == 0) @@ -883,13 +884,13 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ scan_mem.set_partial(tile_id, local_sum); // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum - prev_sum = cooperative_lookback()(tile_id, sg, sycl::plus<_Type>(), scan_mem); + prev_sum = cooperative_lookback()(tile_id, sg, sycl::plus<_SizeT>(), scan_mem); if (group.leader()) scan_mem.set_full(tile_id, prev_sum + local_sum); } - size_t start_idx = sycl::group_broadcast(group, prev_sum, 0); + _SizeT start_idx = sycl::group_broadcast(group, prev_sum, 0); // Phase 3: copy values to global memory for (int i = wg_local_id; i < local_sum; i += wgsize) { From 16ef9c2ad691c0c52b51b391a49697584921758c Mon Sep 17 00:00:00 2001 From: Aidan Date: Tue, 5 Dec 2023 12:24:51 +0000 Subject: [PATCH 075/134] Move away from atomics --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 36 +++++++------------ 1 file changed, 12 insertions(+), 24 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 3d6289642bc..60c2db24b78 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -789,11 +789,9 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ }); auto event = __queue.submit([&](sycl::handler& hdl) { - auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_per_workitem*wgsize}, hdl); - auto l_wg_count = sycl::local_accessor(sycl::range<1>{1}, hdl); + auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); - auto tile_vals = sycl::local_accessor<_SizeT, 1>(sycl::range<1>{elems_in_tile}, hdl); hdl.depends_on(fill_event); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng, __num_rng); @@ -825,11 +823,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ auto wg_current_offset = (tile_id * elems_in_tile); auto wg_local_memory_size = elems_in_tile; - // Must be a better way to init atomics - l_wg_count[0] = 0; - sycl::group_barrier(group); - sycl::atomic_ref<_SizeT, sycl::memory_order::acq_rel, 
sycl::memory_scope::work_group, sycl::access::address_space::local_space> wg_count(l_wg_count[0]); - sycl::group_barrier(group); + _SizeT wg_count = 0; // Phase 1: Create wg_count and construct in-order wg_copy_if_values if ((tile_id + 1) * elems_in_tile <= n) { @@ -838,14 +832,12 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ _Type val = load<_Type>(sg, __in_rng, i, elems_in_tile, tile_id); _SizeT satisfies_pred = pred(val); - _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, sycl::plus<_SizeT>()); + _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); if (satisfies_pred) - wg_copy_if_values[count + wg_count.load()] = val; + wg_copy_if_values[count] = val; - if (wg_local_id == (wgsize - 1)) - wg_count += (count + satisfies_pred); - sycl::group_barrier(group); + wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); } } else { // Edge of input, have to handle memory bounds @@ -859,20 +851,16 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ satisfies_pred = pred(val); } - _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, sycl::plus<_SizeT>()); + _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); if (satisfies_pred) - wg_copy_if_values[count + wg_count.load()] = val; + wg_copy_if_values[count] = val; - if (wg_local_id == (wgsize - 1)) - wg_count += (count + satisfies_pred); - sycl::group_barrier(group); + wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); } } // Phase 2: Global scan across wg_count - _SizeT local_sum = wg_count.load(); - _SizeT* in_begin = tile_vals.get_pointer(); _SizeT prev_sum = 0; // The first sub-group will query the previous tiles to find a prefix @@ -881,24 +869,24 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ _LookbackScanMemory scan_mem(scan_memory_begin, num_wgs); if (group.leader()) - scan_mem.set_partial(tile_id, local_sum); + scan_mem.set_partial(tile_id, wg_count); // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum prev_sum = cooperative_lookback()(tile_id, sg, sycl::plus<_SizeT>(), scan_mem); if (group.leader()) - scan_mem.set_full(tile_id, prev_sum + local_sum); + scan_mem.set_full(tile_id, prev_sum + wg_count); } _SizeT start_idx = sycl::group_broadcast(group, prev_sum, 0); // Phase 3: copy values to global memory - for (int i = wg_local_id; i < local_sum; i += wgsize) { + for (int i = wg_local_id; i < wg_count; i += wgsize) { // Probably adjust method to try and get some perf on PVC for arithmetic types using sg.store __out_rng[start_idx + i] = wg_copy_if_values[i]; } if (tile_id == (num_wgs - 1) && group.leader()) - __num_rng[0] = start_idx + local_sum; + __num_rng[0] = start_idx + wg_count; }); }); event.wait(); From 45a1fb77d57e8754ce196b2476b9fc1fe2bcf213 Mon Sep 17 00:00:00 2001 From: Aidan Date: Tue, 5 Dec 2023 19:42:54 +0000 Subject: [PATCH 076/134] Sort out test logic --- test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp index 202f28fbaad..75769131522 100644 --- a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp 
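// NOTE (illustrative sketch only, not part of this patch): host-side usage of the copy_if
// kernel template as this test drives it. `KernelParams` stands for the kernel_param
// instantiation chosen in the test above (its template-argument order is not restated here);
// the predicate is one of the test predicates; everything else is plain SYCL USM.
//
//   sycl::queue q;
//   std::vector<int> in = /* host input of size n */;
//   int*    in_ptr  = sycl::malloc_device<int>(n, q);
//   int*    out_ptr = sycl::malloc_device<int>(n, q);
//   size_t* out_num = sycl::malloc_device<size_t>(1, q);
//   q.copy(in.data(), in_ptr, n).wait();
//
//   oneapi::dpl::experimental::kt::single_pass_copy_if<KernelParams>(
//       q, in_ptr, in_ptr + n, out_ptr, out_num, [](int x) { return x != 42; });
//
//   size_t num_selected = 0;
//   q.copy(out_num, &num_selected, 1).wait();
//   // out_ptr[0 .. num_selected) now holds the selected values in input order,
//   // matching std::copy_if on the same input and predicate.
//   sycl::free(in_ptr, q); sycl::free(out_ptr, q); sycl::free(out_num, q);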
@@ -84,11 +84,11 @@ bool test(Predicate pred, Generator gen) sycl::free(out_num, q); } - return !all_passed; + return all_passed; } int main() { - bool all_passed; + bool all_passed = true; all_passed &= test([](const float64_t& x) { return x * x <= 1024; }, [](size_t j) { return ((j + 1) % 7 & 2) != 0 ? float64_t(j % 32) : float64_t(j % 33 + 34); }); all_passed &= test([](const int&) { return true; }, [](size_t j) { return j; }); all_passed &= test([](const std::int32_t& x) { return x != 42; }, [](size_t j) { return ((j + 1) % 5 & 2) != 0 ? std::int32_t(j + 1) : 42; }); From cec32d79daa1ac8926f0a58dfd6c722c0908d232 Mon Sep 17 00:00:00 2001 From: Aidan Date: Tue, 5 Dec 2023 19:50:48 +0000 Subject: [PATCH 077/134] Remove unnecessary load and store functions --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 25 ++----------------- 1 file changed, 2 insertions(+), 23 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 60c2db24b78..68d11740df0 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -719,26 +719,6 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera } } -// Load function to try and get some PVC perf w/ coalesced -template -inline Tp load(sycl::sub_group sg, _InRange src, size_t i, size_t wg_stride, size_t tile_id) { - // if constexpr (std::is_arithmetic_v) { - // return sg.load(src.begin() + i / SUBGROUP_SIZE + wg_stride * tile_id); - // } - return src[i + wg_stride * tile_id]; -} - -// Load with checking for the subgroup case -template -inline Tp load(sycl::sub_group sg, _InRange src, size_t i, size_t wg_stride, size_t tile_id, size_t input_size) { - // if constexpr (std::is_arithmetic_v) { - // if (i / SUBGROUP_SIZE + SUBGROUP_SIZE + wg_stride * tile_id <= input_size) - // return sg.load(src.begin() + i / SUBGROUP_SIZE + wg_stride * tile_id); - // return src[i + wg_stride * tile_id]; - // } - return src[i + wg_stride * tile_id]; -} - template void single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, _UnaryPredicate pred) @@ -829,7 +809,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ if ((tile_id + 1) * elems_in_tile <= n) { #pragma unroll for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { - _Type val = load<_Type>(sg, __in_rng, i, elems_in_tile, tile_id); + _Type val = __in_rng[i + elems_in_tile * tile_id]; _SizeT satisfies_pred = pred(val); _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); @@ -847,7 +827,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ _SizeT satisfies_pred = 0; _Type val; // TODO: alloca if (i + elems_in_tile * tile_id < n) { - val = load<_Type>(sg, __in_rng, i, elems_in_tile, tile_id, n); + val = __in_rng[i + elems_in_tile * tile_id]; satisfies_pred = pred(val); } @@ -882,7 +862,6 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ // Phase 3: copy values to global memory for (int i = wg_local_id; i < wg_count; i += wgsize) { - // Probably adjust method to try and get some perf on PVC for arithmetic types using sg.store __out_rng[start_idx + i] = wg_copy_if_values[i]; } if (tile_id == (num_wgs - 1) && group.leader()) From b7d659c81db111deeac008a4647a104cdcc3dfa9 Mon Sep 17 00:00:00 
2001 From: Aidan Date: Wed, 6 Dec 2023 11:03:59 +0000 Subject: [PATCH 078/134] Release scratch mem --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 68d11740df0..db642fc7177 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -868,6 +868,9 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ __num_rng[0] = start_idx + wg_count; }); }); + + scratch.async_free(event); + event.wait(); } From fdb1824018b8f4c0c9bfe1e63dcd31f36c61e641 Mon Sep 17 00:00:00 2001 From: Aidan Date: Fri, 8 Dec 2023 11:15:29 +0000 Subject: [PATCH 079/134] Add single wg copy if --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 108 +++++++++++++++++- 1 file changed, 106 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index db642fc7177..36e395b7285 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -79,6 +79,11 @@ struct ScanMemoryManager }); } + void free() + { + sycl::free(scratch, q); + } + private: ::std::uint8_t* scratch = nullptr; ::std::uint8_t* scan_memory_begin = nullptr; @@ -719,6 +724,86 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera } } +template +void +single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, _UnaryPredicate pred) +{ + using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + using _SizeT = uint64_t; + using _TileIdT = TileId::_TileIdT; + + const ::std::size_t n = __in_rng.size(); + + constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; + constexpr ::std::size_t elems_per_workitem = _KernelParam::elems_per_workitem; + + // Avoid non_uniform n by padding up to a multiple of wgsize + constexpr std::uint32_t elems_in_tile = wgsize * elems_per_workitem; + ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); + ::std::size_t num_workitems = num_wgs * wgsize; + + auto event = __queue.submit([&](sycl::handler& hdl) { + auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); + + oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng, __num_rng); + hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { + auto group = item.get_group(); + auto wg_local_id = item.get_local_id(0); + constexpr ::std::uint32_t stride = wgsize; + + // Global load into local + auto wg_current_offset = 0; + auto wg_local_memory_size = elems_in_tile; + + _SizeT wg_count = 0; + + // Phase 1: Create wg_count and construct in-order wg_copy_if_values + if (elems_in_tile <= n) { + #pragma unroll + for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { + _Type val = __in_rng[i]; + + _SizeT satisfies_pred = pred(val); + _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); + + if (satisfies_pred) + wg_copy_if_values[count] = val; + + wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); + } + } else { + // 
Edge of input, have to handle memory bounds + // Might have unneccessary group_barrier calls + #pragma unroll + for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { + _SizeT satisfies_pred = 0; + _Type val; // TODO: alloca + if (i < n) { + val = __in_rng[i]; + + satisfies_pred = pred(val); + } + _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); + + if (satisfies_pred) + wg_copy_if_values[count] = val; + + wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); + } + } + + // Phase 3: copy values to global memory + for (int i = wg_local_id; i < wg_count; i += wgsize) { + __out_rng[i] = wg_copy_if_values[i]; + } + if (group.leader()) + __num_rng[0] = wg_count; + }); + }); + + event.wait(); +} + template void single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, _UnaryPredicate pred) @@ -869,9 +954,28 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ }); }); - scratch.async_free(event); - event.wait(); + scratch.free(); +} + +template +void +single_pass_single_wg_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _NumSelectedRange __num_begin, _UnaryPredicate pred) +{ + auto __n = __in_end - __in_begin; + + auto __keep1 = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); + auto __buf1 = __keep1(__in_begin, __in_end); + auto __keep2 = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); + auto __buf2 = __keep2(__out_begin, __out_begin + __n); + + auto __keep_num = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _NumSelectedRange>(); + auto __buf_num = __keep2(__num_begin, __num_begin + 1); + + single_pass_copy_if_impl_single_wg<_KernelParam>(__queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), pred); } template From 1df5fbb37573f10bc61fc804b6d026126e01d8a6 Mon Sep 17 00:00:00 2001 From: Aidan Date: Fri, 8 Dec 2023 11:38:45 +0000 Subject: [PATCH 080/134] Fix unrolls and use memset --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 36 ++++++------------- 1 file changed, 10 insertions(+), 26 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 36e395b7285..fcfb3ad1b84 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -753,13 +753,12 @@ single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _Ou // Global load into local auto wg_current_offset = 0; - auto wg_local_memory_size = elems_in_tile; _SizeT wg_count = 0; // Phase 1: Create wg_count and construct in-order wg_copy_if_values if (elems_in_tile <= n) { - #pragma unroll +#pragma unroll for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { _Type val = __in_rng[i]; @@ -774,7 +773,7 @@ single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _Ou } else { // Edge of input, have to handle memory bounds // Might have unneccessary group_barrier calls - #pragma unroll +#pragma unroll for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { _SizeT satisfies_pred = 0; _Type val; // TODO: alloca @@ -837,21 +836,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ // fill_num_wgs 
num_elements + 1 to also initialize tile_id_counter ::std::size_t fill_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(num_elements + 1, wgsize); - auto fill_event = __queue.submit( - [&](sycl::handler& hdl) - { - hdl.parallel_for(sycl::nd_range<1>{fill_num_wgs * wgsize, wgsize}, - [=](const sycl::nd_item<1>& item) - { - int id = item.get_global_linear_id(); - if (id < num_elements) - status_flags_begin[id] = id < _LookbackScanMemory::padding - ? _LookbackScanMemory::OUT_OF_BOUNDS - : _LookbackScanMemory::NOT_READY; - if (id == num_elements) - tile_id_begin[0] = 0; - }); - }); + auto fill_event = __queue.memset(status_flags_begin, 0, num_elements * sizeof(_FlagT) + 1 * sizeof(_TileIdT)); auto event = __queue.submit([&](sycl::handler& hdl) { auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); @@ -886,15 +871,14 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ // Global load into local auto wg_current_offset = (tile_id * elems_in_tile); - auto wg_local_memory_size = elems_in_tile; _SizeT wg_count = 0; // Phase 1: Create wg_count and construct in-order wg_copy_if_values if ((tile_id + 1) * elems_in_tile <= n) { - #pragma unroll - for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { - _Type val = __in_rng[i + elems_in_tile * tile_id]; +#pragma unroll + for (size_t i = 0; i < elems_in_tile; i += wgsize) { + _Type val = __in_rng[i + wg_local_id + elems_in_tile * tile_id]; _SizeT satisfies_pred = pred(val); _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); @@ -907,12 +891,12 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ } else { // Edge of input, have to handle memory bounds // Might have unneccessary group_barrier calls - #pragma unroll - for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { +#pragma unroll + for (size_t i = 0; i < elems_in_tile; i += wgsize) { _SizeT satisfies_pred = 0; _Type val; // TODO: alloca - if (i + elems_in_tile * tile_id < n) { - val = __in_rng[i + elems_in_tile * tile_id]; + if (i + wg_local_id + elems_in_tile * tile_id < n) { + val = __in_rng[i + wg_local_id + elems_in_tile * tile_id]; satisfies_pred = pred(val); } From d8b77febdea55241dbde182449e88df413338356 Mon Sep 17 00:00:00 2001 From: Aidan Date: Fri, 8 Dec 2023 11:48:50 +0000 Subject: [PATCH 081/134] apply changes to single wg --- .../pstl/hetero/dpcpp/parallel_backend_sycl_scan.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index fcfb3ad1b84..60007e4566c 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -759,8 +759,8 @@ single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _Ou // Phase 1: Create wg_count and construct in-order wg_copy_if_values if (elems_in_tile <= n) { #pragma unroll - for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { - _Type val = __in_rng[i]; + for (size_t i = 0; i < elems_in_tile; i += wgsize) { + _Type val = __in_rng[i + wg_local_id]; _SizeT satisfies_pred = pred(val); _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); @@ -774,11 +774,11 @@ single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _Ou // Edge of input, have to 
handle memory bounds // Might have unneccessary group_barrier calls #pragma unroll - for (size_t i = wg_local_id; i < elems_in_tile; i += wgsize) { + for (size_t i = 0; i < elems_in_tile; i += wgsize) { _SizeT satisfies_pred = 0; - _Type val; // TODO: alloca - if (i < n) { - val = __in_rng[i]; + _Type val = *std::launder(reinterpret_cast<_Type*>(alloca(sizeof(_Type)))); + if (i + wg_local_id < n) { + val = __in_rng[i + wg_local_id]; satisfies_pred = pred(val); } @@ -894,7 +894,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ #pragma unroll for (size_t i = 0; i < elems_in_tile; i += wgsize) { _SizeT satisfies_pred = 0; - _Type val; // TODO: alloca + _Type val = *std::launder(reinterpret_cast<_Type*>(alloca(sizeof(_Type)))); if (i + wg_local_id + elems_in_tile * tile_id < n) { val = __in_rng[i + wg_local_id + elems_in_tile * tile_id]; From 5b53de669e20fe2a81be81ec2cc5fddd9bdb6543 Mon Sep 17 00:00:00 2001 From: Aidan Date: Fri, 8 Dec 2023 15:31:31 +0000 Subject: [PATCH 082/134] Remove unused variables --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index 60007e4566c..fcb539cab2b 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -741,6 +741,7 @@ single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _Ou constexpr std::uint32_t elems_in_tile = wgsize * elems_per_workitem; ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); ::std::size_t num_workitems = num_wgs * wgsize; + assert(num_wgs == 1); auto event = __queue.submit([&](sycl::handler& hdl) { auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); @@ -752,8 +753,6 @@ single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _Ou constexpr ::std::uint32_t stride = wgsize; // Global load into local - auto wg_current_offset = 0; - _SizeT wg_count = 0; // Phase 1: Create wg_count and construct in-order wg_copy_if_values @@ -869,9 +868,6 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ tile_id = group.get_group_linear_id(); } - // Global load into local - auto wg_current_offset = (tile_id * elems_in_tile); - _SizeT wg_count = 0; // Phase 1: Create wg_count and construct in-order wg_copy_if_values From acc4f9b65a842fe113c58f86e85e31b386a384aa Mon Sep 17 00:00:00 2001 From: "aidan.belton" Date: Fri, 8 Dec 2023 15:35:41 +0000 Subject: [PATCH 083/134] Clang-format copy_if_kt commits --- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 333 ++++++++++-------- .../numeric/numeric.ops/copy_if_kt.pass.cpp | 33 +- 2 files changed, 200 insertions(+), 166 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index fcb539cab2b..0838817fd4f 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -79,7 +79,8 @@ struct ScanMemoryManager }); } - void free() + void + free() { sycl::free(scratch, q); } @@ -724,9 +725,11 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera } } -template +template void 
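// Reference note (explanatory comment only, not functional code): the multi-work-group
// copy_if below reuses the decoupled-lookback scheme of the scan kernel for its phase 2,
// the global scan over per-tile counts. Per tile, in outline:
//
//   // publish this tile's count as PARTIAL so later tiles can begin their lookback
//   if (group.leader()) scan_mem.set_partial(tile_id, wg_count);
//   // the first sub-group walks tiles tile_id-1, tile_id-2, ... accumulating PARTIAL
//   // values and stopping at the first FULL value it finds
//   prev_sum = cooperative_lookback()(tile_id, sg, sycl::plus<_SizeT>(), scan_mem);
//   // publish the inclusive prefix as FULL so later tiles can stop at this tile
//   if (group.leader()) scan_mem.set_full(tile_id, prev_sum + wg_count);
//   // prev_sum (the exclusive prefix) is this tile's starting write offset in __out_rng
//   start_idx = sycl::group_broadcast(group, prev_sum, 0);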
-single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, _UnaryPredicate pred) +single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, + _NumSelectedRange __num_rng, _UnaryPredicate pred) { using _Type = oneapi::dpl::__internal::__value_t<_InRange>; using _SizeT = uint64_t; @@ -747,64 +750,76 @@ single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _Ou auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng, __num_rng); - hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { - auto group = item.get_group(); - auto wg_local_id = item.get_local_id(0); - constexpr ::std::uint32_t stride = wgsize; - - // Global load into local - _SizeT wg_count = 0; - - // Phase 1: Create wg_count and construct in-order wg_copy_if_values - if (elems_in_tile <= n) { + hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), + [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { + auto group = item.get_group(); + auto wg_local_id = item.get_local_id(0); + constexpr ::std::uint32_t stride = wgsize; + + // Global load into local + _SizeT wg_count = 0; + + // Phase 1: Create wg_count and construct in-order wg_copy_if_values + if (elems_in_tile <= n) + { #pragma unroll - for (size_t i = 0; i < elems_in_tile; i += wgsize) { - _Type val = __in_rng[i + wg_local_id]; - - _SizeT satisfies_pred = pred(val); - _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); - - if (satisfies_pred) - wg_copy_if_values[count] = val; - - wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); - } - } else { - // Edge of input, have to handle memory bounds - // Might have unneccessary group_barrier calls + for (size_t i = 0; i < elems_in_tile; i += wgsize) + { + _Type val = __in_rng[i + wg_local_id]; + + _SizeT satisfies_pred = pred(val); + _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, + sycl::plus<_SizeT>()); + + if (satisfies_pred) + wg_copy_if_values[count] = val; + + wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); + } + } + else + { + // Edge of input, have to handle memory bounds + // Might have unneccessary group_barrier calls #pragma unroll - for (size_t i = 0; i < elems_in_tile; i += wgsize) { - _SizeT satisfies_pred = 0; - _Type val = *std::launder(reinterpret_cast<_Type*>(alloca(sizeof(_Type)))); - if (i + wg_local_id < n) { - val = __in_rng[i + wg_local_id]; - - satisfies_pred = pred(val); - } - _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); - - if (satisfies_pred) - wg_copy_if_values[count] = val; - - wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); - } - } - - // Phase 3: copy values to global memory - for (int i = wg_local_id; i < wg_count; i += wgsize) { - __out_rng[i] = wg_copy_if_values[i]; - } - if (group.leader()) - __num_rng[0] = wg_count; - }); + for (size_t i = 0; i < elems_in_tile; i += wgsize) + { + _SizeT satisfies_pred = 0; + _Type val = *std::launder(reinterpret_cast<_Type*>(alloca(sizeof(_Type)))); + if (i + wg_local_id < n) + { + val = __in_rng[i + wg_local_id]; + + satisfies_pred = pred(val); + } + _SizeT count = 
sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, + sycl::plus<_SizeT>()); + + if (satisfies_pred) + wg_copy_if_values[count] = val; + + wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); + } + } + + // Phase 3: copy values to global memory + for (int i = wg_local_id; i < wg_count; i += wgsize) + { + __out_rng[i] = wg_copy_if_values[i]; + } + if (group.leader()) + __num_rng[0] = wg_count; + }); }); event.wait(); } -template +template void -single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, _UnaryPredicate pred) +single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, + _UnaryPredicate pred) { using _Type = oneapi::dpl::__internal::__value_t<_InRange>; using _SizeT = uint64_t; @@ -844,138 +859,150 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ hdl.depends_on(fill_event); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng, __num_rng); - hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { - auto group = item.get_group(); - auto wg_local_id = item.get_local_id(0); - auto sg = item.get_sub_group(); - constexpr ::std::uint32_t stride = wgsize; - - // Init tile_id - std::uint32_t tile_id; - if constexpr (std::is_same_v<_UseDynamicTileID, ::std::true_type>) - { - // Obtain unique ID for this work-group that will be used in decoupled lookback - TileId dynamic_tile_id(tile_id_begin); - if (group.leader()) - { - tile_id_lacc[0] = dynamic_tile_id.fetch_inc(); - } - sycl::group_barrier(group); - tile_id = tile_id_lacc[0]; - } - else - { - tile_id = group.get_group_linear_id(); - } - - _SizeT wg_count = 0; - - // Phase 1: Create wg_count and construct in-order wg_copy_if_values - if ((tile_id + 1) * elems_in_tile <= n) { + hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), + [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { + auto group = item.get_group(); + auto wg_local_id = item.get_local_id(0); + auto sg = item.get_sub_group(); + constexpr ::std::uint32_t stride = wgsize; + + // Init tile_id + std::uint32_t tile_id; + if constexpr (std::is_same_v<_UseDynamicTileID, ::std::true_type>) + { + // Obtain unique ID for this work-group that will be used in decoupled lookback + TileId dynamic_tile_id(tile_id_begin); + if (group.leader()) + { + tile_id_lacc[0] = dynamic_tile_id.fetch_inc(); + } + sycl::group_barrier(group); + tile_id = tile_id_lacc[0]; + } + else + { + tile_id = group.get_group_linear_id(); + } + + _SizeT wg_count = 0; + + // Phase 1: Create wg_count and construct in-order wg_copy_if_values + if ((tile_id + 1) * elems_in_tile <= n) + { #pragma unroll - for (size_t i = 0; i < elems_in_tile; i += wgsize) { - _Type val = __in_rng[i + wg_local_id + elems_in_tile * tile_id]; - - _SizeT satisfies_pred = pred(val); - _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); - - if (satisfies_pred) - wg_copy_if_values[count] = val; - - wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); - } - } else { - // Edge of input, have to handle memory bounds - // Might have unneccessary group_barrier calls + for (size_t i = 0; i < elems_in_tile; i += wgsize) + { + _Type val = __in_rng[i + wg_local_id + elems_in_tile * tile_id]; + + _SizeT satisfies_pred = pred(val); + _SizeT count = 
sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, + sycl::plus<_SizeT>()); + + if (satisfies_pred) + wg_copy_if_values[count] = val; + + wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); + } + } + else + { + // Edge of input, have to handle memory bounds + // Might have unneccessary group_barrier calls #pragma unroll - for (size_t i = 0; i < elems_in_tile; i += wgsize) { - _SizeT satisfies_pred = 0; - _Type val = *std::launder(reinterpret_cast<_Type*>(alloca(sizeof(_Type)))); - if (i + wg_local_id + elems_in_tile * tile_id < n) { - val = __in_rng[i + wg_local_id + elems_in_tile * tile_id]; - - satisfies_pred = pred(val); - } - _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); - - if (satisfies_pred) - wg_copy_if_values[count] = val; - - wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); - } - } - - // Phase 2: Global scan across wg_count - _SizeT prev_sum = 0; - - // The first sub-group will query the previous tiles to find a prefix - if (sg.get_group_id() == 0) - { - _LookbackScanMemory scan_mem(scan_memory_begin, num_wgs); - - if (group.leader()) - scan_mem.set_partial(tile_id, wg_count); - - // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum - prev_sum = cooperative_lookback()(tile_id, sg, sycl::plus<_SizeT>(), scan_mem); - - if (group.leader()) - scan_mem.set_full(tile_id, prev_sum + wg_count); - } - - _SizeT start_idx = sycl::group_broadcast(group, prev_sum, 0); - - // Phase 3: copy values to global memory - for (int i = wg_local_id; i < wg_count; i += wgsize) { - __out_rng[start_idx + i] = wg_copy_if_values[i]; - } - if (tile_id == (num_wgs - 1) && group.leader()) - __num_rng[0] = start_idx + wg_count; - }); + for (size_t i = 0; i < elems_in_tile; i += wgsize) + { + _SizeT satisfies_pred = 0; + _Type val = *std::launder(reinterpret_cast<_Type*>(alloca(sizeof(_Type)))); + if (i + wg_local_id + elems_in_tile * tile_id < n) + { + val = __in_rng[i + wg_local_id + elems_in_tile * tile_id]; + + satisfies_pred = pred(val); + } + _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, + sycl::plus<_SizeT>()); + + if (satisfies_pred) + wg_copy_if_values[count] = val; + + wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); + } + } + + // Phase 2: Global scan across wg_count + _SizeT prev_sum = 0; + + // The first sub-group will query the previous tiles to find a prefix + if (sg.get_group_id() == 0) + { + _LookbackScanMemory scan_mem(scan_memory_begin, num_wgs); + + if (group.leader()) + scan_mem.set_partial(tile_id, wg_count); + + // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum + prev_sum = cooperative_lookback()(tile_id, sg, sycl::plus<_SizeT>(), scan_mem); + + if (group.leader()) + scan_mem.set_full(tile_id, prev_sum + wg_count); + } + + _SizeT start_idx = sycl::group_broadcast(group, prev_sum, 0); + + // Phase 3: copy values to global memory + for (int i = wg_local_id; i < wg_count; i += wgsize) + { + __out_rng[start_idx + i] = wg_copy_if_values[i]; + } + if (tile_id == (num_wgs - 1) && group.leader()) + __num_rng[0] = start_idx + wg_count; + }); }); event.wait(); scratch.free(); } -template +template void -single_pass_single_wg_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _NumSelectedRange __num_begin, 
_UnaryPredicate pred) +single_pass_single_wg_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, + _OutIterator __out_begin, _NumSelectedRange __num_begin, _UnaryPredicate pred) { auto __n = __in_end - __in_begin; - auto __keep1 = - oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); + auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); auto __buf1 = __keep1(__in_begin, __in_end); - auto __keep2 = - oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); + auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); auto __buf2 = __keep2(__out_begin, __out_begin + __n); auto __keep_num = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _NumSelectedRange>(); auto __buf_num = __keep2(__num_begin, __num_begin + 1); - single_pass_copy_if_impl_single_wg<_KernelParam>(__queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), pred); + single_pass_copy_if_impl_single_wg<_KernelParam>(__queue, __buf1.all_view(), __buf2.all_view(), + __buf_num.all_view(), pred); } -template +template void -single_pass_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _NumSelectedRange __num_begin, _UnaryPredicate pred) +single_pass_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, + _NumSelectedRange __num_begin, _UnaryPredicate pred) { auto __n = __in_end - __in_begin; - auto __keep1 = - oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); + auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); auto __buf1 = __keep1(__in_begin, __in_end); - auto __keep2 = - oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); + auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); auto __buf2 = __keep2(__out_begin, __out_begin + __n); auto __keep_num = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _NumSelectedRange>(); auto __buf_num = __keep2(__num_begin, __num_begin + 1); - single_pass_copy_if_impl<_KernelParam, /* UseAtomic64 */ std::true_type, /* UseDynamicTileID */ std::true_type>(__queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), pred); + single_pass_copy_if_impl<_KernelParam, /* UseAtomic64 */ std::true_type, /* UseDynamicTileID */ std::true_type>( + __queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), pred); } } // inline namespace igpu diff --git a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp index 75769131522..a77b76491e7 100644 --- a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp @@ -24,8 +24,9 @@ using namespace TestUtils; template class CopyIfKernel; -template -bool test(Predicate pred, Generator gen) +template +bool +test(Predicate pred, Generator gen) { bool all_passed = true; sycl::queue q; @@ -34,9 +35,7 @@ bool test(Predicate pred, Generator gen) { int n = 1 << logn; - Sequence in(n, [&](size_t k) -> T { - return gen(n ^ k); - }); + Sequence in(n, [&](size_t k) -> T { return gen(n ^ k); }); Sequence std_out(n); @@ -47,8 +46,9 @@ bool 
test(Predicate pred, Generator gen) constexpr int n_elements_per_workitem = 8; q.copy(in.data(), in_ptr, n).wait(); - using KernelParams = oneapi::dpl::experimental::kt::kernel_param>; - oneapi::dpl::experimental::kt::single_pass_copy_if(q, in_ptr, in_ptr+n, out_ptr, out_num, pred); + using KernelParams = + oneapi::dpl::experimental::kt::kernel_param>; + oneapi::dpl::experimental::kt::single_pass_copy_if(q, in_ptr, in_ptr + n, out_ptr, out_num, pred); Sequence kt_out(n); size_t num_selected = 0; @@ -59,12 +59,14 @@ bool test(Predicate pred, Generator gen) auto std_out_end = std::copy_if(in.begin(), in.end(), std_out.begin(), pred); bool passed = true; - if (num_selected != (std_out_end - std_out.begin())) { + if (num_selected != (std_out_end - std_out.begin())) + { passed = false; - std::cout << "Num selected wrong: expected " << (std_out_end - std_out.begin()) << " " << num_selected << "\n"; + std::cout << "Num selected wrong: expected " << (std_out_end - std_out.begin()) << " " << num_selected + << "\n"; } - for (size_t i = 0; i < (std_out_end - std_out.begin()); ++i) + for (size_t i = 0; i < (std_out_end - std_out.begin()); ++i) { if (kt_out[i] != std_out[i]) { @@ -87,11 +89,16 @@ bool test(Predicate pred, Generator gen) return all_passed; } -int main() { +int +main() +{ bool all_passed = true; - all_passed &= test([](const float64_t& x) { return x * x <= 1024; }, [](size_t j) { return ((j + 1) % 7 & 2) != 0 ? float64_t(j % 32) : float64_t(j % 33 + 34); }); + all_passed &= + test([](const float64_t& x) { return x * x <= 1024; }, + [](size_t j) { return ((j + 1) % 7 & 2) != 0 ? float64_t(j % 32) : float64_t(j % 33 + 34); }); all_passed &= test([](const int&) { return true; }, [](size_t j) { return j; }); - all_passed &= test([](const std::int32_t& x) { return x != 42; }, [](size_t j) { return ((j + 1) % 5 & 2) != 0 ? std::int32_t(j + 1) : 42; }); + all_passed &= test([](const std::int32_t& x) { return x != 42; }, + [](size_t j) { return ((j + 1) % 5 & 2) != 0 ? 
std::int32_t(j + 1) : 42; }); return all_passed; } From 9d39fc689e79c3e77cd68a779acafe982acb1fb2 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Fri, 24 May 2024 10:29:06 -0400 Subject: [PATCH 084/134] refactor to share lookback and memory mgr Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 490 ++++++++++++++++-- .../hetero/dpcpp/parallel_backend_sycl_scan.h | 280 ---------- .../numeric/numeric.ops/copy_if_kt.pass.cpp | 26 +- 3 files changed, 456 insertions(+), 340 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index cafffd6493d..e420dad591a 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -44,10 +44,120 @@ class __lookback_kernel; static constexpr int SUBGROUP_SIZE = 32; + +template +struct ScanMemoryManager +{ + using _FlagStorageType = typename _ScanStatusFlag::_FlagStorageType; + using _ValueType = typename _ScanStatusFlag::_ValueType; + + ScanMemoryManager(sycl::queue __q, std::size_t num_wgs) : __queue{__q}, __num_wgs(num_wgs) {}; + + ::std::uint8_t* + scan_memory_ptr() noexcept + { + return scan_memory_begin; + }; + + void + allocate() + { + ::std::size_t scan_memory_size = get_memory_size(); + + scan_memory_begin = sycl::malloc_device<::std::uint8_t>(scan_memory_size, __queue); + if (!scan_memory_begin) + throw std::bad_alloc(); + } + + sycl::event + async_free(sycl::event dependency) + { + return __queue.submit( + [e = dependency, ptr = scan_memory_begin, __q = __queue](sycl::handler& hdl) + { + hdl.depends_on(e); + hdl.host_task([=]() { sycl::free(ptr, __q); }); + }); + } + + void + free() + { + sycl::free(scan_memory_begin, __queue); + } + + _FlagStorageType* + get_flags_begin() + { + // Aligned flags + ::std::size_t tile_values_bytes = get_tile_values_bytes(); + void* base_flags = reinterpret_cast(scan_memory_begin + tile_values_bytes); + auto remainder = get_padded_flag_bytes(); // scan_memory_bytes - tile_values_bytes + return reinterpret_cast<_FlagStorageType*>( + ::std::align(::std::alignment_of_v<_FlagStorageType>, get_flag_bytes(), base_flags, remainder)); + } + + _ValueType* + get_partial_values_begin() + { + return reinterpret_cast<_ValueType*>(scan_memory_begin); + } + + _ValueType* + get_full_values_begin() + { + return reinterpret_cast<_ValueType*>(scan_memory_begin + get_num_elements() * sizeof(_ValueType)); + } + + std::size_t + get_num_elements() + { + return _ScanStatusFlag::__padding + __num_wgs; + } + + private: + + std::size_t + get_tile_values_bytes() + { + return (2 * get_num_elements()) * sizeof(_ValueType); + } + + std::size_t + get_flag_bytes() + { + return get_num_elements() * sizeof(_FlagStorageType); + } + + std::size_t + get_padded_flag_bytes() + { + // sizeof(_FlagStorageType) extra bytes for possible intenal alignment + return get_flag_bytes() + sizeof(_FlagStorageType); + } + + std::size_t + get_memory_size() + { + // sizeof(_T) extra bytes are not needed because data is going at the beginning of the scratch + ::std::size_t tile_values_bytes = get_tile_values_bytes(); + // Padding to provide room for aligment + ::std::size_t flag_bytes = get_padded_flag_bytes(); + + return tile_values_bytes + flag_bytes; + } + + std::uint8_t* scan_memory_begin = nullptr; + std::size_t __num_wgs; + + sycl::queue __queue; +}; + template struct __scan_status_flag { using _FlagStorageType = uint32_t; + using _ValueType = _T; using _AtomicFlagT = 
sycl::atomic_ref<_FlagStorageType, sycl::memory_order::acq_rel, sycl::memory_scope::device, sycl::access::address_space::global_space>; using _AtomicValueT = sycl::atomic_ref<_T, sycl::memory_order::acq_rel, sycl::memory_scope::device, @@ -159,6 +269,34 @@ struct __lookback_init_submitter<_FlagType, _Type, _BinaryOp, } }; +template +void +__lookback_phase(const _Group& __group, const _SubGroup& __subgroup, _StatusFlags& __status_flags, + _StatusValues& __status_vals_full, _StatusValues& __status_vals_partial, std::uint32_t __tile_id, + _Type& __local_reduction, _Type& __prev_tile_reduction, _BinaryOp __binary_op) +{ + // The first sub-group will query the previous tiles to find a prefix + if (__subgroup.get_group_id() == 0) + { + _FlagType __flag(__status_flags, __status_vals_full, __status_vals_partial, __tile_id); + + if (__subgroup.get_local_id() == 0) + { + __flag.set_partial(__local_reduction); + } + + __prev_tile_reduction = __flag.cooperative_lookback(__subgroup, __binary_op); + + if (__subgroup.get_local_id() == 0) + { + __flag.set_full(__binary_op(__prev_tile_reduction, __local_reduction)); + } + } + __prev_tile_reduction = sycl::group_broadcast(__group, __prev_tile_reduction, 0); +} + + template struct __lookback_submitter; @@ -243,25 +381,8 @@ struct __lookback_kernel_func sycl::joint_reduce(__group, __tile_vals_ptr, __tile_vals_ptr + __wg_local_memory_size, __binary_op); _Type __prev_tile_reduction{}; - // The first sub-group will query the previous tiles to find a prefix - if (__subgroup.get_group_id() == 0) - { - _FlagType __flag(__status_flags, __status_vals_full, __status_vals_partial, __tile_id); - - if (__subgroup.get_local_id() == 0) - { - __flag.set_partial(__local_reduction); - } - - __prev_tile_reduction = __flag.cooperative_lookback(__subgroup, __binary_op); - - if (__subgroup.get_local_id() == 0) - { - __flag.set_full(__binary_op(__prev_tile_reduction, __local_reduction)); - } - } - - __prev_tile_reduction = sycl::group_broadcast(__group, __prev_tile_reduction, 0); + __lookback_phase<_FlagType>(__group, __subgroup, __status_flags, __status_vals_full, __status_vals_partial, + __tile_id, __local_reduction, __prev_tile_reduction, __binary_op); sycl::joint_inclusive_scan(__group, __tile_vals_ptr, __tile_vals_ptr + __wg_local_memory_size, __out_begin, __binary_op, __prev_tile_reduction); @@ -348,31 +469,19 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r std::size_t __elems_in_tile = __workgroup_size * __data_per_workitem; std::size_t __num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __elems_in_tile); - constexpr int __status_flag_padding = SUBGROUP_SIZE; - std::size_t __status_flags_size = __num_wgs + 1 + __status_flag_padding; - - std::size_t __mem_align_pad = sizeof(_Type); - std::size_t __status_flags_bytes = __status_flags_size * sizeof(_FlagStorageType); - std::size_t __status_vals_full_offset_bytes = __status_flags_size * sizeof(_Type); - std::size_t __status_vals_partial_offset_bytes = __status_flags_size * sizeof(_Type); - std::size_t __mem_bytes = - __status_flags_bytes + __status_vals_full_offset_bytes + __status_vals_partial_offset_bytes + __mem_align_pad; - - std::byte* __device_mem = reinterpret_cast(sycl::malloc_device(__mem_bytes, __queue)); - if (!__device_mem) - throw std::bad_alloc(); - - _FlagStorageType* __status_flags = reinterpret_cast<_FlagStorageType*>(__device_mem); - std::size_t __remainder = __mem_bytes - __status_flags_bytes; - void* __vals_base_ptr = reinterpret_cast(__device_mem + 
__status_flags_bytes); - void* __vals_aligned_ptr = - std::align(std::alignment_of_v<_Type>, __status_vals_full_offset_bytes, __vals_base_ptr, __remainder); - _Type* __status_vals_full = reinterpret_cast<_Type*>(__vals_aligned_ptr); - _Type* __status_vals_partial = - reinterpret_cast<_Type*>(__status_vals_full + __status_vals_full_offset_bytes / sizeof(_Type)); + + ScanMemoryManager<_FlagType> __device_mem_mgr(__queue, __num_wgs); + + __device_mem_mgr.allocate(); + + _Type* __status_vals_full = __device_mem_mgr.get_full_values_begin(); + _Type* __status_vals_partial = __device_mem_mgr.get_partial_values_begin(); + _FlagStorageType* __status_flags = __device_mem_mgr.get_flags_begin(); + //adding 1 to the number elements to account for the tile id + std::size_t __status_flags_size = __device_mem_mgr.get_num_elements() + 1; auto __fill_event = __lookback_init_submitter<_FlagType, _Type, _BinaryOp, _LookbackInitKernel>{}( - __queue, __status_flags, __status_vals_partial, __status_flags_size, __status_flag_padding); + __queue, __status_flags, __status_vals_partial, __status_flags_size, _FlagType::__padding); std::size_t __current_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __elems_in_tile); std::size_t __current_num_items = __current_num_wgs * __workgroup_size; @@ -388,21 +497,308 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r // we should replace this code with the asynchronous version below. if (0) { - return __queue.submit([=](sycl::handler& __hdl) { - __hdl.depends_on(__prev_event); - __hdl.host_task([=]() { sycl::free(__device_mem, __queue); }); - }); + return __device_mem_mgr.async_free(__prev_event); } else { __prev_event.wait(); - sycl::free(__device_mem, __queue); + __device_mem_mgr.free(); return __prev_event; } } +template +void +single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, + _NumSelectedRange __num_rng, _UnaryPredicate pred, _KernelParam) +{ + using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + using _SizeT = uint64_t; + using _TileIdT = TileId::_TileIdT; + using _KernelName = typename _KernelParam::kernel_name; + + const ::std::size_t n = __in_rng.size(); + + constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; + constexpr ::std::size_t elems_per_workitem = _KernelParam::data_per_workitem; + + // Avoid non_uniform n by padding up to a multiple of wgsize + constexpr std::uint32_t elems_in_tile = wgsize * elems_per_workitem; + ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); + ::std::size_t num_workitems = num_wgs * wgsize; + assert(num_wgs == 1); + + auto event = __queue.submit([&](sycl::handler& hdl) { + auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); + + oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng, __num_rng); + hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), + [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { + auto group = item.get_group(); + auto wg_local_id = item.get_local_id(0); + constexpr ::std::uint32_t stride = wgsize; + + // Global load into local + _SizeT wg_count = 0; + + // Phase 1: Create wg_count and construct in-order wg_copy_if_values + if (elems_in_tile <= n) + { +#pragma unroll + for (size_t i = 0; i < elems_in_tile; i += wgsize) + { + _Type val = __in_rng[i + wg_local_id]; + + _SizeT satisfies_pred = pred(val); + _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, + 
sycl::plus<_SizeT>()); + + if (satisfies_pred) + wg_copy_if_values[count] = val; + + wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); + } + } + else + { + // Edge of input, have to handle memory bounds + // Might have unneccessary group_barrier calls +#pragma unroll + for (size_t i = 0; i < elems_in_tile; i += wgsize) + { + _SizeT satisfies_pred = 0; + // launder is used here to create data without initialization without requiring + // a default constructor or out of bounds access + // TODO: replace with "union" trick to avoid launder, + // see https://github.com/oneapi-src/oneDPL/pull/1495 and https://github.com/oneapi-src/oneDPL/pull/1470 + _Type val = *std::launder(reinterpret_cast<_Type*>(alloca(sizeof(_Type)))); + if (i + wg_local_id < n) + { + val = __in_rng[i + wg_local_id]; + + satisfies_pred = pred(val); + } + _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, + sycl::plus<_SizeT>()); + + if (satisfies_pred) + wg_copy_if_values[count] = val; + + wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); + } + } + + // Phase 3: copy values to global memory + for (int i = wg_local_id; i < wg_count; i += wgsize) + { + __out_rng[i] = wg_copy_if_values[i]; + } + if (group.leader()) + __num_rng[0] = wg_count; + }); + }); + + event.wait(); +} + +template +void +single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, + _UnaryPredicate pred, _KernelParam) +{ + using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + using _SizeT = uint64_t; + using _TileIdT = TileId::_TileIdT; + using _KernelName = typename _KernelParam::kernel_name; + + using _BinaryOp = std::plus<_SizeT>; + + + using _LookbackInitKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __lookback_init_kernel<_KernelName, _SizeT, _BinaryOp>>; + + using _FlagType = __scan_status_flag<_SizeT>; + using _FlagStorageType = typename _FlagType::_FlagStorageType; + + + const ::std::size_t n = __in_rng.size(); + + constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; + constexpr ::std::size_t elems_per_workitem = _KernelParam::data_per_workitem; + + // Avoid non_uniform n by padding up to a multiple of wgsize + constexpr std::uint32_t __elems_in_tile = wgsize * elems_per_workitem; + ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, __elems_in_tile); + ::std::size_t num_workitems = num_wgs * wgsize; + + ScanMemoryManager<_FlagType> scratch(__queue, num_wgs); + scratch.allocate(); + + // Memory Structure: + // [Lookback Scan Memory, Tile Id Counter] + auto __status_vals_full = scratch.get_full_values_begin(); + auto __status_vals_partial = scratch.get_partial_values_begin(); + auto __status_flags = scratch.get_flags_begin(); + //adding 1 to the number elements to account for the tile id + std::size_t __status_flags_size = scratch.get_num_elements() + 1; + + auto __fill_event = __lookback_init_submitter<_FlagType, _SizeT, _BinaryOp, _LookbackInitKernel>{}( + __queue, __status_flags, __status_vals_partial, __status_flags_size, _FlagType::__padding); + + auto event = __queue.submit([&](sycl::handler& hdl) { + auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{__elems_in_tile}, hdl); + + auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); + hdl.depends_on(__fill_event); + + oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng, __num_rng); + hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), 
+ [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { + auto __group = item.get_group(); + auto wg_local_id = item.get_local_id(0); + auto sg = item.get_sub_group(); + constexpr ::std::uint32_t stride = wgsize; + + std::uint32_t __tile_id = 0; + + // Obtain unique ID for this work-group that will be used in decoupled lookback + if (__group.leader()) + { + sycl::atomic_ref<_FlagStorageType, sycl::memory_order::relaxed, sycl::memory_scope::device, + sycl::access::address_space::global_space> + __idx_atomic(__status_flags[__status_flags_size - 1]); + __tile_id = __idx_atomic.fetch_add(1); + } + + __tile_id = sycl::group_broadcast(__group, __tile_id, 0); + + std::size_t __current_offset = static_cast(__tile_id) * __elems_in_tile; + + _SizeT wg_count = 0; + + // Phase 1: Create wg_count and construct in-order wg_copy_if_values + if ((__tile_id + 1) * __elems_in_tile <= n) + { +#pragma unroll + for (size_t i = 0; i < __elems_in_tile; i += wgsize) + { + // TODO: explore scalar impl. Does this allow us to avoid the group broadcast (sync)? + // if load is done in a scalar fashion and provides the same performance, we + // can avoid the broadcast (I think) + // would need to loop over the elements per work item first accumulating into + // satisfies pred, copying to "my slot" in SLM then do scan, then the copy to + // global memory needs to be loaded per work item per element, skipping copies + // when they were not saved. + _Type val = __in_rng[i + wg_local_id + __elems_in_tile * __tile_id]; + + _SizeT satisfies_pred = pred(val); + _SizeT count = sycl::exclusive_scan_over_group(__group, satisfies_pred, wg_count, + sycl::plus<_SizeT>()); + + if (satisfies_pred) + wg_copy_if_values[count] = val; + + wg_count = sycl::group_broadcast(__group, count + satisfies_pred, wgsize - 1); + } + } + else + { + // Edge of input, have to handle memory bounds + // Might have unneccessary group_barrier calls +#pragma unroll + for (size_t i = 0; i < __elems_in_tile; i += wgsize) + { + _SizeT satisfies_pred = 0; + // launder is used here to create data without initialization without requiring + // a default constructor or out of bounds access + // TODO: replace with "union" trick to avoid launder, + // see https://github.com/oneapi-src/oneDPL/pull/1495 and https://github.com/oneapi-src/oneDPL/pull/1470 + _Type val = *std::launder(reinterpret_cast<_Type*>(alloca(sizeof(_Type)))); + if (i + wg_local_id + __elems_in_tile * __tile_id < n) + { + val = __in_rng[i + wg_local_id + __elems_in_tile * __tile_id]; + + satisfies_pred = pred(val); + } + _SizeT count = sycl::exclusive_scan_over_group(__group, satisfies_pred, wg_count, + sycl::plus<_SizeT>()); + + if (satisfies_pred) + wg_copy_if_values[count] = val; + + wg_count = sycl::group_broadcast(__group, count + satisfies_pred, wgsize - 1); + } + } + + // Phase 2: Global scan across wg_count + _SizeT copied_elements = 0; + + __lookback_phase<_FlagType>(__group, sg, __status_flags, __status_vals_full, __status_vals_partial, + __tile_id, wg_count, copied_elements, sycl::plus<_SizeT>()); + + //TODO: explore above comment about scalar load + // Phase 3: copy values to global memory + for (int i = wg_local_id; i < wg_count; i += wgsize) + { + __out_rng[copied_elements + i] = wg_copy_if_values[i]; + } + if (__tile_id == (num_wgs - 1) && __group.leader()) + __num_rng[0] = copied_elements + wg_count; + }); + }); + + event.wait(); + scratch.free(); +} + } // namespace __impl +template +void +single_pass_single_wg_copy_if(sycl::queue __queue, _InIterator 
__in_begin, _InIterator __in_end, + _OutIterator __out_begin, _NumSelectedRange __num_begin, _UnaryPredicate pred, + _KernelParam __param = {}) +{ + auto __n = __in_end - __in_begin; + + auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); + auto __buf1 = __keep1(__in_begin, __in_end); + auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); + auto __buf2 = __keep2(__out_begin, __out_begin + __n); + + auto __keep_num = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _NumSelectedRange>(); + auto __buf_num = __keep2(__num_begin, __num_begin + 1); + + __impl::single_pass_copy_if_impl_single_wg(__queue, __buf1.all_view(), __buf2.all_view(), + __buf_num.all_view(), pred, __param); +} + +template +void +single_pass_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, + _NumSelectedRange __num_begin, _UnaryPredicate pred, _KernelParam __param = {}) +{ + auto __n = __in_end - __in_begin; + + auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); + auto __buf1 = __keep1(__in_begin, __in_end); + auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); + auto __buf2 = __keep2(__out_begin, __out_begin + __n); + + auto __keep_num = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _NumSelectedRange>(); + auto __buf_num = __keep2(__num_begin, __num_begin + 1); + + __impl::single_pass_copy_if_impl(__queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), pred, __param); +} + + + template sycl::event inclusive_scan(sycl::queue __queue, _InRng&& __in_rng, _OutRng&& __out_rng, _BinaryOp __binary_op, diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h index aaca8a4a81e..8752c4baf0e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h @@ -716,286 +716,6 @@ single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InItera } } -template -void -single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, - _NumSelectedRange __num_rng, _UnaryPredicate pred) -{ - using _Type = oneapi::dpl::__internal::__value_t<_InRange>; - using _SizeT = uint64_t; - using _TileIdT = TileId::_TileIdT; - - const ::std::size_t n = __in_rng.size(); - - constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; - constexpr ::std::size_t elems_per_workitem = _KernelParam::data_per_workitem; - - // Avoid non_uniform n by padding up to a multiple of wgsize - constexpr std::uint32_t elems_in_tile = wgsize * elems_per_workitem; - ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); - ::std::size_t num_workitems = num_wgs * wgsize; - assert(num_wgs == 1); - - auto event = __queue.submit([&](sycl::handler& hdl) { - auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); - - oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng, __num_rng); - hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), - [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { - auto group = item.get_group(); - auto wg_local_id = item.get_local_id(0); - 
constexpr ::std::uint32_t stride = wgsize; - - // Global load into local - _SizeT wg_count = 0; - - // Phase 1: Create wg_count and construct in-order wg_copy_if_values - if (elems_in_tile <= n) - { -#pragma unroll - for (size_t i = 0; i < elems_in_tile; i += wgsize) - { - _Type val = __in_rng[i + wg_local_id]; - - _SizeT satisfies_pred = pred(val); - _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, - sycl::plus<_SizeT>()); - - if (satisfies_pred) - wg_copy_if_values[count] = val; - - wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); - } - } - else - { - // Edge of input, have to handle memory bounds - // Might have unneccessary group_barrier calls -#pragma unroll - for (size_t i = 0; i < elems_in_tile; i += wgsize) - { - _SizeT satisfies_pred = 0; - _Type val = *std::launder(reinterpret_cast<_Type*>(alloca(sizeof(_Type)))); - if (i + wg_local_id < n) - { - val = __in_rng[i + wg_local_id]; - - satisfies_pred = pred(val); - } - _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, - sycl::plus<_SizeT>()); - - if (satisfies_pred) - wg_copy_if_values[count] = val; - - wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); - } - } - - // Phase 3: copy values to global memory - for (int i = wg_local_id; i < wg_count; i += wgsize) - { - __out_rng[i] = wg_copy_if_values[i]; - } - if (group.leader()) - __num_rng[0] = wg_count; - }); - }); - - event.wait(); -} - -template -void -single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, - _UnaryPredicate pred) -{ - using _Type = oneapi::dpl::__internal::__value_t<_InRange>; - using _SizeT = uint64_t; - using _TileIdT = TileId::_TileIdT; - using _LookbackScanMemory = LookbackScanMemory<_SizeT, _UseAtomic64>; - using _FlagT = typename _LookbackScanMemory::_FlagT; - - const ::std::size_t n = __in_rng.size(); - - constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; - constexpr ::std::size_t elems_per_workitem = _KernelParam::data_per_workitem; - - // Avoid non_uniform n by padding up to a multiple of wgsize - constexpr std::uint32_t elems_in_tile = wgsize * elems_per_workitem; - ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); - ::std::size_t num_workitems = num_wgs * wgsize; - - ScanMemoryManager<_SizeT, _UseAtomic64, LookbackScanMemory, TileId> scratch(__queue); - scratch.allocate(num_wgs); - - // Memory Structure: - // [Lookback Scan Memory, Tile Id Counter] - auto scan_memory_begin = scratch.scan_memory_ptr(); - auto status_flags_begin = _LookbackScanMemory::get_flags_begin(scan_memory_begin, num_wgs); - auto tile_id_begin = scratch.tile_id_ptr(); - - ::std::size_t num_elements = _LookbackScanMemory::get_num_elements(num_wgs); - // fill_num_wgs num_elements + 1 to also initialize tile_id_counter - ::std::size_t fill_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(num_elements + 1, wgsize); - - auto fill_event = __queue.memset(status_flags_begin, 0, num_elements * sizeof(_FlagT) + 1 * sizeof(_TileIdT)); - - auto event = __queue.submit([&](sycl::handler& hdl) { - auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); - - auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); - hdl.depends_on(fill_event); - - oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng, __num_rng); - hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), - [=](const sycl::nd_item<1>& item) 
[[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { - auto group = item.get_group(); - auto wg_local_id = item.get_local_id(0); - auto sg = item.get_sub_group(); - constexpr ::std::uint32_t stride = wgsize; - - // Init tile_id - std::uint32_t tile_id; - if constexpr (std::is_same_v<_UseDynamicTileID, ::std::true_type>) - { - // Obtain unique ID for this work-group that will be used in decoupled lookback - TileId dynamic_tile_id(tile_id_begin); - if (group.leader()) - { - tile_id_lacc[0] = dynamic_tile_id.fetch_inc(); - } - sycl::group_barrier(group); - tile_id = tile_id_lacc[0]; - } - else - { - tile_id = group.get_group_linear_id(); - } - - _SizeT wg_count = 0; - - // Phase 1: Create wg_count and construct in-order wg_copy_if_values - if ((tile_id + 1) * elems_in_tile <= n) - { -#pragma unroll - for (size_t i = 0; i < elems_in_tile; i += wgsize) - { - _Type val = __in_rng[i + wg_local_id + elems_in_tile * tile_id]; - - _SizeT satisfies_pred = pred(val); - _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, - sycl::plus<_SizeT>()); - - if (satisfies_pred) - wg_copy_if_values[count] = val; - - wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); - } - } - else - { - // Edge of input, have to handle memory bounds - // Might have unneccessary group_barrier calls -#pragma unroll - for (size_t i = 0; i < elems_in_tile; i += wgsize) - { - _SizeT satisfies_pred = 0; - _Type val = *std::launder(reinterpret_cast<_Type*>(alloca(sizeof(_Type)))); - if (i + wg_local_id + elems_in_tile * tile_id < n) - { - val = __in_rng[i + wg_local_id + elems_in_tile * tile_id]; - - satisfies_pred = pred(val); - } - _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, - sycl::plus<_SizeT>()); - - if (satisfies_pred) - wg_copy_if_values[count] = val; - - wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); - } - } - - // Phase 2: Global scan across wg_count - _SizeT prev_sum = 0; - - // The first sub-group will query the previous tiles to find a prefix - if (sg.get_group_id() == 0) - { - _LookbackScanMemory scan_mem(scan_memory_begin, num_wgs); - - if (group.leader()) - scan_mem.set_partial(tile_id, wg_count); - - // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum - prev_sum = cooperative_lookback()(tile_id, sg, sycl::plus<_SizeT>(), scan_mem); - - if (group.leader()) - scan_mem.set_full(tile_id, prev_sum + wg_count); - } - - _SizeT start_idx = sycl::group_broadcast(group, prev_sum, 0); - - // Phase 3: copy values to global memory - for (int i = wg_local_id; i < wg_count; i += wgsize) - { - __out_rng[start_idx + i] = wg_copy_if_values[i]; - } - if (tile_id == (num_wgs - 1) && group.leader()) - __num_rng[0] = start_idx + wg_count; - }); - }); - - event.wait(); - scratch.free(); -} - -template -void -single_pass_single_wg_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, - _OutIterator __out_begin, _NumSelectedRange __num_begin, _UnaryPredicate pred) -{ - auto __n = __in_end - __in_begin; - - auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); - auto __buf1 = __keep1(__in_begin, __in_end); - auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); - auto __buf2 = __keep2(__out_begin, __out_begin + __n); - - auto __keep_num = - 
oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _NumSelectedRange>(); - auto __buf_num = __keep2(__num_begin, __num_begin + 1); - - single_pass_copy_if_impl_single_wg<_KernelParam>(__queue, __buf1.all_view(), __buf2.all_view(), - __buf_num.all_view(), pred); -} - -template -void -single_pass_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, - _NumSelectedRange __num_begin, _UnaryPredicate pred) -{ - auto __n = __in_end - __in_begin; - - auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); - auto __buf1 = __keep1(__in_begin, __in_end); - auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); - auto __buf2 = __keep2(__out_begin, __out_begin + __n); - - auto __keep_num = - oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _NumSelectedRange>(); - auto __buf_num = __keep2(__num_begin, __num_begin + 1); - - single_pass_copy_if_impl<_KernelParam, /* UseAtomic64 */ std::true_type, /* UseDynamicTileID */ std::true_type>( - __queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), pred); -} - } // inline namespace igpu } // namespace oneapi::dpl::experimental::kt diff --git a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp index a77b76491e7..e0a079eaa3f 100644 --- a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp @@ -16,17 +16,15 @@ #include "support/test_config.h" #include "support/utils.h" +#include #include _PSTL_TEST_HEADER(execution) #include _PSTL_TEST_HEADER(numeric) using namespace TestUtils; -template -class CopyIfKernel; - -template +template bool -test(Predicate pred, Generator gen) +test(Predicate pred, Generator gen, KernelParam param) { bool all_passed = true; sycl::queue q; @@ -43,12 +41,8 @@ test(Predicate pred, Generator gen) T* out_ptr = sycl::malloc_device(n, q); size_t* out_num = sycl::malloc_device(1, q); - constexpr int n_elements_per_workitem = 8; - q.copy(in.data(), in_ptr, n).wait(); - using KernelParams = - oneapi::dpl::experimental::kt::kernel_param>; - oneapi::dpl::experimental::kt::single_pass_copy_if(q, in_ptr, in_ptr + n, out_ptr, out_num, pred); + oneapi::dpl::experimental::kt::gpu::single_pass_copy_if(q, in_ptr, in_ptr + n, out_ptr, out_num, pred, param); Sequence kt_out(n); size_t num_selected = 0; @@ -93,12 +87,18 @@ int main() { bool all_passed = true; + constexpr int n_elements_per_workitem = 8; + + auto param = oneapi::dpl::experimental::kt::kernel_param{}; all_passed &= test([](const float64_t& x) { return x * x <= 1024; }, - [](size_t j) { return ((j + 1) % 7 & 2) != 0 ? float64_t(j % 32) : float64_t(j % 33 + 34); }); - all_passed &= test([](const int&) { return true; }, [](size_t j) { return j; }); + [](size_t j) { return ((j + 1) % 7 & 2) != 0 ? float64_t(j % 32) : float64_t(j % 33 + 34); }, + TestUtils::get_new_kernel_params<0>(param)); + all_passed &= test([](const int&) { return true; }, [](size_t j) { return j; }, + TestUtils::get_new_kernel_params<1>(param)); all_passed &= test([](const std::int32_t& x) { return x != 42; }, - [](size_t j) { return ((j + 1) % 5 & 2) != 0 ? std::int32_t(j + 1) : 42; }); + [](size_t j) { return ((j + 1) % 5 & 2) != 0 ? 
std::int32_t(j + 1) : 42; }, + TestUtils::get_new_kernel_params<2>(param)); return all_passed; } From 1b10214d09bf8f9f231e914a937c94f506ded22f Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Fri, 24 May 2024 10:32:47 -0400 Subject: [PATCH 085/134] formatting Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 341 +++++++++--------- 1 file changed, 167 insertions(+), 174 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index e420dad591a..df85e047ca1 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -44,14 +44,13 @@ class __lookback_kernel; static constexpr int SUBGROUP_SIZE = 32; - template struct ScanMemoryManager { using _FlagStorageType = typename _ScanStatusFlag::_FlagStorageType; using _ValueType = typename _ScanStatusFlag::_ValueType; - ScanMemoryManager(sycl::queue __q, std::size_t num_wgs) : __queue{__q}, __num_wgs(num_wgs) {}; + ScanMemoryManager(sycl::queue __q, std::size_t num_wgs) : __queue{__q}, __num_wgs(num_wgs){}; ::std::uint8_t* scan_memory_ptr() noexcept @@ -72,12 +71,10 @@ struct ScanMemoryManager sycl::event async_free(sycl::event dependency) { - return __queue.submit( - [e = dependency, ptr = scan_memory_begin, __q = __queue](sycl::handler& hdl) - { - hdl.depends_on(e); - hdl.host_task([=]() { sycl::free(ptr, __q); }); - }); + return __queue.submit([e = dependency, ptr = scan_memory_begin, __q = __queue](sycl::handler& hdl) { + hdl.depends_on(e); + hdl.host_task([=]() { sycl::free(ptr, __q); }); + }); } void @@ -116,7 +113,6 @@ struct ScanMemoryManager } private: - std::size_t get_tile_values_bytes() { @@ -273,8 +269,8 @@ template void __lookback_phase(const _Group& __group, const _SubGroup& __subgroup, _StatusFlags& __status_flags, - _StatusValues& __status_vals_full, _StatusValues& __status_vals_partial, std::uint32_t __tile_id, - _Type& __local_reduction, _Type& __prev_tile_reduction, _BinaryOp __binary_op) + _StatusValues& __status_vals_full, _StatusValues& __status_vals_partial, std::uint32_t __tile_id, + _Type& __local_reduction, _Type& __prev_tile_reduction, _BinaryOp __binary_op) { // The first sub-group will query the previous tiles to find a prefix if (__subgroup.get_group_id() == 0) @@ -296,7 +292,6 @@ __lookback_phase(const _Group& __group, const _SubGroup& __subgroup, _StatusFlag __prev_tile_reduction = sycl::group_broadcast(__group, __prev_tile_reduction, 0); } - template struct __lookback_submitter; @@ -382,7 +377,7 @@ struct __lookback_kernel_func _Type __prev_tile_reduction{}; __lookback_phase<_FlagType>(__group, __subgroup, __status_flags, __status_vals_full, __status_vals_partial, - __tile_id, __local_reduction, __prev_tile_reduction, __binary_op); + __tile_id, __local_reduction, __prev_tile_reduction, __binary_op); sycl::joint_inclusive_scan(__group, __tile_vals_ptr, __tile_vals_ptr + __wg_local_memory_size, __out_begin, __binary_op, __prev_tile_reduction); @@ -469,7 +464,6 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r std::size_t __elems_in_tile = __workgroup_size * __data_per_workitem; std::size_t __num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __elems_in_tile); - ScanMemoryManager<_FlagType> __device_mem_mgr(__queue, __num_wgs); __device_mem_mgr.allocate(); @@ -507,8 +501,8 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r } } -template +template void 
single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, _UnaryPredicate pred, _KernelParam) @@ -533,77 +527,78 @@ single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _Ou auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng, __num_rng); - hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), - [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { - auto group = item.get_group(); - auto wg_local_id = item.get_local_id(0); - constexpr ::std::uint32_t stride = wgsize; - - // Global load into local - _SizeT wg_count = 0; - - // Phase 1: Create wg_count and construct in-order wg_copy_if_values - if (elems_in_tile <= n) - { + hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& + item) [[intel::reqd_sub_group_size( + SUBGROUP_SIZE)]] { + auto group = item.get_group(); + auto wg_local_id = item.get_local_id(0); + constexpr ::std::uint32_t stride = wgsize; + + // Global load into local + _SizeT wg_count = 0; + + // Phase 1: Create wg_count and construct in-order wg_copy_if_values + if (elems_in_tile <= n) + { #pragma unroll - for (size_t i = 0; i < elems_in_tile; i += wgsize) - { - _Type val = __in_rng[i + wg_local_id]; - - _SizeT satisfies_pred = pred(val); - _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, - sycl::plus<_SizeT>()); - - if (satisfies_pred) - wg_copy_if_values[count] = val; - - wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); - } - } - else - { + for (size_t i = 0; i < elems_in_tile; i += wgsize) + { + _Type val = __in_rng[i + wg_local_id]; + + _SizeT satisfies_pred = pred(val); + _SizeT count = + sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); + + if (satisfies_pred) + wg_copy_if_values[count] = val; + + wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); + } + } + else + { // Edge of input, have to handle memory bounds // Might have unneccessary group_barrier calls #pragma unroll - for (size_t i = 0; i < elems_in_tile; i += wgsize) - { - _SizeT satisfies_pred = 0; - // launder is used here to create data without initialization without requiring - // a default constructor or out of bounds access - // TODO: replace with "union" trick to avoid launder, - // see https://github.com/oneapi-src/oneDPL/pull/1495 and https://github.com/oneapi-src/oneDPL/pull/1470 - _Type val = *std::launder(reinterpret_cast<_Type*>(alloca(sizeof(_Type)))); - if (i + wg_local_id < n) - { - val = __in_rng[i + wg_local_id]; - - satisfies_pred = pred(val); - } - _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, - sycl::plus<_SizeT>()); - - if (satisfies_pred) - wg_copy_if_values[count] = val; - - wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); - } - } - - // Phase 3: copy values to global memory - for (int i = wg_local_id; i < wg_count; i += wgsize) - { - __out_rng[i] = wg_copy_if_values[i]; - } - if (group.leader()) - __num_rng[0] = wg_count; - }); + for (size_t i = 0; i < elems_in_tile; i += wgsize) + { + _SizeT satisfies_pred = 0; + // launder is used here to create data without initialization without requiring + // a default constructor or out of bounds access + // TODO: replace with "union" trick to avoid launder, + // see 
https://github.com/oneapi-src/oneDPL/pull/1495 and https://github.com/oneapi-src/oneDPL/pull/1470 + _Type val = *std::launder(reinterpret_cast<_Type*>(alloca(sizeof(_Type)))); + if (i + wg_local_id < n) + { + val = __in_rng[i + wg_local_id]; + + satisfies_pred = pred(val); + } + _SizeT count = + sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); + + if (satisfies_pred) + wg_copy_if_values[count] = val; + + wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); + } + } + + // Phase 3: copy values to global memory + for (int i = wg_local_id; i < wg_count; i += wgsize) + { + __out_rng[i] = wg_copy_if_values[i]; + } + if (group.leader()) + __num_rng[0] = wg_count; + }); }); event.wait(); } -template +template void single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, _UnaryPredicate pred, _KernelParam) @@ -615,14 +610,12 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ using _BinaryOp = std::plus<_SizeT>; - using _LookbackInitKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __lookback_init_kernel<_KernelName, _SizeT, _BinaryOp>>; using _FlagType = __scan_status_flag<_SizeT>; using _FlagStorageType = typename _FlagType::_FlagStorageType; - const ::std::size_t n = __in_rng.size(); constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; @@ -654,99 +647,100 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ hdl.depends_on(__fill_event); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng, __num_rng); - hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), - [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { - auto __group = item.get_group(); - auto wg_local_id = item.get_local_id(0); - auto sg = item.get_sub_group(); - constexpr ::std::uint32_t stride = wgsize; - - std::uint32_t __tile_id = 0; - - // Obtain unique ID for this work-group that will be used in decoupled lookback - if (__group.leader()) - { - sycl::atomic_ref<_FlagStorageType, sycl::memory_order::relaxed, sycl::memory_scope::device, - sycl::access::address_space::global_space> - __idx_atomic(__status_flags[__status_flags_size - 1]); - __tile_id = __idx_atomic.fetch_add(1); - } + hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& + item) [[intel::reqd_sub_group_size( + SUBGROUP_SIZE)]] { + auto __group = item.get_group(); + auto wg_local_id = item.get_local_id(0); + auto sg = item.get_sub_group(); + constexpr ::std::uint32_t stride = wgsize; + + std::uint32_t __tile_id = 0; + + // Obtain unique ID for this work-group that will be used in decoupled lookback + if (__group.leader()) + { + sycl::atomic_ref<_FlagStorageType, sycl::memory_order::relaxed, sycl::memory_scope::device, + sycl::access::address_space::global_space> + __idx_atomic(__status_flags[__status_flags_size - 1]); + __tile_id = __idx_atomic.fetch_add(1); + } - __tile_id = sycl::group_broadcast(__group, __tile_id, 0); + __tile_id = sycl::group_broadcast(__group, __tile_id, 0); - std::size_t __current_offset = static_cast(__tile_id) * __elems_in_tile; + std::size_t __current_offset = static_cast(__tile_id) * __elems_in_tile; - _SizeT wg_count = 0; + _SizeT wg_count = 0; - // Phase 1: Create wg_count and construct in-order wg_copy_if_values - if ((__tile_id + 1) * __elems_in_tile <= n) - { + // Phase 1: Create wg_count and construct in-order wg_copy_if_values + if 
((__tile_id + 1) * __elems_in_tile <= n) + { #pragma unroll - for (size_t i = 0; i < __elems_in_tile; i += wgsize) - { - // TODO: explore scalar impl. Does this allow us to avoid the group broadcast (sync)? - // if load is done in a scalar fashion and provides the same performance, we - // can avoid the broadcast (I think) - // would need to loop over the elements per work item first accumulating into - // satisfies pred, copying to "my slot" in SLM then do scan, then the copy to - // global memory needs to be loaded per work item per element, skipping copies - // when they were not saved. - _Type val = __in_rng[i + wg_local_id + __elems_in_tile * __tile_id]; - - _SizeT satisfies_pred = pred(val); - _SizeT count = sycl::exclusive_scan_over_group(__group, satisfies_pred, wg_count, - sycl::plus<_SizeT>()); - - if (satisfies_pred) - wg_copy_if_values[count] = val; - - wg_count = sycl::group_broadcast(__group, count + satisfies_pred, wgsize - 1); - } - } - else - { + for (size_t i = 0; i < __elems_in_tile; i += wgsize) + { + // TODO: explore scalar impl. Does this allow us to avoid the group broadcast (sync)? + // if load is done in a scalar fashion and provides the same performance, we + // can avoid the broadcast (I think) + // would need to loop over the elements per work item first accumulating into + // satisfies pred, copying to "my slot" in SLM then do scan, then the copy to + // global memory needs to be loaded per work item per element, skipping copies + // when they were not saved. + _Type val = __in_rng[i + wg_local_id + __elems_in_tile * __tile_id]; + + _SizeT satisfies_pred = pred(val); + _SizeT count = + sycl::exclusive_scan_over_group(__group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); + + if (satisfies_pred) + wg_copy_if_values[count] = val; + + wg_count = sycl::group_broadcast(__group, count + satisfies_pred, wgsize - 1); + } + } + else + { // Edge of input, have to handle memory bounds // Might have unneccessary group_barrier calls #pragma unroll - for (size_t i = 0; i < __elems_in_tile; i += wgsize) - { - _SizeT satisfies_pred = 0; - // launder is used here to create data without initialization without requiring - // a default constructor or out of bounds access - // TODO: replace with "union" trick to avoid launder, - // see https://github.com/oneapi-src/oneDPL/pull/1495 and https://github.com/oneapi-src/oneDPL/pull/1470 - _Type val = *std::launder(reinterpret_cast<_Type*>(alloca(sizeof(_Type)))); - if (i + wg_local_id + __elems_in_tile * __tile_id < n) - { - val = __in_rng[i + wg_local_id + __elems_in_tile * __tile_id]; - - satisfies_pred = pred(val); - } - _SizeT count = sycl::exclusive_scan_over_group(__group, satisfies_pred, wg_count, - sycl::plus<_SizeT>()); - - if (satisfies_pred) - wg_copy_if_values[count] = val; - - wg_count = sycl::group_broadcast(__group, count + satisfies_pred, wgsize - 1); - } - } - - // Phase 2: Global scan across wg_count - _SizeT copied_elements = 0; - - __lookback_phase<_FlagType>(__group, sg, __status_flags, __status_vals_full, __status_vals_partial, - __tile_id, wg_count, copied_elements, sycl::plus<_SizeT>()); - - //TODO: explore above comment about scalar load - // Phase 3: copy values to global memory - for (int i = wg_local_id; i < wg_count; i += wgsize) - { - __out_rng[copied_elements + i] = wg_copy_if_values[i]; - } - if (__tile_id == (num_wgs - 1) && __group.leader()) - __num_rng[0] = copied_elements + wg_count; - }); + for (size_t i = 0; i < __elems_in_tile; i += wgsize) + { + _SizeT satisfies_pred = 0; + // launder is used 
here to create data without initialization without requiring + // a default constructor or out of bounds access + // TODO: replace with "union" trick to avoid launder, + // see https://github.com/oneapi-src/oneDPL/pull/1495 and https://github.com/oneapi-src/oneDPL/pull/1470 + _Type val = *std::launder(reinterpret_cast<_Type*>(alloca(sizeof(_Type)))); + if (i + wg_local_id + __elems_in_tile * __tile_id < n) + { + val = __in_rng[i + wg_local_id + __elems_in_tile * __tile_id]; + + satisfies_pred = pred(val); + } + _SizeT count = + sycl::exclusive_scan_over_group(__group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); + + if (satisfies_pred) + wg_copy_if_values[count] = val; + + wg_count = sycl::group_broadcast(__group, count + satisfies_pred, wgsize - 1); + } + } + + // Phase 2: Global scan across wg_count + _SizeT copied_elements = 0; + + __lookback_phase<_FlagType>(__group, sg, __status_flags, __status_vals_full, __status_vals_partial, + __tile_id, wg_count, copied_elements, sycl::plus<_SizeT>()); + + //TODO: explore above comment about scalar load + // Phase 3: copy values to global memory + for (int i = wg_local_id; i < wg_count; i += wgsize) + { + __out_rng[copied_elements + i] = wg_copy_if_values[i]; + } + if (__tile_id == (num_wgs - 1) && __group.leader()) + __num_rng[0] = copied_elements + wg_count; + }); }); event.wait(); @@ -755,8 +749,8 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ } // namespace __impl -template +template void single_pass_single_wg_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _NumSelectedRange __num_begin, _UnaryPredicate pred, @@ -773,12 +767,12 @@ single_pass_single_wg_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIt oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _NumSelectedRange>(); auto __buf_num = __keep2(__num_begin, __num_begin + 1); - __impl::single_pass_copy_if_impl_single_wg(__queue, __buf1.all_view(), __buf2.all_view(), - __buf_num.all_view(), pred, __param); + __impl::single_pass_copy_if_impl_single_wg(__queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), + pred, __param); } -template +template void single_pass_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _NumSelectedRange __num_begin, _UnaryPredicate pred, _KernelParam __param = {}) @@ -794,11 +788,10 @@ single_pass_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __i oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _NumSelectedRange>(); auto __buf_num = __keep2(__num_begin, __num_begin + 1); - __impl::single_pass_copy_if_impl(__queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), pred, __param); + __impl::single_pass_copy_if_impl(__queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), pred, + __param); } - - template sycl::event inclusive_scan(sycl::queue __queue, _InRng&& __in_rng, _OutRng&& __out_rng, _BinaryOp __binary_op, From 593c218c9d2f00d8c94e97e64dddd732ec4cc4b5 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Fri, 24 May 2024 10:54:06 -0400 Subject: [PATCH 086/134] remove launder in favor of lazy ctor union Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 38 +++++++++---------- include/oneapi/dpl/pstl/utils.h | 8 ++++ 2 files changed, 26 insertions(+), 20 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h 
b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index df85e047ca1..14027002f1b 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -563,23 +563,21 @@ single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _Ou for (size_t i = 0; i < elems_in_tile; i += wgsize) { _SizeT satisfies_pred = 0; - // launder is used here to create data without initialization without requiring - // a default constructor or out of bounds access - // TODO: replace with "union" trick to avoid launder, - // see https://github.com/oneapi-src/oneDPL/pull/1495 and https://github.com/oneapi-src/oneDPL/pull/1470 - _Type val = *std::launder(reinterpret_cast<_Type*>(alloca(sizeof(_Type)))); + oneapi::dpl::__internal::__lazy_ctor_storage<_Type> val; if (i + wg_local_id < n) { - val = __in_rng[i + wg_local_id]; + new (&val.__v) _Type(__in_rng[i + wg_local_id]); - satisfies_pred = pred(val); + satisfies_pred = pred(val.__v); } _SizeT count = sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); - - if (satisfies_pred) - wg_copy_if_values[count] = val; - + if (i + wg_local_id < n) + { + if (satisfies_pred) + wg_copy_if_values[count] = std::move(val.__v); + val.__v.~_Type(); + } wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); } } @@ -705,22 +703,22 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ for (size_t i = 0; i < __elems_in_tile; i += wgsize) { _SizeT satisfies_pred = 0; - // launder is used here to create data without initialization without requiring - // a default constructor or out of bounds access - // TODO: replace with "union" trick to avoid launder, - // see https://github.com/oneapi-src/oneDPL/pull/1495 and https://github.com/oneapi-src/oneDPL/pull/1470 - _Type val = *std::launder(reinterpret_cast<_Type*>(alloca(sizeof(_Type)))); + oneapi::dpl::__internal::__lazy_ctor_storage<_Type> val; if (i + wg_local_id + __elems_in_tile * __tile_id < n) { - val = __in_rng[i + wg_local_id + __elems_in_tile * __tile_id]; + new (&val.__v) _Type(__in_rng[i + wg_local_id + __elems_in_tile * __tile_id]); - satisfies_pred = pred(val); + satisfies_pred = pred(val.__v); } _SizeT count = sycl::exclusive_scan_over_group(__group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); - if (satisfies_pred) - wg_copy_if_values[count] = val; + if (i + wg_local_id + __elems_in_tile * __tile_id < n) + { + if (satisfies_pred) + wg_copy_if_values[count] = std::move(val.__v); + val.__v.~_Type(); + } wg_count = sycl::group_broadcast(__group, count + satisfies_pred, wgsize - 1); } diff --git a/include/oneapi/dpl/pstl/utils.h b/include/oneapi/dpl/pstl/utils.h index e8bbde63c04..c68e74e6ef7 100644 --- a/include/oneapi/dpl/pstl/utils.h +++ b/include/oneapi/dpl/pstl/utils.h @@ -765,6 +765,14 @@ struct __is_iterator_type<_T, std::void_t::dif template static constexpr bool __is_iterator_type_v = __is_iterator_type<_T>::value; +//For use to lazily create objects values of type _Tp without requiring a default constructibility of _Tp +template +union __lazy_ctor_storage +{ + _Tp __v; + __lazy_ctor_storage() {} +}; + } // namespace __internal } // namespace dpl } // namespace oneapi From 5a1752ce32cabc2b9a033be2a6098d23488fb8ee Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Fri, 24 May 2024 11:22:20 -0400 Subject: [PATCH 087/134] distinguishing kernel names Signed-off-by: Dan Hoeflinger --- .../oneapi/dpl/experimental/kt/single_pass_scan.h | 15 
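Aside on the preceding patch (086, "remove launder in favor of lazy ctor union"): the new __lazy_ctor_storage union reserves correctly aligned storage for a value without constructing it, so the kernel can defer construction (via placement new) to the in-bounds branch and destroy the value by hand afterwards. Below is a minimal standalone sketch of the same technique; the names (lazy_storage, slot) are illustrative and not part of oneDPL.

    #include <iostream>
    #include <memory>
    #include <new>
    #include <string>

    // The union member is never implicitly constructed or destroyed, so T does not
    // need a default constructor and no T object exists until placement new creates one.
    template <typename T>
    union lazy_storage
    {
        T value;
        lazy_storage() {}   // begins the union's lifetime, not value's
        ~lazy_storage() {}  // destroying value is the caller's responsibility
    };

    int main()
    {
        lazy_storage<std::string> slot;              // no std::string constructed yet
        bool in_bounds = true;                       // stands in for the i + wg_local_id < n check
        if (in_bounds)
        {
            new (&slot.value) std::string("hello");  // construct only when a real element is loaded
            std::cout << slot.value << '\n';
            std::destroy_at(&slot.value);            // manual destruction, mirroring val.__v.~_Type()
        }
    }

Compared with the earlier alloca/std::launder workaround, the union keeps the storage on the stack with the right alignment and avoids pointer laundering entirely.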
++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 14027002f1b..880aaf2b411 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -36,6 +36,15 @@ namespace gpu namespace __impl { +template +class __copy_if_kernel; + +template +class __copy_if_single_wg_kernel; + +template +class __inclusive_scan_kernel; + template class __lookback_init_kernel; @@ -426,7 +435,7 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r using _FlagType = __scan_status_flag<_Type>; using _FlagStorageType = typename _FlagType::_FlagStorageType; - using _KernelName = typename _KernelParam::kernel_name; + using _KernelName = __inclusive_scan_kernel; using _LookbackInitKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __lookback_init_kernel<_KernelName, _Type, _BinaryOp>>; using _LookbackKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< @@ -510,7 +519,7 @@ single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _Ou using _Type = oneapi::dpl::__internal::__value_t<_InRange>; using _SizeT = uint64_t; using _TileIdT = TileId::_TileIdT; - using _KernelName = typename _KernelParam::kernel_name; + using _KernelName = __copy_if_single_wg_kernel; const ::std::size_t n = __in_rng.size(); @@ -604,7 +613,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ using _Type = oneapi::dpl::__internal::__value_t<_InRange>; using _SizeT = uint64_t; using _TileIdT = TileId::_TileIdT; - using _KernelName = typename _KernelParam::kernel_name; + using _KernelName = __copy_if_kernel; using _BinaryOp = std::plus<_SizeT>; From a0576c3c2967a54debc55a2e0fe3b4c7682967b4 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Fri, 24 May 2024 11:38:39 -0400 Subject: [PATCH 088/134] uglify Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 267 +++++++++--------- 1 file changed, 126 insertions(+), 141 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 880aaf2b411..ede5ed4c2ae 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -54,71 +54,62 @@ class __lookback_kernel; static constexpr int SUBGROUP_SIZE = 32; template -struct ScanMemoryManager +struct __scan_lookback_mem_mgr { using _FlagStorageType = typename _ScanStatusFlag::_FlagStorageType; using _ValueType = typename _ScanStatusFlag::_ValueType; - ScanMemoryManager(sycl::queue __q, std::size_t num_wgs) : __queue{__q}, __num_wgs(num_wgs){}; - - ::std::uint8_t* - scan_memory_ptr() noexcept - { - return scan_memory_begin; - }; + __scan_lookback_mem_mgr(sycl::queue __q, std::size_t __num_wgs) : __queue{__q}, __num_workgroups(__num_wgs){}; void allocate() { - ::std::size_t scan_memory_size = get_memory_size(); - - scan_memory_begin = sycl::malloc_device<::std::uint8_t>(scan_memory_size, __queue); - if (!scan_memory_begin) + __scan_memory_begin = sycl::malloc_device(get_memory_size(), __queue); + if (!__scan_memory_begin) throw std::bad_alloc(); } sycl::event - async_free(sycl::event dependency) + async_free(sycl::event __dependency) { - return __queue.submit([e = dependency, ptr = scan_memory_begin, __q = __queue](sycl::handler& hdl) { - hdl.depends_on(e); 
- hdl.host_task([=]() { sycl::free(ptr, __q); }); + return __queue.submit([__e = __dependency, __ptr = __scan_memory_begin, __q = __queue](sycl::handler& __hdl) { + __hdl.depends_on(__e); + __hdl.host_task([=]() { sycl::free(__ptr, __q); }); }); } void free() { - sycl::free(scan_memory_begin, __queue); + sycl::free(__scan_memory_begin, __queue); } _FlagStorageType* get_flags_begin() { // Aligned flags - ::std::size_t tile_values_bytes = get_tile_values_bytes(); - void* base_flags = reinterpret_cast(scan_memory_begin + tile_values_bytes); - auto remainder = get_padded_flag_bytes(); // scan_memory_bytes - tile_values_bytes + void* __base_flags = reinterpret_cast(__scan_memory_begin + get_tile_values_bytes()); + auto __remainder = get_padded_flag_bytes(); return reinterpret_cast<_FlagStorageType*>( - ::std::align(::std::alignment_of_v<_FlagStorageType>, get_flag_bytes(), base_flags, remainder)); + ::std::align(::std::alignment_of_v<_FlagStorageType>, get_flag_bytes(), __base_flags, __remainder)); } _ValueType* get_partial_values_begin() { - return reinterpret_cast<_ValueType*>(scan_memory_begin); + return reinterpret_cast<_ValueType*>(__scan_memory_begin); } _ValueType* get_full_values_begin() { - return reinterpret_cast<_ValueType*>(scan_memory_begin + get_num_elements() * sizeof(_ValueType)); + return reinterpret_cast<_ValueType*>(__scan_memory_begin + get_num_elements() * sizeof(_ValueType)); } std::size_t get_num_elements() { - return _ScanStatusFlag::__padding + __num_wgs; + return _ScanStatusFlag::__padding + __num_workgroups; } private: @@ -144,16 +135,11 @@ struct ScanMemoryManager std::size_t get_memory_size() { - // sizeof(_T) extra bytes are not needed because data is going at the beginning of the scratch - ::std::size_t tile_values_bytes = get_tile_values_bytes(); - // Padding to provide room for aligment - ::std::size_t flag_bytes = get_padded_flag_bytes(); - - return tile_values_bytes + flag_bytes; + return get_tile_values_bytes() + get_padded_flag_bytes(); } - std::uint8_t* scan_memory_begin = nullptr; - std::size_t __num_wgs; + std::uint8_t* __scan_memory_begin = nullptr; + std::size_t __num_workgroups; sycl::queue __queue; }; @@ -473,7 +459,7 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r std::size_t __elems_in_tile = __workgroup_size * __data_per_workitem; std::size_t __num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __elems_in_tile); - ScanMemoryManager<_FlagType> __device_mem_mgr(__queue, __num_wgs); + __scan_lookback_mem_mgr<_FlagType> __device_mem_mgr(__queue, __num_wgs); __device_mem_mgr.allocate(); @@ -514,54 +500,55 @@ template void single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, - _NumSelectedRange __num_rng, _UnaryPredicate pred, _KernelParam) + _NumSelectedRange __num_rng, _UnaryPredicate __pred, _KernelParam) { using _Type = oneapi::dpl::__internal::__value_t<_InRange>; using _SizeT = uint64_t; using _TileIdT = TileId::_TileIdT; using _KernelName = __copy_if_single_wg_kernel; + using _BinaryOp = std::plus<_SizeT>; - const ::std::size_t n = __in_rng.size(); + const ::std::size_t __n = __in_rng.size(); - constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; - constexpr ::std::size_t elems_per_workitem = _KernelParam::data_per_workitem; + constexpr ::std::size_t __wgsize = _KernelParam::workgroup_size; + constexpr ::std::size_t __elems_per_workitem = _KernelParam::data_per_workitem; - // Avoid non_uniform n by padding up to a multiple of wgsize - constexpr 
std::uint32_t elems_in_tile = wgsize * elems_per_workitem; - ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); - ::std::size_t num_workitems = num_wgs * wgsize; - assert(num_wgs == 1); + // Avoid non_uniform n by padding up to a multiple of __wgsize + constexpr std::uint32_t __elems_in_tile = __wgsize * __elems_per_workitem; + ::std::size_t __num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __elems_in_tile); + ::std::size_t __num_workitesm = __num_wgs * __wgsize; + assert(__num_wgs == 1); - auto event = __queue.submit([&](sycl::handler& hdl) { - auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); + auto __event = __queue.submit([&](sycl::handler& hdl) { + auto __wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{__elems_in_tile}, hdl); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng, __num_rng); - hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& + hdl.parallel_for(sycl::nd_range<1>(__num_workitesm, __wgsize), [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size( SUBGROUP_SIZE)]] { - auto group = item.get_group(); - auto wg_local_id = item.get_local_id(0); - constexpr ::std::uint32_t stride = wgsize; + auto __group = item.get_group(); + auto __wg_local_id = item.get_local_id(0); + constexpr ::std::uint32_t stride = __wgsize; // Global load into local - _SizeT wg_count = 0; + _SizeT __wg_count = 0; - // Phase 1: Create wg_count and construct in-order wg_copy_if_values - if (elems_in_tile <= n) + // Phase 1: Create __wg_count and construct in-order __wg_copy_if_values + if (__elems_in_tile <= __n) { #pragma unroll - for (size_t i = 0; i < elems_in_tile; i += wgsize) + for (size_t __i = 0; __i < __elems_in_tile; __i += __wgsize) { - _Type val = __in_rng[i + wg_local_id]; + _Type __val = __in_rng[__i + __wg_local_id]; - _SizeT satisfies_pred = pred(val); - _SizeT count = - sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); + _SizeT __satisfies_pred = __pred(__val); + _SizeT __count = + sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); - if (satisfies_pred) - wg_copy_if_values[count] = val; + if (__satisfies_pred) + __wg_copy_if_values[__count] = __val; - wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); + __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __wgsize - 1); } } else @@ -569,46 +556,46 @@ single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _Ou // Edge of input, have to handle memory bounds // Might have unneccessary group_barrier calls #pragma unroll - for (size_t i = 0; i < elems_in_tile; i += wgsize) + for (size_t __i = 0; __i < __elems_in_tile; __i += __wgsize) { - _SizeT satisfies_pred = 0; - oneapi::dpl::__internal::__lazy_ctor_storage<_Type> val; - if (i + wg_local_id < n) + _SizeT __satisfies_pred = 0; + oneapi::dpl::__internal::__lazy_ctor_storage<_Type> __val; + if (__i + __wg_local_id < __n) { - new (&val.__v) _Type(__in_rng[i + wg_local_id]); + new (&__val.__v) _Type(__in_rng[__i + __wg_local_id]); - satisfies_pred = pred(val.__v); + __satisfies_pred = __pred(__val.__v); } - _SizeT count = - sycl::exclusive_scan_over_group(group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); - if (i + wg_local_id < n) + _SizeT __count = + sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); + if (__i + __wg_local_id < __n) { - if (satisfies_pred) - 
wg_copy_if_values[count] = std::move(val.__v); - val.__v.~_Type(); + if (__satisfies_pred) + __wg_copy_if_values[__count] = std::move(__val.__v); + __val.__v.~_Type(); } - wg_count = sycl::group_broadcast(group, count + satisfies_pred, wgsize - 1); + __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __wgsize - 1); } } // Phase 3: copy values to global memory - for (int i = wg_local_id; i < wg_count; i += wgsize) + for (int __i = __wg_local_id; __i < __wg_count; __i += __wgsize) { - __out_rng[i] = wg_copy_if_values[i]; + __out_rng[__i] = __wg_copy_if_values[__i]; } - if (group.leader()) - __num_rng[0] = wg_count; + if (__group.leader()) + __num_rng[0] = __wg_count; }); }); - event.wait(); + __event.wait(); } template void single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, - _UnaryPredicate pred, _KernelParam) + _UnaryPredicate __pred, _KernelParam) { using _Type = oneapi::dpl::__internal::__value_t<_InRange>; using _SizeT = uint64_t; @@ -623,44 +610,42 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ using _FlagType = __scan_status_flag<_SizeT>; using _FlagStorageType = typename _FlagType::_FlagStorageType; - const ::std::size_t n = __in_rng.size(); + const ::std::size_t __n = __in_rng.size(); - constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; - constexpr ::std::size_t elems_per_workitem = _KernelParam::data_per_workitem; + constexpr ::std::size_t __wgsize = _KernelParam::workgroup_size; + constexpr ::std::size_t __elems_per_workitem = _KernelParam::data_per_workitem; - // Avoid non_uniform n by padding up to a multiple of wgsize - constexpr std::uint32_t __elems_in_tile = wgsize * elems_per_workitem; - ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, __elems_in_tile); - ::std::size_t num_workitems = num_wgs * wgsize; + // Avoid non_uniform n by padding up to a multiple of __wgsize + constexpr std::uint32_t __elems_in_tile = __wgsize * __elems_per_workitem; + ::std::size_t __num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __elems_in_tile); + ::std::size_t __num_workitesm = __num_wgs * __wgsize; - ScanMemoryManager<_FlagType> scratch(__queue, num_wgs); - scratch.allocate(); + __scan_lookback_mem_mgr<_FlagType> __scratch(__queue, __num_wgs); + __scratch.allocate(); // Memory Structure: // [Lookback Scan Memory, Tile Id Counter] - auto __status_vals_full = scratch.get_full_values_begin(); - auto __status_vals_partial = scratch.get_partial_values_begin(); - auto __status_flags = scratch.get_flags_begin(); + auto __status_vals_full = __scratch.get_full_values_begin(); + auto __status_vals_partial = __scratch.get_partial_values_begin(); + auto __status_flags = __scratch.get_flags_begin(); //adding 1 to the number elements to account for the tile id - std::size_t __status_flags_size = scratch.get_num_elements() + 1; + std::size_t __status_flags_size = __scratch.get_num_elements() + 1; auto __fill_event = __lookback_init_submitter<_FlagType, _SizeT, _BinaryOp, _LookbackInitKernel>{}( __queue, __status_flags, __status_vals_partial, __status_flags_size, _FlagType::__padding); - auto event = __queue.submit([&](sycl::handler& hdl) { - auto wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{__elems_in_tile}, hdl); - - auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); - hdl.depends_on(__fill_event); + auto __event = __queue.submit([&](sycl::handler& __hdl) { + auto __wg_copy_if_values = 
sycl::local_accessor<_Type, 1>(sycl::range<1>{__elems_in_tile}, __hdl); + __hdl.depends_on(__fill_event); - oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng, __num_rng); - hdl.parallel_for(sycl::nd_range<1>(num_workitems, wgsize), [=](const sycl::nd_item<1>& - item) [[intel::reqd_sub_group_size( + oneapi::dpl::__ranges::__require_access(__hdl, __in_rng, __out_rng, __num_rng); + __hdl.parallel_for(sycl::nd_range<1>(__num_workitesm, __wgsize), [=](const sycl::nd_item<1>& + __item) [[intel::reqd_sub_group_size( SUBGROUP_SIZE)]] { - auto __group = item.get_group(); - auto wg_local_id = item.get_local_id(0); - auto sg = item.get_sub_group(); - constexpr ::std::uint32_t stride = wgsize; + auto __group = __item.get_group(); + auto __wg_local_id = __item.get_local_id(0); + auto __sg = __item.get_sub_group(); + constexpr ::std::uint32_t __stride = __wgsize; std::uint32_t __tile_id = 0; @@ -677,13 +662,13 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ std::size_t __current_offset = static_cast(__tile_id) * __elems_in_tile; - _SizeT wg_count = 0; + _SizeT __wg_count = 0; - // Phase 1: Create wg_count and construct in-order wg_copy_if_values - if ((__tile_id + 1) * __elems_in_tile <= n) + // Phase 1: Create __wg_count and construct in-order __wg_copy_if_values + if ((__tile_id + 1) * __elems_in_tile <= __n) { #pragma unroll - for (size_t i = 0; i < __elems_in_tile; i += wgsize) + for (size_t __i = 0; __i < __elems_in_tile; __i += __wgsize) { // TODO: explore scalar impl. Does this allow us to avoid the group broadcast (sync)? // if load is done in a scalar fashion and provides the same performance, we @@ -692,16 +677,16 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ // satisfies pred, copying to "my slot" in SLM then do scan, then the copy to // global memory needs to be loaded per work item per element, skipping copies // when they were not saved. 
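Phase 1 in the surrounding kernel packs the elements that satisfy the predicate into local memory: each work-item contributes a 0/1 flag, an exclusive scan over the group turns those flags into output slots, and broadcasting the last item's inclusive total carries the running count into the next strip of the tile. Here is a self-contained single-work-group sketch of that idea with one element per item; the data, predicate, and sizes are made up purely for illustration.

    #include <sycl/sycl.hpp>
    #include <cstddef>
    #include <iostream>
    #include <vector>

    int main()
    {
        constexpr std::size_t wgsize = 8;
        std::vector<int> in{3, 10, 7, 42, 5, 8, 1, 99};  // exactly wgsize elements
        std::vector<int> out(wgsize, -1);
        std::size_t count = 0;

        sycl::queue q;
        {
            sycl::buffer<int> in_buf(in);
            sycl::buffer<int> out_buf(out);
            sycl::buffer<std::size_t> count_buf(&count, sycl::range<1>{1});

            q.submit([&](sycl::handler& h) {
                sycl::accessor in_acc(in_buf, h, sycl::read_only);
                sycl::accessor out_acc(out_buf, h, sycl::write_only);
                sycl::accessor count_acc(count_buf, h, sycl::write_only);
                h.parallel_for(sycl::nd_range<1>(wgsize, wgsize), [=](sycl::nd_item<1> item) {
                    auto group = item.get_group();
                    std::size_t id = item.get_local_id(0);

                    int val = in_acc[id];
                    std::size_t satisfies = val >= 8 ? 1 : 0;  // toy predicate: keep values >= 8

                    // Exclusive scan of the 0/1 flags gives each kept element its output slot.
                    std::size_t slot =
                        sycl::exclusive_scan_over_group(group, satisfies, sycl::plus<std::size_t>());
                    if (satisfies)
                        out_acc[slot] = val;

                    // The last work-item's inclusive result is the number of kept elements.
                    std::size_t total = sycl::group_broadcast(group, slot + satisfies, wgsize - 1);
                    if (group.leader())
                        count_acc[0] = total;
                });
            });
        } // buffers write back to the vectors here

        std::cout << count << " kept:";
        for (std::size_t i = 0; i < count; ++i)
            std::cout << ' ' << out[i];
        std::cout << '\n';
    }

Because the scan assigns slots in local-id order, kept elements land in the packed buffer in their original order, which is what lets Phase 3 write them out contiguously.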
- _Type val = __in_rng[i + wg_local_id + __elems_in_tile * __tile_id]; + _Type __val = __in_rng[__i + __wg_local_id + __elems_in_tile * __tile_id]; - _SizeT satisfies_pred = pred(val); - _SizeT count = - sycl::exclusive_scan_over_group(__group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); + _SizeT __satisfies_pred = __pred(__val); + _SizeT __count = + sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); - if (satisfies_pred) - wg_copy_if_values[count] = val; + if (__satisfies_pred) + __wg_copy_if_values[__count] = __val; - wg_count = sycl::group_broadcast(__group, count + satisfies_pred, wgsize - 1); + __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __wgsize - 1); } } else @@ -709,49 +694,49 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ // Edge of input, have to handle memory bounds // Might have unneccessary group_barrier calls #pragma unroll - for (size_t i = 0; i < __elems_in_tile; i += wgsize) + for (size_t __i = 0; __i < __elems_in_tile; __i += __wgsize) { - _SizeT satisfies_pred = 0; - oneapi::dpl::__internal::__lazy_ctor_storage<_Type> val; - if (i + wg_local_id + __elems_in_tile * __tile_id < n) + _SizeT __satisfies_pred = 0; + oneapi::dpl::__internal::__lazy_ctor_storage<_Type> __val; + if (__i + __wg_local_id + __elems_in_tile * __tile_id < __n) { - new (&val.__v) _Type(__in_rng[i + wg_local_id + __elems_in_tile * __tile_id]); + new (&__val.__v) _Type(__in_rng[__i + __wg_local_id + __elems_in_tile * __tile_id]); - satisfies_pred = pred(val.__v); + __satisfies_pred = __pred(__val.__v); } - _SizeT count = - sycl::exclusive_scan_over_group(__group, satisfies_pred, wg_count, sycl::plus<_SizeT>()); + _SizeT __count = + sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); - if (i + wg_local_id + __elems_in_tile * __tile_id < n) + if (__i + __wg_local_id + __elems_in_tile * __tile_id < __n) { - if (satisfies_pred) - wg_copy_if_values[count] = std::move(val.__v); - val.__v.~_Type(); + if (__satisfies_pred) + __wg_copy_if_values[__count] = std::move(__val.__v); + __val.__v.~_Type(); } - wg_count = sycl::group_broadcast(__group, count + satisfies_pred, wgsize - 1); + __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __wgsize - 1); } } - // Phase 2: Global scan across wg_count - _SizeT copied_elements = 0; + // Phase 2: Global scan across __wg_count + _SizeT __copied_elements = 0; - __lookback_phase<_FlagType>(__group, sg, __status_flags, __status_vals_full, __status_vals_partial, - __tile_id, wg_count, copied_elements, sycl::plus<_SizeT>()); + __lookback_phase<_FlagType>(__group, __sg, __status_flags, __status_vals_full, __status_vals_partial, + __tile_id, __wg_count, __copied_elements, _BinaryOp{}); //TODO: explore above comment about scalar load // Phase 3: copy values to global memory - for (int i = wg_local_id; i < wg_count; i += wgsize) + for (int __i = __wg_local_id; __i < __wg_count; __i += __wgsize) { - __out_rng[copied_elements + i] = wg_copy_if_values[i]; + __out_rng[__copied_elements + __i] = __wg_copy_if_values[__i]; } - if (__tile_id == (num_wgs - 1) && __group.leader()) - __num_rng[0] = copied_elements + wg_count; + if (__tile_id == (__num_wgs - 1) && __group.leader()) + __num_rng[0] = __copied_elements + __wg_count; }); }); - event.wait(); - scratch.free(); + __event.wait(); + __scratch.free(); } } // namespace __impl @@ -760,7 +745,7 @@ template void single_pass_single_wg_copy_if(sycl::queue __queue, _InIterator __in_begin, 
_InIterator __in_end, - _OutIterator __out_begin, _NumSelectedRange __num_begin, _UnaryPredicate pred, + _OutIterator __out_begin, _NumSelectedRange __num_begin, _UnaryPredicate __pred, _KernelParam __param = {}) { auto __n = __in_end - __in_begin; @@ -775,14 +760,14 @@ single_pass_single_wg_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIt auto __buf_num = __keep2(__num_begin, __num_begin + 1); __impl::single_pass_copy_if_impl_single_wg(__queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), - pred, __param); + __pred, __param); } template void single_pass_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, - _NumSelectedRange __num_begin, _UnaryPredicate pred, _KernelParam __param = {}) + _NumSelectedRange __num_begin, _UnaryPredicate __pred, _KernelParam __param = {}) { auto __n = __in_end - __in_begin; @@ -795,7 +780,7 @@ single_pass_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __i oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _NumSelectedRange>(); auto __buf_num = __keep2(__num_begin, __num_begin + 1); - __impl::single_pass_copy_if_impl(__queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), pred, + __impl::single_pass_copy_if_impl(__queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), __pred, __param); } From c5065a784c901da356c9aacc4684f4bd188757ae Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Fri, 24 May 2024 12:02:35 -0400 Subject: [PATCH 089/134] format Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 260 +++++++++--------- 1 file changed, 130 insertions(+), 130 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index ede5ed4c2ae..aee2dbb063d 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -523,69 +523,69 @@ single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _Ou auto __wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{__elems_in_tile}, hdl); oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng, __num_rng); - hdl.parallel_for(sycl::nd_range<1>(__num_workitesm, __wgsize), [=](const sycl::nd_item<1>& - item) [[intel::reqd_sub_group_size( - SUBGROUP_SIZE)]] { - auto __group = item.get_group(); - auto __wg_local_id = item.get_local_id(0); - constexpr ::std::uint32_t stride = __wgsize; - - // Global load into local - _SizeT __wg_count = 0; - - // Phase 1: Create __wg_count and construct in-order __wg_copy_if_values - if (__elems_in_tile <= __n) - { -#pragma unroll - for (size_t __i = 0; __i < __elems_in_tile; __i += __wgsize) + hdl.parallel_for( + sycl::nd_range<1>(__num_workitesm, __wgsize), + [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { + auto __group = item.get_group(); + auto __wg_local_id = item.get_local_id(0); + constexpr ::std::uint32_t stride = __wgsize; + + // Global load into local + _SizeT __wg_count = 0; + + // Phase 1: Create __wg_count and construct in-order __wg_copy_if_values + if (__elems_in_tile <= __n) { - _Type __val = __in_rng[__i + __wg_local_id]; +#pragma unroll + for (size_t __i = 0; __i < __elems_in_tile; __i += __wgsize) + { + _Type __val = __in_rng[__i + __wg_local_id]; - _SizeT __satisfies_pred = __pred(__val); - _SizeT __count = - sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); + 
_SizeT __satisfies_pred = __pred(__val); + _SizeT __count = + sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); - if (__satisfies_pred) - __wg_copy_if_values[__count] = __val; + if (__satisfies_pred) + __wg_copy_if_values[__count] = __val; - __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __wgsize - 1); + __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __wgsize - 1); + } } - } - else - { + else + { // Edge of input, have to handle memory bounds // Might have unneccessary group_barrier calls #pragma unroll - for (size_t __i = 0; __i < __elems_in_tile; __i += __wgsize) - { - _SizeT __satisfies_pred = 0; - oneapi::dpl::__internal::__lazy_ctor_storage<_Type> __val; - if (__i + __wg_local_id < __n) + for (size_t __i = 0; __i < __elems_in_tile; __i += __wgsize) { - new (&__val.__v) _Type(__in_rng[__i + __wg_local_id]); - - __satisfies_pred = __pred(__val.__v); + _SizeT __satisfies_pred = 0; + oneapi::dpl::__internal::__lazy_ctor_storage<_Type> __val; + if (__i + __wg_local_id < __n) + { + new (&__val.__v) _Type(__in_rng[__i + __wg_local_id]); + + __satisfies_pred = __pred(__val.__v); + } + _SizeT __count = + sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); + if (__i + __wg_local_id < __n) + { + if (__satisfies_pred) + __wg_copy_if_values[__count] = std::move(__val.__v); + __val.__v.~_Type(); + } + __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __wgsize - 1); } - _SizeT __count = - sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); - if (__i + __wg_local_id < __n) - { - if (__satisfies_pred) - __wg_copy_if_values[__count] = std::move(__val.__v); - __val.__v.~_Type(); - } - __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __wgsize - 1); } - } - // Phase 3: copy values to global memory - for (int __i = __wg_local_id; __i < __wg_count; __i += __wgsize) - { - __out_rng[__i] = __wg_copy_if_values[__i]; - } - if (__group.leader()) - __num_rng[0] = __wg_count; - }); + // Phase 3: copy values to global memory + for (int __i = __wg_local_id; __i < __wg_count; __i += __wgsize) + { + __out_rng[__i] = __wg_copy_if_values[__i]; + } + if (__group.leader()) + __num_rng[0] = __wg_count; + }); }); __event.wait(); @@ -639,100 +639,100 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ __hdl.depends_on(__fill_event); oneapi::dpl::__ranges::__require_access(__hdl, __in_rng, __out_rng, __num_rng); - __hdl.parallel_for(sycl::nd_range<1>(__num_workitesm, __wgsize), [=](const sycl::nd_item<1>& - __item) [[intel::reqd_sub_group_size( - SUBGROUP_SIZE)]] { - auto __group = __item.get_group(); - auto __wg_local_id = __item.get_local_id(0); - auto __sg = __item.get_sub_group(); - constexpr ::std::uint32_t __stride = __wgsize; - - std::uint32_t __tile_id = 0; - - // Obtain unique ID for this work-group that will be used in decoupled lookback - if (__group.leader()) - { - sycl::atomic_ref<_FlagStorageType, sycl::memory_order::relaxed, sycl::memory_scope::device, - sycl::access::address_space::global_space> - __idx_atomic(__status_flags[__status_flags_size - 1]); - __tile_id = __idx_atomic.fetch_add(1); - } + __hdl.parallel_for( + sycl::nd_range<1>(__num_workitesm, __wgsize), + [=](const sycl::nd_item<1>& __item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { + auto __group = __item.get_group(); + auto __wg_local_id = __item.get_local_id(0); + auto __sg = __item.get_sub_group(); + constexpr 
::std::uint32_t __stride = __wgsize; + + std::uint32_t __tile_id = 0; + + // Obtain unique ID for this work-group that will be used in decoupled lookback + if (__group.leader()) + { + sycl::atomic_ref<_FlagStorageType, sycl::memory_order::relaxed, sycl::memory_scope::device, + sycl::access::address_space::global_space> + __idx_atomic(__status_flags[__status_flags_size - 1]); + __tile_id = __idx_atomic.fetch_add(1); + } - __tile_id = sycl::group_broadcast(__group, __tile_id, 0); + __tile_id = sycl::group_broadcast(__group, __tile_id, 0); - std::size_t __current_offset = static_cast(__tile_id) * __elems_in_tile; + std::size_t __current_offset = static_cast(__tile_id) * __elems_in_tile; - _SizeT __wg_count = 0; + _SizeT __wg_count = 0; - // Phase 1: Create __wg_count and construct in-order __wg_copy_if_values - if ((__tile_id + 1) * __elems_in_tile <= __n) - { -#pragma unroll - for (size_t __i = 0; __i < __elems_in_tile; __i += __wgsize) + // Phase 1: Create __wg_count and construct in-order __wg_copy_if_values + if ((__tile_id + 1) * __elems_in_tile <= __n) { - // TODO: explore scalar impl. Does this allow us to avoid the group broadcast (sync)? - // if load is done in a scalar fashion and provides the same performance, we - // can avoid the broadcast (I think) - // would need to loop over the elements per work item first accumulating into - // satisfies pred, copying to "my slot" in SLM then do scan, then the copy to - // global memory needs to be loaded per work item per element, skipping copies - // when they were not saved. - _Type __val = __in_rng[__i + __wg_local_id + __elems_in_tile * __tile_id]; - - _SizeT __satisfies_pred = __pred(__val); - _SizeT __count = - sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); - - if (__satisfies_pred) - __wg_copy_if_values[__count] = __val; - - __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __wgsize - 1); - } - } - else - { - // Edge of input, have to handle memory bounds - // Might have unneccessary group_barrier calls #pragma unroll - for (size_t __i = 0; __i < __elems_in_tile; __i += __wgsize) - { - _SizeT __satisfies_pred = 0; - oneapi::dpl::__internal::__lazy_ctor_storage<_Type> __val; - if (__i + __wg_local_id + __elems_in_tile * __tile_id < __n) + for (size_t __i = 0; __i < __elems_in_tile; __i += __wgsize) { - new (&__val.__v) _Type(__in_rng[__i + __wg_local_id + __elems_in_tile * __tile_id]); + // TODO: explore scalar impl. Does this allow us to avoid the group broadcast (sync)? + // if load is done in a scalar fashion and provides the same performance, we + // can avoid the broadcast (I think) + // would need to loop over the elements per work item first accumulating into + // satisfies pred, copying to "my slot" in SLM then do scan, then the copy to + // global memory needs to be loaded per work item per element, skipping copies + // when they were not saved. 
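The "Obtain unique ID for this work-group" step just above is what keeps the decoupled look-back safe: the group leader claims the next tile id from a global atomic counter (kept in the last status-flag slot) and broadcasts it to the whole group, so tile ids reflect the order in which groups actually start executing rather than their launch indices. A small standalone sketch of just that mechanism, using USM shared memory and illustrative names:

    #include <sycl/sycl.hpp>
    #include <cstddef>
    #include <cstdint>
    #include <iostream>

    int main()
    {
        constexpr std::size_t wgsize = 64;
        constexpr std::size_t num_wgs = 16;

        sycl::queue q;
        // One counter shared by all work-groups; each group leader claims the next tile id.
        std::uint32_t* counter = sycl::malloc_shared<std::uint32_t>(1, q);
        std::uint32_t* tile_ids = sycl::malloc_shared<std::uint32_t>(num_wgs, q);
        *counter = 0;

        q.parallel_for(sycl::nd_range<1>(wgsize * num_wgs, wgsize), [=](sycl::nd_item<1> item) {
             auto group = item.get_group();
             std::uint32_t tile_id = 0;
             if (group.leader())
             {
                 sycl::atomic_ref<std::uint32_t, sycl::memory_order::relaxed, sycl::memory_scope::device,
                                  sycl::access::address_space::global_space>
                     idx(*counter);
                 tile_id = idx.fetch_add(1);  // claim the next tile in execution order
             }
             // Every work-item needs the id, so broadcast it from the leader (local id 0).
             tile_id = sycl::group_broadcast(group, tile_id, 0);

             if (group.leader())
                 tile_ids[group.get_group_linear_id()] = tile_id;
         }).wait();

        for (std::size_t i = 0; i < num_wgs; ++i)
            std::cout << "group " << i << " got tile " << tile_ids[i] << '\n';

        sycl::free(counter, q);
        sycl::free(tile_ids, q);
    }

The usual rationale for handing out ids this way is that any lower-numbered tile must already have started executing (it performed its fetch_add), which is what the look-back phase relies on when it spins on earlier tiles' status flags.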
+ _Type __val = __in_rng[__i + __wg_local_id + __elems_in_tile * __tile_id]; + + _SizeT __satisfies_pred = __pred(__val); + _SizeT __count = + sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); - __satisfies_pred = __pred(__val.__v); - } - _SizeT __count = - sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); + if (__satisfies_pred) + __wg_copy_if_values[__count] = __val; - if (__i + __wg_local_id + __elems_in_tile * __tile_id < __n) + __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __wgsize - 1); + } + } + else + { + // Edge of input, have to handle memory bounds + // Might have unneccessary group_barrier calls +#pragma unroll + for (size_t __i = 0; __i < __elems_in_tile; __i += __wgsize) { - if (__satisfies_pred) - __wg_copy_if_values[__count] = std::move(__val.__v); - __val.__v.~_Type(); + _SizeT __satisfies_pred = 0; + oneapi::dpl::__internal::__lazy_ctor_storage<_Type> __val; + if (__i + __wg_local_id + __elems_in_tile * __tile_id < __n) + { + new (&__val.__v) _Type(__in_rng[__i + __wg_local_id + __elems_in_tile * __tile_id]); + + __satisfies_pred = __pred(__val.__v); + } + _SizeT __count = + sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); + + if (__i + __wg_local_id + __elems_in_tile * __tile_id < __n) + { + if (__satisfies_pred) + __wg_copy_if_values[__count] = std::move(__val.__v); + __val.__v.~_Type(); + } + + __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __wgsize - 1); } - - __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __wgsize - 1); } - } - // Phase 2: Global scan across __wg_count - _SizeT __copied_elements = 0; + // Phase 2: Global scan across __wg_count + _SizeT __copied_elements = 0; - __lookback_phase<_FlagType>(__group, __sg, __status_flags, __status_vals_full, __status_vals_partial, - __tile_id, __wg_count, __copied_elements, _BinaryOp{}); + __lookback_phase<_FlagType>(__group, __sg, __status_flags, __status_vals_full, __status_vals_partial, + __tile_id, __wg_count, __copied_elements, _BinaryOp{}); - //TODO: explore above comment about scalar load - // Phase 3: copy values to global memory - for (int __i = __wg_local_id; __i < __wg_count; __i += __wgsize) - { - __out_rng[__copied_elements + __i] = __wg_copy_if_values[__i]; - } - if (__tile_id == (__num_wgs - 1) && __group.leader()) - __num_rng[0] = __copied_elements + __wg_count; - }); + //TODO: explore above comment about scalar load + // Phase 3: copy values to global memory + for (int __i = __wg_local_id; __i < __wg_count; __i += __wgsize) + { + __out_rng[__copied_elements + __i] = __wg_copy_if_values[__i]; + } + if (__tile_id == (__num_wgs - 1) && __group.leader()) + __num_rng[0] = __copied_elements + __wg_count; + }); }); __event.wait(); From 6a7291a61b6beffb888d755cee3bc8eb2396b872 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Fri, 24 May 2024 15:31:08 -0400 Subject: [PATCH 090/134] change single wg scan to submitter and kernel operator Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 209 +++++++++++------- 1 file changed, 124 insertions(+), 85 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index aee2dbb063d..50fc41b2be6 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -287,10 +287,6 @@ __lookback_phase(const _Group& __group, const 
_SubGroup& __subgroup, _StatusFlag __prev_tile_reduction = sycl::group_broadcast(__group, __prev_tile_reduction, 0); } -template -struct __lookback_submitter; - template @@ -379,9 +375,13 @@ struct __lookback_kernel_func } }; +template +struct __lookback_scan_submitter; + template -struct __lookback_submitter<__data_per_workitem, __workgroup_size, _Type, _FlagType, +struct __lookback_scan_submitter<__data_per_workitem, __workgroup_size, _Type, _FlagType, oneapi::dpl::__par_backend_hetero::__internal::__optional_kernel_name<_Name...>> { @@ -476,7 +476,7 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r std::size_t __current_num_items = __current_num_wgs * __workgroup_size; auto __prev_event = - __lookback_submitter<__data_per_workitem, __workgroup_size, _Type, _FlagType, _LookbackKernel>{}( + __lookback_scan_submitter<__data_per_workitem, __workgroup_size, _Type, _FlagType, _LookbackKernel>{}( __queue, __fill_event, __in_rng, __out_rng, __binary_op, __n, __status_flags, __status_flags_size, __status_vals_full, __status_vals_partial, __current_num_items); @@ -496,6 +496,121 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r } } + + +template +struct __copy_if_single_wg_kernel_func +{ + static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; + using _SizeT = std::size_t; + using _BinaryOp = std::plus<_SizeT>; + using _Type = oneapi::dpl::__internal::__value_t<_InRng>; + + _InRng __in_rng; + _OutRng __out_rng; + _NumRng __num_rng; + _SizeT __n; + _TileValues __wg_copy_if_values; + + [[sycl::reqd_sub_group_size(SUBGROUP_SIZE)]] void + operator()(const sycl::nd_item<1>& __item) const + { + auto __group = __item.get_group(); + auto __wg_local_id = __item.get_local_id(0); + + // Global load into local + _SizeT __wg_count = 0; + + // Phase 1: Create __wg_count and construct in-order __wg_copy_if_values + if (__elems_in_tile <= __n) + { +#pragma unroll + for (size_t __i = 0; __i < __elems_in_tile; __i += __workgroup_size) + { + _Type __val = __in_rng[__i + __wg_local_id]; + + _SizeT __satisfies_pred = __pred(__val); + _SizeT __count = + sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); + + if (__satisfies_pred) + __wg_copy_if_values[__count] = __val; + + __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __workgroup_size - 1); + } + } + else + { + // Edge of input, have to handle memory bounds + // Might have unneccessary group_barrier calls +#pragma unroll + for (size_t __i = 0; __i < __elems_in_tile; __i += __workgroup_size) + { + _SizeT __satisfies_pred = 0; + oneapi::dpl::__internal::__lazy_ctor_storage<_Type> __val; + if (__i + __wg_local_id < __n) + { + new (&__val.__v) _Type(__in_rng[__i + __wg_local_id]); + + __satisfies_pred = __pred(__val.__v); + } + _SizeT __count = + sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); + if (__i + __wg_local_id < __n) + { + if (__satisfies_pred) + __wg_copy_if_values[__count] = std::move(__val.__v); + __val.__v.~_Type(); + } + __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __workgroup_size - 1); + } + } + + // Phase 3: copy values to global memory + for (int __i = __wg_local_id; __i < __wg_count; __i += __workgroup_size) + { + __out_rng[__i] = __wg_copy_if_values[__i]; + } + if (__group.leader()) + __num_rng[0] = __wg_count; + } +}; + + + + +template +struct __copy_if_single_wg_submitter; + +template +struct 
__copy_if_single_wg_submitter<__data_per_workitem, __workgroup_size, + oneapi::dpl::__par_backend_hetero::__internal::__optional_kernel_name<_Name...>> +{ + + template + sycl::event + operator()(sycl::queue __q, _InRng&& __in_rng, _OutRng&& __out_rng, _NumSelectedRange&& __num_rng, std::size_t __n) const + { + using _Type = oneapi::dpl::__internal::__value_t<_InRng>; + using _LocalAccessorType = sycl::local_accessor<_Type, 1>; + using _KernelFunc = + __copy_if_single_wg_kernel_func<__data_per_workitem, __workgroup_size, std::decay_t<_InRng>, + std::decay_t<_OutRng>, std::decay_t<_NumSelectedRange>, _LocalAccessorType>; + + static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; + + return __q.submit([&](sycl::handler& __hdl) { + auto __tile_vals = _LocalAccessorType(sycl::range<1>{__elems_in_tile}, __hdl); + oneapi::dpl::__ranges::__require_access(__hdl, __in_rng, __out_rng, __num_rng); + __hdl.parallel_for<_Name...>(sycl::nd_range<1>(__workgroup_size, __workgroup_size), + _KernelFunc{__in_rng, __out_rng, __n, __num_rng, __tile_vals}); + }); + } +}; + + + + template void @@ -513,82 +628,7 @@ single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _Ou constexpr ::std::size_t __wgsize = _KernelParam::workgroup_size; constexpr ::std::size_t __elems_per_workitem = _KernelParam::data_per_workitem; - // Avoid non_uniform n by padding up to a multiple of __wgsize - constexpr std::uint32_t __elems_in_tile = __wgsize * __elems_per_workitem; - ::std::size_t __num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __elems_in_tile); - ::std::size_t __num_workitesm = __num_wgs * __wgsize; - assert(__num_wgs == 1); - - auto __event = __queue.submit([&](sycl::handler& hdl) { - auto __wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{__elems_in_tile}, hdl); - - oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng, __num_rng); - hdl.parallel_for( - sycl::nd_range<1>(__num_workitesm, __wgsize), - [=](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { - auto __group = item.get_group(); - auto __wg_local_id = item.get_local_id(0); - constexpr ::std::uint32_t stride = __wgsize; - - // Global load into local - _SizeT __wg_count = 0; - - // Phase 1: Create __wg_count and construct in-order __wg_copy_if_values - if (__elems_in_tile <= __n) - { -#pragma unroll - for (size_t __i = 0; __i < __elems_in_tile; __i += __wgsize) - { - _Type __val = __in_rng[__i + __wg_local_id]; - - _SizeT __satisfies_pred = __pred(__val); - _SizeT __count = - sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); - - if (__satisfies_pred) - __wg_copy_if_values[__count] = __val; - - __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __wgsize - 1); - } - } - else - { - // Edge of input, have to handle memory bounds - // Might have unneccessary group_barrier calls -#pragma unroll - for (size_t __i = 0; __i < __elems_in_tile; __i += __wgsize) - { - _SizeT __satisfies_pred = 0; - oneapi::dpl::__internal::__lazy_ctor_storage<_Type> __val; - if (__i + __wg_local_id < __n) - { - new (&__val.__v) _Type(__in_rng[__i + __wg_local_id]); - - __satisfies_pred = __pred(__val.__v); - } - _SizeT __count = - sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); - if (__i + __wg_local_id < __n) - { - if (__satisfies_pred) - __wg_copy_if_values[__count] = std::move(__val.__v); - __val.__v.~_Type(); - } - __wg_count = sycl::group_broadcast(__group, __count + 
__satisfies_pred, __wgsize - 1); - } - } - - // Phase 3: copy values to global memory - for (int __i = __wg_local_id; __i < __wg_count; __i += __wgsize) - { - __out_rng[__i] = __wg_copy_if_values[__i]; - } - if (__group.leader()) - __num_rng[0] = __wg_count; - }); - }); - - __event.wait(); + __copy_if_single_wg_submitter<__elems_per_workitem, __wgsize, _KernelName>{}(__queue, __in_rng, __out_rng, __num_rng, __n).wait(); } template __scratch(__queue, __num_wgs); __scratch.allocate(); @@ -640,12 +680,11 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ oneapi::dpl::__ranges::__require_access(__hdl, __in_rng, __out_rng, __num_rng); __hdl.parallel_for( - sycl::nd_range<1>(__num_workitesm, __wgsize), + sycl::nd_range<1>(__num_workitems, __wgsize), [=](const sycl::nd_item<1>& __item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { auto __group = __item.get_group(); auto __wg_local_id = __item.get_local_id(0); auto __sg = __item.get_sub_group(); - constexpr ::std::uint32_t __stride = __wgsize; std::uint32_t __tile_id = 0; From 1b78dcda322b224c130aad3ce74cef17fe3e81df Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 29 May 2024 08:38:22 -0400 Subject: [PATCH 091/134] change scan to submitter and kernel operator Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 205 ++++++++++++------ 1 file changed, 135 insertions(+), 70 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 50fc41b2be6..9a729ae4356 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -263,8 +263,8 @@ struct __lookback_init_submitter<_FlagType, _Type, _BinaryOp, template void -__lookback_phase(const _Group& __group, const _SubGroup& __subgroup, _StatusFlags& __status_flags, - _StatusValues& __status_vals_full, _StatusValues& __status_vals_partial, std::uint32_t __tile_id, +__lookback_phase(const _Group& __group, const _SubGroup& __subgroup, _StatusFlags __status_flags, + _StatusValues __status_vals_full, _StatusValues __status_vals_partial, std::uint32_t __tile_id, _Type& __local_reduction, _Type& __prev_tile_reduction, _BinaryOp __binary_op) { // The first sub-group will query the previous tiles to find a prefix @@ -303,7 +303,6 @@ struct __lookback_kernel_func std::size_t __status_flags_size; _StatusValues __status_vals_full; _StatusValues __status_vals_partial; - std::size_t __current_num_items; _TileVals __tile_vals; [[sycl::reqd_sub_group_size(SUBGROUP_SIZE)]] void @@ -408,7 +407,7 @@ struct __lookback_scan_submitter<__data_per_workitem, __workgroup_size, _Type, _ __hdl.parallel_for<_Name...>(sycl::nd_range<1>(__current_num_items, __workgroup_size), _KernelFunc{__in_rng, __out_rng, __binary_op, __n, __status_flags, __status_flags_size, __status_vals_full, __status_vals_partial, - __current_num_items, __tile_vals}); + __tile_vals}); }); } }; @@ -498,7 +497,7 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r -template +template struct __copy_if_single_wg_kernel_func { static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; @@ -510,6 +509,7 @@ struct __copy_if_single_wg_kernel_func _OutRng __out_rng; _NumRng __num_rng; _SizeT __n; + _UnaryPredicate __pred; _TileValues __wg_copy_if_values; [[sycl::reqd_sub_group_size(SUBGROUP_SIZE)]] void @@ -587,15 +587,15 @@ struct __copy_if_single_wg_submitter<__data_per_workitem, __workgroup_size, 
oneapi::dpl::__par_backend_hetero::__internal::__optional_kernel_name<_Name...>> { - template + template sycl::event - operator()(sycl::queue __q, _InRng&& __in_rng, _OutRng&& __out_rng, _NumSelectedRange&& __num_rng, std::size_t __n) const + operator()(sycl::queue __q, _InRng&& __in_rng, _OutRng&& __out_rng, _NumSelectedRange&& __num_rng, std::size_t __n, _UnaryPredicate __pred) const { using _Type = oneapi::dpl::__internal::__value_t<_InRng>; using _LocalAccessorType = sycl::local_accessor<_Type, 1>; using _KernelFunc = __copy_if_single_wg_kernel_func<__data_per_workitem, __workgroup_size, std::decay_t<_InRng>, - std::decay_t<_OutRng>, std::decay_t<_NumSelectedRange>, _LocalAccessorType>; + std::decay_t<_OutRng>, std::decay_t<_NumSelectedRange>, _UnaryPredicate, std::decay_t<_LocalAccessorType>>; static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; @@ -603,7 +603,7 @@ struct __copy_if_single_wg_submitter<__data_per_workitem, __workgroup_size, auto __tile_vals = _LocalAccessorType(sycl::range<1>{__elems_in_tile}, __hdl); oneapi::dpl::__ranges::__require_access(__hdl, __in_rng, __out_rng, __num_rng); __hdl.parallel_for<_Name...>(sycl::nd_range<1>(__workgroup_size, __workgroup_size), - _KernelFunc{__in_rng, __out_rng, __n, __num_rng, __tile_vals}); + _KernelFunc{__in_rng, __out_rng, __num_rng, __n, __pred, __tile_vals}); }); } }; @@ -617,71 +617,42 @@ void single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, _UnaryPredicate __pred, _KernelParam) { - using _Type = oneapi::dpl::__internal::__value_t<_InRange>; - using _SizeT = uint64_t; - using _TileIdT = TileId::_TileIdT; using _KernelName = __copy_if_single_wg_kernel; - using _BinaryOp = std::plus<_SizeT>; const ::std::size_t __n = __in_rng.size(); constexpr ::std::size_t __wgsize = _KernelParam::workgroup_size; constexpr ::std::size_t __elems_per_workitem = _KernelParam::data_per_workitem; - __copy_if_single_wg_submitter<__elems_per_workitem, __wgsize, _KernelName>{}(__queue, __in_rng, __out_rng, __num_rng, __n).wait(); + __copy_if_single_wg_submitter<__elems_per_workitem, __wgsize, _KernelName>{}(__queue, __in_rng, __out_rng, __num_rng, __n, __pred).wait(); } -template -void -single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, - _UnaryPredicate __pred, _KernelParam) -{ - using _Type = oneapi::dpl::__internal::__value_t<_InRange>; - using _SizeT = uint64_t; - using _TileIdT = TileId::_TileIdT; - using _KernelName = __copy_if_kernel; +template +struct __copy_if_kernel_func +{ + static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; + using _SizeT = std::size_t; using _BinaryOp = std::plus<_SizeT>; - - using _LookbackInitKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __lookback_init_kernel<_KernelName, _SizeT, _BinaryOp>>; - - using _FlagType = __scan_status_flag<_SizeT>; + using _Type = oneapi::dpl::__internal::__value_t<_InRng>; using _FlagStorageType = typename _FlagType::_FlagStorageType; - const ::std::size_t __n = __in_rng.size(); - - constexpr ::std::size_t __wgsize = _KernelParam::workgroup_size; - constexpr ::std::size_t __elems_per_workitem = _KernelParam::data_per_workitem; - - // Avoid non_uniform n by padding up to a multiple of __wgsize - constexpr std::uint32_t __elems_in_tile = __wgsize * __elems_per_workitem; - ::std::size_t __num_wgs = 
oneapi::dpl::__internal::__dpl_ceiling_div(__n, __elems_in_tile); - ::std::size_t __num_workitems = __num_wgs * __wgsize; - - __scan_lookback_mem_mgr<_FlagType> __scratch(__queue, __num_wgs); - __scratch.allocate(); - - // Memory Structure: - // [Lookback Scan Memory, Tile Id Counter] - auto __status_vals_full = __scratch.get_full_values_begin(); - auto __status_vals_partial = __scratch.get_partial_values_begin(); - auto __status_flags = __scratch.get_flags_begin(); - //adding 1 to the number elements to account for the tile id - std::size_t __status_flags_size = __scratch.get_num_elements() + 1; - - auto __fill_event = __lookback_init_submitter<_FlagType, _SizeT, _BinaryOp, _LookbackInitKernel>{}( - __queue, __status_flags, __status_vals_partial, __status_flags_size, _FlagType::__padding); - - auto __event = __queue.submit([&](sycl::handler& __hdl) { - auto __wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{__elems_in_tile}, __hdl); - __hdl.depends_on(__fill_event); + _InRng __in_rng; + _OutRng __out_rng; + _NumRng __num_rng; + _SizeT __n; + _UnaryPredicate __pred; + _StatusFlags __status_flags; + std::size_t __status_flags_size; + _StatusValues __status_vals_full; + _StatusValues __status_vals_partial; + _TileValues __wg_copy_if_values; + std::size_t __current_num_wgs; - oneapi::dpl::__ranges::__require_access(__hdl, __in_rng, __out_rng, __num_rng); - __hdl.parallel_for( - sycl::nd_range<1>(__num_workitems, __wgsize), - [=](const sycl::nd_item<1>& __item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { + [[sycl::reqd_sub_group_size(SUBGROUP_SIZE)]] void + operator()(const sycl::nd_item<1>& __item) const + { auto __group = __item.get_group(); auto __wg_local_id = __item.get_local_id(0); auto __sg = __item.get_sub_group(); @@ -707,7 +678,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ if ((__tile_id + 1) * __elems_in_tile <= __n) { #pragma unroll - for (size_t __i = 0; __i < __elems_in_tile; __i += __wgsize) + for (size_t __i = 0; __i < __elems_in_tile; __i += __workgroup_size) { // TODO: explore scalar impl. Does this allow us to avoid the group broadcast (sync)? 
// if load is done in a scalar fashion and provides the same performance, we @@ -725,7 +696,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ if (__satisfies_pred) __wg_copy_if_values[__count] = __val; - __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __wgsize - 1); + __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __workgroup_size - 1); } } else @@ -733,7 +704,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ // Edge of input, have to handle memory bounds // Might have unneccessary group_barrier calls #pragma unroll - for (size_t __i = 0; __i < __elems_in_tile; __i += __wgsize) + for (size_t __i = 0; __i < __elems_in_tile; __i += __workgroup_size) { _SizeT __satisfies_pred = 0; oneapi::dpl::__internal::__lazy_ctor_storage<_Type> __val; @@ -753,7 +724,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ __val.__v.~_Type(); } - __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __wgsize - 1); + __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __workgroup_size - 1); } } @@ -765,17 +736,111 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ //TODO: explore above comment about scalar load // Phase 3: copy values to global memory - for (int __i = __wg_local_id; __i < __wg_count; __i += __wgsize) + for (int __i = __wg_local_id; __i < __wg_count; __i += __workgroup_size) { __out_rng[__copied_elements + __i] = __wg_copy_if_values[__i]; } - if (__tile_id == (__num_wgs - 1) && __group.leader()) + if (__tile_id == (__current_num_wgs - 1) && __group.leader()) __num_rng[0] = __copied_elements + __wg_count; - }); - }); + } + +}; + + + +template +struct __copy_if_submitter; + +template +struct __copy_if_submitter<__data_per_workitem, __workgroup_size, _FlagType, + oneapi::dpl::__par_backend_hetero::__internal::__optional_kernel_name<_Name...>> +{ + + template + sycl::event + operator()(sycl::queue __q, _Event __fill_event, _InRng&& __in_rng, _OutRng&& __out_rng, + _NumSelectedRange&& __num_rng, std::size_t __n, _UnaryPredicate __pred, _StatusFlags&& __status_flags, std::size_t __status_flags_size, + _StatusValues&& __status_vals_full, _StatusValues&& __status_vals_partial, std::size_t __current_num_items, std::size_t __current_num_wgs) const + { + using _Type = oneapi::dpl::__internal::__value_t<_InRng>; + using _LocalAccessorType = sycl::local_accessor<_Type, 1>; + using _KernelFunc = + __copy_if_kernel_func<__data_per_workitem, __workgroup_size, _FlagType, std::decay_t<_InRng>, + std::decay_t<_OutRng>, std::decay_t<_NumSelectedRange>, _UnaryPredicate, + std::decay_t<_StatusFlags>, std::decay_t<_StatusValues>, + std::decay_t<_LocalAccessorType>>; + + static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; + + return __q.submit([&](sycl::handler& __hdl) { + auto __wg_copy_if_values = sycl::local_accessor<_Type, 1>(sycl::range<1>{__elems_in_tile}, __hdl); + __hdl.depends_on(__fill_event); + + auto __tile_vals = _LocalAccessorType(sycl::range<1>{__elems_in_tile}, __hdl); + oneapi::dpl::__ranges::__require_access(__hdl, __in_rng, __out_rng, __num_rng); + __hdl.parallel_for<_Name...>(sycl::nd_range<1>(__current_num_items, __workgroup_size), + _KernelFunc{__in_rng, __out_rng, __num_rng, __n, __pred, __status_flags, + __status_flags_size, __status_vals_full, __status_vals_partial, + __tile_vals, __current_num_wgs}); + }); + } +}; + + + +template +void 
+single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, + _UnaryPredicate __pred, _KernelParam) +{ + using _SizeT = uint64_t; + using _KernelName = __copy_if_kernel; + using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + using _FlagType = __scan_status_flag<_SizeT>; + + using _BinaryOp = std::plus<_SizeT>; + + using _LookbackInitKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __lookback_init_kernel<_KernelName, _SizeT, _BinaryOp>>; + + using _CopyIfKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __copy_if_kernel<_KernelName, _Type, _BinaryOp>>; + + const std::size_t __n = __in_rng.size(); + + constexpr std::size_t __workgroup_size = _KernelParam::workgroup_size; + constexpr std::size_t __elems_per_workitem = _KernelParam::data_per_workitem; + + // Avoid non_uniform n by padding up to a multiple of __workgroup_size + constexpr std::uint32_t __elems_in_tile = __workgroup_size * __elems_per_workitem; + std::size_t __current_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __elems_in_tile); + std::size_t __current_num_items = __current_num_wgs * __workgroup_size; + + __scan_lookback_mem_mgr<_FlagType> __device_mem_mgr(__queue, __current_num_wgs); + __device_mem_mgr.allocate(); + + // Memory Structure: + // [Lookback Scan Memory, Tile Id Counter] + auto __status_vals_full = __device_mem_mgr.get_full_values_begin(); + auto __status_vals_partial = __device_mem_mgr.get_partial_values_begin(); + auto __status_flags = __device_mem_mgr.get_flags_begin(); + //adding 1 to the number elements to account for the tile id + std::size_t __status_flags_size = __device_mem_mgr.get_num_elements() + 1; + + auto __fill_event = __lookback_init_submitter<_FlagType, _SizeT, _BinaryOp, _LookbackInitKernel>{}( + __queue, __status_flags, __status_vals_partial, __status_flags_size, _FlagType::__padding); + + auto submitter = __copy_if_submitter<__elems_per_workitem, __workgroup_size, _FlagType, _CopyIfKernel>{}; - __event.wait(); - __scratch.free(); + submitter(__queue, __fill_event,__in_rng, __out_rng, __num_rng, __n, __pred, __status_flags, __status_flags_size, + __status_vals_full, + __status_vals_partial, + __current_num_items, __current_num_wgs).wait(); + __device_mem_mgr.free(); } } // namespace __impl From 246dbf33c37fea7f650928186740399905de6dd8 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 29 May 2024 08:38:42 -0400 Subject: [PATCH 092/134] formatting Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 228 ++++++++---------- 1 file changed, 107 insertions(+), 121 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 9a729ae4356..73a24d4be16 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -381,7 +381,7 @@ struct __lookback_scan_submitter; template struct __lookback_scan_submitter<__data_per_workitem, __workgroup_size, _Type, _FlagType, - oneapi::dpl::__par_backend_hetero::__internal::__optional_kernel_name<_Name...>> + oneapi::dpl::__par_backend_hetero::__internal::__optional_kernel_name<_Name...>> { template @@ -495,9 +495,8 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r } } - - -template +template struct __copy_if_single_wg_kernel_func { static constexpr std::uint32_t __elems_in_tile = __workgroup_size * 
__data_per_workitem; @@ -530,8 +529,7 @@ struct __copy_if_single_wg_kernel_func _Type __val = __in_rng[__i + __wg_local_id]; _SizeT __satisfies_pred = __pred(__val); - _SizeT __count = - sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); + _SizeT __count = sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); if (__satisfies_pred) __wg_copy_if_values[__count] = __val; @@ -541,8 +539,8 @@ struct __copy_if_single_wg_kernel_func } else { - // Edge of input, have to handle memory bounds - // Might have unneccessary group_barrier calls + // Edge of input, have to handle memory bounds + // Might have unneccessary group_barrier calls #pragma unroll for (size_t __i = 0; __i < __elems_in_tile; __i += __workgroup_size) { @@ -554,8 +552,7 @@ struct __copy_if_single_wg_kernel_func __satisfies_pred = __pred(__val.__v); } - _SizeT __count = - sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); + _SizeT __count = sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); if (__i + __wg_local_id < __n) { if (__satisfies_pred) @@ -576,26 +573,24 @@ struct __copy_if_single_wg_kernel_func } }; - - - template struct __copy_if_single_wg_submitter; template -struct __copy_if_single_wg_submitter<__data_per_workitem, __workgroup_size, +struct __copy_if_single_wg_submitter<__data_per_workitem, __workgroup_size, oneapi::dpl::__par_backend_hetero::__internal::__optional_kernel_name<_Name...>> { template sycl::event - operator()(sycl::queue __q, _InRng&& __in_rng, _OutRng&& __out_rng, _NumSelectedRange&& __num_rng, std::size_t __n, _UnaryPredicate __pred) const + operator()(sycl::queue __q, _InRng&& __in_rng, _OutRng&& __out_rng, _NumSelectedRange&& __num_rng, std::size_t __n, + _UnaryPredicate __pred) const { using _Type = oneapi::dpl::__internal::__value_t<_InRng>; using _LocalAccessorType = sycl::local_accessor<_Type, 1>; - using _KernelFunc = - __copy_if_single_wg_kernel_func<__data_per_workitem, __workgroup_size, std::decay_t<_InRng>, - std::decay_t<_OutRng>, std::decay_t<_NumSelectedRange>, _UnaryPredicate, std::decay_t<_LocalAccessorType>>; + using _KernelFunc = __copy_if_single_wg_kernel_func<__data_per_workitem, __workgroup_size, std::decay_t<_InRng>, + std::decay_t<_OutRng>, std::decay_t<_NumSelectedRange>, + _UnaryPredicate, std::decay_t<_LocalAccessorType>>; static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; @@ -608,9 +603,6 @@ struct __copy_if_single_wg_submitter<__data_per_workitem, __workgroup_size, } }; - - - template void @@ -624,12 +616,14 @@ single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _Ou constexpr ::std::size_t __wgsize = _KernelParam::workgroup_size; constexpr ::std::size_t __elems_per_workitem = _KernelParam::data_per_workitem; - __copy_if_single_wg_submitter<__elems_per_workitem, __wgsize, _KernelName>{}(__queue, __in_rng, __out_rng, __num_rng, __n, __pred).wait(); + __copy_if_single_wg_submitter<__elems_per_workitem, __wgsize, _KernelName>{}(__queue, __in_rng, __out_rng, + __num_rng, __n, __pred) + .wait(); } - -template +template struct __copy_if_kernel_func { static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; @@ -653,125 +647,120 @@ struct __copy_if_kernel_func [[sycl::reqd_sub_group_size(SUBGROUP_SIZE)]] void operator()(const sycl::nd_item<1>& __item) const { - auto __group = __item.get_group(); - auto __wg_local_id = __item.get_local_id(0); - auto __sg = 
__item.get_sub_group(); + auto __group = __item.get_group(); + auto __wg_local_id = __item.get_local_id(0); + auto __sg = __item.get_sub_group(); - std::uint32_t __tile_id = 0; + std::uint32_t __tile_id = 0; - // Obtain unique ID for this work-group that will be used in decoupled lookback - if (__group.leader()) - { - sycl::atomic_ref<_FlagStorageType, sycl::memory_order::relaxed, sycl::memory_scope::device, - sycl::access::address_space::global_space> - __idx_atomic(__status_flags[__status_flags_size - 1]); - __tile_id = __idx_atomic.fetch_add(1); - } + // Obtain unique ID for this work-group that will be used in decoupled lookback + if (__group.leader()) + { + sycl::atomic_ref<_FlagStorageType, sycl::memory_order::relaxed, sycl::memory_scope::device, + sycl::access::address_space::global_space> + __idx_atomic(__status_flags[__status_flags_size - 1]); + __tile_id = __idx_atomic.fetch_add(1); + } - __tile_id = sycl::group_broadcast(__group, __tile_id, 0); + __tile_id = sycl::group_broadcast(__group, __tile_id, 0); - std::size_t __current_offset = static_cast(__tile_id) * __elems_in_tile; + std::size_t __current_offset = static_cast(__tile_id) * __elems_in_tile; - _SizeT __wg_count = 0; + _SizeT __wg_count = 0; - // Phase 1: Create __wg_count and construct in-order __wg_copy_if_values - if ((__tile_id + 1) * __elems_in_tile <= __n) - { -#pragma unroll - for (size_t __i = 0; __i < __elems_in_tile; __i += __workgroup_size) - { - // TODO: explore scalar impl. Does this allow us to avoid the group broadcast (sync)? - // if load is done in a scalar fashion and provides the same performance, we - // can avoid the broadcast (I think) - // would need to loop over the elements per work item first accumulating into - // satisfies pred, copying to "my slot" in SLM then do scan, then the copy to - // global memory needs to be loaded per work item per element, skipping copies - // when they were not saved. - _Type __val = __in_rng[__i + __wg_local_id + __elems_in_tile * __tile_id]; - - _SizeT __satisfies_pred = __pred(__val); - _SizeT __count = - sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); - - if (__satisfies_pred) - __wg_copy_if_values[__count] = __val; - - __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __workgroup_size - 1); - } - } - else - { - // Edge of input, have to handle memory bounds - // Might have unneccessary group_barrier calls + // Phase 1: Create __wg_count and construct in-order __wg_copy_if_values + if ((__tile_id + 1) * __elems_in_tile <= __n) + { #pragma unroll - for (size_t __i = 0; __i < __elems_in_tile; __i += __workgroup_size) - { - _SizeT __satisfies_pred = 0; - oneapi::dpl::__internal::__lazy_ctor_storage<_Type> __val; - if (__i + __wg_local_id + __elems_in_tile * __tile_id < __n) - { - new (&__val.__v) _Type(__in_rng[__i + __wg_local_id + __elems_in_tile * __tile_id]); - - __satisfies_pred = __pred(__val.__v); - } - _SizeT __count = - sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); - - if (__i + __wg_local_id + __elems_in_tile * __tile_id < __n) - { - if (__satisfies_pred) - __wg_copy_if_values[__count] = std::move(__val.__v); - __val.__v.~_Type(); - } - - __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __workgroup_size - 1); - } - } + for (size_t __i = 0; __i < __elems_in_tile; __i += __workgroup_size) + { + // TODO: explore scalar impl. Does this allow us to avoid the group broadcast (sync)? 
+ // if load is done in a scalar fashion and provides the same performance, we + // can avoid the broadcast (I think) + // would need to loop over the elements per work item first accumulating into + // satisfies pred, copying to "my slot" in SLM then do scan, then the copy to + // global memory needs to be loaded per work item per element, skipping copies + // when they were not saved. + _Type __val = __in_rng[__i + __wg_local_id + __elems_in_tile * __tile_id]; - // Phase 2: Global scan across __wg_count - _SizeT __copied_elements = 0; + _SizeT __satisfies_pred = __pred(__val); + _SizeT __count = sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); - __lookback_phase<_FlagType>(__group, __sg, __status_flags, __status_vals_full, __status_vals_partial, - __tile_id, __wg_count, __copied_elements, _BinaryOp{}); + if (__satisfies_pred) + __wg_copy_if_values[__count] = __val; - //TODO: explore above comment about scalar load - // Phase 3: copy values to global memory - for (int __i = __wg_local_id; __i < __wg_count; __i += __workgroup_size) + __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __workgroup_size - 1); + } + } + else + { + // Edge of input, have to handle memory bounds + // Might have unneccessary group_barrier calls +#pragma unroll + for (size_t __i = 0; __i < __elems_in_tile; __i += __workgroup_size) + { + _SizeT __satisfies_pred = 0; + oneapi::dpl::__internal::__lazy_ctor_storage<_Type> __val; + if (__i + __wg_local_id + __elems_in_tile * __tile_id < __n) { - __out_rng[__copied_elements + __i] = __wg_copy_if_values[__i]; + new (&__val.__v) _Type(__in_rng[__i + __wg_local_id + __elems_in_tile * __tile_id]); + + __satisfies_pred = __pred(__val.__v); } - if (__tile_id == (__current_num_wgs - 1) && __group.leader()) - __num_rng[0] = __copied_elements + __wg_count; + _SizeT __count = sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); + + if (__i + __wg_local_id + __elems_in_tile * __tile_id < __n) + { + if (__satisfies_pred) + __wg_copy_if_values[__count] = std::move(__val.__v); + __val.__v.~_Type(); + } + + __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __workgroup_size - 1); } + } -}; + // Phase 2: Global scan across __wg_count + _SizeT __copied_elements = 0; + __lookback_phase<_FlagType>(__group, __sg, __status_flags, __status_vals_full, __status_vals_partial, __tile_id, + __wg_count, __copied_elements, _BinaryOp{}); + //TODO: explore above comment about scalar load + // Phase 3: copy values to global memory + for (int __i = __wg_local_id; __i < __wg_count; __i += __workgroup_size) + { + __out_rng[__copied_elements + __i] = __wg_copy_if_values[__i]; + } + if (__tile_id == (__current_num_wgs - 1) && __group.leader()) + __num_rng[0] = __copied_elements + __wg_count; + } +}; -template +template struct __copy_if_submitter; -template +template struct __copy_if_submitter<__data_per_workitem, __workgroup_size, _FlagType, - oneapi::dpl::__par_backend_hetero::__internal::__optional_kernel_name<_Name...>> + oneapi::dpl::__par_backend_hetero::__internal::__optional_kernel_name<_Name...>> { - template + template sycl::event operator()(sycl::queue __q, _Event __fill_event, _InRng&& __in_rng, _OutRng&& __out_rng, - _NumSelectedRange&& __num_rng, std::size_t __n, _UnaryPredicate __pred, _StatusFlags&& __status_flags, std::size_t __status_flags_size, - _StatusValues&& __status_vals_full, _StatusValues&& __status_vals_partial, std::size_t __current_num_items, std::size_t 
__current_num_wgs) const + _NumSelectedRange&& __num_rng, std::size_t __n, _UnaryPredicate __pred, _StatusFlags&& __status_flags, + std::size_t __status_flags_size, _StatusValues&& __status_vals_full, + _StatusValues&& __status_vals_partial, std::size_t __current_num_items, + std::size_t __current_num_wgs) const { using _Type = oneapi::dpl::__internal::__value_t<_InRng>; using _LocalAccessorType = sycl::local_accessor<_Type, 1>; using _KernelFunc = __copy_if_kernel_func<__data_per_workitem, __workgroup_size, _FlagType, std::decay_t<_InRng>, - std::decay_t<_OutRng>, std::decay_t<_NumSelectedRange>, _UnaryPredicate, - std::decay_t<_StatusFlags>, std::decay_t<_StatusValues>, - std::decay_t<_LocalAccessorType>>; + std::decay_t<_OutRng>, std::decay_t<_NumSelectedRange>, _UnaryPredicate, + std::decay_t<_StatusFlags>, std::decay_t<_StatusValues>, + std::decay_t<_LocalAccessorType>>; static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; @@ -789,8 +778,6 @@ struct __copy_if_submitter<__data_per_workitem, __workgroup_size, _FlagType, } }; - - template void @@ -836,10 +823,9 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ auto submitter = __copy_if_submitter<__elems_per_workitem, __workgroup_size, _FlagType, _CopyIfKernel>{}; - submitter(__queue, __fill_event,__in_rng, __out_rng, __num_rng, __n, __pred, __status_flags, __status_flags_size, - __status_vals_full, - __status_vals_partial, - __current_num_items, __current_num_wgs).wait(); + submitter(__queue, __fill_event, __in_rng, __out_rng, __num_rng, __n, __pred, __status_flags, __status_flags_size, + __status_vals_full, __status_vals_partial, __current_num_items, __current_num_wgs) + .wait(); __device_mem_mgr.free(); } From 3023fec71f179fa679cb7899e2ef3e60328d2f18 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 29 May 2024 08:40:24 -0400 Subject: [PATCH 093/134] remove unnecessary variable Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/experimental/kt/single_pass_scan.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 73a24d4be16..70a4a0630c0 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -821,10 +821,9 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ auto __fill_event = __lookback_init_submitter<_FlagType, _SizeT, _BinaryOp, _LookbackInitKernel>{}( __queue, __status_flags, __status_vals_partial, __status_flags_size, _FlagType::__padding); - auto submitter = __copy_if_submitter<__elems_per_workitem, __workgroup_size, _FlagType, _CopyIfKernel>{}; - - submitter(__queue, __fill_event, __in_rng, __out_rng, __num_rng, __n, __pred, __status_flags, __status_flags_size, - __status_vals_full, __status_vals_partial, __current_num_items, __current_num_wgs) + __copy_if_submitter<__elems_per_workitem, __workgroup_size, _FlagType, _CopyIfKernel>{}( + __queue, __fill_event, __in_rng, __out_rng, __num_rng, __n, __pred, __status_flags, __status_flags_size, + __status_vals_full, __status_vals_partial, __current_num_items, __current_num_wgs) .wait(); __device_mem_mgr.free(); } From 52f6d82bd0222d2bc787fa2bf44b1311f693f966 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 29 May 2024 11:02:59 -0400 Subject: [PATCH 094/134] renaming public APIs Signed-off-by: Dan Hoeflinger --- 
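Note (illustrative usage, not part of this patch): after the rename, the copy_if kernel template is invoked exactly as the updated test below does. The sketch assumes USM device pointers, placeholder tuning values (16 elements per work-item, work-group size 256), and an evenness predicate; none of these values are taken from the patch series.

    // Sketch only: call the renamed oneapi::dpl::experimental::kt::gpu::copy_if.
    sycl::queue q;
    constexpr std::size_t n = 1 << 20;
    int* in = sycl::malloc_device<int>(n, q);
    int* out = sycl::malloc_device<int>(n, q);
    std::size_t* num_selected = sycl::malloc_device<std::size_t>(1, q);
    // ... initialize `in`, e.g. q.copy(host_data, in, n).wait() ...
    oneapi::dpl::experimental::kt::kernel_param<16, 256> param; // <data_per_workitem, workgroup_size>
    oneapi::dpl::experimental::kt::gpu::copy_if(q, in, in + n, out, num_selected,
                                                [](int v) { return v % 2 == 0; }, param);
    q.wait(); // wait before reading num_selected / out on the host
    sycl::free(in, q);
    sycl::free(out, q);
    sycl::free(num_selected, q);
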
include/oneapi/dpl/experimental/kt/single_pass_scan.h | 4 ++-- test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 70a4a0630c0..eaa70c3c271 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -833,7 +833,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ template void -single_pass_single_wg_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, +copy_if_single_wg(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _NumSelectedRange __num_begin, _UnaryPredicate __pred, _KernelParam __param = {}) { @@ -855,7 +855,7 @@ single_pass_single_wg_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIt template void -single_pass_copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, +copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _NumSelectedRange __num_begin, _UnaryPredicate __pred, _KernelParam __param = {}) { auto __n = __in_end - __in_begin; diff --git a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp index e0a079eaa3f..48fed4733f6 100644 --- a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp @@ -42,7 +42,7 @@ test(Predicate pred, Generator gen, KernelParam param) size_t* out_num = sycl::malloc_device(1, q); q.copy(in.data(), in_ptr, n).wait(); - oneapi::dpl::experimental::kt::gpu::single_pass_copy_if(q, in_ptr, in_ptr + n, out_ptr, out_num, pred, param); + oneapi::dpl::experimental::kt::gpu::copy_if(q, in_ptr, in_ptr + n, out_ptr, out_num, pred, param); Sequence kt_out(n); size_t num_selected = 0; From 8b504d359cb507e4f5f7b0d2a605bdcd41ac6494 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 29 May 2024 11:59:06 -0400 Subject: [PATCH 095/134] sync with scan for asychronicity Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 30 +++++++++++++------ 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index eaa70c3c271..0e748e11261 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -616,9 +616,8 @@ single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _Ou constexpr ::std::size_t __wgsize = _KernelParam::workgroup_size; constexpr ::std::size_t __elems_per_workitem = _KernelParam::data_per_workitem; - __copy_if_single_wg_submitter<__elems_per_workitem, __wgsize, _KernelName>{}(__queue, __in_rng, __out_rng, - __num_rng, __n, __pred) - .wait(); + return __copy_if_single_wg_submitter<__elems_per_workitem, __wgsize, _KernelName>{}(__queue, __in_rng, __out_rng, + __num_rng, __n, __pred); } template {}( __queue, __status_flags, __status_vals_partial, __status_flags_size, _FlagType::__padding); - __copy_if_submitter<__elems_per_workitem, __workgroup_size, _FlagType, _CopyIfKernel>{}( + sycl::event __prev_event = __copy_if_submitter<__elems_per_workitem, __workgroup_size, _FlagType, _CopyIfKernel>{}( __queue, __fill_event, __in_rng, __out_rng, __num_rng, 
__n, __pred, __status_flags, __status_flags_size, - __status_vals_full, __status_vals_partial, __current_num_items, __current_num_wgs) - .wait(); - __device_mem_mgr.free(); + __status_vals_full, __status_vals_partial, __current_num_items, __current_num_wgs); + + // TODO: Currently, the following portion of code makes this entire function synchronous. + // Ideally, we should be able to use the asynchronous free below, but we have found that doing + // so introduces a large unexplainable slowdown. Once this slowdown has been identified and corrected, + // we should replace this code with the asynchronous version below. + if (0) + { + return __device_mem_mgr.async_free(__prev_event); + } + else + { + __prev_event.wait(); + __device_mem_mgr.free(); + return __prev_event; + } } } // namespace __impl @@ -848,7 +860,7 @@ copy_if_single_wg(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_ oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _NumSelectedRange>(); auto __buf_num = __keep2(__num_begin, __num_begin + 1); - __impl::single_pass_copy_if_impl_single_wg(__queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), + return __impl::single_pass_copy_if_impl_single_wg(__queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), __pred, __param); } @@ -869,7 +881,7 @@ copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutI oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _NumSelectedRange>(); auto __buf_num = __keep2(__num_begin, __num_begin + 1); - __impl::single_pass_copy_if_impl(__queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), __pred, + return __impl::single_pass_copy_if_impl(__queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), __pred, __param); } From 6f60f10d9c150cc57cffb4db640b15cd9606833b Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 29 May 2024 12:02:14 -0400 Subject: [PATCH 096/134] sycl::event returns Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/experimental/kt/single_pass_scan.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 0e748e11261..6d404ca6c03 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -605,7 +605,7 @@ struct __copy_if_single_wg_submitter<__data_per_workitem, __workgroup_size, template -void +sycl::event single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, _UnaryPredicate __pred, _KernelParam) { @@ -779,7 +779,7 @@ struct __copy_if_submitter<__data_per_workitem, __workgroup_size, _FlagType, template -void +sycl::event single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, _UnaryPredicate __pred, _KernelParam) { @@ -844,7 +844,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ template -void +sycl::event copy_if_single_wg(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _NumSelectedRange __num_begin, _UnaryPredicate __pred, _KernelParam __param = {}) @@ -866,7 +866,7 @@ copy_if_single_wg(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_ template -void +sycl::event copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator 
__out_begin, _NumSelectedRange __num_begin, _UnaryPredicate __pred, _KernelParam __param = {}) { From 0dfcd15e439da0803b0174e1d31f19f02e276452 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 29 May 2024 16:19:03 -0400 Subject: [PATCH 097/134] naming and minor fixes Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 69 ++++++++++--------- 1 file changed, 36 insertions(+), 33 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 6d404ca6c03..c673bedf8b4 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -288,15 +288,15 @@ __lookback_phase(const _Group& __group, const _SubGroup& __subgroup, _StatusFlag } template struct __lookback_kernel_func { using _FlagStorageType = typename _FlagType::_FlagStorageType; static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; - _InRng __in_rng; - _OutRng __out_rng; + _InRange __in_rng; + _OutRange __out_rng; _BinaryOp __binary_op; std::size_t __n; _StatusFlags __status_flags; @@ -384,17 +384,17 @@ struct __lookback_scan_submitter<__data_per_workitem, __workgroup_size, _Type, _ oneapi::dpl::__par_backend_hetero::__internal::__optional_kernel_name<_Name...>> { - template + template sycl::event - operator()(sycl::queue __q, sycl::event __prev_event, _InRng&& __in_rng, _OutRng&& __out_rng, _BinaryOp __binary_op, + operator()(sycl::queue __q, sycl::event __prev_event, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op, std::size_t __n, _StatusFlags&& __status_flags, std::size_t __status_flags_size, _StatusValues&& __status_vals_full, _StatusValues&& __status_vals_partial, std::size_t __current_num_items) const { using _LocalAccessorType = sycl::local_accessor<_Type, 1>; using _KernelFunc = - __lookback_kernel_func<__data_per_workitem, __workgroup_size, _Type, _FlagType, std::decay_t<_InRng>, - std::decay_t<_OutRng>, std::decay_t<_BinaryOp>, std::decay_t<_StatusFlags>, + __lookback_kernel_func<__data_per_workitem, __workgroup_size, _Type, _FlagType, std::decay_t<_InRange>, + std::decay_t<_OutRange>, std::decay_t<_BinaryOp>, std::decay_t<_StatusFlags>, std::decay_t<_StatusValues>, std::decay_t<_LocalAccessorType>>; static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; @@ -495,17 +495,17 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r } } -template struct __copy_if_single_wg_kernel_func { static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; using _SizeT = std::size_t; using _BinaryOp = std::plus<_SizeT>; - using _Type = oneapi::dpl::__internal::__value_t<_InRng>; + using _Type = oneapi::dpl::__internal::__value_t<_InRange>; - _InRng __in_rng; - _OutRng __out_rng; + _InRange __in_rng; + _OutRange __out_rng; _NumRng __num_rng; _SizeT __n; _UnaryPredicate __pred; @@ -581,15 +581,15 @@ struct __copy_if_single_wg_submitter<__data_per_workitem, __workgroup_size, oneapi::dpl::__par_backend_hetero::__internal::__optional_kernel_name<_Name...>> { - template + template sycl::event - operator()(sycl::queue __q, _InRng&& __in_rng, _OutRng&& __out_rng, _NumSelectedRange&& __num_rng, std::size_t __n, + operator()(sycl::queue __q, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange&& __num_rng, std::size_t __n, _UnaryPredicate __pred) const { - using _Type = oneapi::dpl::__internal::__value_t<_InRng>; + using _Type = 
oneapi::dpl::__internal::__value_t<_InRange>; using _LocalAccessorType = sycl::local_accessor<_Type, 1>; - using _KernelFunc = __copy_if_single_wg_kernel_func<__data_per_workitem, __workgroup_size, std::decay_t<_InRng>, - std::decay_t<_OutRng>, std::decay_t<_NumSelectedRange>, + using _KernelFunc = __copy_if_single_wg_kernel_func<__data_per_workitem, __workgroup_size, std::decay_t<_InRange>, + std::decay_t<_OutRange>, std::decay_t<_NumSelectedRange>, _UnaryPredicate, std::decay_t<_LocalAccessorType>>; static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; @@ -609,30 +609,33 @@ sycl::event single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, _UnaryPredicate __pred, _KernelParam) { - using _KernelName = __copy_if_single_wg_kernel; + using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + using _KernelName = __copy_if_single_wg_kernel; + using _CopyIfSingleWgKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __copy_if_kernel<_KernelName, _Type>>; const ::std::size_t __n = __in_rng.size(); constexpr ::std::size_t __wgsize = _KernelParam::workgroup_size; constexpr ::std::size_t __elems_per_workitem = _KernelParam::data_per_workitem; - return __copy_if_single_wg_submitter<__elems_per_workitem, __wgsize, _KernelName>{}(__queue, __in_rng, __out_rng, + return __copy_if_single_wg_submitter<__elems_per_workitem, __wgsize, _CopyIfSingleWgKernel>{}(__queue, __in_rng, __out_rng, __num_rng, __n, __pred); } -template struct __copy_if_kernel_func { static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; using _SizeT = std::size_t; using _BinaryOp = std::plus<_SizeT>; - using _Type = oneapi::dpl::__internal::__value_t<_InRng>; + using _Type = oneapi::dpl::__internal::__value_t<_InRange>; using _FlagStorageType = typename _FlagType::_FlagStorageType; - _InRng __in_rng; - _OutRng __out_rng; + _InRange __in_rng; + _OutRange __out_rng; _NumRng __num_rng; _SizeT __n; _UnaryPredicate __pred; @@ -744,20 +747,20 @@ struct __copy_if_submitter<__data_per_workitem, __workgroup_size, _FlagType, oneapi::dpl::__par_backend_hetero::__internal::__optional_kernel_name<_Name...>> { - template sycl::event - operator()(sycl::queue __q, _Event __fill_event, _InRng&& __in_rng, _OutRng&& __out_rng, + operator()(sycl::queue __q, _Event __fill_event, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange&& __num_rng, std::size_t __n, _UnaryPredicate __pred, _StatusFlags&& __status_flags, std::size_t __status_flags_size, _StatusValues&& __status_vals_full, _StatusValues&& __status_vals_partial, std::size_t __current_num_items, std::size_t __current_num_wgs) const { - using _Type = oneapi::dpl::__internal::__value_t<_InRng>; + using _Type = oneapi::dpl::__internal::__value_t<_InRange>; using _LocalAccessorType = sycl::local_accessor<_Type, 1>; using _KernelFunc = - __copy_if_kernel_func<__data_per_workitem, __workgroup_size, _FlagType, std::decay_t<_InRng>, - std::decay_t<_OutRng>, std::decay_t<_NumSelectedRange>, _UnaryPredicate, + __copy_if_kernel_func<__data_per_workitem, __workgroup_size, _FlagType, std::decay_t<_InRange>, + std::decay_t<_OutRange>, std::decay_t<_NumSelectedRange>, _UnaryPredicate, std::decay_t<_StatusFlags>, std::decay_t<_StatusValues>, std::decay_t<_LocalAccessorType>>; @@ -794,7 +797,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ __lookback_init_kernel<_KernelName, _SizeT, _BinaryOp>>; 
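    // Note (explanatory comment, not part of the original source): the two
    // __kernel_name_provider aliases above and below wrap the kernel template's
    // name parameter so that the lookback-initialization kernel and the main
    // copy_if kernel are submitted under distinct compile-time SYCL kernel names;
    // the provider machinery itself is oneDPL-internal and unchanged by this patch.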
using _CopyIfKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __copy_if_kernel<_KernelName, _Type, _BinaryOp>>; + __copy_if_kernel<_KernelName, _Type>>; const std::size_t __n = __in_rng.size(); @@ -885,13 +888,13 @@ copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutI __param); } -template +template sycl::event -inclusive_scan(sycl::queue __queue, _InRng&& __in_rng, _OutRng&& __out_rng, _BinaryOp __binary_op, +inclusive_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op, _KernelParam __param = {}) { - auto __in_view = oneapi::dpl::__ranges::views::all(std::forward<_InRng>(__in_rng)); - auto __out_view = oneapi::dpl::__ranges::views::all(std::forward<_OutRng>(__out_rng)); + auto __in_view = oneapi::dpl::__ranges::views::all(std::forward<_InRange>(__in_rng)); + auto __out_view = oneapi::dpl::__ranges::views::all(std::forward<_OutRange>(__out_rng)); return __impl::__single_pass_scan(__queue, std::move(__in_view), std::move(__out_view), __binary_op, __param); } From 12db7225976742befda8911e0c579792f3192d0d Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 29 May 2024 16:37:04 -0400 Subject: [PATCH 098/134] removing single_wg public api Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 53 +++++-------------- 1 file changed, 12 insertions(+), 41 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index c673bedf8b4..cbb5ea8d29f 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -603,25 +603,6 @@ struct __copy_if_single_wg_submitter<__data_per_workitem, __workgroup_size, } }; -template -sycl::event -single_pass_copy_if_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, - _NumSelectedRange __num_rng, _UnaryPredicate __pred, _KernelParam) -{ - - using _Type = oneapi::dpl::__internal::__value_t<_InRange>; - using _KernelName = __copy_if_single_wg_kernel; - using _CopyIfSingleWgKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __copy_if_kernel<_KernelName, _Type>>; - const ::std::size_t __n = __in_rng.size(); - - constexpr ::std::size_t __wgsize = _KernelParam::workgroup_size; - constexpr ::std::size_t __elems_per_workitem = _KernelParam::data_per_workitem; - - return __copy_if_single_wg_submitter<__elems_per_workitem, __wgsize, _CopyIfSingleWgKernel>{}(__queue, __in_rng, __out_rng, - __num_rng, __n, __pred); -} template >; + using _CopyIfSingleWgKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __copy_if_single_wg_kernel<_KernelName, _Type>>; + const std::size_t __n = __in_rng.size(); constexpr std::size_t __workgroup_size = _KernelParam::workgroup_size; @@ -809,6 +793,15 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ std::size_t __current_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __elems_in_tile); std::size_t __current_num_items = __current_num_wgs * __workgroup_size; + //If we fit in a single WG, use the single wg version + if (__current_num_wgs == 1) + { + return __copy_if_single_wg_submitter<__elems_per_workitem, __workgroup_size, _CopyIfSingleWgKernel>{}(__queue, __in_rng, __out_rng, + __num_rng, __n, __pred); + } + + + __scan_lookback_mem_mgr<_FlagType> __device_mem_mgr(__queue, __current_num_wgs); __device_mem_mgr.allocate(); @@ -845,28 +838,6 @@ 
single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ } // namespace __impl -template -sycl::event -copy_if_single_wg(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, - _OutIterator __out_begin, _NumSelectedRange __num_begin, _UnaryPredicate __pred, - _KernelParam __param = {}) -{ - auto __n = __in_end - __in_begin; - - auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); - auto __buf1 = __keep1(__in_begin, __in_end); - auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); - auto __buf2 = __keep2(__out_begin, __out_begin + __n); - - auto __keep_num = - oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _NumSelectedRange>(); - auto __buf_num = __keep2(__num_begin, __num_begin + 1); - - return __impl::single_pass_copy_if_impl_single_wg(__queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), - __pred, __param); -} - template sycl::event From 4850fcf3c0616ccbaa07a84c86623768da064a92 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 29 May 2024 17:03:20 -0400 Subject: [PATCH 099/134] temporarily disable single wg version Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/experimental/kt/single_pass_scan.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index cbb5ea8d29f..5aedaa88291 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -794,11 +794,11 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ std::size_t __current_num_items = __current_num_wgs * __workgroup_size; //If we fit in a single WG, use the single wg version - if (__current_num_wgs == 1) - { - return __copy_if_single_wg_submitter<__elems_per_workitem, __workgroup_size, _CopyIfSingleWgKernel>{}(__queue, __in_rng, __out_rng, - __num_rng, __n, __pred); - } + // if (__current_num_wgs == 1) + // { + // return __copy_if_single_wg_submitter<__elems_per_workitem, __workgroup_size, _CopyIfSingleWgKernel>{}(__queue, __in_rng, __out_rng, + // __num_rng, __n, __pred); + // } From a8ccbd104623c5b1061faf2ee82a9ed7b333cef1 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 30 May 2024 12:10:21 -0400 Subject: [PATCH 100/134] wait after call for async algs Signed-off-by: Dan Hoeflinger --- test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp index 48fed4733f6..a46b76a3be2 100644 --- a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp @@ -42,7 +42,7 @@ test(Predicate pred, Generator gen, KernelParam param) size_t* out_num = sycl::malloc_device(1, q); q.copy(in.data(), in_ptr, n).wait(); - oneapi::dpl::experimental::kt::gpu::copy_if(q, in_ptr, in_ptr + n, out_ptr, out_num, pred, param); + oneapi::dpl::experimental::kt::gpu::copy_if(q, in_ptr, in_ptr + n, out_ptr, out_num, pred, param).wait(); Sequence kt_out(n); size_t num_selected = 0; From 70ad4897f93c1b3481b0fede2d4f34523b665348 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 30 May 2024 12:18:33 -0400 Subject: [PATCH 101/134] reenable single wg Signed-off-by: Dan 
Hoeflinger --- .../oneapi/dpl/experimental/kt/single_pass_scan.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 5aedaa88291..359ac108382 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -794,13 +794,11 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ std::size_t __current_num_items = __current_num_wgs * __workgroup_size; //If we fit in a single WG, use the single wg version - // if (__current_num_wgs == 1) - // { - // return __copy_if_single_wg_submitter<__elems_per_workitem, __workgroup_size, _CopyIfSingleWgKernel>{}(__queue, __in_rng, __out_rng, - // __num_rng, __n, __pred); - // } - - + if (__current_num_wgs == 1) + { + return __copy_if_single_wg_submitter<__elems_per_workitem, __workgroup_size, _CopyIfSingleWgKernel>{}(__queue, __in_rng, __out_rng, + __num_rng, __n, __pred); + } __scan_lookback_mem_mgr<_FlagType> __device_mem_mgr(__queue, __current_num_wgs); __device_mem_mgr.allocate(); From e78b72c0c017a3ad431516f6c72e6dc277185717 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 30 May 2024 16:22:43 -0400 Subject: [PATCH 102/134] only need single phase for single wg Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 21 ++++++------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 359ac108382..e2bd3f358e1 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -496,7 +496,7 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r } template + typename _NumRng, typename _UnaryPredicate> struct __copy_if_single_wg_kernel_func { static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; @@ -509,7 +509,6 @@ struct __copy_if_single_wg_kernel_func _NumRng __num_rng; _SizeT __n; _UnaryPredicate __pred; - _TileValues __wg_copy_if_values; [[sycl::reqd_sub_group_size(SUBGROUP_SIZE)]] void operator()(const sycl::nd_item<1>& __item) const @@ -520,8 +519,7 @@ struct __copy_if_single_wg_kernel_func // Global load into local _SizeT __wg_count = 0; - // Phase 1: Create __wg_count and construct in-order __wg_copy_if_values - if (__elems_in_tile <= __n) + if (__elems_in_tile == __n) { #pragma unroll for (size_t __i = 0; __i < __elems_in_tile; __i += __workgroup_size) @@ -532,7 +530,7 @@ struct __copy_if_single_wg_kernel_func _SizeT __count = sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); if (__satisfies_pred) - __wg_copy_if_values[__count] = __val; + __out_rng[__count] = __val; __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __workgroup_size - 1); } @@ -556,18 +554,13 @@ struct __copy_if_single_wg_kernel_func if (__i + __wg_local_id < __n) { if (__satisfies_pred) - __wg_copy_if_values[__count] = std::move(__val.__v); + __out_rng[__count] = std::move(__val.__v); __val.__v.~_Type(); } __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __workgroup_size - 1); } } - // Phase 3: copy values to global memory - for (int __i = __wg_local_id; __i < __wg_count; __i += __workgroup_size) - { - __out_rng[__i] = __wg_copy_if_values[__i]; - } if (__group.leader()) __num_rng[0] = 
__wg_count; } @@ -587,18 +580,16 @@ struct __copy_if_single_wg_submitter<__data_per_workitem, __workgroup_size, _UnaryPredicate __pred) const { using _Type = oneapi::dpl::__internal::__value_t<_InRange>; - using _LocalAccessorType = sycl::local_accessor<_Type, 1>; using _KernelFunc = __copy_if_single_wg_kernel_func<__data_per_workitem, __workgroup_size, std::decay_t<_InRange>, std::decay_t<_OutRange>, std::decay_t<_NumSelectedRange>, - _UnaryPredicate, std::decay_t<_LocalAccessorType>>; + _UnaryPredicate>; static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; return __q.submit([&](sycl::handler& __hdl) { - auto __tile_vals = _LocalAccessorType(sycl::range<1>{__elems_in_tile}, __hdl); oneapi::dpl::__ranges::__require_access(__hdl, __in_rng, __out_rng, __num_rng); __hdl.parallel_for<_Name...>(sycl::nd_range<1>(__workgroup_size, __workgroup_size), - _KernelFunc{__in_rng, __out_rng, __num_rng, __n, __pred, __tile_vals}); + _KernelFunc{__in_rng, __out_rng, __num_rng, __n, __pred}); }); } }; From 2ca42871824e9bcf48a3998e35ef2347f8eacbaf Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Fri, 31 May 2024 09:50:26 -0400 Subject: [PATCH 103/134] reusing single workgroup copy_if from oneDPL main Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 46 ++++---- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 111 ++++++++++-------- .../dpcpp/parallel_backend_sycl_utils.h | 8 +- 3 files changed, 97 insertions(+), 68 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index e2bd3f358e1..29b9fb94ef7 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -386,8 +386,8 @@ struct __lookback_scan_submitter<__data_per_workitem, __workgroup_size, _Type, _ template sycl::event - operator()(sycl::queue __q, sycl::event __prev_event, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op, - std::size_t __n, _StatusFlags&& __status_flags, std::size_t __status_flags_size, + operator()(sycl::queue __q, sycl::event __prev_event, _InRange&& __in_rng, _OutRange&& __out_rng, + _BinaryOp __binary_op, std::size_t __n, _StatusFlags&& __status_flags, std::size_t __status_flags_size, _StatusValues&& __status_vals_full, _StatusValues&& __status_vals_partial, std::size_t __current_num_items) const { @@ -576,13 +576,13 @@ struct __copy_if_single_wg_submitter<__data_per_workitem, __workgroup_size, template sycl::event - operator()(sycl::queue __q, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange&& __num_rng, std::size_t __n, - _UnaryPredicate __pred) const + operator()(sycl::queue __q, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange&& __num_rng, + std::size_t __n, _UnaryPredicate __pred) const { using _Type = oneapi::dpl::__internal::__value_t<_InRange>; - using _KernelFunc = __copy_if_single_wg_kernel_func<__data_per_workitem, __workgroup_size, std::decay_t<_InRange>, - std::decay_t<_OutRange>, std::decay_t<_NumSelectedRange>, - _UnaryPredicate>; + using _KernelFunc = + __copy_if_single_wg_kernel_func<__data_per_workitem, __workgroup_size, std::decay_t<_InRange>, + std::decay_t<_OutRange>, std::decay_t<_NumSelectedRange>, _UnaryPredicate>; static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; @@ -594,7 +594,6 @@ struct __copy_if_single_wg_submitter<__data_per_workitem, __workgroup_size, } }; - template @@ -719,8 +718,8 @@ struct 
__copy_if_submitter<__data_per_workitem, __workgroup_size, _FlagType, oneapi::dpl::__par_backend_hetero::__internal::__optional_kernel_name<_Name...>> { - template + template sycl::event operator()(sycl::queue __q, _Event __fill_event, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange&& __num_rng, std::size_t __n, _UnaryPredicate __pred, _StatusFlags&& __status_flags, @@ -768,14 +767,26 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ using _LookbackInitKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __lookback_init_kernel<_KernelName, _SizeT, _BinaryOp>>; - using _CopyIfKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __copy_if_kernel<_KernelName, _Type>>; + using _CopyIfKernel = + oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__copy_if_kernel<_KernelName, _Type>>; using _CopyIfSingleWgKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __copy_if_single_wg_kernel<_KernelName, _Type>>; const std::size_t __n = __in_rng.size(); + // Next power of 2 greater than or equal to __n + auto __n_uniform = ::oneapi::dpl::__internal::__dpl_bit_ceil(__n); + + //If we fit in a single WG SLM, use the single wg version from oneDPL main + if (oneapi::dpl::__par_backend_hetero::__group_copy_if_fits_in_slm(__queue, __n, __n_uniform)) + { + return oneapi::dpl::__par_backend_hetero::__dispatch_small_copy_if( + oneapi::dpl::execution::__dpl::make_device_policy<_CopyIfSingleWgKernel>(__queue), __n, + std::forward<_InRange>(__in_rng), std::forward<_OutRange>(__out_rng), + std::forward<_NumSelectedRange>(__num_rng), __pred); + } + constexpr std::size_t __workgroup_size = _KernelParam::workgroup_size; constexpr std::size_t __elems_per_workitem = _KernelParam::data_per_workitem; @@ -784,13 +795,6 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ std::size_t __current_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __elems_in_tile); std::size_t __current_num_items = __current_num_wgs * __workgroup_size; - //If we fit in a single WG, use the single wg version - if (__current_num_wgs == 1) - { - return __copy_if_single_wg_submitter<__elems_per_workitem, __workgroup_size, _CopyIfSingleWgKernel>{}(__queue, __in_rng, __out_rng, - __num_rng, __n, __pred); - } - __scan_lookback_mem_mgr<_FlagType> __device_mem_mgr(__queue, __current_num_wgs); __device_mem_mgr.allocate(); @@ -831,7 +835,7 @@ template sycl::event copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, - _NumSelectedRange __num_begin, _UnaryPredicate __pred, _KernelParam __param = {}) + _NumSelectedRange __num_begin, _UnaryPredicate __pred, _KernelParam __param = {}) { auto __n = __in_end - __in_begin; @@ -845,7 +849,7 @@ copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutI auto __buf_num = __keep2(__num_begin, __num_begin + 1); return __impl::single_pass_copy_if_impl(__queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), __pred, - __param); + __param); } template diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 162fcf2c282..6edd2625080 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -574,11 +574,11 @@ template > { - template + template auto - operator()(const _Policy& __policy, 
_InRng&& __in_rng, _OutRng&& __out_rng, ::std::size_t __n, _InitType __init, - _BinaryOperation __bin_op, _UnaryOp __unary_op) + operator()(const _Policy& __policy, _InRng&& __in_rng, _OutRng&& __out_rng, _NumCopiedRng&& __num_copied_rng, + ::std::size_t __n, _InitType __init, _BinaryOperation __bin_op, _UnaryOp __unary_op) { using _ValueType = ::std::uint16_t; @@ -589,16 +589,13 @@ struct __parallel_copy_if_static_single_group_submitter<_Size, _ElemsPerItem, _W constexpr ::std::uint32_t __elems_per_wg = _ElemsPerItem * _WGSize; - sycl::buffer<_Size> __res(sycl::range<1>(1)); - - auto __event = __policy.queue().submit([&](sycl::handler& __hdl) { - oneapi::dpl::__ranges::__require_access(__hdl, __in_rng, __out_rng); + return __policy.queue().submit([&](sycl::handler& __hdl) { + oneapi::dpl::__ranges::__require_access(__hdl, __in_rng, __out_rng, __num_copied_rng); // Local memory is split into two parts. The first half stores the result of applying the // predicate on each element of the input range. The second half stores the index of the output // range to copy elements of the input range. auto __lacc = __dpl_sycl::__local_accessor<_ValueType>(sycl::range<1>{__elems_per_wg * 2}, __hdl); - auto __res_acc = __res.template get_access(__hdl); __hdl.parallel_for<_ScanKernelName...>( sycl::nd_range<1>(_WGSize, _WGSize), [=](sycl::nd_item<1> __self_item) { @@ -656,11 +653,10 @@ struct __parallel_copy_if_static_single_group_submitter<_Size, _ElemsPerItem, _W if (__item_id == 0) { // Add predicate of last element to account for the scan's exclusivity - __res_acc[0] = __lacc[__elems_per_wg + __n - 1] + __lacc[__n - 1]; + __num_copied_rng[0] = __lacc[__elems_per_wg + __n - 1] + __lacc[__n - 1]; } }); }); - return __future(__event, __res); } }; @@ -832,9 +828,11 @@ struct __invoke_single_group_copy_if // Specialization for devices that have a max work-group size of at least 1024 static constexpr ::std::uint16_t __targeted_wg_size = 1024; - template <::std::uint16_t _Size, typename _ExecutionPolicy, typename _InRng, typename _OutRng, typename _Pred> + template <::std::uint16_t _Size, typename _ExecutionPolicy, typename _InRng, typename _OutRng, + typename _NumCopiedRng, typename _Pred> auto - operator()(_ExecutionPolicy&& __exec, ::std::size_t __n, _InRng&& __in_rng, _OutRng&& __out_rng, _Pred&& __pred) + operator()(_ExecutionPolicy&& __exec, ::std::size_t __n, _InRng&& __in_rng, _OutRng&& __out_rng, + _NumCopiedRng&& __num_copied_rng, _Pred&& __pred) { constexpr ::std::uint16_t __wg_size = ::std::min(_Size, __targeted_wg_size); constexpr ::std::uint16_t __num_elems_per_item = ::oneapi::dpl::__internal::__dpl_ceiling_div(_Size, __wg_size); @@ -846,23 +844,23 @@ struct __invoke_single_group_copy_if if (__is_full_group) return __par_backend_hetero::__parallel_copy_if_static_single_group_submitter< _SizeType, __num_elems_per_item, __wg_size, true, - oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __scan_copy_single_wg_kernel<::std::integral_constant<::std::uint16_t, __wg_size>, - ::std::integral_constant<::std::uint16_t, __num_elems_per_item>, - /* _IsFullGroup= */ std::true_type, _CustomName>> - >()( - __exec, ::std::forward<_InRng>(__in_rng), ::std::forward<_OutRng>(__out_rng), __n, _InitType{}, - _ReduceOp{}, ::std::forward<_Pred>(__pred)); + oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __scan_copy_single_wg_kernel, + std::integral_constant, + /* _IsFullGroup= */ std::true_type, _CustomName>>>()( + __exec, std::forward<_InRng>(__in_rng), 
std::forward<_OutRng>(__out_rng), + std::forward<_NumCopiedRng>(__num_copied_rng), __n, _InitType{}, _ReduceOp{}, + std::forward<_Pred>(__pred)); else return __par_backend_hetero::__parallel_copy_if_static_single_group_submitter< _SizeType, __num_elems_per_item, __wg_size, false, - oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __scan_copy_single_wg_kernel<::std::integral_constant<::std::uint16_t, __wg_size>, - ::std::integral_constant<::std::uint16_t, __num_elems_per_item>, - /* _IsFullGroup= */ std::false_type, _CustomName>> - >()( - __exec, ::std::forward<_InRng>(__in_rng), ::std::forward<_OutRng>(__out_rng), __n, _InitType{}, - _ReduceOp{}, ::std::forward<_Pred>(__pred)); + oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __scan_copy_single_wg_kernel, + std::integral_constant, + /* _IsFullGroup= */ std::false_type, _CustomName>>>()( + __exec, std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), + std::forward<_NumCopiedRng>(__num_copied_rng), __n, _InitType{}, _ReduceOp{}, + std::forward<_Pred>(__pred)); } }; @@ -907,36 +905,57 @@ __parallel_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag __copy_by_mask_op); } +template +bool +__group_copy_if_fits_in_slm(const sycl::queue& __queue, _Size __n, std::size_t __n_uniform) +{ + using _SingleGroupInvoker = __invoke_single_group_copy_if<_Size>; + ::std::size_t __max_wg_size = oneapi::dpl::__internal::__max_work_group_size(__queue); + + // The kernel stores n 16 bit integers for the predicate and another n 16 bit integers for the offsets, + // so check "scan" for a 32 bit type. + return (oneapi::dpl::__par_backend_hetero::__group_scan_fits_in_slm<::std::uint32_t>(__queue, __n, __n_uniform) && + __max_wg_size >= _SingleGroupInvoker::__targeted_wg_size); +} + +template +auto +__dispatch_small_copy_if(_ExecutionPolicy&& __exec, _Size __n, _InRng&& __in_rng, _OutRng&& __out_rng, + _NumCopiedRng&& __num_copied_rng, _Pred __pred) +{ + using _SingleGroupInvoker = __invoke_single_group_copy_if<_Size>; + + using _SizeBreakpoints = + std::integer_sequence; + + return __par_backend_hetero::__static_monotonic_dispatcher<_SizeBreakpoints>::__dispatch( + _SingleGroupInvoker{}, __n, std::forward<_ExecutionPolicy>(__exec), __n, std::forward<_InRng>(__in_rng), + std::forward<_OutRng>(__out_rng), std::forward<_NumCopiedRng>(__num_copied_rng), __pred); +} + template auto __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, _InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _Pred __pred) { - using _SingleGroupInvoker = __invoke_single_group_copy_if<_Size>; - // Next power of 2 greater than or equal to __n auto __n_uniform = ::oneapi::dpl::__internal::__dpl_bit_ceil(static_cast<::std::make_unsigned_t<_Size>>(__n)); - // Pessimistically only use half of the memory to take into account memory used by compiled kernel - const ::std::size_t __max_slm_size = - __exec.queue().get_device().template get_info() / 2; - - // The kernel stores n integers for the predicate and another n integers for the offsets - const auto __req_slm_size = sizeof(::std::uint16_t) * __n_uniform * 2; - - constexpr ::std::uint16_t __single_group_upper_limit = 16384; + if (oneapi::dpl::__par_backend_hetero::__group_copy_if_fits_in_slm(__exec.queue(), __n, __n_uniform)) + { + sycl::buffer<_Size> __res(sycl::range<1>(1)); + auto __res_iterator = oneapi::dpl::begin(__res); - ::std::size_t __max_wg_size = oneapi::dpl::__internal::__max_work_group_size(__exec); + 
auto __keep = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, + decltype(__res_iterator)>(); + auto __res_rng = __keep(__res_iterator, __res_iterator + 1).all_view(); - if (__n <= __single_group_upper_limit && __max_slm_size >= __req_slm_size && - __max_wg_size >= _SingleGroupInvoker::__targeted_wg_size) - { - using _SizeBreakpoints = - ::std::integer_sequence<::std::uint16_t, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384>; + sycl::event __event = oneapi::dpl::__par_backend_hetero::__dispatch_small_copy_if( + std::forward<_ExecutionPolicy>(__exec), __n, std::forward<_InRng>(__in_rng), + std::forward<_OutRng>(__out_rng), std::move(__res_rng), __pred); - return __par_backend_hetero::__static_monotonic_dispatcher<_SizeBreakpoints>::__dispatch( - _SingleGroupInvoker{}, __n, ::std::forward<_ExecutionPolicy>(__exec), __n, ::std::forward<_InRng>(__in_rng), - ::std::forward<_OutRng>(__out_rng), __pred); + return __future(__event, __res); } else { diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index e0b153e31e2..e29261991b3 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -55,11 +55,17 @@ __device_info(const _ExecutionPolicy& __policy) } #endif +::std::size_t +__max_work_group_size(const sycl::queue& __queue) +{ + return __queue.get_device().template get_info(); +} + template ::std::size_t __max_work_group_size(const _ExecutionPolicy& __policy) { - return __policy.queue().get_device().template get_info(); + return oneapi::dpl::__internal::__max_work_group_size(__policy.queue()); } template From 7538ed6babf05f2cfa42ec7ba3a46d7426fa9070 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Fri, 31 May 2024 10:16:52 -0400 Subject: [PATCH 104/134] add option to opt out of compiling single wg Signed-off-by: Dan Hoeflinger --- .../oneapi/dpl/experimental/kt/kernel_param.h | 3 ++- .../dpl/experimental/kt/single_pass_scan.h | 16 +++++++++------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/kernel_param.h b/include/oneapi/dpl/experimental/kt/kernel_param.h index b3ee36be189..bbed93e777c 100644 --- a/include/oneapi/dpl/experimental/kt/kernel_param.h +++ b/include/oneapi/dpl/experimental/kt/kernel_param.h @@ -18,12 +18,13 @@ namespace oneapi::dpl::experimental::kt { template + typename _KernelName = oneapi::dpl::execution::DefaultKernelName, typename _SingleWgOptOut = std::false_type> struct kernel_param { static constexpr std::uint16_t data_per_workitem = __data_per_work_item; static constexpr std::uint16_t workgroup_size = __work_group_size; using kernel_name = _KernelName; + using single_wg_opt_out = _SingleWgOptOut; }; } // namespace oneapi::dpl::experimental::kt diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 29b9fb94ef7..6d4d13ead6a 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -778,15 +778,17 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ // Next power of 2 greater than or equal to __n auto __n_uniform = ::oneapi::dpl::__internal::__dpl_bit_ceil(__n); - //If we fit in a single WG SLM, use the single wg version from oneDPL main - if (oneapi::dpl::__par_backend_hetero::__group_copy_if_fits_in_slm(__queue, 
__n, __n_uniform)) + if constexpr (std::negation_v) { - return oneapi::dpl::__par_backend_hetero::__dispatch_small_copy_if( - oneapi::dpl::execution::__dpl::make_device_policy<_CopyIfSingleWgKernel>(__queue), __n, - std::forward<_InRange>(__in_rng), std::forward<_OutRange>(__out_rng), - std::forward<_NumSelectedRange>(__num_rng), __pred); + //If we fit in a single WG SLM, use the single wg version from oneDPL main + if (oneapi::dpl::__par_backend_hetero::__group_copy_if_fits_in_slm(__queue, __n, __n_uniform)) + { + return oneapi::dpl::__par_backend_hetero::__dispatch_small_copy_if( + oneapi::dpl::execution::__dpl::make_device_policy<_CopyIfSingleWgKernel>(__queue), __n, + std::forward<_InRange>(__in_rng), std::forward<_OutRange>(__out_rng), + std::forward<_NumSelectedRange>(__num_rng), __pred); + } } - constexpr std::size_t __workgroup_size = _KernelParam::workgroup_size; constexpr std::size_t __elems_per_workitem = _KernelParam::data_per_workitem; From 293d7240e7aba7b9aaa6800af53d4b727382afe2 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Fri, 31 May 2024 10:51:08 -0400 Subject: [PATCH 105/134] adding opt out for single wg inclusive scan Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 6d4d13ead6a..87d4309c2f6 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -441,14 +441,18 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r // Next power of 2 greater than or equal to __n auto __n_uniform = ::oneapi::dpl::__internal::__dpl_bit_ceil(__n); - // Perform a single-work group scan if the input is small - if (oneapi::dpl::__par_backend_hetero::__group_scan_fits_in_slm<_Type>(__queue, __n, __n_uniform)) + if constexpr (std::negation_v) { - return oneapi::dpl::__par_backend_hetero::__parallel_transform_scan_single_group( - oneapi::dpl::__internal::__device_backend_tag{}, - oneapi::dpl::execution::__dpl::make_device_policy(__queue), - std::forward<_InRange>(__in_rng), std::forward<_OutRange>(__out_rng), __n, - oneapi::dpl::__internal::__no_op{}, unseq_backend::__no_init_value<_Type>{}, __binary_op, std::true_type{}); + // Perform a single-work group scan if the input is small + if (oneapi::dpl::__par_backend_hetero::__group_scan_fits_in_slm<_Type>(__queue, __n, __n_uniform)) + { + return oneapi::dpl::__par_backend_hetero::__parallel_transform_scan_single_group( + oneapi::dpl::__internal::__device_backend_tag{}, + oneapi::dpl::execution::__dpl::make_device_policy(__queue), + std::forward<_InRange>(__in_rng), std::forward<_OutRange>(__out_rng), __n, + oneapi::dpl::__internal::__no_op{}, unseq_backend::__no_init_value<_Type>{}, __binary_op, + std::true_type{}); + } } constexpr std::size_t __workgroup_size = _KernelParam::workgroup_size; From 565ba3ba8d667f6a9450750a8d77863d058665ed Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Fri, 31 May 2024 15:23:16 -0400 Subject: [PATCH 106/134] remove single_wg kt, in favor of main oneDPL version Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 98 ------------------- 1 file changed, 98 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 87d4309c2f6..3e54ce98bab 100644 --- 
a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -499,104 +499,6 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r } } -template -struct __copy_if_single_wg_kernel_func -{ - static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; - using _SizeT = std::size_t; - using _BinaryOp = std::plus<_SizeT>; - using _Type = oneapi::dpl::__internal::__value_t<_InRange>; - - _InRange __in_rng; - _OutRange __out_rng; - _NumRng __num_rng; - _SizeT __n; - _UnaryPredicate __pred; - - [[sycl::reqd_sub_group_size(SUBGROUP_SIZE)]] void - operator()(const sycl::nd_item<1>& __item) const - { - auto __group = __item.get_group(); - auto __wg_local_id = __item.get_local_id(0); - - // Global load into local - _SizeT __wg_count = 0; - - if (__elems_in_tile == __n) - { -#pragma unroll - for (size_t __i = 0; __i < __elems_in_tile; __i += __workgroup_size) - { - _Type __val = __in_rng[__i + __wg_local_id]; - - _SizeT __satisfies_pred = __pred(__val); - _SizeT __count = sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); - - if (__satisfies_pred) - __out_rng[__count] = __val; - - __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __workgroup_size - 1); - } - } - else - { - // Edge of input, have to handle memory bounds - // Might have unneccessary group_barrier calls -#pragma unroll - for (size_t __i = 0; __i < __elems_in_tile; __i += __workgroup_size) - { - _SizeT __satisfies_pred = 0; - oneapi::dpl::__internal::__lazy_ctor_storage<_Type> __val; - if (__i + __wg_local_id < __n) - { - new (&__val.__v) _Type(__in_rng[__i + __wg_local_id]); - - __satisfies_pred = __pred(__val.__v); - } - _SizeT __count = sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); - if (__i + __wg_local_id < __n) - { - if (__satisfies_pred) - __out_rng[__count] = std::move(__val.__v); - __val.__v.~_Type(); - } - __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __workgroup_size - 1); - } - } - - if (__group.leader()) - __num_rng[0] = __wg_count; - } -}; - -template -struct __copy_if_single_wg_submitter; - -template -struct __copy_if_single_wg_submitter<__data_per_workitem, __workgroup_size, - oneapi::dpl::__par_backend_hetero::__internal::__optional_kernel_name<_Name...>> -{ - - template - sycl::event - operator()(sycl::queue __q, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange&& __num_rng, - std::size_t __n, _UnaryPredicate __pred) const - { - using _Type = oneapi::dpl::__internal::__value_t<_InRange>; - using _KernelFunc = - __copy_if_single_wg_kernel_func<__data_per_workitem, __workgroup_size, std::decay_t<_InRange>, - std::decay_t<_OutRange>, std::decay_t<_NumSelectedRange>, _UnaryPredicate>; - - static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; - - return __q.submit([&](sycl::handler& __hdl) { - oneapi::dpl::__ranges::__require_access(__hdl, __in_rng, __out_rng, __num_rng); - __hdl.parallel_for<_Name...>(sycl::nd_range<1>(__workgroup_size, __workgroup_size), - _KernelFunc{__in_rng, __out_rng, __num_rng, __n, __pred}); - }); - } -}; template Date: Tue, 4 Jun 2024 09:47:09 -0400 Subject: [PATCH 107/134] trying scalar version of copy_if Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 122 +++++++++++++++++- 1 file changed, 116 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h 
b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 3e54ce98bab..56fb5a0bb91 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -267,24 +267,24 @@ __lookback_phase(const _Group& __group, const _SubGroup& __subgroup, _StatusFlag _StatusValues __status_vals_full, _StatusValues __status_vals_partial, std::uint32_t __tile_id, _Type& __local_reduction, _Type& __prev_tile_reduction, _BinaryOp __binary_op) { - // The first sub-group will query the previous tiles to find a prefix - if (__subgroup.get_group_id() == 0) + // The last sub-group will query the previous tiles to find a prefix + if (__subgroup.get_group_id() == (__subgroup.get_group_range()[0] - 1)) { _FlagType __flag(__status_flags, __status_vals_full, __status_vals_partial, __tile_id); - if (__subgroup.get_local_id() == 0) + if (__subgroup.get_local_id() == __subgroup.get_local_range()[0] - 1) { __flag.set_partial(__local_reduction); } __prev_tile_reduction = __flag.cooperative_lookback(__subgroup, __binary_op); - if (__subgroup.get_local_id() == 0) + if (__subgroup.get_local_id() == __subgroup.get_local_range()[0] - 1) { __flag.set_full(__binary_op(__prev_tile_reduction, __local_reduction)); } } - __prev_tile_reduction = sycl::group_broadcast(__group, __prev_tile_reduction, 0); + __prev_tile_reduction = sycl::group_broadcast(__group, __prev_tile_reduction, __group.get_local_range()[0] - 1); } template +struct __copy_if_kernel_func_scalar +{ + static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; + using _SizeT = std::size_t; + using _BinaryOp = std::plus<_SizeT>; + using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + using _FlagStorageType = typename _FlagType::_FlagStorageType; + + _InRange __in_rng; + _OutRange __out_rng; + _NumRng __num_rng; + _SizeT __n; + _UnaryPredicate __pred; + _StatusFlags __status_flags; + std::size_t __status_flags_size; + _StatusValues __status_vals_full; + _StatusValues __status_vals_partial; + _TileValues __wg_copy_if_values; + std::size_t __current_num_wgs; + + [[sycl::reqd_sub_group_size(SUBGROUP_SIZE)]] void + operator()(const sycl::nd_item<1>& __item) const + { + auto __group = __item.get_group(); + auto __wg_local_id = __item.get_local_id(0); + auto __sg = __item.get_sub_group(); + + std::uint32_t __tile_id = 0; + + // Obtain unique ID for this work-group that will be used in decoupled lookback + if (__group.leader()) + { + sycl::atomic_ref<_FlagStorageType, sycl::memory_order::relaxed, sycl::memory_scope::device, + sycl::access::address_space::global_space> + __idx_atomic(__status_flags[__status_flags_size - 1]); + __tile_id = __idx_atomic.fetch_add(1); + } + + __tile_id = sycl::group_broadcast(__group, __tile_id, 0); + + std::size_t __current_offset = static_cast(__tile_id) * __elems_in_tile; + + std::uint16_t __wi_count = 0; + // Phase 1: Create __wg_count and construct in-order __wg_copy_if_values + + //TODO: check if it is better to check this at a subgroup or wg level rather than work item + if ((__wg_local_id + 1) * __data_per_workitem + __tile_id * __elems_in_tile <= __n) + { +#pragma unroll + for (size_t __i = 0; __i < __data_per_workitem; ++__i) + { + // TODO: explore scalar impl. Does this allow us to avoid the group broadcast (sync)? 
+ // if load is done in a scalar fashion and provides the same performance, we + // can avoid the broadcast (I think) + // would need to loop over the elements per work item first accumulating into + // satisfies pred, copying to "my slot" in SLM then do scan, then the copy to + // global memory needs to be loaded per work item per element, skipping copies + // when they were not saved. + _Type __val = __in_rng[__i + __wg_local_id * __data_per_workitem + __elems_in_tile * __tile_id]; + + if (__pred(__val)) + { + __wg_copy_if_values[__wi_count + __wg_local_id * __data_per_workitem] = __val; + ++__wi_count; + } + } + + } + else + { + // Edge of input, have to handle memory bounds + for (size_t __i = 0; __i + __wg_local_id * __data_per_workitem + __elems_in_tile * __tile_id < __n; ++__i) + { + if (__i + __wg_local_id + __elems_in_tile * __tile_id < __n) + { + _Type __val = __in_rng[__i + __wg_local_id * __data_per_workitem + __elems_in_tile * __tile_id]; + + if (__pred(__val)) + { + __wg_copy_if_values[__wi_count + __wg_local_id * __data_per_workitem] = __val; + ++__wi_count; + } + } + + } + } + _SizeT __wg_count = __wi_count; + __wg_count = sycl::exclusive_scan_over_group(__group, __wg_count, _BinaryOp{}); + + // Phase 2: Global scan across __wg_count + _SizeT __copied_elements = 0; + + __lookback_phase<_FlagType>(__group, __sg, __status_flags, __status_vals_full, __status_vals_partial, __tile_id, + __wg_count, __copied_elements, _BinaryOp{}); + + // Phase 3: copy values to global memory + for (int __i = 0; __i < __wi_count; ++__i) + { + __out_rng[__copied_elements + __wg_count + __i] = __wg_copy_if_values[__wi_count + __wg_local_id * __data_per_workitem]; + } + if (__tile_id == (__current_num_wgs - 1) && __wg_local_id == (__workgroup_size - 1)) + __num_rng[0] = __copied_elements + __wg_count + __wi_count; + } +}; + template struct __copy_if_submitter; @@ -636,7 +746,7 @@ struct __copy_if_submitter<__data_per_workitem, __workgroup_size, _FlagType, using _Type = oneapi::dpl::__internal::__value_t<_InRange>; using _LocalAccessorType = sycl::local_accessor<_Type, 1>; using _KernelFunc = - __copy_if_kernel_func<__data_per_workitem, __workgroup_size, _FlagType, std::decay_t<_InRange>, + __copy_if_kernel_func_scalar<__data_per_workitem, __workgroup_size, _FlagType, std::decay_t<_InRange>, std::decay_t<_OutRange>, std::decay_t<_NumSelectedRange>, _UnaryPredicate, std::decay_t<_StatusFlags>, std::decay_t<_StatusValues>, std::decay_t<_LocalAccessorType>>; From e49bc9f4140b26798a2db498291d086ef2e3c20b Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Tue, 4 Jun 2024 09:54:43 -0400 Subject: [PATCH 108/134] fix Signed-off-by: Dan Hoeflinger --- .../oneapi/dpl/experimental/kt/single_pass_scan.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 56fb5a0bb91..dfb633df854 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -551,7 +551,7 @@ struct __copy_if_kernel_func if ((__tile_id + 1) * __elems_in_tile <= __n) { #pragma unroll - for (size_t __i = 0; __i < __elems_in_tile; __i += __workgroup_size) + for (std::size_t __i = 0; __i < __elems_in_tile; __i += __workgroup_size) { // TODO: explore scalar impl. Does this allow us to avoid the group broadcast (sync)? 
// if load is done in a scalar fashion and provides the same performance, we @@ -576,7 +576,7 @@ struct __copy_if_kernel_func // Edge of input, have to handle memory bounds // Might have unneccessary group_barrier calls #pragma unroll - for (size_t __i = 0; __i < __elems_in_tile; __i += __workgroup_size) + for (std::size_t __i = 0; __i < __elems_in_tile; __i += __workgroup_size) { _SizeT __satisfies_pred = 0; oneapi::dpl::__internal::__lazy_ctor_storage<_Type> __val; @@ -607,7 +607,7 @@ struct __copy_if_kernel_func //TODO: explore above comment about scalar load // Phase 3: copy values to global memory - for (int __i = __wg_local_id; __i < __wg_count; __i += __workgroup_size) + for (std::size_t __i = __wg_local_id; __i < __wg_count; __i += __workgroup_size) { __out_rng[__copied_elements + __i] = __wg_copy_if_values[__i]; } @@ -670,7 +670,7 @@ struct __copy_if_kernel_func_scalar if ((__wg_local_id + 1) * __data_per_workitem + __tile_id * __elems_in_tile <= __n) { #pragma unroll - for (size_t __i = 0; __i < __data_per_workitem; ++__i) + for (std::uint16_t __i = 0; __i < __data_per_workitem; ++__i) { // TODO: explore scalar impl. Does this allow us to avoid the group broadcast (sync)? // if load is done in a scalar fashion and provides the same performance, we @@ -692,7 +692,7 @@ struct __copy_if_kernel_func_scalar else { // Edge of input, have to handle memory bounds - for (size_t __i = 0; __i + __wg_local_id * __data_per_workitem + __elems_in_tile * __tile_id < __n; ++__i) + for (std::uint16_t __i = 0; __i + __wg_local_id * __data_per_workitem + __elems_in_tile * __tile_id < __n; ++__i) { if (__i + __wg_local_id + __elems_in_tile * __tile_id < __n) { @@ -717,9 +717,9 @@ struct __copy_if_kernel_func_scalar __wg_count, __copied_elements, _BinaryOp{}); // Phase 3: copy values to global memory - for (int __i = 0; __i < __wi_count; ++__i) + for (std::uint16_t __i = 0; __i < __wi_count; ++__i) { - __out_rng[__copied_elements + __wg_count + __i] = __wg_copy_if_values[__wi_count + __wg_local_id * __data_per_workitem]; + __out_rng[__copied_elements + __wg_count + __i] = __wg_copy_if_values[__i + __wg_local_id * __data_per_workitem]; } if (__tile_id == (__current_num_wgs - 1) && __wg_local_id == (__workgroup_size - 1)) __num_rng[0] = __copied_elements + __wg_count + __wi_count; From 92438ee85f272d0bb47f25af3a6fc2596605d917 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Tue, 4 Jun 2024 09:57:25 -0400 Subject: [PATCH 109/134] fix Signed-off-by: Dan Hoeflinger --- .../oneapi/dpl/experimental/kt/single_pass_scan.h | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index dfb633df854..3f0da5a4407 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -694,17 +694,13 @@ struct __copy_if_kernel_func_scalar // Edge of input, have to handle memory bounds for (std::uint16_t __i = 0; __i + __wg_local_id * __data_per_workitem + __elems_in_tile * __tile_id < __n; ++__i) { - if (__i + __wg_local_id + __elems_in_tile * __tile_id < __n) - { - _Type __val = __in_rng[__i + __wg_local_id * __data_per_workitem + __elems_in_tile * __tile_id]; + _Type __val = __in_rng[__i + __wg_local_id * __data_per_workitem + __elems_in_tile * __tile_id]; - if (__pred(__val)) - { - __wg_copy_if_values[__wi_count + __wg_local_id * __data_per_workitem] = __val; - ++__wi_count; - } + if (__pred(__val)) + { + 
__wg_copy_if_values[__wi_count + __wg_local_id * __data_per_workitem] = __val; + ++__wi_count; } - } } _SizeT __wg_count = __wi_count; From 5495da2cf81c14c9d6bd62436d53ec6ef0b492a8 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Tue, 4 Jun 2024 10:05:24 -0400 Subject: [PATCH 110/134] full sum Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/experimental/kt/single_pass_scan.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 3f0da5a4407..858df2990ad 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -710,7 +710,7 @@ struct __copy_if_kernel_func_scalar _SizeT __copied_elements = 0; __lookback_phase<_FlagType>(__group, __sg, __status_flags, __status_vals_full, __status_vals_partial, __tile_id, - __wg_count, __copied_elements, _BinaryOp{}); + __wg_count + __wi_count, __copied_elements, _BinaryOp{}); // Phase 3: copy values to global memory for (std::uint16_t __i = 0; __i < __wi_count; ++__i) From a7ca1b5695be9d8834bb4be93fe59c8527e614a3 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Tue, 4 Jun 2024 10:07:05 -0400 Subject: [PATCH 111/134] switching arg to const ref Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/experimental/kt/single_pass_scan.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 858df2990ad..2be524cfe24 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -265,7 +265,7 @@ template Date: Tue, 4 Jun 2024 07:34:52 -0700 Subject: [PATCH 112/134] branch by tile, not by workitem --- include/oneapi/dpl/experimental/kt/single_pass_scan.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 2be524cfe24..4175d68725e 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -667,18 +667,11 @@ struct __copy_if_kernel_func_scalar // Phase 1: Create __wg_count and construct in-order __wg_copy_if_values //TODO: check if it is better to check this at a subgroup or wg level rather than work item - if ((__wg_local_id + 1) * __data_per_workitem + __tile_id * __elems_in_tile <= __n) + if ((__tile_id + 1) * __elems_in_tile <= __n) { #pragma unroll for (std::uint16_t __i = 0; __i < __data_per_workitem; ++__i) { - // TODO: explore scalar impl. Does this allow us to avoid the group broadcast (sync)? - // if load is done in a scalar fashion and provides the same performance, we - // can avoid the broadcast (I think) - // would need to loop over the elements per work item first accumulating into - // satisfies pred, copying to "my slot" in SLM then do scan, then the copy to - // global memory needs to be loaded per work item per element, skipping copies - // when they were not saved. 
_Type __val = __in_rng[__i + __wg_local_id * __data_per_workitem + __elems_in_tile * __tile_id]; if (__pred(__val)) From f26aff0fa4bc4858bb748f9aa64e09e6caf5287d Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Tue, 4 Jun 2024 11:48:58 -0400 Subject: [PATCH 113/134] removing unused block_strided version Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 122 +----------------- 1 file changed, 1 insertion(+), 121 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 4175d68725e..d70e0aca63d 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -499,7 +499,6 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r } } - template @@ -523,124 +522,6 @@ struct __copy_if_kernel_func _TileValues __wg_copy_if_values; std::size_t __current_num_wgs; - [[sycl::reqd_sub_group_size(SUBGROUP_SIZE)]] void - operator()(const sycl::nd_item<1>& __item) const - { - auto __group = __item.get_group(); - auto __wg_local_id = __item.get_local_id(0); - auto __sg = __item.get_sub_group(); - - std::uint32_t __tile_id = 0; - - // Obtain unique ID for this work-group that will be used in decoupled lookback - if (__group.leader()) - { - sycl::atomic_ref<_FlagStorageType, sycl::memory_order::relaxed, sycl::memory_scope::device, - sycl::access::address_space::global_space> - __idx_atomic(__status_flags[__status_flags_size - 1]); - __tile_id = __idx_atomic.fetch_add(1); - } - - __tile_id = sycl::group_broadcast(__group, __tile_id, 0); - - std::size_t __current_offset = static_cast(__tile_id) * __elems_in_tile; - - _SizeT __wg_count = 0; - - // Phase 1: Create __wg_count and construct in-order __wg_copy_if_values - if ((__tile_id + 1) * __elems_in_tile <= __n) - { -#pragma unroll - for (std::size_t __i = 0; __i < __elems_in_tile; __i += __workgroup_size) - { - // TODO: explore scalar impl. Does this allow us to avoid the group broadcast (sync)? - // if load is done in a scalar fashion and provides the same performance, we - // can avoid the broadcast (I think) - // would need to loop over the elements per work item first accumulating into - // satisfies pred, copying to "my slot" in SLM then do scan, then the copy to - // global memory needs to be loaded per work item per element, skipping copies - // when they were not saved. 
- _Type __val = __in_rng[__i + __wg_local_id + __elems_in_tile * __tile_id]; - - _SizeT __satisfies_pred = __pred(__val); - _SizeT __count = sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); - - if (__satisfies_pred) - __wg_copy_if_values[__count] = __val; - - __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __workgroup_size - 1); - } - } - else - { - // Edge of input, have to handle memory bounds - // Might have unneccessary group_barrier calls -#pragma unroll - for (std::size_t __i = 0; __i < __elems_in_tile; __i += __workgroup_size) - { - _SizeT __satisfies_pred = 0; - oneapi::dpl::__internal::__lazy_ctor_storage<_Type> __val; - if (__i + __wg_local_id + __elems_in_tile * __tile_id < __n) - { - new (&__val.__v) _Type(__in_rng[__i + __wg_local_id + __elems_in_tile * __tile_id]); - - __satisfies_pred = __pred(__val.__v); - } - _SizeT __count = sycl::exclusive_scan_over_group(__group, __satisfies_pred, __wg_count, _BinaryOp{}); - - if (__i + __wg_local_id + __elems_in_tile * __tile_id < __n) - { - if (__satisfies_pred) - __wg_copy_if_values[__count] = std::move(__val.__v); - __val.__v.~_Type(); - } - - __wg_count = sycl::group_broadcast(__group, __count + __satisfies_pred, __workgroup_size - 1); - } - } - - // Phase 2: Global scan across __wg_count - _SizeT __copied_elements = 0; - - __lookback_phase<_FlagType>(__group, __sg, __status_flags, __status_vals_full, __status_vals_partial, __tile_id, - __wg_count, __copied_elements, _BinaryOp{}); - - //TODO: explore above comment about scalar load - // Phase 3: copy values to global memory - for (std::size_t __i = __wg_local_id; __i < __wg_count; __i += __workgroup_size) - { - __out_rng[__copied_elements + __i] = __wg_copy_if_values[__i]; - } - if (__tile_id == (__current_num_wgs - 1) && __group.leader()) - __num_rng[0] = __copied_elements + __wg_count; - } -}; - - - -template -struct __copy_if_kernel_func_scalar -{ - static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; - using _SizeT = std::size_t; - using _BinaryOp = std::plus<_SizeT>; - using _Type = oneapi::dpl::__internal::__value_t<_InRange>; - using _FlagStorageType = typename _FlagType::_FlagStorageType; - - _InRange __in_rng; - _OutRange __out_rng; - _NumRng __num_rng; - _SizeT __n; - _UnaryPredicate __pred; - _StatusFlags __status_flags; - std::size_t __status_flags_size; - _StatusValues __status_vals_full; - _StatusValues __status_vals_partial; - _TileValues __wg_copy_if_values; - std::size_t __current_num_wgs; - [[sycl::reqd_sub_group_size(SUBGROUP_SIZE)]] void operator()(const sycl::nd_item<1>& __item) const { @@ -666,7 +547,6 @@ struct __copy_if_kernel_func_scalar std::uint16_t __wi_count = 0; // Phase 1: Create __wg_count and construct in-order __wg_copy_if_values - //TODO: check if it is better to check this at a subgroup or wg level rather than work item if ((__tile_id + 1) * __elems_in_tile <= __n) { #pragma unroll @@ -735,7 +615,7 @@ struct __copy_if_submitter<__data_per_workitem, __workgroup_size, _FlagType, using _Type = oneapi::dpl::__internal::__value_t<_InRange>; using _LocalAccessorType = sycl::local_accessor<_Type, 1>; using _KernelFunc = - __copy_if_kernel_func_scalar<__data_per_workitem, __workgroup_size, _FlagType, std::decay_t<_InRange>, + __copy_if_kernel_func<__data_per_workitem, __workgroup_size, _FlagType, std::decay_t<_InRange>, std::decay_t<_OutRange>, std::decay_t<_NumSelectedRange>, _UnaryPredicate, std::decay_t<_StatusFlags>, std::decay_t<_StatusValues>, 
std::decay_t<_LocalAccessorType>>; From c0ab6512e0cfa36cb77ae7f0f0e18d28112afe4f Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Tue, 4 Jun 2024 11:55:37 -0400 Subject: [PATCH 114/134] range API and formatting Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 37 +++++++++++++------ 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index d70e0aca63d..42f7fcc7f78 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -560,12 +560,12 @@ struct __copy_if_kernel_func ++__wi_count; } } - } else { // Edge of input, have to handle memory bounds - for (std::uint16_t __i = 0; __i + __wg_local_id * __data_per_workitem + __elems_in_tile * __tile_id < __n; ++__i) + for (std::uint16_t __i = 0; __i + __wg_local_id * __data_per_workitem + __elems_in_tile * __tile_id < __n; + ++__i) { _Type __val = __in_rng[__i + __wg_local_id * __data_per_workitem + __elems_in_tile * __tile_id]; @@ -576,7 +576,7 @@ struct __copy_if_kernel_func } } } - _SizeT __wg_count = __wi_count; + _SizeT __wg_count = __wi_count; __wg_count = sycl::exclusive_scan_over_group(__group, __wg_count, _BinaryOp{}); // Phase 2: Global scan across __wg_count @@ -588,7 +588,8 @@ struct __copy_if_kernel_func // Phase 3: copy values to global memory for (std::uint16_t __i = 0; __i < __wi_count; ++__i) { - __out_rng[__copied_elements + __wg_count + __i] = __wg_copy_if_values[__i + __wg_local_id * __data_per_workitem]; + __out_rng[__copied_elements + __wg_count + __i] = + __wg_copy_if_values[__i + __wg_local_id * __data_per_workitem]; } if (__tile_id == (__current_num_wgs - 1) && __wg_local_id == (__workgroup_size - 1)) __num_rng[0] = __copied_elements + __wg_count + __wi_count; @@ -718,25 +719,39 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ } // namespace __impl -template +sycl::event +copy_if(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumCopiedRange&& __num_rng, + _UnaryPredicate __pred, _KernelParam __param = {}) +{ + auto __in_view = oneapi::dpl::__ranges::views::all(std::forward<_InRange>(__in_rng)); + auto __out_view = oneapi::dpl::__ranges::views::all(std::forward<_OutRange>(__out_rng)); + auto __num_view = oneapi::dpl::__ranges::views::all(std::forward<_OutRange>(__num_rng)); + + return __impl::single_pass_copy_if_impl(__queue, std::move(__in_view), std::move(__out_view), std::move(__num_view), + __pred, __param); +} + +template sycl::event copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, - _NumSelectedRange __num_begin, _UnaryPredicate __pred, _KernelParam __param = {}) + _NumCopiedIterator __num_begin, _UnaryPredicate __pred, _KernelParam __param = {}) { auto __n = __in_end - __in_begin; auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); - auto __buf1 = __keep1(__in_begin, __in_end); + auto __buf_in = __keep1(__in_begin, __in_end); auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); - auto __buf2 = __keep2(__out_begin, __out_begin + __n); + auto __buf_out = __keep2(__out_begin, __out_begin + __n); auto __keep_num = - oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _NumSelectedRange>(); + 
oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _NumCopiedIterator>(); auto __buf_num = __keep2(__num_begin, __num_begin + 1); - return __impl::single_pass_copy_if_impl(__queue, __buf1.all_view(), __buf2.all_view(), __buf_num.all_view(), __pred, - __param); + return __impl::single_pass_copy_if_impl(__queue, __buf_in.all_view(), __buf_out.all_view(), __buf_num.all_view(), + __pred, __param); } template From 517c34163d532783eeea3777db4dee7642b0f52f Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Tue, 4 Jun 2024 14:25:23 -0400 Subject: [PATCH 115/134] removing unnecessary stuff Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 13 +- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 1 - .../hetero/dpcpp/parallel_backend_sycl_scan.h | 723 ------------------ include/oneapi/dpl/pstl/utils.h | 8 - .../numeric/numeric.ops/scan_kt.pass.cpp | 66 -- 5 files changed, 2 insertions(+), 809 deletions(-) delete mode 100644 include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h delete mode 100644 test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 42f7fcc7f78..466fe8548d9 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -39,12 +39,6 @@ namespace __impl template class __copy_if_kernel; -template -class __copy_if_single_wg_kernel; - -template -class __inclusive_scan_kernel; - template class __lookback_init_kernel; @@ -420,7 +414,7 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r using _FlagType = __scan_status_flag<_Type>; using _FlagStorageType = typename _FlagType::_FlagStorageType; - using _KernelName = __inclusive_scan_kernel; + using _KernelName = typename _KernelParam::kernel_name; using _LookbackInitKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __lookback_init_kernel<_KernelName, _Type, _BinaryOp>>; using _LookbackKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< @@ -656,9 +650,6 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ using _CopyIfKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__copy_if_kernel<_KernelName, _Type>>; - using _CopyIfSingleWgKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __copy_if_single_wg_kernel<_KernelName, _Type>>; - const std::size_t __n = __in_rng.size(); // Next power of 2 greater than or equal to __n @@ -670,7 +661,7 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ if (oneapi::dpl::__par_backend_hetero::__group_copy_if_fits_in_slm(__queue, __n, __n_uniform)) { return oneapi::dpl::__par_backend_hetero::__dispatch_small_copy_if( - oneapi::dpl::execution::__dpl::make_device_policy<_CopyIfSingleWgKernel>(__queue), __n, + oneapi::dpl::execution::__dpl::make_device_policy<_KernelName>(__queue), __n, std::forward<_InRange>(__in_rng), std::forward<_OutRange>(__out_rng), std::forward<_NumSelectedRange>(__num_rng), __pred); } diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 6edd2625080..39318972cb4 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -46,7 +46,6 @@ #endif #include 
"sycl_traits.h" //SYCL traits specialization for some oneDPL types. -#include "parallel_backend_sycl_scan.h" namespace oneapi { diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h deleted file mode 100644 index 8752c4baf0e..00000000000 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_scan.h +++ /dev/null @@ -1,723 +0,0 @@ -// -*- C++ -*- -//===----------------------------------------------------------------------===// -// -// Copyright (C) Intel Corporation -// -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// This file incorporates work covered by the following copyright and permission -// notice: -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// -//===----------------------------------------------------------------------===// - -#ifndef _ONEDPL_parallel_backend_sycl_scan_H -#define _ONEDPL_parallel_backend_sycl_scan_H - -#include -#include - -namespace oneapi::dpl::experimental::kt -{ - -inline namespace igpu { - -constexpr ::std::size_t SUBGROUP_SIZE = 32; - -template typename LookbackScanMemory, - typename TileId> -struct ScanMemoryManager -{ - using _TileIdT = typename TileId::_TileIdT; - using _LookbackScanMemory = LookbackScanMemory; - using _FlagT = typename _LookbackScanMemory::_FlagT; - - ScanMemoryManager(sycl::queue q) : q{q} {}; - - ::std::uint8_t* - scan_memory_ptr() noexcept - { - return scan_memory_begin; - }; - - _TileIdT* - tile_id_ptr() noexcept - { - return tile_id_begin; - }; - - void - allocate(::std::size_t num_wgs) - { - ::std::size_t scan_memory_size = _LookbackScanMemory::get_memory_size(num_wgs); - constexpr ::std::size_t padded_tileid_size = TileId::get_padded_memory_size(); - constexpr ::std::size_t tileid_size = TileId::get_memory_size(); - - auto mem_size_bytes = scan_memory_size + padded_tileid_size; - - scratch = sycl::malloc_device<::std::uint8_t>(mem_size_bytes, q); - - scan_memory_begin = scratch; - - void* base_tileid_ptr = reinterpret_cast(scan_memory_begin + scan_memory_size); - size_t remainder = mem_size_bytes - scan_memory_size; - - tile_id_begin = reinterpret_cast<_TileIdT*>( - ::std::align(::std::alignment_of_v<_TileIdT>, tileid_size, base_tileid_ptr, remainder)); - } - - sycl::event - async_free(sycl::event dependency) - { - return q.submit( - [e = dependency, ptr = scratch, q_ = q](sycl::handler& hdl) - { - hdl.depends_on(e); - hdl.host_task([=]() { sycl::free(ptr, q_); }); - }); - } - - void - free() - { - sycl::free(scratch, q); - } - - private: - ::std::uint8_t* scratch = nullptr; - ::std::uint8_t* scan_memory_begin = nullptr; - _TileIdT* tile_id_begin = nullptr; - - sycl::queue q; -}; - -template -struct LookbackScanMemory; - -template -struct LookbackScanMemory<_T, /* UseAtomic64=*/::std::false_type> -{ - using _FlagT = ::std::uint32_t; - using _AtomicFlagRefT = sycl::atomic_ref<_FlagT, sycl::memory_order::acq_rel, sycl::memory_scope::device, - sycl::access::address_space::global_space>; - - static constexpr _FlagT NOT_READY = 0; - static constexpr _FlagT PARTIAL_MASK = 1; - static constexpr _FlagT FULL_MASK = 2; - static constexpr _FlagT OUT_OF_BOUNDS = 4; - - static constexpr ::std::size_t padding = SUBGROUP_SIZE; - - // LookbackScanMemory: [Partial Value, ..., Full Value, ..., Flag, ...] 
- // Each section has num_wgs + padding elements - LookbackScanMemory(::std::uint8_t* scan_memory_begin, ::std::size_t num_wgs) - : num_elements(get_num_elements(num_wgs)), tile_values_begin(reinterpret_cast<_T*>(scan_memory_begin)), - flags_begin(get_flags_begin(scan_memory_begin, num_wgs)) - { - } - - void - set_partial(::std::size_t tile_id, _T val) - { - _AtomicFlagRefT atomic_flag(*(flags_begin + tile_id + padding)); - - tile_values_begin[tile_id + padding] = val; - atomic_flag.store(PARTIAL_MASK); - } - - void - set_full(::std::size_t tile_id, _T val) - { - _AtomicFlagRefT atomic_flag(*(flags_begin + tile_id + padding)); - - tile_values_begin[tile_id + padding + num_elements] = val; - atomic_flag.store(FULL_MASK); - } - - _AtomicFlagRefT - get_flag(::std::size_t tile_id) const - { - return _AtomicFlagRefT(*(flags_begin + tile_id + padding)); - } - - _T - get_value(::std::size_t tile_id, _FlagT flag) const - { - // full_value and partial_value are num_elements apart - return *(tile_values_begin + tile_id + padding + num_elements * is_full(flag)); - } - - static ::std::size_t - get_tile_values_bytes(::std::size_t num_elements) - { - return (2 * num_elements) * sizeof(_T); - } - - static ::std::size_t - get_flag_bytes(::std::size_t num_elements) - { - return num_elements * sizeof(_FlagT); - } - - static ::std::size_t - get_padded_flag_bytes(::std::size_t num_elements) - { - // sizeof(_FlagT) extra bytes for possible intenal alignment - return get_flag_bytes(num_elements) + sizeof(_FlagT); - } - - static _FlagT* - get_flags_begin(::std::uint8_t* scan_memory_begin, ::std::size_t num_wgs) - { - // Aligned flags - ::std::size_t num_elements = get_num_elements(num_wgs); - ::std::size_t tile_values_bytes = get_tile_values_bytes(num_elements); - void* base_flags = reinterpret_cast(scan_memory_begin + tile_values_bytes); - auto remainder = get_padded_flag_bytes(num_elements); // scan_memory_bytes - tile_values_bytes - return reinterpret_cast<_FlagT*>( - ::std::align(::std::alignment_of_v<_FlagT>, get_flag_bytes(num_elements), base_flags, remainder)); - } - - static ::std::size_t - get_memory_size(::std::size_t num_wgs) - { - ::std::size_t num_elements = get_num_elements(num_wgs); - // sizeof(_T) extra bytes are not needed because LookbackScanMemory is going at the beginning of the scratch - ::std::size_t tile_values_bytes = get_tile_values_bytes(num_elements); - // Padding to provide room for aligment - ::std::size_t flag_bytes = get_padded_flag_bytes(num_elements); - - return tile_values_bytes + flag_bytes; - } - - static ::std::size_t - get_num_elements(::std::size_t num_wgs) - { - return padding + num_wgs; - } - - static bool - is_ready(_FlagT flag) - { - return flag != NOT_READY; - } - - static bool - is_full(_FlagT flag) - { - return flag == FULL_MASK; - } - - static bool - is_out_of_bounds(_FlagT flag) - { - return flag == OUT_OF_BOUNDS; - } - - private: - ::std::size_t num_elements; - _FlagT* flags_begin; - _T* tile_values_begin; -}; - -template -struct LookbackScanMemory<_T, /* UseAtomic64=*/::std::true_type> -{ - using _FlagT = ::std::uint64_t; - using _AtomicFlagRefT = sycl::atomic_ref<_FlagT, sycl::memory_order::relaxed, sycl::memory_scope::device, - sycl::access::address_space::global_space>; - - // Each flag is divided in 2 32bit values - // 32..63 status bits - // 00..31 value bits - // Example: status = full scanned value, int value = 15: - // 1000 0000 0000 0000 0000 0000 0000 0000 | 0000 0000 0000 0000 0000 0000 0000 1111 - - // Status values: - // 00xxxx - not computed - // 
01xxxx - partial - // 10xxxx - full - // 110000 - out of bounds - - static constexpr _FlagT NOT_READY = 0; - static constexpr _FlagT PARTIAL_MASK = 1l << (sizeof(_FlagT) * 8 - 2); - static constexpr _FlagT FULL_MASK = 1l << (sizeof(_FlagT) * 8 - 1); - static constexpr _FlagT OUT_OF_BOUNDS = PARTIAL_MASK | FULL_MASK; - - static constexpr _FlagT VALUE_MASK = (1l << sizeof(::std::uint32_t) * 8) - 1; // 32 bit mask to store value - - static constexpr ::std::size_t padding = SUBGROUP_SIZE; - - LookbackScanMemory(::std::uint8_t* scan_memory_begin, ::std::size_t num_wgs) - : num_elements(get_num_elements(num_wgs)), flags_begin(get_flags_begin(scan_memory_begin, num_wgs)) - { - } - - void - set_partial(::std::size_t tile_id, _T val) - { - _AtomicFlagRefT atomic_flag(*(flags_begin + tile_id + padding)); - - atomic_flag.store(PARTIAL_MASK | static_cast<::std::uint32_t>(val)); - } - - void - set_full(::std::size_t tile_id, _T val) - { - _AtomicFlagRefT atomic_flag(*(flags_begin + tile_id + padding)); - - atomic_flag.store(FULL_MASK | static_cast<::std::uint32_t>(val)); - } - - _AtomicFlagRefT - get_flag(::std::size_t tile_id) const - { - return _AtomicFlagRefT(*(flags_begin + tile_id + padding)); - } - - _T - get_value(::std::size_t, _FlagT flag) const - { - return static_cast<_T>(flag & VALUE_MASK); - } - - static _FlagT* - get_flags_begin(::std::uint8_t* scan_memory_begin, ::std::size_t) - { - return reinterpret_cast<_FlagT*>(scan_memory_begin); - } - - static ::std::size_t - get_memory_size(::std::size_t num_wgs) - { - ::std::size_t num_elements = get_num_elements(num_wgs); - return num_elements * sizeof(_FlagT); - } - - static ::std::size_t - get_num_elements(::std::size_t num_wgs) - { - return padding + num_wgs; - } - - static bool - is_ready(_FlagT flag) - { - // flag & OUT_OF_BOUNDS != NOT_READY means it has either partial or full value, or is out of bounds - return (flag & OUT_OF_BOUNDS) != NOT_READY; - } - - static bool - is_full(_FlagT flag) - { - return (flag & OUT_OF_BOUNDS) == FULL_MASK; - } - - static bool - is_out_of_bounds(_FlagT flag) - { - return (flag & OUT_OF_BOUNDS) == OUT_OF_BOUNDS; - } - - private: - ::std::size_t num_elements; - _FlagT* flags_begin; -}; - -struct TileId -{ - using _TileIdT = ::std::uint32_t; - using _AtomicTileRefT = sycl::atomic_ref<_TileIdT, sycl::memory_order::relaxed, sycl::memory_scope::device, - sycl::access::address_space::global_space>; - - TileId(_TileIdT* tileid_memory) : tile_counter(*(tileid_memory)) {} - - constexpr static ::std::size_t - get_padded_memory_size() - { - // extra sizeof(_TileIdT) for possible aligment issues - return sizeof(_TileIdT) + sizeof(_TileIdT); - } - - constexpr static ::std::size_t - get_memory_size() - { - // extra sizeof(_TileIdT) for possible aligment issues - return sizeof(_TileIdT); - } - - _TileIdT - fetch_inc() - { - return tile_counter.fetch_add(1); - } - - _AtomicTileRefT tile_counter; -}; - -struct cooperative_lookback -{ - - template typename LookbackScanMemory, typename UseAtomic64> - _T - operator()(std::uint32_t tile_id, const _Subgroup& subgroup, BinOp bin_op, - LookbackScanMemory<_T, UseAtomic64> memory) - { - using _LookbackScanMemory = LookbackScanMemory<_T, UseAtomic64>; - using FlagT = typename _LookbackScanMemory::_FlagT; - - _T sum = 0; - constexpr int offset = -1; - int local_id = subgroup.get_local_id(); - - for (int tile = static_cast(tile_id) + offset; tile >= 0; tile -= SUBGROUP_SIZE) - { - auto atomic_flag = memory.get_flag(tile - local_id); // - FlagT flag; - do - { - flag = atomic_flag.load(); 
- } while (!sycl::all_of_group(subgroup, _LookbackScanMemory::is_ready(flag) || - (tile - local_id < 0))); // Loop till all ready - - bool is_full = _LookbackScanMemory::is_full(flag); - auto is_full_ballot = sycl::ext::oneapi::group_ballot(subgroup, is_full); - auto lowest_item_with_full = is_full_ballot.find_low(); - - // TODO: Use identity_fn for out of bounds values - _T contribution = local_id <= lowest_item_with_full && (tile - local_id >= 0) - ? memory.get_value(tile - local_id, flag) - : _T{0}; - - // Sum all of the partial results from the tiles found, as well as the full contribution from the closest tile (if any) - sum = bin_op(sum, contribution); - // If we found a full value, we can stop looking at previous tiles. Otherwise, - // keep going through tiles until we either find a full tile or we've completely - // recomputed the prefix using partial values - if (is_full_ballot.any()) - break; - - } - sum = sycl::reduce_over_group(subgroup, sum, bin_op); - - return sum; - } -}; - -template -void -single_pass_scan_impl_single_wg(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op) -{ - using _Type = oneapi::dpl::__internal::__value_t<_InRange>; - - static_assert(std::is_same_v<_Inclusive, ::std::true_type>, "Single-pass scan only available for inclusive scan"); - - const ::std::size_t n = __in_rng.size(); - - constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; - constexpr ::std::size_t elems_per_workitem = _KernelParam::data_per_workitem; - // Avoid non_uniform n by padding up to a multiple of wgsize - constexpr ::std::uint32_t elems_in_tile = wgsize * elems_per_workitem; - constexpr ::std::size_t num_workitems = wgsize; - - auto event = __queue.submit([&](sycl::handler& hdl) { - auto tile_vals = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); - - oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); - hdl.parallel_for( - sycl::nd_range<1>(num_workitems, wgsize), [= - ](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { - auto group = item.get_group(); - ::std::uint32_t local_id = item.get_local_id(0); - constexpr ::std::uint32_t stride = wgsize; - auto subgroup = item.get_sub_group(); - - constexpr std::uint32_t tile_id = 0; - constexpr std::uint32_t wg_begin = 0; - constexpr std::uint32_t wg_end = elems_in_tile; - - std::uint32_t wg_local_memory_size = elems_in_tile; - - auto out_begin = __out_rng.begin(); - _Type carry = 0; - - // Global load into local - if (wg_end > n) - wg_local_memory_size = n; - - //TODO: assumes default ctor produces identity w.r.t. 
__binary_op - // _Type my_reducer{}; - if (wg_end <= n) - { -#pragma unroll - for (std::uint32_t step = 0; step < elems_per_workitem; ++step) - { - ::std::uint32_t i = stride * step; - _Type in_val = __in_rng[i + local_id]; - // my_reducer = __binary_op(my_reducer, in_val); - _Type out = sycl::inclusive_scan_over_group(group, in_val, __binary_op, carry); - out_begin[i + local_id] = out; - carry = group_broadcast(group, out, stride - 1); - } - } - else - { -#pragma unroll - for (std::uint32_t step = 0; step < elems_per_workitem; ++step) - { - ::std::uint32_t i = stride * step; - _Type in_val; - - if (i + local_id < n) - { - in_val = __in_rng[i + local_id]; - // my_reducer = __binary_op(my_reducer, in_val); - } - _Type out = sycl::inclusive_scan_over_group(group, in_val, __binary_op, carry); - if (i + local_id < n) - { - out_begin[i + local_id] = out; - } - carry = group_broadcast(group, out, stride - 1); - } - } - }); - }); - - event.wait(); -} - -template -void -single_pass_scan_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op) -{ - using _Type = oneapi::dpl::__internal::__value_t<_InRange>; - using _TileIdT = TileId::_TileIdT; - using _LookbackScanMemory = LookbackScanMemory<_Type, _UseAtomic64>; - using _FlagT = typename _LookbackScanMemory::_FlagT; - - static_assert(std::is_same_v<_Inclusive, ::std::true_type>, "Single-pass scan only available for inclusive scan"); - - const ::std::size_t n = __in_rng.size(); - - constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; - constexpr ::std::size_t elems_per_workitem = _KernelParam::data_per_workitem; - // Avoid non_uniform n by padding up to a multiple of wgsize - constexpr ::std::uint32_t elems_in_tile = wgsize * elems_per_workitem; - ::std::size_t num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(n, elems_in_tile); - ::std::size_t num_workitems = num_wgs * wgsize; - - ScanMemoryManager<_Type, _UseAtomic64, LookbackScanMemory, TileId> scratch(__queue); - scratch.allocate(num_wgs); - - // Memory Structure: - // [Lookback Scan Memory, Tile Id Counter] - auto scan_memory_begin = scratch.scan_memory_ptr(); - auto status_flags_begin = _LookbackScanMemory::get_flags_begin(scan_memory_begin, num_wgs); - auto tile_id_begin = scratch.tile_id_ptr(); - - ::std::size_t num_elements = _LookbackScanMemory::get_num_elements(num_wgs); - // fill_num_wgs num_elements + 1 to also initialize tile_id_counter - ::std::size_t fill_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(num_elements + 1, wgsize); - - auto fill_event = __queue.memset(status_flags_begin, 0, num_elements * sizeof(_FlagT) + 1 * sizeof(_TileIdT)); - - auto event = __queue.submit([&](sycl::handler& hdl) { - auto tile_id_lacc = sycl::local_accessor(sycl::range<1>{1}, hdl); - auto tile_vals = sycl::local_accessor<_Type, 1>(sycl::range<1>{elems_in_tile}, hdl); - hdl.depends_on(fill_event); - - oneapi::dpl::__ranges::__require_access(hdl, __in_rng, __out_rng); - hdl.parallel_for( - sycl::nd_range<1>(num_workitems, wgsize), [= - ](const sycl::nd_item<1>& item) [[intel::reqd_sub_group_size(SUBGROUP_SIZE)]] { - auto group = item.get_group(); - ::std::uint32_t local_id = item.get_local_id(0); - constexpr ::std::uint32_t stride = wgsize; - auto subgroup = item.get_sub_group(); - - std::uint32_t tile_id; - if constexpr (std::is_same_v<_UseDynamicTileID, ::std::true_type>) - { - // Obtain unique ID for this work-group that will be used in decoupled lookback - TileId dynamic_tile_id(tile_id_begin); - if (group.leader()) - { - tile_id_lacc[0] = 
dynamic_tile_id.fetch_inc(); - } - sycl::group_barrier(group); - tile_id = tile_id_lacc[0]; - } - else - { - tile_id = group.get_group_linear_id(); - } - - // Global load into local - auto wg_current_offset = (tile_id * elems_in_tile); - auto wg_next_offset = ((tile_id + 1) * elems_in_tile); - auto wg_local_memory_size = elems_in_tile; - - if (wg_next_offset > n) - wg_local_memory_size = n - wg_current_offset; - //TODO: assumes default ctor produces identity w.r.t. __binary_op - _Type my_reducer{}; - if (wg_next_offset <= n) - { -#pragma unroll - for (std::uint32_t i = 0; i < elems_per_workitem; ++i) - { - _Type in_val = __in_rng[wg_current_offset + local_id + stride * i]; - my_reducer = __binary_op(my_reducer, in_val); - tile_vals[local_id + stride * i] = in_val; - } - } - else - { -#pragma unroll - for (std::uint32_t i = 0; i < elems_per_workitem; ++i) - { - if (wg_current_offset + local_id + stride * i < n) - { - _Type in_val = __in_rng[wg_current_offset + local_id + stride * i]; - my_reducer = __binary_op(my_reducer, in_val); - tile_vals[local_id + stride * i] = in_val; - } - } - } - - auto local_sum = sycl::reduce_over_group(group, my_reducer, __binary_op); - - auto in_begin = tile_vals.template get_multi_ptr().get(); - auto out_begin = __out_rng.begin() + wg_current_offset; - - _Type prev_sum = 0; - - // The first sub-group will query the previous tiles to find a prefix - if (subgroup.get_group_id() == 0) - { - _LookbackScanMemory scan_mem(scan_memory_begin, num_wgs); - - if (group.leader()) - scan_mem.set_partial(tile_id, local_sum); - - // Find lowest work-item that has a full result (if any) and sum up subsequent partial results to obtain this tile's exclusive sum - prev_sum = cooperative_lookback()(tile_id, subgroup, __binary_op, scan_mem); - - if (group.leader()) - scan_mem.set_full(tile_id, prev_sum + local_sum); - } - - _Type carry = sycl::group_broadcast(group, prev_sum, 0); -// TODO: Find a fix for _ONEDPL_PRAGMA_UNROLL -#pragma unroll - for (::std::uint32_t step = 0; step < elems_per_workitem; ++step) - { - ::std::uint32_t i = stride * step; - _Type x; - if (i + local_id < wg_local_memory_size) - { - x = in_begin[i + local_id]; - } - _Type out = sycl::inclusive_scan_over_group(group, x, __binary_op, carry); - if (i + local_id < wg_local_memory_size) - { - out_begin[i + local_id] = out; - } - carry = group_broadcast(group, out, stride - 1); - } - }); - }); - - scratch.async_free(event); - - event.wait(); -} - -template -void -single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, - _BinaryOp __binary_op) -{ - auto __n = __in_end - __in_begin; - - auto __keep1 = - oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); - auto __buf1 = __keep1(__in_begin, __in_end); - auto __keep2 = - oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); - auto __buf2 = __keep2(__out_begin, __out_begin + __n); - - // Avoid aspect query overhead for sizeof(Types) > 32 bits - if constexpr (sizeof(typename std::iterator_traits<_InIterator>::value_type) <= sizeof(std::uint32_t)) - { - if (__queue.get_device().has(sycl::aspect::atomic64)) - { - single_pass_scan_impl<_KernelParam, _Inclusive, /* UseAtomic64 */ std::true_type, - /* UseDynamicTileID */ std::false_type>(__queue, __buf1.all_view(), __buf2.all_view(), - __binary_op); - } - else - { - single_pass_scan_impl<_KernelParam, _Inclusive, /* UseAtomic64 */ std::false_type, - /* UseDynamicTileID */ 
std::false_type>(__queue, __buf1.all_view(), __buf2.all_view(), - __binary_op); - } - } - else - { - single_pass_scan_impl<_KernelParam, _Inclusive, /* UseAtomic64 */ std::false_type, - /* UseDynamicTileID */ std::false_type>(__queue, __buf1.all_view(), __buf2.all_view(), - __binary_op); - } -} - -template -void -single_pass_single_wg_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, - _OutIterator __out_begin, _BinaryOp __binary_op) -{ - auto __n = __in_end - __in_begin; - - auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _InIterator>(); - auto __buf1 = __keep1(__in_begin, __in_end); - auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); - auto __buf2 = __keep2(__out_begin, __out_begin + __n); - - // Avoid aspect query overhead for sizeof(Types) > 32 bits - single_pass_scan_impl_single_wg<_KernelParam, /* Inclusive */ std::true_type>(__queue, __buf1.all_view(), - __buf2.all_view(), __binary_op); -} - -template -void -single_pass_inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, - _BinaryOp __binary_op) -{ - constexpr ::std::size_t wgsize = _KernelParam::workgroup_size; - constexpr ::std::size_t elems_per_workitem = _KernelParam::data_per_workitem; - // Avoid non_uniform n by padding up to a multiple of wgsize - constexpr ::std::uint32_t elems_in_tile = wgsize * elems_per_workitem; - auto __n = __in_end - __in_begin; - - if (__n <= elems_in_tile) - { - single_pass_single_wg_inclusive_scan<_KernelParam, /* Inclusive */ std::true_type>( - __queue, __in_begin, __in_end, __out_begin, __binary_op); - } - else - { - single_pass_inclusive_scan<_KernelParam, /* Inclusive */ std::true_type>(__queue, __in_begin, __in_end, - __out_begin, __binary_op); - } -} - -} // inline namespace igpu - -} // namespace oneapi::dpl::experimental::kt - -#endif /* _ONEDPL_parallel_backend_sycl_scan_H */ diff --git a/include/oneapi/dpl/pstl/utils.h b/include/oneapi/dpl/pstl/utils.h index c68e74e6ef7..e8bbde63c04 100644 --- a/include/oneapi/dpl/pstl/utils.h +++ b/include/oneapi/dpl/pstl/utils.h @@ -765,14 +765,6 @@ struct __is_iterator_type<_T, std::void_t::dif template static constexpr bool __is_iterator_type_v = __is_iterator_type<_T>::value; -//For use to lazily create objects values of type _Tp without requiring a default constructibility of _Tp -template -union __lazy_ctor_storage -{ - _Tp __v; - __lazy_ctor_storage() {} -}; - } // namespace __internal } // namespace dpl } // namespace oneapi diff --git a/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp deleted file mode 100644 index b3407581f37..00000000000 --- a/test/parallel_api/numeric/numeric.ops/scan_kt.pass.cpp +++ /dev/null @@ -1,66 +0,0 @@ -// -*- C++ -*- -//===-- scan.pass.cpp -----------------------------------------------------===// -// -// Copyright (C) Intel Corporation -// -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// This file incorporates work covered by the following copyright and permission -// notice: -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// -//===----------------------------------------------------------------------===// - -#include "support/test_config.h" - -#include _PSTL_TEST_HEADER(execution) -#include _PSTL_TEST_HEADER(numeric) - -int -main() -{ - bool all_passed = true; - sycl::queue q; - - for (int logn : {4, 8, 11, 16, 19, 21}) - { - std::cout << "Testing 2^" << logn << std::endl; - int n = 1 << logn; - std::vector v(n, 1); - int* in_ptr = sycl::malloc_device(n, q); - int* out_ptr = sycl::malloc_device(n, q); - - q.copy(v.data(), in_ptr, n).wait(); - using KernelParams = oneapi::dpl::experimental::kt::kernel_param<8, 128, class ScanKernel>; - oneapi::dpl::experimental::kt::single_pass_inclusive_scan(q, in_ptr, in_ptr+n, out_ptr, ::std::plus()); - - std::vector tmp(n, 0); - q.copy(out_ptr, tmp.data(), n); - q.wait(); - - std::inclusive_scan(v.begin(), v.end(), v.begin()); - - bool passed = true; - for (size_t i = 0; i < n; ++i) - { - if (tmp[i] != v[i]) - { - passed = false; - std::cout << "expected " << i << ' ' << v[i] << ' ' << tmp[i] << '\n'; - } - } - - if (passed) - std::cout << " passed" << std::endl; - else - std::cout << " failed" << std::endl; - - all_passed &= passed; - sycl::free(in_ptr, q); - sycl::free(out_ptr, q); - } - - return !all_passed; -} From 427a5f462ac5a97bc82e84bf7a7893bbe4031410 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Tue, 4 Jun 2024 14:31:11 -0400 Subject: [PATCH 116/134] naming consistency Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 72 +++++++++---------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 466fe8548d9..bd7cd8a9cfb 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -282,15 +282,15 @@ __lookback_phase(const _Group& __group, const _SubGroup& __subgroup, _StatusFlag } template struct __lookback_kernel_func { using _FlagStorageType = typename _FlagType::_FlagStorageType; static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; - _InRange __in_rng; - _OutRange __out_rng; + _InRng __in_rng; + _OutRng __out_rng; _BinaryOp __binary_op; std::size_t __n; _StatusFlags __status_flags; @@ -378,17 +378,17 @@ struct __lookback_scan_submitter<__data_per_workitem, __workgroup_size, _Type, _ oneapi::dpl::__par_backend_hetero::__internal::__optional_kernel_name<_Name...>> { - template + template sycl::event - operator()(sycl::queue __q, sycl::event __prev_event, _InRange&& __in_rng, _OutRange&& __out_rng, + operator()(sycl::queue __q, sycl::event __prev_event, _InRng&& __in_rng, _OutRng&& __out_rng, _BinaryOp __binary_op, std::size_t __n, _StatusFlags&& __status_flags, std::size_t __status_flags_size, _StatusValues&& __status_vals_full, _StatusValues&& __status_vals_partial, std::size_t __current_num_items) const { using _LocalAccessorType = sycl::local_accessor<_Type, 1>; using _KernelFunc = - __lookback_kernel_func<__data_per_workitem, __workgroup_size, _Type, _FlagType, std::decay_t<_InRange>, - std::decay_t<_OutRange>, std::decay_t<_BinaryOp>, std::decay_t<_StatusFlags>, + __lookback_kernel_func<__data_per_workitem, __workgroup_size, _Type, _FlagType, std::decay_t<_InRng>, + std::decay_t<_OutRng>, std::decay_t<_BinaryOp>, std::decay_t<_StatusFlags>, std::decay_t<_StatusValues>, std::decay_t<_LocalAccessorType>>; static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; @@ 
-406,11 +406,11 @@ struct __lookback_scan_submitter<__data_per_workitem, __workgroup_size, _Type, _ } }; -template +template sycl::event -__single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op, _KernelParam) +__single_pass_scan(sycl::queue __queue, _InRng&& __in_rng, _OutRng&& __out_rng, _BinaryOp __binary_op, _KernelParam) { - using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + using _Type = oneapi::dpl::__internal::__value_t<_InRng>; using _FlagType = __scan_status_flag<_Type>; using _FlagStorageType = typename _FlagType::_FlagStorageType; @@ -443,7 +443,7 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r return oneapi::dpl::__par_backend_hetero::__parallel_transform_scan_single_group( oneapi::dpl::__internal::__device_backend_tag{}, oneapi::dpl::execution::__dpl::make_device_policy(__queue), - std::forward<_InRange>(__in_rng), std::forward<_OutRange>(__out_rng), __n, + std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, oneapi::dpl::__internal::__no_op{}, unseq_backend::__no_init_value<_Type>{}, __binary_op, std::true_type{}); } @@ -493,19 +493,19 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r } } -template struct __copy_if_kernel_func { static constexpr std::uint32_t __elems_in_tile = __workgroup_size * __data_per_workitem; using _SizeT = std::size_t; using _BinaryOp = std::plus<_SizeT>; - using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + using _Type = oneapi::dpl::__internal::__value_t<_InRng>; using _FlagStorageType = typename _FlagType::_FlagStorageType; - _InRange __in_rng; - _OutRange __out_rng; + _InRng __in_rng; + _OutRng __out_rng; _NumRng __num_rng; _SizeT __n; _UnaryPredicate __pred; @@ -598,20 +598,20 @@ struct __copy_if_submitter<__data_per_workitem, __workgroup_size, _FlagType, oneapi::dpl::__par_backend_hetero::__internal::__optional_kernel_name<_Name...>> { - template sycl::event - operator()(sycl::queue __q, _Event __fill_event, _InRange&& __in_rng, _OutRange&& __out_rng, - _NumSelectedRange&& __num_rng, std::size_t __n, _UnaryPredicate __pred, _StatusFlags&& __status_flags, + operator()(sycl::queue __q, _Event __fill_event, _InRng&& __in_rng, _OutRng&& __out_rng, + _NumCopiedRng&& __num_rng, std::size_t __n, _UnaryPredicate __pred, _StatusFlags&& __status_flags, std::size_t __status_flags_size, _StatusValues&& __status_vals_full, _StatusValues&& __status_vals_partial, std::size_t __current_num_items, std::size_t __current_num_wgs) const { - using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + using _Type = oneapi::dpl::__internal::__value_t<_InRng>; using _LocalAccessorType = sycl::local_accessor<_Type, 1>; using _KernelFunc = - __copy_if_kernel_func<__data_per_workitem, __workgroup_size, _FlagType, std::decay_t<_InRange>, - std::decay_t<_OutRange>, std::decay_t<_NumSelectedRange>, _UnaryPredicate, + __copy_if_kernel_func<__data_per_workitem, __workgroup_size, _FlagType, std::decay_t<_InRng>, + std::decay_t<_OutRng>, std::decay_t<_NumCopiedRng>, _UnaryPredicate, std::decay_t<_StatusFlags>, std::decay_t<_StatusValues>, std::decay_t<_LocalAccessorType>>; @@ -631,15 +631,15 @@ struct __copy_if_submitter<__data_per_workitem, __workgroup_size, _FlagType, } }; -template sycl::event -single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumSelectedRange __num_rng, +single_pass_copy_if_impl(sycl::queue __queue, _InRng&& __in_rng, _OutRng&& __out_rng, _NumCopiedRng 
__num_rng, _UnaryPredicate __pred, _KernelParam) { using _SizeT = uint64_t; using _KernelName = __copy_if_kernel; - using _Type = oneapi::dpl::__internal::__value_t<_InRange>; + using _Type = oneapi::dpl::__internal::__value_t<_InRng>; using _FlagType = __scan_status_flag<_SizeT>; using _BinaryOp = std::plus<_SizeT>; @@ -662,8 +662,8 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ { return oneapi::dpl::__par_backend_hetero::__dispatch_small_copy_if( oneapi::dpl::execution::__dpl::make_device_policy<_KernelName>(__queue), __n, - std::forward<_InRange>(__in_rng), std::forward<_OutRange>(__out_rng), - std::forward<_NumSelectedRange>(__num_rng), __pred); + std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), + std::forward<_NumCopiedRng>(__num_rng), __pred); } } constexpr std::size_t __workgroup_size = _KernelParam::workgroup_size; @@ -710,15 +710,15 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& _ } // namespace __impl -template sycl::event -copy_if(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _NumCopiedRange&& __num_rng, +copy_if(sycl::queue __queue, _InRng&& __in_rng, _OutRng&& __out_rng, _NumCopiedRng&& __num_rng, _UnaryPredicate __pred, _KernelParam __param = {}) { - auto __in_view = oneapi::dpl::__ranges::views::all(std::forward<_InRange>(__in_rng)); - auto __out_view = oneapi::dpl::__ranges::views::all(std::forward<_OutRange>(__out_rng)); - auto __num_view = oneapi::dpl::__ranges::views::all(std::forward<_OutRange>(__num_rng)); + auto __in_view = oneapi::dpl::__ranges::views::all(std::forward<_InRng>(__in_rng)); + auto __out_view = oneapi::dpl::__ranges::views::all(std::forward<_OutRng>(__out_rng)); + auto __num_view = oneapi::dpl::__ranges::views::all(std::forward<_OutRng>(__num_rng)); return __impl::single_pass_copy_if_impl(__queue, std::move(__in_view), std::move(__out_view), std::move(__num_view), __pred, __param); @@ -745,13 +745,13 @@ copy_if(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutI __pred, __param); } -template +template sycl::event -inclusive_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op, +inclusive_scan(sycl::queue __queue, _InRng&& __in_rng, _OutRng&& __out_rng, _BinaryOp __binary_op, _KernelParam __param = {}) { - auto __in_view = oneapi::dpl::__ranges::views::all(std::forward<_InRange>(__in_rng)); - auto __out_view = oneapi::dpl::__ranges::views::all(std::forward<_OutRange>(__out_rng)); + auto __in_view = oneapi::dpl::__ranges::views::all(std::forward<_InRng>(__in_rng)); + auto __out_view = oneapi::dpl::__ranges::views::all(std::forward<_OutRng>(__out_rng)); return __impl::__single_pass_scan(__queue, std::move(__in_view), std::move(__out_view), __binary_op, __param); } From e9091e1c85d3e71cbd25572cae58a79f4f9ca3b3 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Tue, 4 Jun 2024 14:33:34 -0400 Subject: [PATCH 117/134] formatting Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 27 +++++++++---------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index bd7cd8a9cfb..b735f1e77fe 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -380,8 +380,8 @@ struct __lookback_scan_submitter<__data_per_workitem, __workgroup_size, _Type, _ template sycl::event - 
operator()(sycl::queue __q, sycl::event __prev_event, _InRng&& __in_rng, _OutRng&& __out_rng, - _BinaryOp __binary_op, std::size_t __n, _StatusFlags&& __status_flags, std::size_t __status_flags_size, + operator()(sycl::queue __q, sycl::event __prev_event, _InRng&& __in_rng, _OutRng&& __out_rng, _BinaryOp __binary_op, + std::size_t __n, _StatusFlags&& __status_flags, std::size_t __status_flags_size, _StatusValues&& __status_vals_full, _StatusValues&& __status_vals_partial, std::size_t __current_num_items) const { @@ -598,14 +598,13 @@ struct __copy_if_submitter<__data_per_workitem, __workgroup_size, _FlagType, oneapi::dpl::__par_backend_hetero::__internal::__optional_kernel_name<_Name...>> { - template + template sycl::event - operator()(sycl::queue __q, _Event __fill_event, _InRng&& __in_rng, _OutRng&& __out_rng, - _NumCopiedRng&& __num_rng, std::size_t __n, _UnaryPredicate __pred, _StatusFlags&& __status_flags, - std::size_t __status_flags_size, _StatusValues&& __status_vals_full, - _StatusValues&& __status_vals_partial, std::size_t __current_num_items, - std::size_t __current_num_wgs) const + operator()(sycl::queue __q, _Event __fill_event, _InRng&& __in_rng, _OutRng&& __out_rng, _NumCopiedRng&& __num_rng, + std::size_t __n, _UnaryPredicate __pred, _StatusFlags&& __status_flags, std::size_t __status_flags_size, + _StatusValues&& __status_vals_full, _StatusValues&& __status_vals_partial, + std::size_t __current_num_items, std::size_t __current_num_wgs) const { using _Type = oneapi::dpl::__internal::__value_t<_InRng>; using _LocalAccessorType = sycl::local_accessor<_Type, 1>; @@ -631,8 +630,7 @@ struct __copy_if_submitter<__data_per_workitem, __workgroup_size, _FlagType, } }; -template +template sycl::event single_pass_copy_if_impl(sycl::queue __queue, _InRng&& __in_rng, _OutRng&& __out_rng, _NumCopiedRng __num_rng, _UnaryPredicate __pred, _KernelParam) @@ -710,11 +708,10 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRng&& __in_rng, _OutRng&& __out } // namespace __impl -template +template sycl::event -copy_if(sycl::queue __queue, _InRng&& __in_rng, _OutRng&& __out_rng, _NumCopiedRng&& __num_rng, - _UnaryPredicate __pred, _KernelParam __param = {}) +copy_if(sycl::queue __queue, _InRng&& __in_rng, _OutRng&& __out_rng, _NumCopiedRng&& __num_rng, _UnaryPredicate __pred, + _KernelParam __param = {}) { auto __in_view = oneapi::dpl::__ranges::views::all(std::forward<_InRng>(__in_rng)); auto __out_view = oneapi::dpl::__ranges::views::all(std::forward<_OutRng>(__out_rng)); From 7315d2bc4720b7b8d4aebe1ffb97d0d20fe95fae Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Tue, 4 Jun 2024 14:34:57 -0400 Subject: [PATCH 118/134] reverting overreach Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/experimental/kt/single_pass_scan.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index b735f1e77fe..5204f5d4f7f 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -406,11 +406,11 @@ struct __lookback_scan_submitter<__data_per_workitem, __workgroup_size, _Type, _ } }; -template +template sycl::event -__single_pass_scan(sycl::queue __queue, _InRng&& __in_rng, _OutRng&& __out_rng, _BinaryOp __binary_op, _KernelParam) +__single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op, _KernelParam) { - using _Type = 
oneapi::dpl::__internal::__value_t<_InRng>; + using _Type = oneapi::dpl::__internal::__value_t<_InRange>; using _FlagType = __scan_status_flag<_Type>; using _FlagStorageType = typename _FlagType::_FlagStorageType; From 0983043aba315f0743982f0fe9c2f492afd6fdb5 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 5 Jun 2024 14:03:32 -0400 Subject: [PATCH 119/134] upgrading tests to match scan, cmake Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 4 +- test/kt/CMakeLists.txt | 47 ++-- test/kt/single_pass_copy_if.cpp | 266 ++++++++++++++++++ test/kt/single_pass_scan.cpp | 4 +- 4 files changed, 298 insertions(+), 23 deletions(-) create mode 100644 test/kt/single_pass_copy_if.cpp diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 5204f5d4f7f..bc04d1da4dd 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -443,7 +443,7 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r return oneapi::dpl::__par_backend_hetero::__parallel_transform_scan_single_group( oneapi::dpl::__internal::__device_backend_tag{}, oneapi::dpl::execution::__dpl::make_device_policy(__queue), - std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, + std::forward<_InRange>(__in_rng), std::forward<_OutRange>(__out_rng), __n, oneapi::dpl::__internal::__no_op{}, unseq_backend::__no_init_value<_Type>{}, __binary_op, std::true_type{}); } @@ -715,7 +715,7 @@ copy_if(sycl::queue __queue, _InRng&& __in_rng, _OutRng&& __out_rng, _NumCopiedR { auto __in_view = oneapi::dpl::__ranges::views::all(std::forward<_InRng>(__in_rng)); auto __out_view = oneapi::dpl::__ranges::views::all(std::forward<_OutRng>(__out_rng)); - auto __num_view = oneapi::dpl::__ranges::views::all(std::forward<_OutRng>(__num_rng)); + auto __num_view = oneapi::dpl::__ranges::views::all(std::forward<_NumCopiedRng>(__num_rng)); return __impl::single_pass_copy_if_impl(__queue, std::move(__in_view), std::move(__out_view), std::move(__num_view), __pred, __param); diff --git a/test/kt/CMakeLists.txt b/test/kt/CMakeLists.txt index 3e312108819..3159cfdae7e 100644 --- a/test/kt/CMakeLists.txt +++ b/test/kt/CMakeLists.txt @@ -130,50 +130,57 @@ if (ONEDPL_TEST_ENABLE_KT_ESIMD) _generate_esimd_sort_test("esimd_radix_sort" "256" "32" "double" "" 1000) # segfault endif() -function (_generate_gpu_scan_test _data_per_work_item _work_group_size _type) - - if ((NOT TARGET build-scan-kt-tests) AND (NOT TARGET run-scan-kt-tests)) - add_custom_target(build-scan-kt-tests COMMENT "Build all scan kernel template tests") - add_custom_target(run-scan-kt-tests - COMMAND "${CMAKE_CTEST_COMMAND}" -R "^run-scan-kt-tests$" --output-on-failure --no-label-summary - DEPENDS build-scan-kt-tests - COMMENT "Build and run all scan kernel template tests") +function (_generate_gpu_single_pass_test _alg _data_per_work_item _work_group_size _type _single_wg_optout) + + if ((NOT TARGET "build-${_alg}-kt-tests") AND (NOT TARGET "run-${_alg}-kt-tests")) + add_custom_target("build-${_alg}-kt-tests" COMMENT "Build all ${_alg} kernel template tests") + add_custom_target("run-${_alg}-kt-tests" + COMMAND "${CMAKE_CTEST_COMMAND}" -R "^run-${_alg}-kt-tests$" --output-on-failure --no-label-summary + DEPENDS "build-${_alg}-kt-tests" + COMMENT "Build and run all ${_alg} kernel template tests") endif() string(REPLACE "_t" "" _type_short ${_type}) - set(_target_name 
"single_pass_scan_dpwi${_data_per_work_item}_wgs${_work_group_size}_${_type_short}") - set(_test_path "single_pass_scan.cpp") + set(_target_name "single_pass_${_alg}_dpwi${_data_per_work_item}_wgs${_work_group_size}_${_type_short}_${_single_wg_optout}") + set(_test_path "single_pass_${_alg}.cpp") #_generate_test_randomly(${_target_name} ${_test_path} ${_probability_permille}) _generate_test(${_target_name} ${_test_path}) if(TARGET ${_target_name}) - add_dependencies(build-scan-kt-tests ${_target_name}) - add_dependencies(run-scan-kt-tests ${_target_name}) + add_dependencies("build-${_alg}-kt-tests" ${_target_name}) + add_dependencies("run-${_alg}-kt-tests" ${_target_name}) target_compile_definitions(${_target_name} PRIVATE TEST_DATA_PER_WORK_ITEM=${_data_per_work_item}) target_compile_definitions(${_target_name} PRIVATE TEST_WORK_GROUP_SIZE=${_work_group_size}) + target_compile_definitions(${_target_name} PRIVATE TEST_SINGLE_WG_OPTOUT=${_single_wg_optout}) target_compile_definitions(${_target_name} PRIVATE TEST_TYPE=${_type}) endif() endfunction() -function(_generate_gpu_scan_tests) +function(_generate_gpu_single_pass_tests) + set(_alg_all "scan" "copy_if") set(_data_per_work_item_all "1" "2" "4" "8" "16" "32") set(_work_group_size_all "64" "128" "256" "512" "1024") set(_type_all "uint32_t" "int32_t" "float" "int64_t" "uint64_t" "double") - foreach (_data_per_work_item ${_data_per_work_item_all}) - foreach (_work_group_size ${_work_group_size_all}) - foreach (_type ${_type_all}) - _generate_gpu_scan_test(${_data_per_work_item} ${_work_group_size} ${_type}) + foreach (_alg ${_alg_all}) + foreach (_data_per_work_item ${_data_per_work_item_all}) + foreach (_work_group_size ${_work_group_size_all}) + foreach (_type ${_type_all}) + _generate_gpu_single_pass_test(${_alg} ${_data_per_work_item} ${_work_group_size} ${_type} "false") + endforeach() endforeach() endforeach() + # to not double the number of tests, check single wg output with a single test per alg + _generate_gpu_single_pass_test(${_alg} "8" "512" "float" "true") + + _generate_test("single_pass_${_alg}" "single_pass_${_alg}.cpp") + target_compile_definitions("single_pass_${_alg}" PRIVATE TEST_DATA_PER_WORK_ITEM=8 TEST_WORK_GROUP_SIZE=256 TEST_TYPE=uint32_t TEST_SINGLE_WG_OPTOUT=false) endforeach() - _generate_test("single_pass_scan" "single_pass_scan.cpp") - target_compile_definitions("single_pass_scan" PRIVATE TEST_DATA_PER_WORK_ITEM=8 TEST_WORK_GROUP_SIZE=256 TEST_TYPE=uint32_t) endfunction() if (ONEDPL_TEST_ENABLE_KT_SYCL) - _generate_gpu_scan_tests() + _generate_gpu_single_pass_tests() endif() diff --git a/test/kt/single_pass_copy_if.cpp b/test/kt/single_pass_copy_if.cpp new file mode 100644 index 00000000000..86449355ac2 --- /dev/null +++ b/test/kt/single_pass_copy_if.cpp @@ -0,0 +1,266 @@ +// -*- C++ -*- +//===-- single_pass_copy_if.cpp -------------------------------------------===// +// +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// This file incorporates work covered by the following copyright and permission +// notice: +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// +//===----------------------------------------------------------------------===// + +#include "../support/test_config.h" + +#include + +#if LOG_TEST_INFO +# include +#endif + +#if _ENABLE_RANGES_TESTING +# include +#endif + +#include "../support/utils.h" +#include "../support/sycl_alloc_utils.h" +#include "../support/scan_serial_impl.h" + +#include "esimd_radix_sort_utils.h" + +#include +#include +#include +#include +#include +#include + +inline const std::vector copy_if_sizes = { + 1, 6, 16, 43, 256, 316, 2048, + 5072, 8192, 14001, 1 << 14, (1 << 14) + 1, 50000, 67543, + 100'000, 1 << 17, 179'581, 250'000, 1 << 18, (1 << 18) + 1, 500'000, + 888'235, 1'000'000, 1 << 20, 10'000'000}; + +template +struct __less_than_val +{ + const T __val; + __less_than_val() : __val{std::is_signed_v ? 0 : std::numeric_limits::max()/T{2}} + { + } + __less_than_val(const T& __v) : __val{__v} + { + } + bool + operator()(const T& __v) const + { + return __v < __val; + } +}; + +template +auto +generate_copy_if_data(T* input, std::size_t size, std::uint32_t seed) +{ + // Integer numbers are generated even for floating point types in order to avoid rounding errors, + // and simplify the final check + using substitute_t = std::conditional_t, std::int64_t, std::uint64_t>; + + std::default_random_engine gen{seed}; + std::uniform_int_distribution dist(std::numeric_limits::lowest(), std::numeric_limits::max()); + std::generate(input, input + size, [&] { return dist(gen); }); +} + +#if _ENABLE_RANGES_TESTING +template +void +test_all_view(sycl::queue q, std::size_t size, Predicate pred, KernelParam param) +{ +# if LOG_TEST_INFO + std::cout << "\ttest_all_view(" << size << ") : " << TypeInfo().name() << std::endl; +# endif + std::vector input(size); + generate_copy_if_data(input.data(), size, 42); + std::vector ref(input); + std::vector out(size); + sycl::buffer buf_out(input.size()); + std::size_t num_copied = 0; + sycl::buffer buf_num_copied(&num_copied, 1); + auto out_end = std::copy_if(std::begin(ref), std::end(ref), std::begin(out), pred); + std::size_t num_copied_ref = out_end - std::begin(out); + { + sycl::buffer buf(input.data(), input.size()); + + oneapi::dpl::experimental::ranges::all_view view(buf); + oneapi::dpl::experimental::ranges::all_view view_out(buf_out); + oneapi::dpl::experimental::ranges::all_view view_num_copied(buf_num_copied); + oneapi::dpl::experimental::kt::gpu::copy_if(q, view, view_out, view_num_copied, pred, param).wait(); + } + + auto acc = buf_out.get_host_access(); + auto num_copied_acc = buf_num_copied.get_host_access(); + + std::string msg1 = "wrong num copied with all_view, n: " + std::to_string(size); + EXPECT_EQ(num_copied_ref, num_copied_acc[0], msg1.c_str()); + std::string msg2 = "wrong results with all_view, n: " + std::to_string(size); + EXPECT_EQ_RANGES(ref, acc, msg2.c_str()); +} + +template +void +test_buffer(sycl::queue q, std::size_t size, Predicate pred, KernelParam param) +{ +# if LOG_TEST_INFO + std::cout << "\ttest_buffer(" << size << ") : " << TypeInfo().name() << std::endl; +# endif + std::vector input(size); + generate_copy_if_data(input.data(), size, 42); + std::vector ref(input); + std::vector out_ref(size); + sycl::buffer buf_out(size); + std::size_t num_copied = 0; + sycl::buffer buf_num_copied(&num_copied, 1); + auto out_end = std::copy_if(std::begin(ref), std::end(ref), std::begin(out_ref), pred); + std::size_t num_copied_ref = out_end - std::begin(out_ref); + { + sycl::buffer buf(input.data(), input.size()); + + 
oneapi::dpl::experimental::kt::gpu::copy_if(q, buf, buf_out, buf_num_copied, pred, param).wait(); + } + + auto acc = buf_out.get_host_access(); + auto num_copied_acc = buf_num_copied.get_host_access(); + + std::string msg1 = "wrong num copied with buffer, n: " + std::to_string(size); + EXPECT_EQ(num_copied_ref, num_copied_acc[0], msg1.c_str()); + std::string msg2 = "wrong results with buffer, n: " + std::to_string(size); + EXPECT_EQ_RANGES(ref, acc, msg2.c_str()); + +} +#endif + +template +void +test_usm(sycl::queue q, std::size_t size, Predicate pred, KernelParam param) +{ +#if LOG_TEST_INFO + std::cout << "\t\ttest_usm<" << TypeInfo().name() << ", " << USMAllocPresentation().name<_alloc_type>() << ">(" + << size << ");" << std::endl; +#endif + std::vector expected(size); + generate_copy_if_data(expected.data(), size, 42); + std::vector out_ref(size); + + TestUtils::usm_data_transfer<_alloc_type, T> dt_input(q, expected.begin(), expected.end()); + TestUtils::usm_data_transfer<_alloc_type, T> dt_output(q, size); + TestUtils::usm_data_transfer<_alloc_type, std::size_t> dt_num_copied(q, 1); + + std::size_t num_copied = 0; + auto out_end = std::copy_if(std::begin(expected), std::end(expected), std::begin(out_ref), pred); + std::size_t num_copied_ref = out_end - std::begin(out_ref); + + oneapi::dpl::experimental::kt::gpu::copy_if(q, dt_input.get_data(), dt_input.get_data() + size, + dt_output.get_data(), dt_num_copied.get_data(), pred, param) + .wait(); + + std::vector actual(size); + dt_output.retrieve_data(actual.begin()); + std::vector num_copied_host(1); + dt_num_copied.retrieve_data(num_copied_host.begin()); + + std::string msg1 = "wrong num copied with USM, n: " + std::to_string(size); + EXPECT_EQ(num_copied_ref, num_copied_host[0], msg1.c_str()); + std::string msg2 = "wrong results with USM, n: " + std::to_string(size); + EXPECT_EQ_N(expected.begin(), actual.begin(), size, msg2.c_str()); +} + +/////////////////// + +template +void +test_sycl_iterators(sycl::queue q, std::size_t size, Predicate pred, KernelParam param) +{ +#if LOG_TEST_INFO + std::cout << "\t\ttest_sycl_iterators<" << TypeInfo().name() << ">(" << size << ");" << std::endl; +#endif + std::vector input(size); + std::vector output(size); + generate_copy_if_data(input.data(), size, 42); + std::vector ref(input); + std::vector out_ref(size); + std::size_t num_copied = 0; + auto out_end = std::copy_if(std::begin(ref), std::end(ref), std::begin(out_ref), pred); + std::size_t num_copied_ref = out_end - std::begin(out_ref); + { + sycl::buffer buf(input.data(), input.size()); + sycl::buffer buf_out(output.data(), output.size()); + sycl::buffer buf_num(&num_copied, 1); + oneapi::dpl::experimental::kt::gpu::copy_if(q, oneapi::dpl::begin(buf), oneapi::dpl::end(buf), + oneapi::dpl::begin(buf_out), oneapi::dpl::begin(buf_num), pred, + param) + .wait(); + } + + std::string msg1 = "wrong num copied with oneapi::dpl::begin/end, n: " + std::to_string(size); + EXPECT_EQ(num_copied_ref, num_copied, msg1.c_str()); + std::string msg2 = "wrong results with oneapi::dpl::begin/end, n: " + std::to_string(size); + EXPECT_EQ_RANGES(ref, output, msg2.c_str()); +} + +template +void +test_general_cases(sycl::queue q, std::size_t size, Predicate pred, KernelParam param) +{ + test_usm(q, size, pred, TestUtils::get_new_kernel_params<0>(param)); + test_usm(q, size, pred, TestUtils::get_new_kernel_params<1>(param)); + test_sycl_iterators(q, size, pred, TestUtils::get_new_kernel_params<2>(param)); +#if _ENABLE_RANGES_TESTING + test_all_view(q, size, pred, 
TestUtils::get_new_kernel_params<3>(param)); + test_buffer(q, size, pred, TestUtils::get_new_kernel_params<4>(param)); +#endif +} + +template +void +test_all_cases(sycl::queue q, std::size_t size, Predicate pred, KernelParam param) +{ + test_general_cases(q, size, pred, TestUtils::get_new_kernel_params<0>(param)); + +} + +int +main() +{ +#if LOG_TEST_INFO + std::cout << "TEST_DATA_PER_WORK_ITEM : " << TEST_DATA_PER_WORK_ITEM << "\n" + << "TEST_WORK_GROUP_SIZE : " << TEST_WORK_GROUP_SIZE << "\n" + << "TEST_SINGLE_WG_OPTOUT : " << TEST_SINGLE_WG_OPTOUT << "\n" + << "TEST_TYPE : " << TypeInfo().name() << std::endl; +#endif + + constexpr oneapi::dpl::experimental::kt::kernel_param> params; + auto q = TestUtils::get_test_queue(); + bool run_test = can_run_test(q, params); + + auto __predicate = __less_than_val{}; + if (run_test) + { + + try + { + for (auto size : copy_if_sizes) + test_all_cases(q, size, __predicate, params); + } + catch (const std::exception& exc) + { + std::cerr << "Exception: " << exc.what() << std::endl; + return EXIT_FAILURE; + } + } + + return TestUtils::done(run_test); +} diff --git a/test/kt/single_pass_scan.cpp b/test/kt/single_pass_scan.cpp index 860db88d2b3..a92b451cdcf 100644 --- a/test/kt/single_pass_scan.cpp +++ b/test/kt/single_pass_scan.cpp @@ -206,10 +206,12 @@ main() #if LOG_TEST_INFO std::cout << "TEST_DATA_PER_WORK_ITEM : " << TEST_DATA_PER_WORK_ITEM << "\n" << "TEST_WORK_GROUP_SIZE : " << TEST_WORK_GROUP_SIZE << "\n" + << "TEST_SINGLE_WG_OPTOUT : " << TEST_SINGLE_WG_OPTOUT << "\n" << "TEST_TYPE : " << TypeInfo().name() << std::endl; #endif - constexpr oneapi::dpl::experimental::kt::kernel_param params; + constexpr oneapi::dpl::experimental::kt::kernel_param> params; auto q = TestUtils::get_test_queue(); bool run_test = can_run_test(q, params); From e09ccaebf7957ea22265482a8e0b437fc875abcd Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 5 Jun 2024 14:34:34 -0400 Subject: [PATCH 120/134] test bugfix Signed-off-by: Dan Hoeflinger --- test/kt/single_pass_copy_if.cpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/test/kt/single_pass_copy_if.cpp b/test/kt/single_pass_copy_if.cpp index 86449355ac2..2fdd04e0c32 100644 --- a/test/kt/single_pass_copy_if.cpp +++ b/test/kt/single_pass_copy_if.cpp @@ -85,12 +85,12 @@ test_all_view(sycl::queue q, std::size_t size, Predicate pred, KernelParam param std::vector input(size); generate_copy_if_data(input.data(), size, 42); std::vector ref(input); - std::vector out(size); + std::vector out_ref(size); sycl::buffer buf_out(input.size()); std::size_t num_copied = 0; sycl::buffer buf_num_copied(&num_copied, 1); - auto out_end = std::copy_if(std::begin(ref), std::end(ref), std::begin(out), pred); - std::size_t num_copied_ref = out_end - std::begin(out); + auto out_end = std::copy_if(std::begin(ref), std::end(ref), std::begin(out_ref), pred); + std::size_t num_copied_ref = out_end - std::begin(out_ref); { sycl::buffer buf(input.data(), input.size()); @@ -106,7 +106,7 @@ test_all_view(sycl::queue q, std::size_t size, Predicate pred, KernelParam param std::string msg1 = "wrong num copied with all_view, n: " + std::to_string(size); EXPECT_EQ(num_copied_ref, num_copied_acc[0], msg1.c_str()); std::string msg2 = "wrong results with all_view, n: " + std::to_string(size); - EXPECT_EQ_RANGES(ref, acc, msg2.c_str()); + EXPECT_EQ_RANGES(out_ref, acc, msg2.c_str()); } template @@ -137,7 +137,7 @@ test_buffer(sycl::queue q, std::size_t size, Predicate pred, KernelParam param) std::string msg1 = 
"wrong num copied with buffer, n: " + std::to_string(size); EXPECT_EQ(num_copied_ref, num_copied_acc[0], msg1.c_str()); std::string msg2 = "wrong results with buffer, n: " + std::to_string(size); - EXPECT_EQ_RANGES(ref, acc, msg2.c_str()); + EXPECT_EQ_RANGES(out_ref, acc, msg2.c_str()); } #endif @@ -150,16 +150,16 @@ test_usm(sycl::queue q, std::size_t size, Predicate pred, KernelParam param) std::cout << "\t\ttest_usm<" << TypeInfo().name() << ", " << USMAllocPresentation().name<_alloc_type>() << ">(" << size << ");" << std::endl; #endif - std::vector expected(size); - generate_copy_if_data(expected.data(), size, 42); + std::vector in_ref(size); + generate_copy_if_data(in_ref.data(), size, 42); std::vector out_ref(size); - TestUtils::usm_data_transfer<_alloc_type, T> dt_input(q, expected.begin(), expected.end()); + TestUtils::usm_data_transfer<_alloc_type, T> dt_input(q, in_ref.begin(), in_ref.end()); TestUtils::usm_data_transfer<_alloc_type, T> dt_output(q, size); TestUtils::usm_data_transfer<_alloc_type, std::size_t> dt_num_copied(q, 1); std::size_t num_copied = 0; - auto out_end = std::copy_if(std::begin(expected), std::end(expected), std::begin(out_ref), pred); + auto out_end = std::copy_if(std::begin(in_ref), std::end(in_ref), std::begin(out_ref), pred); std::size_t num_copied_ref = out_end - std::begin(out_ref); oneapi::dpl::experimental::kt::gpu::copy_if(q, dt_input.get_data(), dt_input.get_data() + size, @@ -174,7 +174,7 @@ test_usm(sycl::queue q, std::size_t size, Predicate pred, KernelParam param) std::string msg1 = "wrong num copied with USM, n: " + std::to_string(size); EXPECT_EQ(num_copied_ref, num_copied_host[0], msg1.c_str()); std::string msg2 = "wrong results with USM, n: " + std::to_string(size); - EXPECT_EQ_N(expected.begin(), actual.begin(), size, msg2.c_str()); + EXPECT_EQ_N(out_ref.begin(), actual.begin(), size, msg2.c_str()); } /////////////////// @@ -207,7 +207,7 @@ test_sycl_iterators(sycl::queue q, std::size_t size, Predicate pred, KernelParam std::string msg1 = "wrong num copied with oneapi::dpl::begin/end, n: " + std::to_string(size); EXPECT_EQ(num_copied_ref, num_copied, msg1.c_str()); std::string msg2 = "wrong results with oneapi::dpl::begin/end, n: " + std::to_string(size); - EXPECT_EQ_RANGES(ref, output, msg2.c_str()); + EXPECT_EQ_RANGES(out_ref, output, msg2.c_str()); } template From 96acd30ca2456ff589e6d9a8d78aa4bd1d2f2472 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 5 Jun 2024 16:57:50 -0400 Subject: [PATCH 121/134] bugfix for non-full case Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/experimental/kt/single_pass_scan.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index bc04d1da4dd..85192b3bd92 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -558,8 +558,8 @@ struct __copy_if_kernel_func else { // Edge of input, have to handle memory bounds - for (std::uint16_t __i = 0; __i + __wg_local_id * __data_per_workitem + __elems_in_tile * __tile_id < __n; - ++__i) + std::uint16_t __end = std::min(std::size_t{__data_per_workitem}, __n - __wg_local_id * __data_per_workitem + __elems_in_tile * __tile_id); + for (std::uint16_t __i = 0; __i < __end; ++__i) { _Type __val = __in_rng[__i + __wg_local_id * __data_per_workitem + __elems_in_tile * __tile_id]; From f15c759fb197df30ce228ee911425a08f48d97e9 Mon Sep 17 00:00:00 2001 From: Dan 
Hoeflinger Date: Wed, 5 Jun 2024 14:05:29 -0700 Subject: [PATCH 122/134] fix range to check --- test/kt/single_pass_copy_if.cpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/test/kt/single_pass_copy_if.cpp b/test/kt/single_pass_copy_if.cpp index 2fdd04e0c32..857343c9ef7 100644 --- a/test/kt/single_pass_copy_if.cpp +++ b/test/kt/single_pass_copy_if.cpp @@ -106,7 +106,7 @@ test_all_view(sycl::queue q, std::size_t size, Predicate pred, KernelParam param std::string msg1 = "wrong num copied with all_view, n: " + std::to_string(size); EXPECT_EQ(num_copied_ref, num_copied_acc[0], msg1.c_str()); std::string msg2 = "wrong results with all_view, n: " + std::to_string(size); - EXPECT_EQ_RANGES(out_ref, acc, msg2.c_str()); + EXPECT_EQ_N(out_ref.begin(), acc.begin(), num_copied_ref, msg2.c_str()); } template @@ -137,7 +137,7 @@ test_buffer(sycl::queue q, std::size_t size, Predicate pred, KernelParam param) std::string msg1 = "wrong num copied with buffer, n: " + std::to_string(size); EXPECT_EQ(num_copied_ref, num_copied_acc[0], msg1.c_str()); std::string msg2 = "wrong results with buffer, n: " + std::to_string(size); - EXPECT_EQ_RANGES(out_ref, acc, msg2.c_str()); + EXPECT_EQ_N(out_ref.begin(), acc.begin(), num_copied_ref, msg2.c_str()); } #endif @@ -174,11 +174,9 @@ test_usm(sycl::queue q, std::size_t size, Predicate pred, KernelParam param) std::string msg1 = "wrong num copied with USM, n: " + std::to_string(size); EXPECT_EQ(num_copied_ref, num_copied_host[0], msg1.c_str()); std::string msg2 = "wrong results with USM, n: " + std::to_string(size); - EXPECT_EQ_N(out_ref.begin(), actual.begin(), size, msg2.c_str()); + EXPECT_EQ_N(out_ref.begin(), actual.begin(), num_copied_ref, msg2.c_str()); } -/////////////////// - template void test_sycl_iterators(sycl::queue q, std::size_t size, Predicate pred, KernelParam param) @@ -207,7 +205,7 @@ test_sycl_iterators(sycl::queue q, std::size_t size, Predicate pred, KernelParam std::string msg1 = "wrong num copied with oneapi::dpl::begin/end, n: " + std::to_string(size); EXPECT_EQ(num_copied_ref, num_copied, msg1.c_str()); std::string msg2 = "wrong results with oneapi::dpl::begin/end, n: " + std::to_string(size); - EXPECT_EQ_RANGES(out_ref, output, msg2.c_str()); + EXPECT_EQ_N(out_ref.begin(), output.begin(), num_copied_ref, msg2.c_str()); } template From b195e4d5d5e6c3eb17688f5a3eb87169f8abefff Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 5 Jun 2024 17:26:13 -0400 Subject: [PATCH 123/134] adjust data generation Signed-off-by: Dan Hoeflinger --- test/kt/single_pass_copy_if.cpp | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/test/kt/single_pass_copy_if.cpp b/test/kt/single_pass_copy_if.cpp index 857343c9ef7..88311e741ad 100644 --- a/test/kt/single_pass_copy_if.cpp +++ b/test/kt/single_pass_copy_if.cpp @@ -48,12 +48,6 @@ template struct __less_than_val { const T __val; - __less_than_val() : __val{std::is_signed_v ? 0 : std::numeric_limits::max()/T{2}} - { - } - __less_than_val(const T& __v) : __val{__v} - { - } bool operator()(const T& __v) const { @@ -70,7 +64,9 @@ generate_copy_if_data(T* input, std::size_t size, std::uint32_t seed) using substitute_t = std::conditional_t, std::int64_t, std::uint64_t>; std::default_random_engine gen{seed}; - std::uniform_int_distribution dist(std::numeric_limits::lowest(), std::numeric_limits::max()); + substitute_t start = std::is_signed_v ? -10 : 0; + substitute_t end = std::is_signed_v ? 
10 : 20; + std::uniform_int_distribution dist(start, end); std::generate(input, input + size, [&] { return dist(gen); }); } @@ -244,7 +240,7 @@ main() auto q = TestUtils::get_test_queue(); bool run_test = can_run_test(q, params); - auto __predicate = __less_than_val{}; + auto __predicate = __less_than_val{std::is_signed_v ? TEST_TYPE{0} : TEST_TYPE{10}}; if (run_test) { From 4528cbbe54c3a53d3e43e8c1d19c7e49ac6446fb Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 5 Jun 2024 17:33:50 -0400 Subject: [PATCH 124/134] better fix for non-full case Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 85192b3bd92..7848fc1fd8f 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -558,15 +558,18 @@ struct __copy_if_kernel_func else { // Edge of input, have to handle memory bounds - std::uint16_t __end = std::min(std::size_t{__data_per_workitem}, __n - __wg_local_id * __data_per_workitem + __elems_in_tile * __tile_id); - for (std::uint16_t __i = 0; __i < __end; ++__i) +#pragma unroll + for (std::uint16_t __i = 0; __i < __data_per_workitem; ++__i) { - _Type __val = __in_rng[__i + __wg_local_id * __data_per_workitem + __elems_in_tile * __tile_id]; - - if (__pred(__val)) + if (__i + (__wg_local_id) * __data_per_workitem + __elems_in_tile * __tile_id < __n) { - __wg_copy_if_values[__wi_count + __wg_local_id * __data_per_workitem] = __val; - ++__wi_count; + _Type __val = __in_rng[__i + __wg_local_id * __data_per_workitem + __elems_in_tile * __tile_id]; + + if (__pred(__val)) + { + __wg_copy_if_values[__wi_count + __wg_local_id * __data_per_workitem] = __val; + ++__wi_count; + } } } } From 831f9c9860a847f35c635248f4f0ede5fc1added Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Wed, 5 Jun 2024 17:46:06 -0400 Subject: [PATCH 125/134] removing old test Signed-off-by: Dan Hoeflinger --- .../numeric/numeric.ops/copy_if_kt.pass.cpp | 104 ------------------ 1 file changed, 104 deletions(-) delete mode 100644 test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp diff --git a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp b/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp deleted file mode 100644 index a46b76a3be2..00000000000 --- a/test/parallel_api/numeric/numeric.ops/copy_if_kt.pass.cpp +++ /dev/null @@ -1,104 +0,0 @@ -// -*- C++ -*- -//===-- scan.pass.cpp -----------------------------------------------------===// -// -// Copyright (C) Intel Corporation -// -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// This file incorporates work covered by the following copyright and permission -// notice: -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// -//===----------------------------------------------------------------------===// - -#include "support/test_config.h" -#include "support/utils.h" - -#include -#include _PSTL_TEST_HEADER(execution) -#include _PSTL_TEST_HEADER(numeric) - -using namespace TestUtils; - -template -bool -test(Predicate pred, Generator gen, KernelParam param) -{ - bool all_passed = true; - sycl::queue q; - - for (int logn : {4, 8, 10, 12, 14, 15, 18}) - { - int n = 1 << logn; - - Sequence in(n, [&](size_t k) -> T { return gen(n ^ k); }); - - Sequence std_out(n); - - T* in_ptr = sycl::malloc_device(n, q); - T* out_ptr = sycl::malloc_device(n, q); - size_t* out_num = sycl::malloc_device(1, q); - - q.copy(in.data(), in_ptr, n).wait(); - oneapi::dpl::experimental::kt::gpu::copy_if(q, in_ptr, in_ptr + n, out_ptr, out_num, pred, param).wait(); - - Sequence kt_out(n); - size_t num_selected = 0; - q.copy(out_ptr, kt_out.data(), n); - q.copy(out_num, &num_selected, 1); - q.wait(); - - auto std_out_end = std::copy_if(in.begin(), in.end(), std_out.begin(), pred); - - bool passed = true; - if (num_selected != (std_out_end - std_out.begin())) - { - passed = false; - std::cout << "Num selected wrong: expected " << (std_out_end - std_out.begin()) << " " << num_selected - << "\n"; - } - - for (size_t i = 0; i < (std_out_end - std_out.begin()); ++i) - { - if (kt_out[i] != std_out[i]) - { - passed = false; - std::cout << "expected " << i << ' ' << std_out[i] << ' ' << kt_out[i] << '\n'; - } - } - - if (passed) - std::cout << " passed" << std::endl; - else - std::cout << " failed" << std::endl; - - all_passed &= passed; - sycl::free(in_ptr, q); - sycl::free(out_ptr, q); - sycl::free(out_num, q); - } - - return all_passed; -} - -int -main() -{ - bool all_passed = true; - constexpr int n_elements_per_workitem = 8; - - auto param = oneapi::dpl::experimental::kt::kernel_param{}; - all_passed &= - test([](const float64_t& x) { return x * x <= 1024; }, - [](size_t j) { return ((j + 1) % 7 & 2) != 0 ? float64_t(j % 32) : float64_t(j % 33 + 34); }, - TestUtils::get_new_kernel_params<0>(param)); - all_passed &= test([](const int&) { return true; }, [](size_t j) { return j; }, - TestUtils::get_new_kernel_params<1>(param)); - all_passed &= test([](const std::int32_t& x) { return x != 42; }, - [](size_t j) { return ((j + 1) % 5 & 2) != 0 ? 
std::int32_t(j + 1) : 42; }, - TestUtils::get_new_kernel_params<2>(param)); - - return all_passed; -} From 7be4594ec4fbaaffbc12732f4325113f944f91c4 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 6 Jun 2024 09:04:58 -0400 Subject: [PATCH 126/134] undo change to unroll version check Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/pstl/onedpl_config.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/onedpl_config.h b/include/oneapi/dpl/pstl/onedpl_config.h index fff5f2405b5..450cae9a347 100644 --- a/include/oneapi/dpl/pstl/onedpl_config.h +++ b/include/oneapi/dpl/pstl/onedpl_config.h @@ -123,7 +123,7 @@ // Enable loop unrolling pragmas where supported #if (__INTEL_LLVM_COMPILER || __INTEL_COMPILER || \ (!defined(__INTEL_LLVM_COMPILER) && !defined(__INTEL_COMPILER) && \ - ((_ONEDPL_GCC_VERSION >= 80000) || (_ONEDPL_CLANG_VERSION >= 40000)))) + ((_ONEDPL_GCC_VERSION >= 80000) || (_ONEDPL_CLANG_VERSION >= 30700)))) # define _ONEDPL_PRAGMA_UNROLL _ONEDPL_PRAGMA(unroll) #else //no pragma unroll # define _ONEDPL_PRAGMA_UNROLL From 68c258d67129e828ea088863a8fb8ccc3dfb04b0 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 6 Jun 2024 09:16:51 -0400 Subject: [PATCH 127/134] formatting Signed-off-by: Dan Hoeflinger --- .../oneapi/dpl/experimental/kt/single_pass_scan.h | 2 +- test/kt/single_pass_copy_if.cpp | 15 ++++++++------- test/kt/single_pass_scan.cpp | 6 ++++-- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 7848fc1fd8f..66481abb1c8 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -561,7 +561,7 @@ struct __copy_if_kernel_func #pragma unroll for (std::uint16_t __i = 0; __i < __data_per_workitem; ++__i) { - if (__i + (__wg_local_id) * __data_per_workitem + __elems_in_tile * __tile_id < __n) + if (__i + (__wg_local_id)*__data_per_workitem + __elems_in_tile * __tile_id < __n) { _Type __val = __in_rng[__i + __wg_local_id * __data_per_workitem + __elems_in_tile * __tile_id]; diff --git a/test/kt/single_pass_copy_if.cpp b/test/kt/single_pass_copy_if.cpp index 88311e741ad..9896e394534 100644 --- a/test/kt/single_pass_copy_if.cpp +++ b/test/kt/single_pass_copy_if.cpp @@ -92,7 +92,8 @@ test_all_view(sycl::queue q, std::size_t size, Predicate pred, KernelParam param oneapi::dpl::experimental::ranges::all_view view(buf); oneapi::dpl::experimental::ranges::all_view view_out(buf_out); - oneapi::dpl::experimental::ranges::all_view view_num_copied(buf_num_copied); + oneapi::dpl::experimental::ranges::all_view view_num_copied( + buf_num_copied); oneapi::dpl::experimental::kt::gpu::copy_if(q, view, view_out, view_num_copied, pred, param).wait(); } @@ -134,7 +135,6 @@ test_buffer(sycl::queue q, std::size_t size, Predicate pred, KernelParam param) EXPECT_EQ(num_copied_ref, num_copied_acc[0], msg1.c_str()); std::string msg2 = "wrong results with buffer, n: " + std::to_string(size); EXPECT_EQ_N(out_ref.begin(), acc.begin(), num_copied_ref, msg2.c_str()); - } #endif @@ -159,14 +159,14 @@ test_usm(sycl::queue q, std::size_t size, Predicate pred, KernelParam param) std::size_t num_copied_ref = out_end - std::begin(out_ref); oneapi::dpl::experimental::kt::gpu::copy_if(q, dt_input.get_data(), dt_input.get_data() + size, - dt_output.get_data(), dt_num_copied.get_data(), pred, param) + dt_output.get_data(), dt_num_copied.get_data(), pred, param) .wait(); 
std::vector actual(size); dt_output.retrieve_data(actual.begin()); std::vector num_copied_host(1); dt_num_copied.retrieve_data(num_copied_host.begin()); - + std::string msg1 = "wrong num copied with USM, n: " + std::to_string(size); EXPECT_EQ(num_copied_ref, num_copied_host[0], msg1.c_str()); std::string msg2 = "wrong results with USM, n: " + std::to_string(size); @@ -222,7 +222,6 @@ void test_all_cases(sycl::queue q, std::size_t size, Predicate pred, KernelParam param) { test_general_cases(q, size, pred, TestUtils::get_new_kernel_params<0>(param)); - } int @@ -235,8 +234,10 @@ main() << "TEST_TYPE : " << TypeInfo().name() << std::endl; #endif - constexpr oneapi::dpl::experimental::kt::kernel_param> params; + constexpr oneapi::dpl::experimental::kt::kernel_param< + TEST_DATA_PER_WORK_ITEM, TEST_WORK_GROUP_SIZE, + /*opt_out_single_wg=*/std::bool_constant> + params; auto q = TestUtils::get_test_queue(); bool run_test = can_run_test(q, params); diff --git a/test/kt/single_pass_scan.cpp b/test/kt/single_pass_scan.cpp index a92b451cdcf..06c8a545748 100644 --- a/test/kt/single_pass_scan.cpp +++ b/test/kt/single_pass_scan.cpp @@ -210,8 +210,10 @@ main() << "TEST_TYPE : " << TypeInfo().name() << std::endl; #endif - constexpr oneapi::dpl::experimental::kt::kernel_param> params; + constexpr oneapi::dpl::experimental::kt::kernel_param< + TEST_DATA_PER_WORK_ITEM, TEST_WORK_GROUP_SIZE, + /*opt_out_single_wg=*/std::bool_constant> + params; auto q = TestUtils::get_test_queue(); bool run_test = can_run_test(q, params); From 347bcf5d44b397742d42ff2e0f6941b7a4ad0434 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 6 Jun 2024 16:48:36 -0400 Subject: [PATCH 128/134] allowing alg to dictate active threads Signed-off-by: Dan Hoeflinger --- .../dpl/experimental/kt/single_pass_scan.h | 26 ++++++++++++++----- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 66481abb1c8..c1eddba4a0d 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -259,26 +259,27 @@ template (__group, __subgroup, __status_flags, __status_vals_full, __status_vals_partial, - __tile_id, __local_reduction, __prev_tile_reduction, __binary_op); + __tile_id, __local_reduction, __prev_tile_reduction, __is_active_sg, __is_active_wi, __active_wg_wi_id, __binary_op); sycl::joint_inclusive_scan(__group, __tile_vals_ptr, __tile_vals_ptr + __wg_local_memory_size, __out_begin, __binary_op, __prev_tile_reduction); @@ -523,6 +528,12 @@ struct __copy_if_kernel_func auto __wg_local_id = __item.get_local_id(0); auto __sg = __item.get_sub_group(); + constexpr std::uint16_t __active_sg_id = __workgroup_size / SUBGROUP_SIZE - 1; + constexpr std::uint16_t __active_sg_wi_id = SUBGROUP_SIZE - 1; + bool __is_active_sg = (__subgroup.get_group_id() == __active_sg_id); + bool __is_active_wi = (__subgroup.get_local_id() == __active_sg_wi_id); + constexpr std::uint16_t __active_wg_wi_id = __workgroup_size - 1; + std::uint32_t __tile_id = 0; // Obtain unique ID for this work-group that will be used in decoupled lookback @@ -580,7 +591,8 @@ struct __copy_if_kernel_func _SizeT __copied_elements = 0; __lookback_phase<_FlagType>(__group, __sg, __status_flags, __status_vals_full, __status_vals_partial, __tile_id, - __wg_count + __wi_count, __copied_elements, _BinaryOp{}); + __wg_count + __wi_count, __copied_elements, __is_active_sg, __is_active_wi, + __active_wg_wi_id, 
_BinaryOp{}); // Phase 3: copy values to global memory for (std::uint16_t __i = 0; __i < __wi_count; ++__i) From 62f58e224fc101454fb4a67bb1f781d7fe64f176 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 6 Jun 2024 13:57:50 -0700 Subject: [PATCH 129/134] bugfix for indexes --- include/oneapi/dpl/experimental/kt/single_pass_scan.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index c1eddba4a0d..57014969de0 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -306,8 +306,8 @@ struct __lookback_kernel_func auto __group = __item.get_group(); auto __subgroup = __item.get_sub_group(); auto __local_id = __item.get_local_id(0); - constexpr bool __is_active_sg = (__subgroup.get_group_id() == 0); - constexpr bool __is_active_wi = (__subgroup.get_local_id() == 0); + bool __is_active_sg = (__subgroup.get_group_id() == 0); + bool __is_active_wi = (__subgroup.get_local_id() == 0); constexpr std::uint16_t __active_wg_wi_id = 0; @@ -530,8 +530,8 @@ struct __copy_if_kernel_func constexpr std::uint16_t __active_sg_id = __workgroup_size / SUBGROUP_SIZE - 1; constexpr std::uint16_t __active_sg_wi_id = SUBGROUP_SIZE - 1; - bool __is_active_sg = (__subgroup.get_group_id() == __active_sg_id); - bool __is_active_wi = (__subgroup.get_local_id() == __active_sg_wi_id); + bool __is_active_sg = (__sg.get_group_id() == __active_sg_id); + bool __is_active_wi = (__sg.get_local_id() == __active_sg_wi_id); constexpr std::uint16_t __active_wg_wi_id = __workgroup_size - 1; std::uint32_t __tile_id = 0; From 5c4bd74213db66f153a7f608154feaba151987d3 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Thu, 6 Jun 2024 17:13:09 -0400 Subject: [PATCH 130/134] clang format Signed-off-by: Dan Hoeflinger --- include/oneapi/dpl/experimental/kt/single_pass_scan.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 57014969de0..ea756308fd9 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -259,8 +259,8 @@ template (__group, __subgroup, __status_flags, __status_vals_full, __status_vals_partial, - __tile_id, __local_reduction, __prev_tile_reduction, __is_active_sg, __is_active_wi, __active_wg_wi_id, __binary_op); + __tile_id, __local_reduction, __prev_tile_reduction, __is_active_sg, __is_active_wi, + __active_wg_wi_id, __binary_op); sycl::joint_inclusive_scan(__group, __tile_vals_ptr, __tile_vals_ptr + __wg_local_memory_size, __out_begin, __binary_op, __prev_tile_reduction); From 98324ed68c422f8914ddf467f51930dce2aa42b3 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Tue, 18 Jun 2024 10:53:42 -0400 Subject: [PATCH 131/134] address reviewer comments Signed-off-by: Dan Hoeflinger --- .../oneapi/dpl/experimental/kt/single_pass_scan.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index ea756308fd9..24ee071ee3d 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -647,7 +647,7 @@ struct __copy_if_submitter<__data_per_workitem, __workgroup_size, _FlagType, template sycl::event 
-single_pass_copy_if_impl(sycl::queue __queue, _InRng&& __in_rng, _OutRng&& __out_rng, _NumCopiedRng __num_rng, +single_pass_copy_if_impl(sycl::queue __queue, _InRng&& __in_rng, _OutRng&& __out_rng, _NumCopiedRng&& __num_rng, _UnaryPredicate __pred, _KernelParam) { using _SizeT = uint64_t; @@ -702,8 +702,9 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRng&& __in_rng, _OutRng&& __out __queue, __status_flags, __status_vals_partial, __status_flags_size, _FlagType::__padding); sycl::event __prev_event = __copy_if_submitter<__elems_per_workitem, __workgroup_size, _FlagType, _CopyIfKernel>{}( - __queue, __fill_event, __in_rng, __out_rng, __num_rng, __n, __pred, __status_flags, __status_flags_size, - __status_vals_full, __status_vals_partial, __current_num_items, __current_num_wgs); + __queue, __fill_event, std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), + std::forward<_NumCopiedRng>(__num_rng), __n, __pred, __status_flags, __status_flags_size, __status_vals_full, + __status_vals_partial, __current_num_items, __current_num_wgs); // TODO: Currently, the following portion of code makes this entire function synchronous. // Ideally, we should be able to use the asynchronous free below, but we have found that doing @@ -728,9 +729,9 @@ sycl::event copy_if(sycl::queue __queue, _InRng&& __in_rng, _OutRng&& __out_rng, _NumCopiedRng&& __num_rng, _UnaryPredicate __pred, _KernelParam __param = {}) { - auto __in_view = oneapi::dpl::__ranges::views::all(std::forward<_InRng>(__in_rng)); - auto __out_view = oneapi::dpl::__ranges::views::all(std::forward<_OutRng>(__out_rng)); - auto __num_view = oneapi::dpl::__ranges::views::all(std::forward<_NumCopiedRng>(__num_rng)); + auto __in_view = oneapi::dpl::__ranges::views::all_read(std::forward<_InRng>(__in_rng)); + auto __out_view = oneapi::dpl::__ranges::views::all_write(std::forward<_OutRng>(__out_rng)); + auto __num_view = oneapi::dpl::__ranges::views::all_write(std::forward<_NumCopiedRng>(__num_rng)); return __impl::single_pass_copy_if_impl(__queue, std::move(__in_view), std::move(__out_view), std::move(__num_view), __pred, __param); From 4377c2de470d969cf2c3bc5da6a6ca0b9237ad00 Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Tue, 18 Jun 2024 11:39:04 -0400 Subject: [PATCH 132/134] simplify data generation and cutoff calculation. Signed-off-by: Dan Hoeflinger --- test/kt/single_pass_copy_if.cpp | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/test/kt/single_pass_copy_if.cpp b/test/kt/single_pass_copy_if.cpp index 9896e394534..327e15563d5 100644 --- a/test/kt/single_pass_copy_if.cpp +++ b/test/kt/single_pass_copy_if.cpp @@ -61,13 +61,18 @@ generate_copy_if_data(T* input, std::size_t size, std::uint32_t seed) { // Integer numbers are generated even for floating point types in order to avoid rounding errors, // and simplify the final check - using substitute_t = std::conditional_t, std::int64_t, std::uint64_t>; - std::default_random_engine gen{seed}; - substitute_t start = std::is_signed_v ? -10 : 0; - substitute_t end = std::is_signed_v ? 
10 : 20; - std::uniform_int_distribution dist(start, end); - std::generate(input, input + size, [&] { return dist(gen); }); + + if constexpr (std::is_integral_v) + { + std::uniform_int_distribution dist(std::numeric_limits::min(), std::numeric_limits::max()); + std::generate(input, input + size, [&] { return dist(gen); }); + } + else + { + std::uniform_real_distribution dist(std::numeric_limits::min(), std::numeric_limits::max()); + std::generate(input, input + size, [&] { return dist(gen); }); + } } #if _ENABLE_RANGES_TESTING @@ -87,15 +92,12 @@ test_all_view(sycl::queue q, std::size_t size, Predicate pred, KernelParam param sycl::buffer buf_num_copied(&num_copied, 1); auto out_end = std::copy_if(std::begin(ref), std::end(ref), std::begin(out_ref), pred); std::size_t num_copied_ref = out_end - std::begin(out_ref); - { - sycl::buffer buf(input.data(), input.size()); + sycl::buffer buf(input.data(), input.size()); - oneapi::dpl::experimental::ranges::all_view view(buf); - oneapi::dpl::experimental::ranges::all_view view_out(buf_out); - oneapi::dpl::experimental::ranges::all_view view_num_copied( - buf_num_copied); - oneapi::dpl::experimental::kt::gpu::copy_if(q, view, view_out, view_num_copied, pred, param).wait(); - } + oneapi::dpl::experimental::ranges::all_view view(buf); + oneapi::dpl::experimental::ranges::all_view view_out(buf_out); + oneapi::dpl::experimental::ranges::all_view view_num_copied(buf_num_copied); + oneapi::dpl::experimental::kt::gpu::copy_if(q, view, view_out, view_num_copied, pred, param).wait(); auto acc = buf_out.get_host_access(); auto num_copied_acc = buf_num_copied.get_host_access(); @@ -241,7 +243,8 @@ main() auto q = TestUtils::get_test_queue(); bool run_test = can_run_test(q, params); - auto __predicate = __less_than_val{std::is_signed_v ? TEST_TYPE{0} : TEST_TYPE{10}}; + TEST_TYPE cutoff = std::is_signed_v ? 
TEST_TYPE{0} : std::numeric_limits::max() / 2; + auto __predicate = __less_than_val{cutoff}; if (run_test) { From 496ed4d3eef3826873c453861d622901fee6072b Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Tue, 18 Jun 2024 11:57:36 -0400 Subject: [PATCH 133/134] strip out single workgroup opt out Signed-off-by: Dan Hoeflinger --- .../oneapi/dpl/experimental/kt/kernel_param.h | 3 +- .../dpl/experimental/kt/single_pass_scan.h | 33 ++++++++----------- test/kt/CMakeLists.txt | 11 +++---- test/kt/single_pass_copy_if.cpp | 6 +--- test/kt/single_pass_scan.cpp | 6 +--- 5 files changed, 20 insertions(+), 39 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/kernel_param.h b/include/oneapi/dpl/experimental/kt/kernel_param.h index bbed93e777c..b3ee36be189 100644 --- a/include/oneapi/dpl/experimental/kt/kernel_param.h +++ b/include/oneapi/dpl/experimental/kt/kernel_param.h @@ -18,13 +18,12 @@ namespace oneapi::dpl::experimental::kt { template + typename _KernelName = oneapi::dpl::execution::DefaultKernelName> struct kernel_param { static constexpr std::uint16_t data_per_workitem = __data_per_work_item; static constexpr std::uint16_t workgroup_size = __work_group_size; using kernel_name = _KernelName; - using single_wg_opt_out = _SingleWgOptOut; }; } // namespace oneapi::dpl::experimental::kt diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 24ee071ee3d..48e860b3880 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -440,18 +440,14 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r // Next power of 2 greater than or equal to __n auto __n_uniform = ::oneapi::dpl::__internal::__dpl_bit_ceil(__n); - if constexpr (std::negation_v) + // Perform a single-work group scan if the input is small + if (oneapi::dpl::__par_backend_hetero::__group_scan_fits_in_slm<_Type>(__queue, __n, __n_uniform)) { - // Perform a single-work group scan if the input is small - if (oneapi::dpl::__par_backend_hetero::__group_scan_fits_in_slm<_Type>(__queue, __n, __n_uniform)) - { - return oneapi::dpl::__par_backend_hetero::__parallel_transform_scan_single_group( - oneapi::dpl::__internal::__device_backend_tag{}, - oneapi::dpl::execution::__dpl::make_device_policy(__queue), - std::forward<_InRange>(__in_rng), std::forward<_OutRange>(__out_rng), __n, - oneapi::dpl::__internal::__no_op{}, unseq_backend::__no_init_value<_Type>{}, __binary_op, - std::true_type{}); - } + return oneapi::dpl::__par_backend_hetero::__parallel_transform_scan_single_group( + oneapi::dpl::__internal::__device_backend_tag{}, + oneapi::dpl::execution::__dpl::make_device_policy(__queue), + std::forward<_InRange>(__in_rng), std::forward<_OutRange>(__out_rng), __n, + oneapi::dpl::__internal::__no_op{}, unseq_backend::__no_init_value<_Type>{}, __binary_op, std::true_type{}); } constexpr std::size_t __workgroup_size = _KernelParam::workgroup_size; @@ -668,16 +664,13 @@ single_pass_copy_if_impl(sycl::queue __queue, _InRng&& __in_rng, _OutRng&& __out // Next power of 2 greater than or equal to __n auto __n_uniform = ::oneapi::dpl::__internal::__dpl_bit_ceil(__n); - if constexpr (std::negation_v) + //If we fit in a single WG SLM, use the single wg version from oneDPL main + if (oneapi::dpl::__par_backend_hetero::__group_copy_if_fits_in_slm(__queue, __n, __n_uniform)) { - //If we fit in a single WG SLM, use the single wg version from oneDPL main - if 
(oneapi::dpl::__par_backend_hetero::__group_copy_if_fits_in_slm(__queue, __n, __n_uniform)) - { - return oneapi::dpl::__par_backend_hetero::__dispatch_small_copy_if( - oneapi::dpl::execution::__dpl::make_device_policy<_KernelName>(__queue), __n, - std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), - std::forward<_NumCopiedRng>(__num_rng), __pred); - } + return oneapi::dpl::__par_backend_hetero::__dispatch_small_copy_if( + oneapi::dpl::execution::__dpl::make_device_policy<_KernelName>(__queue), __n, + std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), std::forward<_NumCopiedRng>(__num_rng), + __pred); } constexpr std::size_t __workgroup_size = _KernelParam::workgroup_size; constexpr std::size_t __elems_per_workitem = _KernelParam::data_per_workitem; diff --git a/test/kt/CMakeLists.txt b/test/kt/CMakeLists.txt index 3159cfdae7e..cd2c600e1c5 100644 --- a/test/kt/CMakeLists.txt +++ b/test/kt/CMakeLists.txt @@ -130,7 +130,7 @@ if (ONEDPL_TEST_ENABLE_KT_ESIMD) _generate_esimd_sort_test("esimd_radix_sort" "256" "32" "double" "" 1000) # segfault endif() -function (_generate_gpu_single_pass_test _alg _data_per_work_item _work_group_size _type _single_wg_optout) +function (_generate_gpu_single_pass_test _alg _data_per_work_item _work_group_size _type) if ((NOT TARGET "build-${_alg}-kt-tests") AND (NOT TARGET "run-${_alg}-kt-tests")) add_custom_target("build-${_alg}-kt-tests" COMMENT "Build all ${_alg} kernel template tests") @@ -141,7 +141,7 @@ function (_generate_gpu_single_pass_test _alg _data_per_work_item _work_group_si endif() string(REPLACE "_t" "" _type_short ${_type}) - set(_target_name "single_pass_${_alg}_dpwi${_data_per_work_item}_wgs${_work_group_size}_${_type_short}_${_single_wg_optout}") + set(_target_name "single_pass_${_alg}_dpwi${_data_per_work_item}_wgs${_work_group_size}_${_type_short}") set(_test_path "single_pass_${_alg}.cpp") #_generate_test_randomly(${_target_name} ${_test_path} ${_probability_permille}) @@ -152,7 +152,6 @@ function (_generate_gpu_single_pass_test _alg _data_per_work_item _work_group_si target_compile_definitions(${_target_name} PRIVATE TEST_DATA_PER_WORK_ITEM=${_data_per_work_item}) target_compile_definitions(${_target_name} PRIVATE TEST_WORK_GROUP_SIZE=${_work_group_size}) - target_compile_definitions(${_target_name} PRIVATE TEST_SINGLE_WG_OPTOUT=${_single_wg_optout}) target_compile_definitions(${_target_name} PRIVATE TEST_TYPE=${_type}) endif() @@ -168,15 +167,13 @@ function(_generate_gpu_single_pass_tests) foreach (_data_per_work_item ${_data_per_work_item_all}) foreach (_work_group_size ${_work_group_size_all}) foreach (_type ${_type_all}) - _generate_gpu_single_pass_test(${_alg} ${_data_per_work_item} ${_work_group_size} ${_type} "false") + _generate_gpu_single_pass_test(${_alg} ${_data_per_work_item} ${_work_group_size} ${_type}) endforeach() endforeach() endforeach() - # to not double the number of tests, check single wg output with a single test per alg - _generate_gpu_single_pass_test(${_alg} "8" "512" "float" "true") _generate_test("single_pass_${_alg}" "single_pass_${_alg}.cpp") - target_compile_definitions("single_pass_${_alg}" PRIVATE TEST_DATA_PER_WORK_ITEM=8 TEST_WORK_GROUP_SIZE=256 TEST_TYPE=uint32_t TEST_SINGLE_WG_OPTOUT=false) + target_compile_definitions("single_pass_${_alg}" PRIVATE TEST_DATA_PER_WORK_ITEM=8 TEST_WORK_GROUP_SIZE=256 TEST_TYPE=uint32_t) endforeach() endfunction() diff --git a/test/kt/single_pass_copy_if.cpp b/test/kt/single_pass_copy_if.cpp index 327e15563d5..66a373643a1 100644 --- 
a/test/kt/single_pass_copy_if.cpp +++ b/test/kt/single_pass_copy_if.cpp @@ -232,14 +232,10 @@ main() #if LOG_TEST_INFO std::cout << "TEST_DATA_PER_WORK_ITEM : " << TEST_DATA_PER_WORK_ITEM << "\n" << "TEST_WORK_GROUP_SIZE : " << TEST_WORK_GROUP_SIZE << "\n" - << "TEST_SINGLE_WG_OPTOUT : " << TEST_SINGLE_WG_OPTOUT << "\n" << "TEST_TYPE : " << TypeInfo().name() << std::endl; #endif - constexpr oneapi::dpl::experimental::kt::kernel_param< - TEST_DATA_PER_WORK_ITEM, TEST_WORK_GROUP_SIZE, - /*opt_out_single_wg=*/std::bool_constant> - params; + constexpr oneapi::dpl::experimental::kt::kernel_param params; auto q = TestUtils::get_test_queue(); bool run_test = can_run_test(q, params); diff --git a/test/kt/single_pass_scan.cpp b/test/kt/single_pass_scan.cpp index 06c8a545748..860db88d2b3 100644 --- a/test/kt/single_pass_scan.cpp +++ b/test/kt/single_pass_scan.cpp @@ -206,14 +206,10 @@ main() #if LOG_TEST_INFO std::cout << "TEST_DATA_PER_WORK_ITEM : " << TEST_DATA_PER_WORK_ITEM << "\n" << "TEST_WORK_GROUP_SIZE : " << TEST_WORK_GROUP_SIZE << "\n" - << "TEST_SINGLE_WG_OPTOUT : " << TEST_SINGLE_WG_OPTOUT << "\n" << "TEST_TYPE : " << TypeInfo().name() << std::endl; #endif - constexpr oneapi::dpl::experimental::kt::kernel_param< - TEST_DATA_PER_WORK_ITEM, TEST_WORK_GROUP_SIZE, - /*opt_out_single_wg=*/std::bool_constant> - params; + constexpr oneapi::dpl::experimental::kt::kernel_param params; auto q = TestUtils::get_test_queue(); bool run_test = can_run_test(q, params); From 2d9e8a76c6f2f1169dea87ebce4a9f71bce7456f Mon Sep 17 00:00:00 2001 From: Dan Hoeflinger Date: Tue, 18 Jun 2024 14:17:18 -0400 Subject: [PATCH 134/134] minimal data generation changes Signed-off-by: Dan Hoeflinger --- test/kt/single_pass_copy_if.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/test/kt/single_pass_copy_if.cpp b/test/kt/single_pass_copy_if.cpp index 66a373643a1..86b3cc46820 100644 --- a/test/kt/single_pass_copy_if.cpp +++ b/test/kt/single_pass_copy_if.cpp @@ -59,18 +59,16 @@ template auto generate_copy_if_data(T* input, std::size_t size, std::uint32_t seed) { - // Integer numbers are generated even for floating point types in order to avoid rounding errors, - // and simplify the final check std::default_random_engine gen{seed}; if constexpr (std::is_integral_v) { - std::uniform_int_distribution dist(std::numeric_limits::min(), std::numeric_limits::max()); + std::uniform_int_distribution dist(std::numeric_limits::lowest(), std::numeric_limits::max()); std::generate(input, input + size, [&] { return dist(gen); }); } else { - std::uniform_real_distribution dist(std::numeric_limits::min(), std::numeric_limits::max()); + std::uniform_real_distribution dist(std::numeric_limits::lowest(), std::numeric_limits::max()); std::generate(input, input + size, [&] { return dist(gen); }); } }
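
For context on how the interface exercised by these tests is meant to be driven, here is a minimal host-side sketch. The namespace, the kernel_param<data_per_workitem, workgroup_size> signature, the (first, last, out, num_copied, pred, param) argument order, and the 8/256 tuning values follow the diffs and CMake defaults above; the standalone include path, the USM allocations via sycl::malloc_shared, the int/std::size_t element types, and the is_even predicate are illustrative assumptions rather than part of this series.

#include <cstddef>
#include <iostream>
#include <sycl/sycl.hpp>
// Path of the header modified throughout this series; assumed to be usable standalone.
#include <oneapi/dpl/experimental/kt/single_pass_scan.h>

namespace kt = oneapi::dpl::experimental::kt;

int main()
{
    sycl::queue q;
    constexpr std::size_t n = 1 << 20;

    // USM shared allocations standing in for the usm_data_transfer helper used by the tests.
    int* in = sycl::malloc_shared<int>(n, q);
    int* out = sycl::malloc_shared<int>(n, q);
    std::size_t* num_copied = sycl::malloc_shared<std::size_t>(1, q); // count element type is an assumption

    for (std::size_t i = 0; i < n; ++i)
        in[i] = static_cast<int>(i);

    // kernel_param<data_per_workitem, workgroup_size[, kernel_name]> as defined in kernel_param.h after patch 133.
    constexpr kt::kernel_param<8, 256> param;

    // Illustrative predicate; the tests above use a __less_than_val functor instead.
    auto is_even = [](int x) { return x % 2 == 0; };

    // Same call shape as the USM test path: copy_if returns a sycl::event.
    kt::gpu::copy_if(q, in, in + n, out, num_copied, is_even, param).wait();

    std::cout << "copied " << *num_copied << " of " << n << " elements\n";

    sycl::free(in, q);
    sycl::free(out, q);
    sycl::free(num_copied, q);
    return 0;
}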